diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index a518dad0ff2..f0776974eec 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -6,25 +6,25 @@ PTX Instructions .. toctree:: :maxdepth: 1 - instructions/barrier.cluster - instructions/cp.async.bulk - instructions/cp.async.bulk.commit_group - instructions/cp.async.bulk.wait_group - instructions/cp.async.bulk.tensor - instructions/cp.reduce.async.bulk - instructions/cp.reduce.async.bulk.tensor + instructions/barrier_cluster + instructions/cp_async_bulk + instructions/cp_async_bulk_commit_group + instructions/cp_async_bulk_wait_group + instructions/cp_async_bulk_tensor + instructions/cp_reduce_async_bulk + instructions/cp_reduce_async_bulk_tensor instructions/fence instructions/getctarank instructions/mapa - instructions/mbarrier.init - instructions/mbarrier.arrive - instructions/mbarrier.expect_tx - instructions/mbarrier.test_wait - instructions/mbarrier.try_wait - instructions/red.async - instructions/st.async - instructions/tensormap.replace - instructions/tensormap.cp_fenceproxy + instructions/mbarrier_init + instructions/mbarrier_arrive + instructions/mbarrier_expect_tx + instructions/mbarrier_test_wait + instructions/mbarrier_try_wait + instructions/red_async + instructions/st_async + instructions/tensormap_replace + instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst new file mode 100644 index 00000000000..bc8943bc619 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst @@ -0,0 +1,16 @@ +.. 
_libcudacxx-ptx-instructions-barrier-cluster: + +barrier.cluster +=============== + +- PTX ISA: + `barrier.cluster `__ + +Similar functionality is provided through the builtins +``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, +as well as the ``cooperative_groups::cluster_group`` +`API `__. + +The ``.aligned`` variants of the instructions are not exposed. + +.. include:: generated/barrier_cluster.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst new file mode 100644 index 00000000000..32121ef8a12 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst @@ -0,0 +1,30 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk: + +cp.async.bulk +============= + +- PTX ISA: + `cp.async.bulk `__ + +Implementation notes +-------------------- + +**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and +``size`` must be a multiple of 16. + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_multicast`` was enabled for + SM_90. This has been changed to SM_90a. + + +Unicast +------- + +.. include:: generated/cp_async_bulk.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst similarity index 58% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst rename to docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst index cc549f54163..8efc5ac0488 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst @@ -6,10 +6,4 @@ cp.async.bulk.commit_group - PTX ISA: `cp.async.bulk.commit_group `__ -cp.async.bulk.commit_group -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template - __device__ static inline void cp_async_bulk_commit_group(); +.. include:: generated/cp_async_bulk_commit_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst new file mode 100644 index 00000000000..bde3488bac9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst @@ -0,0 +1,23 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: + +cp.async.bulk.tensor +==================== + +- PTX ISA: + `cp.async.bulk.tensor `__ + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled + for SM_90. This has been changed to SM_90a. + +Unicast +------- + +.. include:: generated/cp_async_bulk_tensor.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_tensor_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst new file mode 100644 index 00000000000..e24bb0fc9fd --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: + +cp.async.bulk.wait_group +======================== + +- PTX ISA: + `cp.async.bulk.wait_group `__ + +.. include:: generated/cp_async_bulk_wait_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst new file mode 100644 index 00000000000..a4710b5ce30 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst @@ -0,0 +1,61 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: + +cp.reduce.async.bulk +==================== + +- PTX ISA: + `cp.reduce.async.bulk `__ + + +Integer and floating point instructions +--------------------------------------- + +.. 
include:: generated/cp_reduce_async_bulk.rst + +Emulation of ``.s64`` instruction +--------------------------------- + +PTX does not currently (CTK 12.3) expose +``cp.reduce.async.bulk.add.s64``. This exposure is emulated in +``cuda::ptx`` using: + +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 + // .dst = { .shared::cluster } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); + + // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); + +FP16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_f16.rst + +BF16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_bf16.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst new file mode 100644 index 00000000000..598d9e1e3ea --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: + +cp.reduce.async.bulk.tensor +=========================== + +- PTX ISA: + `cp.reduce.async.bulk.tensor `__ + +.. 
include:: generated/cp_reduce_async_bulk_tensor.rst diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst index 8a4e7f281cb..82de170f63b 100644 --- a/docs/libcudacxx/ptx/instructions/fence.rst +++ b/docs/libcudacxx/ptx/instructions/fence.rst @@ -11,272 +11,25 @@ fence fence ----- -fence.sc.cta -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.gpu -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.sys -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.cta -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.gpu -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.sys -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.cluster -^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); - -fence.acq_rel.cluster -^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence.rst fence.mbarrier_init ------------------- -fence.mbarrier_init.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - template - __device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence_mbarrier_init.rst fence.proxy.alias ----------------- -fence.proxy.alias -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template - __device__ static inline void fence_proxy_alias(); +.. include:: generated/fence_proxy_alias.rst fence.proxy.async ----------------- -fence.proxy.async -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template - __device__ static inline void fence_proxy_async(); -fence.proxy.async.global -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. 
PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); +.. include:: generated/fence_proxy_async.rst fence.proxy.tensormap --------------------- -fence.proxy.tensormap::generic.release.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. 
PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.acquire.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); +.. 
include:: generated/fence_proxy_tensormap_generic.rst diff --git a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst similarity index 70% rename from docs/libcudacxx/ptx/instructions/barrier.cluster.rst rename to docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index 99048587eb5..bd994990c05 100644 --- a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,18 +1,3 @@ -.. _libcudacxx-ptx-instructions-barrier-cluster: - -barrier.cluster -=============== - -- PTX ISA: - `barrier.cluster `__ - -Similar functionality is provided through the builtins -``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, -as well as the ``cooperative_groups::cluster_group`` -`API `__. - -The ``.aligned`` variants of the instructions are not exposed. - barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst similarity index 57% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 434a44a15a4..f5c236f8bf9 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,26 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk: - -cp.async.bulk -============= - -- PTX ISA: - `cp.async.bulk `__ - -Implementation notes --------------------- - -**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and -``size`` must be a multiple of 16. - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_multicast`` was enabled for - SM_90. This has been changed to SM_90a. 
- -Unicast -------- - cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -67,23 +44,3 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); - -Multicast ---------- - -cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst new file mode 100644 index 00000000000..984b4aff976 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -0,0 +1,7 @@ +cp.async.bulk.commit_group +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 + template + __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst new file mode 100644 index 00000000000..9cb15d06fa3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -0,0 +1,16 @@ +cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst similarity index 59% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index f095abcd1a3..40eb070e66a 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,20 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: - -cp.async.bulk.tensor -==================== - -- PTX ISA: - `cp.async.bulk.tensor `__ - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled - for SM_90. This has been changed to SM_90a. - -Unicast -------- - cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -169,91 +152,3 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group const void* tensorMap, const int32_t (&tensorCoords)[5], const void* srcMem); - -Multicast ---------- - -cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst new file mode 100644 index 00000000000..2481c80bf3c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -0,0 +1,84 @@ +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst similarity index 62% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 8939292d340..08ebd3c28a7 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: - -cp.async.bulk.wait_group -======================== - -- PTX ISA: - `cp.async.bulk.wait_group `__ - cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst similarity index 80% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index 571e1d9842f..cc82d633375 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,15 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: - -cp.reduce.async.bulk -==================== - -- PTX ISA: - `cp.reduce.async.bulk `__ - - -Integer and floating point instructions ---------------------------------------- - cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -652,155 +640,3 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 int64_t* dstMem, const int64_t* srcMem, uint32_t size); - -Emulation of ``.s64`` instruction ---------------------------------- - -PTX does not currently (CTK 12.3) expose -``cp.reduce.async.bulk.add.s64``. This exposure is emulated in -``cuda::ptx`` using: - -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 - // .dst = { .shared::cluster } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); - - // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); - -FP16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -BF16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst new file mode 100644 index 00000000000..e4dea98a119 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst new file mode 100644 index 00000000000..18c5e0bfc60 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst similarity index 98% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index 7ea7b5675aa..c653b01cd60 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: - -cp.reduce.async.bulk.tensor -=========================== - -- PTX ISA: - `cp.reduce.async.bulk.tensor `__ - cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst new file mode 100644 index 00000000000..2fe14dcb3b2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -0,0 +1,95 @@ +fence.sc.cta +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.gpu +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. 
PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.sys +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); + +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. 
PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst new file mode 100644 index 00000000000..0f5298e3359 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -0,0 +1,11 @@ +fence.mbarrier_init.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + template + __device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst new file mode 100644 index 00000000000..935aab9b6df --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -0,0 +1,7 @@ +fence.proxy.alias +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.alias; // 4. PTX ISA 75, SM_70 + template + __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst new file mode 100644 index 00000000000..3e741a1f6c4 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -0,0 +1,37 @@ +fence.proxy.async +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async; // 5. PTX ISA 80, SM_90 + template + __device__ static inline void fence_proxy_async(); + +fence.proxy.async.global +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. 
PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst new file mode 100644 index 00000000000..db582971c3d --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -0,0 +1,103 @@ +fence.proxy.tensormap::generic.release.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.acquire.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. 
PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst new file mode 100644 index 00000000000..c85f52ee302 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -0,0 +1,10 @@ +getctarank.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst new file mode 100644 index 00000000000..92cd106cad9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -0,0 +1,111 @@ +mbarrier.arrive.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); + +mbarrier.arrive.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. 
PTX ISA 78, SM_90 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst new file mode 100644 index 00000000000..0087ae2f458 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -0,0 +1,47 @@ +mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst new file mode 100644 index 00000000000..b6d7edbbeee --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -0,0 +1,9 @@ +mbarrier.arrive.noComplete.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst similarity index 88% rename from docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index 9b40db58d0c..b87d6f62a23 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: - -mbarrier.expect_tx -================== - -- PTX ISA: - `mbarrier.expect_tx `__ - mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst similarity index 50% rename from docs/libcudacxx/ptx/instructions/mbarrier.init.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 8c7e65eeab6..3e529d86d78 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-init: - -mbarrier.init -============= - -- PTX ISA: - `mbarrier.arrive `__ - mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst new file mode 100644 index 00000000000..4cb241c7ca8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. 
PTX ISA 70, SM_80 + template + __device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst new file mode 100644 index 00000000000..e750c4a543f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.parity.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 + template + __device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst new file mode 100644 index 00000000000..ce648c66ee9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. 
PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. 
PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst new file mode 100644 index 00000000000..3210dc0eab1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/red.async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/red.async.rst rename to docs/libcudacxx/ptx/instructions/generated/red_async.rst index 62599548a22..d6b9cf36549 100644 --- a/docs/libcudacxx/ptx/instructions/red.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-red-async: - -red.async -========= - -- PTX ISA: - `red.async `__ - -.. 
_red.async-1: - -red.async ---------- - red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -191,20 +178,3 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 int64_t* dest, const int64_t& value, int64_t* remote_bar); - -red.async ``.s64`` emulation ----------------------------- - -PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This -exposure is emulated in ``cuda::ptx`` using - -.. code:: cuda - - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 - // .op = { .add } - template - __device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst new file mode 100644 index 00000000000..aa1add84781 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst @@ -0,0 +1,383 @@ +tid.x +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_x(); + +tid.y +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_y(); + +tid.z +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_z(); + +ntid.x +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_x(); + +ntid.y +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_y(); + +ntid.z +^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_z(); + +laneid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%laneid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_laneid(); + +warpid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%warpid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_warpid(); + +nwarpid +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nwarpid(); + +ctaid.x +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_x(); + +ctaid.y +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_y(); + +ctaid.z +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_z(); + +nctaid.x +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_x(); + +nctaid.y +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_y(); + +nctaid.z +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_z(); + +smid +^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%smid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_smid(); + +nsmid +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nsmid(); + +gridid +^^^^^^ +.. 
code:: cuda + + // mov.u64 sreg_value, %%gridid; // PTX ISA 30 + template + __device__ static inline uint64_t get_sreg_gridid(); + +is_explicit_cluster +^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 + template + __device__ static inline bool get_sreg_is_explicit_cluster(); + +clusterid.x +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_x(); + +clusterid.y +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_y(); + +clusterid.z +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_z(); + +nclusterid.x +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_x(); + +nclusterid.y +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_y(); + +nclusterid.z +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_z(); + +cluster_ctaid.x +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); + +cluster_ctaid.y +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); + +cluster_ctaid.z +^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); + +cluster_nctaid.x +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); + +cluster_nctaid.y +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); + +cluster_nctaid.z +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); + +cluster_ctarank +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctarank(); + +cluster_nctarank +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctarank(); + +lanemask_eq +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_eq(); + +lanemask_le +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_le(); + +lanemask_lt +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_lt(); + +lanemask_ge +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_ge(); + +lanemask_gt +^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_gt(); + +clock +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock; // PTX ISA 10 + template + __device__ static inline uint32_t get_sreg_clock(); + +clock_hi +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 + template + __device__ static inline uint32_t get_sreg_clock_hi(); + +clock64 +^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 + template + __device__ static inline uint64_t get_sreg_clock64(); + +globaltimer +^^^^^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 + template + __device__ static inline uint64_t get_sreg_globaltimer(); + +globaltimer_lo +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_lo(); + +globaltimer_hi +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_hi(); + +total_smem_size +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_total_smem_size(); + +aggr_smem_size +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 + template + __device__ static inline uint32_t get_sreg_aggr_smem_size(); + +dynamic_smem_size +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_dynamic_smem_size(); + +current_graph_exec +^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 + template + __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/st.async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/st.async.rst rename to docs/libcudacxx/ptx/instructions/generated/st_async.rst index a2e1ebe46a6..c519ea57f70 100644 --- a/docs/libcudacxx/ptx/instructions/st.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-st-async: - -st.async -======== - -- PTX ISA: - `st.async `__ -- Used in: :ref:`How to use st.async ` - -**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For -instance, the ``addr`` supplied to the ``v2.b32`` variant must be -aligned to ``2 x 4 = 8`` bytes. - st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 1de158491a8..52fae102ad4 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-tensormap-cp_fenceproxy: - -tensormap.cp_fenceproxy -======================= - -- PTX ISA: - `tensormap.cp_fenceproxy `__ - tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst similarity index 97% rename from docs/libcudacxx/ptx/instructions/tensormap.replace.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 7d8b839584e..33e6f1d839a 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-tensormap-replace: - -tensormap.replace -================= - -- PTX ISA: - `tensormap.replace `__ - tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/getctarank.rst b/docs/libcudacxx/ptx/instructions/getctarank.rst index 5bad6259103..d355ed80929 100644 --- a/docs/libcudacxx/ptx/instructions/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/getctarank.rst @@ -6,13 +6,4 @@ getctarank - PTX ISA: `getctarank `__ -getctarank.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline uint32_t getctarank( - cuda::ptx::space_cluster_t, - const void* addr); +.. include:: generated/getctarank.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst deleted file mode 100644 index c383c59c6fd..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst +++ /dev/null @@ -1,232 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-arrive: - -mbarrier.arrive -=============== - -- PTX ISA: - `mbarrier.arrive `__ - -.. _mbarrier.arrive-1: - -mbarrier.arrive ---------------- - -Some of the listed PTX instructions below are semantically equivalent. 
-They differ in one important way: the shorter instructions are typically -supported on older compilers. - -mbarrier.arrive.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr); - -mbarrier.arrive.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.no_complete ---------------------------- - -mbarrier.arrive.noComplete.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.expect_tx -------------------------- - -mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); - -Usage ------ - -.. 
code:: cuda - - #include - #include - #include - - __global__ void kernel() { - using cuda::ptx::sem_release; - using cuda::ptx::space_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; - - using barrier_t = cuda::barrier; - __shared__ barrier_t bar; - init(&bar, blockDim.x); - __syncthreads(); - - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - // Arrive on local shared memory barrier: - uint64_t token; - token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); - - // Get address of remote cluster barrier: - namespace cg = cooperative_groups; - cg::cluster_group cluster = cg::this_cluster(); - unsigned int other_block_rank = cluster.block_rank() ^ 1; - uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); - - // Sync cluster to ensure remote barrier is initialized. - cluster.sync(); - - // Arrive on remote cluster barrier: - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); - ) - } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst deleted file mode 100644 index 23197e2eb7c..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-test_wait: - -mbarrier.test_wait -================== - -- PTX ISA: - `mbarrier.test_wait `__ - -.. _mbarrier.test_wait-1: - -mbarrier.test_wait ------------------- - -mbarrier.test_wait.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template - __device__ static inline bool mbarrier_test_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.parity -------------------------- - -mbarrier.test_wait.parity.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template - __device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst deleted file mode 100644 index 762f5e100d7..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst +++ /dev/null @@ -1,174 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-try_wait: - -mbarrier.try_wait -================= - -- PTX ISA: - `mbarrier.try_wait `__ - - -.. _mbarrier.try_wait-1: - -mbarrier.try_wait ------------------ - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity ------------------------- - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst new file mode 100644 index 00000000000..f01e7a95465 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst @@ -0,0 +1,68 @@ +.. _libcudacxx-ptx-instructions-mbarrier-arrive: + +mbarrier.arrive +=============== + +- PTX ISA: + `mbarrier.arrive `__ + +.. _mbarrier.arrive-1: + +mbarrier.arrive +--------------- + +Some of the listed PTX instructions below are semantically equivalent. +They differ in one important way: the shorter instructions are typically +supported on older compilers. + +.. include:: generated/mbarrier_arrive.rst + +mbarrier.arrive.no_complete +--------------------------- + +.. include:: generated/mbarrier_arrive_no_complete.rst + +mbarrier.arrive.expect_tx +------------------------- + +.. include:: generated/mbarrier_arrive_expect_tx.rst + +Usage +----- + +.. 
code:: cuda
+
+   #include 
+   #include 
+   #include 
+
+   __global__ void kernel() {
+     using cuda::ptx::sem_release;
+     using cuda::ptx::space_cluster;
+     using cuda::ptx::space_shared;
+     using cuda::ptx::scope_cluster;
+     using cuda::ptx::scope_cta;
+
+     using barrier_t = cuda::barrier;
+     __shared__ barrier_t bar;
+     init(&bar, blockDim.x);
+     __syncthreads();
+
+     NV_IF_TARGET(NV_PROVIDES_SM_90, (
+       // Arrive on local shared memory barrier:
+       uint64_t token;
+       token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
+
+       // Get address of remote cluster barrier:
+       namespace cg = cooperative_groups;
+       cg::cluster_group cluster = cg::this_cluster();
+       unsigned int other_block_rank = cluster.block_rank() ^ 1;
+       uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank);
+
+       // Sync cluster to ensure remote barrier is initialized.
+       cluster.sync();
+
+       // Arrive on remote cluster barrier:
+       cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1);
+     )
+   } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst new file mode 100644 index 00000000000..6c34813242f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: + +mbarrier.expect_tx +================== + +- PTX ISA: + `mbarrier.expect_tx `__ + +.. include:: generated/mbarrier_expect_tx.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst new file mode 100644 index 00000000000..a736f53b0a2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-mbarrier-init: + +mbarrier.init +============= + +- PTX ISA: + `mbarrier.init `__ + +.. 
include:: generated/mbarrier_init.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst new file mode 100644 index 00000000000..d8a4e79473e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst @@ -0,0 +1,19 @@ +.. _libcudacxx-ptx-instructions-mbarrier-test_wait: + +mbarrier.test_wait +================== + +- PTX ISA: + `mbarrier.test_wait `__ + +.. _mbarrier.test_wait-1: + +mbarrier.test_wait +------------------ + +.. include:: generated/mbarrier_test_wait.rst + +mbarrier.test_wait.parity +------------------------- + +.. include:: generated/mbarrier_test_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst new file mode 100644 index 00000000000..1869695f3f6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst @@ -0,0 +1,20 @@ +.. _libcudacxx-ptx-instructions-mbarrier-try_wait: + +mbarrier.try_wait +================= + +- PTX ISA: + `mbarrier.try_wait `__ + + +.. _mbarrier.try_wait-1: + +mbarrier.try_wait +----------------- + +.. include:: generated/mbarrier_try_wait.rst + +mbarrier.try_wait.parity +------------------------ + +.. include:: generated/mbarrier_try_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/red_async.rst b/docs/libcudacxx/ptx/instructions/red_async.rst new file mode 100644 index 00000000000..82ba07c38de --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/red_async.rst @@ -0,0 +1,31 @@ +.. _libcudacxx-ptx-instructions-mbarrier-red-async: + +red.async +========= + +- PTX ISA: + `red.async `__ + +.. _red.async-1: + +red.async +--------- + +.. include:: generated/red_async.rst + +red.async ``.s64`` emulation +---------------------------- + +PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This +exposure is emulated in ``cuda::ptx`` using + +.. 
code:: cuda + + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // .op = { .add } + template + __device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst index 375ce44622e..1e9597fa726 100644 --- a/docs/libcudacxx/ptx/instructions/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/special_registers.rst @@ -6,386 +6,4 @@ Special registers - PTX ISA: `Special Register `__ -tid.x -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_x(); - -tid.y -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_y(); - -tid.z -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_z(); - -ntid.x -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_x(); - -ntid.y -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_y(); - -ntid.z -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_z(); - -laneid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_laneid(); - -warpid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_warpid(); - -nwarpid -^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nwarpid(); - -ctaid.x -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_x(); - -ctaid.y -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_y(); - -ctaid.z -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_z(); - -nctaid.x -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_x(); - -nctaid.y -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_y(); - -nctaid.z -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_z(); - -smid -^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_smid(); - -nsmid -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nsmid(); - -gridid -^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template - __device__ static inline uint64_t get_sreg_gridid(); - -is_explicit_cluster -^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template - __device__ static inline bool get_sreg_is_explicit_cluster(); - -clusterid.x -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_x(); - -clusterid.y -^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_y(); - -clusterid.z -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_z(); - -nclusterid.x -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_x(); - -nclusterid.y -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_y(); - -nclusterid.z -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_z(); - -cluster_ctaid.x -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); - -cluster_ctaid.y -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); - -cluster_ctaid.z -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); - -cluster_nctaid.x -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); - -cluster_nctaid.y -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); - -cluster_nctaid.z -^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); - -cluster_ctarank -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctarank(); - -cluster_nctarank -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctarank(); - -lanemask_eq -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_eq(); - -lanemask_le -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_le(); - -lanemask_lt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_lt(); - -lanemask_ge -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_ge(); - -lanemask_gt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_gt(); - -clock -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template - __device__ static inline uint32_t get_sreg_clock(); - -clock_hi -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template - __device__ static inline uint32_t get_sreg_clock_hi(); - -clock64 -^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template - __device__ static inline uint64_t get_sreg_clock64(); - -globaltimer -^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template - __device__ static inline uint64_t get_sreg_globaltimer(); - -globaltimer_lo -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_lo(); - -globaltimer_hi -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_hi(); - -total_smem_size -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_total_smem_size(); - -aggr_smem_size -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template - __device__ static inline uint32_t get_sreg_aggr_smem_size(); - -dynamic_smem_size -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_dynamic_smem_size(); - -current_graph_exec -^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template - __device__ static inline uint64_t get_sreg_current_graph_exec(); +.. include:: generated/special_registers.rst diff --git a/docs/libcudacxx/ptx/instructions/st_async.rst b/docs/libcudacxx/ptx/instructions/st_async.rst new file mode 100644 index 00000000000..c71aebd7da3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/st_async.rst @@ -0,0 +1,14 @@ +.. _libcudacxx-ptx-instructions-st-async: + +st.async +======== + +- PTX ISA: + `st.async `__ +- Used in: :ref:`How to use st.async ` + +**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For +instance, the ``addr`` supplied to the ``v2.b32`` variant must be +aligned to ``2 x 4 = 8`` bytes. + +.. 
include:: generated/st_async.rst
diff --git a/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst
new file mode 100644
index 00000000000..2f7622bba2c
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tensormap-cp_fenceproxy:
+
+tensormap.cp_fenceproxy
+=======================
+
+- PTX ISA:
+  `tensormap.cp_fenceproxy <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy>`__
+
+.. include:: generated/tensormap_cp_fenceproxy.rst
diff --git a/docs/libcudacxx/ptx/instructions/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst
new file mode 100644
index 00000000000..331dcff313a
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tensormap-replace:
+
+tensormap.replace
+=================
+
+- PTX ISA:
+  `tensormap.replace <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensormap-instructions-tensormap-replace>`__
+
+.. include:: generated/tensormap_replace.rst