diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index 136dfb81fc3..3bdef7655c7 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -7,6 +7,7 @@ PTX Instructions
    :maxdepth: 1
 
    instructions/barrier_cluster
+   instructions/bfind
    instructions/clusterlaunchcontrol
    instructions/cp_async_bulk
    instructions/cp_async_bulk_commit_group
@@ -15,6 +16,7 @@ PTX Instructions
    instructions/cp_async_mbarrier_arrive
    instructions/cp_reduce_async_bulk
    instructions/cp_reduce_async_bulk_tensor
+   instructions/exit
    instructions/fence
    instructions/getctarank
    instructions/mapa
@@ -41,6 +43,7 @@ PTX Instructions
    instructions/tcgen05_wait
    instructions/tensormap_replace
    instructions/tensormap_cp_fenceproxy
+   instructions/trap
    instructions/special_registers
 
 
@@ -72,7 +75,7 @@ Instructions by section
    * - `clz <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz>`__
      - No
    * - `bfind <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind>`__
-     - No
+     - CCCL 3.0.0
    * - `fns <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns>`__
      - No
    * - `brev <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev>`__
@@ -365,7 +368,7 @@ Instructions by section
    * - `ret <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret>`__
      - No
    * - `exit <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit>`__
-     - No
+     - CCCL 3.0.0
 
 .. list-table:: `Parallel Synchronization and Communication Instructions <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions>`__
    :widths: 50 50
@@ -529,7 +532,7 @@ Instructions by section
    * - `pmevent <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent>`__
      - No
    * - `trap <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap>`__
-     - No
+     - CCCL 3.0.0
    * - `setmaxnreg <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg>`__
      - No
 
diff --git a/docs/libcudacxx/ptx/instructions/bfind.rst b/docs/libcudacxx/ptx/instructions/bfind.rst
new file mode 100644
index 00000000000..694cdef27fe
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/bfind.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-bfind:
+
+bfind
+=====
+
+-  PTX ISA:
+   `bfind <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind>`__
+
+.. include:: generated/bfind.rst
diff --git a/docs/libcudacxx/ptx/instructions/exit.rst b/docs/libcudacxx/ptx/instructions/exit.rst
new file mode 100644
index 00000000000..899735f1660
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/exit.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-exit:
+
+exit
+====
+
+-  PTX ISA:
+   `exit <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit>`__
+
+.. include:: generated/exit.rst
diff --git a/docs/libcudacxx/ptx/instructions/generated/bfind.rst b/docs/libcudacxx/ptx/instructions/generated/bfind.rst
new file mode 100644
index 00000000000..4fd6c35e34b
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/bfind.rst
@@ -0,0 +1,74 @@
+..
+   This file was automatically generated. Do not edit.
+
+bfind.u32
+^^^^^^^^^
+.. code:: cuda
+
+   // bfind.u32 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind(
+     uint32_t a_reg);
+
+bfind.shiftamt.u32
+^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // bfind.shiftamt.u32 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind_shiftamt(
+     uint32_t a_reg);
+
+bfind.u64
+^^^^^^^^^
+.. code:: cuda
+
+   // bfind.u64 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind(
+     uint64_t a_reg);
+
+bfind.shiftamt.u64
+^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // bfind.shiftamt.u64 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind_shiftamt(
+     uint64_t a_reg);
+
+bfind.s32
+^^^^^^^^^
+.. code:: cuda
+
+   // bfind.s32 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind(
+     int32_t a_reg);
+
+bfind.shiftamt.s32
+^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // bfind.shiftamt.s32 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind_shiftamt(
+     int32_t a_reg);
+
+bfind.s64
+^^^^^^^^^
+.. code:: cuda
+
+   // bfind.s64 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind(
+     int64_t a_reg);
+
+bfind.shiftamt.s64
+^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // bfind.shiftamt.s64 dest, a_reg; // PTX ISA 20, SM_50
+   template <typename = void>
+   __device__ static inline uint32_t bfind_shiftamt(
+     int64_t a_reg);
diff --git a/docs/libcudacxx/ptx/instructions/generated/exit.rst b/docs/libcudacxx/ptx/instructions/generated/exit.rst
new file mode 100644
index 00000000000..f8739ac2207
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/exit.rst
@@ -0,0 +1,10 @@
+..
+   This file was automatically generated. Do not edit.
+
+exit
+^^^^
+.. code:: cuda
+
+   // exit; // PTX ISA 10, SM_50
+   template <typename = void>
+   __device__ static inline void exit();
diff --git a/docs/libcudacxx/ptx/instructions/generated/trap.rst b/docs/libcudacxx/ptx/instructions/generated/trap.rst
new file mode 100644
index 00000000000..0fdad168b17
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/trap.rst
@@ -0,0 +1,10 @@
+..
+   This file was automatically generated. Do not edit.
+
+trap
+^^^^
+.. code:: cuda
+
+   // trap; // PTX ISA 10, SM_50
+   template <typename = void>
+   __device__ static inline void trap();
diff --git a/docs/libcudacxx/ptx/instructions/trap.rst b/docs/libcudacxx/ptx/instructions/trap.rst
new file mode 100644
index 00000000000..54b641f357e
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/trap.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-trap:
+
+trap
+====
+
+-  PTX ISA:
+   `trap <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap>`__
+
+.. include:: generated/trap.rst
diff --git a/docs/repo.toml b/docs/repo.toml
index 08ce4e58775..7ff29fd6eba 100644
--- a/docs/repo.toml
+++ b/docs/repo.toml
@@ -54,7 +54,7 @@ api_output_directory = "api"
 use_fast_doxygen_conversion = true
 sphinx_generate_doxygen_groups = true
 sphinx_generate_doxygen_pages = true
-sphinx_exclude_patterns = ['ptx/instructions/generated']
+sphinx_exclude_patterns = []
 
 [repo_docs.projects.cub]
 name = "CUB"
diff --git a/libcudacxx/codegen/add_ptx_instruction.py b/libcudacxx/codegen/add_ptx_instruction.py
old mode 100644
new mode 100755
diff --git a/libcudacxx/include/cuda/__ptx/instructions/bfind.h b/libcudacxx/include/cuda/__ptx/instructions/bfind.h
new file mode 100644
index 00000000000..af8b5de98f8
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/bfind.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_BFIND_H_
+#define _CUDA_PTX_BFIND_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/bfind.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_BFIND_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/exit.h b/libcudacxx/include/cuda/__ptx/instructions/exit.h
new file mode 100644
index 00000000000..6ae28b71e54
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/exit.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_EXIT_H_
+#define _CUDA_PTX_EXIT_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/exit.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_EXIT_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/bfind.h b/libcudacxx/include/cuda/__ptx/instructions/generated/bfind.h
new file mode 100644
index 00000000000..ca0dea37504
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/bfind.h
@@ -0,0 +1,190 @@
+// This file was automatically generated. Do not edit.
+
+#ifndef _CUDA_PTX_GENERATED_BFIND_H_
+#define _CUDA_PTX_GENERATED_BFIND_H_
+
+/*
+// bfind.u32 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind(
+  uint32_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::uint32_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.u32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.shiftamt.u32 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind_shiftamt(
+  uint32_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::uint32_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.shiftamt.u32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.u64 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind(
+  uint64_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::uint64_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.u64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.shiftamt.u64 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind_shiftamt(
+  uint64_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::uint64_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.shiftamt.u64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.s32 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind(
+  int32_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::int32_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.s32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.shiftamt.s32 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind_shiftamt(
+  int32_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::int32_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.shiftamt.s32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.s64 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind(
+  int64_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::int64_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.s64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+/*
+// bfind.shiftamt.s64 dest, a_reg; // PTX ISA 20, SM_50
+template <typename = void>
+__device__ static inline uint32_t bfind_shiftamt(
+  int64_t a_reg);
+*/
+#if __cccl_ptx_isa >= 200
+extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::int64_t __a_reg)
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  _CUDA_VSTD::uint32_t __dest;
+  asm("bfind.shiftamt.s64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :);
+  return __dest;
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__();
+  return 0;
+#  endif
+}
+#endif // __cccl_ptx_isa >= 200
+
+#endif // _CUDA_PTX_GENERATED_BFIND_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/exit.h b/libcudacxx/include/cuda/__ptx/instructions/generated/exit.h
new file mode 100644
index 00000000000..d5ff7ac3e88
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/exit.h
@@ -0,0 +1,25 @@
+// This file was automatically generated. Do not edit.
+
+#ifndef _CUDA_PTX_GENERATED_EXIT_H_
+#define _CUDA_PTX_GENERATED_EXIT_H_
+
+/*
+// exit; // PTX ISA 10, SM_50
+template <typename = void>
+__device__ static inline void exit();
+*/
+#if __cccl_ptx_isa >= 100
+extern "C" _CCCL_DEVICE void __cuda_ptx_exit_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline void exit()
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  asm volatile("exit;" : : :);
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_exit_is_not_supported_before_SM_50__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 100
+
+#endif // _CUDA_PTX_GENERATED_EXIT_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/trap.h b/libcudacxx/include/cuda/__ptx/instructions/generated/trap.h
new file mode 100644
index 00000000000..b729185d0b8
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/trap.h
@@ -0,0 +1,25 @@
+// This file was automatically generated. Do not edit.
+
+#ifndef _CUDA_PTX_GENERATED_TRAP_H_
+#define _CUDA_PTX_GENERATED_TRAP_H_
+
+/*
+// trap; // PTX ISA 10, SM_50
+template <typename = void>
+__device__ static inline void trap();
+*/
+#if __cccl_ptx_isa >= 100
+extern "C" _CCCL_DEVICE void __cuda_ptx_trap_is_not_supported_before_SM_50__();
+template <typename = void>
+_CCCL_DEVICE static inline void trap()
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
+  asm volatile("trap;" : : :);
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_trap_is_not_supported_before_SM_50__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 100
+
+#endif // _CUDA_PTX_GENERATED_TRAP_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/trap.h b/libcudacxx/include/cuda/__ptx/instructions/trap.h
new file mode 100644
index 00000000000..08ca0a0ec9e
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/trap.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TRAP_H_
+#define _CUDA_PTX_TRAP_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/trap.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TRAP_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 971288b456c..9b021262707 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -70,6 +70,7 @@
  */
 
 #include <cuda/__ptx/instructions/barrier_cluster.h>
+#include <cuda/__ptx/instructions/bfind.h>
 #include <cuda/__ptx/instructions/clusterlaunchcontrol.h>
 #include <cuda/__ptx/instructions/cp_async_bulk.h>
 #include <cuda/__ptx/instructions/cp_async_bulk_commit_group.h>
@@ -78,6 +79,7 @@
 #include <cuda/__ptx/instructions/cp_async_mbarrier_arrive.h>
 #include <cuda/__ptx/instructions/cp_reduce_async_bulk.h>
 #include <cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h>
+#include <cuda/__ptx/instructions/exit.h>
 #include <cuda/__ptx/instructions/fence.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/__ptx/instructions/getctarank.h>
@@ -103,5 +105,6 @@
 #include <cuda/__ptx/instructions/tcgen05_wait.h>
 #include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
 #include <cuda/__ptx/instructions/tensormap_replace.h>
+#include <cuda/__ptx/instructions/trap.h>
 
 #endif // _CUDA_PTX
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/bfind.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/bfind.h
new file mode 100644
index 00000000000..1543a8fbb4b
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/bfind.h
@@ -0,0 +1,78 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_bfind(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(NV_PROVIDES_SM_50,
+               (
+                   // bfind.u32 dest, a_reg;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(uint32_t)>(cuda::ptx::bfind));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_50,
+    (
+        // bfind.shiftamt.u32 dest, a_reg;
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(uint32_t)>(cuda::ptx::bfind_shiftamt));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(NV_PROVIDES_SM_50,
+               (
+                   // bfind.u64 dest, a_reg;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(uint64_t)>(cuda::ptx::bfind));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_50,
+    (
+        // bfind.shiftamt.u64 dest, a_reg;
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(uint64_t)>(cuda::ptx::bfind_shiftamt));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(NV_PROVIDES_SM_50,
+               (
+                   // bfind.s32 dest, a_reg;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(int32_t)>(cuda::ptx::bfind));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_50,
+    (
+        // bfind.shiftamt.s32 dest, a_reg;
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(int32_t)>(cuda::ptx::bfind_shiftamt));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(NV_PROVIDES_SM_50,
+               (
+                   // bfind.s64 dest, a_reg;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(int64_t)>(cuda::ptx::bfind));));
+#endif // __cccl_ptx_isa >= 200
+
+#if __cccl_ptx_isa >= 200
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_50,
+    (
+        // bfind.shiftamt.s64 dest, a_reg;
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(int64_t)>(cuda::ptx::bfind_shiftamt));));
+#endif // __cccl_ptx_isa >= 200
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/exit.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/exit.h
new file mode 100644
index 00000000000..9b25d68177d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/exit.h
@@ -0,0 +1,25 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_exit(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 100
+  NV_IF_TARGET(NV_PROVIDES_SM_50,
+               (
+                   // exit;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::exit));));
+#endif // __cccl_ptx_isa >= 100
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/trap.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/trap.h
new file mode 100644
index 00000000000..22c90cf10d1
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/trap.h
@@ -0,0 +1,25 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_trap(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 100
+  NV_IF_TARGET(NV_PROVIDES_SM_50,
+               (
+                   // trap;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::trap));));
+#endif // __cccl_ptx_isa >= 100
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.bfind.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.bfind.compile.pass.cpp
new file mode 100644
index 00000000000..77f70363cec
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.bfind.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/bfind.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.exit.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.exit.compile.pass.cpp
new file mode 100644
index 00000000000..568cd75b9fa
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.exit.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/exit.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.trap.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.trap.compile.pass.cpp
new file mode 100644
index 00000000000..cd9032033f5
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.trap.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/trap.h"
+
+int main(int, char**)
+{
+  return 0;
+}