diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 136dfb81fc3..3bdef7655c7 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -7,6 +7,7 @@ PTX Instructions :maxdepth: 1 instructions/barrier_cluster + instructions/bfind instructions/clusterlaunchcontrol instructions/cp_async_bulk instructions/cp_async_bulk_commit_group @@ -15,6 +16,7 @@ PTX Instructions instructions/cp_async_mbarrier_arrive instructions/cp_reduce_async_bulk instructions/cp_reduce_async_bulk_tensor + instructions/exit instructions/fence instructions/getctarank instructions/mapa @@ -41,6 +43,7 @@ PTX Instructions instructions/tcgen05_wait instructions/tensormap_replace instructions/tensormap_cp_fenceproxy + instructions/trap instructions/special_registers @@ -72,7 +75,7 @@ Instructions by section * - `clz `__ - No * - `bfind `__ - - No + - CCCL 3.0.0 * - `fns `__ - No * - `brev `__ @@ -365,7 +368,7 @@ Instructions by section * - `ret `__ - No * - `exit `__ - - No + - CCCL 3.0.0 .. list-table:: `Parallel Synchronization and Communication Instructions `__ :widths: 50 50 @@ -529,7 +532,7 @@ Instructions by section * - `pmevent `__ - No * - `trap `__ - - No + - CCCL 3.0.0 * - `setmaxnreg `__ - No diff --git a/docs/libcudacxx/ptx/instructions/bfind.rst b/docs/libcudacxx/ptx/instructions/bfind.rst new file mode 100644 index 00000000000..694cdef27fe --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/bfind.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-bfind: + +bfind +===== + +- PTX ISA: + `bfind `__ + +.. include:: generated/bfind.rst diff --git a/docs/libcudacxx/ptx/instructions/exit.rst b/docs/libcudacxx/ptx/instructions/exit.rst new file mode 100644 index 00000000000..899735f1660 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/exit.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-exit: + +exit +==== + +- PTX ISA: + `exit `__ + +.. include:: generated/exit.rst diff --git a/docs/libcudacxx/ptx/instructions/generated/bfind.rst b/docs/libcudacxx/ptx/instructions/generated/bfind.rst new file mode 100644 index 00000000000..4fd6c35e34b --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/bfind.rst @@ -0,0 +1,74 @@ +.. + This file was automatically generated. Do not edit. + +bfind.u32 +^^^^^^^^^ +.. code:: cuda + + // bfind.u32 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind( + uint32_t a_reg); + +bfind.shiftamt.u32 +^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // bfind.shiftamt.u32 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind_shiftamt( + uint32_t a_reg); + +bfind.u64 +^^^^^^^^^ +.. code:: cuda + + // bfind.u64 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind( + uint64_t a_reg); + +bfind.shiftamt.u64 +^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // bfind.shiftamt.u64 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind_shiftamt( + uint64_t a_reg); + +bfind.s32 +^^^^^^^^^ +.. code:: cuda + + // bfind.s32 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind( + int32_t a_reg); + +bfind.shiftamt.s32 +^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // bfind.shiftamt.s32 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind_shiftamt( + int32_t a_reg); + +bfind.s64 +^^^^^^^^^ +.. code:: cuda + + // bfind.s64 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind( + int64_t a_reg); + +bfind.shiftamt.s64 +^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // bfind.shiftamt.s64 dest, a_reg; // PTX ISA 20, SM_50 + template + __device__ static inline uint32_t bfind_shiftamt( + int64_t a_reg); diff --git a/docs/libcudacxx/ptx/instructions/generated/exit.rst b/docs/libcudacxx/ptx/instructions/generated/exit.rst new file mode 100644 index 00000000000..f8739ac2207 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/exit.rst @@ -0,0 +1,10 @@ +.. + This file was automatically generated. Do not edit. + +exit +^^^^ +.. code:: cuda + + // exit; // PTX ISA 10, SM_50 + template + __device__ static inline void exit(); diff --git a/docs/libcudacxx/ptx/instructions/generated/trap.rst b/docs/libcudacxx/ptx/instructions/generated/trap.rst new file mode 100644 index 00000000000..0fdad168b17 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/trap.rst @@ -0,0 +1,10 @@ +.. + This file was automatically generated. Do not edit. + +trap +^^^^ +.. code:: cuda + + // trap; // PTX ISA 10, SM_50 + template + __device__ static inline void trap(); diff --git a/docs/libcudacxx/ptx/instructions/trap.rst b/docs/libcudacxx/ptx/instructions/trap.rst new file mode 100644 index 00000000000..54b641f357e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/trap.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-trap: + +trap +==== + +- PTX ISA: + `trap `__ + +.. include:: generated/trap.rst diff --git a/docs/repo.toml b/docs/repo.toml index 08ce4e58775..7ff29fd6eba 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -54,7 +54,7 @@ api_output_directory = "api" use_fast_doxygen_conversion = true sphinx_generate_doxygen_groups = true sphinx_generate_doxygen_pages = true -sphinx_exclude_patterns = ['ptx/instructions/generated'] +sphinx_exclude_patterns = [] [repo_docs.projects.cub] name = "CUB" diff --git a/libcudacxx/codegen/add_ptx_instruction.py b/libcudacxx/codegen/add_ptx_instruction.py old mode 100644 new mode 100755 diff --git a/libcudacxx/include/cuda/__ptx/instructions/bfind.h b/libcudacxx/include/cuda/__ptx/instructions/bfind.h new file mode 100644 index 00000000000..af8b5de98f8 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/bfind.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_BFIND_H_ +#define _CUDA_PTX_BFIND_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_BFIND_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/exit.h b/libcudacxx/include/cuda/__ptx/instructions/exit.h new file mode 100644 index 00000000000..6ae28b71e54 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/exit.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_EXIT_H_ +#define _CUDA_PTX_EXIT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_EXIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/bfind.h b/libcudacxx/include/cuda/__ptx/instructions/generated/bfind.h new file mode 100644 index 00000000000..ca0dea37504 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/bfind.h @@ -0,0 +1,190 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_BFIND_H_ +#define _CUDA_PTX_GENERATED_BFIND_H_ + +/* +// bfind.u32 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind( + uint32_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::uint32_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.u32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.shiftamt.u32 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind_shiftamt( + uint32_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::uint32_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.shiftamt.u32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.u64 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind( + uint64_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::uint64_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.u64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.shiftamt.u64 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind_shiftamt( + uint64_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::uint64_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.shiftamt.u64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.s32 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind( + int32_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::int32_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.s32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.shiftamt.s32 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind_shiftamt( + int32_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::int32_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.shiftamt.s32 %0, %1;" : "=r"(__dest) : "r"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.s64 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind( + int64_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind(_CUDA_VSTD::int64_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.s64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +/* +// bfind.shiftamt.s64 dest, a_reg; // PTX ISA 20, SM_50 +template +__device__ static inline uint32_t bfind_shiftamt( + int64_t a_reg); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t bfind_shiftamt(_CUDA_VSTD::int64_t __a_reg) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint32_t __dest; + asm("bfind.shiftamt.s64 %0, %1;" : "=r"(__dest) : "l"(__a_reg) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_bfind_shiftamt_is_not_supported_before_SM_50__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 200 + +#endif // _CUDA_PTX_GENERATED_BFIND_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/exit.h b/libcudacxx/include/cuda/__ptx/instructions/generated/exit.h new file mode 100644 index 00000000000..d5ff7ac3e88 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/exit.h @@ -0,0 +1,25 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_EXIT_H_ +#define _CUDA_PTX_GENERATED_EXIT_H_ + +/* +// exit; // PTX ISA 10, SM_50 +template +__device__ static inline void exit(); +*/ +#if __cccl_ptx_isa >= 100 +extern "C" _CCCL_DEVICE void __cuda_ptx_exit_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline void exit() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + asm volatile("exit;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_exit_is_not_supported_before_SM_50__(); +# endif +} +#endif // __cccl_ptx_isa >= 100 + +#endif // _CUDA_PTX_GENERATED_EXIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/trap.h b/libcudacxx/include/cuda/__ptx/instructions/generated/trap.h new file mode 100644 index 00000000000..b729185d0b8 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/trap.h @@ -0,0 +1,25 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TRAP_H_ +#define _CUDA_PTX_GENERATED_TRAP_H_ + +/* +// trap; // PTX ISA 10, SM_50 +template +__device__ static inline void trap(); +*/ +#if __cccl_ptx_isa >= 100 +extern "C" _CCCL_DEVICE void __cuda_ptx_trap_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline void trap() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + asm volatile("trap;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_trap_is_not_supported_before_SM_50__(); +# endif +} +#endif // __cccl_ptx_isa >= 100 + +#endif // _CUDA_PTX_GENERATED_TRAP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/trap.h b/libcudacxx/include/cuda/__ptx/instructions/trap.h new file mode 100644 index 00000000000..08ca0a0ec9e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/trap.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TRAP_H_ +#define _CUDA_PTX_TRAP_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TRAP_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 971288b456c..9b021262707 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -70,6 +70,7 @@ */ #include +#include #include #include #include @@ -78,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -103,5 +105,6 @@ #include #include #include +#include #endif // _CUDA_PTX diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/bfind.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/bfind.h new file mode 100644 index 00000000000..1543a8fbb4b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/bfind.h @@ -0,0 +1,78 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_bfind(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_50, + ( + // bfind.u32 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // bfind.shiftamt.u32 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind_shiftamt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_50, + ( + // bfind.u64 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // bfind.shiftamt.u64 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind_shiftamt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_50, + ( + // bfind.s32 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // bfind.shiftamt.s32 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind_shiftamt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_50, + ( + // bfind.s64 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // bfind.shiftamt.s64 dest, a_reg; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::bfind_shiftamt));)); +#endif // __cccl_ptx_isa >= 200 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/exit.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/exit.h new file mode 100644 index 00000000000..9b25d68177d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/exit.h @@ -0,0 +1,25 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_exit(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 100 + NV_IF_TARGET(NV_PROVIDES_SM_50, + ( + // exit; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::exit));)); +#endif // __cccl_ptx_isa >= 100 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/trap.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/trap.h new file mode 100644 index 00000000000..22c90cf10d1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/trap.h @@ -0,0 +1,25 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_trap(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 100 + NV_IF_TARGET(NV_PROVIDES_SM_50, + ( + // trap; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::trap));)); +#endif // __cccl_ptx_isa >= 100 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.bfind.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.bfind.compile.pass.cpp new file mode 100644 index 00000000000..77f70363cec --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.bfind.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/bfind.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.exit.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.exit.compile.pass.cpp new file mode 100644 index 00000000000..568cd75b9fa --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.exit.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/exit.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.trap.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.trap.compile.pass.cpp new file mode 100644 index 00000000000..cd9032033f5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.trap.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/trap.h" + +int main(int, char**) +{ + return 0; +}