From 497843a98d0ec855c598b6280a86a2ce81bb2af9 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 26 Feb 2025 10:36:25 +0100
Subject: [PATCH] Drop deprecated features from CUB util_ptx.cuh (#3935)

---
 cub/cub/util_ptx.cuh | 269 -------------------------------------------
 1 file changed, 269 deletions(-)

diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh
index 206e815a761..d53a6cdc70a 100644
--- a/cub/cub/util_ptx.cuh
+++ b/cub/cub/util_ptx.cuh
@@ -52,28 +52,6 @@ CUB_NAMESPACE_BEGIN
  * Inlined PTX intrinsics
  ******************************************************************************/
 
-/**
- * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
-{
-  unsigned int ret;
-  asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-  return ret;
-}
-
-/**
- * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
-{
-  unsigned int ret;
-  asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-  return ret;
-}
-
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 
 /**
@@ -124,135 +102,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in
   return BFE(source, bit_start, num_bits, detail::constant_v<int{sizeof(UnsignedBits)}>);
 }
 
-/**
- * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE void
-BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
-{
-  asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
-}
-
-/**
- * \brief Three-operand add.  Returns \p x + \p y + \p z.
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
-{
-  asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
-  return x;
-}
-
-/**
- * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit
- * destination register.  For SM2.0 or later.
- *
- * \par
- * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
- * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
- * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
- * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
- *
- * \par Snippet
- * The code snippet below illustrates byte-permute.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     int a        = 0x03020100;
- *     int b        = 0x07060504;
- *     int index    = 0x00007531;
- *
- *     int selected = PRMT(a, b, index);    // 0x07050301
- *
- * \endcode
- *
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index)
-{
-  int ret;
-  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
-  return ret;
-}
-
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 
-/**
- * Sync-threads barrier.
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count)
-{
-  asm volatile("bar.sync 1, %0;" : : "r"(count));
-}
-
-/**
- * CTA barrier
- */
-CCCL_DEPRECATED_BECAUSE("use __syncthreads() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC()
-{
-  __syncthreads();
-}
-
-/**
- * CTA barrier with predicate
- */
-CCCL_DEPRECATED_BECAUSE("use __syncthreads_and() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p)
-{
-  return __syncthreads_and(p);
-}
-
-/**
- * CTA barrier with predicate
- */
-CCCL_DEPRECATED_BECAUSE("use __syncthreads_or() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p)
-{
-  return __syncthreads_or(p);
-}
-
-/**
- * Warp barrier
- */
-CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask)
-{
-  __syncwarp(member_mask);
-}
-
-/**
- * Warp any
- */
-CCCL_DEPRECATED_BECAUSE("use __any_sync() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask)
-{
-  return __any_sync(member_mask, predicate);
-}
-
-/**
- * Warp any
- */
-CCCL_DEPRECATED_BECAUSE("use __all_sync() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask)
-{
-  return __all_sync(member_mask, predicate);
-}
-
-/**
- * Warp ballot
- */
-CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask)
-{
-  return __ballot_sync(member_mask, predicate);
-}
-
 /**
  * Warp synchronous shfl_up
  */
@@ -277,50 +128,6 @@ SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member
   return word;
 }
 
-/**
- * Warp synchronous shfl_idx
- */
-CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int
-SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
-{
-  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
-               : "=r"(word)
-               : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
-  return word;
-}
-
-/**
- * Warp synchronous shfl_idx
- */
-CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask)
-{
-  return __shfl_sync(member_mask, word, src_lane);
-}
-
-/**
- * Floating point multiply. (Mantissa LSB rounds towards zero.)
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b)
-{
-  float d;
-  asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
-  return d;
-}
-
-/**
- * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
- */
-CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
-_CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c)
-{
-  float d;
-  asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
-  return d;
-}
-
 #endif // _CCCL_DOXYGEN_INVOKED
 
 /**
@@ -331,15 +138,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit()
   asm volatile("exit;");
 }
 
-/**
- * \brief  Abort execution and generate an interrupt to the host CPU
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::std::terminate() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap()
-{
-  asm volatile("trap;");
-}
-
 /**
  * \brief Returns the row-major linear thread identifier for a multidimensional thread block
  */
@@ -349,29 +147,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y,
        + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + threadIdx.x;
 }
 
-/**
- * \brief Returns the warp lane ID of the calling thread
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId()
-{
-  unsigned int ret;
-  asm("mov.u32 %0, %%laneid;" : "=r"(ret));
-  return ret;
-}
-
-/**
- * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not
- * correspond to a zero-based ranking within the thread block.
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId()
-{
-  unsigned int ret;
-  asm("mov.u32 %0, %%warpid;" : "=r"(ret));
-  return ret;
-}
-
 /**
  * @brief Returns the warp mask for a warp of @p LOGICAL_WARP_THREADS threads
  *
@@ -401,50 +176,6 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id)
   return member_mask;
 }
 
-/**
- * \brief Returns the warp lane mask of all lanes less than the calling thread
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt()
-{
-  unsigned int ret;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret));
-  return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe()
-{
-  unsigned int ret;
-  asm("mov.u32 %0, %%lanemask_le;" : "=r"(ret));
-  return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than the calling thread
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt()
-{
-  unsigned int ret;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(ret));
-  return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
- */
-CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead")
-_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe()
-{
-  unsigned int ret;
-  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(ret));
-  return ret;
-}
-
 /**
  * @brief Shuffle-up for any data type.
  *        Each <em>warp-lane<sub>i</sub></em> obtains the value @p input contributed by