Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Review CUB util.ptx for CCCL 2.x #3342

Merged
merged 18 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -424,12 +424,11 @@ public:
for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
{
// In case the bit-offset of the counter at <index> is larger than the bit range of the
// current unit, the bit_shift amount will be larger than the bits provided by this unit. As
// C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
// we use the PTX instruction `shr` to make sure behaviour is well-defined.
// Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
// current unit, the bit_shift amount will be larger than the bits provided by this unit.
// C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width.
// The bit_shift is a run-time value, it is translated into SASS `shr` and the result behavior is well-defined.
const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK;
val |= (data[i] >> bit_shift) & ITEM_MASK;
fbusato marked this conversation as resolved.
Show resolved Hide resolved
}
return val;
}
Expand All @@ -442,12 +441,11 @@ public:
for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
{
// In case the bit-offset of the counter at <index> is larger than the bit range of the
// current unit, the bit_shift amount will be larger than the bits provided by this unit. As
// C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
// we use the PTX instruction `shl` to make sure behaviour is well-defined.
// Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
// current unit, the bit_shift amount will be larger than the bits provided by this unit.
// C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width.
// The bit_shift is a run-time value, it is translated into SASS `shl` and the result behavior is well-defined.
const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK;
data[i] += (value << bit_shift) & UNIT_MASK;
}
}

Expand Down
4 changes: 3 additions & 1 deletion cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

template <int _BLOCK_THREADS, int _ITEMS_PER_THREAD, int NOMINAL_4B_NUM_PARTS, typename ComputeT, int _RADIX_BITS>
Expand Down Expand Up @@ -199,7 +201,7 @@ struct AgentRadixSortHistogram
_CCCL_DEVICE _CCCL_FORCEINLINE void
AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
{
int part = LaneId() % NUM_PARTS;
int part = ::cuda::ptx::get_sreg_laneid() % NUM_PARTS;
#pragma unroll
for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
{
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -669,7 +670,7 @@ struct AgentRadixSortOnesweep
, current_bit(current_bit)
, num_bits(num_bits)
, warp(threadIdx.x / WARP_THREADS)
, lane(LaneId())
, lane(::cuda::ptx::get_sreg_laneid())
, decomposer(decomposer)
{
// initialization
Expand Down
8 changes: 5 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
#include <cub/util_type.cuh>
#include <cub/warp/warp_reduce.cuh>

#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -298,7 +300,7 @@ struct AgentRadixSortUpsweep
_CCCL_DEVICE _CCCL_FORCEINLINE void UnpackDigitCounts()
{
unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
unsigned int warp_tid = LaneId();
unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid();

#pragma unroll
for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
Expand Down Expand Up @@ -419,7 +421,7 @@ struct AgentRadixSortUpsweep
_CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0)
{
unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
unsigned int warp_tid = LaneId();
unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid();

// Place unpacked digit counters in shared memory
#pragma unroll
Expand Down Expand Up @@ -499,7 +501,7 @@ struct AgentRadixSortUpsweep
_CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])
{
unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
unsigned int warp_tid = LaneId();
unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid();

// Place unpacked digit counters in shared memory
#pragma unroll
Expand Down
7 changes: 4 additions & 3 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/ptx>
#include <cuda/std/type_traits>

#include <iterator>
Expand Down Expand Up @@ -465,7 +466,7 @@ struct AgentRle
{
// Perform warpscans
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = LaneId();
int lane_id = ::cuda::ptx::get_sreg_laneid();

LengthOffsetPair identity;
identity.key = 0;
Expand Down Expand Up @@ -551,7 +552,7 @@ struct AgentRle
Int2Type<true> is_warp_time_slice)
{
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = LaneId();
int lane_id = ::cuda::ptx::get_sreg_laneid();

// Locally compact items within the warp (first warp)
if (warp_id == 0)
Expand Down Expand Up @@ -608,7 +609,7 @@ struct AgentRle
Int2Type<false> is_warp_time_slice)
{
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = LaneId();
int lane_id = ::cuda::ptx::get_sreg_laneid();

// Unzip
OffsetT run_offsets[ITEMS_PER_THREAD];
Expand Down
4 changes: 3 additions & 1 deletion cub/cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>

#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

//! @rst
Expand Down Expand Up @@ -179,7 +181,7 @@ private:

// TODO(bgruber): can we use signed int here? Only these variables are unsigned:
unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
unsigned int lane_id = LaneId();
unsigned int lane_id = ::cuda::ptx::get_sreg_laneid();
unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS;
unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS;

Expand Down
9 changes: 5 additions & 4 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/std/cstdint>
#include <cuda/std/limits>
#include <cuda/std/type_traits>
Expand Down Expand Up @@ -716,7 +717,7 @@ public:

volatile DigitCounterT* digit_counters[KEYS_PER_THREAD];
uint32_t warp_id = linear_tid >> LOG_WARP_THREADS;
uint32_t lane_mask_lt = LaneMaskLt();
uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt();

#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
Expand Down Expand Up @@ -1070,7 +1071,7 @@ struct BlockRadixRankMatchEarlyCounts
int bin_mask = *p_match_mask;
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
int warp_offset = 0;
int popc = __popc(bin_mask & LaneMaskLe());
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down Expand Up @@ -1099,7 +1100,7 @@ struct BlockRadixRankMatchEarlyCounts
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
int warp_offset = 0;
int popc = __popc(bin_mask & LaneMaskLe());
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down Expand Up @@ -1135,7 +1136,7 @@ struct BlockRadixRankMatchEarlyCounts
, digit_extractor(digit_extractor)
, callback(callback)
, warp(threadIdx.x / WARP_THREADS)
, lane(LaneId())
, lane(::cuda::ptx::get_sreg_laneid())
{}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include <cub/util_ptx.cuh>
#include <cub/warp/warp_reduce.cuh>

#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

/**
Expand Down Expand Up @@ -121,7 +123,7 @@ struct BlockReduceWarpReductions
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
, warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS)
, lane_id(LaneId())
, lane_id(::cuda::ptx::get_sreg_laneid())
{}

/**
Expand Down
4 changes: 3 additions & 1 deletion cub/cub/block/specializations/block_scan_warp_scans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
#include <cub/util_ptx.cuh>
#include <cub/warp/warp_scan.cuh>

#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

/**
Expand Down Expand Up @@ -127,7 +129,7 @@ struct BlockScanWarpScans
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
, warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS)
, lane_id(LaneId())
, lane_id(::cuda::ptx::get_sreg_laneid())
{}

//---------------------------------------------------------------------
Expand Down
39 changes: 14 additions & 25 deletions cub/cub/util_ptx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,31 +52,6 @@ CUB_NAMESPACE_BEGIN
* Inlined PTX intrinsics
******************************************************************************/

namespace detail
{
/**
* @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p
* num_bits is larger than 32 bits, @p num_bits is clamped to 32.
*/
_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits)
{
uint32_t ret{};
asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
return ret;
}

/**
* @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p
* num_bits is larger than 32 bits, @p num_bits is clamped to 32.
*/
_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits)
{
uint32_t ret{};
asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
return ret;
}
} // namespace detail

/**
* \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend.
*/
Expand All @@ -90,6 +65,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int
/**
* \brief Shift-left then add. Returns (\p x << \p shift) + \p addend.
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
{
unsigned int ret;
Expand Down Expand Up @@ -150,6 +126,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in
/**
* \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE void
BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
{
Expand All @@ -159,6 +136,7 @@ BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, u
/**
* \brief Three-operand add. Returns \p x + \p y + \p z.
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
Expand Down Expand Up @@ -192,6 +170,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y
* \endcode
*
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
int ret;
Expand All @@ -204,6 +183,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned
/**
* Sync-threads barrier.
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count)
{
asm volatile("bar.sync 1, %0;" : : "r"(count));
Expand Down Expand Up @@ -312,6 +292,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int
/**
* Floating point multiply. (Mantissa LSB rounds towards zero.)
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b)
{
float d;
Expand All @@ -322,6 +303,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b)
/**
* Floating point multiply-add. (Mantissa LSB rounds towards zero.)
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c)
{
float d;
Expand All @@ -342,6 +324,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit()
/**
* \brief Abort execution and generate an interrupt to the host CPU
*/
CCCL_DEPRECATED_BECAUSE("will be removed in the next major release")
_CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap()
{
asm volatile("trap;");
Expand All @@ -359,6 +342,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y,
/**
* \brief Returns the warp lane ID of the calling thread
*/
CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId()
{
unsigned int ret;
Expand All @@ -370,6 +354,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId()
* \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not
* correspond to a zero-based ranking within the thread block.
*/
CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId()
{
unsigned int ret;
Expand Down Expand Up @@ -409,6 +394,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id)
/**
* \brief Returns the warp lane mask of all lanes less than the calling thread
*/
CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt()
{
unsigned int ret;
Expand All @@ -419,6 +405,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt()
/**
* \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
*/
CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe()
{
unsigned int ret;
Expand All @@ -429,6 +416,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe()
/**
* \brief Returns the warp lane mask of all lanes greater than the calling thread
*/
CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt()
{
unsigned int ret;
Expand All @@ -439,6 +427,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt()
/**
* \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
*/
CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe()
{
unsigned int ret;
Expand Down
Loading
Loading