Skip to content

Commit

Permalink
Deprecate block/warp algo specializations (NVIDIA#3455)
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber authored and davebayer committed Jan 22, 2025
1 parent 2bb126f commit 9de8020
Show file tree
Hide file tree
Showing 17 changed files with 108 additions and 29 deletions.
4 changes: 2 additions & 2 deletions cub/cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ private:
/// Internal specialization.
using InternalBlockHistogram =
::cuda::std::_If<ALGORITHM == BLOCK_HISTO_SORT,
BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
BlockHistogramAtomic<BINS>>;
detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
detail::BlockHistogramAtomic<BINS>>;

/// Shared memory storage layout type for BlockHistogram
using _TempStorage = typename InternalBlockHistogram::TempStorage;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,9 @@ private:
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};

using WarpReductions = BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using Raking = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using WarpReductions = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using Raking = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;

/// Internal specialization type
using InternalBlockReduce =
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,9 @@ private:
? BLOCK_SCAN_RAKING
: ALGORITHM;

using WarpScans = BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using Raking =
BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;

/// Define the delegate type for the desired algorithm
using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
Expand Down
11 changes: 9 additions & 2 deletions cub/cub/block/specializations/block_histogram_atomic.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
#endif // no system header

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide
* histograms from data samples partitioned across a CUDA thread block.
Expand All @@ -72,13 +73,19 @@ struct BlockHistogramAtomic
template <typename T, typename CounterT, int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS])
{
// Update histogram
// Update histogram
#pragma unroll
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
atomicAdd(histogram + items[i], 1);
}
}
};
} // namespace detail

template <int BINS>
using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockHistogramAtomic<BINS>;

CUB_NAMESPACE_END
16 changes: 15 additions & 1 deletion cub/cub/block/specializations/block_histogram_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
#include <cub/util_ptx.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide
* histograms from data samples partitioned across a CUDA thread block.
Expand Down Expand Up @@ -243,5 +244,18 @@ struct BlockHistogramSort
}
}
};
} // namespace detail

template <typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
int BINS,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int LEGACY_PTX_ARCH = 0>
using BlockHistogramSort CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") =
detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
9 changes: 8 additions & 1 deletion cub/cub/block/specializations/block_reduce_raking.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
#include <cub/warp/warp_reduce.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread
* block. Supports non-commutative reduction operators.
Expand Down Expand Up @@ -257,5 +258,11 @@ struct BlockReduceRaking
return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockReduceRaking CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
#include <cub/warp/warp_reduce.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction
* across a CUDA thread block. Does not support non-commutative reduction operators. Does not
Expand Down Expand Up @@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly

// The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have
// valid values
using FallBack = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using FallBack = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;

/// Constants
enum
Expand Down Expand Up @@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly
return partial;
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction
* across a CUDA thread block. Supports non-commutative reduction operators.
Expand Down Expand Up @@ -256,5 +257,11 @@ struct BlockReduceWarpReductions
return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
9 changes: 8 additions & 1 deletion cub/cub/block/specializations/block_scan_raking.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
#include <cub/warp/warp_scan.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA
* thread block.
Expand Down Expand Up @@ -794,5 +795,11 @@ struct BlockScanRaking
}
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int LEGACY_PTX_ARCH = 0>
using BlockScanRaking CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
8 changes: 7 additions & 1 deletion cub/cub/block/specializations/block_scan_warp_scans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA
* thread block.
Expand Down Expand Up @@ -537,5 +538,10 @@ struct BlockScanWarpScans
exclusive_output = scan_op(block_prefix, exclusive_output);
}
};
} // namespace detail
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
8 changes: 6 additions & 2 deletions cub/cub/warp/specializations/warp_reduce_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ template <class T>
struct reduce_max_exists<T, decltype(__reduce_max_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type
{};

} // namespace detail

/**
* @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -739,5 +737,11 @@ struct WarpReduceShfl
return output;
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpReduceShfl CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
8 changes: 7 additions & 1 deletion cub/cub/warp/specializations/warp_reduce_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -411,5 +412,10 @@ struct WarpReduceSmem
return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<true>());
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpReduceSmem CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;
CUB_NAMESPACE_END
13 changes: 10 additions & 3 deletions cub/cub/warp/specializations/warp_scan_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -513,7 +514,7 @@ struct WarpScanShfl
// Iterate scan steps
int segment_first_lane = 0;

// Iterate scan steps
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
Expand Down Expand Up @@ -550,7 +551,7 @@ struct WarpScanShfl
// Find index of first set bit
int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));

// Iterate scan steps
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
Expand Down Expand Up @@ -674,5 +675,11 @@ struct WarpScanShfl
Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpScanShfl CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpScanShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
9 changes: 8 additions & 1 deletion cub/cub/warp/specializations/warp_scan_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -432,5 +433,11 @@ struct WarpScanSmem
}
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpScanSmem CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpScanSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
4 changes: 2 additions & 2 deletions cub/cub/warp/warp_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ public:

/// Internal specialization.
/// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two
using InternalWarpReduce =
::cuda::std::_If<IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS>, WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
using InternalWarpReduce = ::cuda::std::
_If<IS_POW_OF_TWO, detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;

#endif // _CCCL_DOXYGEN_INVOKED

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/warp/warp_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ private:

/// Internal specialization.
/// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
using InternalWarpScan =
::cuda::std::_If<IS_POW_OF_TWO, WarpScanShfl<T, LOGICAL_WARP_THREADS>, WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
using InternalWarpScan = ::cuda::std::
_If<IS_POW_OF_TWO, detail::WarpScanShfl<T, LOGICAL_WARP_THREADS>, detail::WarpScanSmem<T, LOGICAL_WARP_THREADS>>;

/// Shared memory storage layout type for WarpScan
using _TempStorage = typename InternalWarpScan::TempStorage;
Expand Down
4 changes: 2 additions & 2 deletions docs/cub/developer_overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,8 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement

using InternalWarpReduce = cuda::std::conditional_t<
IS_POW_OF_TWO,
WarpReduceShfl<T, LOGICAL_WARP_THREADS>, // shuffle-based implementation
WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation
detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, // shuffle-based implementation
detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation

Specializations provide different shared memory requirements,
so the actual ``_TempStorage`` type is defined as:
Expand Down

0 comments on commit 9de8020

Please sign in to comment.