diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh
index 8caf6a5bf59..41abbd588b3 100644
--- a/cub/cub/block/block_histogram.cuh
+++ b/cub/cub/block/block_histogram.cuh
@@ -202,8 +202,8 @@ private:
   /// Internal specialization.
   using InternalBlockHistogram =
     ::cuda::std::_If<ALGORITHM == BLOCK_HISTO_SORT,
-                     BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
-                     BlockHistogramAtomic<BINS>>;
+                     detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
+                     detail::BlockHistogramAtomic<BINS>>;

   /// Shared memory storage layout type for BlockHistogram
   using _TempStorage = typename InternalBlockHistogram::TempStorage;
diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh
index 6828f6fa706..6cf578963fc 100644
--- a/cub/cub/block/block_reduce.cuh
+++ b/cub/cub/block/block_reduce.cuh
@@ -250,9 +250,9 @@ private:
     BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
   };

-  using WarpReductions = BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
-  using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
-  using Raking = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using WarpReductions = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using Raking = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;

   /// Internal specialization type
   using InternalBlockReduce =
diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh
index 44b2342c7d1..c25bd2d258d 100644
--- a/cub/cub/block/block_scan.cuh
+++ b/cub/cub/block/block_scan.cuh
@@ -250,9 +250,9 @@ private:
       ? BLOCK_SCAN_RAKING
      : ALGORITHM;

-  using WarpScans = BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
   using Raking =
-    BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
+    detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;

   /// Define the delegate type for the desired algorithm
   using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh
index 8edc8575c40..4103641dbe2 100644
--- a/cub/cub/block/specializations/block_histogram_atomic.cuh
+++ b/cub/cub/block/specializations/block_histogram_atomic.cuh
@@ -45,7 +45,8 @@
 #endif // no system header

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide
  *        histograms from data samples partitioned across a CUDA thread block.
@@ -72,7 +73,7 @@ struct BlockHistogramAtomic
   template <typename T, typename CounterT, int ITEMS_PER_THREAD>
   _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS])
   {
-// Update histogram
+    // Update histogram
 #pragma unroll
     for (int i = 0; i < ITEMS_PER_THREAD; ++i)
     {
@@ -80,5 +81,11 @@ struct BlockHistogramAtomic
     }
   }
 };
+} // namespace detail
+
+template <int BINS>
+using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockHistogramAtomic<BINS>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh
index 38d49a3b8e6..127f30953b2 100644
--- a/cub/cub/block/specializations/block_histogram_sort.cuh
+++ b/cub/cub/block/specializations/block_histogram_sort.cuh
@@ -49,7 +49,8 @@
 #include <cub/util_type.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide
  *        histograms from data samples partitioned across a CUDA thread block.
@@ -243,5 +244,18 @@ struct BlockHistogramSort
     }
   }
 };
+} // namespace detail
+
+template <typename T,
+          int BLOCK_DIM_X,
+          int ITEMS_PER_THREAD,
+          int BINS,
+          int BLOCK_DIM_Y,
+          int BLOCK_DIM_Z,
+          int LEGACY_PTX_ARCH = 0>
+using BlockHistogramSort CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") =
+  detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh
index 7382732960b..90f8f12236f 100644
--- a/cub/cub/block/specializations/block_reduce_raking.cuh
+++ b/cub/cub/block/specializations/block_reduce_raking.cuh
@@ -50,7 +50,8 @@
 #include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread
  *        block. Supports non-commutative reduction operators.
@@ -257,5 +258,11 @@ struct BlockReduceRaking
     return Reduce<FULL_TILE>(partial, num_valid, reduction_op);
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockReduceRaking CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
index 9bfd94f425d..7841db5f18a 100644
--- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -50,7 +50,8 @@
 #include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction
  *        across a CUDA thread block. Does not support non-commutative reduction operators. Does not
@@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly

   // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have
   // valid values
-  using FallBack = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using FallBack = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;

   /// Constants
   enum
@@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly
     return partial;
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
index efb47d6101e..2dfa526771f 100644
--- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -51,7 +51,8 @@
 #include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction
  *        across a CUDA thread block. Supports non-commutative reduction operators.
@@ -256,5 +257,11 @@ struct BlockReduceWarpReductions
     return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh
index 7f1b1887bc3..2af4b8693fc 100644
--- a/cub/cub/block/specializations/block_scan_raking.cuh
+++ b/cub/cub/block/specializations/block_scan_raking.cuh
@@ -52,7 +52,8 @@
 #include <cub/warp/warp_scan.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA
  *        thread block.
@@ -794,5 +795,11 @@ struct BlockScanRaking
     }
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int LEGACY_PTX_ARCH = 0>
+using BlockScanRaking CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh
index b71855132c1..d034d2838ea 100644
--- a/cub/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh
@@ -50,7 +50,8 @@
 #include <cub/warp/warp_scan.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA
  *        thread block.
@@ -537,5 +538,10 @@ struct BlockScanWarpScans
     exclusive_output = scan_op(block_prefix, exclusive_output);
   }
 };
+} // namespace detail
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
index 55df1f3beee..8c4ad78d1ad 100644
--- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -83,8 +83,6 @@ template <typename T>
 struct reduce_max_exists : ::cuda::std::true_type
 {};

-} // namespace detail
-
 /**
  * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned
  *        across a CUDA thread warp.
@@ -739,5 +737,11 @@ struct WarpReduceShfl
     return output;
   }
 };
+} // namespace detail
+
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpReduceShfl CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh
index d7884e26753..ade195ee6cb 100644
--- a/cub/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh
@@ -52,7 +52,8 @@
 #include <cub/util_type.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned
  *        across a CUDA thread warp.
@@ -411,5 +412,10 @@ struct WarpReduceSmem
     return SegmentedReduce(input, flag, reduction_op, Int2Type<HEAD_SEGMENTED>());
   }
 };
+} // namespace detail
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpReduceSmem CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh
index 22d6b4b6f0a..402b476c4e4 100644
--- a/cub/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh
@@ -51,7 +51,8 @@
 #include <cub/util_type.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned
  *        across a CUDA thread warp.
@@ -513,7 +514,7 @@ struct WarpScanShfl
     // Iterate scan steps
     int segment_first_lane = 0;

-// Iterate scan steps
+    // Iterate scan steps
 #pragma unroll
     for (int STEP = 0; STEP < STEPS; STEP++)
     {
@@ -550,7 +551,7 @@ struct WarpScanShfl
     // Find index of first set bit
     int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));

-// Iterate scan steps
+    // Iterate scan steps
 #pragma unroll
     for (int STEP = 0; STEP < STEPS; STEP++)
     {
@@ -674,5 +675,11 @@ struct WarpScanShfl
     Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
   }
 };
+} // namespace detail
+
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpScanShfl CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpScanShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh
index 336416b69d9..090f0f96cb5 100644
--- a/cub/cub/warp/specializations/warp_scan_smem.cuh
+++ b/cub/cub/warp/specializations/warp_scan_smem.cuh
@@ -52,7 +52,8 @@
 #include <cub/util_type.cuh>

 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned
  *        across a CUDA thread warp.
@@ -432,5 +433,11 @@ struct WarpScanSmem
     }
   }
 };
+} // namespace detail
+
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpScanSmem CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpScanSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh
index 00440c18bdf..4b2c61e343a 100644
--- a/cub/cub/warp/warp_reduce.cuh
+++ b/cub/cub/warp/warp_reduce.cuh
@@ -174,8 +174,8 @@ public:
   /// Internal specialization.
   /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two
-  using InternalWarpReduce =
-    ::cuda::std::_If<IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS>, WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
+  using InternalWarpReduce = ::cuda::std::
+    _If<IS_POW_OF_TWO, detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;

 #endif // _CCCL_DOXYGEN_INVOKED

diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh
index cec992e699c..6eb6a35562b 100644
--- a/cub/cub/warp/warp_scan.cuh
+++ b/cub/cub/warp/warp_scan.cuh
@@ -180,8 +180,8 @@ private:
   /// Internal specialization.
   /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
-  using InternalWarpScan =
-    ::cuda::std::_If<IS_POW_OF_TWO, WarpScanShfl<T, LOGICAL_WARP_THREADS>, WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
+  using InternalWarpScan = ::cuda::std::
+    _If<IS_POW_OF_TWO, detail::WarpScanShfl<T, LOGICAL_WARP_THREADS>, detail::WarpScanSmem<T, LOGICAL_WARP_THREADS>>;

   /// Shared memory storage layout type for WarpScan
   using _TempStorage = typename InternalWarpScan::TempStorage;
diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst
index a0a78ed0d71..4cc639e27fb 100644
--- a/docs/cub/developer_overview.rst
+++ b/docs/cub/developer_overview.rst
@@ -239,8 +239,8 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement

     using InternalWarpReduce = cuda::std::conditional_t<
       IS_POW_OF_TWO,
-      WarpReduceShfl<T, LOGICAL_WARP_THREADS>, // shuffle-based implementation
-      WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation
+      detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, // shuffle-based implementation
+      detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation

 Specializations provide different shared memory requirements,
 so the actual ``_TempStorage`` type is defined as:
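
Reviewer notes (commentary, not part of the patch):

Every file above applies the same two-step shim: the specialization struct moves into `cub::detail`, and a deprecated alias template keeps the old public name compiling for one more release cycle. A minimal standalone sketch of the pattern — the `lib`/`HistogramImpl` names and the local `DEPRECATED_BECAUSE` macro are illustrative stand-ins, not actual CUB code:

```cpp
// Sketch of the "move to detail + deprecated alias" pattern used by this patch.
#include <cstdio>

// Stand-in for CCCL_DEPRECATED_BECAUSE, which wraps a compiler-specific
// deprecation attribute; [[deprecated]] is the portable C++14 spelling.
#define DEPRECATED_BECAUSE(msg) [[deprecated(msg)]]

namespace lib
{
namespace detail
{
// The implementation now lives in detail and is no longer public API.
template <int BINS>
struct HistogramImpl
{
  int counts[BINS] = {};
};
} // namespace detail

// The old public name survives as an alias template, so existing user code
// still compiles but warns at every point of use.
template <int BINS>
using HistogramImpl DEPRECATED_BECAUSE("implementation detail; the public name will be removed") =
  detail::HistogramImpl<BINS>;
} // namespace lib

int main()
{
  lib::detail::HistogramImpl<8> canonical{}; // new spelling: no diagnostic
  lib::HistogramImpl<8> legacy{}; // old spelling: -Wdeprecated-declarations fires here
  std::printf("%d %d\n", canonical.counts[0], legacy.counts[0]);
  return 0;
}
```

The alias only warns; the hard break comes later when the public names are deleted. That is also why each alias repeats the struct's full template-parameter list, including defaulted ones: the alias has to remain a drop-in for every existing instantiation.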
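The `::cuda::std::_If` lines the patch touches in `warp_reduce.cuh` and `warp_scan.cuh` are ordinary compile-time dispatch: the SHFL-based specializations require a power-of-two logical warp size, and everything else falls back to the shared-memory path. A sketch using `std::conditional_t`, the standard equivalent of `_If` (the `ShflBased`/`SmemBased`/`WarpReduceLike` names are placeholders):

```cpp
#include <type_traits>

template <typename T, int LOGICAL_WARP_THREADS>
struct ShflBased
{}; // stand-in for a shuffle-based specialization

template <typename T, int LOGICAL_WARP_THREADS>
struct SmemBased
{}; // stand-in for a shared-memory specialization

template <typename T, int LOGICAL_WARP_THREADS>
struct WarpReduceLike
{
  // Warp shuffles only compose cleanly when the logical warp is a power of two.
  static constexpr bool IS_POW_OF_TWO = (LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0;

  // Pick the implementation type at compile time.
  using InternalWarpReduce =
    std::conditional_t<IS_POW_OF_TWO, ShflBased<T, LOGICAL_WARP_THREADS>, SmemBased<T, LOGICAL_WARP_THREADS>>;
};

static_assert(std::is_same_v<WarpReduceLike<int, 32>::InternalWarpReduce, ShflBased<int, 32>>);
static_assert(std::is_same_v<WarpReduceLike<int, 24>::InternalWarpReduce, SmemBased<int, 24>>);

int main()
{
  return 0;
}
```

With the structs now in `detail::`, these dispatch sites are the only places inside the public classes where the specializations are spelled out, which is what allows the public-facing names to be deprecated independently of the internals.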