Skip to content

Commit

Permalink
Merge branch 'main' into review-cub-util-ptx
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Jan 13, 2025
2 parents 0af70fd + c339a52 commit 8d44adb
Show file tree
Hide file tree
Showing 80 changed files with 694 additions and 1,808 deletions.
6 changes: 2 additions & 4 deletions cub/cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -162,15 +160,15 @@ struct AgentScan
// Wrap the native input pointer with CacheModifiedInputIterator
// or directly use the supplied input iterator type
using WrappedInputIteratorT =
::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
::cuda::std::_If<::cuda::std::is_pointer<InputIteratorT>::value,
CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
InputIteratorT>;

// Constants
enum
{
// Inclusive scan if no init_value type is provided
HAS_INIT = !std::is_same<InitValueT, NullType>::value,
HAS_INIT = !::cuda::std::is_same<InitValueT, NullType>::value,
IS_INCLUSIVE = ForceInclusive || !HAS_INIT, // We are relying on either initial value not being `NullType`
// or the ForceInclusive tag to be true for inclusive scan
// to get picked up.
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@

#include <cuda/std/type_traits>

#include <iterator>

#include <nv/target>

CUB_NAMESPACE_BEGIN
Expand Down
9 changes: 3 additions & 6 deletions cub/cub/detail/detect_cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,9 @@
# endif // CUB_RUNTIME_FUNCTION predefined

# ifdef CUB_RDC_ENABLED
// Detect available version of CDP:
# if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
# define CUB_DETAIL_CDPv1
# else
# define CUB_DETAIL_CDPv2
# endif
# ifdef CUDA_FORCE_CDP1_IF_SUPPORTED
# error "CUDA Dynamic Parallelism 1 is no longer supported. Please undefine CUDA_FORCE_CDP1_IF_SUPPORTED."
# endif // CUDA_FORCE_CDP1_IF_SUPPORTED
# endif

#endif // Do not document
22 changes: 1 addition & 21 deletions cub/cub/detail/device_synchronize.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,27 +45,7 @@ _CCCL_EXEC_CHECK_DISABLE
// Synchronizes with the device: on the host path this calls
// cudaDeviceSynchronize(); device-side behavior depends on the CDP
// configuration handled below. Returns the resulting cudaError_t
// (cudaErrorNotSupported when no sync path is available).
CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
{
// Default: reported when no branch below overwrites it.
cudaError_t result = cudaErrorNotSupported;

// Device-side sync is only available under CDPv1:
#if defined(CUB_DETAIL_CDPv1)

# if ((__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
// CUDA >= 11.6
// Use the deprecation-avoidance alias for device-side synchronization.
# define CUB_TMP_DEVICE_SYNC_IMPL result = __cudaDeviceSynchronizeDeprecationAvoidance();
# else // CUDA < 11.6:
# define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
# endif

#else // CDPv2 or no CDP:

# define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */

#endif // CDP version

NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), (CUB_TMP_DEVICE_SYNC_IMPL));

#undef CUB_TMP_DEVICE_SYNC_IMPL

// NOTE(review): this span renders both sides of a diff — the legacy
// CDP-gated implementation above was removed by the commit and only the
// following host-only NV_IF_TARGET was added. Confirm the merged source
// keeps exactly one sync path; as rendered, the host would sync twice.
NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ());
return result;
}

Expand Down
2 changes: 0 additions & 2 deletions cub/cub/detail/strong_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <iterator>

CUB_NAMESPACE_BEGIN

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down
25 changes: 11 additions & 14 deletions cub/cub/device/device_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ private:
CompareOpT compare_op,
cudaStream_t stream = 0)
{
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream);
Expand Down Expand Up @@ -374,10 +374,10 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage,
Expand All @@ -402,10 +402,10 @@ private:
CompareOpT compare_op,
cudaStream_t stream = 0)
{
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -528,10 +528,10 @@ private:
CompareOpT compare_op,
cudaStream_t stream = 0)
{
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyInputIteratorT, NullType*, KeyIteratorT, NullType*, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyInputIteratorT, NullType*, KeyIteratorT, NullType*, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -760,9 +760,8 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;

return SortPairsNoNVTX<KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>(
return SortPairsNoNVTX<KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT>(
d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream);
}

Expand Down Expand Up @@ -860,9 +859,8 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;

return SortKeysNoNVTX<KeyIteratorT, PromotedOffsetT, CompareOpT>(
return SortKeysNoNVTX<KeyIteratorT, OffsetT, CompareOpT>(
d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream);
}

Expand Down Expand Up @@ -974,8 +972,7 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
return SortKeysCopyNoNVTX<KeyInputIteratorT, KeyIteratorT, PromotedOffsetT, CompareOpT>(
return SortKeysCopyNoNVTX<KeyInputIteratorT, KeyIteratorT, OffsetT, CompareOpT>(
d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream);
}
};
Expand Down
144 changes: 3 additions & 141 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

#include <cub/config.cuh>

#include <cub/util_namespace.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand All @@ -46,6 +48,7 @@
#endif // no system header

#include <cub/agent/agent_scan.cuh>
#include <cub/device/dispatch/kernels/scan.cuh>
#include <cub/device/dispatch/tuning/tuning_scan.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
Expand All @@ -57,149 +60,8 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
* Kernel entry points
*****************************************************************************/

/**
* @brief Initialization kernel for tile status initialization (multi-block)
*
* @tparam ScanTileStateT
* Tile status interface type
*
* @param[in] tile_state
* Tile status interface
*
* @param[in] num_tiles
* Number of tiles
*/
// Grid launch that prepares the tile-status descriptors consumed by the
// decoupled look-back scan; all work is delegated to
// ScanTileStateT::InitializeStatus.
template <typename ScanTileStateT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
{
// Initialize tile status for `num_tiles` tiles
tile_state.InitializeStatus(num_tiles);
}

/**
* Initialization kernel for tile status initialization (multi-block)
*
* @tparam ScanTileStateT
* Tile status interface type
*
* @tparam NumSelectedIteratorT
* Output iterator type for recording the number of items selected
*
* @param[in] tile_state
* Tile status interface
*
* @param[in] num_tiles
* Number of tiles
*
* @param[out] d_num_selected_out
* Pointer to the total number of items selected
* (i.e., length of `d_selected_out`)
*/
// Initialization kernel (multi-block) that prepares the tile-status
// descriptors and additionally zeroes the selected-item counter.
//
// @param[in]  tile_state          Tile status interface
// @param[in]  num_tiles           Number of tiles to initialize
// @param[out] d_num_selected_out  Pointer to the total number of selected items
template <typename ScanTileStateT, typename NumSelectedIteratorT>
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
{
  // Exactly one thread of the grid clears the output counter
  if (threadIdx.x == 0 && blockIdx.x == 0)
  {
    *d_num_selected_out = 0;
  }

  // All threads cooperate on initializing the tile status descriptors
  tile_state.InitializeStatus(num_tiles);
}

/**
* @brief Scan kernel entry point (multi-block)
*
*
* @tparam ChainedPolicyT
* Chained tuning policy
*
* @tparam InputIteratorT
* Random-access input iterator type for reading scan inputs @iterator
*
* @tparam OutputIteratorT
* Random-access output iterator type for writing scan outputs @iterator
*
* @tparam ScanTileStateT
* Tile status interface type
*
* @tparam ScanOpT
* Binary scan functor type having member
* `auto operator()(const T &a, const U &b)`
*
* @tparam InitValueT
* Initial value to seed the exclusive scan
* (cub::NullType for inclusive scans)
*
* @tparam OffsetT
* Unsigned integer type for global offsets
*
* @param[in] d_in
*   Input data
*
* @param[out] d_out
*   Output data
*
* @param[in] tile_state
*   Tile status interface
*
* @param[in] start_tile
*   The starting tile for the current grid
*
* @param[in] scan_op
*   Binary scan functor
*
* @param[in] init_value
*   Initial value to seed the exclusive scan
*
* @param[in] num_items
*   Total number of scan items for the entire problem
*/
// Scan kernel entry point (multi-block): each thread block consumes one or
// more input tiles via AgentScan, beginning at `start_tile`.
//
// NOTE(review): InitValueT is expected to expose a nested `value_type`
// (a detail init-value wrapper), since the raw seed is unwrapped below.
template <typename ChainedPolicyT,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename ScanTileStateT,
          typename ScanOpT,
          typename InitValueT,
          typename OffsetT,
          typename AccumT,
          bool ForceInclusive>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
  InputIteratorT d_in,
  OutputIteratorT d_out,
  ScanTileStateT tile_state,
  int start_tile,
  ScanOpT scan_op,
  InitValueT init_value,
  OffsetT num_items)
{
  // Active tuning policy selected by the chained policy for this launch
  using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT;

  // The init-value wrapper carries the actual seed type
  using RealInitValueT = typename InitValueT::value_type;

  // Thread-block agent that performs the tile-wise scan
  using AgentScanT =
    AgentScan<ScanPolicyT, InputIteratorT, OutputIteratorT, ScanOpT, RealInitValueT, OffsetT, AccumT, ForceInclusive>;

  // Shared memory for AgentScan
  __shared__ typename AgentScanT::TempStorage smem_storage;

  // Unwrap the initial value before handing it to the agent
  RealInitValueT unwrapped_init = init_value;

  // Process the assigned range of tiles
  AgentScanT agent(smem_storage, d_in, d_out, scan_op, unwrapped_init);
  agent.ConsumeRange(num_items, tile_state, start_tile);
}

/******************************************************************************
* Dispatch
******************************************************************************/
Expand Down
Loading

0 comments on commit 8d44adb

Please sign in to comment.