diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh
index a2a5017d537..e4a221ccbae 100644
--- a/cub/cub/block/block_adjacent_difference.cuh
+++ b/cub/cub/block/block_adjacent_difference.cuh
@@ -26,12 +26,8 @@
*
******************************************************************************/
-/**
- * @file
- * The cub::BlockAdjacentDifference class provides
- * [collective](index.html#sec0) methods for computing the differences
- * of adjacent elements partitioned across a CUDA thread block.
- */
+//! @file The cub::BlockAdjacentDifference class provides collective methods for computing
+//! the differences of adjacent elements partitioned across a CUDA thread block.
#pragma once
@@ -50,87 +46,81 @@
CUB_NAMESPACE_BEGIN
-/**
- * @brief BlockAdjacentDifference provides
- * [collective](index.html#sec0) methods for computing the
- * differences of adjacent elements partitioned across a CUDA thread
- * block.
- *
- * @ingroup BlockModule
- *
- * @par Overview
- * - BlockAdjacentDifference calculates the differences of adjacent elements in
- * the elements partitioned across a CUDA thread block. Because the binary
- * operation could be noncommutative, there are two sets of methods.
- * Methods named SubtractLeft subtract left element `i - 1` of input sequence
- * from current element `i`. Methods named SubtractRight subtract the right element `i + 1`
- * from the current one `i`:
- * @par
- * @code
- * int values[4]; // [1, 2, 3, 4]
- * //...
- * int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ]
- * int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ]
- * @endcode
- * - For SubtractLeft, if the left element is out of bounds, the
- * input value is assigned to `output[0]` without modification.
- * - For SubtractRight, if the right element is out of bounds, the input value
- * is assigned to the current output value without modification.
- * - The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- * This example can be easily adapted to the storage required by
- * BlockAdjacentDifference.
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference to
- * compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * int result[4];
- *
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- * thread_data,
- * result,
- * CustomDifference());
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }.
- * The corresponding output `result` in those threads will be
- * { [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }.
- *
- */
+//! @rst
+//! BlockAdjacentDifference provides :ref:`collective <collective-primitives>` methods for computing the
+//! differences of adjacent elements partitioned across a CUDA thread block.
+//!
+//! Overview
+//! ++++++++++++++++
+//!
+//! BlockAdjacentDifference calculates the differences of adjacent elements in the elements partitioned across a CUDA
+//! thread block. Because the binary operation could be noncommutative, there are two sets of methods.
+//! Methods named SubtractLeft subtract left element ``i - 1`` of input sequence from current element ``i``.
+//! Methods named SubtractRight subtract the right element ``i + 1`` from the current one ``i``:
+//!
+//! .. code-block:: c++
+//!
+//! int values[4]; // [1, 2, 3, 4]
+//! //...
+//! int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ]
+//! int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ]
+//!
+//! - For SubtractLeft, if the left element is out of bounds, the input value is assigned to ``output[0]``
+//! without modification.
+//! - For SubtractRight, if the right element is out of bounds, the input value is assigned to the current output value
+//! without modification.
+//! - The ``example_block_reduce_dyn_smem.cu`` example under the ``examples/block`` folder illustrates usage of
+//!   dynamically shared memory with BlockReduce and how to re-purpose the same memory region.
+//!   This example can be easily adapted to the storage required by BlockAdjacentDifference.
+//!
+//! A Simple Example
+//! ++++++++++++++++
+//!
+//! The code snippet below illustrates how to use BlockAdjacentDifference to
+//! compute the left difference between adjacent elements.
+//!
+//! .. code-block:: c++
+//!
+//!    #include <cub/block/block_adjacent_difference.cuh>
+//!    // or equivalently <cub/cub.cuh>
+//!
+//! struct CustomDifference
+//! {
+//!      template <typename DataType>
+//! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+//! {
+//! return lhs - rhs;
+//! }
+//! };
+//!
+//! __global__ void ExampleKernel(...)
+//! {
+//! // Specialize BlockAdjacentDifference for a 1D block of
+//! // 128 threads of type int
+//! using BlockAdjacentDifferenceT =
+//!        cub::BlockAdjacentDifference<int, 128>;
+//!
+//! // Allocate shared memory for BlockAdjacentDifference
+//! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+//!
+//! // Obtain a segment of consecutive items that are blocked across threads
+//! int thread_data[4];
+//! ...
+//!
+//! // Collectively compute adjacent_difference
+//! int result[4];
+//!
+//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+//! thread_data,
+//! result,
+//! CustomDifference());
+//!
+//! Suppose the set of input ``thread_data`` across the block of threads is
+//! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+//! The corresponding output ``result`` in those threads will be
+//! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``.
+//!
+//! @endrst
template {};
+ //! @name Collective constructors
+ //! @{
- /***********************************************************************//**
- * @name Collective constructors
- **************************************************************************/
- //@{
-
- /**
- * @brief Collective constructor using a private static allocation of shared
- * memory as temporary storage.
- */
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage
__device__ __forceinline__ BlockAdjacentDifference()
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
- /**
- * @brief Collective constructor using the specified memory allocation as
- * temporary storage.
- *
- * @param[in] temp_storage Reference to memory allocation having layout type TempStorage
- */
+ //! @brief Collective constructor using the specified memory allocation as temporary storage
+ //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage
__device__ __forceinline__ BlockAdjacentDifference(TempStorage &temp_storage)
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
- //@} end member group
- /***********************************************************************//**
- * @name Read left operations
- **************************************************************************/
- //@{
-
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block
- * // of 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- * thread_data,
- * thread_data,
- * CustomDifference());
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * The corresponding output `result` in those threads will be
- * `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to @p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- */
+ //! @} end member group
+ //! @name Read left operations
+ //! @{
+
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block
+ //! // of 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
template
@@ -393,78 +353,77 @@ public:
}
}
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // The last item in the previous tile:
- * int tile_predecessor_item = ...;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * tile_predecessor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * and that `tile_predecessor_item` is `3`. The corresponding output
- * `result` in those threads will be
- * `{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] item which is going to be
- * subtracted from the first tile item (input0 from
- * thread0).
- */
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // The last item in the previous tile:
+ //! int tile_predecessor_item = ...;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! tile_predecessor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+  //! and that ``tile_predecessor_item`` is ``3``. The corresponding output
+ //! ``result`` in those threads will be
+ //! ``{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item which is going to be subtracted from the first tile item
+ //! (*input*\ :sub:`0` from *thread*\ :sub:`0`).
+ //! @endrst
template
@@ -497,73 +456,71 @@ public:
}
}
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- * int valid_items = 9;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * valid_items);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * The corresponding output `result` in those threads will be
- * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] valid_items
- * Number of valid items in thread block
- */
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //! int valid_items = 9;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! valid_items);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] valid_items
+ //! Number of valid items in thread block
template
@@ -615,80 +572,80 @@ public:
}
}
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- * int valid_items = 9;
- * int tile_predecessor_item = 4;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * valid_items,
- * tile_predecessor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * The corresponding output `result` in those threads will be
- * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] valid_items
- * Number of valid items in thread block
- *
- * @param[in] tile_predecessor_item
- * **[thread0 only]** item which is going to be
- * subtracted from the first tile item (input0 from
- * thread0).
- */
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //! int valid_items = 9;
+ //! int tile_predecessor_item = 4;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! valid_items,
+ //! tile_predecessor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] valid_items
+ //! Number of valid items in thread block
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item which is going to be subtracted from the first tile item
+ //! (*input*\ :sub:`0` from *thread*\ :sub:`0`).
+ //! @endrst
template
@@ -746,74 +703,71 @@ public:
}
}
- //@} end member group
- /******************************************************************//**
- * @name Read right operations
- *********************************************************************/
- //@{
-
- /**
- * @brief Subtracts the right element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the right difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractRight(
- * thread_data,
- * thread_data,
- * CustomDifference());
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`.
- * The corresponding output `result` in those threads will be
- * `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- */
+ //! @} end member group
+ //! @name Read right operations
+ //! @{
+ //!
+ //! @rst
+ //!
+ //! Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractRight(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
template
@@ -845,79 +799,78 @@ public:
}
}
- /**
- * @brief Subtracts the right element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the right difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // The first item in the next tile:
- * int tile_successor_item = ...;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractRight(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * tile_successor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`,
- * and that `tile_successor_item` is `3`. The corresponding output `result`
- * in those threads will be
- * `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to @p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] item
- * which is going to be subtracted from the last tile item
- * (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- */
+ //! @rst
+ //! Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between
+ //! adjacent elements.
+ //!
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // The first item in the next tile:
+ //! int tile_successor_item = ...;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractRight(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! tile_successor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``,
+ //! and that ``tile_successor_item`` is ``3``. The corresponding output ``result``
+ //! in those threads will be
+ //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+  //!   *thread*\ :sub:`BLOCK_THREADS - 1` only item which is going to be subtracted from the last tile item
+  //!   (*input*\ :sub:`ITEMS_PER_THREAD - 1` from *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
template
@@ -947,73 +900,72 @@ public:
difference_op(input[ITEMS_PER_THREAD - 1], successor_item);
}
- /**
- * @brief Subtracts the right element of each adjacent pair in range of
- * elements partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference to
- * compute the right difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * valid_items);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`.
- * and that `valid_items` is `507`. The corresponding output `result` in
- * those threads will be
- * `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to @p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] valid_items
- * Number of valid items in thread block
- */
+ //! @rst
+ //! Subtracts the right element of each adjacent pair in range of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between
+ //! adjacent elements.
+ //!
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include
+ //! // or equivalently
+ //!
+ //! struct CustomDifference
+ //! {
+ //! template
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+ //! cub::BlockAdjacentDifference;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! valid_items);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``,
+ //! and that ``valid_items`` is ``507``. The corresponding output ``result`` in
+ //! those threads will be
+ //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] valid_items
+ //! Number of valid items in thread block
template
@@ -1062,11 +1014,9 @@ public:
}
}
- //@} end member group
- /******************************************************************//**
- * @name Head flag operations (deprecated)
- *********************************************************************/
- //@{
+ //! @} end member group
+ //! @name Head flag operations (deprecated)
+ //! @{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -1557,6 +1507,7 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
+ //! @} end member group
};
diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh
index 2f628ebb779..8d673507573 100644
--- a/cub/cub/block/block_discontinuity.cuh
+++ b/cub/cub/block/block_discontinuity.cuh
@@ -49,82 +49,81 @@
CUB_NAMESPACE_BEGIN
-/**
- * @brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for
- * flagging discontinuities within an ordered set of items partitioned across a CUDA thread
- * block. ![](discont_logo.png)
- *
- * @ingroup BlockModule
- *
- * @tparam T
- * The data type to be flagged.
- *
- * @tparam BLOCK_DIM_X
- * The thread block length in threads along the X dimension
- *
- * @tparam BLOCK_DIM_Y
- * [optional] The thread block length in threads along the Y dimension (default: 1)
- *
- * @tparam BLOCK_DIM_Z
- * [optional] The thread block length in threads along the Z dimension (default: 1)
- *
- * @tparam LEGACY_PTX_ARCH
- * [optional] Unused.
- *
- * @par Overview
- * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
- * that differ from their predecessors (or successors). For example, head flags are convenient
- * for demarcating disjoint data segments as part of a segmented scan or reduction.
- * - \blocked
- *
- * @par Performance Considerations
- * - \granularity
- *
- * @par A Simple Example
- * \blockcollective{BlockDiscontinuity}
- * @par
- * The code snippet below illustrates the head flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute head flags for discontinuities in the segment
- * int head_flags[4];
- * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }.
- * The corresponding output \p head_flags in those threads will be
- * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- *
- * @par Performance Considerations
- * - Incurs zero bank conflicts for most types
- *
- * @par Re-using dynamically allocating shared memory
- * The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- *
- * This example can be easily adapted to the storage required by BlockDiscontinuity.
- */
+//! @rst
+//! The BlockDiscontinuity class provides :ref:`collective ` methods for
+//! flagging discontinuities within an ordered set of items partitioned across a CUDA thread
+//! block.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
+//! that differ from their predecessors (or successors). For example, head flags are convenient
+//! for demarcating disjoint data segments as part of a segmented scan or reduction.
+//! - @blocked
+//!
+//! Performance Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - @granularity
+//! - Incurs zero bank conflicts for most types
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! @blockcollective{BlockDiscontinuity}
+//!
+//! The code snippet below illustrates the head flagging of 512 integer items that
+//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+//! where each thread owns 4 consecutive items.
+//!
+//! .. code-block:: c++
+//!
+//! #include // or equivalently
+//!
+//! __global__ void ExampleKernel(...)
+//! {
+//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+//! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+//!
+//! // Allocate shared memory for BlockDiscontinuity
+//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+//!
+//! // Obtain a segment of consecutive items that are blocked across threads
+//! int thread_data[4];
+//! ...
+//!
+//! // Collectively compute head flags for discontinuities in the segment
+//! int head_flags[4];
+//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+//!
+//! Suppose the set of input ``thread_data`` across the block of threads is
+//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
+//! The corresponding output ``head_flags`` in those threads will be
+//! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
+//!
+//! Re-using dynamically allocating shared memory
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The ``examples/block/example_block_reduce_dyn_smem.cu`` example illustrates usage of
+//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region.
+//! This example can be easily adapted to the storage required by BlockDiscontinuity.
+//! @endrst
+//!
+//! @tparam T
+//! The data type to be flagged.
+//!
+//! @tparam BLOCK_DIM_X
+//! The thread block length in threads along the X dimension
+//!
+//! @tparam BLOCK_DIM_Y
+//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
+//!
+//! @tparam BLOCK_DIM_Z
+//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! **[optional]** Unused
template <
typename T,
int BLOCK_DIM_X,
@@ -135,18 +134,12 @@ class BlockDiscontinuity
{
private:
- /******************************************************************************
- * Constants and type definitions
- ******************************************************************************/
-
- /// Constants
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};
-
/// Shared memory storage layout type (last element from each thread's input)
struct _TempStorage
{
@@ -154,11 +147,6 @@ private:
T last_items[BLOCK_THREADS];
};
-
- /******************************************************************************
- * Utility methods
- ******************************************************************************/
-
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
@@ -272,10 +260,8 @@ public:
struct TempStorage : Uninitialized<_TempStorage> {};
- /******************************************************************//**
- * @name Collective constructors
- *********************************************************************/
- //@{
+ //! @name Collective constructors
+ //! @{
/**
* @brief Collective constructor using a private static allocation of shared memory as temporary
@@ -298,11 +284,9 @@ public:
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
- //@} end member group
- /******************************************************************//**
- * \name Head flag operations
- *********************************************************************/
- //@{
+ //! @} end member group
+ //! @name Head flag operations
+ //! @{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -388,74 +372,72 @@ public:
#endif // DOXYGEN_SHOULD_SKIP_THIS
- /**
- * @brief Sets head flags indicating discontinuities between items partitioned across the thread
- * block, for which the first item has no reference and is always flagged.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns \p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute head flags for discontinuities in the segment
- * int head_flags[4];
- * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }.
- * The corresponding output \p head_flags in those threads will be
- * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true
- * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank
- * of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets head flags indicating discontinuities between items partitioned across the thread
+ //! block, for which the first item has no reference and is always flagged.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns
+ //! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
+ //! the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute head flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
+ //! The corresponding output ``head_flags`` in those threads will be
+ //! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`.
+ //! `b_index` is the rank of b in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -465,84 +447,81 @@ public:
FlagHeads(head_flags, input, preds, flag_op);
}
- /**
- * @brief Sets head flags indicating discontinuities between items partitioned across the thread
- * block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns \p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is compared
- * against \p tile_predecessor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread0 obtain the predecessor item for the entire tile
- * int tile_predecessor_item;
- * if (threadIdx.x == 0) tile_predecessor_item == ...
- *
- * // Collectively compute head flags for discontinuities in the segment
- * int head_flags[4];
- * BlockDiscontinuity(temp_storage).FlagHeads(
- * head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... },
- * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those
- * threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index),
- * and returning \p true if a discontinuity exists between \p a and \p b,
- * otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] Item with which to compare the first tile item
- * (input0 from thread0).
- */
+ //! @rst
+ //! Sets head flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
+ //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item
+ //! in the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread0 obtain the predecessor item for the entire tile
+ //! int tile_predecessor_item;
+ //! if (threadIdx.x == 0) tile_predecessor_item = ...
+ //!
+ //! // Collectively compute head flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeads(
+ //! head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
+ //! and that ``tile_predecessor_item`` is ``0``. The corresponding output ``head_flags`` in those
+ //! threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`,
+ //! and returning `true` if a discontinuity exists between `a` and `b`,
+ //! otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity `head_flags`
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
+ //! @endrst
template
__device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -554,82 +533,77 @@ public:
}
-
- //@} end member group
- /******************************************************************//**
- * @name Tail flag operations
- *********************************************************************/
- //@{
-
- /**
- * @brief Sets tail flags indicating discontinuities between items partitioned across the thread
- * block, for which the last item has no reference and is always flagged.
- *
- * @par
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns \p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is always flagged.
- * - @blocked
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute tail flags for discontinuities in the segment
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }.
- * The corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true
- * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @} end member group
+ //! @name Tail flag operations
+ //! @{
+
+ //! @rst
+ //! Sets tail flags indicating discontinuities between items partitioned across the thread
+ //! block, for which the last item has no reference and is always flagged.
+ //!
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when
+ //! ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where ``next-item`` is either the next item
+ //! in the same thread or the first item in the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute tail flags for discontinuities in the segment
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
+ //! The corresponding output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -653,86 +627,84 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets tail flags indicating discontinuities between items partitioned across the thread
- * block.
- *
- * @par
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is compared
- * against @p tile_successor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute tail flags for discontinuities in the segment
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * tail_flags, thread_data, cub::Inequality(), tile_successor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that @p tile_successor_item is @p 125. The corresponding output @p tail_flags in those
- * threads will be { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] Item with which to
- * compare the last tile item (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- */
+ //! @rst
+ //! Sets tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in
+ //! the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared against
+ //! ``tile_successor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread127 obtain the successor item for the entire tile
+ //! int tile_successor_item;
+ //! if (threadIdx.x == 127) tile_successor_item = ...
+ //!
+ //! // Collectively compute tail flags for discontinuities in the segment
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagTails(
+ //! tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
+ //! and that ``tile_successor_item`` is ``125``. The corresponding output ``tail_flags`` in those
+ //! threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+ //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to
+ //! compare the last tile item (``input[ITEMS_PER_THREAD - 1]`` from
+ //! *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
template
__device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -760,94 +732,86 @@ public:
}
- //@} end member group
- /******************************************************************//**
- * @name Head & tail flag operations
- *********************************************************************/
- //@{
-
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is always flagged.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tail_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that the tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true
- * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @} end member group
+ //! @name Head & tail flag operations
+ //! @{
+
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns
+ //! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
+ //! the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where next-item is either the next item in the same thread or the first item in
+ //! the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tail_flags, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
+ //! The corresponding output ``head_flags`` in those threads will be
+ //! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``, and the corresponding
+ //! output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
@@ -894,98 +858,93 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is always flagged.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is compared
- * against @p tile_predecessor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that the tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] Item with which to compare
- * the last tile item (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when
+ //! ``flag_op(previous-item, input[i])`` returns ``true`` (where ``previous-item`` is either the preceding item
+ //! in the same thread or the last item in the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` returns ``true``
+ //! (where ``next-item`` is either the next item in the same thread or the first item in the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared
+ //!   against ``tile_successor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread127 obtain the successor item for the entire tile
+ //! int tile_successor_item;
+ //! if (threadIdx.x == 127) tile_successor_item = ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
+ //! and that the ``tile_successor_item`` is ``125``. The corresponding output ``head_flags``
+ //! in those threads will be ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``,
+ //! and the corresponding output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+ //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare
+ //! the last tile item (``input[ITEMS_PER_THREAD - 1]`` from
+ //! *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
@@ -1034,103 +993,97 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is compared
- * against @p tile_predecessor_item.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread0 obtain the predecessor item for the entire tile
- * int tile_predecessor_item;
- * if (threadIdx.x == 0) tile_predecessor_item == ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
- * thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] },
- * that the @p tile_predecessor_item is @p 0, and that the
- * @p tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank
- * of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] Item with which to compare the first tile item
- * (input0 from thread0).
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
+ //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item
+ //! in the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when
+ //! ``flag_op(input[i], next-item)`` returns ``true`` (where ``next-item`` is either the next item
+ //! in the same thread or the first item in the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item
+ //! ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread0 obtain the predecessor item for the entire tile
+ //! int tile_predecessor_item;
+ //! if (threadIdx.x == 0) tile_predecessor_item = ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tile_predecessor_item, tail_flags,
+ //! thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
+ //! and that the ``tile_predecessor_item`` is ``0``.
+ //! The corresponding output ``head_flags`` in those threads will be
+ //! ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``, and the corresponding output ``tail_flags``
+ //! in those threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank
+ //! of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
+ //! @endrst
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
@@ -1173,109 +1126,104 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is compared
- * against @p tile_predecessor_item.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is compared
- * against @p tile_successor_item.
- * - @blocked
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread0 obtain the predecessor item for the entire tile
- * int tile_predecessor_item;
- * if (threadIdx.x == 0) tile_predecessor_item == ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
- * thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] },
- * that the @p tile_predecessor_item is @p 0, and that the
- * @p tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank
- * of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] Item with which to compare the first tile item
- * (input0 from thread0).
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] Item with which to compare
- * the last tile item (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
+ //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
+ //! the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in
+ //! the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared
+ //! against ``tile_successor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread0 obtain the predecessor item for the entire tile
+ //! int tile_predecessor_item;
+ //! if (threadIdx.x == 0) tile_predecessor_item = ...
+ //!
+ //! // Have thread127 obtain the successor item for the entire tile
+ //! int tile_successor_item;
+ //! if (threadIdx.x == 127) tile_successor_item = ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+ //! thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
+ //! that the ``tile_predecessor_item`` is ``0``, and that the
+ //! ``tile_successor_item`` is ``125``. The corresponding output ``head_flags``
+ //! in those threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``,
+ //! and the corresponding output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank
+ //! of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
+ //! @endrst
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+ //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare the last tile item
+ //! (``input[ITEMS_PER_THREAD - 1]`` from *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
@@ -1322,10 +1270,7 @@ public:
}
-
-
- //@} end member group
-
+ //! @} end member group
};
diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh
index c99b9eba84d..c56fad3314c 100644
--- a/cub/cub/block/block_exchange.cuh
+++ b/cub/cub/block/block_exchange.cuh
@@ -26,10 +26,8 @@
*
******************************************************************************/
-/**
- * \file
- * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
- */
+//! @file The cub::BlockExchange class provides :ref:`collective ` methods for
+//! rearranging data partitioned across a CUDA thread block.
#pragma once
@@ -50,89 +48,106 @@
CUB_NAMESPACE_BEGIN
-/**
- * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T The data type to be exchanged.
- * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread.
- * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
- * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam LEGACY_PTX_ARCH [optional] Unused.
- *
- * \par Overview
- * - It is commonplace for blocks of threads to rearrange data items between
- * threads. For example, the device-accessible memory subsystem prefers access patterns
- * where data items are "striped" across threads (where consecutive threads access consecutive items),
- * yet most block-wide operations prefer a "blocked" partitioning of items across threads
- * (where consecutive items belong to a single thread).
- * - BlockExchange supports the following types of data exchanges:
- * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements
- * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements
- * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3)
- * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockExchange}
- * \par
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * \par
- * \code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Load a tile of data striped across threads
- * int thread_data[4];
- * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- * // Collectively exchange data into a blocked arrangement across threads
- * BlockExchange(temp_storage).StripedToBlocked(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of striped input \p thread_data across the block of threads is
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }.
- * The corresponding output \p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * \par Performance Considerations
- * - Proper device-specific padding ensures zero bank conflicts for most types.
- *
- * \par Re-using dynamically allocating shared memory
- * The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- *
- * This example can be easily adapted to the storage required by BlockExchange.
- */
-template <
- typename InputT,
- int BLOCK_DIM_X,
- int ITEMS_PER_THREAD,
- bool WARP_TIME_SLICING = false,
- int BLOCK_DIM_Y = 1,
- int BLOCK_DIM_Z = 1,
- int LEGACY_PTX_ARCH = 0>
+//! @rst
+//! The BlockExchange class provides :ref:`collective <collective-primitives>` methods for rearranging data partitioned
+//! across a CUDA thread block.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - It is commonplace for blocks of threads to rearrange data items between
+//! threads. For example, the device-accessible memory subsystem prefers access patterns
+//! where data items are "striped" across threads (where consecutive threads access consecutive items),
+//! yet most block-wide operations prefer a "blocked" partitioning of items across threads
+//! (where consecutive items belong to a single thread).
+//! - BlockExchange supports the following types of data exchanges:
+//!
+//! - Transposing between :ref:`blocked <flexible-data-arrangement>` and :ref:`striped <flexible-data-arrangement>`
+//! arrangements
+//! - Transposing between :ref:`blocked <flexible-data-arrangement>` and
+//! :ref:`warp-striped <flexible-data-arrangement>` arrangements
+//! - Scattering ranked items to a :ref:`blocked arrangement <flexible-data-arrangement>`
+//! - Scattering ranked items to a :ref:`striped arrangement <flexible-data-arrangement>`
+//!
+//! - @rowmajor
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! @blockcollective{BlockExchange}
+//!
+//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+//! of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+//!
+//! .. code-block:: c++
+//!
+//! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+//!
+//! __global__ void ExampleKernel(int *d_data, ...)
+//! {
+//! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+//! typedef cub::BlockExchange BlockExchange;
+//!
+//! // Allocate shared memory for BlockExchange
+//! __shared__ typename BlockExchange::TempStorage temp_storage;
+//!
+//! // Load a tile of data striped across threads
+//! int thread_data[4];
+//! cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+//!
+//! // Collectively exchange data into a blocked arrangement across threads
+//! BlockExchange(temp_storage).StripedToBlocked(thread_data);
+//!
+//! Suppose the set of striped input ``thread_data`` across the block of threads is
+//! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``.
+//! The corresponding output ``thread_data`` in those threads will be
+//! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+//!
+//! Performance Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - Proper device-specific padding ensures zero bank conflicts for most types.
+//!
+//! Re-using dynamically allocating shared memory
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with
+//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to
+//! the storage required by BlockExchange.
+//! @endrst
+//!
+//! @tparam T
+//! The data type to be exchanged
+//!
+//! @tparam BLOCK_DIM_X
+//! The thread block length in threads along the X dimension
+//!
+//! @tparam ITEMS_PER_THREAD
+//! The number of items partitioned onto each thread.
+//!
+//! @tparam WARP_TIME_SLICING
+//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data,
+//! time-slicing the block-wide exchange over multiple synchronized rounds.
+//! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
+//!
+//! @tparam BLOCK_DIM_Y
+//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
+//!
+//! @tparam BLOCK_DIM_Z
+//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! [optional] Unused.
+template <typename InputT,
+          int BLOCK_DIM_X,
+          int ITEMS_PER_THREAD,
+          bool WARP_TIME_SLICING  = false,
+          int BLOCK_DIM_Y         = 1,
+          int BLOCK_DIM_Z         = 1,
+          int LEGACY_PTX_ARCH     = 0>
class BlockExchange
{
private:
-
- /******************************************************************************
- * Constants
- ******************************************************************************/
-
/// Constants
enum
{
@@ -161,10 +176,6 @@ private:
PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
};
- /******************************************************************************
- * Type definitions
- ******************************************************************************/
-
/// Shared memory storage layout type
struct __align__(16) _TempStorage
{
@@ -173,16 +184,11 @@ private:
public:
- /// \smemstorage{BlockExchange}
+ /// @smemstorage{BlockExchange}
struct TempStorage : Uninitialized<_TempStorage> {};
private:
-
- /******************************************************************************
- * Thread fields
- ******************************************************************************/
-
/// Shared storage reference
_TempStorage &temp_storage;
@@ -192,11 +198,6 @@ private:
unsigned int warp_id;
unsigned int warp_offset;
-
- /******************************************************************************
- * Utility methods
- ******************************************************************************/
-
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
@@ -204,16 +205,14 @@ private:
return private_storage;
}
- /**
- * @brief Transposes data items from blocked arrangement to striped
- * arrangement. Specialized for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -239,16 +238,14 @@ private:
}
}
- /**
- * @brief Transposes data items from blocked arrangement to striped
- * arrangement. Specialized for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **striped**
+ //! arrangement. Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -305,16 +302,14 @@ private:
}
}
- /**
- * @brief Transposes data items from blocked arrangement to warp-striped
- * arrangement. Specialized for no timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+ //! Specialized for no timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -340,16 +335,14 @@ private:
}
}
- /**
- * @brief Transposes data items from blocked arrangement to warp-striped
- * arrangement. Specialized for warp-timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+ //! Specialized for warp-timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -406,16 +399,14 @@ private:
}
}
- /**
- * @brief Transposes data items from striped arrangement to blocked
- * arrangement. Specialized for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -442,16 +433,14 @@ private:
}
}
- /**
- * @brief Transposes data items from striped arrangement to blocked
- * arrangement. Specialized for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement.
+ //! Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -510,16 +499,14 @@ private:
}
}
- /**
- * @brief Transposes data items from warp-striped arrangement to blocked
- * arrangement. Specialized for no timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+ //! Specialized for no timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -546,16 +533,14 @@ private:
}
}
- /**
- * @brief Transposes data items from warp-striped arrangement to blocked
- * arrangement. Specialized for warp-timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+ //! Specialized for warp-timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -590,19 +575,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized
- * for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **blocked** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -629,19 +612,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized
- * for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **blocked** arrangement.
+ //! Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -691,19 +672,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement. Specialized
- * for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **striped** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -730,19 +709,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement. Specialized
- * for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **striped** arrangement.
+ //! Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -803,10 +780,8 @@ private:
public:
- /******************************************************************//**
- * @name Collective constructors
- *********************************************************************/
- //@{
+ //! @name Collective constructors
+ //! @{
/**
* @brief Collective constructor using a private static allocation of shared memory as temporary storage.
@@ -835,54 +810,51 @@ public:
{}
- //@} end member group
- /******************************************************************//**
- * @name Structured exchanges
- *********************************************************************/
- //@{
-
- /**
- * @brief Transposes data items from striped arrangement to blocked
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Load a tile of ordered data into a striped arrangement across block threads
- * int thread_data[4];
- * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- * // Collectively exchange data into a blocked arrangement across threads
- * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of striped input @p thread_data across the block of threads is
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from
- * device-accessible memory. The corresponding output @p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @} end member group
+ //! @name Structured exchanges
+ //! @{
+
+ //! @rst
+ //! Transposes data items from **striped** arrangement to **blocked** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+ //! of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Load a tile of ordered data into a striped arrangement across block threads
+ //! int thread_data[4];
+ //! cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ //!
+ //! // Collectively exchange data into a blocked arrangement across threads
+ //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+ //!
+ //! Suppose the set of striped input ``thread_data`` across the block of threads is
+ //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from
+ //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ template <typename OutputT>
__device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -890,52 +862,51 @@ public:
StripedToBlocked(input_items, output_items, Int2Type());
}
- /**
- * @brief Transposes data items from blocked arrangement to striped
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively exchange data into a striped arrangement across threads
- * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
- *
- * // Store data striped across block threads into an ordered tile
- * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of blocked input @p thread_data across the block of threads is
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- * The corresponding output @p thread_data in those threads will be
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in
- * preparation for storing to device-accessible memory.
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @rst
+ //! Transposes data items from **blocked** arrangement to **striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+ //! of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively exchange data into a striped arrangement across threads
+ //! BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
+ //!
+ //! // Store data striped across block threads into an ordered tile
+ //! cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ //!
+ //! Suppose the set of blocked input ``thread_data`` across the block of threads is
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in
+ //! preparation for storing to device-accessible memory.
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -943,51 +914,51 @@ public:
BlockedToStriped(input_items, output_items, Int2Type());
}
- /**
- * @brief Transposes data items from warp-striped arrangement to blocked
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked"
- * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
- * items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Load a tile of ordered data into a warp-striped arrangement across warp threads
- * int thread_data[4];
- * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data);
- *
- * // Collectively exchange data into a blocked arrangement across threads
- * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of warp-striped input @p thread_data across the block of threads is
- * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
- * after loading from device-accessible memory. (The first 128 items are striped across
- * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
- * The corresponding output @p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @rst
+ //! Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "warp-striped" to a "blocked"
+ //! arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
+ //! items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Load a tile of ordered data into a warp-striped arrangement across warp threads
+ //! int thread_data[4];
+ //! cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
+ //!
+ //! // Collectively exchange data into a blocked arrangement across threads
+ //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+ //!
+ //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is
+ //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }``
+ //! after loading from device-accessible memory. (The first 128 items are striped across
+ //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+ //! The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ template <typename OutputT>
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -995,55 +966,54 @@ public:
WarpStripedToBlocked(input_items, output_items, Int2Type());
}
- /**
- * @brief Transposes data items from blocked arrangement to warp-striped
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped"
- * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
- * items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively exchange data into a warp-striped arrangement across threads
- * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
- *
- * // Store data striped across warp threads into an ordered tile
- * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of blocked input @p thread_data across the block of threads is
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- * The corresponding output @p thread_data in those threads will be
- * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
- * in preparation for storing to device-accessible memory. (The first 128 items are striped
- * across the first warp of 32 threads, the second 128 items are striped across the second warp,
- * etc.)
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @rst
+ //! Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "blocked" to a "warp-striped"
+ //! arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
+ //! items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively exchange data into a warp-striped arrangement across threads
+ //! BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
+ //!
+ //! // Store data striped across warp threads into an ordered tile
+ //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
+ //!
+ //! Suppose the set of blocked input ``thread_data`` across the block of threads is
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }``
+ //! in preparation for storing to device-accessible memory. (The first 128 items are striped
+ //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
template
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -1051,32 +1021,27 @@ public:
BlockedToWarpStriped(input_items, output_items, Int2Type());
}
-
-
- //@} end member group
- /******************************************************************//**
- * @name Scatter exchanges
- *********************************************************************/
- //@{
-
- /**
- * @brief Exchanges data items annotated by rank into blocked arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @} end member group
+ //! @name Scatter exchanges
+ //! @{
+
+ //! @rst
+ //! Exchanges data items annotated by rank into **blocked** arrangement.
+ //!
+ //! - @smemreuse
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template
__device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -1085,24 +1050,24 @@ public:
ScatterToBlocked(input_items, output_items, ranks, Int2Type());
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @rst
+ //! Exchanges data items annotated by rank into **striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template
__device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -1111,25 +1076,24 @@ public:
ScatterToStriped(input_items, output_items, ranks, Int2Type());
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement.
- * Items with rank -1 are not exchanged.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @rst
+ //! Exchanges data items annotated by rank into **striped** arrangement. Items with rank -1 are not exchanged.
+ //!
+ //! - @smemreuse
+ //!
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template
__device__ __forceinline__ void
ScatterToStripedGuarded(InputT (&input_items)[ITEMS_PER_THREAD],
@@ -1156,30 +1120,30 @@ public:
}
}
- /**
- * @brief Exchanges valid data items annotated by rank into striped arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @tparam ValidFlag
- * [inferred] FlagT type denoting which items are valid
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- *
- * @param[in] is_valid
- * Corresponding flag denoting item validity
- */
+ //! @rst
+ //! Exchanges valid data items annotated by rank into **striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @tparam ValidFlag
+ //! **[inferred]** FlagT type denoting which items are valid
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
+ //!
+ //! @param[in] is_valid
+ //! Corresponding flag denoting item validity
template
__device__ __forceinline__ void
ScatterToStripedFlagged(InputT (&input_items)[ITEMS_PER_THREAD],
@@ -1207,16 +1171,13 @@ public:
}
}
-
- //@} end member group
-
-
+ //! @} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1225,7 +1186,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1234,7 +1195,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1243,7 +1204,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1252,7 +1213,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
@@ -1266,7 +1227,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
@@ -1280,7 +1241,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
@@ -1294,7 +1255,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh
index 1ef84dc5a20..e8c3e6cb490 100644
--- a/cub/cub/block/block_histogram.cuh
+++ b/cub/cub/block/block_histogram.cuh
@@ -50,158 +50,147 @@
CUB_NAMESPACE_BEGIN
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of
- * block-wide histograms.
- */
+//! @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of
+//! block-wide histograms.
enum BlockHistogramAlgorithm
{
- /**
- * @par Overview
- * Sorting followed by differentiation. Execution is comprised of two phases:
- * -# Sort the data using efficient radix sort
- * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
- *
- * @par Performance Considerations
- * Delivers consistent throughput regardless of sample bin distribution.
- */
- BLOCK_HISTO_SORT,
-
-
- /**
- * @par Overview
- * Use atomic addition to update byte counts directly
- *
- * @par Performance Considerations
- * Performance is strongly tied to the hardware implementation of atomic
- * addition, and may be significantly degraded for non uniformly-random
- * input distributions where many concurrent updates are likely to be
- * made to the same bin counter.
- */
- BLOCK_HISTO_ATOMIC,
+ //! @rst
+ //!
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Sorting followed by differentiation. Execution is comprised of two phases:
+ //!
+ //! #. Sort the data using efficient radix sort
+ //! #. Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Delivers consistent throughput regardless of sample bin distribution.
+ //!
+ //! @endrst
+ BLOCK_HISTO_SORT,
+
+ //! @rst
+ //!
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Use atomic addition to update byte counts directly
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Performance is strongly tied to the hardware implementation of atomic
+ //! addition, and may be significantly degraded for non uniformly-random
+ //! input distributions where many concurrent updates are likely to be
+ //! made to the same bin counter.
+ //!
+ //! @endrst
+ BLOCK_HISTO_ATOMIC,
};
-
-
-/******************************************************************************
- * Block histogram
- ******************************************************************************/
-
-/**
- * @brief The BlockHistogram class provides [collective](index.html#sec0) methods for
- * constructing block-wide histograms from data samples partitioned across a CUDA thread
- * block. ![](histogram_logo.png)
- *
- * @ingroup BlockModule
- *
- * @tparam T
- * The sample type being histogrammed (must be castable to an integer bin identifier)
- *
- * @tparam BLOCK_DIM_X
- * The thread block length in threads along the X dimension
- *
- * @tparam ITEMS_PER_THREAD
- * The number of items per thread
- *
- * @tparam BINS
- * The number bins within the histogram
- *
- * @tparam ALGORITHM
- * [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm
- * to use (default: cub::BLOCK_HISTO_SORT)
- *
- * @tparam BLOCK_DIM_Y
- * [optional] The thread block length in threads along the Y dimension (default: 1)
- *
- * @tparam BLOCK_DIM_Z
- * [optional] The thread block length in threads along the Z dimension (default: 1)
- *
- * @tparam LEGACY_PTX_ARCH
- * [optional] Unused.
- *
- * @par Overview
- * - A histogram
- * counts the number of observations that fall into each of the disjoint categories (known as
- * bins).
- * - The `T` type must be implicitly castable to an integer type.
- * - BlockHistogram expects each integral `input[i]` value to satisfy
- * `0 <= input[i] < BINS`. Values outside of this range result in undefined
- * behavior.
- * - BlockHistogram can be optionally specialized to use different algorithms:
- * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref
- * cub::BlockHistogramAlgorithm)
- * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly.
- * [More...](\ref cub::BlockHistogramAlgorithm)
- *
- * @par Performance Considerations
- * - @granularity
- *
- * @par A Simple Example
- * @blockcollective{BlockHistogram}
- * @par
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character
- * samples each typedef cub::BlockHistogram BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char data[4];
- * ...
- *
- * // Compute the block-wide histogram
- * BlockHistogram(temp_storage).Histogram(data, smem_histogram);
- *
- * @endcode
- *
- * @par Performance and Usage Considerations
- * - All input values must fall between [0, BINS), or behavior is undefined.
- * - The histogram output can be constructed in shared or device-accessible memory
- * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
- *
- * @par Re-using dynamically allocating shared memory
- * The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- *
- * This example can be easily adapted to the storage required by BlockHistogram.
- */
-template <
- typename T,
- int BLOCK_DIM_X,
- int ITEMS_PER_THREAD,
- int BINS,
- BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
- int BLOCK_DIM_Y = 1,
- int BLOCK_DIM_Z = 1,
- int LEGACY_PTX_ARCH = 0>
+//! @rst
+//! The BlockHistogram class provides :ref:`collective ` methods for
+//! constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - A `histogram <https://en.wikipedia.org/wiki/Histogram>`_ counts the number of observations that fall into
+//! each of the disjoint categories (known as *bins*).
+//! - The ``T`` type must be implicitly castable to an integer type.
+//! - BlockHistogram expects each integral ``input[i]`` value to satisfy
+//! ``0 <= input[i] < BINS``. Values outside of this range result in undefined behavior.
+//! - BlockHistogram can be optionally specialized to use different algorithms:
+//!
+//! #. :cpp:enumerator:`cub::BLOCK_HISTO_SORT`: Sorting followed by differentiation.
+//! #. :cpp:enumerator:`cub::BLOCK_HISTO_ATOMIC`: Use atomic addition to update byte counts directly.
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! @blockcollective{BlockHistogram}
+//!
+//! The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+//! are partitioned across 128 threads where each thread owns 4 samples.
+//!
+//! .. code-block:: c++
+//!
+//! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+//!
+//! __global__ void ExampleKernel(...)
+//! {
+//! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+//! typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+//!
+//! // Allocate shared memory for BlockHistogram
+//! __shared__ typename BlockHistogram::TempStorage temp_storage;
+//!
+//! // Allocate shared memory for block-wide histogram bin counts
+//! __shared__ unsigned int smem_histogram[256];
+//!
+//! // Obtain input samples per thread
+//! unsigned char data[4];
+//! ...
+//!
+//! // Compute the block-wide histogram
+//! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+//!
+//! Performance and Usage Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - @granularity
+//! - All input values must fall between ``[0, BINS)``, or behavior is undefined.
+//! - The histogram output can be constructed in shared or device-accessible memory
+//! - See ``cub::BlockHistogramAlgorithm`` for performance details regarding algorithmic alternatives
+//!
+//! Re-using dynamically allocating shared memory
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with
+//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage
+//! required by BlockHistogram.
+//! @endrst
+//!
+//! @tparam T
+//! The sample type being histogrammed (must be castable to an integer bin identifier)
+//!
+//! @tparam BLOCK_DIM_X
+//! The thread block length in threads along the X dimension
+//!
+//! @tparam ITEMS_PER_THREAD
+//! The number of items per thread
+//!
+//! @tparam BINS
+//! The number of bins within the histogram
+//!
+//! @tparam ALGORITHM
+//! **[optional]** cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use
+//! (default: cub::BLOCK_HISTO_SORT)
+//!
+//! @tparam BLOCK_DIM_Y
+//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
+//!
+//! @tparam BLOCK_DIM_Z
+//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! **[optional]** Unused.
+template
class BlockHistogram
{
private:
- /******************************************************************************
- * Constants and type definitions
- ******************************************************************************/
-
/// Constants
enum
{
@@ -223,22 +212,12 @@ private:
/// Shared memory storage layout type for BlockHistogram
typedef typename InternalBlockHistogram::TempStorage _TempStorage;
-
- /******************************************************************************
- * Thread fields
- ******************************************************************************/
-
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
-
- /******************************************************************************
- * Utility methods
- ******************************************************************************/
-
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
@@ -246,21 +225,16 @@ private:
return private_storage;
}
-
public:
/// @smemstorage{BlockHistogram}
struct TempStorage : Uninitialized<_TempStorage> {};
- /******************************************************************//**
- * @name Collective constructors
- *********************************************************************/
- //@{
+ //! @name Collective constructors
+ //! @{
- /**
- * @brief Collective constructor using a private static allocation of shared memory as temporary storage.
- */
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ BlockHistogram()
:
temp_storage(PrivateStorage()),
@@ -279,50 +253,50 @@ public:
{}
- //@} end member group
- /******************************************************************//**
- * @name Histogram operations
- *********************************************************************/
- //@{
-
-
- /**
- * @brief Initialize the shared histogram counters to zero.
- *
- * @par Snippet
- * The code snippet below illustrates a the initialization and update of a
- * histogram of 512 integer samples that are partitioned across 128 threads
- * where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
- * typedef cub::BlockHistogram BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char thread_samples[4];
- * ...
- *
- * // Initialize the block-wide histogram
- * BlockHistogram(temp_storage).InitHistogram(smem_histogram);
- *
- * // Update the block-wide histogram
- * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
- *
- * @endcode
- *
- * @tparam CounterT
- * [inferred] Histogram counter type
- */
+ //! @} end member group
+ //! @name Histogram operations
+ //! @{
+
+
+ //! @rst
+ //! Initialize the shared histogram counters to zero.
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the initialization and update of a
+ //! histogram of 512 integer samples that are partitioned across 128 threads
+ //! where each thread owns 4 samples.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ //! typedef cub::BlockHistogram BlockHistogram;
+ //!
+ //! // Allocate shared memory for BlockHistogram
+ //! __shared__ typename BlockHistogram::TempStorage temp_storage;
+ //!
+ //! // Allocate shared memory for block-wide histogram bin counts
+ //! __shared__ unsigned int smem_histogram[256];
+ //!
+ //! // Obtain input samples per thread
+ //! unsigned char thread_samples[4];
+ //! ...
+ //!
+ //! // Initialize the block-wide histogram
+ //! BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+ //!
+ //! // Update the block-wide histogram
+ //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ //!
+ //! @endrst
+ //!
+ //! @tparam CounterT
+ //! **[inferred]** Histogram counter type
template
__device__ __forceinline__ void InitHistogram(CounterT histogram[BINS])
{
@@ -341,51 +315,52 @@ public:
}
}
- /**
- * @brief Constructs a block-wide histogram in shared/device-accessible memory.
- * Each thread contributes an array of input elements.
- *
- * @par
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4
- * character samples each typedef cub::BlockHistogram
- * BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char thread_samples[4];
- * ...
- *
- * // Compute the block-wide histogram
- * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
- *
- * @endcode
- *
- * @tparam CounterT
- * [inferred] Histogram counter type
- *
- * @param[in] items
- * Calling thread's input values to histogram
- *
- * @param[out] histogram
- * Reference to shared/device-accessible memory histogram
- */
+ //! @rst
+ //! Constructs a block-wide histogram in shared/device-accessible memory.
+ //! Each thread contributes an array of input elements.
+ //!
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ //! are partitioned across 128 threads where each thread owns 4 samples.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads
+ //! // having 4 character samples each
+ //! typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ //!
+ //! // Allocate shared memory for BlockHistogram
+ //! __shared__ typename BlockHistogram::TempStorage temp_storage;
+ //!
+ //! // Allocate shared memory for block-wide histogram bin counts
+ //! __shared__ unsigned int smem_histogram[256];
+ //!
+ //! // Obtain input samples per thread
+ //! unsigned char thread_samples[4];
+ //! ...
+ //!
+ //! // Compute the block-wide histogram
+ //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+ //!
+ //! @endrst
+ //!
+ //! @tparam CounterT
+ //! **[inferred]** Histogram counter type
+ //!
+ //! @param[in] items
+ //! Calling thread's input values to histogram
+ //!
+ //! @param[out] histogram
+ //! Reference to shared/device-accessible memory histogram
template
__device__ __forceinline__ void Histogram(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
@@ -399,55 +374,56 @@ public:
InternalBlockHistogram(temp_storage).Composite(items, histogram);
}
- /**
- * @brief Updates an existing block-wide histogram in shared/device-accessible memory.
- * Each thread composites an array of input elements.
- *
- * @par
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates a the initialization and update of a
- * histogram of 512 integer samples that are partitioned across 128 threads
- * where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4
- * character samples each typedef cub::BlockHistogram
- * BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char thread_samples[4];
- * ...
- *
- * // Initialize the block-wide histogram
- * BlockHistogram(temp_storage).InitHistogram(smem_histogram);
- *
- * // Update the block-wide histogram
- * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
- *
- * @endcode
- *
- * @tparam CounterT
- * [inferred] Histogram counter type
- *
- * @param[in] items
- * Calling thread's input values to histogram
- *
- * @param[out] histogram
- * Reference to shared/device-accessible memory histogram
- */
+ //! @rst
+ //! Updates an existing block-wide histogram in shared/device-accessible memory.
+ //! Each thread composites an array of input elements.
+ //!
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the initialization and update of a
+ //! histogram of 512 integer samples that are partitioned across 128 threads
+ //! where each thread owns 4 samples.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads
+ //! // having 4 character samples each
+ //! typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ //!
+ //! // Allocate shared memory for BlockHistogram
+ //! __shared__ typename BlockHistogram::TempStorage temp_storage;
+ //!
+ //! // Allocate shared memory for block-wide histogram bin counts
+ //! __shared__ unsigned int smem_histogram[256];
+ //!
+ //! // Obtain input samples per thread
+ //! unsigned char thread_samples[4];
+ //! ...
+ //!
+ //! // Initialize the block-wide histogram
+ //! BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+ //!
+ //! // Update the block-wide histogram
+ //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ //!
+ //! @endrst
+ //!
+ //! @tparam CounterT
+ //! **[inferred]** Histogram counter type
+ //!
+ //! @param[in] items
+ //! Calling thread's input values to histogram
+ //!
+ //! @param[out] histogram
+ //! Reference to shared/device-accessible memory histogram
template <typename CounterT>
__device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
@@ -458,4 +434,3 @@ public:
};
CUB_NAMESPACE_END
-
diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh
index 463b981e82b..d95cca4e346 100644
--- a/cub/cub/block/block_load.cuh
+++ b/cub/cub/block/block_load.cuh
@@ -26,10 +26,7 @@
*
******************************************************************************/
-/**
- * @file
- * Operations for reading linear tiles of data into the CUDA thread block.
- */
+//! @file Operations for reading linear tiles of data into the CUDA thread block.
#pragma once
@@ -50,41 +47,35 @@
CUB_NAMESPACE_BEGIN
-/**
- * @addtogroup UtilIo
- * @{
- */
-
-/******************************************************************//**
- * @name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * @blocked
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- */
+//! @name Blocked arrangement I/O (direct)
+//! @{
+
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block.
+//!
+//! @blocked
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//! **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
template <typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
InputIteratorT block_itr,
@@ -98,34 +89,34 @@ __device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
}
}
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded
- * by range.
- *
- * @blocked
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- *
- * @param[in] valid_items
- * Number of valid items to load
- */
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
+//!
+//! @blocked
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//! **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
+//!
+//! @param[in] valid_items
+//! Number of valid items to load
template <typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
InputIteratorT block_itr,
@@ -143,37 +134,38 @@ __device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
}
}
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded
- * by range, with a fall-back assignment of out-of-bound elements..
- *
- * @blocked
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- *
- * @param[in] valid_items
- * Number of valid items to load
- *
- * @param[in] oob_default
- * Default value to assign out-of-bound items
- */
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block, guarded
+//! by range, with a fall-back assignment of out-of-bound elements.
+//!
+//! @blocked
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//!   **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
+//!
+//! @param[in] valid_items
+//! Number of valid items to load
+//!
+//! @param[in] oob_default
+//! Default value to assign out-of-bound items
template <typename T, int ITEMS_PER_THREAD, typename InputIteratorT, typename DefaultT>
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
InputIteratorT block_itr,
@@ -191,19 +183,18 @@ __device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-/**
- * @brief Internal implementation for load vectorization
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_ptr
- * Input pointer for loading from
- *
- * @param[out] items
- * Data to load
- */
+
+//! @brief Internal implementation for load vectorization
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_ptr
+//! Input pointer for loading from
+//!
+//! @param[out] items
+//! Data to load
template <CacheLoadModifier MODIFIER, typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_tid,
T *block_ptr,
@@ -251,35 +242,36 @@ __device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_t
#endif // DOXYGEN_SHOULD_SKIP_THIS
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * @blocked
- *
- * The input offset (@p block_ptr + @p block_offset) must be quad-item aligned
- *
- * The following conditions will prevent vectorization and loading will fall back to
- * cub::BLOCK_LOAD_DIRECT:
- * - @p ITEMS_PER_THREAD is odd
- * - The data type @p T is not a built-in primitive or CUDA vector type
- * (e.g., @p short, @p int2, @p double, @p float2, etc.)
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_ptr
- * Input pointer for loading from
- *
- * @param[out] items
- * Data to load
- */
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block.
+//!
+//! @blocked
+//!
+//! The input offset (``block_ptr + block_offset``) must be quad-item aligned
+//!
+//! The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+//!
+//! - ``ITEMS_PER_THREAD`` is odd
+//! - The data type ``T`` is not a built-in primitive or CUDA vector type
+//! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_ptr
+//! Input pointer for loading from
+//!
+//! @param[out] items
+//! Data to load
template <typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid,
T *block_ptr,
@@ -288,40 +280,38 @@ __device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid,
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items);
}
-
-//@} end member group
-/******************************************************************//**
- * @name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * @brief Load a linear segment of items into a striped arrangement across the thread block.
- *
- * @striped
- *
- * @tparam BLOCK_THREADS
- * The thread block size in threads
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- */
+//! @} end member group
+//! @name Striped arrangement I/O (direct)
+//! @{
+
+//! @rst
+//! Load a linear segment of items into a striped arrangement across the thread block.
+//!
+//! @striped
+//!
+//! @endrst
+//!
+//! @tparam BLOCK_THREADS
+//! The thread block size in threads
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//! **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(int linear_tid,
InputIteratorT block_itr,
@@ -350,37 +340,39 @@ __device__ __forceinline__ void load_transform_direct_striped(
} // namespace detail
-/**
- * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded
- * by range
- *
- * @striped
- *
- * @tparam BLOCK_THREADS
- * The thread block size in threads
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- *
- * @param[in] valid_items
- * Number of valid items to load
- */
+
+//! @rst
+//! Load a linear segment of items into a striped arrangement across the thread block, guarded by range
+//!
+//! @striped
+//!
+//! @endrst
+//!
+//! @tparam BLOCK_THREADS
+//! The thread block size in threads
+//!
+//! @tparam T
+//!   **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//!   **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//!   **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//!   A suitable 1D thread-identifier for the calling thread
+//!   (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
+//!
+//! @param[in] valid_items
+//! Number of valid items to load
+//!
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(int linear_tid,
InputIteratorT block_itr,
@@ -397,40 +389,41 @@ __device__ __forceinline__ void LoadDirectStriped(int linear_tid,
}
}
-/**
- * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded
- * by range, with a fall-back assignment of out-of-bound elements.
- *
- * @striped
- *
- * @tparam BLOCK_THREADS
- * The thread block size in threads
- *
- * @tparam T
- *