diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh
index a2a5017d537..e4a221ccbae 100644
--- a/cub/cub/block/block_adjacent_difference.cuh
+++ b/cub/cub/block/block_adjacent_difference.cuh
@@ -26,12 +26,8 @@
*
******************************************************************************/
-/**
- * @file
- * The cub::BlockAdjacentDifference class provides
- * [collective](index.html#sec0) methods for computing the differences
- * of adjacent elements partitioned across a CUDA thread block.
- */
+//! @file The cub::BlockAdjacentDifference class provides collective methods for computing
+//! the differences of adjacent elements partitioned across a CUDA thread block.
#pragma once
@@ -50,87 +46,81 @@
CUB_NAMESPACE_BEGIN
-/**
- * @brief BlockAdjacentDifference provides
- * [collective](index.html#sec0) methods for computing the
- * differences of adjacent elements partitioned across a CUDA thread
- * block.
- *
- * @ingroup BlockModule
- *
- * @par Overview
- * - BlockAdjacentDifference calculates the differences of adjacent elements in
- * the elements partitioned across a CUDA thread block. Because the binary
- * operation could be noncommutative, there are two sets of methods.
- * Methods named SubtractLeft subtract left element `i - 1` of input sequence
- * from current element `i`. Methods named SubtractRight subtract the right element `i + 1`
- * from the current one `i`:
- * @par
- * @code
- * int values[4]; // [1, 2, 3, 4]
- * //...
- * int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ]
- * int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ]
- * @endcode
- * - For SubtractLeft, if the left element is out of bounds, the
- * input value is assigned to `output[0]` without modification.
- * - For SubtractRight, if the right element is out of bounds, the input value
- * is assigned to the current output value without modification.
- * - The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- * This example can be easily adapted to the storage required by
- * BlockAdjacentDifference.
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference to
- * compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * int result[4];
- *
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- * thread_data,
- * result,
- * CustomDifference());
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }.
- * The corresponding output `result` in those threads will be
- * { [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }.
- *
- */
+//! @rst
+//! BlockAdjacentDifference provides :ref:`collective <collective-primitives>` methods for computing the
+//! differences of adjacent elements partitioned across a CUDA thread block.
+//!
+//! Overview
+//! ++++++++++++++++
+//!
+//! BlockAdjacentDifference calculates the differences of adjacent elements in the elements partitioned across a CUDA
+//! thread block. Because the binary operation could be noncommutative, there are two sets of methods.
+//! Methods named SubtractLeft subtract left element ``i - 1`` of input sequence from current element ``i``.
+//! Methods named SubtractRight subtract the right element ``i + 1`` from the current one ``i``:
+//!
+//! .. code-block:: c++
+//!
+//! int values[4]; // [1, 2, 3, 4]
+//! //...
+//! int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ]
+//! int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ]
+//!
+//! - For SubtractLeft, if the left element is out of bounds, the input value is assigned to ``output[0]``
+//! without modification.
+//! - For SubtractRight, if the right element is out of bounds, the input value is assigned to the current output value
+//! without modification.
+//! - The ``example_block_reduce_dyn_smem.cu`` example under the ``examples/block`` folder illustrates usage of
+//!   dynamically shared memory with BlockReduce and how to re-purpose the same memory region.
+//!   This example can be easily adapted to the storage required by BlockAdjacentDifference.
+//!
+//! A Simple Example
+//! ++++++++++++++++
+//!
+//! The code snippet below illustrates how to use BlockAdjacentDifference to
+//! compute the left difference between adjacent elements.
+//!
+//! .. code-block:: c++
+//!
+//!    #include <cub/block/block_adjacent_difference.cuh>
+//!    // or equivalently <cub/cub.cuh>
+//!
+//! struct CustomDifference
+//! {
+//!      template <typename DataType>
+//! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+//! {
+//! return lhs - rhs;
+//! }
+//! };
+//!
+//! __global__ void ExampleKernel(...)
+//! {
+//! // Specialize BlockAdjacentDifference for a 1D block of
+//! // 128 threads of type int
+//! using BlockAdjacentDifferenceT =
+//!        cub::BlockAdjacentDifference<int, 128>;
+//!
+//! // Allocate shared memory for BlockAdjacentDifference
+//! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+//!
+//! // Obtain a segment of consecutive items that are blocked across threads
+//! int thread_data[4];
+//! ...
+//!
+//! // Collectively compute adjacent_difference
+//! int result[4];
+//!
+//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+//! thread_data,
+//! result,
+//! CustomDifference());
+//!
+//! Suppose the set of input ``thread_data`` across the block of threads is
+//! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+//! The corresponding output ``result`` in those threads will be
+//! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``.
+//!
+//! @endrst
template {};
+ //! @name Collective constructors
+ //! @{
- /***********************************************************************//**
- * @name Collective constructors
- **************************************************************************/
- //@{
-
- /**
- * @brief Collective constructor using a private static allocation of shared
- * memory as temporary storage.
- */
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage
__device__ __forceinline__ BlockAdjacentDifference()
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
- /**
- * @brief Collective constructor using the specified memory allocation as
- * temporary storage.
- *
- * @param[in] temp_storage Reference to memory allocation having layout type TempStorage
- */
+ //! @brief Collective constructor using the specified memory allocation as temporary storage
+ //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage
__device__ __forceinline__ BlockAdjacentDifference(TempStorage &temp_storage)
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
- //@} end member group
- /***********************************************************************//**
- * @name Read left operations
- **************************************************************************/
- //@{
-
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block
- * // of 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- * thread_data,
- * thread_data,
- * CustomDifference());
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * The corresponding output `result` in those threads will be
- * `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to @p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- */
+ //! @} end member group
+ //! @name Read left operations
+ //! @{
+
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block
+ //! // of 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
template
@@ -393,78 +353,77 @@ public:
}
}
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // The last item in the previous tile:
- * int tile_predecessor_item = ...;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * tile_predecessor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * and that `tile_predecessor_item` is `3`. The corresponding output
- * `result` in those threads will be
- * `{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] item which is going to be
- * subtracted from the first tile item (input0 from
- * thread0).
- */
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // The last item in the previous tile:
+ //! int tile_predecessor_item = ...;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! tile_predecessor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+  //! and that ``tile_predecessor_item`` is ``3``. The corresponding output
+ //! ``result`` in those threads will be
+ //! ``{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item which is going to be subtracted from the first tile item
+ //! (*input*\ :sub:`0` from *thread*\ :sub:`0`).
+ //! @endrst
template
@@ -497,73 +456,71 @@ public:
}
}
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- * int valid_items = 9;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * valid_items);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * The corresponding output `result` in those threads will be
- * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] valid_items
- * Number of valid items in thread block
- */
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //! int valid_items = 9;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! valid_items);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] valid_items
+ //! Number of valid items in thread block
template
@@ -615,80 +572,80 @@ public:
}
}
- /**
- * @brief Subtracts the left element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the left difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- * int valid_items = 9;
- * int tile_predecessor_item = 4;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * valid_items,
- * tile_predecessor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
- * The corresponding output `result` in those threads will be
- * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] valid_items
- * Number of valid items in thread block
- *
- * @param[in] tile_predecessor_item
- * **[thread0 only]** item which is going to be
- * subtracted from the first tile item (input0 from
- * thread0).
- */
+ //! @rst
+ //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //! int valid_items = 9;
+ //! int tile_predecessor_item = 4;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! valid_items,
+ //! tile_predecessor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] valid_items
+ //! Number of valid items in thread block
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item which is going to be subtracted from the first tile item
+ //! (*input*\ :sub:`0` from *thread*\ :sub:`0`).
+ //! @endrst
template
@@ -746,74 +703,71 @@ public:
}
}
- //@} end member group
- /******************************************************************//**
- * @name Read right operations
- *********************************************************************/
- //@{
-
- /**
- * @brief Subtracts the right element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the right difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractRight(
- * thread_data,
- * thread_data,
- * CustomDifference());
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`.
- * The corresponding output `result` in those threads will be
- * `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to \p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- */
+ //! @} end member group
+ //! @name Read right operations
+ //! @{
+ //!
+ //! @rst
+ //!
+ //! Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between
+ //! adjacent elements.
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractRight(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``.
+ //! The corresponding output ``result`` in those threads will be
+ //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
template
@@ -845,79 +799,78 @@ public:
}
}
- /**
- * @brief Subtracts the right element of each adjacent pair of elements
- * partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference
- * to compute the right difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // The first item in the next tile:
- * int tile_successor_item = ...;
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractRight(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * tile_successor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`,
- * and that `tile_successor_item` is `3`. The corresponding output `result`
- * in those threads will be
- * `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to @p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] item
- * which is going to be subtracted from the last tile item
- * (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- */
+ //! @rst
+ //! Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between
+ //! adjacent elements.
+ //!
+ //!
+ //! .. code-block:: c++
+ //!
+  //!    #include <cub/block/block_adjacent_difference.cuh>
+  //!    // or equivalently <cub/cub.cuh>
+ //!
+ //! struct CustomDifference
+ //! {
+  //!      template <typename DataType>
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+  //!        cub::BlockAdjacentDifference<int, 128>;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // The first item in the next tile:
+ //! int tile_successor_item = ...;
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractRight(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! tile_successor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``,
+ //! and that ``tile_successor_item`` is ``3``. The corresponding output ``result``
+ //! in those threads will be
+ //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+  //!   *thread*\ :sub:`BLOCK_THREADS - 1` only item which is going to be subtracted from the last tile item
+  //!   (*input*\ :sub:`ITEMS_PER_THREAD - 1` from *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
template
@@ -947,73 +900,72 @@ public:
difference_op(input[ITEMS_PER_THREAD - 1], successor_item);
}
- /**
- * @brief Subtracts the right element of each adjacent pair in range of
- * elements partitioned across a CUDA thread block.
- *
- * @par
- * - \rowmajor
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates how to use @p BlockAdjacentDifference to
- * compute the right difference between adjacent elements.
- *
- * @par
- * @code
- * #include
- * // or equivalently
- *
- * struct CustomDifference
- * {
- * template
- * __device__ DataType operator()(DataType &lhs, DataType &rhs)
- * {
- * return lhs - rhs;
- * }
- * };
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockAdjacentDifference for a 1D block of
- * // 128 threads of type int
- * using BlockAdjacentDifferenceT =
- * cub::BlockAdjacentDifference;
- *
- * // Allocate shared memory for BlockAdjacentDifference
- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute adjacent_difference
- * BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
- * thread_data,
- * thread_data,
- * CustomDifference(),
- * valid_items);
- *
- * @endcode
- * @par
- * Suppose the set of input `thread_data` across the block of threads is
- * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`.
- * and that `valid_items` is `507`. The corresponding output `result` in
- * those threads will be
- * `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }`.
- *
- * @param[out] output
- * Calling thread's adjacent difference result
- *
- * @param[in] input
- * Calling thread's input items (may be aliased to @p output)
- *
- * @param[in] difference_op
- * Binary difference operator
- *
- * @param[in] valid_items
- * Number of valid items in thread block
- */
+ //! @rst
+ //! Subtracts the right element of each adjacent pair in range of elements partitioned across a CUDA thread block.
+ //!
+ //! - @rowmajor
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between
+ //! adjacent elements.
+ //!
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include
+ //! // or equivalently
+ //!
+ //! struct CustomDifference
+ //! {
+ //! template
+ //! __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ //! {
+ //! return lhs - rhs;
+ //! }
+ //! };
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockAdjacentDifference for a 1D block of
+ //! // 128 threads of type int
+ //! using BlockAdjacentDifferenceT =
+ //! cub::BlockAdjacentDifference;
+ //!
+ //! // Allocate shared memory for BlockAdjacentDifference
+ //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute adjacent_difference
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
+ //! thread_data,
+ //! thread_data,
+ //! CustomDifference(),
+ //! valid_items);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``,
+ //! and that ``valid_items`` is ``507``. The corresponding output ``result`` in
+ //! those threads will be
+ //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }``.
+ //! @endrst
+ //!
+ //! @param[out] output
+ //! Calling thread's adjacent difference result
+ //!
+ //! @param[in] input
+ //! Calling thread's input items (may be aliased to `output`)
+ //!
+ //! @param[in] difference_op
+ //! Binary difference operator
+ //!
+ //! @param[in] valid_items
+ //! Number of valid items in thread block
template
@@ -1062,11 +1014,9 @@ public:
}
}
- //@} end member group
- /******************************************************************//**
- * @name Head flag operations (deprecated)
- *********************************************************************/
- //@{
+ //! @} end member group
+ //! @name Head flag operations (deprecated)
+ //! @{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -1557,6 +1507,7 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
+ //! @} end member group
};
diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh
index 2f628ebb779..8d673507573 100644
--- a/cub/cub/block/block_discontinuity.cuh
+++ b/cub/cub/block/block_discontinuity.cuh
@@ -49,82 +49,81 @@
CUB_NAMESPACE_BEGIN
-/**
- * @brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for
- * flagging discontinuities within an ordered set of items partitioned across a CUDA thread
- * block. ![](discont_logo.png)
- *
- * @ingroup BlockModule
- *
- * @tparam T
- * The data type to be flagged.
- *
- * @tparam BLOCK_DIM_X
- * The thread block length in threads along the X dimension
- *
- * @tparam BLOCK_DIM_Y
- * [optional] The thread block length in threads along the Y dimension (default: 1)
- *
- * @tparam BLOCK_DIM_Z
- * [optional] The thread block length in threads along the Z dimension (default: 1)
- *
- * @tparam LEGACY_PTX_ARCH
- * [optional] Unused.
- *
- * @par Overview
- * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
- * that differ from their predecessors (or successors). For example, head flags are convenient
- * for demarcating disjoint data segments as part of a segmented scan or reduction.
- * - \blocked
- *
- * @par Performance Considerations
- * - \granularity
- *
- * @par A Simple Example
- * \blockcollective{BlockDiscontinuity}
- * @par
- * The code snippet below illustrates the head flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute head flags for discontinuities in the segment
- * int head_flags[4];
- * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }.
- * The corresponding output \p head_flags in those threads will be
- * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- *
- * @par Performance Considerations
- * - Incurs zero bank conflicts for most types
- *
- * @par Re-using dynamically allocating shared memory
- * The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- *
- * This example can be easily adapted to the storage required by BlockDiscontinuity.
- */
+//! @rst
+//! The BlockDiscontinuity class provides :ref:`collective ` methods for
+//! flagging discontinuities within an ordered set of items partitioned across a CUDA thread
+//! block.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
+//! that differ from their predecessors (or successors). For example, head flags are convenient
+//! for demarcating disjoint data segments as part of a segmented scan or reduction.
+//! - @blocked
+//!
+//! Performance Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - @granularity
+//! - Incurs zero bank conflicts for most types
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! @blockcollective{BlockDiscontinuity}
+//!
+//! The code snippet below illustrates the head flagging of 512 integer items that
+//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+//! where each thread owns 4 consecutive items.
+//!
+//! .. code-block:: c++
+//!
+//! #include // or equivalently
+//!
+//! __global__ void ExampleKernel(...)
+//! {
+//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+//! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+//!
+//! // Allocate shared memory for BlockDiscontinuity
+//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+//!
+//! // Obtain a segment of consecutive items that are blocked across threads
+//! int thread_data[4];
+//! ...
+//!
+//! // Collectively compute head flags for discontinuities in the segment
+//! int head_flags[4];
+//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+//!
+//! Suppose the set of input ``thread_data`` across the block of threads is
+//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
+//! The corresponding output ``head_flags`` in those threads will be
+//! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
+//!
+//! Re-using dynamically allocating shared memory
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The ``examples/block/example_block_reduce_dyn_smem.cu`` example illustrates usage of
+//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region.
+//! This example can be easily adapted to the storage required by BlockDiscontinuity.
+//! @endrst
+//!
+//! @tparam T
+//! The data type to be flagged.
+//!
+//! @tparam BLOCK_DIM_X
+//! The thread block length in threads along the X dimension
+//!
+//! @tparam BLOCK_DIM_Y
+//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
+//!
+//! @tparam BLOCK_DIM_Z
+//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! **[optional]** Unused
template <
typename T,
int BLOCK_DIM_X,
@@ -135,18 +134,12 @@ class BlockDiscontinuity
{
private:
- /******************************************************************************
- * Constants and type definitions
- ******************************************************************************/
-
- /// Constants
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};
-
/// Shared memory storage layout type (last element from each thread's input)
struct _TempStorage
{
@@ -154,11 +147,6 @@ private:
T last_items[BLOCK_THREADS];
};
-
- /******************************************************************************
- * Utility methods
- ******************************************************************************/
-
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
@@ -272,10 +260,8 @@ public:
struct TempStorage : Uninitialized<_TempStorage> {};
- /******************************************************************//**
- * @name Collective constructors
- *********************************************************************/
- //@{
+ //! @name Collective constructors
+ //! @{
/**
* @brief Collective constructor using a private static allocation of shared memory as temporary
@@ -298,11 +284,9 @@ public:
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
- //@} end member group
- /******************************************************************//**
- * \name Head flag operations
- *********************************************************************/
- //@{
+ //! @} end member group
+ //! @name Head flag operations
+ //! @{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -388,74 +372,72 @@ public:
#endif // DOXYGEN_SHOULD_SKIP_THIS
- /**
- * @brief Sets head flags indicating discontinuities between items partitioned across the thread
- * block, for which the first item has no reference and is always flagged.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns \p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute head flags for discontinuities in the segment
- * int head_flags[4];
- * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }.
- * The corresponding output \p head_flags in those threads will be
- * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true
- * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank
- * of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets head flags indicating discontinuities between items partitioned across the thread
+ //! block, for which the first item has no reference and is always flagged.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns
+ //! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
+ //! the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute head flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
+ //! The corresponding output ``head_flags`` in those threads will be
+ //! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`.
+ //! `b_index` is the rank of b in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -465,84 +447,81 @@ public:
FlagHeads(head_flags, input, preds, flag_op);
}
- /**
- * @brief Sets head flags indicating discontinuities between items partitioned across the thread
- * block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns \p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is compared
- * against \p tile_predecessor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread0 obtain the predecessor item for the entire tile
- * int tile_predecessor_item;
- * if (threadIdx.x == 0) tile_predecessor_item == ...
- *
- * // Collectively compute head flags for discontinuities in the segment
- * int head_flags[4];
- * BlockDiscontinuity(temp_storage).FlagHeads(
- * head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... },
- * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those
- * threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index),
- * and returning \p true if a discontinuity exists between \p a and \p b,
- * otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] Item with which to compare the first tile item
- * (input0 from thread0).
- */
+ //! @rst
+ //! Sets head flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
+ //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item
+ //! in the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread0 obtain the predecessor item for the entire tile
+ //! int tile_predecessor_item;
+ //! if (threadIdx.x == 0) tile_predecessor_item = ...
+ //!
+ //! // Collectively compute head flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeads(
+ //! head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
+ //! and that ``tile_predecessor_item`` is ``0``. The corresponding output ``head_flags`` in those
+ //! threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`,
+ //! and returning `true` if a discontinuity exists between `a` and `b`,
+ //! otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity `head_flags`
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
+ //! @endrst
template
__device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -554,82 +533,77 @@ public:
}
-
- //@} end member group
- /******************************************************************//**
- * @name Tail flag operations
- *********************************************************************/
- //@{
-
- /**
- * @brief Sets tail flags indicating discontinuities between items partitioned across the thread
- * block, for which the last item has no reference and is always flagged.
- *
- * @par
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns \p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is always flagged.
- * - @blocked
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute tail flags for discontinuities in the segment
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }.
- * The corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true
- * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @} end member group
+ //! @name Tail flag operations
+ //! @{
+
+ //! @rst
+ //! Sets tail flags indicating discontinuities between items partitioned across the thread
+ //! block, for which the last item has no reference and is always flagged.
+ //!
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when
+ //! ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where ``next-item`` is either the next item
+ //! in the same thread or the first item in the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute tail flags for discontinuities in the segment
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
+ //! The corresponding output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -653,86 +627,84 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets tail flags indicating discontinuities between items partitioned across the thread
- * block.
- *
- * @par
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is compared
- * against @p tile_successor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute tail flags for discontinuities in the segment
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * tail_flags, thread_data, cub::Inequality(), tile_successor_item);
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that @p tile_successor_item is @p 125. The corresponding output @p tail_flags in those
- * threads will be { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] Item with which to
- * compare the last tile item (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- */
+ //! @rst
+ //! Sets tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in
+ //! the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared against
+ //! ``tile_successor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread127 obtain the successor item for the entire tile
+ //! int tile_successor_item;
+ //! if (threadIdx.x == 127) tile_successor_item = ...
+ //!
+ //! // Collectively compute tail flags for discontinuities in the segment
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagTails(
+ //! tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
+ //! and that ``tile_successor_item`` is ``125``. The corresponding output ``tail_flags`` in those
+ //! threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+ //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to
+ //! compare the last tile item (``input[ITEMS_PER_THREAD - 1]`` from
+ //! *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
template
__device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
@@ -760,94 +732,86 @@ public:
}
- //@} end member group
- /******************************************************************//**
- * @name Head & tail flag operations
- *********************************************************************/
- //@{
-
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is always flagged.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tail_flags, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that the tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true
- * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @} end member group
+ //! @name Head & tail flag operations
+ //! @{
+
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns
+ //! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
+ //! the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where next-item is either the next item in the same thread or the first item in
+ //! the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tail_flags, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
+ //! The corresponding output ``head_flags`` in those threads will be
+ //! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``, and the corresponding
+ //! output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
@@ -894,98 +858,93 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is always flagged.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is compared
- * against @p tile_predecessor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that the tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the
- * rank of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] Item with which to compare
- * the last tile item (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when
+ //! ``flag_op(previous-item, input[i])`` returns ``true`` (where ``previous-item`` is either the preceding item
+ //! in the same thread or the last item in the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` returns ``true``
+ //! (where ``next-item`` is either the next item in the same thread or the first item in the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared
+ //!   against ``tile_successor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread127 obtain the successor item for the entire tile
+ //! int tile_successor_item;
+ //! if (threadIdx.x == 127) tile_successor_item = ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
+ //! and that the ``tile_successor_item`` is ``125``. The corresponding output ``head_flags``
+ //! in those threads will be ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``,
+ //! and the corresponding output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
+ //! rank of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+ //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare
+ //! the last tile item (``input[ITEMS_PER_THREAD - 1]`` from
+ //! *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
@@ -1034,103 +993,97 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is compared
- * against @p tile_predecessor_item.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread0 obtain the predecessor item for the entire tile
- * int tile_predecessor_item;
- * if (threadIdx.x == 0) tile_predecessor_item == ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
- * thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] },
- * that the @p tile_predecessor_item is @p 0, and that the
- * @p tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank
- * of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] Item with which to compare the first tile item
- * (input0 from thread0).
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
+ //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item
+ //! in the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when
+ //! ``flag_op(input[i], next-item)`` returns ``true`` (where ``next-item`` is either the next item
+ //! in the same thread or the first item in the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item
+ //! ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread0 obtain the predecessor item for the entire tile
+ //! int tile_predecessor_item;
+ //! if (threadIdx.x == 0) tile_predecessor_item = ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tile_predecessor_item, tail_flags,
+ //! thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
+ //! and that the ``tile_predecessor_item`` is ``0``.
+ //! The corresponding output ``head_flags`` in those threads will be
+ //! ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``, and the corresponding output ``tail_flags``
+ //! in those threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank
+ //! of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
+ //! @endrst
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
@@ -1173,109 +1126,104 @@ public:
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
- /**
- * @brief Sets both head and tail flags indicating discontinuities between items partitioned
- * across the thread block.
- *
- * @par
- * - The flag head_flagsi is set for item
- * inputi when
- * flag_op(previous-item, inputi)
- * returns @p true (where previous-item is either the preceding item
- * in the same thread or the last item in the previous thread).
- * - For thread0, item input0 is compared
- * against @p tile_predecessor_item.
- * - The flag tail_flagsi is set for item
- * inputi when
- * flag_op(inputi, next-item)
- * returns @p true (where next-item is either the next item
- * in the same thread or the first item in the next thread).
- * - For threadBLOCK_THREADS-1, item
- * inputITEMS_PER_THREAD-1 is compared
- * against @p tile_successor_item.
- * - @blocked
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
- * typedef cub::BlockDiscontinuity BlockDiscontinuity;
- *
- * // Allocate shared memory for BlockDiscontinuity
- * __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Have thread0 obtain the predecessor item for the entire tile
- * int tile_predecessor_item;
- * if (threadIdx.x == 0) tile_predecessor_item == ...
- *
- * // Have thread127 obtain the successor item for the entire tile
- * int tile_successor_item;
- * if (threadIdx.x == 127) tile_successor_item == ...
- *
- * // Collectively compute head and flags for discontinuities in the segment
- * int head_flags[4];
- * int tail_flags[4];
- * BlockDiscontinuity(temp_storage).FlagTails(
- * head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
- * thread_data, cub::Inequality());
- *
- * @endcode
- * @par
- * Suppose the set of input @p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] },
- * that the @p tile_predecessor_item is @p 0, and that the
- * @p tile_successor_item is @p 125. The corresponding output @p head_flags
- * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
- * and the corresponding output @p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam FlagT
- * [inferred] The flag type (must be an integer type)
- *
- * @tparam FlagOp
- * [inferred] Binary predicate functor type having member
- * T operator()(const T &a, const T &b) or member
- * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true
- * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank
- * of b in the aggregate tile of data.
- *
- * @param[out] head_flags
- * Calling thread's discontinuity head_flags
- *
- * @param[in] tile_predecessor_item
- * [thread0 only] Item with which to compare the first tile item
- * (input0 from thread0).
- *
- * @param[out] tail_flags
- * Calling thread's discontinuity tail_flags
- *
- * @param[in] tile_successor_item
- * [threadBLOCK_THREADS-1 only] Item with which to compare
- * the last tile item (inputITEMS_PER_THREAD-1 from
- * threadBLOCK_THREADS-1).
- *
- * @param[in] input
- * Calling thread's input items
- *
- * @param[in] flag_op
- * Binary boolean flag predicate
- */
+ //! @rst
+ //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+ //!
+ //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
+ //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
+ //! the previous thread).
+ //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
+ //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
+ //! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in
+ //! the next thread).
+ //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared
+ //! against ``tile_successor_item``.
+ //! - @blocked
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+ //! are partitioned in a :ref:`blocked arrangement ` across 128 threads
+ //! where each thread owns 4 consecutive items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ //! typedef cub::BlockDiscontinuity BlockDiscontinuity;
+ //!
+ //! // Allocate shared memory for BlockDiscontinuity
+ //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Have thread0 obtain the predecessor item for the entire tile
+ //! int tile_predecessor_item;
+ //! if (threadIdx.x == 0) tile_predecessor_item = ...
+ //!
+ //! // Have thread127 obtain the successor item for the entire tile
+ //! int tile_successor_item;
+ //! if (threadIdx.x == 127) tile_successor_item = ...
+ //!
+ //! // Collectively compute head and tail flags for discontinuities in the segment
+ //! int head_flags[4];
+ //! int tail_flags[4];
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+ //! thread_data, cub::Inequality());
+ //!
+ //! Suppose the set of input ``thread_data`` across the block of threads is
+ //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
+ //! that the ``tile_predecessor_item`` is ``0``, and that the
+ //! ``tile_successor_item`` is ``125``. The corresponding output ``head_flags``
+ //! in those threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``,
+ //! and the corresponding output ``tail_flags`` in those threads will be
+ //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
+ //! @endrst
+ //!
+ //! @tparam ITEMS_PER_THREAD
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
+ //!
+ //! @tparam FlagT
+ //! **[inferred]** The flag type (must be an integer type)
+ //!
+ //! @tparam FlagOp
+ //! **[inferred]** Binary predicate functor type having member
+ //! `T operator()(const T &a, const T &b)` or member
+ //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
+ //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank
+ //! of `b` in the aggregate tile of data.
+ //!
+ //! @param[out] head_flags
+ //! Calling thread's discontinuity head_flags
+ //!
+ //! @param[in] tile_predecessor_item
+ //! @rst
+ //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
+ //! @endrst
+ //!
+ //! @param[out] tail_flags
+ //! Calling thread's discontinuity tail_flags
+ //!
+ //! @param[in] tile_successor_item
+ //! @rst
+ //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare the last tile item
+ //! (``input[ITEMS_PER_THREAD - 1]`` from *thread*\ :sub:`BLOCK_THREADS - 1`).
+ //! @endrst
+ //!
+ //! @param[in] input
+ //! Calling thread's input items
+ //!
+ //! @param[in] flag_op
+ //! Binary boolean flag predicate
template
__device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
@@ -1322,10 +1270,7 @@ public:
}
-
-
- //@} end member group
-
+ //! @} end member group
};
diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh
index c99b9eba84d..c56fad3314c 100644
--- a/cub/cub/block/block_exchange.cuh
+++ b/cub/cub/block/block_exchange.cuh
@@ -26,10 +26,8 @@
*
******************************************************************************/
-/**
- * \file
- * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
- */
+//! @file The cub::BlockExchange class provides :ref:`collective ` methods for
+//! rearranging data partitioned across a CUDA thread block.
#pragma once
@@ -50,89 +48,106 @@
CUB_NAMESPACE_BEGIN
-/**
- * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T The data type to be exchanged.
- * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread.
- * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
- * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam LEGACY_PTX_ARCH [optional] Unused.
- *
- * \par Overview
- * - It is commonplace for blocks of threads to rearrange data items between
- * threads. For example, the device-accessible memory subsystem prefers access patterns
- * where data items are "striped" across threads (where consecutive threads access consecutive items),
- * yet most block-wide operations prefer a "blocked" partitioning of items across threads
- * (where consecutive items belong to a single thread).
- * - BlockExchange supports the following types of data exchanges:
- * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements
- * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements
- * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3)
- * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockExchange}
- * \par
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * \par
- * \code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Load a tile of data striped across threads
- * int thread_data[4];
- * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- * // Collectively exchange data into a blocked arrangement across threads
- * BlockExchange(temp_storage).StripedToBlocked(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of striped input \p thread_data across the block of threads is
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }.
- * The corresponding output \p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * \par Performance Considerations
- * - Proper device-specific padding ensures zero bank conflicts for most types.
- *
- * \par Re-using dynamically allocating shared memory
- * The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- *
- * This example can be easily adapted to the storage required by BlockExchange.
- */
-template <
- typename InputT,
- int BLOCK_DIM_X,
- int ITEMS_PER_THREAD,
- bool WARP_TIME_SLICING = false,
- int BLOCK_DIM_Y = 1,
- int BLOCK_DIM_Z = 1,
- int LEGACY_PTX_ARCH = 0>
+//! @rst
+//! The BlockExchange class provides :ref:`collective <collective-primitives>` methods for rearranging data partitioned
+//! across a CUDA thread block.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - It is commonplace for blocks of threads to rearrange data items between
+//! threads. For example, the device-accessible memory subsystem prefers access patterns
+//! where data items are "striped" across threads (where consecutive threads access consecutive items),
+//! yet most block-wide operations prefer a "blocked" partitioning of items across threads
+//! (where consecutive items belong to a single thread).
+//! - BlockExchange supports the following types of data exchanges:
+//!
+//! - Transposing between :ref:`blocked <flexible-data-arrangement>` and :ref:`striped <flexible-data-arrangement>`
+//! arrangements
+//! - Transposing between :ref:`blocked <flexible-data-arrangement>` and
+//! :ref:`warp-striped <flexible-data-arrangement>` arrangements
+//! - Scattering ranked items to a :ref:`blocked arrangement <flexible-data-arrangement>`
+//! - Scattering ranked items to a :ref:`striped arrangement <flexible-data-arrangement>`
+//!
+//! - @rowmajor
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! @blockcollective{BlockExchange}
+//!
+//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+//! of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+//!
+//! .. code-block:: c++
+//!
+//! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+//!
+//! __global__ void ExampleKernel(int *d_data, ...)
+//! {
+//! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+//! typedef cub::BlockExchange BlockExchange;
+//!
+//! // Allocate shared memory for BlockExchange
+//! __shared__ typename BlockExchange::TempStorage temp_storage;
+//!
+//! // Load a tile of data striped across threads
+//! int thread_data[4];
+//! cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+//!
+//! // Collectively exchange data into a blocked arrangement across threads
+//! BlockExchange(temp_storage).StripedToBlocked(thread_data);
+//!
+//! Suppose the set of striped input ``thread_data`` across the block of threads is
+//! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``.
+//! The corresponding output ``thread_data`` in those threads will be
+//! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+//!
+//! Performance Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - Proper device-specific padding ensures zero bank conflicts for most types.
+//!
+//! Re-using dynamically allocating shared memory
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with
+//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to
+//! the storage required by BlockExchange.
+//! @endrst
+//!
+//! @tparam T
+//! The data type to be exchanged
+//!
+//! @tparam BLOCK_DIM_X
+//! The thread block length in threads along the X dimension
+//!
+//! @tparam ITEMS_PER_THREAD
+//! The number of items partitioned onto each thread.
+//!
+//! @tparam WARP_TIME_SLICING
+//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data,
+//! time-slicing the block-wide exchange over multiple synchronized rounds.
+//! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
+//!
+//! @tparam BLOCK_DIM_Y
+//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
+//!
+//! @tparam BLOCK_DIM_Z
+//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! [optional] Unused.
+template <typename InputT,
+          int BLOCK_DIM_X,
+          int ITEMS_PER_THREAD,
+          bool WARP_TIME_SLICING  = false,
+          int BLOCK_DIM_Y         = 1,
+          int BLOCK_DIM_Z         = 1,
+          int LEGACY_PTX_ARCH     = 0>
class BlockExchange
{
private:
-
- /******************************************************************************
- * Constants
- ******************************************************************************/
-
/// Constants
enum
{
@@ -161,10 +176,6 @@ private:
PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
};
- /******************************************************************************
- * Type definitions
- ******************************************************************************/
-
/// Shared memory storage layout type
struct __align__(16) _TempStorage
{
@@ -173,16 +184,11 @@ private:
public:
- /// \smemstorage{BlockExchange}
+ /// @smemstorage{BlockExchange}
struct TempStorage : Uninitialized<_TempStorage> {};
private:
-
- /******************************************************************************
- * Thread fields
- ******************************************************************************/
-
/// Shared storage reference
_TempStorage &temp_storage;
@@ -192,11 +198,6 @@ private:
unsigned int warp_id;
unsigned int warp_offset;
-
- /******************************************************************************
- * Utility methods
- ******************************************************************************/
-
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
@@ -204,16 +205,14 @@ private:
return private_storage;
}
- /**
- * @brief Transposes data items from blocked arrangement to striped
- * arrangement. Specialized for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -239,16 +238,14 @@ private:
}
}
- /**
- * @brief Transposes data items from blocked arrangement to striped
- * arrangement. Specialized for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **striped**
+ //! arrangement. Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -305,16 +302,14 @@ private:
}
}
- /**
- * @brief Transposes data items from blocked arrangement to warp-striped
- * arrangement. Specialized for no timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+ //! Specialized for no timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -340,16 +335,14 @@ private:
}
}
- /**
- * @brief Transposes data items from blocked arrangement to warp-striped
- * arrangement. Specialized for warp-timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+ //! Specialized for warp-timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -406,16 +399,14 @@ private:
}
}
- /**
- * @brief Transposes data items from striped arrangement to blocked
- * arrangement. Specialized for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -442,16 +433,14 @@ private:
}
}
- /**
- * @brief Transposes data items from striped arrangement to blocked
- * arrangement. Specialized for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement.
+ //! Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -510,16 +499,14 @@ private:
}
}
- /**
- * @brief Transposes data items from warp-striped arrangement to blocked
- * arrangement. Specialized for no timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+ //! Specialized for no timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -546,16 +533,14 @@ private:
}
}
- /**
- * @brief Transposes data items from warp-striped arrangement to blocked
- * arrangement. Specialized for warp-timeslicing
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- */
+ //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+ //! Specialized for warp-timeslicing
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
template <typename OutputT>
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -590,19 +575,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized
- * for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **blocked** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -629,19 +612,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized
- * for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **blocked** arrangement.
+ //! Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -691,19 +672,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement. Specialized
- * for no timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **striped** arrangement.
+ //! Specialized for no timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -730,19 +709,17 @@ private:
}
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement. Specialized
- * for warp-timeslicing.
- *
- * @param[in] input_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[out] output_items
- * Items to exchange, converting between blocked and striped arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @brief Exchanges data items annotated by rank into **striped** arrangement.
+ //! Specialized for warp-timeslicing.
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items to exchange, converting between **blocked** and **striped** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -803,10 +780,8 @@ private:
public:
- /******************************************************************//**
- * @name Collective constructors
- *********************************************************************/
- //@{
+ //! @name Collective constructors
+ //! @{
/**
* @brief Collective constructor using a private static allocation of shared memory as temporary storage.
@@ -835,54 +810,51 @@ public:
{}
- //@} end member group
- /******************************************************************//**
- * @name Structured exchanges
- *********************************************************************/
- //@{
-
- /**
- * @brief Transposes data items from striped arrangement to blocked
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Load a tile of ordered data into a striped arrangement across block threads
- * int thread_data[4];
- * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- * // Collectively exchange data into a blocked arrangement across threads
- * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of striped input @p thread_data across the block of threads is
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from
- * device-accessible memory. The corresponding output @p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @} end member group
+ //! @name Structured exchanges
+ //! @{
+
+ //! @rst
+ //! Transposes data items from **striped** arrangement to **blocked** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+ //! of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Load a tile of ordered data into a striped arrangement across block threads
+ //! int thread_data[4];
+ //! cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ //!
+ //! // Collectively exchange data into a blocked arrangement across threads
+ //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+ //!
+ //! Suppose the set of striped input ``thread_data`` across the block of threads is
+ //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from
+ //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ template <typename OutputT>
__device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -890,52 +862,51 @@ public:
StripedToBlocked(input_items, output_items, Int2Type());
}
- /**
- * @brief Transposes data items from blocked arrangement to striped
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively exchange data into a striped arrangement across threads
- * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
- *
- * // Store data striped across block threads into an ordered tile
- * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of blocked input @p thread_data across the block of threads is
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- * The corresponding output @p thread_data in those threads will be
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in
- * preparation for storing to device-accessible memory.
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @rst
+ //! Transposes data items from **blocked** arrangement to **striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+ //! of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively exchange data into a striped arrangement across threads
+ //! BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
+ //!
+ //! // Store data striped across block threads into an ordered tile
+ //! cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ //!
+ //! Suppose the set of blocked input ``thread_data`` across the block of threads is
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in
+ //! preparation for storing to device-accessible memory.
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -943,51 +914,51 @@ public:
BlockedToStriped(input_items, output_items, Int2Type());
}
- /**
- * @brief Transposes data items from warp-striped arrangement to blocked
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked"
- * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
- * items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Load a tile of ordered data into a warp-striped arrangement across warp threads
- * int thread_data[4];
- * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data);
- *
- * // Collectively exchange data into a blocked arrangement across threads
- * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of warp-striped input @p thread_data across the block of threads is
- * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
- * after loading from device-accessible memory. (The first 128 items are striped across
- * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
- * The corresponding output @p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @rst
+ //! Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "warp-striped" to a "blocked"
+ //! arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
+ //! items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Load a tile of ordered data into a warp-striped arrangement across warp threads
+ //! int thread_data[4];
+ //! cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
+ //!
+ //! // Collectively exchange data into a blocked arrangement across threads
+ //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+ //!
+ //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is
+ //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }``
+ //! after loading from device-accessible memory. (The first 128 items are striped across
+ //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+ //! The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ template <typename OutputT>
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -995,55 +966,54 @@ public:
WarpStripedToBlocked(input_items, output_items, Int2Type());
}
- /**
- * @brief Transposes data items from blocked arrangement to warp-striped
- * arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped"
- * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
- * items.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- * typedef cub::BlockExchange BlockExchange;
- *
- * // Allocate shared memory for BlockExchange
- * __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- * // Obtain a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * ...
- *
- * // Collectively exchange data into a warp-striped arrangement across threads
- * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
- *
- * // Store data striped across warp threads into an ordered tile
- * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
- *
- * @endcode
- * @par
- * Suppose the set of blocked input @p thread_data across the block of threads is
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- * The corresponding output @p thread_data in those threads will be
- * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
- * in preparation for storing to device-accessible memory. (The first 128 items are striped
- * across the first warp of 32 threads, the second 128 items are striped across the second warp,
- * etc.)
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- */
+ //! @rst
+ //! Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the conversion from a "blocked" to a "warp-striped"
+ //! arrangement of 512 integer items partitioned across 128 threads where each thread owns 4
+ //! items.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+ //!
+ //! __global__ void ExampleKernel(int *d_data, ...)
+ //! {
+ //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ //! typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ //!
+ //! // Allocate shared memory for BlockExchange
+ //! __shared__ typename BlockExchange::TempStorage temp_storage;
+ //!
+ //! // Obtain a segment of consecutive items that are blocked across threads
+ //! int thread_data[4];
+ //! ...
+ //!
+ //! // Collectively exchange data into a warp-striped arrangement across threads
+ //! BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
+ //!
+ //! // Store data striped across warp threads into an ordered tile
+ //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
+ //!
+ //! Suppose the set of blocked input ``thread_data`` across the block of threads is
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
+ //! The corresponding output ``thread_data`` in those threads will be
+ //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }``
+ //! in preparation for storing to device-accessible memory. (The first 128 items are striped
+ //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+ //! @endrst
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
template
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
@@ -1051,32 +1021,27 @@ public:
BlockedToWarpStriped(input_items, output_items, Int2Type());
}
-
-
- //@} end member group
- /******************************************************************//**
- * @name Scatter exchanges
- *********************************************************************/
- //@{
-
- /**
- * @brief Exchanges data items annotated by rank into blocked arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @} end member group
+ //! @name Scatter exchanges
+ //! @{
+
+ //! @rst
+ //! Exchanges data items annotated by rank into **blocked** arrangement.
+ //!
+ //! - @smemreuse
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template
__device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -1085,24 +1050,24 @@ public:
ScatterToBlocked(input_items, output_items, ranks, Int2Type());
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @rst
+ //! Exchanges data items annotated by rank into **striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template
__device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
@@ -1111,25 +1076,24 @@ public:
ScatterToStriped(input_items, output_items, ranks, Int2Type());
}
- /**
- * @brief Exchanges data items annotated by rank into striped arrangement.
- * Items with rank -1 are not exchanged.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- */
+ //! @rst
+ //! Exchanges data items annotated by rank into **striped** arrangement. Items with rank -1 are not exchanged.
+ //!
+ //! - @smemreuse
+ //!
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
template
__device__ __forceinline__ void
ScatterToStripedGuarded(InputT (&input_items)[ITEMS_PER_THREAD],
@@ -1156,30 +1120,30 @@ public:
}
}
- /**
- * @brief Exchanges valid data items annotated by rank into striped arrangement.
- *
- * @par
- * - @smemreuse
- *
- * @tparam OffsetT
- * [inferred] Signed integer type for local offsets
- *
- * @tparam ValidFlag
- * [inferred] FlagT type denoting which items are valid
- *
- * @param[in] input_items
- * Items to exchange, converting between striped and blocked arrangements.
- *
- * @param[out] output_items
- * Items from exchange, converting between striped and blocked arrangements.
- *
- * @param[in] ranks
- * Corresponding scatter ranks
- *
- * @param[in] is_valid
- * Corresponding flag denoting item validity
- */
+ //! @rst
+ //! Exchanges valid data items annotated by rank into **striped** arrangement.
+ //!
+ //! - @smemreuse
+ //!
+ //! @endrst
+ //!
+ //! @tparam OffsetT
+ //! **[inferred]** Signed integer type for local offsets
+ //!
+ //! @tparam ValidFlag
+ //! **[inferred]** FlagT type denoting which items are valid
+ //!
+ //! @param[in] input_items
+ //! Items to exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[out] output_items
+ //! Items from exchange, converting between **striped** and **blocked** arrangements.
+ //!
+ //! @param[in] ranks
+ //! Corresponding scatter ranks
+ //!
+ //! @param[in] is_valid
+ //! Corresponding flag denoting item validity
template
__device__ __forceinline__ void
ScatterToStripedFlagged(InputT (&input_items)[ITEMS_PER_THREAD],
@@ -1207,16 +1171,13 @@ public:
}
}
-
- //@} end member group
-
-
+ //! @} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1225,7 +1186,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1234,7 +1195,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1243,7 +1204,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*/
__device__ __forceinline__ void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD])
{
@@ -1252,7 +1213,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
@@ -1266,7 +1227,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
@@ -1280,7 +1241,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
@@ -1294,7 +1255,7 @@ public:
/**
* @param[in-out] items
- * Items to exchange, converting between striped and blocked arrangements.
+ * Items to exchange, converting between **striped** and **blocked** arrangements.
*
* @param[in] ranks
* Corresponding scatter ranks
diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh
index 1ef84dc5a20..e8c3e6cb490 100644
--- a/cub/cub/block/block_histogram.cuh
+++ b/cub/cub/block/block_histogram.cuh
@@ -50,158 +50,147 @@
CUB_NAMESPACE_BEGIN
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of
- * block-wide histograms.
- */
+//! @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of
+//! block-wide histograms.
enum BlockHistogramAlgorithm
{
- /**
- * @par Overview
- * Sorting followed by differentiation. Execution is comprised of two phases:
- * -# Sort the data using efficient radix sort
- * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
- *
- * @par Performance Considerations
- * Delivers consistent throughput regardless of sample bin distribution.
- */
- BLOCK_HISTO_SORT,
-
-
- /**
- * @par Overview
- * Use atomic addition to update byte counts directly
- *
- * @par Performance Considerations
- * Performance is strongly tied to the hardware implementation of atomic
- * addition, and may be significantly degraded for non uniformly-random
- * input distributions where many concurrent updates are likely to be
- * made to the same bin counter.
- */
- BLOCK_HISTO_ATOMIC,
+ //! @rst
+ //!
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Sorting followed by differentiation. Execution is comprised of two phases:
+ //!
+ //! #. Sort the data using efficient radix sort
+ //! #. Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Delivers consistent throughput regardless of sample bin distribution.
+ //!
+ //! @endrst
+ BLOCK_HISTO_SORT,
+
+ //! @rst
+ //!
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Use atomic addition to update byte counts directly
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! Performance is strongly tied to the hardware implementation of atomic
+ //! addition, and may be significantly degraded for non uniformly-random
+ //! input distributions where many concurrent updates are likely to be
+ //! made to the same bin counter.
+ //!
+ //! @endrst
+ BLOCK_HISTO_ATOMIC,
};
-
-
-/******************************************************************************
- * Block histogram
- ******************************************************************************/
-
-/**
- * @brief The BlockHistogram class provides [collective](index.html#sec0) methods for
- * constructing block-wide histograms from data samples partitioned across a CUDA thread
- * block. ![](histogram_logo.png)
- *
- * @ingroup BlockModule
- *
- * @tparam T
- * The sample type being histogrammed (must be castable to an integer bin identifier)
- *
- * @tparam BLOCK_DIM_X
- * The thread block length in threads along the X dimension
- *
- * @tparam ITEMS_PER_THREAD
- * The number of items per thread
- *
- * @tparam BINS
- * The number bins within the histogram
- *
- * @tparam ALGORITHM
- * [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm
- * to use (default: cub::BLOCK_HISTO_SORT)
- *
- * @tparam BLOCK_DIM_Y
- * [optional] The thread block length in threads along the Y dimension (default: 1)
- *
- * @tparam BLOCK_DIM_Z
- * [optional] The thread block length in threads along the Z dimension (default: 1)
- *
- * @tparam LEGACY_PTX_ARCH
- * [optional] Unused.
- *
- * @par Overview
- * - A histogram
- * counts the number of observations that fall into each of the disjoint categories (known as
- * bins).
- * - The `T` type must be implicitly castable to an integer type.
- * - BlockHistogram expects each integral `input[i]` value to satisfy
- * `0 <= input[i] < BINS`. Values outside of this range result in undefined
- * behavior.
- * - BlockHistogram can be optionally specialized to use different algorithms:
- * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref
- * cub::BlockHistogramAlgorithm)
- * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly.
- * [More...](\ref cub::BlockHistogramAlgorithm)
- *
- * @par Performance Considerations
- * - @granularity
- *
- * @par A Simple Example
- * @blockcollective{BlockHistogram}
- * @par
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character
- * samples each typedef cub::BlockHistogram BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char data[4];
- * ...
- *
- * // Compute the block-wide histogram
- * BlockHistogram(temp_storage).Histogram(data, smem_histogram);
- *
- * @endcode
- *
- * @par Performance and Usage Considerations
- * - All input values must fall between [0, BINS), or behavior is undefined.
- * - The histogram output can be constructed in shared or device-accessible memory
- * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
- *
- * @par Re-using dynamically allocating shared memory
- * The following example under the examples/block folder illustrates usage of
- * dynamically shared memory with BlockReduce and how to re-purpose
- * the same memory region:
- * example_block_reduce_dyn_smem.cu
- *
- * This example can be easily adapted to the storage required by BlockHistogram.
- */
-template <
- typename T,
- int BLOCK_DIM_X,
- int ITEMS_PER_THREAD,
- int BINS,
- BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
- int BLOCK_DIM_Y = 1,
- int BLOCK_DIM_Z = 1,
- int LEGACY_PTX_ARCH = 0>
+//! @rst
+//! The BlockHistogram class provides :ref:`collective ` methods for
+//! constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - A `histogram <https://en.wikipedia.org/wiki/Histogram>`_ counts the number of observations that fall into
+//! each of the disjoint categories (known as *bins*).
+//! - The ``T`` type must be implicitly castable to an integer type.
+//! - BlockHistogram expects each integral ``input[i]`` value to satisfy
+//! ``0 <= input[i] < BINS``. Values outside of this range result in undefined behavior.
+//! - BlockHistogram can be optionally specialized to use different algorithms:
+//!
+//! #. :cpp:enumerator:`cub::BLOCK_HISTO_SORT`: Sorting followed by differentiation.
+//! #. :cpp:enumerator:`cub::BLOCK_HISTO_ATOMIC`: Use atomic addition to update byte counts directly.
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! @blockcollective{BlockHistogram}
+//!
+//! The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+//! are partitioned across 128 threads where each thread owns 4 samples.
+//!
+//! .. code-block:: c++
+//!
+//! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+//!
+//! __global__ void ExampleKernel(...)
+//! {
+//! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+//! typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+//!
+//! // Allocate shared memory for BlockHistogram
+//! __shared__ typename BlockHistogram::TempStorage temp_storage;
+//!
+//! // Allocate shared memory for block-wide histogram bin counts
+//! __shared__ unsigned int smem_histogram[256];
+//!
+//! // Obtain input samples per thread
+//! unsigned char data[4];
+//! ...
+//!
+//! // Compute the block-wide histogram
+//! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+//!
+//! Performance and Usage Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - @granularity
+//! - All input values must fall between ``[0, BINS)``, or behavior is undefined.
+//! - The histogram output can be constructed in shared or device-accessible memory
+//! - See ``cub::BlockHistogramAlgorithm`` for performance details regarding algorithmic alternatives
+//!
+//! Re-using dynamically allocating shared memory
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with
+//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage
+//! required by BlockHistogram.
+//! @endrst
+//!
+//! @tparam T
+//! The sample type being histogrammed (must be castable to an integer bin identifier)
+//!
+//! @tparam BLOCK_DIM_X
+//! The thread block length in threads along the X dimension
+//!
+//! @tparam ITEMS_PER_THREAD
+//! The number of items per thread
+//!
+//! @tparam BINS
+//! The number of bins within the histogram
+//!
+//! @tparam ALGORITHM
+//! **[optional]** cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use
+//! (default: cub::BLOCK_HISTO_SORT)
+//!
+//! @tparam BLOCK_DIM_Y
+//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
+//!
+//! @tparam BLOCK_DIM_Z
+//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! **[optional]** Unused.
+template
class BlockHistogram
{
private:
- /******************************************************************************
- * Constants and type definitions
- ******************************************************************************/
-
/// Constants
enum
{
@@ -223,22 +212,12 @@ private:
/// Shared memory storage layout type for BlockHistogram
typedef typename InternalBlockHistogram::TempStorage _TempStorage;
-
- /******************************************************************************
- * Thread fields
- ******************************************************************************/
-
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
-
- /******************************************************************************
- * Utility methods
- ******************************************************************************/
-
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
@@ -246,21 +225,16 @@ private:
return private_storage;
}
-
public:
/// @smemstorage{BlockHistogram}
struct TempStorage : Uninitialized<_TempStorage> {};
- /******************************************************************//**
- * @name Collective constructors
- *********************************************************************/
- //@{
+ //! @name Collective constructors
+ //! @{
- /**
- * @brief Collective constructor using a private static allocation of shared memory as temporary storage.
- */
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ BlockHistogram()
:
temp_storage(PrivateStorage()),
@@ -279,50 +253,50 @@ public:
{}
- //@} end member group
- /******************************************************************//**
- * @name Histogram operations
- *********************************************************************/
- //@{
-
-
- /**
- * @brief Initialize the shared histogram counters to zero.
- *
- * @par Snippet
- * The code snippet below illustrates a the initialization and update of a
- * histogram of 512 integer samples that are partitioned across 128 threads
- * where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
- * typedef cub::BlockHistogram BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char thread_samples[4];
- * ...
- *
- * // Initialize the block-wide histogram
- * BlockHistogram(temp_storage).InitHistogram(smem_histogram);
- *
- * // Update the block-wide histogram
- * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
- *
- * @endcode
- *
- * @tparam CounterT
- * [inferred] Histogram counter type
- */
+ //! @} end member group
+ //! @name Histogram operations
+ //! @{
+
+
+ //! @rst
+ //! Initialize the shared histogram counters to zero.
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the initialization and update of a
+ //! histogram of 512 integer samples that are partitioned across 128 threads
+ //! where each thread owns 4 samples.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ //! typedef cub::BlockHistogram BlockHistogram;
+ //!
+ //! // Allocate shared memory for BlockHistogram
+ //! __shared__ typename BlockHistogram::TempStorage temp_storage;
+ //!
+ //! // Allocate shared memory for block-wide histogram bin counts
+ //! __shared__ unsigned int smem_histogram[256];
+ //!
+ //! // Obtain input samples per thread
+ //! unsigned char thread_samples[4];
+ //! ...
+ //!
+ //! // Initialize the block-wide histogram
+ //! BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+ //!
+ //! // Update the block-wide histogram
+ //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ //!
+ //! @endrst
+ //!
+ //! @tparam CounterT
+ //! **[inferred]** Histogram counter type
template
__device__ __forceinline__ void InitHistogram(CounterT histogram[BINS])
{
@@ -341,51 +315,52 @@ public:
}
}
- /**
- * @brief Constructs a block-wide histogram in shared/device-accessible memory.
- * Each thread contributes an array of input elements.
- *
- * @par
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4
- * character samples each typedef cub::BlockHistogram
- * BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char thread_samples[4];
- * ...
- *
- * // Compute the block-wide histogram
- * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
- *
- * @endcode
- *
- * @tparam CounterT
- * [inferred] Histogram counter type
- *
- * @param[in] items
- * Calling thread's input values to histogram
- *
- * @param[out] histogram
- * Reference to shared/device-accessible memory histogram
- */
+ //! @rst
+ //! Constructs a block-wide histogram in shared/device-accessible memory.
+ //! Each thread contributes an array of input elements.
+ //!
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ //! are partitioned across 128 threads where each thread owns 4 samples.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads
+ //! // having 4 character samples each
+ //! typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ //!
+ //! // Allocate shared memory for BlockHistogram
+ //! __shared__ typename BlockHistogram::TempStorage temp_storage;
+ //!
+ //! // Allocate shared memory for block-wide histogram bin counts
+ //! __shared__ unsigned int smem_histogram[256];
+ //!
+ //! // Obtain input samples per thread
+ //! unsigned char thread_samples[4];
+ //! ...
+ //!
+ //! // Compute the block-wide histogram
+ //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+ //!
+ //! @endrst
+ //!
+ //! @tparam CounterT
+ //! **[inferred]** Histogram counter type
+ //!
+ //! @param[in] items
+ //! Calling thread's input values to histogram
+ //!
+ //! @param[out] histogram
+ //! Reference to shared/device-accessible memory histogram
template
__device__ __forceinline__ void Histogram(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
@@ -399,55 +374,56 @@ public:
InternalBlockHistogram(temp_storage).Composite(items, histogram);
}
- /**
- * @brief Updates an existing block-wide histogram in shared/device-accessible memory.
- * Each thread composites an array of input elements.
- *
- * @par
- * - @granularity
- * - @smemreuse
- *
- * @par Snippet
- * The code snippet below illustrates a the initialization and update of a
- * histogram of 512 integer samples that are partitioned across 128 threads
- * where each thread owns 4 samples.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(...)
- * {
- * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4
- * character samples each typedef cub::BlockHistogram
- * BlockHistogram;
- *
- * // Allocate shared memory for BlockHistogram
- * __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- * // Allocate shared memory for block-wide histogram bin counts
- * __shared__ unsigned int smem_histogram[256];
- *
- * // Obtain input samples per thread
- * unsigned char thread_samples[4];
- * ...
- *
- * // Initialize the block-wide histogram
- * BlockHistogram(temp_storage).InitHistogram(smem_histogram);
- *
- * // Update the block-wide histogram
- * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
- *
- * @endcode
- *
- * @tparam CounterT
- * [inferred] Histogram counter type
- *
- * @param[in] items
- * Calling thread's input values to histogram
- *
- * @param[out] histogram
- * Reference to shared/device-accessible memory histogram
- */
+ //! @rst
+ //! Updates an existing block-wide histogram in shared/device-accessible memory.
+ //! Each thread composites an array of input elements.
+ //!
+ //! - @granularity
+ //! - @smemreuse
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates the initialization and update of a
+ //! histogram of 512 integer samples that are partitioned across 128 threads
+ //! where each thread owns 4 samples.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ //!
+ //! __global__ void ExampleKernel(...)
+ //! {
+ //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads
+ //! // having 4 character samples each
+ //! typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ //!
+ //! // Allocate shared memory for BlockHistogram
+ //! __shared__ typename BlockHistogram::TempStorage temp_storage;
+ //!
+ //! // Allocate shared memory for block-wide histogram bin counts
+ //! __shared__ unsigned int smem_histogram[256];
+ //!
+ //! // Obtain input samples per thread
+ //! unsigned char thread_samples[4];
+ //! ...
+ //!
+ //! // Initialize the block-wide histogram
+ //! BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+ //!
+ //! // Update the block-wide histogram
+ //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ //!
+ //! @endrst
+ //!
+ //! @tparam CounterT
+ //! **[inferred]** Histogram counter type
+ //!
+ //! @param[in] items
+ //! Calling thread's input values to histogram
+ //!
+ //! @param[out] histogram
+ //! Reference to shared/device-accessible memory histogram
template <typename CounterT>
__device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
@@ -458,4 +434,3 @@ public:
};
CUB_NAMESPACE_END
-
diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh
index 463b981e82b..d95cca4e346 100644
--- a/cub/cub/block/block_load.cuh
+++ b/cub/cub/block/block_load.cuh
@@ -26,10 +26,7 @@
*
******************************************************************************/
-/**
- * @file
- * Operations for reading linear tiles of data into the CUDA thread block.
- */
+//! @file Operations for reading linear tiles of data into the CUDA thread block.
#pragma once
@@ -50,41 +47,35 @@
CUB_NAMESPACE_BEGIN
-/**
- * @addtogroup UtilIo
- * @{
- */
-
-/******************************************************************//**
- * @name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * @blocked
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- */
+//! @name Blocked arrangement I/O (direct)
+//! @{
+
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block.
+//!
+//! @blocked
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//! **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
template <typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
InputIteratorT block_itr,
@@ -98,34 +89,34 @@ __device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
}
}
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded
- * by range.
- *
- * @blocked
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- *
- * @param[in] valid_items
- * Number of valid items to load
- */
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
+//!
+//! @blocked
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//! **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
+//!
+//! @param[in] valid_items
+//! Number of valid items to load
template <typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
InputIteratorT block_itr,
@@ -143,37 +134,38 @@ __device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
}
}
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded
- * by range, with a fall-back assignment of out-of-bound elements..
- *
- * @blocked
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- *
- * @param[in] valid_items
- * Number of valid items to load
- *
- * @param[in] oob_default
- * Default value to assign out-of-bound items
- */
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block, guarded
+//! by range, with a fall-back assignment of out-of-bound elements.
+//!
+//! @blocked
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//!   **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
+//!
+//! @param[in] valid_items
+//! Number of valid items to load
+//!
+//! @param[in] oob_default
+//! Default value to assign out-of-bound items
template <typename T, int ITEMS_PER_THREAD, typename InputIteratorT, typename DefaultT>
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
InputIteratorT block_itr,
@@ -191,19 +183,18 @@ __device__ __forceinline__ void LoadDirectBlocked(int linear_tid,
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-/**
- * @brief Internal implementation for load vectorization
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_ptr
- * Input pointer for loading from
- *
- * @param[out] items
- * Data to load
- */
+
+//! @brief Internal implementation for load vectorization
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_ptr
+//! Input pointer for loading from
+//!
+//! @param[out] items
+//! Data to load
template <CacheLoadModifier MODIFIER, typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_tid,
T *block_ptr,
@@ -251,35 +242,36 @@ __device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_t
#endif // DOXYGEN_SHOULD_SKIP_THIS
-/**
- * @brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * @blocked
- *
- * The input offset (@p block_ptr + @p block_offset) must be quad-item aligned
- *
- * The following conditions will prevent vectorization and loading will fall back to
- * cub::BLOCK_LOAD_DIRECT:
- * - @p ITEMS_PER_THREAD is odd
- * - The data type @p T is not a built-in primitive or CUDA vector type
- * (e.g., @p short, @p int2, @p double, @p float2, etc.)
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_ptr
- * Input pointer for loading from
- *
- * @param[out] items
- * Data to load
- */
+//! @rst
+//! Load a linear segment of items into a blocked arrangement across the thread block.
+//!
+//! @blocked
+//!
+//! The input offset (``block_ptr + block_offset``) must be quad-item aligned
+//!
+//! The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+//!
+//! - ``ITEMS_PER_THREAD`` is odd
+//! - The data type ``T`` is not a built-in primitive or CUDA vector type
+//! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
+//!
+//! @endrst
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_ptr
+//! Input pointer for loading from
+//!
+//! @param[out] items
+//! Data to load
template <typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid,
T *block_ptr,
@@ -288,40 +280,38 @@ __device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid,
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items);
}
-
-//@} end member group
-/******************************************************************//**
- * @name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * @brief Load a linear segment of items into a striped arrangement across the thread block.
- *
- * @striped
- *
- * @tparam BLOCK_THREADS
- * The thread block size in threads
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- */
+//! @} end member group
+//! @name Striped arrangement I/O (direct)
+//! @{
+
+//! @rst
+//! Load a linear segment of items into a striped arrangement across the thread block.
+//!
+//! @striped
+//!
+//! @endrst
+//!
+//! @tparam BLOCK_THREADS
+//! The thread block size in threads
+//!
+//! @tparam T
+//! **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//! **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//! A suitable 1D thread-identifier for the calling thread
+//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(int linear_tid,
InputIteratorT block_itr,
@@ -350,37 +340,39 @@ __device__ __forceinline__ void load_transform_direct_striped(
} // namespace detail
-/**
- * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded
- * by range
- *
- * @striped
- *
- * @tparam BLOCK_THREADS
- * The thread block size in threads
- *
- * @tparam T
- * [inferred] The data type to load.
- *
- * @tparam ITEMS_PER_THREAD
- * [inferred] The number of consecutive items partitioned onto each thread.
- *
- * @tparam InputIteratorT
- * [inferred] The random-access iterator type for input \iterator.
- *
- * @param[in] linear_tid
- * A suitable 1D thread-identifier for the calling thread
- * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
- *
- * @param[in] block_itr
- * The thread block's base input iterator for loading from
- *
- * @param[out] items
- * Data to load
- *
- * @param[in] valid_items
- * Number of valid items to load
- */
+
+//! @rst
+//! Load a linear segment of items into a striped arrangement across the thread block, guarded by range
+//!
+//! @striped
+//!
+//! @endrst
+//!
+//! @tparam BLOCK_THREADS
+//! The thread block size in threads
+//!
+//! @tparam T
+//!   **[inferred]** The data type to load.
+//!
+//! @tparam ITEMS_PER_THREAD
+//!   **[inferred]** The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam InputIteratorT
+//!   **[inferred]** The random-access iterator type for input iterator.
+//!
+//! @param[in] linear_tid
+//!   A suitable 1D thread-identifier for the calling thread
+//!   (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks)
+//!
+//! @param[in] block_itr
+//! The thread block's base input iterator for loading from
+//!
+//! @param[out] items
+//! Data to load
+//!
+//! @param[in] valid_items
+//! Number of valid items to load
+//!
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(int linear_tid,
InputIteratorT block_itr,
@@ -397,40 +389,41 @@ __device__ __forceinline__ void LoadDirectStriped(int linear_tid,
}
}
-/**
- * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded
- * by range, with a fall-back assignment of out-of-bound elements.
- *
- * @striped
- *
- * @tparam BLOCK_THREADS
- * The thread block size in threads
- *
- * @tparam T
- *