From a00de21b5e79aa5c398efeac50b37d99d580c859 Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Thu, 30 Jan 2025 10:46:01 -0800
Subject: [PATCH] Remove CUB `DeviceSpMV` (#3549)

---
 cub/cub/agent/agent_spmv_orig.cuh             | 764 ---------------
 cub/cub/cub.cuh                               |   1 -
 cub/cub/device/device_spmv.cuh                | 216 ----
 .../device/dispatch/dispatch_spmv_orig.cuh    | 924 ------------------
 cub/test/test_device_spmv.cu                  | 611 ------------
 5 files changed, 2516 deletions(-)
 delete mode 100644 cub/cub/agent/agent_spmv_orig.cuh
 delete mode 100644 cub/cub/device/device_spmv.cuh
 delete mode 100644 cub/cub/device/dispatch/dispatch_spmv_orig.cuh
 delete mode 100644 cub/test/test_device_spmv.cu
diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh
deleted file mode 100644
index 90a5e3aa6c9..00000000000
--- a/cub/cub/agent/agent_spmv_orig.cuh
+++ /dev/null
@@ -1,764 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- */
-
-#pragma once
-
-#include <cub/config.cuh>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include <cub/block/block_exchange.cuh>
-#include <cub/block/block_reduce.cuh>
-#include <cub/block/block_scan.cuh>
-#include <cub/iterator/cache_modified_input_iterator.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
-#include <cub/thread/thread_operators.cuh>
-#include <cub/thread/thread_search.cuh>
-#include <cub/util_type.cuh>
-
-#include <cuda/std/__algorithm/max.h>
-#include <cuda/std/__algorithm/min.h>
-#include <cuda/std/type_traits>
-
-#include <iterator>
-
-CUB_NAMESPACE_BEGIN
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * @param Parameterizable tuning policy type for AgentSpmv
- *
- * @tparam _BLOCK_THREADS
- *   Threads per thread block
- *
- * @tparam _ITEMS_PER_THREAD
- *   Items per thread (per tile of input)
- *
- * @tparam _ROW_OFFSETS_SEARCH_LOAD_MODIFIER
- *   Cache load modifier for reading CSR row-offsets during search
- *
- * @tparam _ROW_OFFSETS_LOAD_MODIFIER
- *   Cache load modifier for reading CSR row-offsets
- *
- * @tparam _COLUMN_INDICES_LOAD_MODIFIER
- *   Cache load modifier for reading CSR column-indices
- *
- * @tparam _VALUES_LOAD_MODIFIER
- *   Cache load modifier for reading CSR values
- *
- * @tparam _VECTOR_VALUES_LOAD_MODIFIER
- *   Cache load modifier for reading vector values
- *
- * @tparam _DIRECT_LOAD_NONZEROS
- *   Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through
- * shared memory)
- *
- * @tparam _SCAN_ALGORITHM
- *   The BlockScan algorithm to use
- */
-template <int _BLOCK_THREADS,
-          int _ITEMS_PER_THREAD,
-          CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
-          CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER,
-          CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER,
-          CacheLoadModifier _VALUES_LOAD_MODIFIER,
-          CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER,
-          bool _DIRECT_LOAD_NONZEROS,
-          BlockScanAlgorithm _SCAN_ALGORITHM>
-struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmvPolicy
-{
-  enum
-  {
-    /// Threads per thread block
-    BLOCK_THREADS = _BLOCK_THREADS,
-
-    /// Items per thread (per tile of input)
-    ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-
-    /// Whether to load nonzeros directly from global during sequential merging (pre-staged through
-    /// shared memory)
-    DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS,
-  };
-
-  /// Cache load modifier for reading CSR row-offsets
-  static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;
-
-  /// Cache load modifier for reading CSR row-offsets
-  static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER;
-
-  /// Cache load modifier for reading CSR column-indices
-  static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER;
-
-  /// Cache load modifier for reading CSR values
-  static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER;
-
-  /// Cache load modifier for reading vector values
-  static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER;
-
-  /// The BlockScan algorithm to use
-  static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
-};
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * @tparam ValueT
- *   Matrix and vector value type
- *
- * @tparam OffsetT
- *   Signed integer type for sequence offsets
- */
-template <typename ValueT, typename OffsetT>
-struct
-// with NVHPC, we get a deprecation warning in the implementation of cudaLaunchKernelEx, which we cannot suppress :/
-#if !_CCCL_COMPILER(NVHPC)
-  CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-#endif
-    SpmvParams
-{
-  /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix
-  /// <b>A</b>.
-  const ValueT* d_values;
-
-  /// Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices
-  /// and \p d_values
-  const OffsetT* d_row_end_offsets;
-
-  /// Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements
-  /// of matrix <b>A</b>.  (Indices are zero-valued.)
-  const OffsetT* d_column_indices;
-
-  /// Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-  const ValueT* d_vector_x;
-
-  /// Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
-  ValueT* d_vector_y;
-
-  /// Number of rows of matrix <b>A</b>.
-  int num_rows;
-
-  /// Number of columns of matrix <b>A</b>.
-  int num_cols;
-
-  /// Number of nonzero elements of matrix <b>A</b>.
-  int num_nonzeros;
-
-  /// Alpha multiplicand
-  ValueT alpha;
-
-  /// Beta addend-multiplicand
-  ValueT beta;
-};
-
-/**
- * @brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- *
- * @tparam AgentSpmvPolicyT
- *   Parameterized AgentSpmvPolicy tuning policy type
- *
- * @tparam ValueT
- *   Matrix and vector value type
- *
- * @tparam OffsetT
- *   Signed integer type for sequence offsets
- *
- * @tparam HAS_ALPHA
- *   Whether the input parameter \p alpha is 1
- *
- * @tparam HAS_BETA
- *   Whether the input parameter \p beta is 0
- *
- * @tparam LEGACY_PTX_ARCH
- *   PTX compute capability (unused)
- */
-template <typename AgentSpmvPolicyT,
-          typename ValueT,
-          typename OffsetT,
-          bool HAS_ALPHA,
-          bool HAS_BETA,
-          int LEGACY_PTX_ARCH = 0>
-struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
-{
-  //---------------------------------------------------------------------
-  // Types and constants
-  //---------------------------------------------------------------------
-
-  /// Constants
-  enum
-  {
-    BLOCK_THREADS    = AgentSpmvPolicyT::BLOCK_THREADS,
-    ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD,
-    TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
-  };
-
-  /// 2D merge path coordinate type
-  using CoordinateT = typename CubVector<OffsetT, 2>::Type;
-
-  /// Input iterator wrapper types (for applying cache modifiers)
-
-  using RowOffsetsSearchIteratorT =
-    CacheModifiedInputIterator<AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT>;
-
-  using RowOffsetsIteratorT = CacheModifiedInputIterator<AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, OffsetT, OffsetT>;
-
-  using ColumnIndicesIteratorT =
-    CacheModifiedInputIterator<AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, OffsetT, OffsetT>;
-
-  using ValueIteratorT = CacheModifiedInputIterator<AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, ValueT, OffsetT>;
-
-  using VectorValueIteratorT =
-    CacheModifiedInputIterator<AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, ValueT, OffsetT>;
-
-  // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-  using KeyValuePairT = KeyValuePair<OffsetT, ValueT>;
-
-  // Reduce-value-by-segment scan operator
-  using ReduceBySegmentOpT = ReduceByKeyOp<::cuda::std::plus<>>;
-
-  // BlockReduce specialization
-  using BlockReduceT = BlockReduce<ValueT, BLOCK_THREADS, BLOCK_REDUCE_WARP_REDUCTIONS>;
-
-  // BlockScan specialization
-  using BlockScanT = BlockScan<KeyValuePairT, BLOCK_THREADS, AgentSpmvPolicyT::SCAN_ALGORITHM>;
-
-  // BlockScan specialization
-  using BlockPrefixSumT = BlockScan<ValueT, BLOCK_THREADS, AgentSpmvPolicyT::SCAN_ALGORITHM>;
-
-  // BlockExchange specialization
-  using BlockExchangeT = BlockExchange<ValueT, BLOCK_THREADS, ITEMS_PER_THREAD>;
-
-  /// Merge item type (either a non-zero value or a row-end offset)
-  union MergeItem
-  {
-    // Value type to pair with index type OffsetT
-    // (NullType if loading values directly during merge)
-    using MergeValueT = ::cuda::std::_If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>;
-
-    OffsetT row_end_offset;
-    MergeValueT nonzero;
-  };
-
-  /// Shared memory type required by this thread block
-  struct _TempStorage
-  {
-    CoordinateT tile_coords[2];
-
-    union Aliasable
-    {
-      // Smem needed for tile of merge items
-      MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
-
-      // Smem needed for block exchange
-      typename BlockExchangeT::TempStorage exchange;
-
-      // Smem needed for block-wide reduction
-      typename BlockReduceT::TempStorage reduce;
-
-      // Smem needed for tile scanning
-      typename BlockScanT::TempStorage scan;
-
-      // Smem needed for tile prefix sum
-      typename BlockPrefixSumT::TempStorage prefix_sum;
-
-    } aliasable;
-  };
-
-  /// Temporary storage type (unionable)
-  struct TempStorage : Uninitialized<_TempStorage>
-  {};
-
-  //---------------------------------------------------------------------
-  // Per-thread fields
-  //---------------------------------------------------------------------
-
-  /// Reference to temp_storage
-  _TempStorage& temp_storage;
-
-  _CCCL_SUPPRESS_DEPRECATED_PUSH
-  SpmvParams<ValueT, OffsetT>& spmv_params;
-  _CCCL_SUPPRESS_DEPRECATED_POP
-
-  /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements
-  /// of matrix <b>A</b>.
-  ValueIteratorT wd_values;
-
-  /// Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p
-  /// d_column_indices and \p d_values
-  RowOffsetsIteratorT wd_row_end_offsets;
-
-  /// Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero
-  /// elements of matrix <b>A</b>.  (Indices are zero-valued.)
-  ColumnIndicesIteratorT wd_column_indices;
-
-  /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector
-  /// <em>x</em>
-  VectorValueIteratorT wd_vector_x;
-
-  /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector
-  /// <em>x</em>
-  VectorValueIteratorT wd_vector_y;
-
-  //---------------------------------------------------------------------
-  // Interface
-  //---------------------------------------------------------------------
-
-  /**
-   * @param temp_storage
-   *   Reference to temp_storage
-   *
-   * @param spmv_params
-   *   SpMV input parameter bundle
-   */
-  _CCCL_SUPPRESS_DEPRECATED_PUSH
-  _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams<ValueT, OffsetT>& spmv_params)
-      : temp_storage(temp_storage.Alias())
-      , spmv_params(spmv_params)
-      , wd_values(spmv_params.d_values)
-      , wd_row_end_offsets(spmv_params.d_row_end_offsets)
-      , wd_column_indices(spmv_params.d_column_indices)
-      , wd_vector_x(spmv_params.d_vector_x)
-      , wd_vector_y(spmv_params.d_vector_y)
-  {}
-  _CCCL_SUPPRESS_DEPRECATED_POP
-
-  /**
-   * @brief Consume a merge tile, specialized for direct-load of nonzeros
-   *
-   * @param is_direct_load
-   *   Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-   */
-  _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT
-  ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type<true> is_direct_load)
-  {
-    int tile_num_rows               = tile_end_coord.x - tile_start_coord.x;
-    int tile_num_nonzeros           = tile_end_coord.y - tile_start_coord.y;
-    OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
-
-    // Gather the row end-offsets for the merge tile into shared memory
-    for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS)
-    {
-      const OffsetT offset = (::cuda::std::min)(
-        static_cast<OffsetT>(tile_start_coord.x + item), static_cast<OffsetT>(spmv_params.num_rows - 1));
-      s_tile_row_end_offsets[item] = wd_row_end_offsets[offset];
-    }
-
-    __syncthreads();
-
-    // Search for the thread's starting coordinate within the merge tile
-    _CCCL_SUPPRESS_DEPRECATED_PUSH
-    CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
-    _CCCL_SUPPRESS_DEPRECATED_POP
-    CoordinateT thread_start_coord;
-
-    MergePathSearch(
-      OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal
-      s_tile_row_end_offsets, // List A
-      tile_nonzero_indices, // List B
-      tile_num_rows,
-      tile_num_nonzeros,
-      thread_start_coord);
-
-    __syncthreads(); // Perf-sync
-
-    // Compute the thread's merge path segment
-    CoordinateT thread_current_coord = thread_start_coord;
-    KeyValuePairT scan_segment[ITEMS_PER_THREAD];
-
-    ValueT running_total = 0.0;
-
-#pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-      OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
-      OffsetT column_idx  = wd_column_indices[nonzero_idx];
-      ValueT value        = wd_values[nonzero_idx];
-
-      ValueT vector_value = wd_vector_x[column_idx];
-
-      ValueT nonzero = value * vector_value;
-
-      OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
-
-      if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
-      {
-        // Move down (accumulate)
-        running_total += nonzero;
-        scan_segment[ITEM].value = running_total;
-        scan_segment[ITEM].key   = tile_num_rows;
-        ++thread_current_coord.y;
-      }
-      else
-      {
-        // Move right (reset)
-        scan_segment[ITEM].value = running_total;
-        scan_segment[ITEM].key   = thread_current_coord.x;
-        running_total            = 0.0;
-        ++thread_current_coord.x;
-      }
-    }
-
-    __syncthreads();
-
-    // Block-wide reduce-value-by-segment
-    KeyValuePairT tile_carry;
-    ReduceBySegmentOpT scan_op;
-    KeyValuePairT scan_item;
-
-    scan_item.value = running_total;
-    scan_item.key   = thread_current_coord.x;
-
-    BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
-
-    if (tile_num_rows > 0)
-    {
-      if (threadIdx.x == 0)
-      {
-        scan_item.key = -1;
-      }
-
-// Direct scatter
-#pragma unroll
-      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-      {
-        if (scan_segment[ITEM].key < tile_num_rows)
-        {
-          if (scan_item.key == scan_segment[ITEM].key)
-          {
-            scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
-          }
-
-          if (HAS_ALPHA)
-          {
-            scan_segment[ITEM].value *= spmv_params.alpha;
-          }
-
-          if (HAS_BETA)
-          {
-            // Update the output vector element
-            ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
-            scan_segment[ITEM].value += addend;
-          }
-
-          // Set the output vector element
-          spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
-        }
-      }
-    }
-
-    // Return the tile's running carry-out
-    return tile_carry;
-  }
-
-  /**
-   * @brief Consume a merge tile, specialized for indirect load of nonzeros
-   *
-   * @param is_direct_load
-   *   Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-   */
-  _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT
-  ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type<false> is_direct_load)
-  {
-    int tile_num_rows     = tile_end_coord.x - tile_start_coord.x;
-    int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
-
-#if (CUB_PTX_ARCH >= 520)
-
-    OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
-    ValueT* s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
-
-// Gather the nonzeros for the merge tile into shared memory
-#  pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-      int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
-
-      ValueIteratorT a          = wd_values + tile_start_coord.y + nonzero_idx;
-      ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx;
-      ValueT* s                 = s_tile_nonzeros + nonzero_idx;
-
-      if (nonzero_idx < tile_num_nonzeros)
-      {
-        OffsetT column_idx = *ci;
-        ValueT value       = *a;
-
-        ValueT vector_value = wd_vector_x[column_idx];
-
-        ValueT nonzero = value * vector_value;
-
-        *s = nonzero;
-      }
-    }
-
-#else
-
-    OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
-    ValueT* s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
-
-    // Gather the nonzeros for the merge tile into shared memory
-    if (tile_num_nonzeros > 0)
-    {
-#  pragma unroll
-      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-      {
-        int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
-        nonzero_idx     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
-
-        OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx];
-        ValueT value       = wd_values[tile_start_coord.y + nonzero_idx];
-
-        ValueT vector_value = wd_vector_x[column_idx];
-
-        ValueT nonzero = value * vector_value;
-
-        s_tile_nonzeros[nonzero_idx] = nonzero;
-      }
-    }
-
-#endif
-
-// Gather the row end-offsets for the merge tile into shared memory
-#pragma unroll 1
-    for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS)
-    {
-      const OffsetT offset = (::cuda::std::min)(
-        static_cast<OffsetT>(tile_start_coord.x + item), static_cast<OffsetT>(spmv_params.num_rows - 1));
-      s_tile_row_end_offsets[item] = wd_row_end_offsets[offset];
-    }
-
-    __syncthreads();
-
-    // Search for the thread's starting coordinate within the merge tile
-    _CCCL_SUPPRESS_DEPRECATED_PUSH
-    CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
-    _CCCL_SUPPRESS_DEPRECATED_POP
-    CoordinateT thread_start_coord;
-
-    MergePathSearch(
-      OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal
-      s_tile_row_end_offsets, // List A
-      tile_nonzero_indices, // List B
-      tile_num_rows,
-      tile_num_nonzeros,
-      thread_start_coord);
-
-    __syncthreads(); // Perf-sync
-
-    // Compute the thread's merge path segment
-    CoordinateT thread_current_coord = thread_start_coord;
-    KeyValuePairT scan_segment[ITEMS_PER_THREAD];
-    ValueT running_total = 0.0;
-
-    OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
-    ValueT nonzero         = s_tile_nonzeros[thread_current_coord.y];
-
-#pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-      if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
-      {
-        // Move down (accumulate)
-        scan_segment[ITEM].value = nonzero;
-        running_total += nonzero;
-        ++thread_current_coord.y;
-        nonzero = s_tile_nonzeros[thread_current_coord.y];
-      }
-      else
-      {
-        // Move right (reset)
-        scan_segment[ITEM].value = 0.0;
-        running_total            = 0.0;
-        ++thread_current_coord.x;
-        row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
-      }
-
-      scan_segment[ITEM].key = thread_current_coord.x;
-    }
-
-    __syncthreads();
-
-    // Block-wide reduce-value-by-segment
-    KeyValuePairT tile_carry;
-    ReduceBySegmentOpT scan_op;
-    KeyValuePairT scan_item;
-
-    scan_item.value = running_total;
-    scan_item.key   = thread_current_coord.x;
-
-    BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
-
-    if (threadIdx.x == 0)
-    {
-      scan_item.key   = thread_start_coord.x;
-      scan_item.value = 0.0;
-    }
-
-    if (tile_num_rows > 0)
-    {
-      __syncthreads();
-
-      // Scan downsweep and scatter
-      ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
-
-      if (scan_item.key != scan_segment[0].key)
-      {
-        s_partials[scan_item.key] = scan_item.value;
-      }
-      else
-      {
-        scan_segment[0].value += scan_item.value;
-      }
-
-#pragma unroll
-      for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-      {
-        if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
-        {
-          s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
-        }
-        else
-        {
-          scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
-        }
-      }
-
-      __syncthreads();
-
-#pragma unroll 1
-      for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
-      {
-        spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
-      }
-    }
-
-    // Return the tile's running carry-out
-    return tile_carry;
-  }
-
-  /**
-   * @brief Consume input tile
-   *
-   * @param[in] d_tile_coordinates
-   *   Pointer to the temporary array of tile starting coordinates
-   *
-   * @param[out] d_tile_carry_pairs
-   *   Pointer to the temporary array carry-out dot product row-ids, one per block
-   *
-   * @param[in] num_merge_tiles
-   *   Number of merge tiles
-   */
-  _CCCL_DEVICE _CCCL_FORCEINLINE void
-  ConsumeTile(CoordinateT* d_tile_coordinates, KeyValuePairT* d_tile_carry_pairs, int num_merge_tiles)
-  {
-    int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
-
-    if (tile_idx >= num_merge_tiles)
-    {
-      return;
-    }
-
-    // Read our starting coordinates
-    if (threadIdx.x < 2)
-    {
-      if (d_tile_coordinates == nullptr)
-      {
-        // Search our starting coordinates
-        OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
-        CoordinateT tile_coord;
-        _CCCL_SUPPRESS_DEPRECATED_PUSH
-        CountingInputIterator<OffsetT> nonzero_indices(0);
-        _CCCL_SUPPRESS_DEPRECATED_POP
-
-        // Search the merge path
-        MergePathSearch(
-          diagonal,
-          RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
-          nonzero_indices,
-          spmv_params.num_rows,
-          spmv_params.num_nonzeros,
-          tile_coord);
-
-        temp_storage.tile_coords[threadIdx.x] = tile_coord;
-      }
-      else
-      {
-        temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
-      }
-    }
-
-    __syncthreads();
-
-    CoordinateT tile_start_coord = temp_storage.tile_coords[0];
-    CoordinateT tile_end_coord   = temp_storage.tile_coords[1];
-
-    // Consume multi-segment tile
-    KeyValuePairT tile_carry =
-      ConsumeTile(tile_idx, tile_start_coord, tile_end_coord, Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
-
-    // Output the tile's carry-out
-    if (threadIdx.x == 0)
-    {
-      if (HAS_ALPHA)
-      {
-        tile_carry.value *= spmv_params.alpha;
-      }
-
-      tile_carry.key += tile_start_coord.x;
-      if (tile_carry.key >= spmv_params.num_rows)
-      {
-        // FIXME: This works around an invalid memory access in the
-        // fixup kernel. The underlying issue needs to be debugged and
-        // properly fixed, but this hack prevents writes to
-        // out-of-bounds addresses. It doesn't appear to have an effect
-        // on the validity of the results, since this only affects the
-        // carry-over from last tile in the input.
-        tile_carry.key   = spmv_params.num_rows - 1;
-        tile_carry.value = ValueT{};
-      };
-
-      d_tile_carry_pairs[tile_idx] = tile_carry;
-    }
-  }
-};
-
-CUB_NAMESPACE_END
diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh
index 2c4d6dd5f4e..ce55c879e0c 100644
--- a/cub/cub/cub.cuh
+++ b/cub/cub/cub.cuh
@@ -75,7 +75,6 @@
 #include <cub/device/device_segmented_reduce.cuh>
 #include <cub/device/device_segmented_sort.cuh>
 #include <cub/device/device_select.cuh>
-#include <cub/device/device_spmv.cuh>
 #include <cub/device/device_transform.cuh>
 
 // Grid
diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh
deleted file mode 100644
index 241af8cd1d1..00000000000
--- a/cub/cub/device/device_spmv.cuh
+++ /dev/null
@@ -1,216 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-//! @file
-//! cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication
-//! (SpMV).
-
-#pragma once
-
-#include <cub/config.cuh>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include <cub/config.cuh>
-
-#include <cub/detail/nvtx.cuh>
-#include <cub/device/dispatch/dispatch_spmv_orig.cuh>
-
-#include <iterator>
-#include <limits>
-
-#include <stdio.h>
-
-CUB_NAMESPACE_BEGIN
-
-//! @rst
-//! DeviceSpmv provides device-wide parallel operations for performing
-//! sparse-matrix * dense-vector multiplication (SpMV).
-//!
-//! Overview
-//! +++++++++++++++++++++++++++++++++++++++++++++
-//!
-//! The `SpMV computation <http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication>`_
-//! performs the matrix-vector operation ``y = A * x + y``, where:
-//!
-//!  - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in
-//!    `compressed-storage-row (CSR) format
-//!    <https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)>`_ (i.e., three
-//!    arrays:
-//!    ``values``, ``row_offsets``, and ``column_indices``)
-//!  - ``x`` and ``y`` are dense vectors
-//!
-//! Usage Considerations
-//! +++++++++++++++++++++++++++++++++++++++++++++
-//!
-//! @cdp_class{DeviceSpmv}
-//!
-//! @endrst
-struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DeviceSpmv
-{
-  //! @name CSR matrix operations
-  //! @{
-
-  //! @rst
-  //! This function performs the matrix-vector operation ``y = A*x``.
-  //!
-  //! Snippet
-  //! +++++++++++++++++++++++++++++++++++++++++++++
-  //!
-  //! The code snippet below illustrates SpMV upon a 9x9 CSR matrix ``A`` representing a 3x3 lattice (24 non-zeros).
-  //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>
-  //!
-  //!    // Declare, allocate, and initialize device-accessible pointers for input matrix A, input
-  //!    vector x,
-  //!    // and output vector y
-  //!    int    num_rows = 9;
-  //!    int    num_cols = 9;
-  //!    int    num_nonzeros = 24;
-  //!
-  //!    float* d_values;  // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
-  //!                      //        1, 1, 1, 1, 1, 1, 1, 1,
-  //!                      //        1, 1, 1, 1, 1, 1, 1, 1]
-  //!
-  //!    int*   d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
-  //!                             //        4, 6, 1, 3, 5, 7, 2, 4,
-  //!                             //        8, 3, 7, 4, 6, 8, 5, 7]
-  //!
-  //!    int*   d_row_offsets;    // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
-  //!
-  //!    float* d_vector_x;       // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
-  //!    float* d_vector_y;       // e.g., [ ,  ,  ,  ,  ,  ,  ,  ,  ]
-  //!    ...
-  //!
-  //!    // Determine temporary device storage requirements
-  //!    void*    d_temp_storage = nullptr;
-  //!    size_t   temp_storage_bytes = 0;
-  //!    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
-  //!        d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
-  //!        num_rows, num_cols, num_nonzeros);
-  //!
-  //!    // Allocate temporary storage
-  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
-  //!
-  //!    // Run SpMV
-  //!    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
-  //!        d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
-  //!        num_rows, num_cols, num_nonzeros);
-  //!
-  //!    // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
-  //!
-  //! @endrst
-  //!
-  //! @tparam ValueT
-  //!   **[inferred]** Matrix and vector value type (e.g., `float`, `double`, etc.)
-  //!
-  //! @param[in] d_temp_storage
-  //!   Device-accessible allocation of temporary storage.
-  //!   When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
-  //!
-  //! @param[in,out] temp_storage_bytes
-  //!   Reference to size in bytes of `d_temp_storage` allocation
-  //!
-  //! @param[in] d_values
-  //!   Pointer to the array of `num_nonzeros` values of the corresponding nonzero elements
-  //!   of matrix `A`.
-  //!
-  //! @param[in] d_row_offsets
-  //!   Pointer to the array of `m + 1` offsets demarcating the start of every row in
-  //!   `d_column_indices` and `d_values` (with the final entry being equal to `num_nonzeros`)
-  //!
-  //! @param[in] d_column_indices
-  //!   Pointer to the array of `num_nonzeros` column-indices of the corresponding nonzero
-  //!   elements of matrix `A`. (Indices are zero-valued.)
-  //!
-  //! @param[in] d_vector_x
-  //!   Pointer to the array of `num_cols` values corresponding to the dense input vector `x`
-  //!
-  //! @param[out] d_vector_y
-  //!   Pointer to the array of `num_rows` values corresponding to the dense output vector `y`
-  //!
-  //! @param[in] num_rows
-  //!   number of rows of matrix `A`.
-  //!
-  //! @param[in] num_cols
-  //!   number of columns of matrix `A`.
-  //!
-  //! @param[in] num_nonzeros
-  //!   number of nonzero elements of matrix `A`.
-  //!
-  //! @param[in] stream
-  //!   @rst
-  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  //!   @endrst
-  template <typename ValueT>
-  CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-  CUB_RUNTIME_FUNCTION static cudaError_t
-    CsrMV(void* d_temp_storage,
-          size_t& temp_storage_bytes,
-          const ValueT* d_values,
-          const int* d_row_offsets,
-          const int* d_column_indices,
-          const ValueT* d_vector_x,
-          ValueT* d_vector_y,
-          int num_rows,
-          int num_cols,
-          int num_nonzeros,
-          cudaStream_t stream = 0)
-  {
-    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV");
-
-    SpmvParams<ValueT, int> spmv_params;
-    spmv_params.d_values          = d_values;
-    spmv_params.d_row_end_offsets = d_row_offsets + 1;
-    spmv_params.d_column_indices  = d_column_indices;
-    spmv_params.d_vector_x        = d_vector_x;
-    spmv_params.d_vector_y        = d_vector_y;
-    spmv_params.num_rows          = num_rows;
-    spmv_params.num_cols          = num_cols;
-    spmv_params.num_nonzeros      = num_nonzeros;
-    spmv_params.alpha             = ValueT{1};
-    spmv_params.beta              = ValueT{0};
-
-    _CCCL_SUPPRESS_DEPRECATED_PUSH
-    return DispatchSpmv<ValueT, int>::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream);
-    _CCCL_SUPPRESS_DEPRECATED_POP
-  }
-
-  //! @}  end member group
-};
-
-CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
deleted file mode 100644
index 16353f392dc..00000000000
--- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ /dev/null
@@ -1,924 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * @file
- * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector
- * multiplication (SpMV).
- */
-
-#pragma once
-
-#include <cub/config.cuh>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include <cub/agent/agent_segment_fixup.cuh>
-#include <cub/agent/agent_spmv_orig.cuh>
-#include <cub/agent/single_pass_scan_operators.cuh>
-#include <cub/grid/grid_queue.cuh>
-#include <cub/thread/thread_search.cuh>
-#include <cub/util_debug.cuh>
-#include <cub/util_device.cuh>
-#include <cub/util_math.cuh>
-#include <cub/util_type.cuh>
-
-#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
-
-#include <cstdio>
-#include <iterator>
-
-#include <nv/target>
-
-CUB_NAMESPACE_BEGIN
-
-/******************************************************************************
- * SpMV kernel entry points
- *****************************************************************************/
-
-/**
- * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile.
- *
- * @tparam AgentSpmvPolicyT
- *   Parameterized SpmvPolicy tuning policy type
- *
- * @tparam ValueT
- *   Matrix and vector value type
- *
- * @tparam OffsetT
- *   Signed integer type for sequence offsets
- *
- * @param[in] spmv_params
- *   SpMV input parameter bundle
- */
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-template <typename AgentSpmvPolicyT, typename ValueT, typename OffsetT>
-CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams<ValueT, OffsetT> spmv_params) //
-  _CCCL_SUPPRESS_DEPRECATED_POP
-{
-  using VectorValueIteratorT =
-    CacheModifiedInputIterator<AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, ValueT, OffsetT>;
-
-  VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);
-
-  int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (row_idx < spmv_params.num_rows)
-  {
-    OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
-    OffsetT nonzero_idx     = spmv_params.d_row_end_offsets[row_idx - 1];
-
-    ValueT value = 0.0;
-    if (end_nonzero_idx != nonzero_idx)
-    {
-      value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
-    }
-
-    spmv_params.d_vector_y[row_idx] = value;
-  }
-}
-
-/**
- * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile.
- *
- * @tparam SpmvPolicyT
- *   Parameterized SpmvPolicy tuning policy type
- *
- * @tparam OffsetT
- *   Signed integer type for sequence offsets
- *
- * @tparam CoordinateT
- *   Merge path coordinate type
- *
- * @tparam SpmvParamsT
- *   SpmvParams type
- *
- * @param[in] num_merge_tiles
- *   Number of SpMV merge tiles (spmv grid size)
- *
- * @param[out] d_tile_coordinates
- *   Pointer to the temporary array of tile starting coordinates
- *
- * @param[in] spmv_params
- *   SpMV input parameter bundle
- */
-template <typename SpmvPolicyT, typename OffsetT, typename CoordinateT, typename SpmvParamsT>
-CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel(
-  int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params)
-{
-  /// Constants
-  enum
-  {
-    BLOCK_THREADS    = SpmvPolicyT::BLOCK_THREADS,
-    ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD,
-    TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
-  };
-
-  using RowOffsetsSearchIteratorT =
-    CacheModifiedInputIterator<SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT>;
-
-  // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
-  int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (tile_idx < num_merge_tiles + 1)
-  {
-    OffsetT diagonal = (tile_idx * TILE_ITEMS);
-    CoordinateT tile_coordinate;
-    _CCCL_SUPPRESS_DEPRECATED_PUSH
-    CountingInputIterator<OffsetT> nonzero_indices(0);
-    _CCCL_SUPPRESS_DEPRECATED_POP
-
-    // Search the merge path
-    MergePathSearch(
-      diagonal,
-      RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
-      nonzero_indices,
-      spmv_params.num_rows,
-      spmv_params.num_nonzeros,
-      tile_coordinate);
-
-    // Output starting offset
-    d_tile_coordinates[tile_idx] = tile_coordinate;
-  }
-}
-
-/**
- * @brief Spmv agent entry point
- *
- * @tparam SpmvPolicyT
- *   Parameterized SpmvPolicy tuning policy type
- *
- * @tparam ScanTileStateT
- *   Tile status interface type
- *
- * @tparam ValueT
- *   Matrix and vector value type
- *
- * @tparam OffsetT
- *   Signed integer type for sequence offsets
- *
- * @tparam CoordinateT
- *   Merge path coordinate type
- *
- * @tparam HAS_ALPHA
- *   Whether the input parameter Alpha is 1
- *
- * @tparam HAS_BETA
- *   Whether the input parameter Beta is 0
- *
- * @param[in] spmv_params
- *   SpMV input parameter bundle
- *
- * @param[in] d_tile_coordinates
- *   Pointer to the temporary array of tile starting coordinates
- *
- * @param[out] d_tile_carry_pairs
- *   Pointer to the temporary array carry-out dot product row-ids, one per block
- *
- * @param[in] num_tiles
- *   Number of merge tiles
- *
- * @param[in] tile_state
- *   Tile status interface for fixup reduce-by-key kernel
- *
- * @param[in] num_segment_fixup_tiles
- *   Number of reduce-by-key tiles (fixup grid size)
- */
-template <typename SpmvPolicyT,
-          typename ScanTileStateT,
-          typename ValueT,
-          typename OffsetT,
-          typename CoordinateT,
-          bool HAS_ALPHA,
-          bool HAS_BETA>
-CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-__launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel(
-  SpmvParams<ValueT, OffsetT> spmv_params,
-  CoordinateT* d_tile_coordinates,
-  KeyValuePair<OffsetT, ValueT>* d_tile_carry_pairs,
-  int num_tiles,
-  ScanTileStateT tile_state,
-  int num_segment_fixup_tiles)
-{
-  // Spmv agent type specialization
-  _CCCL_SUPPRESS_DEPRECATED_PUSH
-  using AgentSpmvT = AgentSpmv<SpmvPolicyT, ValueT, OffsetT, HAS_ALPHA, HAS_BETA>;
-  _CCCL_SUPPRESS_DEPRECATED_POP
-
-  // Shared memory for AgentSpmv
-  __shared__ typename AgentSpmvT::TempStorage temp_storage;
-
-  AgentSpmvT(temp_storage, spmv_params).ConsumeTile(d_tile_coordinates, d_tile_carry_pairs, num_tiles);
-
-  // Initialize fixup tile status
-  tile_state.InitializeStatus(num_segment_fixup_tiles);
-}
-
-/**
- * @tparam ValueT
- *   Matrix and vector value type
- *
- * @tparam OffsetT
- *   Signed integer type for sequence offsets
- *
- * @tparam HAS_BETA
- *   Whether the input parameter Beta is 0
- */
-template <typename ValueT, typename OffsetT, bool HAS_BETA>
-CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams<ValueT, OffsetT> spmv_params)
-{
-  const int row = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-
-  if (row < spmv_params.num_rows)
-  {
-    ValueT result = 0.0;
-
-    _CCCL_IF_CONSTEXPR (HAS_BETA)
-    {
-      result += spmv_params.beta * spmv_params.d_vector_y[row];
-    }
-
-    spmv_params.d_vector_y[row] = result;
-  }
-}
-
-/**
- * @brief Multi-block reduce-by-key sweep kernel entry point
- *
- * @tparam AgentSegmentFixupPolicyT
- *   Parameterized AgentSegmentFixupPolicy tuning policy type
- *
- * @tparam PairsInputIteratorT
- *   Random-access input iterator type for keys
- *
- * @tparam AggregatesOutputIteratorT
- *   Random-access output iterator type for values
- *
- * @tparam OffsetT
- *   Signed integer type for global offsets
- *
- * @tparam ScanTileStateT
- *   Tile status interface type
- *
- * @param[in] d_pairs_in
- *   Pointer to the array carry-out dot product row-ids, one per spmv block
- *
- * @param[in,out] d_aggregates_out
- *   Output value aggregates
- *
- * @param[in] num_items
- *   Total number of items to select from
- *
- * @param[in] num_tiles
- *   Total number of tiles for the entire problem
- *
- * @param[in] tile_state
- *   Tile status interface
- */
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-template <typename AgentSegmentFixupPolicyT,
-          typename PairsInputIteratorT,
-          typename AggregatesOutputIteratorT,
-          typename OffsetT,
-          typename ScanTileStateT>
-CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
-__launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
-  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel(
-    PairsInputIteratorT d_pairs_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    OffsetT num_items,
-    int num_tiles,
-    ScanTileStateT tile_state) //
-  _CCCL_SUPPRESS_DEPRECATED_POP
-{
-  // Thread block type for reducing tiles of value segments
-  using AgentSegmentFixupT =
-    AgentSegmentFixup<AgentSegmentFixupPolicyT,
-                      PairsInputIteratorT,
-                      AggregatesOutputIteratorT,
-                      ::cuda::std::equal_to<>,
-                      ::cuda::std::plus<>,
-                      OffsetT>;
-
-  // Shared memory for AgentSegmentFixup
-  __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
-
-  // Process tiles
-  AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, ::cuda::std::equal_to<>{}, ::cuda::std::plus<>{})
-    .ConsumeRange(num_items, num_tiles, tile_state);
-}
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
- *
- * @tparam ValueT
- *   Matrix and vector value type
- *
- * @tparam OffsetT
- *   Signed integer type for global offsets
- */
-template <typename ValueT, typename OffsetT>
-struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
-{
-  //---------------------------------------------------------------------
-  // Constants and Types
-  //---------------------------------------------------------------------
-
-  enum
-  {
-    INIT_KERNEL_THREADS         = 128,
-    EMPTY_MATRIX_KERNEL_THREADS = 128
-  };
-
-  // SpmvParams bundle type
-  using SpmvParamsT = SpmvParams<ValueT, OffsetT>;
-
-  // 2D merge path coordinate type
-  using CoordinateT = typename CubVector<OffsetT, 2>::Type;
-
-  // Tile status descriptor interface type
-  using ScanTileStateT = ReduceByKeyScanTileState<ValueT, OffsetT>;
-
-  // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-  using KeyValuePairT = KeyValuePair<OffsetT, ValueT>;
-
-  //---------------------------------------------------------------------
-  // Tuning policies
-  //---------------------------------------------------------------------
-
-  /// SM50
-  struct Policy500
-  {
-    using SpmvPolicyT =
-      AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128,
-                      (sizeof(ValueT) > 4) ? 6 : 7,
-                      LOAD_LDG,
-                      LOAD_DEFAULT,
-                      (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
-                      (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
-                      LOAD_LDG,
-                      (sizeof(ValueT) > 4) ? true : false,
-                      (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>;
-
-    using SegmentFixupPolicyT =
-      AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE>;
-  };
-
-  /// SM60
-  struct Policy600
-  {
-    using SpmvPolicyT =
-      AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128,
-                      (sizeof(ValueT) > 4) ? 5 : 7,
-                      LOAD_DEFAULT,
-                      LOAD_DEFAULT,
-                      LOAD_DEFAULT,
-                      LOAD_DEFAULT,
-                      LOAD_DEFAULT,
-                      false,
-                      BLOCK_SCAN_WARP_SCANS>;
-
-    using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>;
-  };
-
-  //---------------------------------------------------------------------
-  // Tuning policies of current PTX compiler pass
-  //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 600)
-  using PtxPolicy = Policy600;
-
-#else
-  using PtxPolicy = Policy500;
-#endif
-
-  // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-  struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT
-  {};
-  struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT
-  {};
-
-  //---------------------------------------------------------------------
-  // Utilities
-  //---------------------------------------------------------------------
-
-  /**
-   * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-   */
-  template <typename KernelConfig>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static void
-  InitConfigs(int ptx_version, KernelConfig& spmv_config, KernelConfig& segment_fixup_config)
-  {
-    NV_IF_TARGET(
-      NV_IS_DEVICE,
-      ( // We're on the device, so initialize the kernel dispatch
-        // configurations with the current PTX policy
-        spmv_config.template Init<PtxSpmvPolicyT>(); segment_fixup_config.template Init<PtxSegmentFixupPolicy>();),
-      (
-        // We're on the host, so lookup and initialize the kernel dispatch
-        // configurations with the policies that match the device's PTX
-        // version
-        if (ptx_version >= 600) {
-          spmv_config.template Init<typename Policy600::SpmvPolicyT>();
-          segment_fixup_config.template Init<typename Policy600::SegmentFixupPolicyT>();
-        } else if (ptx_version >= 500) {
-          spmv_config.template Init<typename Policy500::SpmvPolicyT>();
-          segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
-        } else {
-          spmv_config.template Init<typename Policy500::SpmvPolicyT>();
-          segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
-        }));
-  }
-
-  /**
-   * Kernel kernel dispatch configuration.
-   */
-  struct KernelConfig
-  {
-    int block_threads;
-    int items_per_thread;
-    int tile_items;
-
-    template <typename PolicyT>
-    CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE void Init()
-    {
-      block_threads    = PolicyT::BLOCK_THREADS;
-      items_per_thread = PolicyT::ITEMS_PER_THREAD;
-      tile_items       = block_threads * items_per_thread;
-    }
-  };
-
-  //---------------------------------------------------------------------
-  // Dispatch entrypoints
-  //---------------------------------------------------------------------
-
-  /**
-   * Internal dispatch routine for computing a device-wide reduction using the
-   * specified kernel functions.
-   *
-   * If the input is larger than a single tile, this method uses two-passes of
-   * kernel invocations.
-   *
-   * @tparam Spmv1ColKernelT
-   *   Function type of cub::DeviceSpmv1ColKernel
-   *
-   * @tparam SpmvSearchKernelT
-   *   Function type of cub::AgentSpmvSearchKernel
-   *
-   * @tparam SpmvKernelT
-   *   Function type of cub::AgentSpmvKernel
-   *
-   * @tparam SegmentFixupKernelT
-   *   Function type of cub::DeviceSegmentFixupKernelT
-   *
-   * @tparam SpmvEmptyMatrixKernelT
-   *   Function type of cub::DeviceSpmvEmptyMatrixKernel
-   *
-   * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage.
-   *   When nullptr, the required allocation size is written to
-   *   `temp_storage_bytes` and no work is done.
-   *
-   * @param[in,out] temp_storage_bytes
-   *   Reference to size in bytes of \p d_temp_storage allocation
-   *
-   * @paramSpMV spmv_params
-   *   input parameter bundle
-   *
-   * @param[in] stream
-   *   CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
-   *
-   * @param[in] spmv_1col_kernel
-   *   Kernel function pointer to parameterization of DeviceSpmv1ColKernel
-   *
-   * @param[in] spmv_search_kernel
-   *   Kernel function pointer to parameterization of AgentSpmvSearchKernel
-   *
-   * @param[in] spmv_kernel
-   *   Kernel function pointer to parameterization of AgentSpmvKernel
-   *
-   * @param[in] segment_fixup_kernel
-   *   Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
-   *
-   * @param[in] spmv_empty_matrix_kernel
-   *   Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel
-   *
-   * @param[in] spmv_config
-   *   Dispatch parameters that match the policy that @p spmv_kernel was compiled for
-   *
-   * @param[in] segment_fixup_config
-   *   Dispatch parameters that match the policy that @p segment_fixup_kernel was compiled for
-   */
-  template <typename Spmv1ColKernelT,
-            typename SpmvSearchKernelT,
-            typename SpmvKernelT,
-            typename SegmentFixupKernelT,
-            typename SpmvEmptyMatrixKernelT>
-  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch(
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    SpmvParamsT& spmv_params,
-    cudaStream_t stream,
-    Spmv1ColKernelT spmv_1col_kernel,
-    SpmvSearchKernelT spmv_search_kernel,
-    SpmvKernelT spmv_kernel,
-    SegmentFixupKernelT segment_fixup_kernel,
-    SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel,
-    KernelConfig spmv_config,
-    KernelConfig segment_fixup_config)
-  {
-    cudaError error = cudaSuccess;
-    do
-    {
-      if (spmv_params.num_rows < 0 || spmv_params.num_cols < 0)
-      {
-        return cudaErrorInvalidValue;
-      }
-
-      if (spmv_params.num_rows == 0 || spmv_params.num_cols == 0)
-      { // Empty problem, no-op.
-        if (d_temp_storage == nullptr)
-        {
-          temp_storage_bytes = 1;
-        }
-
-        break;
-      }
-
-      if (spmv_params.num_nonzeros == 0)
-      {
-        if (d_temp_storage == nullptr)
-        {
-          // Return if the caller is simply requesting the size of the storage allocation
-          temp_storage_bytes = 1;
-          break;
-        }
-
-        constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS;
-        const int blocks_in_grid       = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block);
-
-#ifdef CUB_DEBUG_LOG
-        _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n",
-                blocks_in_grid,
-                threads_in_block,
-                (long long) stream);
-#endif // CUB_DEBUG_LOG
-        error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
-                  .doit(spmv_empty_matrix_kernel, spmv_params);
-
-        if (CubDebug(error))
-        {
-          break;
-        }
-
-        // Sync the stream if specified to flush runtime errors
-        error = detail::DebugSyncStream(stream);
-        if (CubDebug(error))
-        {
-          break;
-        }
-
-        break;
-      }
-
-      if (spmv_params.num_cols == 1)
-      {
-        if (d_temp_storage == nullptr)
-        {
-          // Return if the caller is simply requesting the size of the storage allocation
-          temp_storage_bytes = 1;
-          break;
-        }
-
-        // Get search/init grid dims
-        int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
-        int degen_col_kernel_grid_size  = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size);
-
-#ifdef CUB_DEBUG_LOG
-        _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
-                degen_col_kernel_grid_size,
-                degen_col_kernel_block_size,
-                (long long) stream);
-#endif // CUB_DEBUG_LOG
-
-        // Invoke spmv_search_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-          degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream)
-          .doit(spmv_1col_kernel, spmv_params);
-
-        // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
-        {
-          break;
-        }
-
-        // Sync the stream if specified to flush runtime errors
-        error = detail::DebugSyncStream(stream);
-        if (CubDebug(error))
-        {
-          break;
-        }
-
-        break;
-      }
-
-      // Get device ordinal
-      int device_ordinal;
-      if (CubDebug(error = cudaGetDevice(&device_ordinal)))
-      {
-        break;
-      }
-
-      // Get SM count
-      int sm_count;
-      if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
-      {
-        break;
-      }
-
-      // Get max x-dimension of grid
-      int max_dim_x;
-      if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
-      {
-        break;
-      }
-
-      // Total number of spmv work items
-      int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
-
-      // Tile sizes of kernels
-      int merge_tile_size         = spmv_config.block_threads * spmv_config.items_per_thread;
-      int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
-
-      // Number of tiles for kernels
-      int num_merge_tiles         = ::cuda::ceil_div(num_merge_items, merge_tile_size);
-      int num_segment_fixup_tiles = ::cuda::ceil_div(num_merge_tiles, segment_fixup_tile_size);
-
-      // Get SM occupancy for kernels
-      int spmv_sm_occupancy;
-      if (CubDebug(error = MaxSmOccupancy(spmv_sm_occupancy, spmv_kernel, spmv_config.block_threads)))
-      {
-        break;
-      }
-
-      int segment_fixup_sm_occupancy;
-      if (CubDebug(error = MaxSmOccupancy(
-                     segment_fixup_sm_occupancy, segment_fixup_kernel, segment_fixup_config.block_threads)))
-      {
-        break;
-      }
-
-      // Get grid dimensions
-      dim3 spmv_grid_size(CUB_MIN(num_merge_tiles, max_dim_x), ::cuda::ceil_div(num_merge_tiles, max_dim_x), 1);
-
-      dim3 segment_fixup_grid_size(
-        CUB_MIN(num_segment_fixup_tiles, max_dim_x), ::cuda::ceil_div(num_segment_fixup_tiles, max_dim_x), 1);
-
-      // Get the temporary storage allocation requirements
-      size_t allocation_sizes[3];
-      if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0])))
-      {
-        break; // bytes needed for reduce-by-key tile status descriptors
-      }
-      allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs
-      allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates
-
-      // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-      void* allocations[3] = {};
-      if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
-      {
-        break;
-      }
-      if (d_temp_storage == nullptr)
-      {
-        // Return if the caller is simply requesting the size of the storage allocation
-        break;
-      }
-
-      // Construct the tile status interface
-      ScanTileStateT tile_state;
-      if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0])))
-      {
-        break;
-      }
-
-      // Alias the other allocations
-      KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs
-      CoordinateT* d_tile_coordinates   = (CoordinateT*) allocations[2]; // Agent starting coordinates
-
-      // Get search/init grid dims
-      int search_block_size = INIT_KERNEL_THREADS;
-      int search_grid_size  = ::cuda::ceil_div(num_merge_tiles + 1, search_block_size);
-
-      if (search_grid_size < sm_count)
-      //            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
-      {
-        // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
-        d_tile_coordinates = nullptr;
-      }
-      else
-      {
-// Use separate search kernel if we have enough spmv tiles to saturate the device
-
-// Log spmv_search_kernel configuration
-#ifdef CUB_DEBUG_LOG
-        _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
-                search_grid_size,
-                search_block_size,
-                (long long) stream);
-#endif // CUB_DEBUG_LOG
-
-        // Invoke spmv_search_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream)
-          .doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params);
-
-        // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
-        {
-          break;
-        }
-
-        // Sync the stream if specified to flush runtime errors
-        error = detail::DebugSyncStream(stream);
-        if (CubDebug(error))
-        {
-          break;
-        }
-      }
-
-// Log spmv_kernel configuration
-#ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-              spmv_grid_size.x,
-              spmv_grid_size.y,
-              spmv_grid_size.z,
-              spmv_config.block_threads,
-              (long long) stream,
-              spmv_config.items_per_thread,
-              spmv_sm_occupancy);
-#endif // CUB_DEBUG_LOG
-
-      // Invoke spmv_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream)
-        .doit(spmv_kernel,
-              spmv_params,
-              d_tile_coordinates,
-              d_tile_carry_pairs,
-              num_merge_tiles,
-              tile_state,
-              num_segment_fixup_tiles);
-
-      // Check for failure to launch
-      if (CubDebug(error = cudaPeekAtLastError()))
-      {
-        break;
-      }
-
-      // Sync the stream if specified to flush runtime errors
-      error = detail::DebugSyncStream(stream);
-      if (CubDebug(error))
-      {
-        break;
-      }
-
-      // Run reduce-by-key fixup if necessary
-      if (num_merge_tiles > 1)
-      {
-// Log segment_fixup_kernel configuration
-#ifdef CUB_DEBUG_LOG
-        _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                segment_fixup_grid_size.x,
-                segment_fixup_grid_size.y,
-                segment_fixup_grid_size.z,
-                segment_fixup_config.block_threads,
-                (long long) stream,
-                segment_fixup_config.items_per_thread,
-                segment_fixup_sm_occupancy);
-#endif // CUB_DEBUG_LOG
-
-        // Invoke segment_fixup_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-          segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream)
-          .doit(segment_fixup_kernel,
-                d_tile_carry_pairs,
-                spmv_params.d_vector_y,
-                num_merge_tiles,
-                num_segment_fixup_tiles,
-                tile_state);
-
-        // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
-        {
-          break;
-        }
-
-        // Sync the stream if specified to flush runtime errors
-        error = detail::DebugSyncStream(stream);
-        if (CubDebug(error))
-        {
-          break;
-        }
-      }
-    } while (0);
-
-    return error;
-  }
-
-  /**
-   * @brief Internal dispatch routine for computing a device-wide reduction
-   *
-   * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage.
-   *   When nullptr, the required allocation size is written to
-   *   `temp_storage_bytes` and no work is done.
-   *
-   * @param[in,out] temp_storage_bytes
-   *   Reference to size in bytes of `d_temp_storage` allocation
-   *
-   * @param SpMV spmv_params
-   *   input parameter bundle
-   *
-   * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
-   */
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
-  Dispatch(void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream = 0)
-  {
-    cudaError error = cudaSuccess;
-    do
-    {
-      // Get PTX version
-      int ptx_version = 0;
-      if (CubDebug(error = PtxVersion(ptx_version)))
-      {
-        break;
-      }
-
-      // Get kernel kernel dispatch configurations
-      KernelConfig spmv_config, segment_fixup_config;
-      InitConfigs(ptx_version, spmv_config, segment_fixup_config);
-
-      constexpr bool has_alpha = false;
-      constexpr bool has_beta  = false;
-
-      if (CubDebug(
-            error = Dispatch(
-              d_temp_storage,
-              temp_storage_bytes,
-              spmv_params,
-              stream,
-              DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
-              DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
-              DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, has_alpha, has_beta>,
-              DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-              DeviceSpmvEmptyMatrixKernel<ValueT, OffsetT, has_beta>,
-              spmv_config,
-              segment_fixup_config)))
-      {
-        break;
-      }
-
-    } while (0);
-
-    return error;
-  }
-};
-
-CUB_NAMESPACE_END
diff --git a/cub/test/test_device_spmv.cu b/cub/test/test_device_spmv.cu
deleted file mode 100644
index 13dba77a594..00000000000
--- a/cub/test/test_device_spmv.cu
+++ /dev/null
@@ -1,611 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <cub/device/device_spmv.cuh>
-#include <cub/util_debug.cuh>
-
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/host_vector.h>
-#include <thrust/mismatch.h>
-#include <thrust/scan.h>
-
-#include <cuda/std/type_traits>
-
-#include <iostream>
-#include <type_traits>
-#include <typeinfo>
-
-#include "test_util.h"
-#include <c2h/device_policy.h>
-#include <c2h/vector.h>
-
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-
-bool g_verbose = false;
-
-//==============================================================================
-// Casts char types to int for numeric printing
-template <typename T>
-T print_cast(T val)
-{
-  return val;
-}
-
-int print_cast(char val)
-{
-  return static_cast<int>(val);
-}
-
-int print_cast(signed char val)
-{
-  return static_cast<int>(val);
-}
-
-int print_cast(unsigned char val)
-{
-  return static_cast<int>(val);
-}
-
-//==============================================================================
-// Print a vector to out
-template <typename VectorT>
-void print_vector(std::ostream& out, const VectorT& vec)
-{
-  bool first = true;
-  for (const auto& val : vec)
-  {
-    if (!first)
-    {
-      out << ", ";
-    }
-    first = false;
-    out << print_cast(val);
-  }
-}
-
-//==============================================================================
-// Simple CSR matrix implementation.
-// HostStorage controls whether data is stored on the host or device.
-// Use the host_csr_matrix and device_csr_matrix aliases for code clarity.
-template <typename ValueT, bool HostStorage>
-struct csr_matrix
-{
-  csr_matrix(int num_rows, int num_cols)
-      : m_row_offsets(static_cast<size_t>(num_rows + 1), 0)
-      , m_num_rows(num_rows)
-      , m_num_columns(num_cols)
-  {}
-
-  // host/device conversion constructor
-  explicit csr_matrix(const csr_matrix<ValueT, !HostStorage>& other)
-      : m_values(other.m_values)
-      , m_row_offsets(other.m_row_offsets)
-      , m_column_indices(other.m_column_indices)
-      , m_num_rows(other.m_num_rows)
-      , m_num_columns(other.m_num_columns)
-      , m_num_nonzeros(other.m_num_nonzeros)
-  {}
-
-  // Note that this must append to the values array. Finish filling each row
-  // before adding to the next, and each row's columns must be added in order.
-  // Must call `finalize` once all items are added.
-  void append_value(int row, int col, ValueT value)
-  {
-    ++m_num_nonzeros;
-    ++m_row_offsets[row];
-    m_column_indices.push_back(col);
-    m_values.push_back(std::move(value));
-  }
-
-  void finalize()
-  {
-    _CCCL_IF_CONSTEXPR (HostStorage)
-    {
-      thrust::exclusive_scan(thrust::host, m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin());
-    }
-    else
-    {
-      thrust::exclusive_scan(c2h::device_policy, m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin());
-    }
-    AssertEquals(m_row_offsets.back(), m_num_nonzeros);
-  }
-
-  const ValueT* get_values() const
-  {
-    return thrust::raw_pointer_cast(m_values.data());
-  }
-
-  const int* get_row_offsets() const
-  {
-    return thrust::raw_pointer_cast(m_row_offsets.data());
-  }
-
-  int get_row_offset(int row) const
-  {
-    return m_row_offsets[row];
-  }
-
-  int get_row_num_nonzero(int row) const
-  {
-    return m_row_offsets[row + 1] - m_row_offsets[row];
-  }
-
-  const int* get_column_indices() const
-  {
-    return thrust::raw_pointer_cast(m_column_indices.data());
-  }
-
-  int get_num_rows() const
-  {
-    return m_num_rows;
-  }
-
-  int get_num_columns() const
-  {
-    return m_num_columns;
-  }
-
-  int get_num_nonzeros() const
-  {
-    return m_num_nonzeros;
-  }
-
-  void print_internals(std::ostream& out) const
-  {
-    out << (HostStorage ? "host" : "device") << "_csr_matrix"
-        << "(" << m_num_rows << ", " << m_num_columns << ")\n"
-        << " - num_elems:   " << (m_num_rows * m_num_columns) << "\n"
-        << " - num_nonzero: " << m_num_nonzeros << "\n"
-        << " - row_offsets:\n     [";
-    print_vector(out, m_row_offsets);
-    out << "]\n"
-        << " - column_indices:\n     [";
-    print_vector(out, m_column_indices);
-    out << "]\n"
-        << " - values:\n     [";
-    print_vector(out, m_values);
-    out << "]\n";
-  }
-
-  void print_summary(std::ostream& out) const
-  {
-    const int num_elems = m_num_rows * m_num_columns;
-    const float fill_ratio =
-      num_elems == 0 ? 0.f : (static_cast<float>(m_num_nonzeros) / static_cast<float>(num_elems));
-
-    out << m_num_rows << "x" << m_num_columns << ", " << m_num_nonzeros << "/" << num_elems << " (" << fill_ratio
-        << ")\n";
-  }
-
-  friend class csr_matrix<ValueT, !HostStorage>;
-
-private:
-  template <typename VecValueT>
-  using vector_t = ::cuda::std::_If<HostStorage, c2h::host_vector<VecValueT>, c2h::device_vector<VecValueT>>;
-
-  vector_t<ValueT> m_values;
-  vector_t<int> m_row_offsets;
-  vector_t<int> m_column_indices;
-
-  int m_num_rows{0};
-  int m_num_columns{0};
-  int m_num_nonzeros{0};
-};
-
-//==============================================================================
-// Convenience aliases for host/device csr_matrix types.
-template <typename ValueT>
-using host_csr_matrix = csr_matrix<ValueT, true>;
-
-template <typename ValueT>
-using device_csr_matrix = csr_matrix<ValueT, false>;
-
-//==============================================================================
-// Compare two floats within a tolerance.
-// This mimics the approach used by Thrust's ASSERT_ALMOST_EQUAL checks.
-template <typename ValueT>
-struct fp_almost_equal_functor
-{
-  __host__ __device__ bool operator()(ValueT v1, ValueT v2) const
-  {
-    constexpr double r_tol = 1e-3;
-    constexpr double a_tol = 1e-2;
-    const double limit     = r_tol * (std::fabs(v1) + std::fabs(v2)) + a_tol;
-    return std::fabs(v1 - v2) <= limit;
-  }
-};
-
-//==============================================================================
-// Compare the reference and cub output vectors.
-// Use fuzzy check for floating point values.
-template <typename ValueT>
-bool compare_results(
-  std::true_type /* is_fp */, const c2h::host_vector<ValueT>& h_vec1, const c2h::device_vector<ValueT>& d_vec2)
-{
-  c2h::device_vector<ValueT> d_vec1(h_vec1);
-  auto err = thrust::mismatch(
-    c2h::device_policy, d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin(), fp_almost_equal_functor<ValueT>{});
-  if (err.first == d_vec1.cend() || err.second == d_vec2.cend())
-  {
-    return true;
-  }
-  else
-  {
-    c2h::host_vector<ValueT> h_vec2(d_vec2);
-    const auto idx = thrust::distance(d_vec1.cbegin(), err.first);
-    std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs "
-              << print_cast(ValueT{h_vec2[idx]}) << std::endl;
-    return false;
-  }
-};
-
-template <typename ValueT>
-bool compare_results(
-  std::false_type /* is_fp */, const c2h::host_vector<ValueT>& h_vec1, const c2h::device_vector<ValueT>& d_vec2)
-{
-  c2h::device_vector<ValueT> d_vec1(h_vec1);
-  auto err = thrust::mismatch(c2h::device_policy, d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin());
-  if (err.first == d_vec1.cend() || err.second == d_vec2.cend())
-  {
-    return true;
-  }
-  else
-  {
-    c2h::host_vector<ValueT> h_vec2(d_vec2);
-    const auto idx = thrust::distance(d_vec1.cbegin(), err.first);
-    std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs "
-              << print_cast(ValueT{h_vec2[idx]}) << std::endl;
-    return false;
-  }
-}
-
-//==============================================================================
-// Generate a random host_csr_matrix<ValueT> with the specified dimensions.
-// target_fill_ratio is the target fraction of non-zero elements (may be more
-// or less in the output).
-template <typename ValueT>
-host_csr_matrix<ValueT> make_random_csr_matrix(int num_rows, int num_cols, float target_fill_ratio)
-{
-  host_csr_matrix<ValueT> mat{num_rows, num_cols};
-
-  for (int row = 0; row < num_rows; ++row)
-  {
-    for (int col = 0; col < num_cols; ++col)
-    {
-      const bool is_non_zero = RandomValue<float>(1.f) < target_fill_ratio;
-      if (!is_non_zero)
-      {
-        continue;
-      }
-
-      if (std::is_floating_point<ValueT>::value)
-      {
-        // Keep fp numbers somewhat small, from -50 -> 50; otherwise we run
-        // into issues with nans/infs
-        ValueT value = (RandomValue(static_cast<ValueT>(100)) - static_cast<ValueT>(50));
-        mat.append_value(row, col, value);
-      }
-      else
-      {
-        ValueT value{};
-        InitValue(RANDOM, value);
-        mat.append_value(row, col, value);
-      }
-    }
-  }
-
-  mat.finalize();
-
-  const int num_elements        = num_rows * num_cols;
-  const float actual_fill_ratio = static_cast<float>(mat.get_num_nonzeros()) / static_cast<float>(num_elements);
-
-  if (g_verbose)
-  {
-    printf(
-      "Created host_csr_matrix<%s>(%d, %d)\n"
-      " - NumElements: %d\n"
-      " - NumNonZero:  %d\n"
-      " - Target fill: %0.2f%%\n"
-      " - Actual fill: %0.2f%%\n",
-      typeid(ValueT).name(),
-      num_rows,
-      num_cols,
-      num_elements,
-      mat.get_num_nonzeros(),
-      target_fill_ratio,
-      actual_fill_ratio);
-  }
-
-  return mat;
-}
-
-//==============================================================================
-// Fill a vector with random values.
-template <typename ValueT>
-c2h::host_vector<ValueT> make_random_vector(int len)
-{
-  c2h::host_vector<ValueT> vec(len);
-  for (auto& val : vec)
-  {
-    if (std::is_floating_point<ValueT>::value)
-    { // Keep fp numbers somewhat small; otherwise we run into issues with
-      // nans/infs
-      val = RandomValue(static_cast<ValueT>(100)) - static_cast<ValueT>(50);
-    }
-    else
-    {
-      InitValue(RANDOM, val);
-    }
-  }
-  return vec;
-}
-
-//==============================================================================
-// Serial y = Ax computation
-template <typename ValueT>
-void compute_reference_solution(
-  const host_csr_matrix<ValueT>& a, const c2h::host_vector<ValueT>& x, c2h::host_vector<ValueT>& y)
-{
-  if (a.get_num_rows() == 0 || a.get_num_columns() == 0)
-  {
-    return;
-  }
-
-  for (int row = 0; row < a.get_num_rows(); ++row)
-  {
-    const int row_offset = a.get_row_offset(row);
-    const int row_length = a.get_row_num_nonzero(row);
-    const int* cols      = a.get_column_indices() + row_offset;
-    const int* cols_end  = cols + row_length;
-    const ValueT* values = a.get_values() + row_offset;
-
-    ValueT accum{};
-    while (cols < cols_end)
-    {
-      accum += (*values++) * x[*cols++];
-    }
-    y[row] = accum;
-  }
-}
-
-//==============================================================================
-// cub::DeviceSpmv::CsrMV y = Ax computation
-template <typename ValueT>
-void compute_cub_solution(
-  const device_csr_matrix<ValueT>& a, const c2h::device_vector<ValueT>& x, c2h::device_vector<ValueT>& y)
-{
-  c2h::device_vector<char> temp_storage;
-  std::size_t temp_storage_bytes{};
-  auto err = cub::DeviceSpmv::CsrMV(
-    nullptr,
-    temp_storage_bytes,
-    a.get_values(),
-    a.get_row_offsets(),
-    a.get_column_indices(),
-    thrust::raw_pointer_cast(x.data()),
-    thrust::raw_pointer_cast(y.data()),
-    a.get_num_rows(),
-    a.get_num_columns(),
-    a.get_num_nonzeros());
-  CubDebugExit(err);
-
-  temp_storage.resize(temp_storage_bytes);
-
-  err = cub::DeviceSpmv::CsrMV(
-    thrust::raw_pointer_cast(temp_storage.data()),
-    temp_storage_bytes,
-    a.get_values(),
-    a.get_row_offsets(),
-    a.get_column_indices(),
-    thrust::raw_pointer_cast(x.data()),
-    thrust::raw_pointer_cast(y.data()),
-    a.get_num_rows(),
-    a.get_num_columns(),
-    a.get_num_nonzeros());
-  CubDebugExit(err);
-}
-
-//==============================================================================
-// Compute y = Ax twice, one reference and one cub::DeviceSpmv, and compare the
-// results.
-template <typename ValueT>
-void test_spmv(const host_csr_matrix<ValueT>& h_a, const c2h::host_vector<ValueT>& h_x)
-{
-  if (g_verbose)
-  {
-    std::cout << "Testing cub::DeviceSpmv on inputs:\n";
-    h_a.print_internals(std::cout);
-    std::cout << "x vector:\n  [";
-    print_vector(std::cout, h_x);
-    std::cout << "]" << std::endl;
-  }
-  else
-  {
-    h_a.print_summary(std::cout);
-  }
-
-  const device_csr_matrix<ValueT> d_a(h_a);
-  const c2h::device_vector<ValueT> d_x(h_x);
-
-  c2h::host_vector<ValueT> h_y(h_a.get_num_rows());
-  c2h::device_vector<ValueT> d_y(d_a.get_num_rows());
-
-  compute_reference_solution(h_a, h_x, h_y);
-  compute_cub_solution(d_a, d_x, d_y);
-
-  if (g_verbose)
-  {
-    std::cout << "reference output:\n  [";
-    print_vector(std::cout, h_y);
-    std::cout << "]\n";
-    c2h::host_vector<ValueT> tmp_y(d_y);
-    std::cout << "cub::DeviceSpmv output:\n  [";
-    print_vector(std::cout, tmp_y);
-    std::cout << "]" << std::endl;
-  }
-
-  constexpr auto is_fp = std::is_floating_point<ValueT>{};
-  AssertTrue(compare_results(is_fp, h_y, d_y));
-}
-
-//==============================================================================
-// Test example from cub::DeviceSpmv documentation
-template <typename ValueT>
-void test_doc_example()
-{
-  std::cout << "\n\ntest_doc_example<" << typeid(ValueT).name() << ">()" << std::endl;
-
-  host_csr_matrix<ValueT> h_a(9, 9);
-  h_a.append_value(0, 1, ValueT{1});
-  h_a.append_value(0, 3, ValueT{1});
-  h_a.append_value(1, 0, ValueT{1});
-  h_a.append_value(1, 2, ValueT{1});
-  h_a.append_value(1, 4, ValueT{1});
-  h_a.append_value(2, 1, ValueT{1});
-  h_a.append_value(2, 5, ValueT{1});
-  h_a.append_value(3, 0, ValueT{1});
-  h_a.append_value(3, 4, ValueT{1});
-  h_a.append_value(3, 6, ValueT{1});
-  h_a.append_value(4, 1, ValueT{1});
-  h_a.append_value(4, 3, ValueT{1});
-  h_a.append_value(4, 5, ValueT{1});
-  h_a.append_value(4, 7, ValueT{1});
-  h_a.append_value(5, 2, ValueT{1});
-  h_a.append_value(5, 4, ValueT{1});
-  h_a.append_value(5, 8, ValueT{1});
-  h_a.append_value(6, 3, ValueT{1});
-  h_a.append_value(6, 7, ValueT{1});
-  h_a.append_value(7, 4, ValueT{1});
-  h_a.append_value(7, 6, ValueT{1});
-  h_a.append_value(7, 8, ValueT{1});
-  h_a.append_value(8, 5, ValueT{1});
-  h_a.append_value(8, 7, ValueT{1});
-  h_a.finalize();
-
-  c2h::host_vector<ValueT> h_x(9, ValueT{1});
-
-  test_spmv(h_a, h_x);
-}
-
-//==============================================================================
-// Generate and test a random SpMV operation with the given parameters.
-template <typename ValueT>
-void test_random(int rows, int cols, float target_fill_ratio)
-{
-  std::cout << "\n\ntest_random<" << typeid(ValueT).name() << ">(" << rows << ", " << cols << ", " << target_fill_ratio
-            << ")" << std::endl;
-
-  host_csr_matrix<ValueT> h_a  = make_random_csr_matrix<ValueT>(rows, cols, target_fill_ratio);
-  c2h::host_vector<ValueT> h_x = make_random_vector<ValueT>(cols);
-
-  test_spmv(h_a, h_x);
-}
-
-//==============================================================================
-// Dispatch many random SpMV tests over a variety of parameters.
-template <typename ValueT>
-void test_random()
-{
-  test_random<ValueT>(0, 0, 1.f);
-  test_random<ValueT>(0, 1, 1.f);
-  test_random<ValueT>(1, 0, 1.f);
-
-  constexpr int dim_min = 1;
-  constexpr int dim_max = 10000;
-
-  constexpr int max_num_elems = 100000;
-
-  constexpr float ratio_min  = 0.f;
-  constexpr float ratio_max  = 1.1f; // a lil over to account for fp errors
-  constexpr float ratio_step = 0.3334f;
-
-  for (int rows = dim_min; rows < dim_max; rows <<= 1)
-  {
-    for (int cols = dim_min; cols < dim_max; cols <<= 1)
-    {
-      if (rows * cols >= max_num_elems)
-      {
-        continue;
-      }
-
-      for (float ratio = ratio_min; ratio < ratio_max; ratio += ratio_step)
-      {
-        test_random<ValueT>(rows, cols, ratio);
-        // Test nearby non-power-of-two dims:
-        test_random<ValueT>(rows + 97, cols + 83, ratio);
-      }
-    }
-  }
-}
-
-//==============================================================================
-// Dispatch many SpMV tests for a given ValueT.
-template <typename ValueT>
-void test_type()
-{
-  test_doc_example<ValueT>();
-  test_random<ValueT>();
-}
-
-//==============================================================================
-// Dispatch many SpMV tests over a variety of types.
-void test_types()
-{
-  test_type<float>();
-  test_type<double>();
-  test_type<signed char>();
-  test_type<int>();
-  test_type<long long>();
-}
-
-int main(int argc, char** argv)
-{
-  // Initialize command line
-  CommandLineArgs args(argc, argv);
-  g_verbose = args.CheckCmdLineFlag("v");
-
-  // Print usage
-  if (args.CheckCmdLineFlag("help"))
-  {
-    printf("%s "
-           "[--device=<device-id>] "
-           "[--v] verbose"
-           "\n",
-           argv[0]);
-    exit(0);
-  }
-
-  CubDebugExit(args.DeviceInit());
-
-  test_types();
-}
-
-_CCCL_SUPPRESS_DEPRECATED_POP