From a00de21b5e79aa5c398efeac50b37d99d580c859 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 30 Jan 2025 10:46:01 -0800 Subject: [PATCH] Remove CUB `DeviceSpMV` (#3549) --- cub/cub/agent/agent_spmv_orig.cuh | 764 --------------- cub/cub/cub.cuh | 1 - cub/cub/device/device_spmv.cuh | 216 ---- .../device/dispatch/dispatch_spmv_orig.cuh | 924 ------------------ cub/test/test_device_spmv.cu | 611 ------------ 5 files changed, 2516 deletions(-) delete mode 100644 cub/cub/agent/agent_spmv_orig.cuh delete mode 100644 cub/cub/device/device_spmv.cuh delete mode 100644 cub/cub/device/dispatch/dispatch_spmv_orig.cuh delete mode 100644 cub/test/test_device_spmv.cu diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh deleted file mode 100644 index 90a5e3aa6c9..00000000000 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ /dev/null @@ -1,764 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -CUB_NAMESPACE_BEGIN - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * @param Parameterizable tuning policy type for AgentSpmv - * - * @tparam _BLOCK_THREADS - * Threads per thread block - * - * @tparam _ITEMS_PER_THREAD - * Items per thread (per tile of input) - * - * @tparam _ROW_OFFSETS_SEARCH_LOAD_MODIFIER - * Cache load modifier for reading CSR row-offsets during search - * - * @tparam _ROW_OFFSETS_LOAD_MODIFIER - * Cache load modifier for reading CSR row-offsets - * - * @tparam _COLUMN_INDICES_LOAD_MODIFIER - * Cache load modifier for reading CSR column-indices - * - * @tparam _VALUES_LOAD_MODIFIER - * Cache load modifier for reading CSR values - * - * @tparam _VECTOR_VALUES_LOAD_MODIFIER - * Cache load modifier for reading vector values - * - * @tparam _DIRECT_LOAD_NONZEROS - * Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through - * shared memory) - * - * @tparam _SCAN_ALGORITHM - * The BlockScan algorithm to use - */ -template -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmvPolicy -{ - enum - { - /// Threads per thread block - BLOCK_THREADS = _BLOCK_THREADS, - - /// Items per thread (per tile of input) - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - - /// Whether to load nonzeros directly from global during sequential merging (pre-staged through - /// shared memory) - DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, - }; - - /// Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; - - /// Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; - - /// Cache load modifier for reading CSR column-indices - static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; - - /// Cache load modifier for reading CSR values - static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; - - /// Cache load modifier for reading vector values - static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; - - /// The BlockScan algorithm to use - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; -}; - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - */ -template -struct -// with NVHPC, we get a deprecation warning in the implementation of cudaLaunchKernelEx, which we cannot suppress :/ -#if !_CCCL_COMPILER(NVHPC) - CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -#endif - SpmvParams -{ - /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix - /// A. - const ValueT* d_values; - - /// Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices - /// and \p d_values - const OffsetT* d_row_end_offsets; - - /// Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements - /// of matrix A. (Indices are zero-valued.) - const OffsetT* d_column_indices; - - /// Pointer to the array of \p num_cols values corresponding to the dense input vector x - const ValueT* d_vector_x; - - /// Pointer to the array of \p num_rows values corresponding to the dense output vector y - ValueT* d_vector_y; - - /// Number of rows of matrix A. - int num_rows; - - /// Number of columns of matrix A. - int num_cols; - - /// Number of nonzero elements of matrix A. - int num_nonzeros; - - /// Alpha multiplicand - ValueT alpha; - - /// Beta addend-multiplicand - ValueT beta; -}; - -/** - * @brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. - * - * @tparam AgentSpmvPolicyT - * Parameterized AgentSpmvPolicy tuning policy type - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam HAS_ALPHA - * Whether the input parameter \p alpha is 1 - * - * @tparam HAS_BETA - * Whether the input parameter \p beta is 0 - * - * @tparam LEGACY_PTX_ARCH - * PTX compute capability (unused) - */ -template -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - /// 2D merge path coordinate type - using CoordinateT = typename CubVector::Type; - - /// Input iterator wrapper types (for applying cache modifiers) - - using RowOffsetsSearchIteratorT = - CacheModifiedInputIterator; - - using RowOffsetsIteratorT = CacheModifiedInputIterator; - - using ColumnIndicesIteratorT = - CacheModifiedInputIterator; - - using ValueIteratorT = CacheModifiedInputIterator; - - using VectorValueIteratorT = - CacheModifiedInputIterator; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - using KeyValuePairT = KeyValuePair; - - // Reduce-value-by-segment scan operator - using ReduceBySegmentOpT = ReduceByKeyOp<::cuda::std::plus<>>; - - // BlockReduce specialization - using BlockReduceT = BlockReduce; - - // BlockScan specialization - using BlockScanT = BlockScan; - - // BlockScan specialization - using BlockPrefixSumT = BlockScan; - - // BlockExchange specialization - using BlockExchangeT = BlockExchange; - - /// Merge item type (either a non-zero value or a row-end offset) - union MergeItem - { - // Value type to pair with index type OffsetT - // (NullType if loading values directly during merge) - using MergeValueT = ::cuda::std::_If; - - OffsetT row_end_offset; - MergeValueT nonzero; - }; - - /// Shared memory type required by this thread block - struct _TempStorage - { - CoordinateT tile_coords[2]; - - union Aliasable - { - // Smem needed for tile of merge items - MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; - - // Smem needed for block exchange - typename BlockExchangeT::TempStorage exchange; - - // Smem needed for block-wide reduction - typename BlockReduceT::TempStorage reduce; - - // Smem needed for tile scanning - typename BlockScanT::TempStorage scan; - - // Smem needed for tile prefix sum - typename BlockPrefixSumT::TempStorage prefix_sum; - - } aliasable; - }; - - /// Temporary storage type (unionable) - struct TempStorage : Uninitialized<_TempStorage> - {}; - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage& temp_storage; - - _CCCL_SUPPRESS_DEPRECATED_PUSH - SpmvParams& spmv_params; - _CCCL_SUPPRESS_DEPRECATED_POP - - /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements - /// of matrix A. - ValueIteratorT wd_values; - - /// Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p - /// d_column_indices and \p d_values - RowOffsetsIteratorT wd_row_end_offsets; - - /// Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero - /// elements of matrix A. (Indices are zero-valued.) - ColumnIndicesIteratorT wd_column_indices; - - /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector - /// x - VectorValueIteratorT wd_vector_x; - - /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector - /// x - VectorValueIteratorT wd_vector_y; - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * @param temp_storage - * Reference to temp_storage - * - * @param spmv_params - * SpMV input parameter bundle - */ - _CCCL_SUPPRESS_DEPRECATED_PUSH - _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams& spmv_params) - : temp_storage(temp_storage.Alias()) - , spmv_params(spmv_params) - , wd_values(spmv_params.d_values) - , wd_row_end_offsets(spmv_params.d_row_end_offsets) - , wd_column_indices(spmv_params.d_column_indices) - , wd_vector_x(spmv_params.d_vector_x) - , wd_vector_y(spmv_params.d_vector_y) - {} - _CCCL_SUPPRESS_DEPRECATED_POP - - /** - * @brief Consume a merge tile, specialized for direct-load of nonzeros - * - * @param is_direct_load - * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - */ - _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT - ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - - // Gather the row end-offsets for the merge tile into shared memory - for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) - { - const OffsetT offset = (::cuda::std::min)( - static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); - s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; - } - - __syncthreads(); - - // Search for the thread's starting coordinate within the merge tile - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - _CCCL_SUPPRESS_DEPRECATED_POP - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - __syncthreads(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - - ValueT running_total = 0.0; - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - - ValueT vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - running_total += nonzero; - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = tile_num_rows; - ++thread_current_coord.y; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = thread_current_coord.x; - running_total = 0.0; - ++thread_current_coord.x; - } - } - - __syncthreads(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (tile_num_rows > 0) - { - if (threadIdx.x == 0) - { - scan_item.key = -1; - } - -// Direct scatter -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM].key < tile_num_rows) - { - if (scan_item.key == scan_segment[ITEM].key) - { - scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; - } - - if (HAS_ALPHA) - { - scan_segment[ITEM].value *= spmv_params.alpha; - } - - if (HAS_BETA) - { - // Update the output vector element - ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; - scan_segment[ITEM].value += addend; - } - - // Set the output vector element - spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; - } - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - /** - * @brief Consume a merge tile, specialized for indirect load of nonzeros - * - * @param is_direct_load - * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - */ - _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT - ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - -#if (CUB_PTX_ARCH >= 520) - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - -// Gather the nonzeros for the merge tile into shared memory -# pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - - ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; - ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; - ValueT* s = s_tile_nonzeros + nonzero_idx; - - if (nonzero_idx < tile_num_nonzeros) - { - OffsetT column_idx = *ci; - ValueT value = *a; - - ValueT vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - *s = nonzero; - } - } - -#else - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - - // Gather the nonzeros for the merge tile into shared memory - if (tile_num_nonzeros > 0) - { -# pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); - - OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; - ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; - - ValueT vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - s_tile_nonzeros[nonzero_idx] = nonzero; - } - } - -#endif - -// Gather the row end-offsets for the merge tile into shared memory -#pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) - { - const OffsetT offset = (::cuda::std::min)( - static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); - s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; - } - - __syncthreads(); - - // Search for the thread's starting coordinate within the merge tile - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - _CCCL_SUPPRESS_DEPRECATED_POP - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - __syncthreads(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - ValueT running_total = 0.0; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - scan_segment[ITEM].value = nonzero; - running_total += nonzero; - ++thread_current_coord.y; - nonzero = s_tile_nonzeros[thread_current_coord.y]; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = 0.0; - running_total = 0.0; - ++thread_current_coord.x; - row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - } - - scan_segment[ITEM].key = thread_current_coord.x; - } - - __syncthreads(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (threadIdx.x == 0) - { - scan_item.key = thread_start_coord.x; - scan_item.value = 0.0; - } - - if (tile_num_rows > 0) - { - __syncthreads(); - - // Scan downsweep and scatter - ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; - - if (scan_item.key != scan_segment[0].key) - { - s_partials[scan_item.key] = scan_item.value; - } - else - { - scan_segment[0].value += scan_item.value; - } - -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key) - { - s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; - } - else - { - scan_segment[ITEM].value += scan_segment[ITEM - 1].value; - } - } - - __syncthreads(); - -#pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) - { - spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - /** - * @brief Consume input tile - * - * @param[in] d_tile_coordinates - * Pointer to the temporary array of tile starting coordinates - * - * @param[out] d_tile_carry_pairs - * Pointer to the temporary array carry-out dot product row-ids, one per block - * - * @param[in] num_merge_tiles - * Number of merge tiles - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void - ConsumeTile(CoordinateT* d_tile_coordinates, KeyValuePairT* d_tile_carry_pairs, int num_merge_tiles) - { - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - - if (tile_idx >= num_merge_tiles) - { - return; - } - - // Read our starting coordinates - if (threadIdx.x < 2) - { - if (d_tile_coordinates == nullptr) - { - // Search our starting coordinates - OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; - CoordinateT tile_coord; - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator nonzero_indices(0); - _CCCL_SUPPRESS_DEPRECATED_POP - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coord); - - temp_storage.tile_coords[threadIdx.x] = tile_coord; - } - else - { - temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; - } - } - - __syncthreads(); - - CoordinateT tile_start_coord = temp_storage.tile_coords[0]; - CoordinateT tile_end_coord = temp_storage.tile_coords[1]; - - // Consume multi-segment tile - KeyValuePairT tile_carry = - ConsumeTile(tile_idx, tile_start_coord, tile_end_coord, Int2Type()); - - // Output the tile's carry-out - if (threadIdx.x == 0) - { - if (HAS_ALPHA) - { - tile_carry.value *= spmv_params.alpha; - } - - tile_carry.key += tile_start_coord.x; - if (tile_carry.key >= spmv_params.num_rows) - { - // FIXME: This works around an invalid memory access in the - // fixup kernel. The underlying issue needs to be debugged and - // properly fixed, but this hack prevents writes to - // out-of-bounds addresses. It doesn't appear to have an effect - // on the validity of the results, since this only affects the - // carry-over from last tile in the input. - tile_carry.key = spmv_params.num_rows - 1; - tile_carry.value = ValueT{}; - }; - - d_tile_carry_pairs[tile_idx] = tile_carry; - } - } -}; - -CUB_NAMESPACE_END diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index 2c4d6dd5f4e..ce55c879e0c 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -75,7 +75,6 @@ #include #include #include -#include #include // Grid diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh deleted file mode 100644 index 241af8cd1d1..00000000000 --- a/cub/cub/device/device_spmv.cuh +++ /dev/null @@ -1,216 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -//! @file -//! cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication -//! (SpMV). - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -#include -#include - -#include -#include - -#include - -CUB_NAMESPACE_BEGIN - -//! @rst -//! DeviceSpmv provides device-wide parallel operations for performing -//! sparse-matrix * dense-vector multiplication (SpMV). -//! -//! Overview -//! +++++++++++++++++++++++++++++++++++++++++++++ -//! -//! The `SpMV computation `_ -//! performs the matrix-vector operation ``y = A * x + y``, where: -//! -//! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in -//! `compressed-storage-row (CSR) format -//! `_ (i.e., three -//! arrays: -//! ``values``, ``row_offsets``, and ``column_indices``) -//! - ``x`` and ``y`` are dense vectors -//! -//! Usage Considerations -//! +++++++++++++++++++++++++++++++++++++++++++++ -//! -//! @cdp_class{DeviceSpmv} -//! -//! @endrst -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DeviceSpmv -{ - //! @name CSR matrix operations - //! @{ - - //! @rst - //! This function performs the matrix-vector operation ``y = A*x``. - //! - //! Snippet - //! +++++++++++++++++++++++++++++++++++++++++++++ - //! - //! The code snippet below illustrates SpMV upon a 9x9 CSR matrix ``A`` representing a 3x3 lattice (24 non-zeros). - //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! // Declare, allocate, and initialize device-accessible pointers for input matrix A, input - //! vector x, - //! // and output vector y - //! int num_rows = 9; - //! int num_cols = 9; - //! int num_nonzeros = 24; - //! - //! float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, - //! // 1, 1, 1, 1, 1, 1, 1, 1, - //! // 1, 1, 1, 1, 1, 1, 1, 1] - //! - //! int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, - //! // 4, 6, 1, 3, 5, 7, 2, 4, - //! // 8, 3, 7, 4, 6, 8, 5, 7] - //! - //! int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] - //! - //! float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] - //! float* d_vector_y; // e.g., [ , , , , , , , , ] - //! ... - //! - //! // Determine temporary device storage requirements - //! void* d_temp_storage = nullptr; - //! size_t temp_storage_bytes = 0; - //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - //! num_rows, num_cols, num_nonzeros); - //! - //! // Allocate temporary storage - //! cudaMalloc(&d_temp_storage, temp_storage_bytes); - //! - //! // Run SpMV - //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - //! num_rows, num_cols, num_nonzeros); - //! - //! // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] - //! - //! @endrst - //! - //! @tparam ValueT - //! **[inferred]** Matrix and vector value type (e.g., `float`, `double`, etc.) - //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. - //! When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done. - //! - //! @param[in,out] temp_storage_bytes - //! Reference to size in bytes of `d_temp_storage` allocation - //! - //! @param[in] d_values - //! Pointer to the array of `num_nonzeros` values of the corresponding nonzero elements - //! of matrix `A`. - //! - //! @param[in] d_row_offsets - //! Pointer to the array of `m + 1` offsets demarcating the start of every row in - //! `d_column_indices` and `d_values` (with the final entry being equal to `num_nonzeros`) - //! - //! @param[in] d_column_indices - //! Pointer to the array of `num_nonzeros` column-indices of the corresponding nonzero - //! elements of matrix `A`. (Indices are zero-valued.) - //! - //! @param[in] d_vector_x - //! Pointer to the array of `num_cols` values corresponding to the dense input vector `x` - //! - //! @param[out] d_vector_y - //! Pointer to the array of `num_rows` values corresponding to the dense output vector `y` - //! - //! @param[in] num_rows - //! number of rows of matrix `A`. - //! - //! @param[in] num_cols - //! number of columns of matrix `A`. - //! - //! @param[in] num_nonzeros - //! number of nonzero elements of matrix `A`. - //! - //! @param[in] stream - //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. - //! @endrst - template - CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") - CUB_RUNTIME_FUNCTION static cudaError_t - CsrMV(void* d_temp_storage, - size_t& temp_storage_bytes, - const ValueT* d_values, - const int* d_row_offsets, - const int* d_column_indices, - const ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - cudaStream_t stream = 0) - { - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV"); - - SpmvParams spmv_params; - spmv_params.d_values = d_values; - spmv_params.d_row_end_offsets = d_row_offsets + 1; - spmv_params.d_column_indices = d_column_indices; - spmv_params.d_vector_x = d_vector_x; - spmv_params.d_vector_y = d_vector_y; - spmv_params.num_rows = num_rows; - spmv_params.num_cols = num_cols; - spmv_params.num_nonzeros = num_nonzeros; - spmv_params.alpha = ValueT{1}; - spmv_params.beta = ValueT{0}; - - _CCCL_SUPPRESS_DEPRECATED_PUSH - return DispatchSpmv::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); - _CCCL_SUPPRESS_DEPRECATED_POP - } - - //! @} end member group -}; - -CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh deleted file mode 100644 index 16353f392dc..00000000000 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ /dev/null @@ -1,924 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * @file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector - * multiplication (SpMV). - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include - -CUB_NAMESPACE_BEGIN - -/****************************************************************************** - * SpMV kernel entry points - *****************************************************************************/ - -/** - * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. - * - * @tparam AgentSpmvPolicyT - * Parameterized SpmvPolicy tuning policy type - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @param[in] spmv_params - * SpMV input parameter bundle - */ -_CCCL_SUPPRESS_DEPRECATED_PUSH -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) // - _CCCL_SUPPRESS_DEPRECATED_POP -{ - using VectorValueIteratorT = - CacheModifiedInputIterator; - - VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); - - int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (row_idx < spmv_params.num_rows) - { - OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; - OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; - - ValueT value = 0.0; - if (end_nonzero_idx != nonzero_idx) - { - value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; - } - - spmv_params.d_vector_y[row_idx] = value; - } -} - -/** - * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. - * - * @tparam SpmvPolicyT - * Parameterized SpmvPolicy tuning policy type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam CoordinateT - * Merge path coordinate type - * - * @tparam SpmvParamsT - * SpmvParams type - * - * @param[in] num_merge_tiles - * Number of SpMV merge tiles (spmv grid size) - * - * @param[out] d_tile_coordinates - * Pointer to the temporary array of tile starting coordinates - * - * @param[in] spmv_params - * SpMV input parameter bundle - */ -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( - int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) -{ - /// Constants - enum - { - BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - using RowOffsetsSearchIteratorT = - CacheModifiedInputIterator; - - // Find the starting coordinate for all tiles (plus the end coordinate of the last one) - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < num_merge_tiles + 1) - { - OffsetT diagonal = (tile_idx * TILE_ITEMS); - CoordinateT tile_coordinate; - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator nonzero_indices(0); - _CCCL_SUPPRESS_DEPRECATED_POP - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coordinate); - - // Output starting offset - d_tile_coordinates[tile_idx] = tile_coordinate; - } -} - -/** - * @brief Spmv agent entry point - * - * @tparam SpmvPolicyT - * Parameterized SpmvPolicy tuning policy type - * - * @tparam ScanTileStateT - * Tile status interface type - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam CoordinateT - * Merge path coordinate type - * - * @tparam HAS_ALPHA - * Whether the input parameter Alpha is 1 - * - * @tparam HAS_BETA - * Whether the input parameter Beta is 0 - * - * @param[in] spmv_params - * SpMV input parameter bundle - * - * @param[in] d_tile_coordinates - * Pointer to the temporary array of tile starting coordinates - * - * @param[out] d_tile_carry_pairs - * Pointer to the temporary array carry-out dot product row-ids, one per block - * - * @param[in] num_tiles - * Number of merge tiles - * - * @param[in] tile_state - * Tile status interface for fixup reduce-by-key kernel - * - * @param[in] num_segment_fixup_tiles - * Number of reduce-by-key tiles (fixup grid size) - */ -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -__launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( - SpmvParams spmv_params, - CoordinateT* d_tile_coordinates, - KeyValuePair* d_tile_carry_pairs, - int num_tiles, - ScanTileStateT tile_state, - int num_segment_fixup_tiles) -{ - // Spmv agent type specialization - _CCCL_SUPPRESS_DEPRECATED_PUSH - using AgentSpmvT = AgentSpmv; - _CCCL_SUPPRESS_DEPRECATED_POP - - // Shared memory for AgentSpmv - __shared__ typename AgentSpmvT::TempStorage temp_storage; - - AgentSpmvT(temp_storage, spmv_params).ConsumeTile(d_tile_coordinates, d_tile_carry_pairs, num_tiles); - - // Initialize fixup tile status - tile_state.InitializeStatus(num_segment_fixup_tiles); -} - -/** - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam HAS_BETA - * Whether the input parameter Beta is 0 - */ -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) -{ - const int row = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - - if (row < spmv_params.num_rows) - { - ValueT result = 0.0; - - _CCCL_IF_CONSTEXPR (HAS_BETA) - { - result += spmv_params.beta * spmv_params.d_vector_y[row]; - } - - spmv_params.d_vector_y[row] = result; - } -} - -/** - * @brief Multi-block reduce-by-key sweep kernel entry point - * - * @tparam AgentSegmentFixupPolicyT - * Parameterized AgentSegmentFixupPolicy tuning policy type - * - * @tparam PairsInputIteratorT - * Random-access input iterator type for keys - * - * @tparam AggregatesOutputIteratorT - * Random-access output iterator type for values - * - * @tparam OffsetT - * Signed integer type for global offsets - * - * @tparam ScanTileStateT - * Tile status interface type - * - * @param[in] d_pairs_in - * Pointer to the array carry-out dot product row-ids, one per spmv block - * - * @param[in,out] d_aggregates_out - * Output value aggregates - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] num_tiles - * Total number of tiles for the entire problem - * - * @param[in] tile_state - * Tile status interface - */ -_CCCL_SUPPRESS_DEPRECATED_PUSH -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -__launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) - CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( - PairsInputIteratorT d_pairs_in, - AggregatesOutputIteratorT d_aggregates_out, - OffsetT num_items, - int num_tiles, - ScanTileStateT tile_state) // - _CCCL_SUPPRESS_DEPRECATED_POP -{ - // Thread block type for reducing tiles of value segments - using AgentSegmentFixupT = - AgentSegmentFixup, - ::cuda::std::plus<>, - OffsetT>; - - // Shared memory for AgentSegmentFixup - __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; - - // Process tiles - AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, ::cuda::std::equal_to<>{}, ::cuda::std::plus<>{}) - .ConsumeRange(num_items, num_tiles, tile_state); -} - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for global offsets - */ -template -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv -{ - //--------------------------------------------------------------------- - // Constants and Types - //--------------------------------------------------------------------- - - enum - { - INIT_KERNEL_THREADS = 128, - EMPTY_MATRIX_KERNEL_THREADS = 128 - }; - - // SpmvParams bundle type - using SpmvParamsT = SpmvParams; - - // 2D merge path coordinate type - using CoordinateT = typename CubVector::Type; - - // Tile status descriptor interface type - using ScanTileStateT = ReduceByKeyScanTileState; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - using KeyValuePairT = KeyValuePair; - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - /// SM50 - struct Policy500 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 6 : 7, - LOAD_LDG, - LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>; - - using SegmentFixupPolicyT = - AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE>; - }; - - /// SM60 - struct Policy600 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 5 : 7, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS>; - - using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; - }; - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 600) - using PtxPolicy = Policy600; - -#else - using PtxPolicy = Policy500; -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT - {}; - struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT - {}; - - //--------------------------------------------------------------------- - // Utilities - //--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static void - InitConfigs(int ptx_version, KernelConfig& spmv_config, KernelConfig& segment_fixup_config) - { - NV_IF_TARGET( - NV_IS_DEVICE, - ( // We're on the device, so initialize the kernel dispatch - // configurations with the current PTX policy - spmv_config.template Init(); segment_fixup_config.template Init();), - ( - // We're on the host, so lookup and initialize the kernel dispatch - // configurations with the policies that match the device's PTX - // version - if (ptx_version >= 600) { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } else if (ptx_version >= 500) { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } else { - spmv_config.template Init(); - segment_fixup_config.template Init(); - })); - } - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide reduction using the - * specified kernel functions. - * - * If the input is larger than a single tile, this method uses two-passes of - * kernel invocations. - * - * @tparam Spmv1ColKernelT - * Function type of cub::DeviceSpmv1ColKernel - * - * @tparam SpmvSearchKernelT - * Function type of cub::AgentSpmvSearchKernel - * - * @tparam SpmvKernelT - * Function type of cub::AgentSpmvKernel - * - * @tparam SegmentFixupKernelT - * Function type of cub::DeviceSegmentFixupKernelT - * - * @tparam SpmvEmptyMatrixKernelT - * Function type of cub::DeviceSpmvEmptyMatrixKernel - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When nullptr, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @paramSpMV spmv_params - * input parameter bundle - * - * @param[in] stream - * CUDA stream to launch kernels within. Default is stream0. - * - * @param[in] spmv_1col_kernel - * Kernel function pointer to parameterization of DeviceSpmv1ColKernel - * - * @param[in] spmv_search_kernel - * Kernel function pointer to parameterization of AgentSpmvSearchKernel - * - * @param[in] spmv_kernel - * Kernel function pointer to parameterization of AgentSpmvKernel - * - * @param[in] segment_fixup_kernel - * Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel - * - * @param[in] spmv_empty_matrix_kernel - * Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel - * - * @param[in] spmv_config - * Dispatch parameters that match the policy that @p spmv_kernel was compiled for - * - * @param[in] segment_fixup_config - * Dispatch parameters that match the policy that @p segment_fixup_kernel was compiled for - */ - template - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - SpmvParamsT& spmv_params, - cudaStream_t stream, - Spmv1ColKernelT spmv_1col_kernel, - SpmvSearchKernelT spmv_search_kernel, - SpmvKernelT spmv_kernel, - SegmentFixupKernelT segment_fixup_kernel, - SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, - KernelConfig spmv_config, - KernelConfig segment_fixup_config) - { - cudaError error = cudaSuccess; - do - { - if (spmv_params.num_rows < 0 || spmv_params.num_cols < 0) - { - return cudaErrorInvalidValue; - } - - if (spmv_params.num_rows == 0 || spmv_params.num_cols == 0) - { // Empty problem, no-op. - if (d_temp_storage == nullptr) - { - temp_storage_bytes = 1; - } - - break; - } - - if (spmv_params.num_nonzeros == 0) - { - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage allocation - temp_storage_bytes = 1; - break; - } - - constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS; - const int blocks_in_grid = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block); - -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n", - blocks_in_grid, - threads_in_block, - (long long) stream); -#endif // CUB_DEBUG_LOG - error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) - .doit(spmv_empty_matrix_kernel, spmv_params); - - if (CubDebug(error)) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - - break; - } - - if (spmv_params.num_cols == 1) - { - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage allocation - temp_storage_bytes = 1; - break; - } - - // Get search/init grid dims - int degen_col_kernel_block_size = INIT_KERNEL_THREADS; - int degen_col_kernel_grid_size = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size); - -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", - degen_col_kernel_grid_size, - degen_col_kernel_block_size, - (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke spmv_search_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream) - .doit(spmv_1col_kernel, spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - - break; - } - - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) - { - break; - } - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) - { - break; - } - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) - { - break; - } - - // Total number of spmv work items - int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; - - // Tile sizes of kernels - int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; - int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; - - // Number of tiles for kernels - int num_merge_tiles = ::cuda::ceil_div(num_merge_items, merge_tile_size); - int num_segment_fixup_tiles = ::cuda::ceil_div(num_merge_tiles, segment_fixup_tile_size); - - // Get SM occupancy for kernels - int spmv_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(spmv_sm_occupancy, spmv_kernel, spmv_config.block_threads))) - { - break; - } - - int segment_fixup_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - segment_fixup_sm_occupancy, segment_fixup_kernel, segment_fixup_config.block_threads))) - { - break; - } - - // Get grid dimensions - dim3 spmv_grid_size(CUB_MIN(num_merge_tiles, max_dim_x), ::cuda::ceil_div(num_merge_tiles, max_dim_x), 1); - - dim3 segment_fixup_grid_size( - CUB_MIN(num_segment_fixup_tiles, max_dim_x), ::cuda::ceil_div(num_segment_fixup_tiles, max_dim_x), 1); - - // Get the temporary storage allocation requirements - size_t allocation_sizes[3]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) - { - break; // bytes needed for reduce-by-key tile status descriptors - } - allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs - allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - void* allocations[3] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) - { - break; - } - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) - { - break; - } - - // Alias the other allocations - KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs - CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates - - // Get search/init grid dims - int search_block_size = INIT_KERNEL_THREADS; - int search_grid_size = ::cuda::ceil_div(num_merge_tiles + 1, search_block_size); - - if (search_grid_size < sm_count) - // if (num_merge_tiles < spmv_sm_occupancy * sm_count) - { - // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords - d_tile_coordinates = nullptr; - } - else - { -// Use separate search kernel if we have enough spmv tiles to saturate the device - -// Log spmv_search_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", - search_grid_size, - search_block_size, - (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke spmv_search_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) - .doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - } - -// Log spmv_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - spmv_grid_size.x, - spmv_grid_size.y, - spmv_grid_size.z, - spmv_config.block_threads, - (long long) stream, - spmv_config.items_per_thread, - spmv_sm_occupancy); -#endif // CUB_DEBUG_LOG - - // Invoke spmv_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) - .doit(spmv_kernel, - spmv_params, - d_tile_coordinates, - d_tile_carry_pairs, - num_merge_tiles, - tile_state, - num_segment_fixup_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - - // Run reduce-by-key fixup if necessary - if (num_merge_tiles > 1) - { -// Log segment_fixup_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - segment_fixup_grid_size.x, - segment_fixup_grid_size.y, - segment_fixup_grid_size.z, - segment_fixup_config.block_threads, - (long long) stream, - segment_fixup_config.items_per_thread, - segment_fixup_sm_occupancy); -#endif // CUB_DEBUG_LOG - - // Invoke segment_fixup_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream) - .doit(segment_fixup_kernel, - d_tile_carry_pairs, - spmv_params.d_vector_y, - num_merge_tiles, - num_segment_fixup_tiles, - tile_state); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - } - } while (0); - - return error; - } - - /** - * @brief Internal dispatch routine for computing a device-wide reduction - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When nullptr, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param SpMV spmv_params - * input parameter bundle - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. Default is stream0. - */ - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - Dispatch(void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream = 0) - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) - { - break; - } - - // Get kernel kernel dispatch configurations - KernelConfig spmv_config, segment_fixup_config; - InitConfigs(ptx_version, spmv_config, segment_fixup_config); - - constexpr bool has_alpha = false; - constexpr bool has_beta = false; - - if (CubDebug( - error = Dispatch( - d_temp_storage, - temp_storage_bytes, - spmv_params, - stream, - DeviceSpmv1ColKernel, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - DeviceSpmvEmptyMatrixKernel, - spmv_config, - segment_fixup_config))) - { - break; - } - - } while (0); - - return error; - } -}; - -CUB_NAMESPACE_END diff --git a/cub/test/test_device_spmv.cu b/cub/test/test_device_spmv.cu deleted file mode 100644 index 13dba77a594..00000000000 --- a/cub/test/test_device_spmv.cu +++ /dev/null @@ -1,611 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -// Ensure printing of CUDA runtime errors to console -#define CUB_STDERR - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "test_util.h" -#include -#include - -_CCCL_SUPPRESS_DEPRECATED_PUSH - -bool g_verbose = false; - -//============================================================================== -// Casts char types to int for numeric printing -template -T print_cast(T val) -{ - return val; -} - -int print_cast(char val) -{ - return static_cast(val); -} - -int print_cast(signed char val) -{ - return static_cast(val); -} - -int print_cast(unsigned char val) -{ - return static_cast(val); -} - -//============================================================================== -// Print a vector to out -template -void print_vector(std::ostream& out, const VectorT& vec) -{ - bool first = true; - for (const auto& val : vec) - { - if (!first) - { - out << ", "; - } - first = false; - out << print_cast(val); - } -} - -//============================================================================== -// Simple CSR matrix implementation. -// HostStorage controls whether data is stored on the host or device. -// Use the host_csr_matrix and device_csr_matrix aliases for code clarity. -template -struct csr_matrix -{ - csr_matrix(int num_rows, int num_cols) - : m_row_offsets(static_cast(num_rows + 1), 0) - , m_num_rows(num_rows) - , m_num_columns(num_cols) - {} - - // host/device conversion constructor - explicit csr_matrix(const csr_matrix& other) - : m_values(other.m_values) - , m_row_offsets(other.m_row_offsets) - , m_column_indices(other.m_column_indices) - , m_num_rows(other.m_num_rows) - , m_num_columns(other.m_num_columns) - , m_num_nonzeros(other.m_num_nonzeros) - {} - - // Note that this must append to the values array. Finish filling each row - // before adding to the next, and each row's columns must be added in order. - // Must call `finalize` once all items are added. - void append_value(int row, int col, ValueT value) - { - ++m_num_nonzeros; - ++m_row_offsets[row]; - m_column_indices.push_back(col); - m_values.push_back(std::move(value)); - } - - void finalize() - { - _CCCL_IF_CONSTEXPR (HostStorage) - { - thrust::exclusive_scan(thrust::host, m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin()); - } - else - { - thrust::exclusive_scan(c2h::device_policy, m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin()); - } - AssertEquals(m_row_offsets.back(), m_num_nonzeros); - } - - const ValueT* get_values() const - { - return thrust::raw_pointer_cast(m_values.data()); - } - - const int* get_row_offsets() const - { - return thrust::raw_pointer_cast(m_row_offsets.data()); - } - - int get_row_offset(int row) const - { - return m_row_offsets[row]; - } - - int get_row_num_nonzero(int row) const - { - return m_row_offsets[row + 1] - m_row_offsets[row]; - } - - const int* get_column_indices() const - { - return thrust::raw_pointer_cast(m_column_indices.data()); - } - - int get_num_rows() const - { - return m_num_rows; - } - - int get_num_columns() const - { - return m_num_columns; - } - - int get_num_nonzeros() const - { - return m_num_nonzeros; - } - - void print_internals(std::ostream& out) const - { - out << (HostStorage ? "host" : "device") << "_csr_matrix" - << "(" << m_num_rows << ", " << m_num_columns << ")\n" - << " - num_elems: " << (m_num_rows * m_num_columns) << "\n" - << " - num_nonzero: " << m_num_nonzeros << "\n" - << " - row_offsets:\n ["; - print_vector(out, m_row_offsets); - out << "]\n" - << " - column_indices:\n ["; - print_vector(out, m_column_indices); - out << "]\n" - << " - values:\n ["; - print_vector(out, m_values); - out << "]\n"; - } - - void print_summary(std::ostream& out) const - { - const int num_elems = m_num_rows * m_num_columns; - const float fill_ratio = - num_elems == 0 ? 0.f : (static_cast(m_num_nonzeros) / static_cast(num_elems)); - - out << m_num_rows << "x" << m_num_columns << ", " << m_num_nonzeros << "/" << num_elems << " (" << fill_ratio - << ")\n"; - } - - friend class csr_matrix; - -private: - template - using vector_t = ::cuda::std::_If, c2h::device_vector>; - - vector_t m_values; - vector_t m_row_offsets; - vector_t m_column_indices; - - int m_num_rows{0}; - int m_num_columns{0}; - int m_num_nonzeros{0}; -}; - -//============================================================================== -// Convenience aliases for host/device csr_matrix types. -template -using host_csr_matrix = csr_matrix; - -template -using device_csr_matrix = csr_matrix; - -//============================================================================== -// Compare two floats within a tolerance. -// This mimics the approach used by Thrust's ASSERT_ALMOST_EQUAL checks. -template -struct fp_almost_equal_functor -{ - __host__ __device__ bool operator()(ValueT v1, ValueT v2) const - { - constexpr double r_tol = 1e-3; - constexpr double a_tol = 1e-2; - const double limit = r_tol * (std::fabs(v1) + std::fabs(v2)) + a_tol; - return std::fabs(v1 - v2) <= limit; - } -}; - -//============================================================================== -// Compare the reference and cub output vectors. -// Use fuzzy check for floating point values. -template -bool compare_results( - std::true_type /* is_fp */, const c2h::host_vector& h_vec1, const c2h::device_vector& d_vec2) -{ - c2h::device_vector d_vec1(h_vec1); - auto err = thrust::mismatch( - c2h::device_policy, d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin(), fp_almost_equal_functor{}); - if (err.first == d_vec1.cend() || err.second == d_vec2.cend()) - { - return true; - } - else - { - c2h::host_vector h_vec2(d_vec2); - const auto idx = thrust::distance(d_vec1.cbegin(), err.first); - std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs " - << print_cast(ValueT{h_vec2[idx]}) << std::endl; - return false; - } -}; - -template -bool compare_results( - std::false_type /* is_fp */, const c2h::host_vector& h_vec1, const c2h::device_vector& d_vec2) -{ - c2h::device_vector d_vec1(h_vec1); - auto err = thrust::mismatch(c2h::device_policy, d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin()); - if (err.first == d_vec1.cend() || err.second == d_vec2.cend()) - { - return true; - } - else - { - c2h::host_vector h_vec2(d_vec2); - const auto idx = thrust::distance(d_vec1.cbegin(), err.first); - std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs " - << print_cast(ValueT{h_vec2[idx]}) << std::endl; - return false; - } -} - -//============================================================================== -// Generate a random host_csr_matrix with the specified dimensions. -// target_fill_ratio is the target fraction of non-zero elements (may be more -// or less in the output). -template -host_csr_matrix make_random_csr_matrix(int num_rows, int num_cols, float target_fill_ratio) -{ - host_csr_matrix mat{num_rows, num_cols}; - - for (int row = 0; row < num_rows; ++row) - { - for (int col = 0; col < num_cols; ++col) - { - const bool is_non_zero = RandomValue(1.f) < target_fill_ratio; - if (!is_non_zero) - { - continue; - } - - if (std::is_floating_point::value) - { - // Keep fp numbers somewhat small, from -50 -> 50; otherwise we run - // into issues with nans/infs - ValueT value = (RandomValue(static_cast(100)) - static_cast(50)); - mat.append_value(row, col, value); - } - else - { - ValueT value{}; - InitValue(RANDOM, value); - mat.append_value(row, col, value); - } - } - } - - mat.finalize(); - - const int num_elements = num_rows * num_cols; - const float actual_fill_ratio = static_cast(mat.get_num_nonzeros()) / static_cast(num_elements); - - if (g_verbose) - { - printf( - "Created host_csr_matrix<%s>(%d, %d)\n" - " - NumElements: %d\n" - " - NumNonZero: %d\n" - " - Target fill: %0.2f%%\n" - " - Actual fill: %0.2f%%\n", - typeid(ValueT).name(), - num_rows, - num_cols, - num_elements, - mat.get_num_nonzeros(), - target_fill_ratio, - actual_fill_ratio); - } - - return mat; -} - -//============================================================================== -// Fill a vector with random values. -template -c2h::host_vector make_random_vector(int len) -{ - c2h::host_vector vec(len); - for (auto& val : vec) - { - if (std::is_floating_point::value) - { // Keep fp numbers somewhat small; otherwise we run into issues with - // nans/infs - val = RandomValue(static_cast(100)) - static_cast(50); - } - else - { - InitValue(RANDOM, val); - } - } - return vec; -} - -//============================================================================== -// Serial y = Ax computation -template -void compute_reference_solution( - const host_csr_matrix& a, const c2h::host_vector& x, c2h::host_vector& y) -{ - if (a.get_num_rows() == 0 || a.get_num_columns() == 0) - { - return; - } - - for (int row = 0; row < a.get_num_rows(); ++row) - { - const int row_offset = a.get_row_offset(row); - const int row_length = a.get_row_num_nonzero(row); - const int* cols = a.get_column_indices() + row_offset; - const int* cols_end = cols + row_length; - const ValueT* values = a.get_values() + row_offset; - - ValueT accum{}; - while (cols < cols_end) - { - accum += (*values++) * x[*cols++]; - } - y[row] = accum; - } -} - -//============================================================================== -// cub::DeviceSpmv::CsrMV y = Ax computation -template -void compute_cub_solution( - const device_csr_matrix& a, const c2h::device_vector& x, c2h::device_vector& y) -{ - c2h::device_vector temp_storage; - std::size_t temp_storage_bytes{}; - auto err = cub::DeviceSpmv::CsrMV( - nullptr, - temp_storage_bytes, - a.get_values(), - a.get_row_offsets(), - a.get_column_indices(), - thrust::raw_pointer_cast(x.data()), - thrust::raw_pointer_cast(y.data()), - a.get_num_rows(), - a.get_num_columns(), - a.get_num_nonzeros()); - CubDebugExit(err); - - temp_storage.resize(temp_storage_bytes); - - err = cub::DeviceSpmv::CsrMV( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - a.get_values(), - a.get_row_offsets(), - a.get_column_indices(), - thrust::raw_pointer_cast(x.data()), - thrust::raw_pointer_cast(y.data()), - a.get_num_rows(), - a.get_num_columns(), - a.get_num_nonzeros()); - CubDebugExit(err); -} - -//============================================================================== -// Compute y = Ax twice, one reference and one cub::DeviceSpmv, and compare the -// results. -template -void test_spmv(const host_csr_matrix& h_a, const c2h::host_vector& h_x) -{ - if (g_verbose) - { - std::cout << "Testing cub::DeviceSpmv on inputs:\n"; - h_a.print_internals(std::cout); - std::cout << "x vector:\n ["; - print_vector(std::cout, h_x); - std::cout << "]" << std::endl; - } - else - { - h_a.print_summary(std::cout); - } - - const device_csr_matrix d_a(h_a); - const c2h::device_vector d_x(h_x); - - c2h::host_vector h_y(h_a.get_num_rows()); - c2h::device_vector d_y(d_a.get_num_rows()); - - compute_reference_solution(h_a, h_x, h_y); - compute_cub_solution(d_a, d_x, d_y); - - if (g_verbose) - { - std::cout << "reference output:\n ["; - print_vector(std::cout, h_y); - std::cout << "]\n"; - c2h::host_vector tmp_y(d_y); - std::cout << "cub::DeviceSpmv output:\n ["; - print_vector(std::cout, tmp_y); - std::cout << "]" << std::endl; - } - - constexpr auto is_fp = std::is_floating_point{}; - AssertTrue(compare_results(is_fp, h_y, d_y)); -} - -//============================================================================== -// Test example from cub::DeviceSpmv documentation -template -void test_doc_example() -{ - std::cout << "\n\ntest_doc_example<" << typeid(ValueT).name() << ">()" << std::endl; - - host_csr_matrix h_a(9, 9); - h_a.append_value(0, 1, ValueT{1}); - h_a.append_value(0, 3, ValueT{1}); - h_a.append_value(1, 0, ValueT{1}); - h_a.append_value(1, 2, ValueT{1}); - h_a.append_value(1, 4, ValueT{1}); - h_a.append_value(2, 1, ValueT{1}); - h_a.append_value(2, 5, ValueT{1}); - h_a.append_value(3, 0, ValueT{1}); - h_a.append_value(3, 4, ValueT{1}); - h_a.append_value(3, 6, ValueT{1}); - h_a.append_value(4, 1, ValueT{1}); - h_a.append_value(4, 3, ValueT{1}); - h_a.append_value(4, 5, ValueT{1}); - h_a.append_value(4, 7, ValueT{1}); - h_a.append_value(5, 2, ValueT{1}); - h_a.append_value(5, 4, ValueT{1}); - h_a.append_value(5, 8, ValueT{1}); - h_a.append_value(6, 3, ValueT{1}); - h_a.append_value(6, 7, ValueT{1}); - h_a.append_value(7, 4, ValueT{1}); - h_a.append_value(7, 6, ValueT{1}); - h_a.append_value(7, 8, ValueT{1}); - h_a.append_value(8, 5, ValueT{1}); - h_a.append_value(8, 7, ValueT{1}); - h_a.finalize(); - - c2h::host_vector h_x(9, ValueT{1}); - - test_spmv(h_a, h_x); -} - -//============================================================================== -// Generate and test a random SpMV operation with the given parameters. -template -void test_random(int rows, int cols, float target_fill_ratio) -{ - std::cout << "\n\ntest_random<" << typeid(ValueT).name() << ">(" << rows << ", " << cols << ", " << target_fill_ratio - << ")" << std::endl; - - host_csr_matrix h_a = make_random_csr_matrix(rows, cols, target_fill_ratio); - c2h::host_vector h_x = make_random_vector(cols); - - test_spmv(h_a, h_x); -} - -//============================================================================== -// Dispatch many random SpMV tests over a variety of parameters. -template -void test_random() -{ - test_random(0, 0, 1.f); - test_random(0, 1, 1.f); - test_random(1, 0, 1.f); - - constexpr int dim_min = 1; - constexpr int dim_max = 10000; - - constexpr int max_num_elems = 100000; - - constexpr float ratio_min = 0.f; - constexpr float ratio_max = 1.1f; // a lil over to account for fp errors - constexpr float ratio_step = 0.3334f; - - for (int rows = dim_min; rows < dim_max; rows <<= 1) - { - for (int cols = dim_min; cols < dim_max; cols <<= 1) - { - if (rows * cols >= max_num_elems) - { - continue; - } - - for (float ratio = ratio_min; ratio < ratio_max; ratio += ratio_step) - { - test_random(rows, cols, ratio); - // Test nearby non-power-of-two dims: - test_random(rows + 97, cols + 83, ratio); - } - } - } -} - -//============================================================================== -// Dispatch many SpMV tests for a given ValueT. -template -void test_type() -{ - test_doc_example(); - test_random(); -} - -//============================================================================== -// Dispatch many SpMV tests over a variety of types. -void test_types() -{ - test_type(); - test_type(); - test_type(); - test_type(); - test_type(); -} - -int main(int argc, char** argv) -{ - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s " - "[--device=] " - "[--v] verbose" - "\n", - argv[0]); - exit(0); - } - - CubDebugExit(args.DeviceInit()); - - test_types(); -} - -_CCCL_SUPPRESS_DEPRECATED_POP