Commit afa315e

Merge branch 'main' into fix-arm64-emu-macro

fbusato authored Jan 28, 2025
2 parents: c94e6ff + e08bda4
Showing 201 changed files with 2,512 additions and 1,759 deletions.
ci/matrix.yaml (11 changes: 6 additions & 5 deletions)

@@ -13,14 +13,15 @@ workflows:
   # Old CTK/compiler
   - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc9', 'clang14', 'msvc2019']}
   # Current CTK build-only
-  - {jobs: ['build'], std: 17, cxx: ['gcc7', 'clang14']}
-  - {jobs: ['build'], std: 'max', cxx: ['gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
-  - {jobs: ['build'], std: 'max', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
+  - {jobs: ['build'], std: 'max', cxx: ['gcc7', 'gcc8', 'gcc9']}
+  - {jobs: ['build'], std: 'all', cxx: ['gcc10', 'gcc11', 'gcc12']}
+  - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
   - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
   - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
   # Current CTK testing:
-  - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']}
-  - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
+  - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']}
+  # Disabled until we figure out the issue with the TBB dll
+  #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']}
   # Split up cub tests:
   - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']}
   - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
ci/test_python.sh (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@ for module in cuda_parallel cuda_cooperative; do
   begin_group "⚙️ ${module} site-packages"
   pip freeze
   end_group "⚙️ ${module} site-packages"
-  run_command "🚀 Pytest ${module}" python -m pytest -v ./tests
+  run_command "🚀 Pytest ${module}" pytest -v ./tests
   deactivate

   popd >/dev/null
cub/benchmarks/bench/partition/flagged.cu (20 changes: 5 additions & 15 deletions)

@@ -111,8 +111,6 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
   using output_it_t = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

-#if !TUNE_BASE
-  using policy_t = policy_hub_t<T>;
   using dispatch_t = cub::DispatchSelectIf<
     input_it_t,
     flag_it_t,
@@ -122,20 +120,12 @@
     equality_op_t,
     offset_t,
     keep_rejects,
-    may_alias,
-    policy_t>;
-#else // TUNE_BASE
-  using dispatch_t = cub::DispatchSelectIf<
-    input_it_t,
-    flag_it_t,
-    output_it_t,
-    num_selected_it_t,
-    select_op_t,
-    equality_op_t,
-    offset_t,
-    keep_rejects,
-    may_alias>;
+    may_alias
+#if !TUNE_BASE
+    ,
+    policy_hub_t<T>
 #endif // TUNE_BASE
+    >;

   // Retrieve axis parameters
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
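The change above collapses the two duplicated dispatch_t definitions into one by moving the #if !TUNE_BASE guard inside the template argument list, so only the optional trailing policy argument is conditional. A minimal sketch of the pattern; DispatchLike and its arguments are hypothetical stand-ins, not CUB types:

// Sketch only: Policy falls back to a default when TUNE_BASE is set.
template <class Arg, class Policy = void>
struct DispatchLike
{};

using dispatch_t = DispatchLike<
  int
#if !TUNE_BASE
  ,
  double // stand-in for policy_hub_t<T>, supplied unless TUNE_BASE is set
#endif // TUNE_BASE
  >;

The nine shared template arguments are then spelled out once instead of twice, which is what shrinks both benchmark files by ten lines each.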
cub/benchmarks/bench/partition/if.cu (20 changes: 5 additions & 15 deletions)

@@ -137,8 +137,6 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
   using output_it_t = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

-#if !TUNE_BASE
-  using policy_t = policy_hub_t<T>;
   using dispatch_t = cub::DispatchSelectIf<
     input_it_t,
     flag_it_t,
@@ -148,20 +146,12 @@
     equality_op_t,
     offset_t,
     keep_rejects,
-    may_alias,
-    policy_t>;
-#else // TUNE_BASE
-  using dispatch_t = cub::DispatchSelectIf<
-    input_it_t,
-    flag_it_t,
-    output_it_t,
-    num_selected_it_t,
-    select_op_t,
-    equality_op_t,
-    offset_t,
-    keep_rejects,
-    may_alias>;
+    may_alias
+#if !TUNE_BASE
+    ,
+    policy_hub_t<T>
 #endif // !TUNE_BASE
+    >;

   // Retrieve axis parameters
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
cub/benchmarks/bench/run_length_encode/encode.cu (4 changes: 3 additions & 1 deletion)

@@ -27,6 +27,8 @@

 #include <cub/device/device_run_length_encode.cuh>

+#include <thrust/iterator/constant_iterator.h>
+
 #include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>

@@ -74,7 +76,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   using offset_t = OffsetT;
   using keys_input_it_t = const T*;
   using unique_output_it_t = T*;
-  using vals_input_it_t = cub::ConstantInputIterator<offset_t, OffsetT>;
+  using vals_input_it_t = thrust::constant_iterator<offset_t, OffsetT>;
   using aggregate_output_it_t = offset_t*;
   using num_runs_output_iterator_t = offset_t*;
   using equality_op_t = ::cuda::std::equal_to<>;
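For background: thrust::constant_iterator is the supported replacement for cub::ConstantInputIterator, which this commit treats as deprecated (its remaining uses below are wrapped in _CCCL_SUPPRESS_DEPRECATED_PUSH/POP). Both dereference to the same value at every position. A small illustrative sketch, not part of the diff:

#include <thrust/iterator/constant_iterator.h>

void constant_iterator_sketch()
{
  thrust::constant_iterator<int> ones(1); // yields 1 at every position
  int a = ones[0];  // 1
  int b = ones[42]; // 1 -- a cheap stream of identical values, e.g. a
                    // length of "1" per element for run-length reduction
  (void) a;
  (void) b;
}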
cub/cub/agent/agent_merge_sort.cuh (7 changes: 4 additions & 3 deletions)

@@ -43,10 +43,11 @@
 #include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>

-#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/core/load_iterator.h>

 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__cccl/cuda_capabilities.h>

 CUB_NAMESPACE_BEGIN

@@ -86,7 +87,7 @@ struct AgentBlockSort
   // Types and constants
   //---------------------------------------------------------------------

-  static constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+  static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;

   using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;

@@ -469,7 +470,7 @@ struct AgentMerge
   struct TempStorage : Uninitialized<_TempStorage>
   {};

-  static constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+  static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;
   static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
   static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
   static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
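The KEYS_ONLY change swaps the host-library trait std::is_same<...>::value for the libcu++ variable template, keeping the trait within <cuda/std/type_traits> rather than the host standard library. A quick illustration of the replacement form:

#include <cuda/std/type_traits>

static_assert(::cuda::std::is_same_v<int, int>, "");
static_assert(!::cuda::std::is_same_v<int, float>, "");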
cub/cub/agent/agent_reduce_by_key.cuh (1 change: 0 additions & 1 deletion)

@@ -49,7 +49,6 @@
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>

 #include <cuda/std/type_traits>

cub/cub/agent/agent_rle.cuh (1 change: 0 additions & 1 deletion)

@@ -52,7 +52,6 @@
 #include <cub/block/block_store.cuh>
 #include <cub/grid/grid_queue.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>

 #include <cuda/ptx>
 #include <cuda/std/type_traits>
cub/cub/agent/agent_segment_fixup.cuh (1 change: 0 additions & 1 deletion)

@@ -50,7 +50,6 @@
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>

 #include <cuda/std/type_traits>

cub/cub/agent/agent_spmv_orig.cuh (6 changes: 6 additions & 0 deletions)

@@ -386,7 +386,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
     __syncthreads();

     // Search for the thread's starting coordinate within the merge tile
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
     CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
+    _CCCL_SUPPRESS_DEPRECATED_POP
     CoordinateT thread_start_coord;

     MergePathSearch(
@@ -567,7 +569,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
     __syncthreads();

     // Search for the thread's starting coordinate within the merge tile
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
     CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
+    _CCCL_SUPPRESS_DEPRECATED_POP
     CoordinateT thread_start_coord;

     MergePathSearch(
@@ -701,7 +705,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
     // Search our starting coordinates
     OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
     CoordinateT tile_coord;
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
     CountingInputIterator<OffsetT> nonzero_indices(0);
+    _CCCL_SUPPRESS_DEPRECATED_POP

     // Search the merge path
     MergePathSearch(
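_CCCL_SUPPRESS_DEPRECATED_PUSH and _CCCL_SUPPRESS_DEPRECATED_POP bracket exactly the statements that touch a deprecated entity, keeping the rest of the translation unit warning-clean even though AgentSpmv is itself deprecated. The shape of the pattern as a sketch (this assumes CountingInputIterator carries a deprecation attribute, which is what the suppression here implies):

#include <cub/iterator/counting_input_iterator.cuh>

_CCCL_SUPPRESS_DEPRECATED_PUSH
cub::CountingInputIterator<int> indices(0); // deprecated type; warning silenced here only
_CCCL_SUPPRESS_DEPRECATED_POP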
cub/cub/detail/launcher/cuda_runtime.cuh (13 changes: 13 additions & 0 deletions)

@@ -51,6 +51,19 @@ struct TripleChevronFactory
   {
     return cudaOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, kernel_ptr, block_size, dynamic_smem_bytes);
   }
+
+  _CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION cudaError_t MaxGridDimX(int& max_grid_dim_x) const
+  {
+    int device_ordinal;
+    cudaError_t error = CubDebug(cudaGetDevice(&device_ordinal));
+    if (cudaSuccess != error)
+    {
+      return error;
+    }
+
+    // Get max grid dimension
+    return cudaDeviceGetAttribute(&max_grid_dim_x, cudaDevAttrMaxGridDimX, device_ordinal);
+  }
 };

 } // namespace detail
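The new MaxGridDimX helper mirrors the existing occupancy query: resolve the current device, then read cudaDevAttrMaxGridDimX. A hypothetical caller might use it to bound a launch; the include path and factory usage below are assumptions based on the snippet above, sketch only:

#include <cub/detail/launcher/cuda_runtime.cuh>

cudaError_t grid_clamp_sketch()
{
  cub::detail::TripleChevronFactory factory;
  int max_grid_dim_x = 0;
  cudaError_t error = factory.MaxGridDimX(max_grid_dim_x);
  if (error != cudaSuccess)
  {
    return error;
  }
  // ... bound the x-dimension of a subsequent kernel launch by max_grid_dim_x
  return cudaSuccess;
}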
cub/cub/device/device_for.cuh (1 change: 1 addition & 0 deletions)

@@ -47,6 +47,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/cuda/detail/core/util.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/unwrap_contiguous_iterator.h>

 #if __cccl_lib_mdspan
 #  include <cuda/std/__mdspan/extents.h>
cub/cub/device/device_radix_sort.cuh (28 changes: 16 additions & 12 deletions)

@@ -151,18 +151,22 @@ private:
     int end_bit,
     cudaStream_t stream)
   {
-    return DispatchRadixSort<IsDescending, KeyT, ValueT, OffsetT, DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>, DecomposerT>::
-      Dispatch(
-        d_temp_storage,
-        temp_storage_bytes,
-        d_keys,
-        d_values,
-        static_cast<OffsetT>(num_items),
-        begin_bit,
-        end_bit,
-        is_overwrite_okay,
-        stream,
-        decomposer);
+    return DispatchRadixSort<
+      IsDescending,
+      KeyT,
+      ValueT,
+      OffsetT,
+      detail::radix::policy_hub<KeyT, ValueT, OffsetT>,
+      DecomposerT>::Dispatch(d_temp_storage,
+                             temp_storage_bytes,
+                             d_keys,
+                             d_values,
+                             static_cast<OffsetT>(num_items),
+                             begin_bit,
+                             end_bit,
+                             is_overwrite_okay,
+                             stream,
+                             decomposer);
   }

   template <bool IsDescending, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
cub/cub/device/device_run_length_encode.cuh (5 changes: 5 additions & 0 deletions)

@@ -48,6 +48,7 @@
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/dispatch_rle.cuh>
 #include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>

 #include <iterator>

@@ -199,14 +200,17 @@ struct DeviceRunLengthEncode
     using length_t = cub::detail::non_void_value_t<LengthsOutputIteratorT, offset_t>;

     // Generator type for providing 1s values for run-length reduction
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
     using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;
+    _CCCL_SUPPRESS_DEPRECATED_POP

     using accum_t = ::cuda::std::__accumulator_t<reduction_op, length_t, length_t>;

     using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;

     using policy_t = detail::rle::encode::policy_hub<accum_t, key_t>;

+    _CCCL_SUPPRESS_DEPRECATED_PUSH
     return DispatchReduceByKey<
       InputIteratorT,
       UniqueOutputIteratorT,
@@ -228,6 +232,7 @@ struct DeviceRunLengthEncode
       reduction_op(),
       num_items,
       stream);
+    _CCCL_SUPPRESS_DEPRECATED_POP
   }

   //! @rst
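The "generator of 1s" comment captures the whole trick: run-length encoding is a reduce-by-key in which every value is 1, so summing the 1s within a run yields that run's length. A standalone Thrust sketch of the same idea (illustrative; not the DispatchReduceByKey path used above):

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>

#include <vector>

int main()
{
  std::vector<int> h_keys{1, 1, 2, 3, 3, 3};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  thrust::device_vector<int> unique_keys(h_keys.size());
  thrust::device_vector<int> run_lengths(h_keys.size());

  // One "1" per input element; the per-key sums become run lengths.
  thrust::reduce_by_key(keys.begin(),
                        keys.end(),
                        thrust::make_constant_iterator(1),
                        unique_keys.begin(),
                        run_lengths.begin());
  // unique_keys -> {1, 2, 3}; run_lengths -> {2, 1, 3}
  return 0;
}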
(The remaining changed files in this commit are not shown.)
