Skip to content

Commit

Permalink
Merge branch 'main' of github.com:NVIDIA/cccl into fea/use-sccache-build-cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
trxcllnt committed Jan 28, 2025
2 parents f0cf283 + 83b10c2 commit eb245ca
Show file tree
Hide file tree
Showing 50 changed files with 438 additions and 272 deletions.
5 changes: 3 additions & 2 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ workflows:
- {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
- {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
# Current CTK testing:
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']}
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']}
# Disabled until we figure out the issue with the TBB dll
#- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']}
# Split up cub tests:
- {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']}
- {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
Expand Down
20 changes: 5 additions & 15 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,6 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
Expand All @@ -122,20 +120,12 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
equality_op_t,
offset_t,
keep_rejects,
may_alias,
policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias>;
may_alias
#if !TUNE_BASE
,
policy_hub_t<T>
#endif // !TUNE_BASE
>;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
20 changes: 5 additions & 15 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,6 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
Expand All @@ -148,20 +146,12 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
equality_op_t,
offset_t,
keep_rejects,
may_alias,
policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias>;
may_alias
#if !TUNE_BASE
,
policy_hub_t<T>
#endif // !TUNE_BASE
>;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/run_length_encode/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

#include <cub/device/device_run_length_encode.cuh>

#include <thrust/iterator/constant_iterator.h>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -74,7 +76,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT>)
using offset_t = OffsetT;
using keys_input_it_t = const T*;
using unique_output_it_t = T*;
using vals_input_it_t = cub::ConstantInputIterator<offset_t, OffsetT>;
using vals_input_it_t = thrust::constant_iterator<offset_t, OffsetT>;
using aggregate_output_it_t = offset_t*;
using num_runs_output_iterator_t = offset_t*;
using equality_op_t = ::cuda::std::equal_to<>;
Expand Down
1 change: 0 additions & 1 deletion cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/std/type_traits>

Expand Down
1 change: 0 additions & 1 deletion cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
#include <cub/block/block_store.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/ptx>
#include <cuda/std/type_traits>
Expand Down
1 change: 0 additions & 1 deletion cub/cub/agent/agent_segment_fixup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/std/type_traits>

Expand Down
6 changes: 6 additions & 0 deletions cub/cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
__syncthreads();

// Search for the thread's starting coordinate within the merge tile
_CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
_CCCL_SUPPRESS_DEPRECATED_POP
CoordinateT thread_start_coord;

MergePathSearch(
Expand Down Expand Up @@ -567,7 +569,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
__syncthreads();

// Search for the thread's starting coordinate within the merge tile
_CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
_CCCL_SUPPRESS_DEPRECATED_POP
CoordinateT thread_start_coord;

MergePathSearch(
Expand Down Expand Up @@ -701,7 +705,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
// Search our starting coordinates
OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
CoordinateT tile_coord;
_CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> nonzero_indices(0);
_CCCL_SUPPRESS_DEPRECATED_POP

// Search the merge path
MergePathSearch(
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/device_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/device/dispatch/dispatch_rle.cuh>
#include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <iterator>

Expand Down Expand Up @@ -199,14 +200,17 @@ struct DeviceRunLengthEncode
using length_t = cub::detail::non_void_value_t<LengthsOutputIteratorT, offset_t>;

// Generator type for providing 1s values for run-length reduction
_CCCL_SUPPRESS_DEPRECATED_PUSH
using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;
_CCCL_SUPPRESS_DEPRECATED_POP

using accum_t = ::cuda::std::__accumulator_t<reduction_op, length_t, length_t>;

using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;

using policy_t = detail::rle::encode::policy_hub<accum_t, key_t>;

_CCCL_SUPPRESS_DEPRECATED_PUSH
return DispatchReduceByKey<
InputIteratorT,
UniqueOutputIteratorT,
Expand All @@ -228,6 +232,7 @@ struct DeviceRunLengthEncode
reduction_op(),
num_items,
stream);
_CCCL_SUPPRESS_DEPRECATED_POP
}

//! @rst
Expand Down
2 changes: 2 additions & 0 deletions cub/cub/device/dispatch/dispatch_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel(
{
OffsetT diagonal = (tile_idx * TILE_ITEMS);
CoordinateT tile_coordinate;
_CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> nonzero_indices(0);
_CCCL_SUPPRESS_DEPRECATED_POP

// Search the merge path
MergePathSearch(
Expand Down
19 changes: 14 additions & 5 deletions cub/cub/device/dispatch/dispatch_streaming_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,20 @@
# pragma system_header
#endif // no system header

_CCCL_SUPPRESS_DEPRECATED_PUSH
#include <cuda/std/functional>
_CCCL_SUPPRESS_DEPRECATED_POP

#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <thrust/iterator/iterator_adaptor.h>
#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/functional>
#include <cuda/std/type_traits>

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

// suppress deprecation warnings for ConstantInputIterator
_CCCL_SUPPRESS_DEPRECATED_PUSH
CUB_NAMESPACE_BEGIN

namespace detail::reduce
Expand Down Expand Up @@ -190,6 +189,12 @@ template <typename InputIteratorT,
detail::reduce::policy_hub<KeyValuePair<PerPartitionOffsetT, InitT>, PerPartitionOffsetT, ReductionOpT>>
struct dispatch_streaming_arg_reduce_t
{
# if _CCCL_COMPILER(NVHPC)
// NVHPC fails to suppress a deprecation when the alias is inside the function below, so we put it here and span a
// deprecation suppression region across the entire file as well
using constant_offset_it_t = ConstantInputIterator<GlobalOffsetT>;
# endif // _CCCL_COMPILER(NVHPC)

// Internal dispatch routine for computing a device-wide argument extremum, like `ArgMin` and `ArgMax`
//
// @param[in] d_temp_storage
Expand Down Expand Up @@ -229,7 +234,11 @@ struct dispatch_streaming_arg_reduce_t
cudaStream_t stream)
{
// Constant iterator to provide the offset of the current partition for the user-provided input iterator
# if !_CCCL_COMPILER(NVHPC)
_CCCL_SUPPRESS_DEPRECATED_PUSH
using constant_offset_it_t = ConstantInputIterator<GlobalOffsetT>;
_CCCL_SUPPRESS_DEPRECATED_POP
# endif

// Wrapped input iterator to produce index-value tuples, i.e., <PerPartitionOffsetT, InputT>-tuples
// We make sure to offset the user-provided input iterator by the current partition's offset
Expand Down Expand Up @@ -373,7 +382,7 @@ struct dispatch_streaming_arg_reduce_t
};

} // namespace detail::reduce

_CCCL_SUPPRESS_DEPRECATED_POP
CUB_NAMESPACE_END

#endif // !_CCCL_DOXYGEN_INVOKED
22 changes: 16 additions & 6 deletions cub/cub/device/dispatch/tuning/tuning_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,11 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
};

#ifdef _CUB_HAS_TRANSFORM_UBLKCP
// H100 and H200
struct policy900 : ChainedPolicy<900, policy900, policy300>
template <int BlockSize, int PtxVersion>
struct bulkcopy_policy
{
static constexpr int min_bif = arch_to_min_bytes_in_flight(900);
using async_policy = async_copy_policy_t<256>;
static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion);
using async_policy = async_copy_policy_t<BlockSize>;
static constexpr bool exhaust_smem =
bulk_copy_smem_for_tile_size<RandomAccessIteratorsIn...>(
async_policy::block_threads * async_policy::min_items_per_thread)
Expand All @@ -188,10 +188,20 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
static constexpr bool use_fallback =
RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged;
static constexpr auto algorithm = use_fallback ? Algorithm::prefetch : Algorithm::ublkcp;
using algo_policy = ::cuda::std::_If<use_fallback, prefetch_policy_t<256>, async_policy>;
using algo_policy = ::cuda::std::_If<use_fallback, prefetch_policy_t<BlockSize>, async_policy>;
};

using max_policy = policy900;
struct policy900
: bulkcopy_policy<256, 900>
, ChainedPolicy<900, policy900, policy300>
{};

struct policy1000
: bulkcopy_policy<128, 1000>
, ChainedPolicy<1000, policy1000, policy900>
{};

using max_policy = policy1000;
#else // _CUB_HAS_TRANSFORM_UBLKCP
using max_policy = policy300;
#endif // _CUB_HAS_TRANSFORM_UBLKCP
Expand Down
10 changes: 9 additions & 1 deletion cub/cub/iterator/constant_input_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,13 @@ CUB_NAMESPACE_BEGIN
* The difference type of this iterator (Default: @p ptrdiff_t)
*/
template <typename ValueType, typename OffsetT = ptrdiff_t>
class ConstantInputIterator
class
#ifndef __CUDA_ARCH__
// Avoid generating a deprecation warning from length_encode.compute_xx.cpp1.ii, which is compiled by cicc for which
// we cannot suppress the warning
CCCL_DEPRECATED_BECAUSE("Use thrust::constant_iterator instead")
#endif
ConstantInputIterator
{
public:
// Required iterator traits
Expand Down Expand Up @@ -216,11 +222,13 @@ public:
}

/// ostream operator
_CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.val << "," << itr.offset << "]";
return os;
}
_CCCL_SUPPRESS_DEPRECATED_POP
};

CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/iterator/counting_input_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ CUB_NAMESPACE_BEGIN
* The difference type of this iterator (Default: @p ptrdiff_t)
*/
template <typename ValueType, typename OffsetT = ptrdiff_t>
class CountingInputIterator
class CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator instead") CountingInputIterator
{
public:
// Required iterator traits
Expand Down Expand Up @@ -218,11 +218,13 @@ public:

/// ostream operator
#if !_CCCL_COMPILER(NVRTC)
_CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.val << "]";
return os;
}
_CCCL_SUPPRESS_DEPRECATED_POP
#endif // !_CCCL_COMPILER(NVRTC)
};

Expand Down
4 changes: 3 additions & 1 deletion cub/cub/iterator/discard_output_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN
* @brief A discard iterator
*/
template <typename OffsetT = ptrdiff_t>
class DiscardOutputIterator
class CCCL_DEPRECATED_BECAUSE("Use thrust::discard_iterator instead") DiscardOutputIterator
{
public:
// Required iterator traits
Expand Down Expand Up @@ -191,11 +191,13 @@ public:
}

/// ostream operator
_CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.offset << "]";
return os;
}
_CCCL_SUPPRESS_DEPRECATED_POP
};

CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/iterator/transform_input_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ CUB_NAMESPACE_BEGIN
* The difference type of this iterator (Default: @p ptrdiff_t)
*/
template <typename ValueType, typename ConversionOp, typename InputIteratorT, typename OffsetT = ptrdiff_t>
class TransformInputIterator
class CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator instead") TransformInputIterator
{
public:
// Required iterator traits
Expand Down Expand Up @@ -233,10 +233,12 @@ public:
}

/// ostream operator
_CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */)
{
return os;
}
_CCCL_SUPPRESS_DEPRECATED_POP
};

CUB_NAMESPACE_END
Loading

0 comments on commit eb245ca

Please sign in to comment.