Skip to content

Commit

Permalink
Merge branch 'main' into bugfix/barrier_deadlock
Browse files Browse the repository at this point in the history
  • Loading branch information
wmaxey authored Jan 30, 2025
2 parents 17f4b3d + cea61a3 commit 428477d
Show file tree
Hide file tree
Showing 170 changed files with 3,583 additions and 1,072 deletions.
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ template <typename Policy,
bool ReadLeft>
struct AgentDifference
{
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, InputIteratorT>::type;

using BlockLoad = typename cub::BlockLoadType<Policy, LoadIt>::type;
using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
Expand Down Expand Up @@ -119,7 +119,7 @@ struct AgentDifference
OffsetT num_items)
: temp_storage(temp_storage.Alias())
, input_it(input_it)
, load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it))
, load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it))
, first_tile_previous(first_tile_previous)
, result(result)
, difference_op(difference_op)
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/agent/agent_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ struct agent_t
using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;

using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt2>::type;
using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt1>::type;
using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt2>::type;

using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
Expand Down
15 changes: 9 additions & 6 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ struct AgentBlockSort

using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;

using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyInputIteratorT>::type;
using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueInputIteratorT>::type;
using KeysLoadIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyInputIteratorT>::type;
using ItemsLoadIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueInputIteratorT>::type;

using BlockLoadKeys = typename cub::BlockLoadType<Policy, KeysLoadIt>::type;
using BlockLoadItems = typename cub::BlockLoadType<Policy, ItemsLoadIt>::type;
Expand Down Expand Up @@ -438,10 +440,11 @@ struct AgentMerge
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyT*>::type;
using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueT*>::type;
using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyIteratorT>::type;
using ItemsLoadPingIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueIteratorT>::type;
using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyT*>::type;
using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueT*>::type;

using KeysOutputPongIt = KeyIteratorT;
using ItemsOutputPongIt = ValueIteratorT;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ public:

using WarpMergeSortT = WarpMergeSort<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::WARP_THREADS, ValueT>;

using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const KeyT*>::type;
using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const ValueT*>::type;
using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const KeyT*>::type;
using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const ValueT*>::type;

using WarpLoadKeysT = cub::WarpLoad<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::LOAD_ALGORITHM, PolicyT::WARP_THREADS>;
using WarpLoadItemsT =
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/detail/fast_modulo_division.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#endif // no system header

#include <cub/detail/type_traits.cuh> // implicit_prom_t
#include <cub/util_type.cuh> // CUB_IS_INT128_ENABLED
#include <cub/util_type.cuh> // _CCCL_HAS_INT128()

#include <cuda/cmath> // cuda::std::ceil_div
#include <cuda/std/bit> // std::has_single_bit
Expand Down Expand Up @@ -79,15 +79,15 @@ struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 4)>
using type = ::cuda::std::uint64_t;
};

#if CUB_IS_INT128_ENABLED
#if _CCCL_HAS_INT128()

template <typename T>
struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 8)>::type>
{
using type = __uint128_t;
};

#endif // CUB_IS_INT128_ENABLED
#endif // _CCCL_HAS_INT128()

template <typename T>
using larger_unsigned_type_t = typename larger_unsigned_type<T>::type;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/detail/launcher/cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ namespace detail

struct TripleChevronFactory
{
CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron
operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const
CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron operator()(
dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream, bool dependent_launch = false) const
{
return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream);
return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream, dependent_launch);
}

CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version)
Expand Down
18 changes: 12 additions & 6 deletions cub/cub/device/device_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,19 @@ struct DeviceMerge
void* d_temp_storage,
std::size_t& temp_storage_bytes,
KeyIteratorIn1 keys_in1,
int num_keys1,
::cuda::std::int64_t num_keys1,
KeyIteratorIn2 keys_in2,
int num_keys2,
::cuda::std::int64_t num_keys2,
KeyIteratorOut keys_out,
CompareOp compare_op = {},
cudaStream_t stream = nullptr)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys");

using offset_t = ::cuda::std::int64_t;

return detail::merge::
dispatch_t<KeyIteratorIn1, NullType*, KeyIteratorIn2, NullType*, KeyIteratorOut, NullType*, int, CompareOp>::
dispatch_t<KeyIteratorIn1, NullType*, KeyIteratorIn2, NullType*, KeyIteratorOut, NullType*, offset_t, CompareOp>::
dispatch(
d_temp_storage,
temp_storage_bytes,
Expand Down Expand Up @@ -161,24 +164,27 @@ struct DeviceMerge
std::size_t& temp_storage_bytes,
KeyIteratorIn1 keys_in1,
ValueIteratorIn1 values_in1,
int num_pairs1,
::cuda::std::int64_t num_pairs1,
KeyIteratorIn2 keys_in2,
ValueIteratorIn2 values_in2,
int num_pairs2,
::cuda::std::int64_t num_pairs2,
KeyIteratorOut keys_out,
ValueIteratorOut values_out,
CompareOp compare_op = {},
cudaStream_t stream = nullptr)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs");

using offset_t = ::cuda::std::int64_t;

return detail::merge::dispatch_t<
KeyIteratorIn1,
ValueIteratorIn1,
KeyIteratorIn2,
ValueIteratorIn2,
KeyIteratorOut,
ValueIteratorOut,
int,
offset_t,
CompareOp>::dispatch(d_temp_storage,
temp_storage_bytes,
keys_in1,
Expand Down
12 changes: 6 additions & 6 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -646,27 +646,27 @@ public:
using IntArithmeticT = ::cuda::std::_If< //
sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), //
uint32_t, //
#if CUB_IS_INT128_ENABLED
#if _CCCL_HAS_INT128()
::cuda::std::_If< //
(::cuda::std::is_same<CommonT, __int128_t>::value || //
::cuda::std::is_same<CommonT, __uint128_t>::value), //
CommonT, //
uint64_t> //
#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv
#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv
uint64_t
#endif // !CUB_IS_INT128_ENABLED
#endif // !_CCCL_HAS_INT128()
>;

// Alias template that excludes __[u]int128 from the integral types
template <typename T>
using is_integral_excl_int128 =
#if CUB_IS_INT128_ENABLED
#if _CCCL_HAS_INT128()
::cuda::std::_If<::cuda::std::is_same<T, __int128_t>::value&& ::cuda::std::is_same<T, __uint128_t>::value,
::cuda::std::false_type,
::cuda::std::is_integral<T>>;
#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv
#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv
::cuda::std::is_integral<T>;
#endif // !CUB_IS_INT128_ENABLED
#endif // !_CCCL_HAS_INT128()

union ScaleT
{
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ __launch_bounds__(
CompareOp>::type;
using MergePolicy = typename MergeAgent::policy;

using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator;
using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator;
using vsmem_helper_t = vsmem_helper_impl<MergeAgent>;
__shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage;
auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
Expand Down
Loading

0 comments on commit 428477d

Please sign in to comment.