Commit f845766

Merge branch 'main' of github.com:NVIDIA/cccl into fea/use-sccache-build-cluster

trxcllnt committed Feb 3, 2025
2 parents 5ddbbc7 + 9b9e3ed

Showing 61 changed files with 807 additions and 888 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/backport-prs.yml

@@ -28,9 +28,7 @@ jobs:
       )
     steps:
       - uses: actions/checkout@v4
-        with:
-          persist-credentials: false
       - name: Create backport pull requests
-        uses: korthout/backport-action@v1
+        uses: korthout/backport-action@v3
         with:
           merge_commits: 'skip'
16 changes: 8 additions & 8 deletions c2h/generators.cu

@@ -478,15 +478,15 @@
 template void
 init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, float* out, std::size_t element_size);
 template void init_key_segments(
   const c2h::device_vector<std::uint32_t>& segment_offsets, custom_type_state_t* out, std::size_t element_size);
-#ifdef _CCCL_HAS_NVFP16
+#if TEST_HALF_T()
 template void
 init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, half_t* out, std::size_t element_size);
-#endif // _CCCL_HAS_NVFP16
+#endif // TEST_HALF_T()

-#ifdef _CCCL_HAS_NVBF16
+#if TEST_BF_T()
 template void
 init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, bfloat16_t* out, std::size_t element_size);
-#endif // _CCCL_HAS_NVBF16
+#endif // TEST_BF_T()
 } // namespace detail

 template <typename T>

@@ -552,15 +552,15 @@
 INSTANTIATE(double);
 INSTANTIATE(bool);
 INSTANTIATE(char);

-#ifdef _CCCL_HAS_NVFP16
+#if TEST_HALF_T()
 INSTANTIATE(half_t);
 INSTANTIATE(__half);
-#endif // _CCCL_HAS_NVFP16
+#endif // TEST_HALF_T()

-#ifdef _CCCL_HAS_NVBF16
+#if TEST_BF_T()
 INSTANTIATE(bfloat16_t);
 INSTANTIATE(__nv_bfloat16);
-#endif // _CCCL_HAS_NVBF16
+#endif // TEST_BF_T()

 #undef INSTANTIATE_RND
 #undef INSTANTIATE_MOD
28 changes: 23 additions & 5 deletions c2h/include/c2h/bfloat16.cuh

@@ -36,6 +36,7 @@

 #include <cub/util_type.cuh>

+#include <cuda/std/limits>
 #include <cuda/std/type_traits>

 #include <cstdint>

@@ -232,19 +233,36 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
 /******************************************************************************
  * Traits overloads
  ******************************************************************************/

+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct CUB_NS_QUALIFIER::FpLimits<bfloat16_t>
+struct __is_extended_floating_point<bfloat16_t> : true_type
+{};
+
+#ifndef _CCCL_NO_VARIABLE_TEMPLATES
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<bfloat16_t> = true;
+#endif // _CCCL_NO_VARIABLE_TEMPLATES
+
+template <>
+class __numeric_limits_impl<bfloat16_t, __numeric_limits_type::__floating_point>
 {
-  static __host__ __device__ __forceinline__ bfloat16_t Max()
+public:
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bfloat16_t max()
   {
-    return bfloat16_t::max();
+    return bfloat16_t(numeric_limits<__nv_bfloat16>::max());
   }

-  static __host__ __device__ __forceinline__ bfloat16_t Lowest()
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bfloat16_t min()
+  {
+    return bfloat16_t(numeric_limits<__nv_bfloat16>::min());
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bfloat16_t lowest()
   {
-    return bfloat16_t::lowest();
+    return bfloat16_t(numeric_limits<__nv_bfloat16>::lowest());
   }
 };
+_LIBCUDACXX_END_NAMESPACE_STD

 template <>
 struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
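
With the __is_extended_floating_point and __numeric_limits_impl specializations above, bfloat16_t can be queried through cuda::std::numeric_limits like a built-in floating-point type. A minimal device-side sketch of that usage, mirroring the include order this commit sets up in extended_types.h (the kernel is illustrative, not part of the commit):

#include <cuda_bf16.h>

#include <c2h/bfloat16.cuh>

#include <cuda/std/limits>

// Fills out[0] and out[1] with the extremes of bfloat16_t, resolved through
// the numeric_limits specialization added in the diff above.
__global__ void fill_bf16_extremes(bfloat16_t* out)
{
  out[0] = cuda::std::numeric_limits<bfloat16_t>::lowest();
  out[1] = cuda::std::numeric_limits<bfloat16_t>::max();
}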
26 changes: 17 additions & 9 deletions c2h/include/c2h/extended_types.h

@@ -30,21 +30,29 @@
 #include <cuda/__cccl_config>

 #ifndef TEST_HALF_T
-#  define TEST_HALF_T _CCCL_HAS_NVFP16
-#endif
+#  if defined(_CCCL_HAS_NVFP16)
+#    define TEST_HALF_T() 1
+#  else // defined(_CCCL_HAS_NVFP16)
+#    define TEST_HALF_T() 0
+#  endif // defined(_CCCL_HAS_NVFP16)
+#endif // TEST_HALF_T

 #ifndef TEST_BF_T
-#  define TEST_BF_T _CCCL_HAS_NVBF16
-#endif
-
-#ifdef TEST_HALF_T
+#  if defined(_CCCL_HAS_NVBF16)
+#    define TEST_BF_T() 1
+#  else // defined(_CCCL_HAS_NVBF16)
+#    define TEST_BF_T() 0
+#  endif // defined(_CCCL_HAS_NVBF16)
+#endif // TEST_BF_T
+
+#if TEST_HALF_T()
 #  include <cuda_fp16.h>

 #  include <c2h/half.cuh>
-#endif
+#endif // TEST_HALF_T()

-#ifdef TEST_BF_T
+#if TEST_BF_T()
 #  include <cuda_bf16.h>

 #  include <c2h/bfloat16.cuh>
-#endif
+#endif // TEST_BF_T()
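
These flags were previously object-like macros tested with #ifdef; they are now function-like macros that always expand to 0 or 1 and are tested with #if. The gain is robustness: an #if on a misspelled or never-defined function-like macro fails to preprocess (the bare identifier expands to 0, and 0() is a syntax error), instead of silently evaluating to false and compiling a test path out. A minimal usage sketch (the test functions are hypothetical):

#include <c2h/extended_types.h>

#if TEST_HALF_T() // always defined after the include; a typo here is a hard preprocessing error
void test_half_path(half_t) {}
#endif // TEST_HALF_T()

#if TEST_BF_T()
void test_bf16_path(bfloat16_t) {}
#endif // TEST_BF_T()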
28 changes: 23 additions & 5 deletions c2h/include/c2h/half.cuh

@@ -37,6 +37,7 @@

 #include <cub/util_type.cuh>

+#include <cuda/std/limits>
 #include <cuda/std/type_traits>

 #include <cstdint>

@@ -327,19 +328,36 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
 /******************************************************************************
  * Traits overloads
  ******************************************************************************/

+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct CUB_NS_QUALIFIER::FpLimits<half_t>
+struct __is_extended_floating_point<half_t> : true_type
+{};
+
+#ifndef _CCCL_NO_VARIABLE_TEMPLATES
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<half_t> = true;
+#endif // _CCCL_NO_VARIABLE_TEMPLATES
+
+template <>
+class __numeric_limits_impl<half_t, __numeric_limits_type::__floating_point>
 {
-  static __host__ __device__ __forceinline__ half_t Max()
+public:
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE half_t max()
   {
-    return (half_t::max)();
+    return half_t(numeric_limits<__half>::max());
   }

-  static __host__ __device__ __forceinline__ half_t Lowest()
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE half_t min()
+  {
+    return half_t(numeric_limits<__half>::min());
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE half_t lowest()
   {
-    return half_t::lowest();
+    return half_t(numeric_limits<__half>::lowest());
   }
 };
+_LIBCUDACXX_END_NAMESPACE_STD

 template <>
 struct CUB_NS_QUALIFIER::NumericTraits<half_t>
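
For call sites, the migration reads as a swap of the removed cub::FpLimits interface for the standard-style trait. A host-side sketch under that assumption (the function is illustrative, not part of the commit):

#include <cuda_fp16.h>

#include <c2h/half.cuh>

#include <cuda/std/limits>

void half_limits_example()
{
  // Before this change (removed lines above): half_t hi = cub::FpLimits<half_t>::Max();
  // After: the cuda::std::numeric_limits specialization this header now provides.
  half_t hi = cuda::std::numeric_limits<half_t>::max();
  half_t lo = cuda::std::numeric_limits<half_t>::lowest();
  (void) hi;
  (void) lo;
}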
165 changes: 165 additions & 0 deletions cub/benchmarks/bench/merge/keys.cu

@@ -0,0 +1,165 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/device_merge.cuh>

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/iterator/tabulate_output_iterator.h>
#include <thrust/sort.h>

#include <cuda/std/utility>

#include <cstdint>

#include "merge_common.cuh"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1

template <typename KeyT, typename OffsetT>
void keys(nvbench::state& state, nvbench::type_list<KeyT, OffsetT>)
{
  using key_t            = KeyT;
  using value_t          = cub::NullType;
  using key_input_it_t   = key_t*;
  using value_input_it_t = value_t*;
  using key_it_t         = key_t*;
  using value_it_t       = value_t*;
  using offset_t         = OffsetT;
  using compare_op_t     = less_t;

#if !TUNE_BASE
  using policy_t   = policy_hub_t<key_t>;
  using dispatch_t = cub::detail::merge::
    dispatch_t<key_it_t, value_it_t, key_it_t, value_it_t, key_it_t, value_it_t, offset_t, compare_op_t, policy_t>;
#else // TUNE_BASE
  using dispatch_t = cub::detail::merge::
    dispatch_t<key_it_t, value_it_t, key_it_t, value_it_t, key_it_t, value_it_t, offset_t, compare_op_t>;
#endif // TUNE_BASE

  // Retrieve axis parameters
  const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

  // We generate data distributions in the range [0, 255], which, with lower entropy, get skewed towards 0.
  // We use this to generate increasingly large *consecutive* segments of data that get selected from the lhs.
  thrust::device_vector<uint8_t> rnd_selector_val = generate(elements, entropy);
  uint8_t threshold                               = 128;
  select_if_less_than_t select_lhs_op{false, threshold};
  select_if_less_than_t select_rhs_op{true, threshold};

  // The following algorithm only works under the precondition that at least 50% of the data is in the lhs.
  // If that's not the case, we simply swap the logic for selecting into lhs and rhs.
  const auto num_items_selected_into_lhs =
    static_cast<offset_t>(thrust::count_if(rnd_selector_val.begin(), rnd_selector_val.end(), select_lhs_op));
  if (num_items_selected_into_lhs < elements / 2)
  {
    using ::cuda::std::swap;
    swap(select_lhs_op, select_rhs_op);
  }

  // We want lhs and rhs to be of equal size. We also want to have skewed distributions, such that we put different
  // workloads on the binary search part. For this reason, we identify the index from the input, referred to as the
  // pivot point, after which the lhs is "full". We compose the rhs by selecting all items up to the pivot point that
  // were not selected for lhs and *all* items after the pivot point.
  constexpr std::size_t num_pivot_points = 1;
  thrust::device_vector<offset_t> pivot_point(num_pivot_points);
  const auto num_items_lhs = elements / 2;
  const auto num_items_rhs = elements - num_items_lhs;
  auto counting_it         = thrust::make_counting_iterator(offset_t{0});
  thrust::copy_if(
    counting_it,
    counting_it + elements,
    rnd_selector_val.begin(),
    thrust::make_tabulate_output_iterator(write_pivot_point_t<offset_t>{
      static_cast<offset_t>(num_items_lhs), thrust::raw_pointer_cast(pivot_point.data())}),
    select_lhs_op);

  thrust::device_vector<key_t> keys_lhs(num_items_lhs);
  thrust::device_vector<key_t> keys_rhs(num_items_rhs);
  thrust::device_vector<key_t> keys_out(elements);

  // Generate an increasing input range to sample from
  thrust::device_vector<key_t> increasing_input = generate(elements);
  thrust::sort(increasing_input.begin(), increasing_input.end());

  // Select lhs items from the input up to the pivot point
  offset_t pivot_point_val = pivot_point[0];
  auto const end_lhs       = thrust::copy_if(
    increasing_input.cbegin(),
    increasing_input.cbegin() + pivot_point_val,
    rnd_selector_val.cbegin(),
    keys_lhs.begin(),
    select_lhs_op);

  // Select rhs items from the input up to the pivot point
  auto const end_rhs = thrust::copy_if(
    increasing_input.cbegin(),
    increasing_input.cbegin() + pivot_point_val,
    rnd_selector_val.cbegin(),
    keys_rhs.begin(),
    select_rhs_op);
  // From the pivot point on, copy all remaining items to rhs
  thrust::copy(increasing_input.cbegin() + pivot_point_val, increasing_input.cbegin() + elements, end_rhs);

  key_t* d_keys_lhs = thrust::raw_pointer_cast(keys_lhs.data());
  key_t* d_keys_rhs = thrust::raw_pointer_cast(keys_rhs.data());
  key_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data());

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<KeyT>(elements);
  state.add_global_memory_writes<KeyT>(elements);

  // Allocate temporary storage:
  std::size_t temp_size{};
  dispatch_t::dispatch(
    nullptr,
    temp_size,
    d_keys_lhs,
    nullptr,
    num_items_lhs,
    d_keys_rhs,
    nullptr,
    num_items_rhs,
    d_keys_out,
    nullptr,
    compare_op_t{},
    cudaStream_t{});

  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    dispatch_t::dispatch(
      temp_storage,
      temp_size,
      d_keys_lhs,
      nullptr,
      num_items_lhs,
      d_keys_rhs,
      nullptr,
      num_items_rhs,
      d_keys_out,
      nullptr,
      compare_op_t{},
      launch.get_stream());
  });
}

#ifdef TUNE_KeyT
using key_types = nvbench::type_list<TUNE_KeyT>;
#else // !defined(TUNE_KeyT)
using key_types = fundamental_types;
#endif // TUNE_KeyT

NVBENCH_BENCH_TYPES(keys, NVBENCH_TYPE_AXES(key_types, offset_types))
  .set_name("base")
  .set_type_axes_names({"KeyT{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_string_axis("Entropy", {"1.000", "0.201"});
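
The benchmark follows CUB's usual two-phase pattern, visible in the dispatch calls above: the first call with a null temporary-storage pointer only computes temp_size, the storage is then allocated, and the second call inside state.exec does the actual merge. A condensed sketch of the same idiom through the public cub::DeviceMerge::MergeKeys entry point (from the device_merge.cuh header the benchmark includes) rather than the internal dispatch layer; illustrative only, with error handling omitted and both inputs assumed sorted:

#include <cub/device/device_merge.cuh>

#include <thrust/device_vector.h>

#include <cstdint>

void merge_keys_example(const thrust::device_vector<int>& lhs,
                        const thrust::device_vector<int>& rhs,
                        thrust::device_vector<int>& out)
{
  std::size_t temp_bytes = 0;
  // Phase 1: a null storage pointer only queries the required temporary storage size.
  cub::DeviceMerge::MergeKeys(
    nullptr,
    temp_bytes,
    thrust::raw_pointer_cast(lhs.data()),
    static_cast<int>(lhs.size()),
    thrust::raw_pointer_cast(rhs.data()),
    static_cast<int>(rhs.size()),
    thrust::raw_pointer_cast(out.data()));

  thrust::device_vector<std::uint8_t> temp(temp_bytes);
  // Phase 2: the same call with real storage performs the merge.
  cub::DeviceMerge::MergeKeys(
    thrust::raw_pointer_cast(temp.data()),
    temp_bytes,
    thrust::raw_pointer_cast(lhs.data()),
    static_cast<int>(lhs.size()),
    thrust::raw_pointer_cast(rhs.data()),
    static_cast<int>(rhs.size()),
    thrust::raw_pointer_cast(out.data()));
}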