Commit f845766

Merge branch 'main' of github.com:NVIDIA/cccl into fea/use-sccache-build-cluster

trxcllnt committed Feb 3, 2025
2 parents 5ddbbc7 + 9b9e3ed

Showing 61 changed files with 807 additions and 888 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/backport-prs.yml

@@ -28,9 +28,7 @@ jobs:
       )
     steps:
       - uses: actions/checkout@v4
-        with:
-          persist-credentials: false
       - name: Create backport pull requests
-        uses: korthout/backport-action@v1
+        uses: korthout/backport-action@v3
         with:
           merge_commits: 'skip'
16 changes: 8 additions & 8 deletions c2h/generators.cu

@@ -478,15 +478,15 @@
 template void
 init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, float* out, std::size_t element_size);
 template void init_key_segments(
   const c2h::device_vector<std::uint32_t>& segment_offsets, custom_type_state_t* out, std::size_t element_size);
-#ifdef _CCCL_HAS_NVFP16
+#if TEST_HALF_T()
 template void
 init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, half_t* out, std::size_t element_size);
-#endif // _CCCL_HAS_NVFP16
+#endif // TEST_HALF_T()

-#ifdef _CCCL_HAS_NVBF16
+#if TEST_BF_T()
 template void
 init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, bfloat16_t* out, std::size_t element_size);
-#endif // _CCCL_HAS_NVBF16
+#endif // TEST_BF_T()
 } // namespace detail

 template <typename T>

@@ -552,15 +552,15 @@
 INSTANTIATE(double);
 INSTANTIATE(bool);
 INSTANTIATE(char);

-#ifdef _CCCL_HAS_NVFP16
+#if TEST_HALF_T()
 INSTANTIATE(half_t);
 INSTANTIATE(__half);
-#endif // _CCCL_HAS_NVFP16
+#endif // TEST_HALF_T()

-#ifdef _CCCL_HAS_NVBF16
+#if TEST_BF_T()
 INSTANTIATE(bfloat16_t);
 INSTANTIATE(__nv_bfloat16);
-#endif // _CCCL_HAS_NVBF16
+#endif // TEST_BF_T()

 #undef INSTANTIATE_RND
 #undef INSTANTIATE_MOD
28 changes: 23 additions & 5 deletions c2h/include/c2h/bfloat16.cuh

@@ -36,6 +36,7 @@

 #include <cub/util_type.cuh>

+#include <cuda/std/limits>
 #include <cuda/std/type_traits>

 #include <cstdint>

@@ -232,19 +233,36 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
 /******************************************************************************
  * Traits overloads
  ******************************************************************************/

+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct CUB_NS_QUALIFIER::FpLimits<bfloat16_t>
+struct __is_extended_floating_point<bfloat16_t> : true_type
+{};
+
+#ifndef _CCCL_NO_VARIABLE_TEMPLATES
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<bfloat16_t> = true;
+#endif // _CCCL_NO_VARIABLE_TEMPLATES
+
+template <>
+class __numeric_limits_impl<bfloat16_t, __numeric_limits_type::__floating_point>
 {
-  static __host__ __device__ __forceinline__ bfloat16_t Max()
+public:
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bfloat16_t max()
   {
-    return bfloat16_t::max();
+    return bfloat16_t(numeric_limits<__nv_bfloat16>::max());
   }

-  static __host__ __device__ __forceinline__ bfloat16_t Lowest()
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bfloat16_t min()
+  {
+    return bfloat16_t(numeric_limits<__nv_bfloat16>::min());
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bfloat16_t lowest()
   {
-    return bfloat16_t::lowest();
+    return bfloat16_t(numeric_limits<__nv_bfloat16>::lowest());
   }
 };
+_LIBCUDACXX_END_NAMESPACE_STD

 template <>
 struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
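
With the __is_extended_floating_point and __numeric_limits_impl specializations above, bfloat16_t can be queried through cuda::std::numeric_limits like a built-in floating-point type. A minimal device-side sketch of that usage, mirroring the include order this commit sets up in extended_types.h (the kernel is illustrative, not part of the commit):

#include <cuda_bf16.h>

#include <c2h/bfloat16.cuh>

#include <cuda/std/limits>

// Fills out[0] and out[1] with the extremes of bfloat16_t, resolved through
// the numeric_limits specialization added in the diff above.
__global__ void fill_bf16_extremes(bfloat16_t* out)
{
  out[0] = cuda::std::numeric_limits<bfloat16_t>::lowest();
  out[1] = cuda::std::numeric_limits<bfloat16_t>::max();
}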
26 changes: 17 additions & 9 deletions c2h/include/c2h/extended_types.h

@@ -30,21 +30,29 @@
 #include <cuda/__cccl_config>

 #ifndef TEST_HALF_T
-#  define TEST_HALF_T _CCCL_HAS_NVFP16
-#endif
+#  if defined(_CCCL_HAS_NVFP16)
+#    define TEST_HALF_T() 1
+#  else // defined(_CCCL_HAS_NVFP16)
+#    define TEST_HALF_T() 0
+#  endif // defined(_CCCL_HAS_NVFP16)
+#endif // TEST_HALF_T

 #ifndef TEST_BF_T
-#  define TEST_BF_T _CCCL_HAS_NVBF16
-#endif
-
-#ifdef TEST_HALF_T
+#  if defined(_CCCL_HAS_NVBF16)
+#    define TEST_BF_T() 1
+#  else // defined(_CCCL_HAS_NVBF16)
+#    define TEST_BF_T() 0
+#  endif // defined(_CCCL_HAS_NVBF16)
+#endif // TEST_BF_T
+
+#if TEST_HALF_T()
 #  include <cuda_fp16.h>

 #  include <c2h/half.cuh>
-#endif
+#endif // TEST_HALF_T()

-#ifdef TEST_BF_T
+#if TEST_BF_T()
 #  include <cuda_bf16.h>

 #  include <c2h/bfloat16.cuh>
-#endif
+#endif // TEST_BF_T()
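
These flags were previously object-like macros tested with #ifdef; they are now function-like macros that always expand to 0 or 1 and are tested with #if. The gain is robustness: an #if on a misspelled or never-defined function-like macro fails to preprocess (the bare identifier expands to 0, and 0() is a syntax error), instead of silently evaluating to false and compiling a test path out. A minimal usage sketch (the test functions are hypothetical):

#include <c2h/extended_types.h>

#if TEST_HALF_T() // always defined after the include; a typo here is a hard preprocessing error
void test_half_path(half_t) {}
#endif // TEST_HALF_T()

#if TEST_BF_T()
void test_bf16_path(bfloat16_t) {}
#endif // TEST_BF_T()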
28 changes: 23 additions & 5 deletions c2h/include/c2h/half.cuh

@@ -37,6 +37,7 @@

 #include <cub/util_type.cuh>

+#include <cuda/std/limits>
 #include <cuda/std/type_traits>

 #include <cstdint>

@@ -327,19 +328,36 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
 /******************************************************************************
  * Traits overloads
  ******************************************************************************/

+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct CUB_NS_QUALIFIER::FpLimits<half_t>
+struct __is_extended_floating_point<half_t> : true_type
+{};
+
+#ifndef _CCCL_NO_VARIABLE_TEMPLATES
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<half_t> = true;
+#endif // _CCCL_NO_VARIABLE_TEMPLATES
+
+template <>
+class __numeric_limits_impl<half_t, __numeric_limits_type::__floating_point>
 {
-  static __host__ __device__ __forceinline__ half_t Max()
+public:
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE half_t max()
   {
-    return (half_t::max)();
+    return half_t(numeric_limits<__half>::max());
   }

-  static __host__ __device__ __forceinline__ half_t Lowest()
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE half_t min()
+  {
+    return half_t(numeric_limits<__half>::min());
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE half_t lowest()
   {
-    return half_t::lowest();
+    return half_t(numeric_limits<__half>::lowest());
   }
 };
+_LIBCUDACXX_END_NAMESPACE_STD

 template <>
 struct CUB_NS_QUALIFIER::NumericTraits<half_t>
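
For call sites, the migration reads as a swap of the removed cub::FpLimits interface for the standard-style trait. A host-side sketch under that assumption (the function is illustrative, not part of the commit):

#include <cuda_fp16.h>

#include <c2h/half.cuh>

#include <cuda/std/limits>

void half_limits_example()
{
  // Before this change (removed lines above): half_t hi = cub::FpLimits<half_t>::Max();
  // After: the cuda::std::numeric_limits specialization this header now provides.
  half_t hi = cuda::std::numeric_limits<half_t>::max();
  half_t lo = cuda::std::numeric_limits<half_t>::lowest();
  (void) hi;
  (void) lo;
}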
165 changes: 165 additions & 0 deletions cub/benchmarks/bench/merge/keys.cu

@@ -0,0 +1,165 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/device_merge.cuh>

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/iterator/tabulate_output_iterator.h>
#include <thrust/sort.h>

#include <cuda/std/utility>

#include <cstdint>

#include "merge_common.cuh"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1

template <typename KeyT, typename OffsetT>
void keys(nvbench::state& state, nvbench::type_list<KeyT, OffsetT>)
{
  using key_t            = KeyT;
  using value_t          = cub::NullType;
  using key_input_it_t   = key_t*;
  using value_input_it_t = value_t*;
  using key_it_t         = key_t*;
  using value_it_t       = value_t*;
  using offset_t         = OffsetT;
  using compare_op_t     = less_t;

#if !TUNE_BASE
  using policy_t   = policy_hub_t<key_t>;
  using dispatch_t = cub::detail::merge::
    dispatch_t<key_it_t, value_it_t, key_it_t, value_it_t, key_it_t, value_it_t, offset_t, compare_op_t, policy_t>;
#else // TUNE_BASE
  using dispatch_t = cub::detail::merge::
    dispatch_t<key_it_t, value_it_t, key_it_t, value_it_t, key_it_t, value_it_t, offset_t, compare_op_t>;
#endif // TUNE_BASE

  // Retrieve axis parameters
  const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

  // We generate data distributions in the range [0, 255], which, with lower entropy, get skewed towards 0.
  // We use this to generate increasingly large *consecutive* segments of data that get selected from the lhs.
  thrust::device_vector<uint8_t> rnd_selector_val = generate(elements, entropy);
  uint8_t threshold                               = 128;
  select_if_less_than_t select_lhs_op{false, threshold};
  select_if_less_than_t select_rhs_op{true, threshold};

  // The following algorithm only works under the precondition that at least 50% of the data is in the lhs.
  // If that's not the case, we simply swap the logic for selecting into lhs and rhs.
  const auto num_items_selected_into_lhs =
    static_cast<offset_t>(thrust::count_if(rnd_selector_val.begin(), rnd_selector_val.end(), select_lhs_op));
  if (num_items_selected_into_lhs < elements / 2)
  {
    using ::cuda::std::swap;
    swap(select_lhs_op, select_rhs_op);
  }

  // We want lhs and rhs to be of equal size. We also want to have skewed distributions, such that we put different
  // workloads on the binary search part. For this reason, we identify the index from the input, referred to as the
  // pivot point, after which the lhs is "full". We compose the rhs by selecting all items up to the pivot point that
  // were not selected for lhs and *all* items after the pivot point.
  constexpr std::size_t num_pivot_points = 1;
  thrust::device_vector<offset_t> pivot_point(num_pivot_points);
  const auto num_items_lhs = elements / 2;
  const auto num_items_rhs = elements - num_items_lhs;
  auto counting_it         = thrust::make_counting_iterator(offset_t{0});
  thrust::copy_if(
    counting_it,
    counting_it + elements,
    rnd_selector_val.begin(),
    thrust::make_tabulate_output_iterator(write_pivot_point_t<offset_t>{
      static_cast<offset_t>(num_items_lhs), thrust::raw_pointer_cast(pivot_point.data())}),
    select_lhs_op);

  thrust::device_vector<key_t> keys_lhs(num_items_lhs);
  thrust::device_vector<key_t> keys_rhs(num_items_rhs);
  thrust::device_vector<key_t> keys_out(elements);

  // Generate an increasing input range to sample from
  thrust::device_vector<key_t> increasing_input = generate(elements);
  thrust::sort(increasing_input.begin(), increasing_input.end());

  // Select lhs items from the input up to the pivot point
  offset_t pivot_point_val = pivot_point[0];
  auto const end_lhs       = thrust::copy_if(
    increasing_input.cbegin(),
    increasing_input.cbegin() + pivot_point_val,
    rnd_selector_val.cbegin(),
    keys_lhs.begin(),
    select_lhs_op);

  // Select rhs items from the input up to the pivot point
  auto const end_rhs = thrust::copy_if(
    increasing_input.cbegin(),
    increasing_input.cbegin() + pivot_point_val,
    rnd_selector_val.cbegin(),
    keys_rhs.begin(),
    select_rhs_op);
  // From the pivot point on, copy all remaining items to rhs
  thrust::copy(increasing_input.cbegin() + pivot_point_val, increasing_input.cbegin() + elements, end_rhs);

  key_t* d_keys_lhs = thrust::raw_pointer_cast(keys_lhs.data());
  key_t* d_keys_rhs = thrust::raw_pointer_cast(keys_rhs.data());
  key_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data());

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<KeyT>(elements);
  state.add_global_memory_writes<KeyT>(elements);

  // Allocate temporary storage:
  std::size_t temp_size{};
  dispatch_t::dispatch(
    nullptr,
    temp_size,
    d_keys_lhs,
    nullptr,
    num_items_lhs,
    d_keys_rhs,
    nullptr,
    num_items_rhs,
    d_keys_out,
    nullptr,
    compare_op_t{},
    cudaStream_t{});

  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    dispatch_t::dispatch(
      temp_storage,
      temp_size,
      d_keys_lhs,
      nullptr,
      num_items_lhs,
      d_keys_rhs,
      nullptr,
      num_items_rhs,
      d_keys_out,
      nullptr,
      compare_op_t{},
      launch.get_stream());
  });
}

#ifdef TUNE_KeyT
using key_types = nvbench::type_list<TUNE_KeyT>;
#else // !defined(TUNE_KeyT)
using key_types = fundamental_types;
#endif // TUNE_KeyT

NVBENCH_BENCH_TYPES(keys, NVBENCH_TYPE_AXES(key_types, offset_types))
  .set_name("base")
  .set_type_axes_names({"KeyT{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_string_axis("Entropy", {"1.000", "0.201"});
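
The benchmark follows CUB's usual two-phase pattern, visible in the dispatch calls above: the first call with a null temporary-storage pointer only computes temp_size, the storage is then allocated, and the second call inside state.exec does the actual merge. A condensed sketch of the same idiom through the public cub::DeviceMerge::MergeKeys entry point (from the device_merge.cuh header the benchmark includes) rather than the internal dispatch layer; illustrative only, with error handling omitted and both inputs assumed sorted:

#include <cub/device/device_merge.cuh>

#include <thrust/device_vector.h>

#include <cstdint>

void merge_keys_example(const thrust::device_vector<int>& lhs,
                        const thrust::device_vector<int>& rhs,
                        thrust::device_vector<int>& out)
{
  std::size_t temp_bytes = 0;
  // Phase 1: a null storage pointer only queries the required temporary storage size.
  cub::DeviceMerge::MergeKeys(
    nullptr,
    temp_bytes,
    thrust::raw_pointer_cast(lhs.data()),
    static_cast<int>(lhs.size()),
    thrust::raw_pointer_cast(rhs.data()),
    static_cast<int>(rhs.size()),
    thrust::raw_pointer_cast(out.data()));

  thrust::device_vector<std::uint8_t> temp(temp_bytes);
  // Phase 2: the same call with real storage performs the merge.
  cub::DeviceMerge::MergeKeys(
    thrust::raw_pointer_cast(temp.data()),
    temp_bytes,
    thrust::raw_pointer_cast(lhs.data()),
    static_cast<int>(lhs.size()),
    thrust::raw_pointer_cast(rhs.data()),
    static_cast<int>(rhs.size()),
    thrust::raw_pointer_cast(out.data()));
}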