replace Int2Type in CUB library #3641

Merged: 16 commits, Feb 5, 2025
Changes from 3 commits
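For readers skimming the diff: cub::Int2Type<N> is CUB's legacy compile-time integer tag, and this PR replaces it with the <cuda/std/type_traits> vocabulary (::cuda::std::bool_constant, true_type/false_type, integral_constant) plus the int_constant_t / int_constant_v / bool_constant_v spellings that appear in the hunks below. A minimal sketch of how the old and new spellings line up, assuming libcu++ is on the include path; the helper definitions here are illustrative guesses, not the exact ones this PR adds:

```cpp
// Sketch only: the Int2Type -> integral_constant mapping applied throughout this PR.
// The int_constant_t / *_v helpers below are assumptions standing in for CUB's own.
#include <cuda/std/type_traits>

// Legacy CUB tag, simplified:
template <int N>
struct Int2Type
{
  enum { VALUE = N };
};

// Plausible stand-ins for the helpers used later in this diff:
template <int N>
using int_constant_t = ::cuda::std::integral_constant<int, N>;

template <int N>
inline constexpr int_constant_t<N> int_constant_v{};

template <bool B>
inline constexpr ::cuda::std::bool_constant<B> bool_constant_v{};

// Before / after, as seen in the benchmark hunks below:
//   cub::Int2Type<sizeof(SampleT) == 1>            is_byte_sample;
//   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
static_assert(::cuda::std::bool_constant<(sizeof(char) == 1)>::value,
              "the new tags expose their value as ::value, like Int2Type::VALUE");
```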
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/even.cu
@@ -87,7 +87,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/multi/even.cu
@@ -97,7 +97,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/multi/range.cu
@@ -106,7 +106,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/range.cu
@@ -95,7 +95,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
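The is_byte_sample tag in the four benchmark files above exists only to pick an overload in the histogram dispatch layer at compile time; the object itself is empty. A small, self-contained illustration of that pattern (the function names are hypothetical, not the real dispatch entry points), compilable as host code when the libcu++ headers are available:

```cpp
// Illustration of the tag-dispatch pattern the benchmarks feed into; names are made up.
#include <cuda/std/type_traits>
#include <cstdio>

void dispatch_histogram(::cuda::std::true_type /*is_byte_sample*/)
{
  std::puts("byte-sample path");
}

void dispatch_histogram(::cuda::std::false_type /*is_byte_sample*/)
{
  std::puts("general sample path");
}

template <class SampleT>
void run_bench()
{
  // Same expression as in the hunks above; the *type* of the tag selects the overload.
  ::cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
  dispatch_histogram(is_byte_sample);
}

int main()
{
  run_bench<unsigned char>(); // byte-sample path
  run_bench<float>();         // general sample path
  return 0;
}
```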
48 changes: 25 additions & 23 deletions cub/cub/agent/agent_histogram.cuh
@@ -36,6 +36,8 @@

#include <cub/config.cuh>

#include "cuda/std/__type_traits/integral_constant.h"

fbusato marked this conversation as resolved.
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -401,7 +403,7 @@ struct AgentHistogram
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
bool is_valid[PIXELS_PER_THREAD],
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
Int2Type<true> is_rle_compress)
::cuda::std::true_type is_rle_compress)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
@@ -447,7 +449,7 @@ struct AgentHistogram
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
bool is_valid[PIXELS_PER_THREAD],
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
Int2Type<false> is_rle_compress)
::cuda::std::false_type is_rle_compress)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
@@ -478,7 +480,7 @@ struct AgentHistogram
privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
}

AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
AccumulatePixels(samples, is_valid, privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
}

/**
@@ -487,7 +489,7 @@
_CCCL_DEVICE _CCCL_FORCEINLINE void
AccumulateGmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
{
AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
AccumulatePixels(samples, is_valid, d_privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
}

//---------------------------------------------------------------------
@@ -500,7 +502,7 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels)
int_constant_t<_NUM_ACTIVE_CHANNELS> num_active_channels)
{
using AliasedPixels = PixelT[PIXELS_PER_THREAD];

@@ -515,7 +517,7 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<1> num_active_channels)
int_constant_t<1> num_active_channels)
{
using AliasedVecs = VecT[VECS_PER_THREAD];

@@ -530,19 +532,19 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<true> is_full_tile,
Int2Type<true> is_aligned)
::cuda::std::true_type is_full_tile,
::cuda::std::true_type is_aligned)
{
LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
LoadFullAlignedTile(block_offset, valid_samples, samples, int_constant_v<NUM_ACTIVE_CHANNELS>);
}

// Load full, mis-aligned tile using sample iterator
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<true> is_full_tile,
Int2Type<false> is_aligned)
::cuda::std::true_type is_full_tile,
::cuda::std::false_type is_aligned)
{
using AliasedSamples = SampleT[SAMPLES_PER_THREAD];

@@ -556,8 +558,8 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<false> is_full_tile,
Int2Type<true> is_aligned)
::cuda::std::false_type is_full_tile,
::cuda::std::true_type is_aligned)
{
using AliasedPixels = PixelT[PIXELS_PER_THREAD];

@@ -575,8 +577,8 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<false> is_full_tile,
Int2Type<false> is_aligned)
::cuda::std::false_type is_full_tile,
::cuda::std::false_type is_aligned)
{
using AliasedSamples = SampleT[SAMPLES_PER_THREAD];

@@ -586,7 +588,7 @@

template <bool IS_FULL_TILE>
_CCCL_DEVICE _CCCL_FORCEINLINE void
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, Int2Type<false> /* is_striped = false */)
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::false_type /* is_striped = false */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
@@ -597,7 +599,7 @@

template <bool IS_FULL_TILE>
_CCCL_DEVICE _CCCL_FORCEINLINE void
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, Int2Type<true> /* is_striped = true */)
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::true_type /* is_striped = true */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
@@ -626,11 +628,11 @@ struct AgentHistogram
bool is_valid[PIXELS_PER_THREAD];

// Load tile
LoadTile(block_offset, valid_samples, samples, Int2Type<IS_FULL_TILE>(), Int2Type<IS_ALIGNED>());
LoadTile(block_offset, valid_samples, samples, bool_constant_v<IS_FULL_TILE>, bool_constant_v<IS_ALIGNED>);

// Set valid flags
MarkValid<IS_FULL_TILE>(
is_valid, valid_samples, Int2Type < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED > {});
is_valid, valid_samples, bool_constant_v < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED >);

// Accumulate samples
if (prefer_smem)
@@ -665,7 +667,7 @@ struct AgentHistogram
OffsetT row_stride_samples,
int tiles_per_row,
GridQueue<int> tile_queue,
Int2Type<true> is_work_stealing)
::cuda::std::true_type is_work_stealing)
{
int num_tiles = num_rows * tiles_per_row;
int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
@@ -727,7 +729,7 @@ struct AgentHistogram
OffsetT row_stride_samples,
int tiles_per_row,
GridQueue<int> tile_queue,
Int2Type<false> is_work_stealing)
::cuda::std::false_type is_work_stealing)
{
for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
{
@@ -875,12 +877,12 @@ struct AgentHistogram
if ((d_native_samples != nullptr) && (vec_aligned_rows || pixel_aligned_rows))
{
ConsumeTiles<true>(
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<IS_WORK_STEALING>);
}
else
{
ConsumeTiles<false>(
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<IS_WORK_STEALING>);
}
}

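AgentHistogram keeps its structure after the change: every true_type/false_type parameter is a compile-time switch, and LoadTile dispatches on two of them at once (full vs. partial tile, aligned vs. mis-aligned). A reduced model of that two-tag dispatch, with hypothetical names and empty bodies:

```cpp
// Reduced model of the LoadTile dispatch above; names and bodies are placeholders.
#include <cuda/std/type_traits>

struct TileLoader
{
  // Mirrors the ConsumeTile call site: both flags are template parameters, so the
  // choice is resolved at compile time and unused paths are never instantiated.
  template <bool IsFullTile, bool IsAligned>
  void load()
  {
    load_impl(::cuda::std::bool_constant<IsFullTile>{}, ::cuda::std::bool_constant<IsAligned>{});
  }

private:
  void load_impl(::cuda::std::true_type, ::cuda::std::true_type) {}   // full, aligned tile
  void load_impl(::cuda::std::true_type, ::cuda::std::false_type) {}  // full, mis-aligned tile
  void load_impl(::cuda::std::false_type, ::cuda::std::true_type) {}  // partial, aligned tile
  void load_impl(::cuda::std::false_type, ::cuda::std::false_type) {} // partial, mis-aligned tile
};

int main()
{
  TileLoader loader;
  loader.load<true, false>(); // selects the full, mis-aligned overload
  return 0;
}
```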
43 changes: 22 additions & 21 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -341,8 +341,8 @@ struct AgentRadixSortDownsweep
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<true> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::false_type warp_striped)
{
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);

@@ -357,8 +357,8 @@
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<false> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::false_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -377,8 +377,8 @@
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<true> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::true_type warp_striped)
{
LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
}
@@ -391,8 +391,8 @@
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<false> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::true_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -408,8 +408,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<true> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::false_type warp_striped)
{
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values);

@@ -423,8 +423,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<false> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::false_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -442,8 +442,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<true> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::true_type warp_striped)
{
LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
}
@@ -455,8 +455,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<false> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::true_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -474,13 +474,13 @@
int (&ranks)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<false> /*is_keys_only*/)
::cuda::std::false_type /*is_keys_only*/)
{
ValueT values[ITEMS_PER_THREAD];

__syncthreads();

LoadValues(values, block_offset, valid_items, Int2Type<FULL_TILE>(), Int2Type<LOAD_WARP_STRIPED>());
LoadValues(values, block_offset, valid_items, bool_constant_v<FULL_TILE>, bool_constant_v<LOAD_WARP_STRIPED>);

ScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, valid_items);
}
@@ -494,7 +494,7 @@
int (& /*ranks*/)[ITEMS_PER_THREAD],
OffsetT /*block_offset*/,
OffsetT /*valid_items*/,
Int2Type<true> /*is_keys_only*/)
::cuda::std::true_type /*is_keys_only*/)
{}

/**
@@ -512,7 +512,8 @@
IS_DESCENDING ? traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer);

// Load tile of keys
LoadKeys(keys, block_offset, valid_items, default_key, Int2Type<FULL_TILE>(), Int2Type<LOAD_WARP_STRIPED>());
LoadKeys(
keys, block_offset, valid_items, default_key, bool_constant_v<FULL_TILE>, bool_constant_v<LOAD_WARP_STRIPED>);

#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
@@ -587,7 +588,7 @@ struct AgentRadixSortDownsweep
ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);

// Gather/scatter values
GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, bool_constant_v<KEYS_ONLY>);
}

//---------------------------------------------------------------------
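Several of the guarded loaders above repeat the comment about moving valid_items through a shuffle to relieve register pressure. That trick is orthogonal to the Int2Type change, but for context, here is a minimal device-side sketch of the idea (an illustration, not the exact CUB helper):

```cpp
// Broadcasting valid_items from lane 0 yields the same value, but the compiler can no
// longer assume it matches the earlier copy, so guards and addressing from a previous
// guarded load are not reused. Illustrative only; requires compilation as CUDA code.
__device__ int broadcast_valid_items(int valid_items)
{
  constexpr unsigned full_warp_mask = 0xffffffffu;
  return __shfl_sync(full_warp_mask, valid_items, /*srcLane=*/0);
}
```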
11 changes: 7 additions & 4 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
@@ -115,7 +115,7 @@ struct AgentRadixSortOnesweep
enum
{
ITEMS_PER_THREAD = AgentRadixSortOnesweepPolicy::ITEMS_PER_THREAD,
KEYS_ONLY = std::is_same<ValueT, NullType>::value,
KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value,
BLOCK_THREADS = AgentRadixSortOnesweepPolicy::BLOCK_THREADS,
RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
@@ -595,7 +595,8 @@
}
}

_CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], Int2Type<false> keys_only)
_CCCL_DEVICE _CCCL_FORCEINLINE void
GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], ::cuda::std::false_type keys_only)
{
// compute digits corresponding to the keys
int digits[ITEMS_PER_THREAD];
@@ -613,7 +614,9 @@
ScatterValuesGlobal(digits);
}

_CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
_CCCL_DEVICE _CCCL_FORCEINLINE void
GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], ::cuda::std::true_type keys_only)
{}

_CCCL_DEVICE _CCCL_FORCEINLINE void Process()
{
@@ -644,7 +647,7 @@ struct AgentRadixSortOnesweep
ScatterKeysGlobal();

// scatter values if necessary
GatherScatterValues(ranks, Int2Type<KEYS_ONLY>());
GatherScatterValues(ranks, bool_constant_v<KEYS_ONLY>);
}

_CCCL_DEVICE _CCCL_FORCEINLINE //
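The onesweep agent follows the same pattern once more: KEYS_ONLY is now computed with ::cuda::std::is_same, and the true_type overload turns the whole value-scatter path into a no-op for keys-only sorts. A compact model of that dispatch (hypothetical names, empty bodies), assuming a bool_constant_v-style variable template like the one sketched near the top of this diff:

```cpp
// Compact model of the keys-only dispatch in AgentRadixSortOnesweep; illustrative only.
#include <cuda/std/type_traits>

struct NullType {}; // stand-in for cub::NullType

template <bool B>
inline constexpr ::cuda::std::bool_constant<B> bool_constant_v{}; // assumed helper

template <class KeyT, class ValueT>
struct OnesweepModel
{
  static constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

  void gather_scatter_values(::cuda::std::false_type /*keys_only*/) { /* rank and scatter values */ }
  void gather_scatter_values(::cuda::std::true_type /*keys_only*/) {} // keys-only: nothing to move

  void process()
  {
    // Mirrors the call site above: the overload is chosen at compile time.
    gather_scatter_values(bool_constant_v<KEYS_ONLY>);
  }
};

int main()
{
  OnesweepModel<unsigned int, NullType>{}.process(); // keys-only: empty overload
  OnesweepModel<unsigned int, float>{}.process();    // key-value: value path
  return 0;
}
```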