replace Int2Type in CUB library #3641

Merged: 16 commits, Feb 5, 2025
Changes from 3 commits
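For readers skimming the diff: cub::Int2Type<N> is CUB's legacy compile-time integer tag, and this PR replaces it with the <cuda/std/type_traits> vocabulary (::cuda::std::bool_constant, true_type/false_type, integral_constant) plus the int_constant_t / int_constant_v / bool_constant_v spellings that appear in the hunks below. A minimal sketch of how the old and new spellings line up, assuming libcu++ is on the include path; the helper definitions here are illustrative guesses, not the exact ones this PR adds:

```cpp
// Sketch only: the Int2Type -> integral_constant mapping applied throughout this PR.
// The int_constant_t / *_v helpers below are assumptions standing in for CUB's own.
#include <cuda/std/type_traits>

// Legacy CUB tag, simplified:
template <int N>
struct Int2Type
{
  enum { VALUE = N };
};

// Plausible stand-ins for the helpers used later in this diff:
template <int N>
using int_constant_t = ::cuda::std::integral_constant<int, N>;

template <int N>
inline constexpr int_constant_t<N> int_constant_v{};

template <bool B>
inline constexpr ::cuda::std::bool_constant<B> bool_constant_v{};

// Before / after, as seen in the benchmark hunks below:
//   cub::Int2Type<sizeof(SampleT) == 1>            is_byte_sample;
//   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
static_assert(::cuda::std::bool_constant<(sizeof(char) == 1)>::value,
              "the new tags expose their value as ::value, like Int2Type::VALUE");
```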
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/even.cu
@@ -87,7 +87,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/multi/even.cu
@@ -97,7 +97,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/multi/range.cu
@@ -106,7 +106,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/range.cu
@@ -95,7 +95,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};

cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
OffsetT num_row_pixels = static_cast<OffsetT>(elements);
OffsetT num_rows = 1;
OffsetT row_stride_samples = num_row_pixels;
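The is_byte_sample tag in the four benchmark files above exists only to pick an overload in the histogram dispatch layer at compile time; the object itself is empty. A small, self-contained illustration of that pattern (the function names are hypothetical, not the real dispatch entry points), compilable as host code when the libcu++ headers are available:

```cpp
// Illustration of the tag-dispatch pattern the benchmarks feed into; names are made up.
#include <cuda/std/type_traits>
#include <cstdio>

void dispatch_histogram(::cuda::std::true_type /*is_byte_sample*/)
{
  std::puts("byte-sample path");
}

void dispatch_histogram(::cuda::std::false_type /*is_byte_sample*/)
{
  std::puts("general sample path");
}

template <class SampleT>
void run_bench()
{
  // Same expression as in the hunks above; the *type* of the tag selects the overload.
  ::cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
  dispatch_histogram(is_byte_sample);
}

int main()
{
  run_bench<unsigned char>(); // byte-sample path
  run_bench<float>();         // general sample path
  return 0;
}
```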
48 changes: 25 additions & 23 deletions cub/cub/agent/agent_histogram.cuh
@@ -36,6 +36,8 @@

#include <cub/config.cuh>

#include "cuda/std/__type_traits/integral_constant.h"

fbusato marked this conversation as resolved.
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -401,7 +403,7 @@ struct AgentHistogram
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
bool is_valid[PIXELS_PER_THREAD],
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
Int2Type<true> is_rle_compress)
::cuda::std::true_type is_rle_compress)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
@@ -447,7 +449,7 @@ struct AgentHistogram
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
bool is_valid[PIXELS_PER_THREAD],
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
Int2Type<false> is_rle_compress)
::cuda::std::false_type is_rle_compress)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
@@ -478,7 +480,7 @@ struct AgentHistogram
privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
}

AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
AccumulatePixels(samples, is_valid, privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
}

/**
@@ -487,7 +489,7 @@
_CCCL_DEVICE _CCCL_FORCEINLINE void
AccumulateGmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
{
AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
AccumulatePixels(samples, is_valid, d_privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
}

//---------------------------------------------------------------------
@@ -500,7 +502,7 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels)
int_constant_t<_NUM_ACTIVE_CHANNELS> num_active_channels)
{
using AliasedPixels = PixelT[PIXELS_PER_THREAD];

@@ -515,7 +517,7 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<1> num_active_channels)
int_constant_t<1> num_active_channels)
{
using AliasedVecs = VecT[VECS_PER_THREAD];

@@ -530,19 +532,19 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<true> is_full_tile,
Int2Type<true> is_aligned)
::cuda::std::true_type is_full_tile,
::cuda::std::true_type is_aligned)
{
LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
LoadFullAlignedTile(block_offset, valid_samples, samples, int_constant_v<NUM_ACTIVE_CHANNELS>);
}

// Load full, mis-aligned tile using sample iterator
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<true> is_full_tile,
Int2Type<false> is_aligned)
::cuda::std::true_type is_full_tile,
::cuda::std::false_type is_aligned)
{
using AliasedSamples = SampleT[SAMPLES_PER_THREAD];

@@ -556,8 +558,8 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<false> is_full_tile,
Int2Type<true> is_aligned)
::cuda::std::false_type is_full_tile,
::cuda::std::true_type is_aligned)
{
using AliasedPixels = PixelT[PIXELS_PER_THREAD];

@@ -575,8 +577,8 @@
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<false> is_full_tile,
Int2Type<false> is_aligned)
::cuda::std::false_type is_full_tile,
::cuda::std::false_type is_aligned)
{
using AliasedSamples = SampleT[SAMPLES_PER_THREAD];

@@ -586,7 +588,7 @@

template <bool IS_FULL_TILE>
_CCCL_DEVICE _CCCL_FORCEINLINE void
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, Int2Type<false> /* is_striped = false */)
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::false_type /* is_striped = false */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
@@ -597,7 +599,7 @@

template <bool IS_FULL_TILE>
_CCCL_DEVICE _CCCL_FORCEINLINE void
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, Int2Type<true> /* is_striped = true */)
MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::true_type /* is_striped = true */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
@@ -626,11 +628,11 @@ struct AgentHistogram
bool is_valid[PIXELS_PER_THREAD];

// Load tile
LoadTile(block_offset, valid_samples, samples, Int2Type<IS_FULL_TILE>(), Int2Type<IS_ALIGNED>());
LoadTile(block_offset, valid_samples, samples, bool_constant_v<IS_FULL_TILE>, bool_constant_v<IS_ALIGNED>);

// Set valid flags
MarkValid<IS_FULL_TILE>(
is_valid, valid_samples, Int2Type < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED > {});
is_valid, valid_samples, bool_constant_v < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED >);

// Accumulate samples
if (prefer_smem)
@@ -665,7 +667,7 @@ struct AgentHistogram
OffsetT row_stride_samples,
int tiles_per_row,
GridQueue<int> tile_queue,
Int2Type<true> is_work_stealing)
::cuda::std::true_type is_work_stealing)
{
int num_tiles = num_rows * tiles_per_row;
int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
@@ -727,7 +729,7 @@ struct AgentHistogram
OffsetT row_stride_samples,
int tiles_per_row,
GridQueue<int> tile_queue,
Int2Type<false> is_work_stealing)
::cuda::std::false_type is_work_stealing)
{
for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
{
@@ -875,12 +877,12 @@ struct AgentHistogram
if ((d_native_samples != nullptr) && (vec_aligned_rows || pixel_aligned_rows))
{
ConsumeTiles<true>(
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<IS_WORK_STEALING>);
}
else
{
ConsumeTiles<false>(
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<IS_WORK_STEALING>);
}
}

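AgentHistogram keeps its structure after the change: every true_type/false_type parameter is a compile-time switch, and LoadTile dispatches on two of them at once (full vs. partial tile, aligned vs. mis-aligned). A reduced model of that two-tag dispatch, with hypothetical names and empty bodies:

```cpp
// Reduced model of the LoadTile dispatch above; names and bodies are placeholders.
#include <cuda/std/type_traits>

struct TileLoader
{
  // Mirrors the ConsumeTile call site: both flags are template parameters, so the
  // choice is resolved at compile time and unused paths are never instantiated.
  template <bool IsFullTile, bool IsAligned>
  void load()
  {
    load_impl(::cuda::std::bool_constant<IsFullTile>{}, ::cuda::std::bool_constant<IsAligned>{});
  }

private:
  void load_impl(::cuda::std::true_type, ::cuda::std::true_type) {}   // full, aligned tile
  void load_impl(::cuda::std::true_type, ::cuda::std::false_type) {}  // full, mis-aligned tile
  void load_impl(::cuda::std::false_type, ::cuda::std::true_type) {}  // partial, aligned tile
  void load_impl(::cuda::std::false_type, ::cuda::std::false_type) {} // partial, mis-aligned tile
};

int main()
{
  TileLoader loader;
  loader.load<true, false>(); // selects the full, mis-aligned overload
  return 0;
}
```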
43 changes: 22 additions & 21 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -341,8 +341,8 @@ struct AgentRadixSortDownsweep
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<true> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::false_type warp_striped)
{
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);

@@ -357,8 +357,8 @@
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<false> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::false_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -377,8 +377,8 @@
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<true> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::true_type warp_striped)
{
LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
}
@@ -391,8 +391,8 @@
OffsetT block_offset,
OffsetT valid_items,
bit_ordered_type oob_item,
Int2Type<false> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::true_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -408,8 +408,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<true> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::false_type warp_striped)
{
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values);

@@ -423,8 +423,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<false> is_full_tile,
Int2Type<false> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::false_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -442,8 +442,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<true> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::true_type is_full_tile,
::cuda::std::true_type warp_striped)
{
LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
}
@@ -455,8 +455,8 @@
ValueT (&values)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<false> is_full_tile,
Int2Type<true> warp_striped)
::cuda::std::false_type is_full_tile,
::cuda::std::true_type warp_striped)
{
// Register pressure work-around: moving valid_items through shfl prevents compiler
// from reusing guards/addressing from prior guarded loads
@@ -474,13 +474,13 @@
int (&ranks)[ITEMS_PER_THREAD],
OffsetT block_offset,
OffsetT valid_items,
Int2Type<false> /*is_keys_only*/)
::cuda::std::false_type /*is_keys_only*/)
{
ValueT values[ITEMS_PER_THREAD];

__syncthreads();

LoadValues(values, block_offset, valid_items, Int2Type<FULL_TILE>(), Int2Type<LOAD_WARP_STRIPED>());
LoadValues(values, block_offset, valid_items, bool_constant_v<FULL_TILE>, bool_constant_v<LOAD_WARP_STRIPED>);

ScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, valid_items);
}
@@ -494,7 +494,7 @@
int (& /*ranks*/)[ITEMS_PER_THREAD],
OffsetT /*block_offset*/,
OffsetT /*valid_items*/,
Int2Type<true> /*is_keys_only*/)
::cuda::std::true_type /*is_keys_only*/)
{}

/**
@@ -512,7 +512,8 @@
IS_DESCENDING ? traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer);

// Load tile of keys
LoadKeys(keys, block_offset, valid_items, default_key, Int2Type<FULL_TILE>(), Int2Type<LOAD_WARP_STRIPED>());
LoadKeys(
keys, block_offset, valid_items, default_key, bool_constant_v<FULL_TILE>, bool_constant_v<LOAD_WARP_STRIPED>);

#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
@@ -587,7 +588,7 @@ struct AgentRadixSortDownsweep
ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);

// Gather/scatter values
GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, bool_constant_v<KEYS_ONLY>);
}

//---------------------------------------------------------------------
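Several of the guarded loaders above repeat the comment about moving valid_items through a shuffle to relieve register pressure. That trick is orthogonal to the Int2Type change, but for context, here is a minimal device-side sketch of the idea (an illustration, not the exact CUB helper):

```cpp
// Broadcasting valid_items from lane 0 yields the same value, but the compiler can no
// longer assume it matches the earlier copy, so guards and addressing from a previous
// guarded load are not reused. Illustrative only; requires compilation as CUDA code.
__device__ int broadcast_valid_items(int valid_items)
{
  constexpr unsigned full_warp_mask = 0xffffffffu;
  return __shfl_sync(full_warp_mask, valid_items, /*srcLane=*/0);
}
```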
11 changes: 7 additions & 4 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
@@ -115,7 +115,7 @@ struct AgentRadixSortOnesweep
enum
{
ITEMS_PER_THREAD = AgentRadixSortOnesweepPolicy::ITEMS_PER_THREAD,
KEYS_ONLY = std::is_same<ValueT, NullType>::value,
KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value,
BLOCK_THREADS = AgentRadixSortOnesweepPolicy::BLOCK_THREADS,
RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
@@ -595,7 +595,8 @@
}
}

_CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], Int2Type<false> keys_only)
_CCCL_DEVICE _CCCL_FORCEINLINE void
GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], ::cuda::std::false_type keys_only)
{
// compute digits corresponding to the keys
int digits[ITEMS_PER_THREAD];
@@ -613,7 +614,9 @@
ScatterValuesGlobal(digits);
}

_CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
_CCCL_DEVICE _CCCL_FORCEINLINE void
GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], ::cuda::std::true_type keys_only)
{}

_CCCL_DEVICE _CCCL_FORCEINLINE void Process()
{
@@ -644,7 +647,7 @@ struct AgentRadixSortOnesweep
ScatterKeysGlobal();

// scatter values if necessary
GatherScatterValues(ranks, Int2Type<KEYS_ONLY>());
GatherScatterValues(ranks, bool_constant_v<KEYS_ONLY>);
}

_CCCL_DEVICE _CCCL_FORCEINLINE //
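The onesweep agent follows the same pattern once more: KEYS_ONLY is now computed with ::cuda::std::is_same, and the true_type overload turns the whole value-scatter path into a no-op for keys-only sorts. A compact model of that dispatch (hypothetical names, empty bodies), assuming a bool_constant_v-style variable template like the one sketched near the top of this diff:

```cpp
// Compact model of the keys-only dispatch in AgentRadixSortOnesweep; illustrative only.
#include <cuda/std/type_traits>

struct NullType {}; // stand-in for cub::NullType

template <bool B>
inline constexpr ::cuda::std::bool_constant<B> bool_constant_v{}; // assumed helper

template <class KeyT, class ValueT>
struct OnesweepModel
{
  static constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

  void gather_scatter_values(::cuda::std::false_type /*keys_only*/) { /* rank and scatter values */ }
  void gather_scatter_values(::cuda::std::true_type /*keys_only*/) {} // keys-only: nothing to move

  void process()
  {
    // Mirrors the call site above: the overload is chosen at compile time.
    gather_scatter_values(bool_constant_v<KEYS_ONLY>);
  }
};

int main()
{
  OnesweepModel<unsigned int, NullType>{}.process(); // keys-only: empty overload
  OnesweepModel<unsigned int, float>{}.process();    // key-value: value path
  return 0;
}
```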