Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SM100 histogram tunings #3691

Merged
merged 6 commits into from
Feb 6, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 7 additions & 119 deletions cub/cub/device/dispatch/tuning/tuning_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ struct sm100_tuning;

// even
template <class SampleT>
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
{
// ipt_12.tpb_928.rle_0.ws_0.mem_1.ld_2.laid_0.vec_2 1.033332 0.940517 1.031835 1.195876
static constexpr int items = 12;
Expand All @@ -148,30 +148,14 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
static constexpr int tune_vec_size = 1 << 2;
static constexpr int vec_size = 1 << 2;
};

// same as base
template <class SampleT>
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
{};

// same as base
template <class SampleT>
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
{};

// same as base
template <class SampleT>
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
{};
// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks

// range
template <class SampleT>
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
{
// ipt_12.tpb_448.rle_0.ws_0.mem_1.ld_1.laid_0.vec_2 1.078987 0.985542 1.085118 1.175637
static constexpr int items = 12;
Expand All @@ -181,108 +165,12 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
static constexpr int tune_vec_size = 1 << 2;
};

// same as base
template <class SampleT>
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
{};

template <class SampleT>
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
{
// ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104
static constexpr int items = 9;
static constexpr int threads = 1024;
static constexpr bool rle_compress = true;
static constexpr bool work_stealing = false;
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr int tune_vec_size = 1 << 0;
};

template <class SampleT>
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
{
// ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657
static constexpr int items = 7;
static constexpr int threads = 544;
static constexpr bool rle_compress = true;
static constexpr bool work_stealing = false;
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
static constexpr int tune_vec_size = 1 << 0;
};

// multi.even
template <class SampleT>
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1>
{
// ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504
static constexpr int items = 9;
static constexpr int threads = 1024;
static constexpr bool rle_compress = false;
static constexpr bool work_stealing = false;
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr int tune_vec_size = 1 << 0;
};

// same as base
template <class SampleT>
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
{};

// same as base
template <class SampleT>
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
{};

// same as base
template <class SampleT>
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
{};

// multi.range
template <class SampleT>
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1>
{
// ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584
static constexpr int items = 7;
static constexpr int threads = 160;
static constexpr bool rle_compress = false;
static constexpr bool work_stealing = false;
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr int tune_vec_size = 1 << 1;
static constexpr int vec_size = 1 << 2;
};

// same as base
template <class SampleT>
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
{};

// same as base
template <class SampleT>
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
{};
// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks

// same as base
template <class SampleT>
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8>
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
{};
// multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks

template <class SampleT, class CounterT, int NumChannels, int NumActiveChannels, bool IsEven>
struct policy_hub
Expand Down
Loading