From 8764a9ade2970b3036f9e9311d6b38f54582089a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 5 Feb 2025 12:15:36 +0100 Subject: [PATCH 1/6] Fix SM100 histogram tunings The tuning data member names did not match the ones used when selecting tunings, so all SM100 tunings were SFINAE-ed out. --- cub/cub/device/dispatch/tuning/tuning_histogram.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 43844ba0284..b1d7c24ad8d 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -148,7 +148,7 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_CA; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int tune_vec_size = 1 << 2; + static constexpr int vec_size = 1 << 2; }; // same as base @@ -181,7 +181,7 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int tune_vec_size = 1 << 2; + static constexpr int vec_size = 1 << 2; }; // same as base @@ -201,7 +201,7 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int tune_vec_size = 1 << 0; + static constexpr int vec_size = 1 << 0; }; template @@ -215,7 +215,7 @@ struct sm100_tuning<0, SampleT, 
1, 1, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int tune_vec_size = 1 << 0; + static constexpr int vec_size = 1 << 0; }; // multi.even @@ -230,7 +230,7 @@ struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int tune_vec_size = 1 << 0; + static constexpr int vec_size = 1 << 0; }; // same as base @@ -263,7 +263,7 @@ struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int tune_vec_size = 1 << 1; + static constexpr int vec_size = 1 << 1; }; // same as base From b8c80e10dc83eb8548f8ab90230174eee0634d69 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 5 Feb 2025 12:19:45 +0100 Subject: [PATCH 2/6] Fix falling back to SM90 tunings --- .../dispatch/tuning/tuning_histogram.cuh | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index b1d7c24ad8d..6de73f4f801 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -151,23 +151,17 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr int vec_size = 1 << 2; }; -// same as base -template -struct sm100_tuning<1, SampleT, 1, 1, 
counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; +// same as SM90 +// template +// struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; -// same as base -template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> - : sm90_tuning -{}; +// same as SM90 +// template +// struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; -// same as base -template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> - : sm90_tuning -{}; +// same as SM90 +// template +// struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; // range template @@ -184,11 +178,9 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr int vec_size = 1 << 2; }; -// same as base -template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; +// same as SM90 +// template +// struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; template struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> @@ -234,22 +226,16 @@ struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s }; // same as base -template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; +// template +// struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; // same as base -template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> - : sm90_tuning -{}; +// template +// struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; // same as base -template -struct sm100_tuning<1, 
SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> - : sm90_tuning -{}; +// template +// struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; // multi.range template From 0dece2ed1e2f5f6b12f9b48e8327be74d1b3d022 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 6 Feb 2025 10:11:23 +0100 Subject: [PATCH 3/6] Fix defaulting tunings for multi.range --- .../dispatch/tuning/tuning_histogram.cuh | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 6de73f4f801..d654ba72524 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -225,15 +225,15 @@ struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s static constexpr int vec_size = 1 << 0; }; -// same as base +// same as SM90 // template // struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; -// same as base +// same as SM90 // template // struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; -// same as base +// same as SM90 // template // struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; @@ -252,23 +252,17 @@ struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s static constexpr int vec_size = 1 << 1; }; -// same as base -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; +// same as SM90 +// template +// struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; -// same as base -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> - : sm90_tuning -{}; +// same 
as SM90 +// template +// struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; -// same as base -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> - : sm90_tuning -{}; +// same as SM90 +// template +// struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; template struct policy_hub From dfd5d9c047a2c55970da0e1d3d0ac63042e9f829 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 6 Feb 2025 10:16:02 +0100 Subject: [PATCH 4/6] Fix bools --- .../dispatch/tuning/tuning_histogram.cuh | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index d654ba72524..639965434d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -138,7 +138,7 @@ struct sm100_tuning; // even template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1> +struct sm100_tuning { // ipt_12.tpb_928.rle_0.ws_0.mem_1.ld_2.laid_0.vec_2 1.033332 0.940517 1.031835 1.195876 static constexpr int items = 12; @@ -153,19 +153,19 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s // same as SM90 // template -// struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; +// struct sm100_tuning {}; // same as SM90 // template -// struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; +// struct sm100_tuning {}; // same as SM90 // template -// struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; +// struct sm100_tuning {}; // range template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1> +struct 
sm100_tuning { // ipt_12.tpb_448.rle_0.ws_0.mem_1.ld_1.laid_0.vec_2 1.078987 0.985542 1.085118 1.175637 static constexpr int items = 12; @@ -180,10 +180,10 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s // same as SM90 // template -// struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; +// struct sm100_tuning {}; template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> +struct sm100_tuning { // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 static constexpr int items = 9; @@ -197,7 +197,7 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s }; template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> +struct sm100_tuning { // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 static constexpr int items = 7; @@ -212,7 +212,7 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s // multi.even template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1> +struct sm100_tuning { // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504 static constexpr int items = 9; @@ -227,19 +227,19 @@ struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s // same as SM90 // template -// struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; +// struct sm100_tuning {}; // same as SM90 // template -// struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; +// struct sm100_tuning {}; // same as SM90 // template -// struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; +// struct sm100_tuning {}; // multi.range template -struct sm100_tuning<0, 
SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1> +struct sm100_tuning { // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584 static constexpr int items = 7; @@ -254,15 +254,15 @@ struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, s // same as SM90 // template -// struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {}; +// struct sm100_tuning {}; // same as SM90 // template -// struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {}; +// struct sm100_tuning {}; // same as SM90 // template -// struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {}; +// struct sm100_tuning {}; template struct policy_hub From 66cbb57126dbb3ecfa998f73ad09c9c6a9dd2364 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 6 Feb 2025 10:16:31 +0100 Subject: [PATCH 5/6] Disable regressions --- .../dispatch/tuning/tuning_histogram.cuh | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 639965434d2..adcbe6956ff 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -182,33 +182,35 @@ struct sm100_tuning // struct sm100_tuning {}; -template -struct sm100_tuning -{ - // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 - static constexpr int items = 9; - static constexpr int threads = 1024; - static constexpr bool rle_compress = true; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr 
int vec_size = 1 << 0; -}; - -template -struct sm100_tuning -{ - // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 - static constexpr int items = 7; - static constexpr int threads = 544; - static constexpr bool rle_compress = true; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int vec_size = 1 << 0; -}; +// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled +// template +// struct sm100_tuning +// { +// // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 +// static constexpr int items = 9; +// static constexpr int threads = 1024; +// static constexpr bool rle_compress = true; +// static constexpr bool work_stealing = false; +// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; +// static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; +// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; +// static constexpr int vec_size = 1 << 0; +// }; + +// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled +// template +// struct sm100_tuning +// { +// // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 +// static constexpr int items = 7; +// static constexpr int threads = 544; +// static constexpr bool rle_compress = true; +// static constexpr bool work_stealing = false; +// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; +// static constexpr CacheLoadModifier load_modifier = LOAD_LDG; +// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; +// static constexpr int vec_size = 1 << 0; +// }; // multi.even template From 
30f5b218ae7c69ef7d949be641b9243509553d8b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 6 Feb 2025 21:41:35 +0100 Subject: [PATCH 6/6] Drop tunings with no benefit --- .../dispatch/tuning/tuning_histogram.cuh | 100 +----------------- 1 file changed, 3 insertions(+), 97 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index adcbe6956ff..5886c9a243e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -151,17 +151,7 @@ struct sm100_tuning -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; +// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks // range template @@ -178,93 +168,9 @@ struct sm100_tuning -// struct sm100_tuning {}; - -// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled -// template -// struct sm100_tuning -// { -// // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 -// static constexpr int items = 9; -// static constexpr int threads = 1024; -// static constexpr bool rle_compress = true; -// static constexpr bool work_stealing = false; -// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; -// static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; -// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; -// static constexpr int vec_size = 1 << 0; -// }; - -// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled -// template -// struct sm100_tuning -// { -// // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 -// static constexpr int items = 7; -// static constexpr int threads = 544; -// static constexpr bool 
rle_compress = true; -// static constexpr bool work_stealing = false; -// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; -// static constexpr CacheLoadModifier load_modifier = LOAD_LDG; -// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; -// static constexpr int vec_size = 1 << 0; -// }; - -// multi.even -template -struct sm100_tuning -{ - // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504 - static constexpr int items = 9; - static constexpr int threads = 1024; - static constexpr bool rle_compress = false; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int vec_size = 1 << 0; -}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// multi.range -template -struct sm100_tuning -{ - // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584 - static constexpr int items = 7; - static constexpr int threads = 160; - static constexpr bool rle_compress = false; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int vec_size = 1 << 1; -}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; +// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks -// same as SM90 -// template -// struct sm100_tuning {}; +// multi.even and multi.range: none of the found tunings surpassed the SM90 
tuning during verification benchmarks template struct policy_hub