diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index adcbe6956ff..5886c9a243e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -151,17 +151,7 @@ struct sm100_tuning -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; +// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks // range template @@ -178,93 +168,9 @@ struct sm100_tuning -// struct sm100_tuning {}; - -// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled -// template -// struct sm100_tuning -// { -// // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 -// static constexpr int items = 9; -// static constexpr int threads = 1024; -// static constexpr bool rle_compress = true; -// static constexpr bool work_stealing = false; -// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; -// static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; -// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; -// static constexpr int vec_size = 1 << 0; -// }; - -// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled -// template -// struct sm100_tuning -// { -// // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 -// static constexpr int items = 7; -// static constexpr int threads = 544; -// static constexpr bool rle_compress = true; -// static constexpr bool work_stealing = false; -// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; -// static constexpr CacheLoadModifier load_modifier = LOAD_LDG; -// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; -// static constexpr int vec_size = 1 << 0; -// }; - -// multi.even -template -struct sm100_tuning -{ - // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504 - static constexpr int items = 9; - static constexpr int threads = 1024; - static constexpr bool rle_compress = false; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int vec_size = 1 << 0; -}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// multi.range -template -struct sm100_tuning -{ - // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584 - static constexpr int items = 7; - static constexpr int threads = 160; - static constexpr bool rle_compress = false; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int vec_size = 1 << 1; -}; - -// same as SM90 -// template -// struct sm100_tuning {}; - -// same as SM90 -// template -// struct sm100_tuning {}; +// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks -// same as SM90 -// template -// struct sm100_tuning {}; +// multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks template struct policy_hub