diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 639965434d2..adcbe6956ff 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -182,33 +182,35 @@ struct sm100_tuning // struct sm100_tuning {}; -template -struct sm100_tuning -{ - // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 - static constexpr int items = 9; - static constexpr int threads = 1024; - static constexpr bool rle_compress = true; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int vec_size = 1 << 0; -}; - -template -struct sm100_tuning -{ - // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 - static constexpr int items = 7; - static constexpr int threads = 544; - static constexpr bool rle_compress = true; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int vec_size = 1 << 0; -}; +// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled +// template +// struct sm100_tuning +// { +// // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 +// static constexpr int items = 9; +// static constexpr int threads = 1024; +// static constexpr bool rle_compress = true; +// static constexpr bool work_stealing = false; +// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; +// static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; +// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; +// static constexpr int vec_size = 1 << 0; +// }; + +// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled +// template +// struct sm100_tuning +// { +// // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 +// static constexpr int items = 7; +// static constexpr int threads = 544; +// static constexpr bool rle_compress = true; +// static constexpr bool work_stealing = false; +// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; +// static constexpr CacheLoadModifier load_modifier = LOAD_LDG; +// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; +// static constexpr int vec_size = 1 << 0; +// }; // multi.even template