Skip to content

Commit 9df7a86

Browse files
Drop max benchmarks
1 parent 8b8e992 commit 9df7a86

File tree

1 file changed

+1 -133 lines changed

cub/cub/device/dispatch/tuning/tuning_scan.cuh

+1 -133
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,6 @@ enum class primitive_op
7373
enum class op_type
7474
{
7575
plus,
76-
min_or_max,
7776
unknown
7877
};
7978
enum class offset_size
@@ -125,30 +124,10 @@ struct is_plus<::cuda::std::plus<T>>
125124
static constexpr bool value = true;
126125
};
127126

128-
template <typename Op>
129-
struct is_min_or_max
130-
{
131-
static constexpr bool value = false;
132-
};
133-
134-
template <typename T>
135-
struct is_min_or_max<::cuda::minimum<T>>
136-
{
137-
static constexpr bool value = true;
138-
};
139-
140-
template <typename T>
141-
struct is_min_or_max<::cuda::maximum<T>>
142-
{
143-
static constexpr bool value = true;
144-
};
145-
146127
template <class ScanOpT>
147128
constexpr op_type classify_op()
148129
{
149-
return is_plus<ScanOpT>::value
150-
? op_type::plus
151-
: (is_min_or_max<ScanOpT>::value ? op_type::min_or_max : op_type::unknown);
130+
return is_plus<ScanOpT>::value ? op_type::plus : op_type::unknown;
152131
}
153132

154133
template <class ValueT>
@@ -433,117 +412,6 @@ struct sm100_tuning<double, AccumT, OffsetT, op_type::plus, primitive_accum::yes
433412
// {};
434413
#endif
435414

436-
// min/max (only ran benchmarks for max)
437-
template <class ValueT, class AccumT, class OffsetT>
438-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_1>
439-
{
440-
// ipt_22.tpb_128.ns_1900.dcid_5.l2w_750.trp_1.ld_1 1.288379 1.078212 1.274188 1.615385
441-
static constexpr int items = 22;
442-
static constexpr int threads = 128;
443-
using delay_constructor = exponential_backon_jitter_window_constructor_t<1900, 750>;
444-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
445-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
446-
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
447-
};
448-
449-
template <class ValueT, class AccumT, class OffsetT>
450-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_1>
451-
{
452-
// ipt_24.tpb_128.ns_344.dcid_2.l2w_710.trp_1.ld_0 1.222111 0.983240 1.205706 1.587886
453-
static constexpr int items = 24;
454-
static constexpr int threads = 128;
455-
using delay_constructor = exponential_backoff_constructor_t<1900, 750>;
456-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
457-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
458-
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
459-
};
460-
461-
template <class ValueT, class AccumT, class OffsetT>
462-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_2>
463-
{
464-
// ipt_14.tpb_384.ns_1708.dcid_7.l2w_930.trp_1.ld_1 1.242487 1.002841 1.226297 1.615385
465-
static constexpr int items = 14;
466-
static constexpr int threads = 384;
467-
using delay_constructor = exponential_backon_constructor_t<1708, 930>;
468-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
469-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
470-
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
471-
};
472-
473-
template <class ValueT, class AccumT, class OffsetT>
474-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_2>
475-
{
476-
// ipt_14.tpb_352.ns_1524.dcid_7.l2w_955.trp_1.ld_1 1.234616 1.000000 1.218721 1.596154
477-
static constexpr int items = 14;
478-
static constexpr int threads = 352;
479-
using delay_constructor = exponential_backon_constructor_t<1524, 955>;
480-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
481-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
482-
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
483-
};
484-
485-
template <class ValueT, class AccumT, class OffsetT>
486-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_4>
487-
{
488-
// ipt_23.tpb_256.ns_1240.dcid_7.l2w_560.trp_1.ld_2 1.192410 1.000000 1.175338 1.289286
489-
static constexpr int items = 23;
490-
static constexpr int threads = 256;
491-
using delay_constructor = exponential_backon_constructor_t<1240, 560>;
492-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
493-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
494-
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
495-
};
496-
497-
template <class ValueT, class AccumT, class OffsetT>
498-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_4>
499-
{
500-
// ipt_22.tpb_192.ns_976.dcid_7.l2w_1180.trp_1.ld_0 1.172486 1.000000 1.158032 1.305288
501-
static constexpr int items = 22;
502-
static constexpr int threads = 192;
503-
using delay_constructor = exponential_backon_constructor_t<976, 1180>;
504-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
505-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
506-
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
507-
};
508-
509-
template <class ValueT, class AccumT, class OffsetT>
510-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_8>
511-
{
512-
// ipt_22.tpb_256.ns_380.dcid_2.l2w_920.trp_1.ld_0 1.218252 1.171831 1.214092 1.246711
513-
static constexpr int items = 22;
514-
static constexpr int threads = 256;
515-
using delay_constructor = exponential_backoff_constructor_t<380, 920>;
516-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
517-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
518-
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
519-
};
520-
521-
template <class ValueT, class AccumT, class OffsetT>
522-
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_8>
523-
{
524-
// ipt_20.tpb_256.ns_220.dcid_1.l2w_740.trp_1.ld_1 1.191382 1.010806 1.186827 1.299600
525-
static constexpr int items = 20;
526-
static constexpr int threads = 256;
527-
using delay_constructor = fixed_delay_constructor_t<220, 740>;
528-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
529-
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
530-
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
531-
};
532-
533-
// todo(gonidelis): Add tunings for i128, float and double.
534-
// template <class OffsetT> struct sm100_tuning<float, OffsetT, op_type::min_or_max, primitive_accum::yes,
535-
// offset_size::_8, accum_size::_4>;
536-
// template <class OffsetT> struct sm100_tuning<double, OffsetT, op_type::min_or_max,
537-
// primitive_accum::yes, offset_size::_8, accum_size::_8>;
538-
539-
#if CUB_IS_INT128_ENABLED
540-
// template <class OffsetT> struct sm100_tuning<__int128_t, OffsetT, op_type::min_or_max, primitive_accum::no,
541-
// offset_size::_8, accum_size::_16> : tuning<576, 21, 860, 630> {}; template <class OffsetT> struct
542-
// sm100_tuning<__uint128_t, OffsetT, op_type::min_or_max, primitive_accum::no, offset_size::_8, accum_size::_16>
543-
// : sm100_tuning<__int128_t, OffsetT, op_type::min_or_max, primitive_accum::no, offset_size::_8, accum_size::_16>
544-
// {};
545-
#endif
546-
547415
template <typename PolicyT, typename = void, typename = void>
548416
struct ScanPolicyWrapper : PolicyT
549417
{

0 commit comments

Comments
 (0)