@@ -73,7 +73,6 @@ enum class primitive_op
73
73
enum class op_type
74
74
{
75
75
plus,
76
- min_or_max,
77
76
unknown
78
77
};
79
78
enum class offset_size
@@ -125,30 +124,10 @@ struct is_plus<::cuda::std::plus<T>>
125
124
static constexpr bool value = true ;
126
125
};
127
126
128
- template <typename Op>
129
- struct is_min_or_max
130
- {
131
- static constexpr bool value = false ;
132
- };
133
-
134
- template <typename T>
135
- struct is_min_or_max <::cuda::minimum<T>>
136
- {
137
- static constexpr bool value = true ;
138
- };
139
-
140
- template <typename T>
141
- struct is_min_or_max <::cuda::maximum<T>>
142
- {
143
- static constexpr bool value = true ;
144
- };
145
-
146
127
template <class ScanOpT >
147
128
constexpr op_type classify_op ()
148
129
{
149
- return is_plus<ScanOpT>::value
150
- ? op_type::plus
151
- : (is_min_or_max<ScanOpT>::value ? op_type::min_or_max : op_type::unknown);
130
+ return is_plus<ScanOpT>::value ? op_type::plus : op_type::unknown;
152
131
}
153
132
154
133
template <class ValueT >
@@ -433,117 +412,6 @@ struct sm100_tuning<double, AccumT, OffsetT, op_type::plus, primitive_accum::yes
433
412
// {};
434
413
#endif
435
414
436
- // min/max (only ran benchmarks for max)
437
- template <class ValueT , class AccumT , class OffsetT >
438
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_1>
439
- {
440
- // ipt_22.tpb_128.ns_1900.dcid_5.l2w_750.trp_1.ld_1 1.288379 1.078212 1.274188 1.615385
441
- static constexpr int items = 22 ;
442
- static constexpr int threads = 128 ;
443
- using delay_constructor = exponential_backon_jitter_window_constructor_t <1900 , 750 >;
444
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
445
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
446
- static constexpr CacheLoadModifier load_modifier = LOAD_CA;
447
- };
448
-
449
- template <class ValueT , class AccumT , class OffsetT >
450
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_1>
451
- {
452
- // ipt_24.tpb_128.ns_344.dcid_2.l2w_710.trp_1.ld_0 1.222111 0.983240 1.205706 1.587886
453
- static constexpr int items = 24 ;
454
- static constexpr int threads = 128 ;
455
- using delay_constructor = exponential_backoff_constructor_t <1900 , 750 >;
456
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
457
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
458
- static constexpr CacheLoadModifier load_modifier = LOAD_CA;
459
- };
460
-
461
- template <class ValueT , class AccumT , class OffsetT >
462
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_2>
463
- {
464
- // ipt_14.tpb_384.ns_1708.dcid_7.l2w_930.trp_1.ld_1 1.242487 1.002841 1.226297 1.615385
465
- static constexpr int items = 14 ;
466
- static constexpr int threads = 384 ;
467
- using delay_constructor = exponential_backon_constructor_t <1708 , 930 >;
468
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
469
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
470
- static constexpr CacheLoadModifier load_modifier = LOAD_CA;
471
- };
472
-
473
- template <class ValueT , class AccumT , class OffsetT >
474
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_2>
475
- {
476
- // ipt_14.tpb_352.ns_1524.dcid_7.l2w_955.trp_1.ld_1 1.234616 1.000000 1.218721 1.596154
477
- static constexpr int items = 14 ;
478
- static constexpr int threads = 352 ;
479
- using delay_constructor = exponential_backon_constructor_t <1524 , 955 >;
480
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
481
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
482
- static constexpr CacheLoadModifier load_modifier = LOAD_CA;
483
- };
484
-
485
- template <class ValueT , class AccumT , class OffsetT >
486
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_4>
487
- {
488
- // ipt_23.tpb_256.ns_1240.dcid_7.l2w_560.trp_1.ld_2 1.192410 1.000000 1.175338 1.289286
489
- static constexpr int items = 23 ;
490
- static constexpr int threads = 256 ;
491
- using delay_constructor = exponential_backon_constructor_t <1240 , 560 >;
492
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
493
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
494
- static constexpr CacheLoadModifier load_modifier = LOAD_CA;
495
- };
496
-
497
- template <class ValueT , class AccumT , class OffsetT >
498
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_4>
499
- {
500
- // ipt_22.tpb_192.ns_976.dcid_7.l2w_1180.trp_1.ld_0 1.172486 1.000000 1.158032 1.305288
501
- static constexpr int items = 22 ;
502
- static constexpr int threads = 192 ;
503
- using delay_constructor = exponential_backon_constructor_t <976 , 1180 >;
504
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
505
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
506
- static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
507
- };
508
-
509
- template <class ValueT , class AccumT , class OffsetT >
510
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_4, value_size::_8>
511
- {
512
- // ipt_22.tpb_256.ns_380.dcid_2.l2w_920.trp_1.ld_0 1.218252 1.171831 1.214092 1.246711
513
- static constexpr int items = 22 ;
514
- static constexpr int threads = 256 ;
515
- using delay_constructor = exponential_backoff_constructor_t <380 , 920 >;
516
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
517
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
518
- static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
519
- };
520
-
521
- template <class ValueT , class AccumT , class OffsetT >
522
- struct sm100_tuning <ValueT, AccumT, OffsetT, op_type::min_or_max, primitive_accum::yes, offset_size::_8, value_size::_8>
523
- {
524
- // ipt_20.tpb_256.ns_220.dcid_1.l2w_740.trp_1.ld_1 1.191382 1.010806 1.186827 1.299600
525
- static constexpr int items = 20 ;
526
- static constexpr int threads = 256 ;
527
- using delay_constructor = fixed_delay_constructor_t <220 , 740 >;
528
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
529
- static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
530
- static constexpr CacheLoadModifier load_modifier = LOAD_CA;
531
- };
532
-
533
- // todo(gonidelis): Add tunings for i128, float and double.
534
- // template <class OffsetT> struct sm100_tuning<float, OffsetT, op_type::min_or_max, primitive_accum::yes,
535
- // offset_size::_8, accum_size::_4>;
536
- // template <class OffsetT> struct sm100_tuning<double, OffsetT, op_type::min_or_max,
537
- // primitive_accum::yes, offset_size::_8, accum_size::_8>;
538
-
539
- #if CUB_IS_INT128_ENABLED
540
- // template <class OffsetT> struct sm100_tuning<__int128_t, OffsetT, op_type::min_or_max, primitive_accum::no,
541
- // offset_size::_8, accum_size::_16> : tuning<576, 21, 860, 630> {}; template <class OffsetT> struct
542
- // sm100_tuning<__uint128_t, OffsetT, op_type::min_or_max, primitive_accum::no, offset_size::_8, accum_size::_16>
543
- // : sm100_tuning<__int128_t, OffsetT, op_type::min_or_max, primitive_accum::no, offset_size::_8, accum_size::_16>
544
- // {};
545
- #endif
546
-
547
415
template <typename PolicyT, typename = void , typename = void >
548
416
struct ScanPolicyWrapper : PolicyT
549
417
{
0 commit comments