@@ -53,7 +53,7 @@ def git_url(self) -> str:
53
53
return "https://github.com/intel/compute-benchmarks.git"
54
54
55
55
def git_hash(self) -> str:
    """Return the pinned commit hash for this suite's benchmark sources.

    Presumably this is the revision checked out from the repository that
    ``git_url()`` points at -- confirm against the caller.

    Returns:
        A full 40-character hexadecimal commit id with no surrounding
        whitespace.
    """
    # The literal must not carry trailing whitespace: a padded string is
    # not a valid revision name.
    return "4995560017559849a519e58978a0afdd55903e15"
57
57
58
58
def setup (self ) -> None :
59
59
if options .sycl is None :
@@ -173,6 +173,9 @@ def benchmarks(self) -> list[Benchmark]:
173
173
# See SubmitKernel.enabled()
174
174
long_kernel_exec_time_ooo = [20 , 200 ]
175
175
176
+ # The Combo Profiler is available only for selected sycl benchmarks
177
+ profiler_types = ["timer" , "cpuCounter" ]
178
+
176
179
for runtime in list (RUNTIMES ):
177
180
# Add SubmitKernel benchmarks using loops
178
181
for in_order_queue in [0 , 1 ]:
@@ -184,16 +187,18 @@ def benchmarks(self) -> list[Benchmark]:
184
187
else long_kernel_exec_time_ooo
185
188
)
186
189
for kernel_exec_time in [1 , * long_kernel_exec_time ]:
187
- benches .append (
188
- SubmitKernel (
189
- self ,
190
- runtime ,
191
- in_order_queue ,
192
- measure_completion ,
193
- use_events ,
194
- kernel_exec_time ,
190
+ for profiler_type in profiler_types :
191
+ benches .append (
192
+ SubmitKernel (
193
+ self ,
194
+ runtime ,
195
+ in_order_queue ,
196
+ measure_completion ,
197
+ use_events ,
198
+ kernel_exec_time ,
199
+ profiler_type ,
200
+ )
195
201
)
196
- )
197
202
198
203
# Add SinKernelGraph benchmarks
199
204
for with_graphs in [0 , 1 ]:
@@ -203,51 +208,69 @@ def benchmarks(self) -> list[Benchmark]:
203
208
)
204
209
205
210
# Add ULLS benchmarks
206
- benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 ))
211
+ for profiler_type in profiler_types :
212
+ benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 , profiler_type ))
207
213
benches .append (UllsKernelSwitch (self , runtime , 8 , 200 , 0 , 0 , 1 , 1 ))
208
214
209
215
# Add GraphApiSubmitGraph benchmarks
210
216
for in_order_queue in [0 , 1 ]:
211
- benches .append (
212
- GraphApiSubmitGraph (
213
- self ,
214
- runtime ,
215
- in_order_queue ,
216
- self .submit_graph_num_kernels [- 1 ],
217
- 0 ,
218
- useEvents = 0 ,
219
- useHostTasks = 1 ,
217
+ for profiler_type in profiler_types :
218
+ benches .append (
219
+ GraphApiSubmitGraph (
220
+ self ,
221
+ runtime ,
222
+ in_order_queue ,
223
+ self .submit_graph_num_kernels [- 1 ],
224
+ 0 ,
225
+ profiler_type ,
226
+ useEvents = 0 ,
227
+ useHostTasks = 1 ,
228
+ )
220
229
)
221
- )
222
230
for num_kernels in self .submit_graph_num_kernels :
223
231
for measure_completion_time in [0 , 1 ]:
224
232
for use_events in [0 , 1 ]:
225
- benches .append (
226
- GraphApiSubmitGraph (
227
- self ,
228
- runtime ,
229
- in_order_queue ,
230
- num_kernels ,
231
- measure_completion_time ,
232
- use_events ,
233
- useHostTasks = 0 ,
233
+ for profiler_type in profiler_types :
234
+ benches .append (
235
+ GraphApiSubmitGraph (
236
+ self ,
237
+ runtime ,
238
+ in_order_queue ,
239
+ num_kernels ,
240
+ measure_completion_time ,
241
+ profiler_type ,
242
+ use_events ,
243
+ useHostTasks = 0 ,
244
+ )
234
245
)
235
- )
236
246
237
247
# Add other benchmarks
238
248
benches += [
239
- QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 ),
240
- QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 ),
241
- QueueMemcpy (self , "Device" , "Device" , 1024 ),
242
249
StreamMemory (self , "Triad" , 10 * 1024 , "Device" ),
243
- ExecImmediateCopyQueue (self , 0 , 1 , "Device" , "Device" , 1024 ),
244
- ExecImmediateCopyQueue (self , 1 , 1 , "Device" , "Host" , 1024 ),
245
250
VectorSum (self ),
246
251
GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Gromacs" ),
247
252
GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Gromacs" ),
248
253
GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Llama" ),
249
254
GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Llama" ),
250
255
]
256
+ for profiler_type in profiler_types :
257
+ benches .append (
258
+ QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 , profiler_type )
259
+ )
260
+ benches .append (
261
+ QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 , profiler_type )
262
+ )
263
+ benches .append (QueueMemcpy (self , "Device" , "Device" , 1024 , profiler_type ))
264
+ benches .append (
265
+ ExecImmediateCopyQueue (
266
+ self , 0 , 1 , "Device" , "Device" , 1024 , profiler_type
267
+ )
268
+ )
269
+ benches .append (
270
+ ExecImmediateCopyQueue (
271
+ self , 1 , 1 , "Device" , "Host" , 1024 , profiler_type
272
+ )
273
+ )
251
274
252
275
# Add UR-specific benchmarks
253
276
benches += [
@@ -295,12 +318,15 @@ def parse_unit_type(compute_unit):
295
318
296
319
297
320
class ComputeBenchmark (Benchmark ):
298
def __init__(
    self,
    bench,
    name,
    test,
    runtime: RUNTIMES = None,
    profiler_type: str = "",
):
    """Initialize the base class and store the identifying fields.

    Args:
        bench: Owning suite object. Its ``directory`` attribute and the
            object itself are passed to the base-class initializer.
        name: Stored as ``self.bench_name``.
        test: Stored as ``self.test``.
        runtime: Stored as ``self.runtime``; defaults to ``None``.
        profiler_type: Stored as ``self.profiler_type``; defaults to ``""``.
            Subclasses read it when assembling ``--profilerType=...``.
    """
    # Base initializer runs first, exactly as before; the attributes below
    # are only available after it returns.
    super().__init__(bench.directory, bench)
    self.bench = bench
    self.bench_name, self.test = name, test
    self.runtime, self.profiler_type = runtime, profiler_type
304
330
305
331
def supported_runtimes (self ) -> list [RUNTIMES ]:
306
332
"""Base runtimes supported by this benchmark, can be overridden."""
@@ -428,14 +454,19 @@ def __init__(
428
454
MeasureCompletion = 0 ,
429
455
UseEvents = 0 ,
430
456
KernelExecTime = 1 ,
457
+ profiler_type = "" ,
431
458
):
432
459
self .ioq = ioq
433
460
self .MeasureCompletion = MeasureCompletion
434
461
self .UseEvents = UseEvents
435
462
self .KernelExecTime = KernelExecTime
436
463
self .NumKernels = 10
437
464
super ().__init__ (
438
- bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel" , runtime
465
+ bench ,
466
+ f"api_overhead_benchmark_{ runtime .value } " ,
467
+ "SubmitKernel" ,
468
+ runtime ,
469
+ profiler_type ,
439
470
)
440
471
441
472
def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -504,7 +535,7 @@ def range(self) -> tuple[float, float]:
504
535
return (0.0 , None )
505
536
506
537
def bin_args (self ) -> list [str ]:
507
- return [
538
+ bin_args = [
508
539
f"--Ioq={ self .ioq } " ,
509
540
f"--MeasureCompletion={ self .MeasureCompletion } " ,
510
541
"--iterations=100000" ,
@@ -513,6 +544,9 @@ def bin_args(self) -> list[str]:
513
544
f"--KernelExecTime={ self .KernelExecTime } " ,
514
545
f"--UseEvents={ self .UseEvents } " ,
515
546
]
547
+ if self .runtime == RUNTIMES .SYCL :
548
+ bin_args .append (f"--profilerType={ self .profiler_type } " )
549
+ return bin_args
516
550
517
551
def get_metadata (self ) -> dict [str , BenchmarkMetadata ]:
518
552
metadata_dict = super ().get_metadata ()
@@ -532,13 +566,20 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
532
566
533
567
534
568
class ExecImmediateCopyQueue (ComputeBenchmark ):
535
def __init__(
    self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
):
    """Store the copy parameters, then initialize the base benchmark.

    Args:
        bench: Owning suite object, forwarded to the base initializer.
        ioq: Stored as ``self.ioq``.
        isCopyOnly: Stored as ``self.isCopyOnly``.
        source: Stored as ``self.source``.
        destination: Stored as ``self.destination``.
        size: Stored as ``self.size``.
        profiler_type: Forwarded to the base initializer by keyword.
    """
    # Attributes are assigned before delegating, matching the original
    # statement order.
    self.ioq, self.isCopyOnly = ioq, isCopyOnly
    self.source, self.destination, self.size = source, destination, size
    super().__init__(
        bench,
        "api_overhead_benchmark_sycl",
        "ExecImmediateCopyQueue",
        profiler_type=profiler_type,
    )
542
583
543
584
def name (self ):
544
585
order = "in order" if self .ioq else "out of order"
@@ -569,16 +610,22 @@ def bin_args(self) -> list[str]:
569
610
f"--dst={ self .destination } " ,
570
611
f"--size={ self .size } " ,
571
612
"--withCopyOffload=0" ,
613
+ f"--profilerType={ self .profiler_type } " ,
572
614
]
573
615
574
616
575
617
class QueueInOrderMemcpy (ComputeBenchmark ):
576
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
    """Store the memcpy parameters, then initialize the base benchmark.

    Args:
        bench: Owning suite object, forwarded to the base initializer.
        isCopyOnly: Stored as ``self.isCopyOnly``.
        source: Stored as ``self.source``.
        destination: Stored as ``self.destination``.
        size: Stored as ``self.size``.
        profiler_type: Forwarded to the base initializer by keyword.
    """
    # Attributes are assigned before delegating, matching the original
    # statement order.
    self.isCopyOnly = isCopyOnly
    self.source, self.destination, self.size = source, destination, size
    super().__init__(
        bench,
        "memory_benchmark_sycl",
        "QueueInOrderMemcpy",
        profiler_type=profiler_type,
    )
582
629
583
630
def name(self):
    """Return this benchmark's name, distinguishing profiler variants.

    The suite creates one instance per profiler type with otherwise
    identical arguments, so the profiler has to appear in the name or the
    variants are indistinguishable.

    Returns:
        The original name when the profiler type is unset, empty or
        ``"timer"``; otherwise that name with ``", profiler <type>"``
        appended.
    """
    base = (
        f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} "
        f"to {self.destination}, size {self.size}"
    )
    # getattr: profiler_type is assigned by the base initializer, which this
    # class calls only after setting its own fields, so the attribute may
    # not exist yet if name() is invoked early.
    profiler = getattr(self, "profiler_type", "")
    # NOTE(review): assumes "timer" is the baseline whose name must stay
    # unchanged for continuity with earlier results -- confirm.
    if profiler in ("", "timer"):
        return base
    return f"{base}, profiler {profiler}"
@@ -605,15 +652,18 @@ def bin_args(self) -> list[str]:
605
652
f"--size={ self .size } " ,
606
653
"--count=100" ,
607
654
"--withCopyOffload=0" ,
655
+ f"--profilerType={ self .profiler_type } " ,
608
656
]
609
657
610
658
611
659
class QueueMemcpy (ComputeBenchmark ):
612
def __init__(self, bench, source, destination, size, profiler_type):
    """Store the memcpy parameters, then initialize the base benchmark.

    Args:
        bench: Owning suite object, forwarded to the base initializer.
        source: Stored as ``self.source``.
        destination: Stored as ``self.destination``.
        size: Stored as ``self.size``.
        profiler_type: Forwarded to the base initializer by keyword.
    """
    # Attributes are assigned before delegating, matching the original
    # statement order.
    self.source, self.destination, self.size = source, destination, size
    super().__init__(
        bench,
        "memory_benchmark_sycl",
        "QueueMemcpy",
        profiler_type=profiler_type,
    )
617
667
618
668
def name(self):
    """Return this benchmark's name, distinguishing profiler variants.

    The suite creates one instance per profiler type with otherwise
    identical arguments, so the profiler has to appear in the name or the
    variants are indistinguishable.

    Returns:
        The original name when the profiler type is unset, empty or
        ``"timer"``; otherwise that name with ``", profiler <type>"``
        appended.
    """
    base = (
        f"memory_benchmark_sycl QueueMemcpy from {self.source} "
        f"to {self.destination}, size {self.size}"
    )
    # getattr: profiler_type is assigned by the base initializer, which this
    # class calls only after setting its own fields, so the attribute may
    # not exist yet if name() is invoked early.
    profiler = getattr(self, "profiler_type", "")
    # NOTE(review): assumes "timer" is the baseline whose name must stay
    # unchanged for continuity with earlier results -- confirm.
    if profiler in ("", "timer"):
        return base
    return f"{base}, profiler {profiler}"
@@ -636,6 +686,7 @@ def bin_args(self) -> list[str]:
636
686
f"--sourcePlacement={ self .source } " ,
637
687
f"--destinationPlacement={ self .destination } " ,
638
688
f"--size={ self .size } " ,
689
+ f"--profilerType={ self .profiler_type } " ,
639
690
]
640
691
641
692
@@ -858,6 +909,7 @@ def __init__(
858
909
inOrderQueue ,
859
910
numKernels ,
860
911
measureCompletionTime ,
912
+ profiler_type ,
861
913
useEvents ,
862
914
useHostTasks ,
863
915
):
@@ -873,7 +925,11 @@ def __init__(
873
925
self .use_events_str = f" with events" if self .useEvents else ""
874
926
self .host_tasks_str = f" use host tasks" if self .useHostTasks else ""
875
927
super ().__init__ (
876
- bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" , runtime
928
+ bench ,
929
+ f"graph_api_benchmark_{ runtime .value } " ,
930
+ "SubmitGraph" ,
931
+ runtime ,
932
+ profiler_type ,
877
933
)
878
934
879
935
def explicit_group (self ):
@@ -901,7 +957,7 @@ def get_tags(self):
901
957
]
902
958
903
959
def bin_args (self ) -> list [str ]:
904
- return [
960
+ bin_args = [
905
961
"--iterations=10000" ,
906
962
f"--NumKernels={ self .numKernels } " ,
907
963
f"--MeasureCompletionTime={ self .measureCompletionTime } " ,
@@ -912,14 +968,21 @@ def bin_args(self) -> list[str]:
912
968
"--UseExplicit=0" ,
913
969
f"--UseHostTasks={ self .useHostTasks } " ,
914
970
]
971
+ if self .runtime == RUNTIMES .SYCL :
972
+ bin_args .append (f"--profilerType={ self .profiler_type } " )
973
+ return bin_args
915
974
916
975
917
976
class UllsEmptyKernel (ComputeBenchmark ):
918
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
    """Store the work-group parameters, then initialize the base benchmark.

    Args:
        bench: Owning suite object, forwarded to the base initializer.
        runtime: Runtime enum member; its ``value`` is interpolated into the
            first name argument (``ulls_benchmark_<value>``) and the member
            itself is forwarded to the base initializer.
        wgc: Stored as ``self.wgc``.
        wgs: Stored as ``self.wgs``.
        profiler_type: Forwarded positionally to the base initializer.
    """
    # Attributes are assigned before delegating, matching the original
    # statement order.
    self.wgc = wgc
    self.wgs = wgs
    super().__init__(
        bench,
        # No whitespace after the interpolated value: the name must match
        # exactly.
        f"ulls_benchmark_{runtime.value}",
        "EmptyKernel",
        runtime,
        profiler_type,
    )
924
987
925
988
def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -943,11 +1006,14 @@ def get_tags(self):
943
1006
return [runtime_to_tag_name (self .runtime ), "micro" , "latency" , "submit" ]
944
1007
945
1008
def bin_args(self) -> list[str]:
    """Return the argument list for this benchmark.

    Returns:
        ``--iterations``, ``--wgs`` and ``--wgc`` always. ``--profilerType``
        is added only when the runtime is SYCL and a non-empty profiler
        type is set.
    """
    args = [
        "--iterations=10000",
        f"--wgs={self.wgs}",
        f"--wgc={self.wgc}",
    ]
    # The profiler flag is passed for the SYCL runtime only. An empty
    # profiler_type (the base-class default) is skipped so a value-less
    # "--profilerType=" is never emitted.
    if self.runtime == RUNTIMES.SYCL and self.profiler_type:
        args.append(f"--profilerType={self.profiler_type}")
    return args
951
1017
952
1018
953
1019
class UllsKernelSwitch (ComputeBenchmark ):
0 commit comments