Skip to content

Commit 8ec0f3d

Browse files
committed
[Benchmarks] Bump Compute Benchmarks
- Stabilize results with a small sleep between binary runs.
- Add combo-profiler functionality to allow choosing between time and CPU-count measurement.
1 parent 8a9b62a commit 8ec0f3d

File tree

1 file changed

+116
-50
lines changed

1 file changed

+116
-50
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 116 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def git_url(self) -> str:
5353
return "https://github.com/intel/compute-benchmarks.git"
5454

5555
def git_hash(self) -> str:
56-
return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
56+
return "4995560017559849a519e58978a0afdd55903e15"
5757

5858
def setup(self) -> None:
5959
if options.sycl is None:
@@ -173,6 +173,9 @@ def benchmarks(self) -> list[Benchmark]:
173173
# See SubmitKernel.enabled()
174174
long_kernel_exec_time_ooo = [20, 200]
175175

176+
# The Combo Profiler is available only for selected sycl benchmarks
177+
profiler_types = ["timer", "cpuCounter"]
178+
176179
for runtime in list(RUNTIMES):
177180
# Add SubmitKernel benchmarks using loops
178181
for in_order_queue in [0, 1]:
@@ -184,16 +187,18 @@ def benchmarks(self) -> list[Benchmark]:
184187
else long_kernel_exec_time_ooo
185188
)
186189
for kernel_exec_time in [1, *long_kernel_exec_time]:
187-
benches.append(
188-
SubmitKernel(
189-
self,
190-
runtime,
191-
in_order_queue,
192-
measure_completion,
193-
use_events,
194-
kernel_exec_time,
190+
for profiler_type in profiler_types:
191+
benches.append(
192+
SubmitKernel(
193+
self,
194+
runtime,
195+
in_order_queue,
196+
measure_completion,
197+
use_events,
198+
kernel_exec_time,
199+
profiler_type,
200+
)
195201
)
196-
)
197202

198203
# Add SinKernelGraph benchmarks
199204
for with_graphs in [0, 1]:
@@ -203,51 +208,69 @@ def benchmarks(self) -> list[Benchmark]:
203208
)
204209

205210
# Add ULLS benchmarks
206-
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
211+
for profiler_type in profiler_types:
212+
benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
207213
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
208214

209215
# Add GraphApiSubmitGraph benchmarks
210216
for in_order_queue in [0, 1]:
211-
benches.append(
212-
GraphApiSubmitGraph(
213-
self,
214-
runtime,
215-
in_order_queue,
216-
self.submit_graph_num_kernels[-1],
217-
0,
218-
useEvents=0,
219-
useHostTasks=1,
217+
for profiler_type in profiler_types:
218+
benches.append(
219+
GraphApiSubmitGraph(
220+
self,
221+
runtime,
222+
in_order_queue,
223+
self.submit_graph_num_kernels[-1],
224+
0,
225+
profiler_type,
226+
useEvents=0,
227+
useHostTasks=1,
228+
)
220229
)
221-
)
222230
for num_kernels in self.submit_graph_num_kernels:
223231
for measure_completion_time in [0, 1]:
224232
for use_events in [0, 1]:
225-
benches.append(
226-
GraphApiSubmitGraph(
227-
self,
228-
runtime,
229-
in_order_queue,
230-
num_kernels,
231-
measure_completion_time,
232-
use_events,
233-
useHostTasks=0,
233+
for profiler_type in profiler_types:
234+
benches.append(
235+
GraphApiSubmitGraph(
236+
self,
237+
runtime,
238+
in_order_queue,
239+
num_kernels,
240+
measure_completion_time,
241+
profiler_type,
242+
use_events,
243+
useHostTasks=0,
244+
)
234245
)
235-
)
236246

237247
# Add other benchmarks
238248
benches += [
239-
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
240-
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
241-
QueueMemcpy(self, "Device", "Device", 1024),
242249
StreamMemory(self, "Triad", 10 * 1024, "Device"),
243-
ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
244-
ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
245250
VectorSum(self),
246251
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
247252
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
248253
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
249254
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
250255
]
256+
for profiler_type in profiler_types:
257+
benches.append(
258+
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
259+
)
260+
benches.append(
261+
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
262+
)
263+
benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
264+
benches.append(
265+
ExecImmediateCopyQueue(
266+
self, 0, 1, "Device", "Device", 1024, profiler_type
267+
)
268+
)
269+
benches.append(
270+
ExecImmediateCopyQueue(
271+
self, 1, 1, "Device", "Host", 1024, profiler_type
272+
)
273+
)
251274

252275
# Add UR-specific benchmarks
253276
benches += [
@@ -295,12 +318,15 @@ def parse_unit_type(compute_unit):
295318

296319

297320
class ComputeBenchmark(Benchmark):
298-
def __init__(self, bench, name, test, runtime: RUNTIMES = None):
321+
def __init__(
322+
self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
323+
):
299324
super().__init__(bench.directory, bench)
300325
self.bench = bench
301326
self.bench_name = name
302327
self.test = test
303328
self.runtime = runtime
329+
self.profiler_type = profiler_type
304330

305331
def supported_runtimes(self) -> list[RUNTIMES]:
306332
"""Base runtimes supported by this benchmark, can be overridden."""
@@ -428,14 +454,19 @@ def __init__(
428454
MeasureCompletion=0,
429455
UseEvents=0,
430456
KernelExecTime=1,
457+
profiler_type="",
431458
):
432459
self.ioq = ioq
433460
self.MeasureCompletion = MeasureCompletion
434461
self.UseEvents = UseEvents
435462
self.KernelExecTime = KernelExecTime
436463
self.NumKernels = 10
437464
super().__init__(
438-
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
465+
bench,
466+
f"api_overhead_benchmark_{runtime.value}",
467+
"SubmitKernel",
468+
runtime,
469+
profiler_type,
439470
)
440471

441472
def supported_runtimes(self) -> list[RUNTIMES]:
@@ -504,7 +535,7 @@ def range(self) -> tuple[float, float]:
504535
return (0.0, None)
505536

506537
def bin_args(self) -> list[str]:
507-
return [
538+
bin_args = [
508539
f"--Ioq={self.ioq}",
509540
f"--MeasureCompletion={self.MeasureCompletion}",
510541
"--iterations=100000",
@@ -513,6 +544,9 @@ def bin_args(self) -> list[str]:
513544
f"--KernelExecTime={self.KernelExecTime}",
514545
f"--UseEvents={self.UseEvents}",
515546
]
547+
if self.runtime == RUNTIMES.SYCL:
548+
bin_args.append(f"--profilerType={self.profiler_type}")
549+
return bin_args
516550

517551
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
518552
metadata_dict = super().get_metadata()
@@ -532,13 +566,20 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
532566

533567

534568
class ExecImmediateCopyQueue(ComputeBenchmark):
535-
def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
569+
def __init__(
570+
self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
571+
):
536572
self.ioq = ioq
537573
self.isCopyOnly = isCopyOnly
538574
self.source = source
539575
self.destination = destination
540576
self.size = size
541-
super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
577+
super().__init__(
578+
bench,
579+
"api_overhead_benchmark_sycl",
580+
"ExecImmediateCopyQueue",
581+
profiler_type=profiler_type,
582+
)
542583

543584
def name(self):
544585
order = "in order" if self.ioq else "out of order"
@@ -569,16 +610,22 @@ def bin_args(self) -> list[str]:
569610
f"--dst={self.destination}",
570611
f"--size={self.size}",
571612
"--withCopyOffload=0",
613+
f"--profilerType={self.profiler_type}",
572614
]
573615

574616

575617
class QueueInOrderMemcpy(ComputeBenchmark):
576-
def __init__(self, bench, isCopyOnly, source, destination, size):
618+
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
577619
self.isCopyOnly = isCopyOnly
578620
self.source = source
579621
self.destination = destination
580622
self.size = size
581-
super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
623+
super().__init__(
624+
bench,
625+
"memory_benchmark_sycl",
626+
"QueueInOrderMemcpy",
627+
profiler_type=profiler_type,
628+
)
582629

583630
def name(self):
584631
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -605,15 +652,18 @@ def bin_args(self) -> list[str]:
605652
f"--size={self.size}",
606653
"--count=100",
607654
"--withCopyOffload=0",
655+
f"--profilerType={self.profiler_type}",
608656
]
609657

610658

611659
class QueueMemcpy(ComputeBenchmark):
612-
def __init__(self, bench, source, destination, size):
660+
def __init__(self, bench, source, destination, size, profiler_type):
613661
self.source = source
614662
self.destination = destination
615663
self.size = size
616-
super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
664+
super().__init__(
665+
bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
666+
)
617667

618668
def name(self):
619669
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -636,6 +686,7 @@ def bin_args(self) -> list[str]:
636686
f"--sourcePlacement={self.source}",
637687
f"--destinationPlacement={self.destination}",
638688
f"--size={self.size}",
689+
f"--profilerType={self.profiler_type}",
639690
]
640691

641692

@@ -858,6 +909,7 @@ def __init__(
858909
inOrderQueue,
859910
numKernels,
860911
measureCompletionTime,
912+
profiler_type,
861913
useEvents,
862914
useHostTasks,
863915
):
@@ -873,7 +925,11 @@ def __init__(
873925
self.use_events_str = f" with events" if self.useEvents else ""
874926
self.host_tasks_str = f" use host tasks" if self.useHostTasks else ""
875927
super().__init__(
876-
bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
928+
bench,
929+
f"graph_api_benchmark_{runtime.value}",
930+
"SubmitGraph",
931+
runtime,
932+
profiler_type,
877933
)
878934

879935
def explicit_group(self):
@@ -901,7 +957,7 @@ def get_tags(self):
901957
]
902958

903959
def bin_args(self) -> list[str]:
904-
return [
960+
bin_args = [
905961
"--iterations=10000",
906962
f"--NumKernels={self.numKernels}",
907963
f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -912,14 +968,21 @@ def bin_args(self) -> list[str]:
912968
"--UseExplicit=0",
913969
f"--UseHostTasks={self.useHostTasks}",
914970
]
971+
if self.runtime == RUNTIMES.SYCL:
972+
bin_args.append(f"--profilerType={self.profiler_type}")
973+
return bin_args
915974

916975

917976
class UllsEmptyKernel(ComputeBenchmark):
918-
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
977+
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
919978
self.wgc = wgc
920979
self.wgs = wgs
921980
super().__init__(
922-
bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
981+
bench,
982+
f"ulls_benchmark_{runtime.value}",
983+
"EmptyKernel",
984+
runtime,
985+
profiler_type,
923986
)
924987

925988
def supported_runtimes(self) -> list[RUNTIMES]:
@@ -943,11 +1006,14 @@ def get_tags(self):
9431006
return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]
9441007

9451008
def bin_args(self) -> list[str]:
946-
return [
1009+
bin_args = [
9471010
"--iterations=10000",
9481011
f"--wgs={self.wgs}",
9491012
f"--wgc={self.wgc}",
9501013
]
1014+
if self.runtime == RUNTIMES.SYCL:
1015+
bin_args.append(f"--profilerType={self.profiler_type}")
1016+
return bin_args
9511017

9521018

9531019
class UllsKernelSwitch(ComputeBenchmark):

0 commit comments

Comments (0)