Skip to content
This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit 263043b

Browse files
Author: DEKHTIARJonathan (committed)
Commit message: Improving Logging with Dequeue timing
1 parent 46393fa commit 263043b

File tree

1 file changed: +41 additions, -17 deletions

tftrt/examples/benchmark_runner.py

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -374,11 +374,15 @@ def infer_batch(x):
374374

375375
iter_times = []
376376
memcopy_times = []
377+
dequeue_times = []
377378

378-
def log_step(step_idx, display_every, iter_time, memcpyHtoD_time):
379+
def log_step(step_idx, display_every, iter_time, memcpyHtoD_time, dequeue_time):
379380
if step_idx % display_every == 0:
380381
print(
381-
f" step {step_idx:04d}, iter_time(ms)={iter_time:.3f}, memcpyHtoD_time(ms)={memcpyHtoD_time:.3f}"
382+
f"step {step_idx:04d}, "
383+
f"iter_time(ms)={iter_time:08.3f}, "
384+
f"memcpyHtoD_time(ms)={memcpyHtoD_time:08.3f}, "
385+
f"dequeue_time(ms)={dequeue_time:08.3f}"
382386
)
383387

384388
dataset = timed_dataset(
@@ -401,7 +405,17 @@ def force_data_on_gpu(data, device="/gpu:0"):
401405
output_data = tf.identity(data)
402406
return output_data
403407

404-
for step_idx, data_batch in enumerate(dataset):
408+
step_idx = 0
409+
ds_iter = iter(dataset)
410+
411+
while True:
412+
413+
try:
414+
start_time = time.time()
415+
data_batch = next(ds_iter)
416+
dequeue_times.append(time.time() - start_time)
417+
except:
418+
break
405419

406420
start_time = time.time()
407421
data_batch = force_data_on_gpu(data_batch)
@@ -418,11 +432,13 @@ def force_data_on_gpu(data, device="/gpu:0"):
418432
step_idx + 1,
419433
display_every=self._args.display_every,
420434
iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
421-
memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
435+
memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000,
436+
dequeue_time=np.mean(dequeue_times[-self._args.display_every:]) * 1000
422437
)
423438
else:
424-
print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s")
425-
print(f"{'MemCopyHtoD Iteration Time':18s}: {iter_times[-1]:.4f}s")
439+
print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:08.4f}s")
440+
print(f"{'Data MemCopyHtoD Time':18s}: {memcpyHtoD_time[-1]:08.4f}s")
441+
print(f"{'Data Dequeue Time':18s}: {dequeue_times[-1]:08.4f}s")
426442

427443
if not self._args.use_synthetic_data:
428444
data_aggregator.aggregate_data(y_pred, y)
@@ -431,6 +447,8 @@ def force_data_on_gpu(data, device="/gpu:0"):
431447
step_idx + 1 >= self._args.num_iterations):
432448
break
433449

450+
step_idx += 1
451+
434452
if (
435453
not self._args.debug_performance and
436454
step_idx % self._args.display_every != 0
@@ -439,7 +457,8 @@ def force_data_on_gpu(data, device="/gpu:0"):
439457
step_idx + 1,
440458
display_every=1, # force print
441459
iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
442-
memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
460+
memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000,
461+
dequeue_time=np.mean(dequeue_times[-self._args.display_every:]) * 1000
443462
)
444463

445464
with timed_section("Metric Computation"):
@@ -458,14 +477,18 @@ def force_data_on_gpu(data, device="/gpu:0"):
458477
)
459478

460479
# Skipping last batch. Might have different batch_size
461-
run_times = np.array(iter_times)
462-
run_times = run_times[self._args.num_warmup_iterations:-1]
463-
mem_times = np.array(memcopy_times)
464-
mem_times = mem_times[self._args.num_warmup_iterations:-1]
480+
iter_times = np.array(iter_times)
481+
iter_times = iter_times[self._args.num_warmup_iterations:-1]
482+
483+
memcopy_times = np.array(memcopy_times)
484+
memcopy_times = memcopy_times[self._args.num_warmup_iterations:-1]
485+
486+
dequeue_times = np.array(dequeue_times)
487+
dequeue_times = dequeue_times[self._args.num_warmup_iterations:-1]
465488

466489
metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
467-
metrics['Throughput (samples/sec)'] = np.mean(
468-
self._args.batch_size / run_times
490+
metrics['Throughput (samples/sec)'] = (
491+
self._args.batch_size / np.mean(iter_times)
469492
)
470493

471494
def timing_metrics(time_arr, log_prefix):
@@ -479,16 +502,17 @@ def timing_metrics(time_arr, log_prefix):
479502
data[f"{log_prefix} Max (ms)"] = np.max(time_arr) * 1000
480503
return data
481504

482-
metrics.update(timing_metrics(run_times, "GPU Latency"))
483-
metrics.update(timing_metrics(mem_times, "MemCopyHtoD Time"))
505+
metrics.update(timing_metrics(iter_times, "GPU Latency"))
506+
metrics.update(timing_metrics(dequeue_times, "Data Batch Dequeue Time"))
507+
metrics.update(timing_metrics(memcopy_times, "Data MemCopyHtoD Time"))
484508

485509
self._export_runtime_metrics_to_json(metrics)
486510

487511
def log_value(key, val):
488512
if isinstance(val, int):
489-
print(f"- {key:40s}: {val}")
513+
print(f"- {key:45s}: {val}")
490514
else:
491-
print(f"- {key:40s}: {val:.2f}")
515+
print(f"- {key:45s}: {val:.2f}")
492516

493517
for key, val in sorted(metrics.items()):
494518
if isinstance(val, dict):

0 commit comments

Comments (0)