@@ -374,11 +374,15 @@ def infer_batch(x):
374
374
375
375
iter_times = []
376
376
memcopy_times = []
377
+ dequeue_times = []
377
378
378
- def log_step (step_idx , display_every , iter_time , memcpyHtoD_time ):
379
+ def log_step (step_idx , display_every , iter_time , memcpyHtoD_time , dequeue_time ):
379
380
if step_idx % display_every == 0 :
380
381
print (
381
- f" step { step_idx :04d} , iter_time(ms)={ iter_time :.3f} , memcpyHtoD_time(ms)={ memcpyHtoD_time :.3f} "
382
+ f"step { step_idx :04d} , "
383
+ f"iter_time(ms)={ iter_time :08.3f} , "
384
+ f"memcpyHtoD_time(ms)={ memcpyHtoD_time :08.3f} , "
385
+ f"dequeue_time(ms)={ dequeue_time :08.3f} "
382
386
)
383
387
384
388
dataset = timed_dataset (
@@ -401,7 +405,17 @@ def force_data_on_gpu(data, device="/gpu:0"):
401
405
output_data = tf .identity (data )
402
406
return output_data
403
407
404
- for step_idx , data_batch in enumerate (dataset ):
408
+ step_idx = 0
409
+ ds_iter = iter (dataset )
410
+
411
+ while True :
412
+
413
+ try :
414
+ start_time = time .time ()
415
+ data_batch = next (ds_iter )
416
+ dequeue_times .append (time .time () - start_time )
417
+ except StopIteration : # dataset exhausted; any other error must propagate instead of silently ending the run
418
+ break
405
419
406
420
start_time = time .time ()
407
421
data_batch = force_data_on_gpu (data_batch )
@@ -418,11 +432,13 @@ def force_data_on_gpu(data, device="/gpu:0"):
418
432
step_idx + 1 ,
419
433
display_every = self ._args .display_every ,
420
434
iter_time = np .mean (iter_times [- self ._args .display_every :]) * 1000 ,
421
- memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000
435
+ memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000 ,
436
+ dequeue_time = np .mean (dequeue_times [- self ._args .display_every :]) * 1000
422
437
)
423
438
else :
424
- print (f"{ 'GPU Iteration Time' :18s} : { iter_times [- 1 ]:.4f} s" )
425
- print (f"{ 'MemCopyHtoD Iteration Time' :18s} : { iter_times [- 1 ]:.4f} s" )
439
+ print (f"{ 'GPU Iteration Time' :18s} : { iter_times [- 1 ]:08.4f} s" )
440
+ print (f"{ 'Data MemCopyHtoD Time' :18s} : { memcopy_times [- 1 ]:08.4f} s" )
441
+ print (f"{ 'Data Dequeue Time' :18s} : { dequeue_times [- 1 ]:08.4f} s" )
426
442
427
443
if not self ._args .use_synthetic_data :
428
444
data_aggregator .aggregate_data (y_pred , y )
@@ -431,6 +447,8 @@ def force_data_on_gpu(data, device="/gpu:0"):
431
447
step_idx + 1 >= self ._args .num_iterations ):
432
448
break
433
449
450
+ step_idx += 1
451
+
434
452
if (
435
453
not self ._args .debug_performance and
436
454
step_idx % self ._args .display_every != 0
@@ -439,7 +457,8 @@ def force_data_on_gpu(data, device="/gpu:0"):
439
457
step_idx + 1 ,
440
458
display_every = 1 , # force print
441
459
iter_time = np .mean (iter_times [- self ._args .display_every :]) * 1000 ,
442
- memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000
460
+ memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000 ,
461
+ dequeue_time = np .mean (dequeue_times [- self ._args .display_every :]) * 1000
443
462
)
444
463
445
464
with timed_section ("Metric Computation" ):
@@ -458,14 +477,18 @@ def force_data_on_gpu(data, device="/gpu:0"):
458
477
)
459
478
460
479
# Skipping last batch. Might have different batch_size
461
- run_times = np .array (iter_times )
462
- run_times = run_times [self ._args .num_warmup_iterations :- 1 ]
463
- mem_times = np .array (memcopy_times )
464
- mem_times = mem_times [self ._args .num_warmup_iterations :- 1 ]
480
+ total_gpu_time = np .sum (iter_times ) # whole run: taken before the warmup/last-batch trimming below
481
+ iter_times = np .array (iter_times )[self ._args .num_warmup_iterations :- 1 ]
482
+
483
+ memcopy_times = np .array (memcopy_times )
484
+ memcopy_times = memcopy_times [self ._args .num_warmup_iterations :- 1 ]
485
+
486
+ dequeue_times = np .array (dequeue_times )
487
+ dequeue_times = dequeue_times [self ._args .num_warmup_iterations :- 1 ]
465
488
466
489
metrics ['Total GPU Time (s)' ] = int (np .ceil (total_gpu_time ))
467
- metrics ['Throughput (samples/sec)' ] = np . mean (
468
- self ._args .batch_size / run_times
490
+ metrics ['Throughput (samples/sec)' ] = (
491
+ self ._args .batch_size / np . mean ( iter_times )
469
492
)
470
493
471
494
def timing_metrics (time_arr , log_prefix ):
@@ -479,16 +502,17 @@ def timing_metrics(time_arr, log_prefix):
479
502
data [f"{ log_prefix } Max (ms)" ] = np .max (time_arr ) * 1000
480
503
return data
481
504
482
- metrics .update (timing_metrics (run_times , "GPU Latency" ))
483
- metrics .update (timing_metrics (mem_times , "MemCopyHtoD Time" ))
505
+ metrics .update (timing_metrics (iter_times , "GPU Latency" ))
506
+ metrics .update (timing_metrics (dequeue_times , "Data Batch Dequeue Time" ))
507
+ metrics .update (timing_metrics (memcopy_times , "Data MemCopyHtoD Time" ))
484
508
485
509
self ._export_runtime_metrics_to_json (metrics )
486
510
487
511
def log_value (key , val ):
488
512
if isinstance (val , int ):
489
- print (f"- { key :40s } : { val } " )
513
+ print (f"- { key :45s } : { val } " )
490
514
else :
491
- print (f"- { key :40s } : { val :.2f} " )
515
+ print (f"- { key :45s } : { val :.2f} " )
492
516
493
517
for key , val in sorted (metrics .items ()):
494
518
if isinstance (val , dict ):
0 commit comments