
Commit f67ee85

SSYernar authored and facebook-github-bot committed
Operator-level microbenchmarking (#3154)
Summary:
This change introduces operator-level microbenchmarking for PyTorch operators. Since we need to capture and measure each operator call that happens under the hood of PyTorch, we use `torch.profiler.profile`. Example operators are `aten::mm`, `aten::sigmoid`, `cudaLaunchKernel`, etc.

Use `--benchmark_operators` to enable operator-level benchmarking. Use `--limit_operator_results` to specify how many of the top-runtime operators to report. Use `--target_operators` to restrict benchmarking to a given list of PyTorch operators.

Example output:

```
TrainPipelineSparseDist | Malloc retries (P50/P90/P100): 0.0 / 0.0 / 0.0 | Runtime (P90): 442.08 ms | Peak Memory alloc (P90): 24.23 GB | Peak Memory reserved (P90): 26.21 GB
operator_aten::copy_ | Malloc retries (P50/P90/P100): -1.0 / -1.0 / -1.0 | Runtime (P90): 39.21 ms | Peak Memory alloc (P90): 0.00 GB | Peak Memory reserved (P90): -0.00 GB
...
```

Differential Revision: D77676673
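The following is a minimal, standalone sketch (not part of this commit) of the profiling mechanism the summary describes: run the workload under `torch.profiler.profile` and aggregate per-operator self time from `key_averages()`. The toy model and inputs are made up for illustration only.

```python
import torch

# Hypothetical model/inputs purely for illustration.
model = torch.nn.Sequential(torch.nn.Linear(128, 64), torch.nn.Sigmoid())
inputs = torch.randn(32, 128)

activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(torch.profiler.ProfilerActivity.CUDA)
    model, inputs = model.cuda(), inputs.cuda()

with torch.profiler.profile(activities=activities, profile_memory=True) as prof:
    model(inputs)

# Aggregate self time per operator key (e.g. aten::mm, aten::sigmoid) in ms,
# mirroring how the new benchmark_operators helper collects its statistics.
per_op_ms = {
    evt.key: evt.self_device_time_total / 1e3 for evt in prof.key_averages()
}
for op, ms in sorted(per_op_ms.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(f"{op}: {ms:.3f} ms")
```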
1 parent d7c7098 commit f67ee85

File tree

2 files changed: +101 -3 lines changed


torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 24 additions & 3 deletions
```diff
@@ -40,7 +40,11 @@
     TestTowerCollectionSparseNNConfig,
     TestTowerSparseNNConfig,
 )
-from torchrec.distributed.benchmark.benchmark_utils import benchmark_func, cmd_conf
+from torchrec.distributed.benchmark.benchmark_utils import (
+    benchmark_func,
+    benchmark_operators,
+    cmd_conf,
+)
 from torchrec.distributed.comm import get_local_size
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
 from torchrec.distributed.planner import Topology
@@ -110,6 +114,9 @@ class RunOptions:
     sparse_lr: float = 0.1
     sparse_momentum: Optional[float] = None
     sparse_weight_decay: Optional[float] = None
+    benchmark_operators: bool = False
+    target_operators: Optional[List[str]] = None
+    limit_operator_results: int = 10


 @dataclass
@@ -379,10 +386,11 @@ def _func_to_benchmark(
         if jit_suffix
         else type(pipeline).__name__
     )
+
    result = benchmark_func(
        name=name,
-        bench_inputs=bench_inputs,  # pyre-ignore
-        prof_inputs=bench_inputs,  # pyre-ignore
+        bench_inputs=bench_inputs,  # pyre-ignore[6]
+        prof_inputs=bench_inputs,  # pyre-ignore[6]
        num_benchmarks=5,
        num_profiles=2,
        profile_dir=run_option.profile,
@@ -393,6 +401,19 @@ def _func_to_benchmark(
    )
    results.append(result)

+    if run_option.benchmark_operators:
+        op_results = benchmark_operators(
+            func_to_benchmark=pipeline,
+            bench_inputs=bench_inputs,
+            num_benchmarks=5,
+            device_type="cuda",
+            target_operators=run_option.target_operators,
+            is_pipeline=True,
+            rank=rank,
+            limit_results=run_option.limit_operator_results,
+        )
+        results.extend(op_results)
+
    if rank == 0:
        for result in results:
            print(result)
```
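A hedged sketch of how the new fields might be set programmatically, assuming the remaining `RunOptions` dataclass fields all carry defaults (only the three fields added in this commit are shown; everything else is left at its default):

```python
from torchrec.distributed.benchmark.benchmark_train_pipeline import RunOptions

# Enable operator-level benchmarking; the same options are exposed on the
# command line as --benchmark_operators, --target_operators, and
# --limit_operator_results per the commit summary.
run_option = RunOptions(
    benchmark_operators=True,
    target_operators=["aten::mm", "aten::copy_"],  # None benchmarks all operators
    limit_operator_results=10,
)
```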

torchrec/distributed/benchmark/benchmark_utils.py

Lines changed: 77 additions & 0 deletions
```diff
@@ -905,6 +905,83 @@ def trace_handler(prof) -> None:
     )


+def benchmark_operators(
+    func_to_benchmark: Any,  # pyre-ignore[2]
+    bench_inputs: List[Any],  # pyre-ignore[2]
+    num_benchmarks: int,
+    device_type: str = "cuda",
+    target_operators: Optional[List[str]] = None,
+    is_pipeline: bool = False,
+    rank: int = -1,
+    limit_results: int = 10,
+) -> List[BenchmarkResult]:
+    activities = [torch.profiler.ProfilerActivity.CPU]
+    if device_type == "cuda":
+        activities.append(torch.profiler.ProfilerActivity.CUDA)
+
+    results = []
+    elapsed_times = {}
+    peak_memory_usage = {}
+
+    for _ in range(num_benchmarks):
+        with torch.profiler.profile(
+            activities=activities,
+            record_shapes=True,
+            profile_memory=True,
+            with_stack=True,
+            with_flops=True,
+            with_modules=True,
+        ) as prof:
+            if is_pipeline:
+                dataloader = iter(bench_inputs)
+                while True:
+                    try:
+                        func_to_benchmark.progress(dataloader)
+                    except StopIteration:
+                        break
+            else:
+                for bench_input in bench_inputs:
+                    func_to_benchmark(bench_input)
+
+        for evt in prof.key_averages():
+            if evt.key not in elapsed_times:
+                elapsed_times[evt.key] = []
+                peak_memory_usage[evt.key] = 0
+
+            elapsed_times[evt.key].append(evt.self_device_time_total / 1e3)
+            peak_memory_usage[evt.key] = max(
+                peak_memory_usage[evt.key], evt.self_device_memory_usage
+            )
+
+    for op in elapsed_times:
+        if target_operators is not None and op not in target_operators:
+            continue
+
+        mem_stats = [
+            MemoryStats(
+                rank=rank,
+                malloc_retries=-1,  # Not supported in profiler
+                max_mem_allocated_mbs=peak_memory_usage[op] / 1024 / 1024,
+                max_mem_reserved_mbs=-1,  # Not supported in profiler
+            )
+        ]
+
+        results.append(
+            BenchmarkResult(
+                short_name=f"operator_{op}",
+                elapsed_time=torch.tensor(elapsed_times[op], dtype=torch.float),
+                mem_stats=mem_stats,
+                rank=rank,
+            )
+        )
+
+    sorted_results = sorted(
+        results, key=lambda x: x.runtime_percentile(90), reverse=True
+    )
+
+    return sorted_results[:limit_results]
+
+
 def benchmark_type_name(compile_mode: CompileMode, sharding_type: ShardingType) -> str:
     if sharding_type == ShardingType.TABLE_WISE:
         name = "tw-sharded"
```
