-
Notifications
You must be signed in to change notification settings - Fork 721
/
num-workers-bench.py
executable file
·62 lines (49 loc) · 1.6 KB
/
num-workers-bench.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
"""
This benchmark shows that num_workers>0 leads to better performance
usage:
./num-workers-bench.py
"""
import torch
import time
class MyDataset(torch.utils.data.Dataset):
    """Synthetic map-style dataset that emulates a slow per-item transform.

    Every index returns the same pre-allocated tensor of ones; the only cost
    of fetching an item is the artificial sleep in ``__getitem__``.

    Args:
        num_samples: value reported by ``len()``.
        numel: number of elements in the tensor returned for every item.
        transform_delay: seconds to sleep on every ``__getitem__`` call.
    """

    def __init__(
        self,
        num_samples: int = 1000,
        numel: int = 2**18,
        transform_delay: float = 0.005,
    ) -> None:
        super().__init__()
        self.num_samples = num_samples
        self.transform_delay = transform_delay
        # with the defaults (2**18 elements, torch's default float32 dtype)
        # this is a 1 MiB tensor
        self.tensor = torch.ones(numel)

    def __len__(self) -> int:
        """Return the configured number of samples."""
        return self.num_samples

    def __getitem__(self, idx: int) -> torch.Tensor:
        """Sleep for ``transform_delay`` seconds, then return the shared tensor.

        ``idx`` is ignored: all items are the same tensor object.
        """
        # emulate a slow data transform
        time.sleep(self.transform_delay)
        return self.tensor
# Benchmark configuration.
num_runs = 10  # full passes over the dataset per num_workers setting
batch_size = 100
compute_emulation_time = 0.2  # seconds of fake "compute" per batch
device = "cuda:0"
# num_workers values to compare; 0 means the data is loaded in the main process
worker_counts = range(5)


def _average_epoch_overhead(ds, num_workers, start_event, end_event):
    """Return the average time in seconds of one pass over ``ds``.

    The emulated compute sleep is subtracted from each pass, so the result
    approximates the time spent on data loading and host-to-device copies.

    Args:
        ds: dataset to iterate over.
        num_workers: ``num_workers`` argument passed to the ``DataLoader``.
        start_event: timing-enabled CUDA event recorded before each pass.
        end_event: timing-enabled CUDA event recorded after each pass.
    """
    dl = torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=num_workers,
    )

    duration = 0
    for _ in range(num_runs):
        slept_time = 0
        start_event.record()
        for batch in dl:
            batch = batch.to(device=device, non_blocking=True)
            # emulate a compute delay to give workers a chance to reload,
            # otherwise the benchmark will be measuring waiting for workers
            time.sleep(compute_emulation_time)
            # will then subtract this artificial delay from the total to try
            # to isolate the iterator's overhead
            slept_time += compute_emulation_time
        end_event.record()
        # wait for the recorded events to complete before reading the timing
        torch.cuda.synchronize()
        # elapsed_time() is in milliseconds; convert to seconds
        duration += start_event.elapsed_time(end_event) / 1000 - slept_time
    return duration / num_runs


def main():
    """Run the benchmark for each ``num_workers`` value and print the averages."""
    if not torch.cuda.is_available():
        raise SystemExit("this benchmark requires a CUDA device")

    ds = MyDataset()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    for num_workers in worker_counts:
        duration = _average_epoch_overhead(ds, num_workers, start_event, end_event)
        print(f"num_workers={num_workers}: average time: {duration:0.3f}")


# The guard matters for num_workers > 0: when worker processes are started
# with a method that re-imports the main module (e.g. spawn), unguarded
# top-level code would re-run the whole benchmark inside every worker.
if __name__ == "__main__":
    main()