#!/bin/bash
#
# Script to launch a multi-gpu distributed training using MONAI Core
# on UF HiperGator's AI partition, a SLURM cluster using Singularity
# as container runtime.
#
# This script uses `pt_multinode_helper_funcs.sh` and either
# `run_on_node.sh` (for single-node multi-gpu training) or
# `run_on_multinode.sh` (for multi-node multi-gpu training). All
# three `.sh` files are in `monaicore_multigpu/util_multigpu`.
#
# We use torch.distributed.launch to launch the training, so please
# set the SBATCH directives as follows:
#   set #SBATCH --ntasks to the same value as --nodes
#   set #SBATCH --ntasks-per-node=1
#   set #SBATCH --gpus to the total number of processes to run across all nodes
#   set #SBATCH --gpus-per-task to --gpus divided by --ntasks
#
# For multi-node training, replace `run_on_node.sh` in
# `PT_LAUNCH_SCRIPT=$(realpath "${PT_LAUNCH_UTILS_PATH}/run_on_node.sh")`
# with `run_on_multinode.sh`; example settings are sketched below.
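#
# For example (illustrative values only, not from the original script),
# a 2-node job with 8 GPUs per node would follow the rules above as:
#   #SBATCH --nodes=2
#   #SBATCH --ntasks=2
#   #SBATCH --ntasks-per-node=1
#   #SBATCH --gpus=16
#   #SBATCH --gpus-per-task=8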
#
# Modify the paths below to match your own setup.
#
# (c) 2021, Brian J. Stucky, UF Research Computing
# 2022, modified by Huiwen Ju, [email protected]
# Resource allocation.
#SBATCH --wait-all-nodes=1
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus=8
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-task=8
#SBATCH --mem=200gb
#SBATCH --partition=hpg-ai
#SBATCH --exclude=c0906a-s29,c1101a-s29,c1101a-s23,c1004a-s23,c1103a-s17
#SBATCH --exclusive
#SBATCH --time=2:00:00
#SBATCH --output=%x.%j.out
module load singularity
export NCCL_DEBUG=INFO
# TORCH_DISTRIBUTED_DEBUG can be set to OFF (default), INFO, or DETAIL
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export NCCL_ASYNC_ERROR_HANDLING=1
# Training command specification: training_script -args.
# TRAINING_SCRIPT="$(realpath "$HOME/monai_uf_tutorials/monaicore_multigpu/unet_ddp/unet_training_ddp.py")"
TRAINING_SCRIPT="$(realpath "$HOME/monai_uf_tutorials/pretrain_modify/main.py")"
# 1. train from scratch (no --use_checkpoint, use --noamp)
# 8 GPUs, batch_size = 1: 722 / 8 ≈ 90 iters/epoch; val: 49 images
TRAINING_CMD="$TRAINING_SCRIPT \
--distributed \
--logdir=/mnt \
--roi_x=128 --roi_y=128 --roi_z=128 \
--lrdecay --lr=6e-6 --decay=0.1 \
--batch_size=1 \
--epochs=3 --num_steps=270 --eval_num=90 \
--noamp"
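# With ~90 iters/epoch, --num_steps=270 matches the 3 epochs above and
# --eval_num=90 runs validation roughly once per epoch (my reading of
# these flags; check main.py for their exact semantics).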
# 2. train from scratch (with amp, without checkpointing gradients)
# TRAINING_CMD="$TRAINING_SCRIPT \
# --distributed \
# --logdir=/mnt \
# --roi_x=128 --roi_y=128 --roi_z=128 \
# --lrdecay --lr=6e-6 --decay=0.1 \
# --batch_size=1 \
# --epochs=3 --num_steps=270 --eval_num=90"
# 3. NOT WORKING with the default settings! train from scratch with CacheDataset
#    (increase #SBATCH --mem, the real memory required per node; see the note after this block)
# TRAINING_CMD="$TRAINING_SCRIPT \
# --distributed \
# --logdir=/mnt \
# --roi_x=128 --roi_y=128 --roi_z=128 \
# --lrdecay --lr=6e-6 --decay=0.1 \
# --batch_size=1 \
# --epochs=3 --num_steps=270 --eval_num=90 \
# --noamp \
# --cache_dataset"
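# Note on option 3 (a suggestion, not verified on this cluster):
# CacheDataset caches the whole training set in host RAM, so the
# #SBATCH --mem request in the header above likely needs to be raised,
# e.g. `#SBATCH --mem=0` asks SLURM for all of the memory on the node.
# Remember that #SBATCH directives only take effect in the script header.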
# 4. NOT WORKING with the current main.py (more code needs to be added)! train from
#    scratch with SmartCacheDataset (increase #SBATCH --mem, the real memory required per node)
# TRAINING_CMD="$TRAINING_SCRIPT \
# --distributed \
# --logdir=/mnt \
# --roi_x=128 --roi_y=128 --roi_z=128 \
# --lrdecay --lr=6e-6 --decay=0.1 \
# --batch_size=1 \
# --epochs=3 --num_steps=270 --eval_num=90 \
# --noamp \
# --smartcache_dataset"
# 5. resume training from downloaded pretrained model
# TRAINING_CMD="$TRAINING_SCRIPT \
# --distributed \
# --logdir=/mnt \
# --roi_x=128 --roi_y=128 --roi_z=128 \
# --lrdecay --lr=6e-6 --decay=0.1 \
# --batch_size=1 \
# --epochs=3 --num_steps=270 --eval_num=90 \
# --noamp \
# --resume=/mnt/pretrained_models/model_swinvit.pt"
# 6. resume training from a checkpoint
# TRAINING_CMD="$TRAINING_SCRIPT \
# --distributed \
# --logdir=/mnt \
# --roi_x=128 --roi_y=128 --roi_z=128 \
# --lrdecay --lr=6e-6 --decay=0.1 \
# --batch_size=1 \
# --epochs=3 --num_steps=270 --eval_num=90 \
# --noamp \
# --resume=/mnt/runs/model_bestValRMSE.pt"
# Python location (if not provided, the system default will be used).
# Here we run within a MONAI Core Singularity container;
# see `build_container.sh` for how to build one.
# An optional sanity check on the container/bind paths is sketched after
# the definition below.
# PYTHON_PATH="singularity exec --nv \
# /blue/vendor-nvidia/hju/monaicore0.8.1 python3"
PYTHON_PATH="singularity exec --nv --bind /blue/vendor-nvidia/hju/data/swinunetr_pretrain_CT:/mnt \
/blue/vendor-nvidia/hju/monaicore0.9.1 python3"
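# Optional sanity check (an addition, not part of the original launch
# flow): fail early if the container image or the bind-mounted data
# directory referenced above is missing, rather than failing inside srun.
CONTAINER_IMAGE=/blue/vendor-nvidia/hju/monaicore0.9.1
BIND_SRC=/blue/vendor-nvidia/hju/data/swinunetr_pretrain_CT
if [[ ! -e "$CONTAINER_IMAGE" || ! -d "$BIND_SRC" ]]; then
    echo "ERROR: missing $CONTAINER_IMAGE or $BIND_SRC" >&2
    exit 1
fi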
# Location of the PyTorch launch utilities,
# i.e. `pt_multinode_helper_funcs.sh`, `run_on_node.sh` and `run_on_multinode.sh`.
PT_LAUNCH_UTILS_PATH=$HOME/monai_uf_tutorials/monaicore_multigpu/util_multigpu
source "${PT_LAUNCH_UTILS_PATH}/pt_multinode_helper_funcs.sh"
init_node_info
pwd; hostname; date
echo "Primary node: $PRIMARY"
echo "Primary TCP port: $PRIMARY_PORT"
echo "Secondary nodes: $SECONDARIES"
PT_LAUNCH_SCRIPT=$(realpath "${PT_LAUNCH_UTILS_PATH}/run_on_node.sh")
# PT_LAUNCH_SCRIPT=$(realpath "${PT_LAUNCH_UTILS_PATH}/run_on_multinode.sh")
echo "Running \"$TRAINING_CMD\" on each node..."
srun --unbuffered "$PT_LAUNCH_SCRIPT" "$(realpath "$PT_LAUNCH_UTILS_PATH")" \
"$TRAINING_CMD" "$PYTHON_PATH"