diff --git a/distributed/ddp-tutorial-series/README.md b/distributed/ddp-tutorial-series/README.md
index d0ce17c00f..1424717f90 100644
--- a/distributed/ddp-tutorial-series/README.md
+++ b/distributed/ddp-tutorial-series/README.md
@@ -4,6 +4,14 @@ Code for the DDP tutorial series at https://pytorch.org/tutorials/beginner/ddp_s
 Each code file extends upon the previous one. The series starts with a non-distributed script that runs on a single GPU and incrementally updates to end with multinode training on a Slurm cluster.
 
+## Dependencies
+
+1. nccl
+   1. https://github.com/NVIDIA/nccl
+   2. https://github.com/NVIDIA/nccl-tests
+2. torch>=1.11.0
+
+
 ## Files
 * [single_gpu.py](single_gpu.py): Non-distributed training script
@@ -16,6 +24,32 @@ Each code file extends upon the previous one. The series starts with a non-distr
 * [slurm/config.yaml.template](slurm/config.yaml.template): configuration to set up an AWS cluster
 * [slurm/sbatch_run.sh](slurm/sbatch_run.sh): slurm script to launch the training job
+## Create Virtual Environment
+
+```shell
+$ python -m venv <venv-dir>
+$ source <venv-dir>/bin/activate
+```
+
+## Run commands
+* [single_gpu.py](single_gpu.py):
+```shell
+$ python single_gpu.py 50 10
+```
+* [multigpu.py](multigpu.py):
+
+```shell
+$ python multigpu.py 50 10
+```
+
+
+* [multigpu_torchrun.py](multigpu_torchrun.py):
+```shell
+$ torchrun --standalone --nproc_per_node=gpu multigpu_torchrun.py 50 10
+```
+
+* [multinode.py](multinode.py): DDP on multiple nodes using Torchrun (and optionally Slurm)
+  TODO
diff --git a/distributed/ddp-tutorial-series/datautils.py b/distributed/ddp-tutorial-series/datautils.py
index 22c364eaa0..52a0d6d07c 100644
--- a/distributed/ddp-tutorial-series/datautils.py
+++ b/distributed/ddp-tutorial-series/datautils.py
@@ -1,6 +1,7 @@
 import torch
 from torch.utils.data import Dataset
 
+
 class MyTrainDataset(Dataset):
     def __init__(self, size):
         self.size = size
@@ -10,4 +11,4 @@ def __len__(self):
         return self.size
 
     def __getitem__(self, index):
-        return self.data[index]
\ No newline at end of file
+        return self.data[index]
diff --git a/distributed/ddp-tutorial-series/multigpu.py b/distributed/ddp-tutorial-series/multigpu.py
index 029731b5d2..25864d72a0 100644
--- a/distributed/ddp-tutorial-series/multigpu.py
+++ b/distributed/ddp-tutorial-series/multigpu.py
@@ -2,12 +2,15 @@
 import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
 from datautils import MyTrainDataset
+from icecream import ic
 
+# --- Additional modules required for Distributed Training
 import torch.multiprocessing as mp
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed import init_process_group, destroy_process_group
 import os
+# ---
 
 
 def ddp_setup(rank, world_size):
@@ -46,7 +49,7 @@ def _run_batch(self, source, targets):
 
     def _run_epoch(self, epoch):
         b_sz = len(next(iter(self.train_data))[0])
-        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
+        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batch size: {b_sz} | Steps: {len(self.train_data)}")
         self.train_data.sampler.set_epoch(epoch)
         for source, targets in self.train_data:
             source = source.to(self.gpu_id)
@@ -84,6 +87,8 @@ def prepare_dataloader(dataset: Dataset, batch_size: int):
 
 
 def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
+    ic(rank, world_size)
+
     ddp_setup(rank, world_size)
     dataset, model, optimizer = load_train_objs()
     train_data = prepare_dataloader(dataset, batch_size)
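The multigpu.py changes above add the standard single-node DDP ingredients: one process per GPU via `mp.spawn`, a process group per rank, a `DistributedSampler` so each rank sees a distinct shard, and a `DDP` wrapper around the model. The sketch below shows those pieces in a minimal, self-contained form; it is not part of the patch, the `worker` function, the toy dataset/model, and the `MASTER_ADDR`/`MASTER_PORT` values are illustrative, and it assumes at least one CUDA GPU with NCCL available.

```python
# Minimal single-node DDP sketch (illustrative only, not part of the patch).
import os

import torch
import torch.multiprocessing as mp
import torch.nn.functional as F
from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler


def worker(rank: int, world_size: int):
    # Each spawned process sets up its own process group and GPU.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # Toy data and model; DistributedSampler gives each rank a distinct shard.
    dataset = TensorDataset(torch.randn(256, 20), torch.randn(256, 1))
    loader = DataLoader(dataset, batch_size=32, shuffle=False,
                        sampler=DistributedSampler(dataset))
    model = DDP(torch.nn.Linear(20, 1).to(rank), device_ids=[rank])
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    for epoch in range(2):
        loader.sampler.set_epoch(epoch)  # reshuffle shards each epoch
        for source, targets in loader:
            source, targets = source.to(rank), targets.to(rank)
            optimizer.zero_grad()
            F.mse_loss(model(source), targets).backward()
            optimizer.step()

    destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(worker, args=(world_size,), nprocs=world_size)
```

multigpu.py follows the same flow, with the `Trainer` class around the training loop and `ic(rank, world_size)` added for logging.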
diff --git a/distributed/ddp-tutorial-series/multigpu_torchrun.py b/distributed/ddp-tutorial-series/multigpu_torchrun.py
index 66d8187346..39a521ee39 100644
--- a/distributed/ddp-tutorial-series/multigpu_torchrun.py
+++ b/distributed/ddp-tutorial-series/multigpu_torchrun.py
@@ -3,17 +3,21 @@
 from torch.utils.data import Dataset, DataLoader
 from datautils import MyTrainDataset
 
-import torch.multiprocessing as mp
+# --- Additional modules required for Distributed Training
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed import init_process_group, destroy_process_group
 import os
+# ---
+
+from utils import print_nodes_info
 
 
 def ddp_setup():
     init_process_group(backend="nccl")
     torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
 
+
 class Trainer:
     def __init__(
         self,
@@ -52,7 +56,7 @@ def _run_batch(self, source, targets):
 
     def _run_epoch(self, epoch):
         b_sz = len(next(iter(self.train_data))[0])
-        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
+        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batch size: {b_sz} | Steps: {len(self.train_data)}")
         self.train_data.sampler.set_epoch(epoch)
         for source, targets in self.train_data:
             source = source.to(self.gpu_id)
@@ -107,5 +111,9 @@ def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str
     parser.add_argument('save_every', type=int, help='How often to save a snapshot')
     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
     args = parser.parse_args()
-
+
+    # --- Print the environment variables
+    print_nodes_info()
+    # ---
+
     main(args.save_every, args.total_epochs, args.batch_size)
diff --git a/distributed/ddp-tutorial-series/multinode.py b/distributed/ddp-tutorial-series/multinode.py
index e80636bcc4..893989931b 100644
--- a/distributed/ddp-tutorial-series/multinode.py
+++ b/distributed/ddp-tutorial-series/multinode.py
@@ -2,18 +2,22 @@
 import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
 from datautils import MyTrainDataset
+from utils import print_nodes_info
 
+# --- Additional modules required for Distributed Training
 import torch.multiprocessing as mp
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed import init_process_group, destroy_process_group
 import os
+# ---
 
 
 def ddp_setup():
     init_process_group(backend="nccl")
     torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
 
+
 class Trainer:
     def __init__(
         self,
@@ -53,7 +57,7 @@ def _run_batch(self, source, targets):
 
     def _run_epoch(self, epoch):
         b_sz = len(next(iter(self.train_data))[0])
-        print(f"[GPU{self.global_rank}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
+        print(f"[GPU{self.global_rank}] Epoch {epoch} | Batch size: {b_sz} | Steps: {len(self.train_data)}")
         self.train_data.sampler.set_epoch(epoch)
         for source, targets in self.train_data:
             source = source.to(self.local_rank)
@@ -108,5 +112,9 @@ def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str
     parser.add_argument('save_every', type=int, help='How often to save a snapshot')
     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
     args = parser.parse_args()
-
+
+    # --- Print the environment variables
+    print_nodes_info()
+    # ---
+
     main(args.save_every, args.total_epochs, args.batch_size)
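multigpu_torchrun.py and multinode.py drop `mp.spawn`: torchrun launches the worker processes and exports `LOCAL_RANK`, `RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT`, which `ddp_setup()` and `print_nodes_info()` read. The sketch below shows that contract in isolation; it is not part of the patch, and the file name and print format are illustrative.

```python
# torchrun_env_sketch.py -- hypothetical file name, not part of the patch.
# torchrun exports LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT
# for each process it launches; init_process_group() with the default env://
# rendezvous reads the rank and world size from those variables.
import os

import torch
from torch.distributed import destroy_process_group, init_process_group


def ddp_setup_via_torchrun() -> int:
    local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this node
    init_process_group(backend="nccl")          # rank/world size come from the env
    torch.cuda.set_device(local_rank)
    return local_rank


if __name__ == "__main__":
    local_rank = ddp_setup_via_torchrun()
    print(f"global rank {os.environ['RANK']} of {os.environ['WORLD_SIZE']} "
          f"on local GPU {local_rank} "
          f"(rendezvous {os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']})")
    destroy_process_group()
```

Launched with `torchrun --standalone --nproc_per_node=gpu torchrun_env_sketch.py`, each process reports its own rank, the same information `print_nodes_info()` logs in the scripts above.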
diff --git a/distributed/ddp-tutorial-series/requirements.txt b/distributed/ddp-tutorial-series/requirements.txt
index 9270a1d6ee..18ef26580e 100644
--- a/distributed/ddp-tutorial-series/requirements.txt
+++ b/distributed/ddp-tutorial-series/requirements.txt
@@ -1 +1,3 @@
-torch>=1.11.0
\ No newline at end of file
+torch>=1.11.0
+numpy
+icecream
\ No newline at end of file
diff --git a/distributed/ddp-tutorial-series/run_multigpu.sh b/distributed/ddp-tutorial-series/run_multigpu.sh
new file mode 100644
index 0000000000..77abc5930c
--- /dev/null
+++ b/distributed/ddp-tutorial-series/run_multigpu.sh
@@ -0,0 +1 @@
+python multigpu.py 50 10
\ No newline at end of file
diff --git a/distributed/ddp-tutorial-series/run_multigpu_torchrun.sh b/distributed/ddp-tutorial-series/run_multigpu_torchrun.sh
new file mode 100644
index 0000000000..7d4c594f6f
--- /dev/null
+++ b/distributed/ddp-tutorial-series/run_multigpu_torchrun.sh
@@ -0,0 +1 @@
+torchrun --standalone --nproc_per_node=gpu multigpu_torchrun.py 50 10
diff --git a/distributed/ddp-tutorial-series/run_single_gpu.sh b/distributed/ddp-tutorial-series/run_single_gpu.sh
new file mode 100644
index 0000000000..58e4e74816
--- /dev/null
+++ b/distributed/ddp-tutorial-series/run_single_gpu.sh
@@ -0,0 +1 @@
+python single_gpu.py 50 10
diff --git a/distributed/ddp-tutorial-series/single_gpu.py b/distributed/ddp-tutorial-series/single_gpu.py
index e91ab81cc1..d81c671f1c 100644
--- a/distributed/ddp-tutorial-series/single_gpu.py
+++ b/distributed/ddp-tutorial-series/single_gpu.py
@@ -11,7 +11,7 @@ def __init__(
         train_data: DataLoader,
         optimizer: torch.optim.Optimizer,
         gpu_id: int,
-        save_every: int, 
+        save_every: int,
     ) -> None:
         self.gpu_id = gpu_id
         self.model = model.to(gpu_id)
@@ -28,7 +28,7 @@ def _run_batch(self, source, targets):
 
     def _run_epoch(self, epoch):
         b_sz = len(next(iter(self.train_data))[0])
-        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
+        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batch size: {b_sz} | Steps: {len(self.train_data)}")
         for source, targets in self.train_data:
             source = source.to(self.gpu_id)
             targets = targets.to(self.gpu_id)
diff --git a/distributed/ddp-tutorial-series/utils.py b/distributed/ddp-tutorial-series/utils.py
new file mode 100644
index 0000000000..d3ebf29e0a
--- /dev/null
+++ b/distributed/ddp-tutorial-series/utils.py
@@ -0,0 +1,10 @@
+import os
+
+from icecream import ic
+
+
+def print_nodes_info():
+    """Log this process's distributed environment variables via icecream."""
+    ic(os.environ['LOCAL_RANK'], os.environ['RANK'], os.environ['WORLD_SIZE'],
+       os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
+    print()
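Because the new `utils.print_nodes_info()` only reads environment variables, it can be smoke-tested without a launcher by faking the variables torchrun would otherwise set. A hypothetical check, not part of the patch; the file name and values are placeholders:

```python
# check_print_nodes_info.py -- hypothetical helper, not part of the patch.
# Fake the variables torchrun would export, then call the new utility to see
# what one worker would log.
import os

for key, value in {
    "LOCAL_RANK": "0",
    "RANK": "0",
    "WORLD_SIZE": "1",
    "MASTER_ADDR": "localhost",
    "MASTER_PORT": "29500",
}.items():
    os.environ.setdefault(key, value)

from utils import print_nodes_info  # requires icecream (see requirements.txt)

print_nodes_info()
```

Under a real torchrun launch, the same call reports each worker's local rank, global rank, world size, and rendezvous address.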