List of user experiences
On this page, you can search for a how-to page from another user, create a new how-to page, or find a full list at the end of the page.
Discussion
Required sbatch commands:
#SBATCH --ntasks=6
#SBATCH --gpus=6
#SBATCH --cpus-per-task=8
export WORLD_SIZE=$SLURM_NTASKS

Here is a very cut-down version of my code that properly utilizes DDP (a sketch of a complete submission script is included at the end of this post):
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DistributedSampler, DataLoader
from torch.cuda.amp import GradScaler, autocast


def setup_ddp(rank=0):
    # The default env:// init method needs MASTER_ADDR/MASTER_PORT;
    # for a single-node job, localhost is sufficient.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=int(os.environ["WORLD_SIZE"]))
    # Bind this process to its own GPU.
    torch.cuda.set_device(rank)


def cleanup_ddp():
    if dist.is_initialized():
        dist.destroy_process_group()


def main(rank):
    setup_ddp(rank)
    device = torch.device(rank)

    model = ...
    model = model.to(device)
    model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=True)

    trainset = ...  # your Dataset
    sampler = DistributedSampler(trainset, num_replicas=dist.get_world_size(),
                                 rank=dist.get_rank(), shuffle=True)
    trainloader = DataLoader(trainset, batch_size=..., num_workers=...,
                             pin_memory=True, sampler=sampler)
    scaler = GradScaler(enabled=True)
    # optimizer, scheduler, loss_fn and epochs are defined as usual (omitted here)

    for epoch in range(epochs):
        sampler.set_epoch(epoch)  # make the shuffling differ between epochs
        for i, (sample, lbl) in enumerate(trainloader):
            optimizer.zero_grad(set_to_none=True)
            sample = sample.to(device, non_blocking=True)
            lbl = lbl.to(device, non_blocking=True)
            with autocast(enabled=True):
                logits = model(sample)
                loss = loss_fn(logits, lbl)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            torch.cuda.synchronize()

    cleanup_ddp()


if __name__ == '__main__':
    # spawn one worker process per task/GPU; each worker receives its rank
    mp.spawn(main, args=(), nprocs=int(os.environ["WORLD_SIZE"]), join=True)
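For reference, here is a minimal sketch of a complete submission script built around the required sbatch directives above. The script name train_ddp.py, the time limit, and the module load line are assumptions and will differ per cluster; the MASTER_ADDR/MASTER_PORT exports assume a single-node job.

#!/bin/bash
#SBATCH --ntasks=6
#SBATCH --gpus=6
#SBATCH --cpus-per-task=8
#SBATCH --time=04:00:00            # assumption: adjust to your job

# assumption: load whatever modules provide your Python/PyTorch environment
module load python pytorch

export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR=localhost       # single-node job
export MASTER_PORT=29500

python train_ddp.py                # assumption: the training script shown above

Since mp.spawn launches one worker per GPU from a single Python process, the script is started once with python rather than once per task with srun.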