List of user experiences
On this page, you can search for a how-to page from another user, create a new how-to page, or find a full list at the end of the page.
Discussion
Required sbatch commands:
#SBATCH --ntasks=6
#SBATCH --gpus=6
#SBATCH --cpus-per-task=8
export WORLD_SIZE=$SLURM_NTASKS

Here is a very cut-down version of my code that properly utilizes DDP (a sketch of a complete submission script is included at the end of this post):
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DistributedSampler, DataLoader
from torch.cuda.amp import GradScaler, autocast


def setup_ddp(rank=0):
    # The default env:// init method needs MASTER_ADDR/MASTER_PORT;
    # for a single-node job, localhost is sufficient.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=int(os.environ["WORLD_SIZE"]))
    # Bind this process to its own GPU.
    torch.cuda.set_device(rank)


def cleanup_ddp():
    if dist.is_initialized():
        dist.destroy_process_group()


def main(rank):
    setup_ddp(rank)
    device = torch.device(rank)

    model = ...
    model = model.to(device)
    model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=True)

    trainset = ...  # your Dataset
    sampler = DistributedSampler(trainset, num_replicas=dist.get_world_size(),
                                 rank=dist.get_rank(), shuffle=True)
    trainloader = DataLoader(trainset, batch_size=..., num_workers=...,
                             pin_memory=True, sampler=sampler)
    scaler = GradScaler(enabled=True)
    # optimizer, scheduler, loss_fn and epochs are defined as usual (omitted here)

    for epoch in range(epochs):
        sampler.set_epoch(epoch)  # make the shuffling differ between epochs
        for i, (sample, lbl) in enumerate(trainloader):
            optimizer.zero_grad(set_to_none=True)
            sample = sample.to(device, non_blocking=True)
            lbl = lbl.to(device, non_blocking=True)
            with autocast(enabled=True):
                logits = model(sample)
                loss = loss_fn(logits, lbl)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            torch.cuda.synchronize()

    cleanup_ddp()


if __name__ == '__main__':
    # spawn one worker process per task/GPU; each worker receives its rank
    mp.spawn(main, args=(), nprocs=int(os.environ["WORLD_SIZE"]), join=True)
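For reference, here is a minimal sketch of a complete submission script built around the required sbatch directives above. The script name train_ddp.py, the time limit, and the module load line are assumptions and will differ per cluster; the MASTER_ADDR/MASTER_PORT exports assume a single-node job.

#!/bin/bash
#SBATCH --ntasks=6
#SBATCH --gpus=6
#SBATCH --cpus-per-task=8
#SBATCH --time=04:00:00            # assumption: adjust to your job

# assumption: load whatever modules provide your Python/PyTorch environment
module load python pytorch

export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR=localhost       # single-node job
export MASTER_PORT=29500

python train_ddp.py                # assumption: the training script shown above

Since mp.spawn launches one worker per GPU from a single Python process, the script is started once with python rather than once per task with srun.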