fix: make single-machine multi-GPU (DDP) training run successfully
This commit is contained in:
parent
aedc6b46e9
commit
8f3253ff24
def ddp_init(rank, world_size, master_addr='localhost', master_port='21046'):
    """Initialize the default process group for single-machine DDP training.

    Args:
        rank: This process's rank; also used as its CUDA device index
            (one process per GPU).
        world_size: Total number of processes participating in training.
        master_addr: Rendezvous address; defaults to 'localhost' for a
            single-machine setup. Generalized so multi-node runs can reuse
            this helper without code changes.
        master_port: Rendezvous port as a string (os.environ values must
            be str); defaults to the original hard-coded port.
    """
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    # NCCL backend: the standard choice for multi-GPU training.
    init_process_group('nccl', rank=rank, world_size=world_size)
    # Pin this process to its own GPU so collectives and .to(rank) calls
    # target the right device.
    torch.cuda.set_device(rank)
def main(rank, world_size):
|
||||
ddp_init(rank, world_size)
|
||||
|
||||
model = Network()
|
||||
model = DDP(model, device_ids=rank)
|
||||
model = Network().to(rank)
|
||||
model = DDP(model, device_ids=[rank])
|
||||
|
||||
dataset = Cifar10Dataset('./dataset_dir/cifar-10-batches-py')
|
||||
loader = DataLoader(dataset, batch_size=32, shuffle=False, sampler=DistributedSampler(dataset, ))
|
||||
@ -49,11 +50,9 @@ def main(rank, world_size):
|
||||
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
print(train_loss_sum / len(loader))
|
||||
print((train_correct_sum / len(dataset)).item(),'%')
|
||||
print()
|
||||
print(f"[DEVICE {rank}] EPOCH {epoch} loss={train_loss_sum/len(loader)} acc={(train_correct_sum/len(dataset)).item()}")
|
||||
|
||||
if __name__ == '__main__':
    # One worker process per visible GPU.
    n_gpus = torch.cuda.device_count()
    # mp.spawn launches n_gpus copies of main; it prepends each worker's
    # rank to `args`, so main receives (rank, world_size).
    mp.spawn(main, args=(n_gpus, ), nprocs=n_gpus)
Loading…
Reference in New Issue
Block a user