fix: single-machine parallel training success
parent aedc6b46e9
commit 8f3253ff24
@@ -16,13 +16,14 @@ def ddp_init(rank, world_size):
     os.environ['MASTER_ADDR'] = 'localhost'
     os.environ['MASTER_PORT'] = '21046'
     init_process_group('nccl', rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)


 def main(rank, world_size):
     ddp_init(rank, world_size)

-    model = Network()
-    model = DDP(model, device_ids=rank)
+    model = Network().to(rank)
+    model = DDP(model, device_ids=[rank])

     dataset = Cifar10Dataset('./dataset_dir/cifar-10-batches-py')
     loader = DataLoader(dataset, batch_size=32, shuffle=False, sampler=DistributedSampler(dataset, ))
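The changes above are the usual single-node DDP recipe: each spawned process pins itself to one GPU with torch.cuda.set_device(rank), moves its model replica there with .to(rank), and hands DDP a device_ids list containing just that one index. A stand-alone sketch of the same plumbing, with a toy Linear model and random tensors standing in for this repo's Network and Cifar10Dataset (the port and tensor sizes are arbitrary):

import os
import torch
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler


def ddp_init(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '21046'
    init_process_group('nccl', rank=rank, world_size=world_size)
    # Pin this process to its GPU so allocations and collectives use cuda:rank.
    torch.cuda.set_device(rank)


def demo(rank, world_size):
    ddp_init(rank, world_size)

    # Toy stand-ins for the repo's Network and Cifar10Dataset.
    model = torch.nn.Linear(32, 10).to(rank)   # parameters must live on this rank's GPU first
    model = DDP(model, device_ids=[rank])      # device_ids is a list with the one local device

    dataset = TensorDataset(torch.randn(256, 32), torch.randint(0, 10, (256,)))
    loader = DataLoader(dataset, batch_size=32, shuffle=False,
                        sampler=DistributedSampler(dataset))

    for x, y in loader:
        out = model(x.to(rank))  # forward pass only, enough to exercise the setup

    destroy_process_group()


if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(demo, args=(world_size,), nprocs=world_size)

DistributedSampler already shards the dataset across ranks and handles shuffling itself, which is why the DataLoader keeps shuffle=False.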
@@ -49,11 +50,9 @@ def main(rank, world_size):

             optimizer.step()
             optimizer.zero_grad()
-        print(train_loss_sum / len(loader))
-        print((train_correct_sum / len(dataset)).item(),'%')
-        print()
+        print(f"[DEVICE {rank}] EPOCH {epoch} loss={train_loss_sum/len(loader)} acc={(train_correct_sum/len(dataset)).item()}")

 if __name__ == '__main__':
     world_size = torch.cuda.device_count()
-    mp.spawn(main, args=(world_size, ))
+    mp.spawn(main, args=(world_size, ), nprocs=world_size)

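The nprocs argument on the last line is what actually fans the job out: torch.multiprocessing.spawn starts nprocs copies of the target function (the default is 1) and passes each one its process index as the first positional argument, so args only carries the remaining parameters. A minimal CPU-only sketch of that contract, with an illustrative worker and a hard-coded world_size:

import torch.multiprocessing as mp

def worker(rank, world_size):
    # spawn supplies rank in [0, nprocs); args fills the rest of the signature
    print(f"hello from rank {rank} of {world_size}")

if __name__ == '__main__':
    world_size = 4  # in the training script this comes from torch.cuda.device_count()
    mp.spawn(worker, args=(world_size,), nprocs=world_size)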