feat: single machine DDP (test)
parent 939aa6d92e
commit aedc6b46e9
@@ -6,34 +6,54 @@ from model import Network
 from torch.utils.data import DataLoader
 import matplotlib.pyplot as plt
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import init_process_group, destroy_process_group
+import os
 
-model = Network().to(device)
-dataset = Cifar10Dataset('./dataset_dir/cifar-10-batches-py')
-loader = DataLoader(dataset, batch_size=32, shuffle=True)
+def ddp_init(rank, world_size):
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '21046'
+    init_process_group('nccl', rank=rank, world_size=world_size)
 
-optimizer = optim.Adam(model.parameters(), lr=0.001)
-criterion = nn.CrossEntropyLoss()
 
-for epoch in range(50):
-    model.train()
-    train_loss_sum = 0
-    train_correct_sum = 0
-    for x, y in loader:
-        x = x.float()
-        x, y = x.to(device), y.to(device)
+def main(rank, world_size):
+    ddp_init(rank, world_size)
 
-        predict = model(x)
-        loss = criterion(predict, y)
-        loss.backward()
+    model = Network()
+    model = DDP(model, device_ids=rank)
 
-        # evaluate
-        train_loss_sum += loss.item()
-        predicted_classes = torch.argmax(predict, dim=1)
-        train_correct_sum += (predicted_classes == y).sum()
+    dataset = Cifar10Dataset('./dataset_dir/cifar-10-batches-py')
+    loader = DataLoader(dataset, batch_size=32, shuffle=False, sampler=DistributedSampler(dataset, ))
+    optimizer = optim.Adam(model.parameters(), lr=0.001)
+    criterion = nn.CrossEntropyLoss()
 
-    optimizer.step()
-    optimizer.zero_grad()
-    print(train_loss_sum / len(loader))
-    print((train_correct_sum / len(dataset)).item(),'%')
-    print()
+    for epoch in range(50):
+        model.train()
+        train_loss_sum = 0
+        train_correct_sum = 0
+        for x, y in loader:
+            x = x.float()
+            x, y = x.to(rank), y.to(rank)
+
+            predict = model(x)
+            loss = criterion(predict, y)
+            loss.backward()
+
+            # evaluate
+            train_loss_sum += loss.item()
+            predicted_classes = torch.argmax(predict, dim=1)
+            train_correct_sum += (predicted_classes == y).sum()
+
+        optimizer.step()
+        optimizer.zero_grad()
+        print(train_loss_sum / len(loader))
+        print((train_correct_sum / len(dataset)).item(),'%')
+        print()
+
+
+if __name__ == '__main__':
+    world_size = torch.cuda.device_count()
+    mp.spawn(main, args=(world_size, ))
+
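Review note: the committed `mp.spawn` call leaves `nprocs` at its default of 1, so only one worker (and one GPU) would actually run, and `destroy_process_group` is imported but never called. Below is a minimal launch/teardown sketch under the commit's own assumptions (NCCL backend, localhost rendezvous on port 21046); the `nprocs=world_size` argument and the `finally: destroy_process_group()` cleanup are additions, not part of this commit.

import os
import torch
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

def ddp_init(rank, world_size):
    # Same rendezvous settings as the commit: single machine, fixed port.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '21046'
    init_process_group('nccl', rank=rank, world_size=world_size)

def main(rank, world_size):
    # mp.spawn prepends the process index, so `rank` arrives automatically
    # and only `world_size` needs to be passed through `args`.
    ddp_init(rank, world_size)
    try:
        pass  # training loop from the commit goes here
    finally:
        destroy_process_group()  # imported in the commit but never called

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(main, args=(world_size,), nprocs=world_size)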
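Review note: `DDP(model, device_ids=rank)` passes a bare int where `device_ids` expects a list, the model is never moved onto the GPU before wrapping, and without `DistributedSampler.set_epoch` the same shuffled order is reused every epoch. A sketch of the per-rank setup with those fixes; `Network` and `Cifar10Dataset` are the project's own classes, and the `dataset` import path below is a guess.

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

from model import Network
from dataset import Cifar10Dataset  # hypothetical module path, not shown in the diff

def build(rank):
    # DDP expects parameters to already live on the target device,
    # and device_ids takes a list of devices, not a bare rank.
    model = Network().to(rank)
    model = DDP(model, device_ids=[rank])

    # DistributedSampler shards the dataset across ranks; it reads the
    # rank and world size from the already-initialized process group.
    dataset = Cifar10Dataset('./dataset_dir/cifar-10-batches-py')
    sampler = DistributedSampler(dataset)
    loader = DataLoader(dataset, batch_size=32, shuffle=False, sampler=sampler)
    return model, loader, sampler

# Inside the epoch loop, reseed the sampler so each epoch reshuffles:
#     for epoch in range(50):
#         sampler.set_epoch(epoch)
#         ...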