fix: some Docker typos
This commit is contained in:
parent
c1c946d01b
commit
0438f9f7b1
@ -223,11 +223,11 @@ class ClusterCommunicationModule():
|
||||
docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])
|
||||
],
|
||||
name=f'train-{train_args["index"]}',
|
||||
env={
|
||||
environment={
|
||||
'GPU_NUM': self.node_manager.GPU_num,
|
||||
'NODE_NUM': train_args['node_num'],
|
||||
'NODE_RANK': train_args['index'],
|
||||
'MASTER_IP': 'train-0',
|
||||
'MASTER_IP': 'train-0' if self.node_manager.status == 'worker' else '127.0.0.1',
|
||||
'MASTER_PORT': 21046,
|
||||
},
|
||||
detach=True
|
||||
@ -241,7 +241,7 @@ class ClusterCommunicationModule():
|
||||
status_code = result['StatusCode']
|
||||
print(status_code, type(status_code))
|
||||
|
||||
def scatter_container(self, image_name, train=False):
|
||||
def scatter_container(self, image_name, train):
|
||||
def master_run(image_name):
|
||||
print("[Master] run")
|
||||
train_args = {
|
||||
|
||||
@ -120,9 +120,13 @@ class NodeManager():
|
||||
'''
|
||||
|
||||
data_image = "snsd0805/cifar100-dataset:v1"
|
||||
train_image = "snsd0805/cifar100-train:v3"
|
||||
|
||||
# Start Downloading
|
||||
self.cluster_communication_module.scatter_container(data_image)
|
||||
# self.cluster_communication_module.scatter_container(data_image, train=False)
|
||||
|
||||
# start training
|
||||
self.cluster_communication_module.scatter_container(train_image, train=True)
|
||||
|
||||
|
||||
else:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user