fix: some docker typo
This commit is contained in:
parent c1c946d01b
commit 0438f9f7b1
@ -223,11 +223,11 @@ class ClusterCommunicationModule():
|
|||||||
docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])
|
docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])
|
||||||
],
|
],
|
||||||
name=f'train-{train_args["index"]}',
|
name=f'train-{train_args["index"]}',
|
||||||
env={
|
environment={
|
||||||
'GPU_NUM': self.node_manager.GPU_num,
|
'GPU_NUM': self.node_manager.GPU_num,
|
||||||
'NODE_NUM': train_args['node_num'],
|
'NODE_NUM': train_args['node_num'],
|
||||||
'NODE_RANK': train_args['index'],
|
'NODE_RANK': train_args['index'],
|
||||||
'MASTER_IP': 'train-0',
|
'MASTER_IP': 'train-0' if self.node_manager.status == 'worker' else '127.0.0.1',
|
||||||
'MASTER_PORT': 21046,
|
'MASTER_PORT': 21046,
|
||||||
},
|
},
|
||||||
detach=True
|
detach=True
|
||||||
@ -241,7 +241,7 @@ class ClusterCommunicationModule():
|
|||||||
status_code = result['StatusCode']
|
status_code = result['StatusCode']
|
||||||
print(status_code, type(status_code))
|
print(status_code, type(status_code))
|
||||||
|
|
||||||
def scatter_container(self, image_name, train=False):
|
def scatter_container(self, image_name, train):
|
||||||
def master_run(image_name):
|
def master_run(image_name):
|
||||||
print("[Master] run")
|
print("[Master] run")
|
||||||
train_args = {
|
train_args = {
|
||||||
|
|||||||
@ -120,9 +120,13 @@ class NodeManager():
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
data_image = "snsd0805/cifar100-dataset:v1"
|
data_image = "snsd0805/cifar100-dataset:v1"
|
||||||
|
train_image = "snsd0805/cifar100-train:v3"
|
||||||
|
|
||||||
# Start Downloading
|
# Start Downloading
|
||||||
self.cluster_communication_module.scatter_container(data_image)
|
# self.cluster_communication_module.scatter_container(data_image, train=False)
|
||||||
|
|
||||||
|
# start training
|
||||||
|
self.cluster_communication_module.scatter_container(train_image, train=True)
|
||||||
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user