fix: some Docker typos

This commit is contained in:
Ting-Jun Wang 2024-06-01 01:29:54 +08:00
parent c1c946d01b
commit 0438f9f7b1
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354
2 changed files with 8 additions and 4 deletions

View File

@ -223,11 +223,11 @@ class ClusterCommunicationModule():
docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])
],
name=f'train-{train_args["index"]}',
env={
environment={
'GPU_NUM': self.node_manager.GPU_num,
'NODE_NUM': train_args['node_num'],
'NODE_RANK': train_args['index'],
'MASTER_IP': 'train-0',
'MASTER_IP': 'train-0' if self.node_manager.status == 'worker' else '127.0.0.1',
'MASTER_PORT': 21046,
},
detach=True
@ -241,7 +241,7 @@ class ClusterCommunicationModule():
status_code = result['StatusCode']
print(status_code, type(status_code))
def scatter_container(self, image_name, train=False):
def scatter_container(self, image_name, train):
def master_run(image_name):
print("[Master] run")
train_args = {

View File

@ -120,9 +120,13 @@ class NodeManager():
'''
data_image = "snsd0805/cifar100-dataset:v1"
train_image = "snsd0805/cifar100-train:v3"
# Start Downloading
self.cluster_communication_module.scatter_container(data_image)
# self.cluster_communication_module.scatter_container(data_image, train=False)
# start training
self.cluster_communication_module.scatter_container(train_image, train=True)
else: