fix: some docker typos

This commit is contained in:
Ting-Jun Wang 2024-06-01 01:29:54 +08:00
parent c1c946d01b
commit 0438f9f7b1
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354
2 changed files with 8 additions and 4 deletions

View File

@@ -223,11 +223,11 @@ class ClusterCommunicationModule():
docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]) docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])
], ],
name=f'train-{train_args["index"]}', name=f'train-{train_args["index"]}',
env={ environment={
'GPU_NUM': self.node_manager.GPU_num, 'GPU_NUM': self.node_manager.GPU_num,
'NODE_NUM': train_args['node_num'], 'NODE_NUM': train_args['node_num'],
'NODE_RANK': train_args['index'], 'NODE_RANK': train_args['index'],
'MASTER_IP': 'train-0', 'MASTER_IP': 'train-0' if self.node_manager.status == 'worker' else '127.0.0.1',
'MASTER_PORT': 21046, 'MASTER_PORT': 21046,
}, },
detach=True detach=True
@@ -241,7 +241,7 @@ class ClusterCommunicationModule():
status_code = result['StatusCode'] status_code = result['StatusCode']
print(status_code, type(status_code)) print(status_code, type(status_code))
def scatter_container(self, image_name, train=False): def scatter_container(self, image_name, train):
def master_run(image_name): def master_run(image_name):
print("[Master] run") print("[Master] run")
train_args = { train_args = {

View File

@@ -120,9 +120,13 @@ class NodeManager():
''' '''
data_image = "snsd0805/cifar100-dataset:v1" data_image = "snsd0805/cifar100-dataset:v1"
train_image = "snsd0805/cifar100-train:v3"
# Start Downloading # Start Downloading
self.cluster_communication_module.scatter_container(data_image) # self.cluster_communication_module.scatter_container(data_image, train=False)
# start training
self.cluster_communication_module.scatter_container(train_image, train=True)
else: else: