feat: automatically remove the container after it exits
This commit is contained in:
parent
0438f9f7b1
commit
bc476e398f
@ -52,6 +52,8 @@ class ClusterCommunicationModule():
|
|||||||
if self.node_manager.docker_client.swarm.attrs == {}:
|
if self.node_manager.docker_client.swarm.attrs == {}:
|
||||||
print("Build new docker swarm...")
|
print("Build new docker swarm...")
|
||||||
self.node_manager.docker_client.swarm.init(advertise_addr=self.host, listen_addr=f"{self.host}:2377", force_new_cluster=True)
|
self.node_manager.docker_client.swarm.init(advertise_addr=self.host, listen_addr=f"{self.host}:2377", force_new_cluster=True)
|
||||||
|
print("Create new overlay network")
|
||||||
|
self.node_manager.docker_client.networks.create(name='train-net', driver='overlay', attachable=True)
|
||||||
|
|
||||||
# send docker swarm token to the worker
|
# send docker swarm token to the worker
|
||||||
token = self.node_manager.docker_client.swarm.attrs['JoinTokens']['Worker']
|
token = self.node_manager.docker_client.swarm.attrs['JoinTokens']['Worker']
|
||||||
@ -122,7 +124,7 @@ class ClusterCommunicationModule():
|
|||||||
print(f"[RUN_CONTAINER] {image}")
|
print(f"[RUN_CONTAINER] {image}")
|
||||||
self.run_container(image, train, args)
|
self.run_container(image, train, args)
|
||||||
|
|
||||||
print(f"[RUN_CONTAINER] {image}")
|
print(f"[RUN_CONTAINER SUCCESS] {image}")
|
||||||
self.client_sock.send('[RUN_CONTAINER_SUCCESS] {}'.encode())
|
self.client_sock.send('[RUN_CONTAINER_SUCCESS] {}'.encode())
|
||||||
|
|
||||||
return True
|
return True
|
||||||
@ -178,9 +180,11 @@ class ClusterCommunicationModule():
|
|||||||
def exit(self):
|
def exit(self):
|
||||||
if self.node_manager.status == 'master':
|
if self.node_manager.status == 'master':
|
||||||
for conn in self.worker_conns:
|
for conn in self.worker_conns:
|
||||||
conn.send('[STOP] {}'.encode())
|
try:
|
||||||
check, args = conn.recv(1024).decode().split(' ')
|
conn.send('[STOP] {}'.encode())
|
||||||
print(f'{args} has stopped.')
|
check, args = conn.recv(1024).decode().split(' ')
|
||||||
|
except:
|
||||||
|
print(f'{args} has stopped.')
|
||||||
self.node_manager.docker_client.swarm.leave(force=True)
|
self.node_manager.docker_client.swarm.leave(force=True)
|
||||||
|
|
||||||
if self.node_manager.status == 'worker':
|
if self.node_manager.status == 'worker':
|
||||||
@ -239,7 +243,9 @@ class ClusterCommunicationModule():
|
|||||||
|
|
||||||
result = container.wait()
|
result = container.wait()
|
||||||
status_code = result['StatusCode']
|
status_code = result['StatusCode']
|
||||||
print(status_code, type(status_code))
|
if status_code != 0:
|
||||||
|
print(f'[ERROR] some error occur in the docker container, error_code={status_code}')
|
||||||
|
container.remove()
|
||||||
|
|
||||||
def scatter_container(self, image_name, train):
|
def scatter_container(self, image_name, train):
|
||||||
def master_run(image_name):
|
def master_run(image_name):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user