From bc476e398fbca925f38447124f21269db03d835b Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Sat, 1 Jun 2024 13:06:05 +0800 Subject: [PATCH] feat: remove container automatically --- src/communication.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/communication.py b/src/communication.py index 99fa584..af19796 100644 --- a/src/communication.py +++ b/src/communication.py @@ -52,6 +52,8 @@ class ClusterCommunicationModule(): if self.node_manager.docker_client.swarm.attrs == {}: print("Build new docker swarm...") self.node_manager.docker_client.swarm.init(advertise_addr=self.host, listen_addr=f"{self.host}:2377", force_new_cluster=True) + print("Create new overlay network") + self.node_manager.docker_client.networks.create(name='train-net', driver='overlay', attachable=True) # send docker swarm token to the worker token = self.node_manager.docker_client.swarm.attrs['JoinTokens']['Worker'] @@ -122,7 +124,7 @@ class ClusterCommunicationModule(): print(f"[RUN_CONTAINER] {image}") self.run_container(image, train, args) - print(f"[RUN_CONTAINER] {image}") + print(f"[RUN_CONTAINER SUCCESS] {image}") self.client_sock.send('[RUN_CONTAINER_SUCCESS] {}'.encode()) return True @@ -178,9 +180,11 @@ class ClusterCommunicationModule(): def exit(self): if self.node_manager.status == 'master': for conn in self.worker_conns: - conn.send('[STOP] {}'.encode()) - check, args = conn.recv(1024).decode().split(' ') - print(f'{args} has stopped.') + try: + conn.send('[STOP] {}'.encode()) + check, args = conn.recv(1024).decode().split(' ') + except: + print(f'{args} has stopped.') self.node_manager.docker_client.swarm.leave(force=True) if self.node_manager.status == 'worker': @@ -239,7 +243,9 @@ class ClusterCommunicationModule(): result = container.wait() status_code = result['StatusCode'] - print(status_code, type(status_code)) + if status_code != 0: + print(f'[ERROR] some error occur in the docker container, error_code={status_code}') + container.remove() def scatter_container(self, image_name, train): def master_run(image_name):