From da1c216d3bd6a1c641a5c17759b8212798ca451d Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Thu, 30 May 2024 14:55:59 +0800 Subject: [PATCH] feat: build docker swarm cluster --- src/communication.py | 29 +++++++++++++++++++++-------- src/node_manager.py | 4 ++++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/communication.py b/src/communication.py index da3cf46..85380ed 100644 --- a/src/communication.py +++ b/src/communication.py @@ -1,5 +1,6 @@ import socket import json +import docker class ClusterCommunicationModule(): def __init__(self, host, port, node_manager): @@ -51,6 +52,10 @@ class ClusterCommunicationModule(): self.client_sock.connect((addr[0], self.port)) self.client_sock.send('[CHECK]'.encode()) + # join docker swarm cluster + docker_token = self.client_sock.recv(1024).decode().split()[-1] + print("Receive Docker Swarm Join_Token=", docker_token) + # remove 'add node' self.node_manager.actions = self.node_manager.actions[1:] @@ -76,18 +81,26 @@ class ClusterCommunicationModule(): data = self.client_sock.recv(1024) data = data.decode() + if data == '[REJECT]': + print(f"{host} reject.") + status = False + elif data == '[ACCEPT]': + self.node_manager.status = 'master' + print(f"{host} accept.") + + # build docker swarm + self.node_manager.docker_client.init(advertise_addr="eth0", listen_addr=f"{self.host}:2377", force_new_cluster=True) + token = self.node_manager.docker_client.attrs['JoinTokens']['Worker'] + self.client_sock.send(f'[DOCKER_TOKEN] {token}'.encode()) + + status = True + # close client_sock, and waiting for the worker connect to our self.sock self.client_sock.close() self.client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + - if data == '[REJECT]': - print(f"{host} reject.") - return False - elif data == '[ACCEPT]': - self.node_manager.status = 'master' - - print(f"{host} accept.") - return True + return status def cluster_info(self): ans = [] diff --git a/src/node_manager.py b/src/node_manager.py index aa970be..e823f92 100644 --- a/src/node_manager.py +++ b/src/node_manager.py @@ -2,6 +2,7 @@ import threading from src.communication import ServiceExplorationModule, ClusterCommunicationModule import torch import time +import docker class NodeManager(): def __init__(self, host, port): @@ -21,6 +22,9 @@ class NodeManager(): # let all client can know which IP address has our service so that it can link to. self.service_exploration_module = ServiceExplorationModule(host, port+1, self) + # docker client + self.docker_client = docker.from_env() + time.sleep(2) def get_GPU_info(self):