feat: build docker swarm cluster

This commit is contained in:
Ting-Jun Wang 2024-05-30 14:55:59 +08:00
parent 7bf0f39d39
commit da1c216d3b
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354
2 changed files with 25 additions and 8 deletions

View File

@ -1,5 +1,6 @@
import socket import socket
import json import json
import docker
class ClusterCommunicationModule(): class ClusterCommunicationModule():
def __init__(self, host, port, node_manager): def __init__(self, host, port, node_manager):
@ -51,6 +52,10 @@ class ClusterCommunicationModule():
self.client_sock.connect((addr[0], self.port)) self.client_sock.connect((addr[0], self.port))
self.client_sock.send('[CHECK]'.encode()) self.client_sock.send('[CHECK]'.encode())
# join docker swarm cluster
docker_token = self.client_sock.recv(1024).decode().split()[-1]
print("Receive Docker Swarm Join_Token=", docker_token)
# remove 'add node' # remove 'add node'
self.node_manager.actions = self.node_manager.actions[1:] self.node_manager.actions = self.node_manager.actions[1:]
@ -76,18 +81,26 @@ class ClusterCommunicationModule():
data = self.client_sock.recv(1024) data = self.client_sock.recv(1024)
data = data.decode() data = data.decode()
if data == '[REJECT]':
print(f"{host} reject.")
status = False
elif data == '[ACCEPT]':
self.node_manager.status = 'master'
print(f"{host} accept.")
# build docker swarm
self.node_manager.docker_client.init(advertise_addr="eth0", listen_addr=f"{self.host}:2377", force_new_cluster=True)
token = self.node_manager.docker_client.attrs['JoinTokens']['Worker']
self.client_sock.send(f'[DOCKER_TOKEN] {token}'.encode())
status = True
# close client_sock, and waiting for the worker connect to our self.sock # close client_sock, and waiting for the worker connect to our self.sock
self.client_sock.close() self.client_sock.close()
self.client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
if data == '[REJECT]': return status
print(f"{host} reject.")
return False
elif data == '[ACCEPT]':
self.node_manager.status = 'master'
print(f"{host} accept.")
return True
def cluster_info(self): def cluster_info(self):
ans = [] ans = []

View File

@ -2,6 +2,7 @@ import threading
from src.communication import ServiceExplorationModule, ClusterCommunicationModule from src.communication import ServiceExplorationModule, ClusterCommunicationModule
import torch import torch
import time import time
import docker
class NodeManager(): class NodeManager():
def __init__(self, host, port): def __init__(self, host, port):
@ -21,6 +22,9 @@ class NodeManager():
# let all client can know which IP address has our service so that it can link to. # let all client can know which IP address has our service so that it can link to.
self.service_exploration_module = ServiceExplorationModule(host, port+1, self) self.service_exploration_module = ServiceExplorationModule(host, port+1, self)
# docker client
self.docker_client = docker.from_env()
time.sleep(2) time.sleep(2)
def get_GPU_info(self): def get_GPU_info(self):