feat: build docker swarm cluster
This commit is contained in:
parent
7bf0f39d39
commit
da1c216d3b
@ -1,5 +1,6 @@
|
|||||||
import socket
|
import socket
|
||||||
import json
|
import json
|
||||||
|
import docker
|
||||||
|
|
||||||
class ClusterCommunicationModule():
|
class ClusterCommunicationModule():
|
||||||
def __init__(self, host, port, node_manager):
|
def __init__(self, host, port, node_manager):
|
||||||
@ -51,6 +52,10 @@ class ClusterCommunicationModule():
|
|||||||
self.client_sock.connect((addr[0], self.port))
|
self.client_sock.connect((addr[0], self.port))
|
||||||
self.client_sock.send('[CHECK]'.encode())
|
self.client_sock.send('[CHECK]'.encode())
|
||||||
|
|
||||||
|
# join docker swarm cluster
|
||||||
|
docker_token = self.client_sock.recv(1024).decode().split()[-1]
|
||||||
|
print("Receive Docker Swarm Join_Token=", docker_token)
|
||||||
|
|
||||||
# remove 'add node'
|
# remove 'add node'
|
||||||
self.node_manager.actions = self.node_manager.actions[1:]
|
self.node_manager.actions = self.node_manager.actions[1:]
|
||||||
|
|
||||||
@ -76,18 +81,26 @@ class ClusterCommunicationModule():
|
|||||||
data = self.client_sock.recv(1024)
|
data = self.client_sock.recv(1024)
|
||||||
data = data.decode()
|
data = data.decode()
|
||||||
|
|
||||||
|
if data == '[REJECT]':
|
||||||
|
print(f"{host} reject.")
|
||||||
|
status = False
|
||||||
|
elif data == '[ACCEPT]':
|
||||||
|
self.node_manager.status = 'master'
|
||||||
|
print(f"{host} accept.")
|
||||||
|
|
||||||
|
# build docker swarm
|
||||||
|
self.node_manager.docker_client.init(advertise_addr="eth0", listen_addr=f"{self.host}:2377", force_new_cluster=True)
|
||||||
|
token = self.node_manager.docker_client.attrs['JoinTokens']['Worker']
|
||||||
|
self.client_sock.send(f'[DOCKER_TOKEN] {token}'.encode())
|
||||||
|
|
||||||
|
status = True
|
||||||
|
|
||||||
# close client_sock, and waiting for the worker connect to our self.sock
|
# close client_sock, and waiting for the worker connect to our self.sock
|
||||||
self.client_sock.close()
|
self.client_sock.close()
|
||||||
self.client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
self.client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
|
|
||||||
if data == '[REJECT]':
|
|
||||||
print(f"{host} reject.")
|
|
||||||
return False
|
|
||||||
elif data == '[ACCEPT]':
|
|
||||||
self.node_manager.status = 'master'
|
|
||||||
|
|
||||||
print(f"{host} accept.")
|
return status
|
||||||
return True
|
|
||||||
|
|
||||||
def cluster_info(self):
|
def cluster_info(self):
|
||||||
ans = []
|
ans = []
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import threading
|
|||||||
from src.communication import ServiceExplorationModule, ClusterCommunicationModule
|
from src.communication import ServiceExplorationModule, ClusterCommunicationModule
|
||||||
import torch
|
import torch
|
||||||
import time
|
import time
|
||||||
|
import docker
|
||||||
|
|
||||||
class NodeManager():
|
class NodeManager():
|
||||||
def __init__(self, host, port):
|
def __init__(self, host, port):
|
||||||
@ -21,6 +22,9 @@ class NodeManager():
|
|||||||
# let all client can know which IP address has our service so that it can link to.
|
# let all client can know which IP address has our service so that it can link to.
|
||||||
self.service_exploration_module = ServiceExplorationModule(host, port+1, self)
|
self.service_exploration_module = ServiceExplorationModule(host, port+1, self)
|
||||||
|
|
||||||
|
# docker client
|
||||||
|
self.docker_client = docker.from_env()
|
||||||
|
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
def get_GPU_info(self):
|
def get_GPU_info(self):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user