feat: build docker swarm cluster
This commit is contained in:
parent
7bf0f39d39
commit
da1c216d3b
@ -1,5 +1,6 @@
|
||||
import socket
|
||||
import json
|
||||
import docker
|
||||
|
||||
class ClusterCommunicationModule():
|
||||
def __init__(self, host, port, node_manager):
|
||||
@ -51,6 +52,10 @@ class ClusterCommunicationModule():
|
||||
self.client_sock.connect((addr[0], self.port))
|
||||
self.client_sock.send('[CHECK]'.encode())
|
||||
|
||||
# join docker swarm cluster
|
||||
docker_token = self.client_sock.recv(1024).decode().split()[-1]
|
||||
print("Receive Docker Swarm Join_Token=", docker_token)
|
||||
|
||||
# remove 'add node'
|
||||
self.node_manager.actions = self.node_manager.actions[1:]
|
||||
|
||||
@ -76,18 +81,26 @@ class ClusterCommunicationModule():
|
||||
data = self.client_sock.recv(1024)
|
||||
data = data.decode()
|
||||
|
||||
if data == '[REJECT]':
|
||||
print(f"{host} reject.")
|
||||
status = False
|
||||
elif data == '[ACCEPT]':
|
||||
self.node_manager.status = 'master'
|
||||
print(f"{host} accept.")
|
||||
|
||||
# build docker swarm
|
||||
self.node_manager.docker_client.init(advertise_addr="eth0", listen_addr=f"{self.host}:2377", force_new_cluster=True)
|
||||
token = self.node_manager.docker_client.attrs['JoinTokens']['Worker']
|
||||
self.client_sock.send(f'[DOCKER_TOKEN] {token}'.encode())
|
||||
|
||||
status = True
|
||||
|
||||
# close client_sock, and waiting for the worker connect to our self.sock
|
||||
self.client_sock.close()
|
||||
self.client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
|
||||
|
||||
if data == '[REJECT]':
|
||||
print(f"{host} reject.")
|
||||
return False
|
||||
elif data == '[ACCEPT]':
|
||||
self.node_manager.status = 'master'
|
||||
|
||||
print(f"{host} accept.")
|
||||
return True
|
||||
return status
|
||||
|
||||
def cluster_info(self):
|
||||
ans = []
|
||||
|
||||
@ -2,6 +2,7 @@ import threading
|
||||
from src.communication import ServiceExplorationModule, ClusterCommunicationModule
|
||||
import torch
|
||||
import time
|
||||
import docker
|
||||
|
||||
class NodeManager():
|
||||
def __init__(self, host, port):
|
||||
@ -21,6 +22,9 @@ class NodeManager():
|
||||
# let all client can know which IP address has our service so that it can link to.
|
||||
self.service_exploration_module = ServiceExplorationModule(host, port+1, self)
|
||||
|
||||
# docker client
|
||||
self.docker_client = docker.from_env()
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
def get_GPU_info(self):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user