import threading from src.communication import ServiceExplorationModule, ClusterCommunicationModule import torch import time import docker from web3 import Web3 from src.scheduler import Scheduler from constant import * class NodeManager(): def __init__(self, host, port): self.status = 'none' self.actions = [ {'explanation': 'Add another node into our cluster', 'function': 'add_node'}, {'explanation': 'Start waiting for the new task', 'function': 'start_listen_task'}, {'explanation': 'Exit', 'function': 'exit'}, ] self.get_GPU_info() print(f"You have {self.GPU} * {self.GPU_num}") # start Cluster Communication Module # let the nodes in the cluster can communicate self.cluster_communication_module = ClusterCommunicationModule(host, port, self) # start Service Exploration Module # let all client can know which IP address has our service so that it can link to. self.service_exploration_module = ServiceExplorationModule(host, port+1, self) # docker client self.docker_client = docker.from_env() # web3 provider # if this is master, it should init a Web object. self.w3 = None self.scheduler = None self.wallet = None time.sleep(2) def get_GPU_info(self): self.GPU_num = torch.cuda.device_count() assert self.GPU_num > 0, "Your computer doesn't have GPU resource" self.GPU = torch.cuda.get_device_name(0) for i in range(self.GPU_num): assert torch.cuda.get_device_name(i) == self.GPU, "Please provide same type of GPUs." def start_service(self): communication_thread = threading.Thread(target=self.cluster_communication_module.listen) communication_thread.daemon = True communication_thread.start() explore_service_thread = threading.Thread(target=self.service_exploration_module.listen) explore_service_thread.daemon = True explore_service_thread.start() def add_node(self): hosts = self.service_exploration_module.explore() if len(hosts) != 0: msg = "These are the nodes you can request for join into our cluster: \n" msg += '\n'.join([f'{index+1}) {host}' for index, host in enumerate(hosts)]) msg += '\n> ' choose = input(msg) try: choose = int(choose)-1 accept = self.cluster_communication_module.request(hosts[choose]) if accept: self.actions = [ {'explanation': 'Add another node into our cluster', 'function': 'add_node'}, {'explanation': 'Cluster info', 'function': 'cluster_info'}, {'explanation': 'Start waiting for the new task', 'function': 'start_listen_task'}, {'explanation': 'Exit', 'function': 'exit'}, ] except: print("=== FAIL ===") else: print("No other nodes in your subnet.") def start_listen_task(self): self.w3 = Web3(Web3.HTTPProvider(WEB3_PROVIDER_URL + WEB3_PROVIDER_KEY)) self.scheduler = Scheduler(self.w3, SCHEDULER_ADDR, SCHEDULER_ABI_FILE) self.wallet = self.w3.eth.account.from_key(WALLET_KEY) print(f"We have use {WEB3_PROVIDER_URL+WEB3_PROVIDER_KEY} as the web3 provider.") print(f"And we have load your wallet private key {WALLET_KEY} (address={self.wallet.address})") print() if self.w3.is_connected(): ''' print("[INFO] Connected Successfully.") print() # Register the cluster gpu_num = self.cluster_info() gpu_id = GPU_NAME2ID[self.GPU] print(f"\nWe will register this cluster({self.GPU} * {gpu_num})...") receipt = self.scheduler.register_cluster(self.wallet, gpu_id, gpu_num) event = self.scheduler.get_cluster_event(receipt) print("\n[INFO] Register our cluster succefully. \nThis is our cluster event on the blockchain: ") print(f" {event[0]['args']}") # start waiting self.cluster_communication_module.start_listen_task() print("\nWaiting for the new task from Sepolia testnet...") print("Ctrl+C to stop the waiting...") try: next_task = self.scheduler.listen_task(self.wallet.address) except: print("[INFO] stop the waiting") return # get task info task_index = next_task['args']['taskIndex'] data_image = next_task['args']['dataImage'] train_image = next_task['args']['trainImage'] print("\n[INFO] You Receive a new task:") print(f" - Download Image: {data_image}") print(f" - Training Image: {train_image}") ''' data_image = "test/test" # Start Downloading self.cluster_communication_module.run_container(data_image) else: print("[ERROR] Connected Failed.") print("Please check for your provider key & wallet key") def cluster_info(self): info = self.cluster_communication_module.cluster_info() print(f"\nThere are {len(info)+1} nodes in this cluster.") print("Cluster Info:") print(f" {self.service_exploration_module.host}(local) -> {self.GPU} * {self.GPU_num}") GPU_num = self.GPU_num for host in info: GPU_num += host['GPU_num'] print(f" {host['host']} -> {host['GPU']} * {host['GPU_num']}") return GPU_num def exit(self): self.cluster_communication_module.exit() self.service_exploration_module.exit()