feat: communication about docker image information
This commit is contained in:
parent
3184669630
commit
55211c3856
@ -1,4 +1,6 @@
|
|||||||
|
from os.path import isdir
|
||||||
import socket
|
import socket
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
import docker
|
import docker
|
||||||
@ -113,10 +115,14 @@ class ClusterCommunicationModule():
|
|||||||
self.client_sock.send('[START_LISTEN_TASK_CHECK] {}'.encode())
|
self.client_sock.send('[START_LISTEN_TASK_CHECK] {}'.encode())
|
||||||
print("The master has started listening for new task from Sepolia testnet...")
|
print("The master has started listening for new task from Sepolia testnet...")
|
||||||
elif command == '[RUN_CONTAINER]':
|
elif command == '[RUN_CONTAINER]':
|
||||||
image = json.loads(args)['image']
|
args = json.loads(args)
|
||||||
print(f"[RUN_CONTAINER] {image}")
|
image = args['image']
|
||||||
time.sleep(3)
|
train = args['train']
|
||||||
|
|
||||||
|
print(f"[RUN_CONTAINER] {image}")
|
||||||
|
self.run_container(image, train, args)
|
||||||
|
|
||||||
|
print(f"[RUN_CONTAINER] {image}")
|
||||||
self.client_sock.send('[RUN_CONTAINER_SUCCESS] {}'.encode())
|
self.client_sock.send('[RUN_CONTAINER_SUCCESS] {}'.encode())
|
||||||
|
|
||||||
return True
|
return True
|
||||||
@ -185,18 +191,75 @@ class ClusterCommunicationModule():
|
|||||||
for conn in self.worker_conns:
|
for conn in self.worker_conns:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
def run_container(self, image_name):
|
def run_container(self, image_name, train, train_args=None):
    """Run a Docker image on this node, stream its logs, and wait for exit.

    Ensures ``./dataset_dir`` and ``./output`` exist, starts the container
    detached, prints its short id and every log line, then prints the exit
    status code (and its type) once the container stops.

    Args:
        image_name: Image reference passed to ``containers.run``.
        train: When falsy, the container only gets the dataset volume.
            When truthy, it additionally gets the output volume, the
            ``train-net`` network, the ``nvidia`` runtime with all GPUs
            requested, the name ``train-<index>``, and the environment
            variables listed below.
        train_args: Mapping read only when ``train`` is truthy:
            - index: used for the container name and ``NODE_RANK``.
            - node_num: used for ``NODE_NUM``.

    Raises:
        KeyError: If ``train`` is truthy and ``train_args`` lacks
            ``index`` or ``node_num``.
    """
    # Avoid a shared mutable default; None stands in for "no arguments".
    train_args = {} if train_args is None else train_args

    for path in ('./dataset_dir', './output'):
        if not os.path.isdir(path):
            os.mkdir(path)
            print(f"Create {path} dir.")

    # NOTE(review): bare keys such as 'dataset_dir' are interpreted by Docker
    # as named volumes, not as the ./dataset_dir directory created above.
    # If a bind mount of the local directory is intended, pass an absolute
    # host path instead -- confirm before changing.
    volumes = {'dataset_dir': {'bind': '/dataset', 'mode': 'rw'}}
    run_kwargs = {'volumes': volumes, 'detach': True}

    if train:
        volumes['output'] = {'bind': '/output', 'mode': 'rw'}
        run_kwargs.update(
            network='train-net',
            runtime='nvidia',
            device_requests=[
                docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])
            ],
            name=f'train-{train_args["index"]}',
            # docker-py's keyword is `environment`; `env` is rejected as an
            # unexpected keyword argument before the container is created.
            environment={
                'GPU_NUM': self.node_manager.GPU_num,
                'NODE_NUM': train_args['node_num'],
                'NODE_RANK': train_args['index'],
                'MASTER_IP': 'train-0',
                'MASTER_PORT': 21046,
            },
        )

    container = self.node_manager.docker_client.containers.run(
        image_name, **run_kwargs
    )

    print(container.short_id)
    # Streaming the logs blocks until the container stops producing output.
    for line in container.logs(stream=True):
        print(line.strip().decode())

    result = container.wait()
    status_code = result['StatusCode']
    print(status_code, type(status_code))
|
||||||
|
|
||||||
|
def scatter_container(self, image_name, train=False):
|
||||||
def master_run(image_name):
|
def master_run(image_name):
|
||||||
print("[Master] run")
|
print("[Master] run")
|
||||||
for i in range(5):
|
train_args = {
|
||||||
print(i)
|
'index': 0,
|
||||||
time.sleep(1)
|
'node_num': len(self.worker_conns)+1
|
||||||
|
}
|
||||||
|
self.run_container(image_name, train, train_args)
|
||||||
print("[Master] finished")
|
print("[Master] finished")
|
||||||
|
|
||||||
def send_and_wait(conn_index, conn, image_name):
|
def send_and_wait(conn_index, conn, image_name):
|
||||||
try:
|
try:
|
||||||
# build command
|
# build command
|
||||||
data = {'image': image_name}
|
data = {
|
||||||
|
'image': image_name,
|
||||||
|
'train': train,
|
||||||
|
'index': conn_index+1,
|
||||||
|
'node_num': len(self.worker_conns)+1
|
||||||
|
}
|
||||||
command = '[RUN_CONTAINER] {}'.format(json.dumps(data))
|
command = '[RUN_CONTAINER] {}'.format(json.dumps(data))
|
||||||
|
|
||||||
# send
|
# send
|
||||||
@ -226,7 +289,7 @@ class ClusterCommunicationModule():
|
|||||||
thread.join()
|
thread.join()
|
||||||
|
|
||||||
# all finished
|
# all finished
|
||||||
print("[INFO] All workers finished.")
|
print("\n[INFO] All workers finished.")
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|||||||
@ -119,10 +119,10 @@ class NodeManager():
|
|||||||
print(f" - Training Image: {train_image}")
|
print(f" - Training Image: {train_image}")
|
||||||
'''
|
'''
|
||||||
|
|
||||||
data_image = "test/test"
|
data_image = "snsd0805/cifar10-dataset:v1"
|
||||||
|
|
||||||
# Start Downloading
|
# Start Downloading
|
||||||
self.cluster_communication_module.run_container(data_image)
|
self.cluster_communication_module.scatter_container(data_image)
|
||||||
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user