From e5c97ca8a118223a55704fcbe7a5e6dbe98eb84a Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Wed, 29 May 2024 04:31:49 +0800 Subject: [PATCH] docs: update README about docker relay network --- README.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f5df3fa..de976d1 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,18 @@ python3 -m torch.distributed.run \ ## Testint (in the docker container) +### Build overlay network in Docker +``` +# Master: Init the cluster +docker swarm init --advertise-addr=192.168.1.46 --listen-addr=192.168.1.46:2377 + +# Worker: Join the cluster +docker swarm join --token TOKEN_FROM_MASTER 192.168.1.46:2377 + +# Master: Create an overlay network in Docker +docker network create --driver overlay --attachable train-net +``` + ### Start Downloading Image ``` mkdir ./dataset_dir @@ -34,16 +46,18 @@ docker run -it --rm -v ./dataset_dir:/dataset YOUR_IMAGE ### Start Training Image ``` docker run -it \ - --net=host \ + --rm \ + --network train-net \ --runtime=nvidia \ --gpus all \ + --name train-0 \ -v ./dataset_dir:/dataset \ -v ./output:/output \ -e GPU_NUM=1 \ -e NODE_NUM=2 \ -e NODE_RANK=0 \ - -e MASTER_IP=192.168.1.46 \ + -e MASTER_IP=train-0 \ -e MASTER_PORT=21046 \ - snsd0805/cifar100-train:v2 + snsd0805/cifar100-train:v3 bash ```