docs: update README about docker overlay network

This commit is contained in:
Ting-Jun Wang 2024-05-29 04:31:49 +08:00
parent bdaa1d4846
commit e5c97ca8a1
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354

View File

@ -25,6 +25,18 @@ python3 -m torch.distributed.run \
## Testing (in the docker container) ## Testing (in the docker container)
### Build overlay network in Docker
```
# Master: Init the cluster
docker swarm init --advertise-addr=192.168.1.46 --listen-addr=192.168.1.46:2377
# Worker: Join the cluster
docker swarm join --token TOKEN_FROM_MASTER 192.168.1.46:2377
# Master: Create an overlay network in Docker
docker network create --driver overlay --attachable train-net
```
### Start Downloading Image ### Start Downloading Image
``` ```
mkdir ./dataset_dir mkdir ./dataset_dir
@ -34,16 +46,18 @@ docker run -it --rm -v ./dataset_dir:/dataset YOUR_IMAGE
### Start Training Image ### Start Training Image
``` ```
docker run -it \ docker run -it \
--net=host \ --rm \
--network train-net \
--runtime=nvidia \ --runtime=nvidia \
--gpus all \ --gpus all \
--name train-0 \
-v ./dataset_dir:/dataset \ -v ./dataset_dir:/dataset \
-v ./output:/output \ -v ./output:/output \
-e GPU_NUM=1 \ -e GPU_NUM=1 \
-e NODE_NUM=2 \ -e NODE_NUM=2 \
-e NODE_RANK=0 \ -e NODE_RANK=0 \
-e MASTER_IP=192.168.1.46 \ -e MASTER_IP=train-0 \
-e MASTER_PORT=21046 \ -e MASTER_PORT=21046 \
snsd0805/cifar100-train:v2 snsd0805/cifar100-train:v3 bash
``` ```