docs: update README about docker relay network
This commit is contained in:
parent
bdaa1d4846
commit
e5c97ca8a1
20
README.md
20
README.md
@@ -25,6 +25,18 @@ python3 -m torch.distributed.run \
|
|||||||
|
|
||||||
## Testing (in the docker container)
|
## Testing (in the docker container)
|
||||||
|
|
||||||
|
### Build overlay network in Docker
|
||||||
|
```
|
||||||
|
# Master: Init the cluster
|
||||||
|
docker swarm init --advertise-addr=192.168.1.46 --listen-addr=192.168.1.46:2377
|
||||||
|
|
||||||
|
# Worker: Join the cluster
|
||||||
|
docker swarm join --token TOKEN_FROM_MASTER 192.168.1.46:2377
|
||||||
|
|
||||||
|
# Master: Create an overlay network in Docker
|
||||||
|
docker network create --driver overlay --attachable train-net
|
||||||
|
```
|
||||||
|
|
||||||
### Start Downloading Image
|
### Start Downloading Image
|
||||||
```
|
```
|
||||||
mkdir ./dataset_dir
|
mkdir ./dataset_dir
|
||||||
@@ -34,16 +46,18 @@ docker run -it --rm -v ./dataset_dir:/dataset YOUR_IMAGE
|
|||||||
### Start Training Image
|
### Start Training Image
|
||||||
```
|
```
|
||||||
docker run -it \
|
docker run -it \
|
||||||
--net=host \
|
--rm \
|
||||||
|
--network train-net \
|
||||||
--runtime=nvidia \
|
--runtime=nvidia \
|
||||||
--gpus all \
|
--gpus all \
|
||||||
|
--name train-0 \
|
||||||
-v ./dataset_dir:/dataset \
|
-v ./dataset_dir:/dataset \
|
||||||
-v ./output:/output \
|
-v ./output:/output \
|
||||||
-e GPU_NUM=1 \
|
-e GPU_NUM=1 \
|
||||||
-e NODE_NUM=2 \
|
-e NODE_NUM=2 \
|
||||||
-e NODE_RANK=0 \
|
-e NODE_RANK=0 \
|
||||||
-e MASTER_IP=192.168.1.46 \
|
-e MASTER_IP=train-0 \
|
||||||
-e MASTER_PORT=21046 \
|
-e MASTER_PORT=21046 \
|
||||||
snsd0805/cifar100-train:v2
|
snsd0805/cifar100-train:v3 bash
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user