docs: add README
This commit is contained in:
parent
0a287e3b46
commit
cf00e840a0
49
README.md
Normal file
49
README.md
Normal file
@ -0,0 +1,49 @@
|
||||
# Distributed-Training-Example
|
||||
|
||||
## Developing (not in the docker container)
|
||||
|
||||
### Start Downloading
|
||||
```
|
||||
mkdir ./dataset_dir
|
||||
cd ./dataset_dir
|
||||
wget https://www.cs.toronto.edu/\~kriz/cifar-10-python.tar.gz
|
||||
tar zxvf cifar-10-python.tar.gz
|
||||
```
|
||||
|
||||
### Start Training
|
||||
```
|
||||
python3 -m torch.distributed.run \
|
||||
--nproc_per_node=1 \
|
||||
--nnodes=1 \
|
||||
--node_rank=0 \
|
||||
--rdzv_id=21046 \
|
||||
--rdzv_backend=c10d \
|
||||
--rdzv_endpoint=127.0.0.1:21046 \
|
||||
main.py
|
||||
```
|
||||
|
||||
|
||||
## Testing (in the docker container)
|
||||
|
||||
### Start Downloading Image
|
||||
```
|
||||
mkdir ./dataset_dir
|
||||
docker run -it --rm -v ./dataset_dir:/dataset YOUR_IMAGE
|
||||
```
|
||||
|
||||
### Start Training Image
|
||||
```
|
||||
docker run -it \
|
||||
--net=host \
|
||||
--runtime=nvidia \
|
||||
--gpus all \
|
||||
-v ./dataset_dir:/dataset \
|
||||
-v ./output:/output \
|
||||
-e GPU_NUM=1 \
|
||||
-e NODE_NUM=2 \
|
||||
-e NODE_RANK=0 \
|
||||
-e MASTER_IP=192.168.1.46 \
|
||||
-e MASTER_PORT=21046 \
|
||||
snsd0805/cifar100-train:v2
|
||||
```
|
||||
|
||||
Loading…
Reference in New Issue
Block a user