Compare commits

...

3 Commits

3 changed files with 45 additions and 4 deletions

View File

@ -25,6 +25,18 @@ python3 -m torch.distributed.run \
## Testing (in the docker container) ## Testing (in the docker container)
### Build overlay network in Docker
```
# Master: Init the cluster
docker swarm init --advertise-addr=192.168.1.46 --listen-addr=192.168.1.46:2377
# Worker: Join the cluster
docker swarm join --token TOKEN_FROM_MASTER 192.168.1.46:2377
# Master: Create an overlay network in Docker
docker network create --driver overlay --attachable train-net
```
### Start Downloading Image ### Start Downloading Image
``` ```
mkdir ./dataset_dir mkdir ./dataset_dir
@ -34,16 +46,18 @@ docker run -it --rm -v ./dataset_dir:/dataset YOUR_IMAGE
### Start Training Image ### Start Training Image
``` ```
docker run -it \ docker run -it \
--net=host \ --rm \
--network train-net \
--runtime=nvidia \ --runtime=nvidia \
--gpus all \ --gpus all \
--name train-0 \
-v ./dataset_dir:/dataset \ -v ./dataset_dir:/dataset \
-v ./output:/output \ -v ./output:/output \
-e GPU_NUM=1 \ -e GPU_NUM=1 \
-e NODE_NUM=2 \ -e NODE_NUM=2 \
-e NODE_RANK=0 \ -e NODE_RANK=0 \
-e MASTER_IP=192.168.1.46 \ -e MASTER_IP=train-0 \
-e MASTER_PORT=21046 \ -e MASTER_PORT=21046 \
snsd0805/cifar100-train:v2 snsd0805/cifar100-train:v3 bash
``` ```

26
train/Dockerfile Normal file
View File

@ -0,0 +1,26 @@
# syntax=docker/dockerfile:1
# You can change what base image you want.
# Here I use torch 2.2.2 + CUDA 11.8 (runtime variant, pinned tag).
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime

# Port 21046 is used for the c10d rendezvous between multiple machines.
# EXPOSE is documentation only — publish the port (or attach the container
# to an overlay network) at `docker run` time.
EXPOSE 21046

# Training code lives in /train. WORKDIR creates the directory if it does
# not exist, so an explicit `RUN mkdir` layer is unnecessary.
WORKDIR /train

# You should get your training code here.
# I use COPY to copy my codes into the image.
# Or, you can `apt-get update && apt-get install -y git` and use git to pull
# your code from GitHub, Gitea, or GitLab.
# NOTE: with multiple source files the destination must be a directory and
# must end with "/", otherwise the build fails.
COPY ./*.py /train/

# Default distributed-training settings; override per node with `docker run -e`.
# key=value form is required — the space-separated ENV form is deprecated.
ENV GPU_NUM=1 \
    NODE_NUM=1 \
    NODE_RANK=0 \
    MASTER_IP="127.0.0.1" \
    MASTER_PORT=21046

# NCCL_SOCKET_IFNAME=eth0 pins NCCL to the container's network interface.
# Shell form is intentional here: $GPU_NUM etc. must expand at run time.
CMD export NCCL_SOCKET_IFNAME=eth0 && python3 -m torch.distributed.run \
      --nproc_per_node=$GPU_NUM \
      --nnodes=$NODE_NUM \
      --node_rank=$NODE_RANK \
      --rdzv_id=$MASTER_PORT \
      --rdzv_backend=c10d \
      --rdzv_endpoint=$MASTER_IP:$MASTER_PORT \
      main.py

View File

@ -31,6 +31,7 @@ class Trainer():
self.criterion = criterion self.criterion = criterion
def train(self, epoch_num): def train(self, epoch_num):
print("Start training...")
for epoch in range(epoch_num): for epoch in range(epoch_num):
self.model.train() self.model.train()
train_loss_sum = 0 train_loss_sum = 0
@ -52,7 +53,7 @@ class Trainer():
self.optimizer.step() self.optimizer.step()
self.optimizer.zero_grad() self.optimizer.zero_grad()
print(f"[DEVICE {self.global_rank}] EPOCH {epoch} loss={train_loss_sum/len(self.loader)} acc={(train_correct_sum/train_item_counter).item()}") print(f"[RANK {self.global_rank}] EPOCH {epoch} loss={train_loss_sum/len(self.loader)} acc={(train_correct_sum/train_item_counter).item()}")
def save(self, model_path): def save(self, model_path):
torch.save(self.model.state_dict(), model_path) torch.save(self.model.state_dict(), model_path)