Compare commits
No commits in common. "e5c97ca8a118223a55704fcbe7a5e6dbe98eb84a" and "cb5884098871d05fbe1b4d366468e62f0710aa8d" have entirely different histories.
e5c97ca8a1
...
cb58840988
20
README.md
20
README.md
@ -25,18 +25,6 @@ python3 -m torch.distributed.run \
|
|||||||
|
|
||||||
## Testing (in the Docker container)
|
## Testing (in the Docker container)
|
||||||
|
|
||||||
### Build overlay network in Docker
|
|
||||||
```
|
|
||||||
# Master: Init the cluster
|
|
||||||
docker swarm init --advertise-addr=192.168.1.46 --listen-addr=192.168.1.46:2377
|
|
||||||
|
|
||||||
# Worker: Join the cluster
|
|
||||||
docker swarm join --token TOKEN_FROM_MASTER 192.168.1.46:2377
|
|
||||||
|
|
||||||
# Master: Create an overlay network in Docker
|
|
||||||
docker network create --driver overlay --attachable train-net
|
|
||||||
```
|
|
||||||
|
|
||||||
### Start Downloading Image
|
### Start Downloading Image
|
||||||
```
|
```
|
||||||
mkdir ./dataset_dir
|
mkdir ./dataset_dir
|
||||||
@ -46,18 +34,16 @@ docker run -it --rm -v ./dataset_dir:/dataset YOUR_IMAGE
|
|||||||
### Start Training Image
|
### Start Training Image
|
||||||
```
|
```
|
||||||
docker run -it \
|
docker run -it \
|
||||||
--rm \
|
--net=host \
|
||||||
--network train-net \
|
|
||||||
--runtime=nvidia \
|
--runtime=nvidia \
|
||||||
--gpus all \
|
--gpus all \
|
||||||
--name train-0 \
|
|
||||||
-v ./dataset_dir:/dataset \
|
-v ./dataset_dir:/dataset \
|
||||||
-v ./output:/output \
|
-v ./output:/output \
|
||||||
-e GPU_NUM=1 \
|
-e GPU_NUM=1 \
|
||||||
-e NODE_NUM=2 \
|
-e NODE_NUM=2 \
|
||||||
-e NODE_RANK=0 \
|
-e NODE_RANK=0 \
|
||||||
-e MASTER_IP=train-0 \
|
-e MASTER_IP=192.168.1.46 \
|
||||||
-e MASTER_PORT=21046 \
|
-e MASTER_PORT=21046 \
|
||||||
snsd0805/cifar100-train:v3 bash
|
snsd0805/cifar100-train:v2
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -1,26 +0,0 @@
|
|||||||
# You can change what base image you want
|
|
||||||
# Here, I use PyTorch 2.2.2 with CUDA 11.8
|
|
||||||
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
|
|
||||||
|
|
||||||
# You should open port 21046 for communication between multiple machines
|
|
||||||
EXPOSE 21046
|
|
||||||
|
|
||||||
# Now, I create the directory '/train' for saving the training codes.
|
|
||||||
RUN mkdir /train
|
|
||||||
|
|
||||||
# You should get your training code here.
|
|
||||||
# I use COPY to copy my codes into the image.
|
|
||||||
# Or, you can `apt update && apt install git` and use git to pull your code
|
|
||||||
# from GitHub, Gitea, or GitLab.
|
|
||||||
COPY ./*.py /train
|
|
||||||
|
|
||||||
# Training setting
|
|
||||||
ENV GPU_NUM 1
|
|
||||||
ENV NODE_NUM 1
|
|
||||||
ENV NODE_RANK 0
|
|
||||||
ENV MASTER_IP "127.0.0.1"
|
|
||||||
ENV MASTER_PORT 21046
|
|
||||||
|
|
||||||
WORKDIR /train
|
|
||||||
CMD export NCCL_SOCKET_IFNAME=eth0 && python3 -m torch.distributed.run --nproc_per_node=$GPU_NUM --nnodes=$NODE_NUM --node_rank=$NODE_RANK --rdzv_id=$MASTER_PORT --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:$MASTER_PORT main.py
|
|
||||||
|
|
||||||
@ -31,7 +31,6 @@ class Trainer():
|
|||||||
self.criterion = criterion
|
self.criterion = criterion
|
||||||
|
|
||||||
def train(self, epoch_num):
|
def train(self, epoch_num):
|
||||||
print("Start traininig...")
|
|
||||||
for epoch in range(epoch_num):
|
for epoch in range(epoch_num):
|
||||||
self.model.train()
|
self.model.train()
|
||||||
train_loss_sum = 0
|
train_loss_sum = 0
|
||||||
@ -53,7 +52,7 @@ class Trainer():
|
|||||||
|
|
||||||
self.optimizer.step()
|
self.optimizer.step()
|
||||||
self.optimizer.zero_grad()
|
self.optimizer.zero_grad()
|
||||||
print(f"[RANK {self.global_rank}] EPOCH {epoch} loss={train_loss_sum/len(self.loader)} acc={(train_correct_sum/train_item_counter).item()}")
|
print(f"[DEVICE {self.global_rank}] EPOCH {epoch} loss={train_loss_sum/len(self.loader)} acc={(train_correct_sum/train_item_counter).item()}")
|
||||||
|
|
||||||
def save(self, model_path):
|
def save(self, model_path):
|
||||||
torch.save(self.model.state_dict(), model_path)
|
torch.save(self.model.state_dict(), model_path)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user