feat: add training Dockerfile

2024-05-29 04:15:35 +08:00 · 2024-05-29 04:15:35 +08:00 · 17949bd1a6
commit 17949bd1a6
parent cb58840988
1 changed files with 26 additions and 0 deletions
--- a/train/Dockerfile
+++ b/train/Dockerfile
@ -0,0 +1,26 @@
+# Base image: swap this for any image that provides the PyTorch/CUDA build you need.
+# The default is PyTorch 2.2.2 with CUDA 11.8 and cuDNN 8 (runtime variant).
+FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+
+# Port 21046 carries communication between machines; EXPOSE only documents it, so open/publish it at run time.
+EXPOSE 21046
+
+# Create the '/train' directory that will hold the training code.
+RUN mkdir /train
+
+# Bring your training code into the image.
+# COPY places the local *.py files in /train/; the trailing slash marks the
+# destination as a directory, which COPY requires when a wildcard matches several files.
+# Alternatively, install git in the image and pull the code from GitHub, Gitea, or GitLab.
+COPY ./*.py /train/
+
+# Training settings consumed by the launch command (single GPU, single node by default); override with `docker run -e`.
+ENV GPU_NUM=1
+ENV NODE_NUM=1
+ENV NODE_RANK=0
+ENV MASTER_IP="127.0.0.1"
+ENV MASTER_PORT=21046
+
+WORKDIR /train
+# Shell form is kept for $VAR expansion; `exec` replaces the shell so the launcher is PID 1 and receives `docker stop` signals.
+CMD export NCCL_SOCKET_IFNAME=eth0 && exec python3 -m torch.distributed.run --nproc_per_node="$GPU_NUM" --nnodes="$NODE_NUM" --node_rank="$NODE_RANK" --rdzv_id="$MASTER_PORT" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:$MASTER_PORT" main.py