From 17949bd1a6bec397002faf7406efa36239c60d62 Mon Sep 17 00:00:00 2001
From: Ting-Jun Wang <levi900227@gmail.com>
Date: Wed, 29 May 2024 04:15:35 +0800
Subject: [PATCH] feat: add training Dockerfile

---
 train/Dockerfile | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 train/Dockerfile

diff --git a/train/Dockerfile b/train/Dockerfile
new file mode 100644
index 0000000..797ef8a
--- /dev/null
+++ b/train/Dockerfile
@@ -0,0 +1,26 @@
+# Base image; swap it for another one if your setup needs it.
+# This tag provides PyTorch 2.2.2 with CUDA 11.8 and cuDNN 8 (runtime variant).
+FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+
+# Port 21046 must be open so that multiple machines can communicate with each other.
+EXPOSE 21046/tcp
+
+# Create the directory '/train' that holds the training code.
+RUN mkdir /train
+
+# Bring the training code into the image. COPY is used here; alternatively,
+# `apt-get update && apt-get install -y git` and pull the code from GitHub,
+# Gitea, or GitLab. The destination ends with '/' because a wildcard source
+# may match several files, which requires a directory destination.
+COPY ./*.py /train/
+
+# Training settings read by CMD at start-up; override with `docker run -e NAME=value`.
+ENV GPU_NUM=1 \
+    NODE_NUM=1 \
+    NODE_RANK=0 \
+    MASTER_IP="127.0.0.1" \
+    MASTER_PORT=21046
+
+WORKDIR /train
+# `exec` replaces the shell with the launcher so container stop signals reach it; NCCL_SOCKET_IFNAME defaults to eth0.
+CMD ["/bin/sh", "-c", "export NCCL_SOCKET_IFNAME=\"${NCCL_SOCKET_IFNAME:-eth0}\" && exec python3 -m torch.distributed.run --nproc_per_node=\"$GPU_NUM\" --nnodes=\"$NODE_NUM\" --node_rank=\"$NODE_RANK\" --rdzv_id=\"$MASTER_PORT\" --rdzv_backend=c10d --rdzv_endpoint=\"$MASTER_IP:$MASTER_PORT\" main.py"]