From 17949bd1a6bec397002faf7406efa36239c60d62 Mon Sep 17 00:00:00 2001
From: Ting-Jun Wang
Date: Wed, 29 May 2024 04:15:35 +0800
Subject: [PATCH] feat: add training Dockerfile

---
 train/Dockerfile | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 train/Dockerfile

diff --git a/train/Dockerfile b/train/Dockerfile
new file mode 100644
--- /dev/null
+++ b/train/Dockerfile
@@ -0,0 +1,30 @@
+# Base image: swap in whichever image you need.
+# This one ships PyTorch 2.2.2 built against CUDA 11.8.
+FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+
+# Port 21046 must be reachable for communication between multiple machines.
+# EXPOSE only documents the port; publish it (or use host networking) at run time.
+EXPOSE 21046
+
+# The training code lives in /train; WORKDIR creates the directory if missing.
+WORKDIR /train
+
+# Bring your training code into the image here.
+# COPY is used below; alternatively `apt-get update && apt-get install -y git`
+# and pull the code from GitHub, Gitea, or GitLab.
+# The trailing slash marks the destination as a directory, which COPY
+# requires when the wildcard matches more than one file.
+COPY ./*.py /train/
+
+# Training settings; override any of them with `docker run -e NAME=value`.
+# NCCL_SOCKET_IFNAME defaults to eth0, the value previously hard-coded in CMD.
+ENV GPU_NUM=1 \
+    NODE_NUM=1 \
+    NODE_RANK=0 \
+    MASTER_IP="127.0.0.1" \
+    MASTER_PORT=21046 \
+    NCCL_SOCKET_IFNAME=eth0
+
+# Exec form with an explicit shell: the variables above are still expanded, and
+# `exec` replaces the shell so the launcher is PID 1 and gets `docker stop` signals.
+CMD ["/bin/sh", "-c", "exec python3 -m torch.distributed.run --nproc_per_node=\"$GPU_NUM\" --nnodes=\"$NODE_NUM\" --node_rank=\"$NODE_RANK\" --rdzv_id=\"$MASTER_PORT\" --rdzv_backend=c10d --rdzv_endpoint=\"$MASTER_IP:$MASTER_PORT\" main.py"]