feat: add training Dockerfile

This commit is contained in:
Ting-Jun Wang 2024-05-29 04:15:35 +08:00
parent cb58840988
commit 17949bd1a6
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354

26
train/Dockerfile Normal file
View File

@ -0,0 +1,26 @@
# Base image: the official PyTorch 2.2.2 runtime tag built for CUDA 11.8 and
# cuDNN 8. Swap the tag if you need a different PyTorch/CUDA combination.
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime

# Port 21046 is used for communication between multiple machines, so it has to
# be reachable from the other nodes. EXPOSE only documents the port; it still
# has to be published or routed when the container is started.
EXPOSE 21046/tcp
# The training code lives in /train.
# COPY creates the destination directory when it is missing, so no separate
# `RUN mkdir` layer is needed. The trailing slash on the destination is
# required because the wildcard may match several files, in which case the
# destination must be a directory.
# Alternatively, install git (`apt-get update && apt-get install -y git`) and
# pull the code from GitHub, Gitea, or GitLab instead of copying it in.
COPY ./*.py /train/
# Training settings. Override any of them when starting the container, e.g.
# `docker run -e NODE_NUM=2 -e NODE_RANK=1 -e MASTER_IP=<address> ...`.
#   GPU_NUM      value passed to --nproc_per_node
#   NODE_NUM     value passed to --nnodes
#   NODE_RANK    value passed to --node_rank
#   MASTER_IP    host part of --rdzv_endpoint
#   MASTER_PORT  port part of --rdzv_endpoint, also reused as --rdzv_id
# Written in key=value form; the space-separated `ENV key value` form is legacy.
ENV GPU_NUM=1 \
    NODE_NUM=1 \
    NODE_RANK=0 \
    MASTER_IP="127.0.0.1" \
    MASTER_PORT=21046
# Run the launcher from the directory that holds the training code.
WORKDIR /train

# Network interface NCCL should use for its sockets. Declared as ENV (rather
# than exported inside CMD) so the eth0 default can be overridden with
# `docker run -e NCCL_SOCKET_IFNAME=<iface>`.
ENV NCCL_SOCKET_IFNAME=eth0

# Start main.py through torch.distributed.run using the training settings from
# the environment. Exec (JSON) form plus `exec` makes the Python launcher
# replace the shell as PID 1, so it receives SIGTERM from `docker stop`
# directly; the shell is only there to expand the variables, which are quoted
# to avoid word splitting.
CMD ["/bin/sh", "-c", "exec python3 -m torch.distributed.run --nproc_per_node=\"$GPU_NUM\" --nnodes=\"$NODE_NUM\" --node_rank=\"$NODE_RANK\" --rdzv_id=\"$MASTER_PORT\" --rdzv_backend=c10d --rdzv_endpoint=\"$MASTER_IP:$MASTER_PORT\" main.py"]