Distributed-Training-Example/train/Dockerfile

# You can change the base image to whatever you want.
# Here, I use PyTorch 2.2.2 with CUDA 11.8.
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
# You should open port 21046 for communication between multiple machines.
EXPOSE 21046
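# Note that EXPOSE alone does not open anything: it only documents the port.
# To reach it from other machines you must publish it at run time, e.g.
# `docker run -p 21046:21046 ...`, or use `--network host`, which is common
# for NCCL multi-node training (see the usage sketch at the end of this file).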
# Now, I create the directory '/train' for the training code.
RUN mkdir /train
# You should get your training code into the image here.
# I use COPY to copy my code into the image.
# Or, you can `apt update && apt install git` and use git to pull your code
# from GitHub, Gitea, or GitLab (a sketch follows the COPY below).
COPY ./*.py /train/
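# For example, the git alternative might look like this (a sketch; replace
# the placeholder repository URL with your own):
# RUN apt update && apt install -y git && \
#     git clone https://github.com/<your-name>/<your-repo>.git /train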
# Training settings (defaults for single-node, single-GPU training)
ENV GPU_NUM=1
ENV NODE_NUM=1
ENV NODE_RANK=0
ENV MASTER_IP="127.0.0.1"
ENV MASTER_PORT=21046
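# Override any of these per node at run time, e.g. (a sketch; `dist-train`
# is a hypothetical image tag):
#   docker run -e GPU_NUM=4 dist-train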
WORKDIR /train
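# NCCL_SOCKET_IFNAME pins NCCL to one network interface. 'eth0' below is an
# assumption about the host; change it if your machines use a different
# interface name (check with `ip addr`).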
CMD export NCCL_SOCKET_IFNAME=eth0 && \
    python3 -m torch.distributed.run \
        --nproc_per_node=$GPU_NUM --nnodes=$NODE_NUM --node_rank=$NODE_RANK \
        --rdzv_id=$MASTER_PORT --rdzv_backend=c10d \
        --rdzv_endpoint=$MASTER_IP:$MASTER_PORT \
        main.py
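# Usage sketch for a two-node run. Assumptions (not from this file): the image
# is tagged `dist-train`, each node has 4 GPUs, and node 0's address is the
# placeholder 192.0.2.10. All nodes point --rdzv_endpoint at node 0 via
# MASTER_IP; only NODE_RANK differs between them.
#   docker build -t dist-train .
#   # on node 0:
#   docker run --network host -e GPU_NUM=4 -e NODE_NUM=2 -e NODE_RANK=0 \
#     -e MASTER_IP=192.0.2.10 dist-train
#   # on node 1:
#   docker run --network host -e GPU_NUM=4 -e NODE_NUM=2 -e NODE_RANK=1 \
#     -e MASTER_IP=192.0.2.10 dist-train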