# syntax=docker/dockerfile:1

# Base image: PyTorch 2.2.2 with CUDA 11.8 + cuDNN 8 (runtime variant).
# Change the tag here if you need a different torch/CUDA combination.
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime

# Rendezvous port used for communication between multiple machines.
# EXPOSE is documentation only — publish it with `docker run -p 21046:21046`.
EXPOSE 21046

# Training code lives in /train. WORKDIR creates the directory if it does
# not exist, so no separate `RUN mkdir` layer is needed.
WORKDIR /train

# Distributed-training defaults; override any of these at run time with
# `docker run -e VAR=value ...`:
#   GPU_NUM     - GPUs per node (torchrun --nproc_per_node)
#   NODE_NUM    - total number of nodes (torchrun --nnodes)
#   NODE_RANK   - this node's rank (0 on the master node)
#   MASTER_IP   - address of the rank-0 rendezvous endpoint
#   MASTER_PORT - rendezvous port (must match the EXPOSE above)
ENV GPU_NUM=1 \
    NODE_NUM=1 \
    NODE_RANK=0 \
    MASTER_IP="127.0.0.1" \
    MASTER_PORT=21046

# Copy the training scripts into the image (done after ENV so editing the
# code does not invalidate earlier layers unnecessarily). Alternatively,
# `apt-get update && apt-get install -y git` and pull the code from
# GitHub, Gitea, or GitLab. Trailing slash: with a glob source the
# destination must be a directory.
COPY ./*.py /train/

# Shell form is required so $GPU_NUM/$NODE_NUM/... expand at container
# start; `exec` replaces the shell so torchrun runs as PID 1 and receives
# SIGTERM from `docker stop`. NCCL_SOCKET_IFNAME pins NCCL to the eth0
# interface for inter-node traffic.
CMD export NCCL_SOCKET_IFNAME=eth0 && \
    exec python3 -m torch.distributed.run \
        --nproc_per_node=$GPU_NUM \
        --nnodes=$NODE_NUM \
        --node_rank=$NODE_RANK \
        --rdzv_id=$MASTER_PORT \
        --rdzv_backend=c10d \
        --rdzv_endpoint=$MASTER_IP:$MASTER_PORT \
        main.py