# You can change the base image to whatever you want.
# Here, I use torch 2.2.2 + CUDA 11.8.
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime

# You should open port 21046 for communication between multiple machines.
EXPOSE 21046
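# Note: EXPOSE only documents the port. At run time you still need to publish
# it (e.g. `docker run -p 21046:21046 ...`) or, more simply for multi-node
# training, run the container with `--network host`.
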
# Now, I create the directory '/train' to hold the training code.
RUN mkdir /train

# You should put your training code here.
# I use COPY to copy my code into the image.
# Or, you can `apt update && apt install git` and use git to pull your code
# from GitHub, Gitea, or GitLab, as sketched below.
COPY ./*.py /train/
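# A sketch of the git-based alternative (the repository URL is a placeholder;
# replace it with your own):
# RUN apt update && apt install -y git \
#  && git clone https://github.com/<your-user>/<your-repo>.git /train
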
# Training settings
ENV GPU_NUM=1
ENV NODE_NUM=1
ENV NODE_RANK=0
ENV MASTER_IP="127.0.0.1"
ENV MASTER_PORT=21046
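# These defaults can be overridden when you start the container with
# `docker run -e ...`; see the example usage at the end of this file.
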
WORKDIR /train

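# Launch distributed training with torchrun (torch.distributed.run).
# NCCL_SOCKET_IFNAME=eth0 pins NCCL to the eth0 interface; change it if your
# machines use a different interface name.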
CMD export NCCL_SOCKET_IFNAME=eth0 && python3 -m torch.distributed.run --nproc_per_node=$GPU_NUM --nnodes=$NODE_NUM --node_rank=$NODE_RANK --rdzv_id=$MASTER_PORT --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:$MASTER_PORT main.py
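
# Example usage (a sketch; the image name, IP address, and node counts are
# placeholders for your own setup):
#   docker build -t ddp-train .
#   # On the master node (rank 0):
#   docker run --gpus all --network host \
#     -e GPU_NUM=4 -e NODE_NUM=2 -e NODE_RANK=0 \
#     -e MASTER_IP=192.168.1.10 -e MASTER_PORT=21046 ddp-train
#   # On the second node (rank 1), keep MASTER_IP pointed at the master node:
#   docker run --gpus all --network host \
#     -e GPU_NUM=4 -e NODE_NUM=2 -e NODE_RANK=1 \
#     -e MASTER_IP=192.168.1.10 -e MASTER_PORT=21046 ddp-train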