feat: add training Dockerfile
This commit is contained in:
parent
cb58840988
commit
17949bd1a6
26
train/Dockerfile
Normal file
26
train/Dockerfile
Normal file
@ -0,0 +1,26 @@
|
||||
# You can change the base image to whichever one you need.
|
||||
# Here, I use PyTorch 2.2.2 built for CUDA 11.8.
|
||||
# The base image is referenced by an explicit version tag (not :latest); the tag
# names the PyTorch, CUDA and cuDNN versions and the "runtime" flavor.
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
|
||||
|
||||
# Port 21046 must be open for communication between multiple machines.
|
||||
# Same number as the MASTER_PORT default below. TCP is what EXPOSE assumes when
# no protocol is given; it is only spelled out here.
EXPOSE 21046/tcp
|
||||
|
||||
# Create the directory '/train' to hold the training code.
|
||||
# -p keeps the build from failing if the chosen base image already contains
# a /train directory.
RUN mkdir -p /train
|
||||
|
||||
# You should get your training code here.
|
||||
# I use COPY to copy my code into the image.
|
||||
# Or, you can `apt update && apt install git` and use git to pull your code
|
||||
# from GitHub, Gitea, or GitLab.
|
||||
# Copy every top-level .py file from the build context into /train.
# The destination ends with "/" so it is always treated as a directory, which
# Docker requires whenever the wildcard matches more than one source file.
COPY ./*.py /train/
|
||||
|
||||
# Training setting
|
||||
# Passed to --nproc_per_node in CMD. Uses key=value form; the space-separated
# ENV form is deprecated.
ENV GPU_NUM=1
|
||||
# Passed to --nnodes in CMD. Uses key=value form; the space-separated ENV form
# is deprecated.
ENV NODE_NUM=1
|
||||
# Passed to --node_rank in CMD. Uses key=value form; the space-separated ENV
# form is deprecated.
ENV NODE_RANK=0
|
||||
# Host part of --rdzv_endpoint in CMD. Uses key=value form; the space-separated
# ENV form is deprecated.
ENV MASTER_IP="127.0.0.1"
|
||||
# Port part of --rdzv_endpoint in CMD, also reused there as --rdzv_id.
# Uses key=value form; the space-separated ENV form is deprecated.
ENV MASTER_PORT=21046
|
||||
|
||||
# Make /train the working directory so that CMD can refer to main.py by a
# relative path.
WORKDIR /train
|
||||
# Start main.py through torch.distributed.run, configured by the environment
# variables GPU_NUM, NODE_NUM, NODE_RANK, MASTER_IP and MASTER_PORT.
# Exec-form CMD with an explicit `sh -c` keeps runtime $VAR expansion, and
# `exec` replaces the shell with the Python launcher so that it receives the
# signals sent to the container (e.g. from `docker stop`).
# NCCL_SOCKET_IFNAME defaults to eth0 and can be overridden with `docker run -e`.
CMD ["/bin/sh", "-c", "export NCCL_SOCKET_IFNAME=\"${NCCL_SOCKET_IFNAME:-eth0}\" && exec python3 -m torch.distributed.run --nproc_per_node=\"$GPU_NUM\" --nnodes=\"$NODE_NUM\" --node_rank=\"$NODE_RANK\" --rdzv_id=\"$MASTER_PORT\" --rdzv_backend=c10d --rdzv_endpoint=\"$MASTER_IP:$MASTER_PORT\" main.py"]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user