多节点分布式部署
更新时间:2025年3月6日 06:24
浏览:280
硬件环境:H800 80G 服务器 × 4 台(对应下文 --nnodes 4;--tp 32 即 4 台共 32 张 GPU,每台 8 卡)
主节点
# Head node (node-rank 0) of a 4-node SGLang deployment serving
# DeepSeek-R1 BF16 with tensor parallelism 32.
# This is the node that is given --host/--port, i.e. the HTTP API is bound
# to 0.0.0.0:80 on the host network here.
#
# NOTE: the image's default entrypoint is used on purpose. Overriding it with
# `--entrypoint /bin/bash` makes the container run `/bin/bash python3 -m ...`,
# so bash tries to interpret the python3 binary as a shell script, exits
# immediately, and --restart=always then restarts the container in a loop.
#
# Site-specific values to adjust per cluster:
#   - NCCL_SOCKET_IFNAME / GLOO_SOCKET_IFNAME (ibs32) and NCCL_IB_HCA (mlx5)
#   - --dist-init-addr (must be the same on every node; presumably the head
#     node's address - confirm for your network)
#   - the host path mounted at /model-cache
# NCCL_DEBUG=TRACE is very verbose; consider lowering it after bring-up.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
  python3 -m sglang.launch_server \
  --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 32 \
  --dist-init-addr 10.166.5.101:20000 \
  --nnodes 4 \
  --node-rank 0 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 80
从节点1
# Worker node (node-rank 1) of a 4-node SGLang deployment serving
# DeepSeek-R1 BF16 with tensor parallelism 32. It joins the group through
# --dist-init-addr, which must be identical on every node.
#
# NOTE: the image's default entrypoint is used on purpose. Overriding it with
# `--entrypoint /bin/bash` makes the container run `/bin/bash python3 -m ...`,
# so bash tries to interpret the python3 binary as a shell script, exits
# immediately, and --restart=always then restarts the container in a loop.
#
# Site-specific values to adjust per cluster: the interface names
# (ibs32, mlx5), --dist-init-addr, and the host path mounted at /model-cache.
# NCCL_DEBUG=TRACE is very verbose; consider lowering it after bring-up.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
  python3 -m sglang.launch_server \
  --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 32 \
  --dist-init-addr 10.166.5.101:20000 \
  --nnodes 4 \
  --node-rank 1 \
  --trust-remote-code
从节点2
# Worker node (node-rank 2) of a 4-node SGLang deployment serving
# DeepSeek-R1 BF16 with tensor parallelism 32. It joins the group through
# --dist-init-addr, which must be identical on every node.
#
# NOTE: the image's default entrypoint is used on purpose. Overriding it with
# `--entrypoint /bin/bash` makes the container run `/bin/bash python3 -m ...`,
# so bash tries to interpret the python3 binary as a shell script, exits
# immediately, and --restart=always then restarts the container in a loop.
#
# --served-model-name is passed so this node is started with the same model
# arguments as ranks 0 and 1; only --node-rank should differ between workers.
#
# Site-specific values to adjust per cluster: the interface names
# (ibs32, mlx5), --dist-init-addr, and the host path mounted at /model-cache.
# NCCL_DEBUG=TRACE is very verbose; consider lowering it after bring-up.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
  python3 -m sglang.launch_server \
  --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 32 \
  --dist-init-addr 10.166.5.101:20000 \
  --nnodes 4 \
  --node-rank 2 \
  --trust-remote-code
从节点3
# Worker node (node-rank 3) of a 4-node SGLang deployment serving
# DeepSeek-R1 BF16 with tensor parallelism 32. It joins the group through
# --dist-init-addr, which must be identical on every node.
#
# NOTE: the image's default entrypoint is used on purpose. Overriding it with
# `--entrypoint /bin/bash` makes the container run `/bin/bash python3 -m ...`,
# so bash tries to interpret the python3 binary as a shell script, exits
# immediately, and --restart=always then restarts the container in a loop.
#
# --served-model-name is passed so this node is started with the same model
# arguments as ranks 0 and 1; only --node-rank should differ between workers.
#
# Site-specific values to adjust per cluster: the interface names
# (ibs32, mlx5), --dist-init-addr, and the host path mounted at /model-cache.
# NCCL_DEBUG=TRACE is very verbose; consider lowering it after bring-up.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
  python3 -m sglang.launch_server \
  --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 32 \
  --dist-init-addr 10.166.5.101:20000 \
  --nnodes 4 \
  --node-rank 3 \
  --trust-remote-code