多节点分布式部署

更新时间:2025年3月6日 06:24 浏览:280

硬件环境:4 台服务器,每台 8 张 H800 80G(共 32 卡,对应下文的 --tp 32 与 --nnodes 4)

参考文档(SGLang 官方多节点部署示例):https://git.junyouji.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes

主节点

# Head node (node rank 0) of the 4-node SGLang deployment.
#
# Starts a detached, auto-restarting container named "deepseek" on the host
# network and launches sglang.launch_server with tensor parallelism 32 across
# 4 nodes. This is the only node that passes --host/--port, so the HTTP API
# is exposed here on 0.0.0.0:80.
#
# The entrypoint is python3 itself: with `--entrypoint /bin/bash`, the
# trailing `python3 -m ...` would be handed to bash as a *script path*, and
# bash refuses to run a binary as a script, so the server would never start.
#
# NOTE(review): ibs32 / mlx5 are assumed to be the InfiniBand interface and
# HCA prefix on these hosts, and 10.166.5.101 the head node's address --
# adjust to the actual environment.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  --entrypoint python3 \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
    -m sglang.launch_server \
    --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --tp 32 \
    --dist-init-addr 10.166.5.101:20000 \
    --nnodes 4 \
    --node-rank 0 \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port 80

从节点1

# Worker node, node rank 1 of the 4-node SGLang deployment.
#
# Same container settings as the head node, but with --node-rank 1 and no
# --host/--port. It joins the group through --dist-init-addr.
#
# The entrypoint is python3 itself: with `--entrypoint /bin/bash`, the
# trailing `python3 -m ...` would be handed to bash as a *script path*, and
# bash refuses to run a binary as a script, so the server would never start.
#
# NOTE(review): ibs32 / mlx5 are assumed to be the InfiniBand interface and
# HCA prefix on these hosts -- adjust to the actual environment.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  --entrypoint python3 \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
    -m sglang.launch_server \
    --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --tp 32 \
    --dist-init-addr 10.166.5.101:20000 \
    --nnodes 4 \
    --node-rank 1 \
    --trust-remote-code

从节点2

# Worker node, node rank 2 of the 4-node SGLang deployment.
#
# Same container settings as the head node, but with --node-rank 2 and no
# --host/--port. It joins the group through --dist-init-addr.
# --served-model-name is passed so the server arguments match the other
# nodes.
#
# The entrypoint is python3 itself: with `--entrypoint /bin/bash`, the
# trailing `python3 -m ...` would be handed to bash as a *script path*, and
# bash refuses to run a binary as a script, so the server would never start.
#
# NOTE(review): ibs32 / mlx5 are assumed to be the InfiniBand interface and
# HCA prefix on these hosts -- adjust to the actual environment.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  --entrypoint python3 \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
    -m sglang.launch_server \
    --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --tp 32 \
    --dist-init-addr 10.166.5.101:20000 \
    --nnodes 4 \
    --node-rank 2 \
    --trust-remote-code

从节点3

# Worker node, node rank 3 of the 4-node SGLang deployment.
#
# Same container settings as the head node, but with --node-rank 3 and no
# --host/--port. It joins the group through --dist-init-addr.
# --served-model-name is passed so the server arguments match the other
# nodes.
#
# The entrypoint is python3 itself: with `--entrypoint /bin/bash`, the
# trailing `python3 -m ...` would be handed to bash as a *script path*, and
# bash refuses to run a binary as a script, so the server would never start.
#
# NOTE(review): ibs32 / mlx5 are assumed to be the InfiniBand interface and
# HCA prefix on these hosts -- adjust to the actual environment.
docker run \
  --restart=always \
  --name deepseek \
  --network host \
  --shm-size 512g \
  --gpus=all \
  --privileged \
  --entrypoint python3 \
  -v /data/docker/deepseek/model-cache:/model-cache \
  -e HF_ENDPOINT=https://hf-mirror.com \
  -e NCCL_SOCKET_IFNAME=ibs32 \
  -e GLOO_SOCKET_IFNAME=ibs32 \
  -e NCCL_IB_HCA=mlx5 \
  -e NCCL_DEBUG=TRACE \
  -itd \
  lmsysorg/sglang:v0.4.3-cu124 \
    -m sglang.launch_server \
    --model-path /model-cache/deepseek-ai/DeepSeek-R1-BF16 \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --tp 32 \
    --dist-init-addr 10.166.5.101:20000 \
    --nnodes 4 \
    --node-rank 3 \
    --trust-remote-code
导航