docker swarm 显卡调度

作者: 刘一二 | 发布时间: 2023年10月6日 | 浏览: 325

配置每个节点的 daemon

/etc/docker/daemon.json

{
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "default-runtime":"nvidia",
    "node-generic-resources": [
            "NVIDIA-GPU=GPU-477dc39a-7e47-a974-1a78-e097d732d5e5",
            "NVIDIA-GPU=GPU-66a0dcb9-9bbe-430d-2db2-77bbb97a80d7",
            "NVIDIA-GPU=GPU-34311cfe-d45b-7541-d1da-6b23c8a6ef3f",
            "NVIDIA-GPU=GPU-4f87d71c-95fe-2c5b-97c2-ef1d858625d3",
            "NVIDIA-GPU=GPU-6c9dc87a-63b1-9c72-ff51-9447ee99ce99",
            "NVIDIA-GPU=GPU-bcf26554-bad4-7db4-8810-291ffe1a6e49",
            "NVIDIA-GPU=GPU-d1808868-9034-f991-7662-8fe6ee9f8fc7"
    ]
}

runtimes - 增加 nvidia 运行时

default-runtime - 切换默认运行时为 nvidia, 可通过 "docker info" 查看

node-generic-resources - 配置节点中的显卡资源，显卡UUID 通过 "nvidia-smi -a | grep UUID" 获取

启用 docker swarm 中的显卡资源

/etc/nvidia-container-runtime/config.toml

将

#swarm-resource = "DOCKER_RESOURCE_GPU"

前面的注释去掉

swarm-resource = "DOCKER_RESOURCE_GPU"

重启该节点的 docker 服务

service docker restart

创建服务时申请一张显卡

docker service create --name cuda --generic-resource "NVIDIA-GPU=1" nvidia/cuda

python 对接 docker api 调度示例

先安装 docker 包: pip install docker

import docker
from docker.types import RestartPolicy, Resources, EndpointSpec


# 创建服务
# https://docs.docker.com/engine/api/v1.42/#tag/Service/operation/ServiceCreate
# https://docker-py.readthedocs.io/en/stable/services.html
def create(options: dict, base_url=None):
    """Create a Docker Swarm service and return a summary of it.

    Args:
        options: Service settings. Recognised keys:
            name (required): service name.
            image (required): image to run.
            cpu_limit: CPU limit passed to ``Resources``; defaults to
                1 * 10**9 (docker-py documents the unit as 1e-9 CPUs).
            mem_limit: memory limit passed to ``Resources``; defaults to
                1 GiB expressed in bytes.
            gpu: number of "NVIDIA-GPU" generic resources to request;
                defaults to 0 (no GPU reservation).
            published_port / target_port: when BOTH are present, the
                published port is mapped to the target port.
            The dict is only read, never modified.
        base_url: Docker daemon URL. When omitted, the client is built
            from the environment via ``docker.from_env()``.

    Returns:
        A dict with the created service's ``id``, ``short_id``, ``name``,
        ``version`` and raw ``attrs``.

    Raises:
        ValueError: if ``name`` or ``image`` is missing from ``options``.
    """
    # Validate required keys before touching the Docker daemon.
    for key, label in (("name", "名称"), ("image", "镜像")):
        if key not in options:
            raise ValueError(f"{label}参数（{key}）缺失！")

    # Resolve defaults into locals so the caller's dict is left untouched.
    cpu_limit = options.get("cpu_limit", 1 * 1000000000)
    mem_limit = options.get("mem_limit", 1 * 1024 * 1024 * 1024)
    gpu = options.get("gpu", 0)

    # The resource name must match the "node-generic-resources" entries
    # configured in each node's daemon.json.
    generic_resources = {"NVIDIA-GPU": gpu} if gpu > 0 else None

    kwargs = {
        "name": options["name"],
        "restart_policy": RestartPolicy(
            condition="on-failure",
            delay=10000000000,  # 10 s if the unit is nanoseconds — TODO confirm
            max_attempts=10,
        ),
        "resources": Resources(
            cpu_limit=cpu_limit,
            mem_limit=mem_limit,
            generic_resources=generic_resources,
        ),
    }

    # A port mapping is only meaningful when both ends are supplied.
    if "published_port" in options and "target_port" in options:
        kwargs["endpoint_spec"] = EndpointSpec(
            ports={options["published_port"]: options["target_port"]},
        )

    # The original referenced an undefined `self`; take the daemon URL as an
    # optional argument and fall back to the environment configuration.
    if base_url:
        client = docker.DockerClient(base_url=base_url)
    else:
        client = docker.from_env()

    try:
        service = client.services.create(options["image"], None, **kwargs)
        return {
            "id": service.id,
            "short_id": service.short_id,
            "name": service.name,
            "version": service.version,
            "attrs": service.attrs,
        }
    finally:
        # Release the client's connections whether or not creation succeeded.
        client.close()