/etc/docker/daemon.json
{
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
},
"default-runtime":"nvidia",
"node-generic-resources": [
"NVIDIA-GPU=GPU-477dc39a-7e47-a974-1a78-e097d732d5e5",
"NVIDIA-GPU=GPU-66a0dcb9-9bbe-430d-2db2-77bbb97a80d7",
"NVIDIA-GPU=GPU-34311cfe-d45b-7541-d1da-6b23c8a6ef3f",
"NVIDIA-GPU=GPU-4f87d71c-95fe-2c5b-97c2-ef1d858625d3",
"NVIDIA-GPU=GPU-6c9dc87a-63b1-9c72-ff51-9447ee99ce99",
"NVIDIA-GPU=GPU-bcf26554-bad4-7db4-8810-291ffe1a6e49",
"NVIDIA-GPU=GPU-d1808868-9034-f991-7662-8fe6ee9f8fc7"
]
}
runtimes - 增加 nvidia 运行时
default-runtime - 切换默认运行时为 nvidia, 可通过 "docker info" 查看
node-generic-resources - 配置节点中的显卡资源，显卡 UUID 可通过 "nvidia-smi -a | grep UUID" 获取
/etc/nvidia-container-runtime/config.toml
将
#swarm-resource = "DOCKER_RESOURCE_GPU"
前面的注释去掉，即改为：
swarm-resource = "DOCKER_RESOURCE_GPU"
重启该节点的 docker 服务
service docker restart
docker service create --name cuda --generic-resource "NVIDIA-GPU=1" nvidia/cuda
先安装 docker 包: pip install docker
import docker
from docker.types import RestartPolicy, Resources, EndpointSpec
# 创建服务
# https://docs.docker.com/engine/api/v1.42/#tag/Service/operation/ServiceCreate
# https://docker-py.readthedocs.io/en/stable/services.html
def create(options: dict, base_url=None):
    """Create a Docker Swarm service described by ``options``.

    Args:
        options: Service settings. ``name`` and ``image`` are required.
            Optional keys:

            * ``cpu_limit`` - passed to ``Resources(cpu_limit=...)``;
              defaults to ``1 * 1000000000`` (docker-py documents the unit
              as 10^9 CPU shares, i.e. one CPU).
            * ``mem_limit`` - passed to ``Resources(mem_limit=...)``;
              defaults to 1 GiB expressed in bytes.
            * ``gpu`` - number of ``NVIDIA-GPU`` generic resources to
              request; defaults to 0 (no GPU request).
            * ``published_port`` / ``target_port`` - when BOTH are present
              the published port is mapped to the target port; if only one
              is given no endpoint spec is sent.

            The dictionary is only read, never modified.
        base_url: URL of the Docker daemon to talk to. When ``None`` the
            client is built from the environment via ``docker.from_env()``.

    Returns:
        A dict with the new service's ``id``, ``short_id``, ``name``,
        ``version`` and raw ``attrs``.

    Raises:
        ValueError: if ``name`` or ``image`` is missing from ``options``.
    """
    # Validate first so a bad request never opens a daemon connection.
    if "name" not in options:
        raise ValueError("名称参数(name)缺失!")
    if "image" not in options:
        raise ValueError("镜像参数(image)缺失!")

    # Resolve defaults into locals instead of writing them back into the
    # caller's dictionary.
    cpu_limit = options.get("cpu_limit", 1 * 1000000000)
    mem_limit = options.get("mem_limit", 1 * 1024 * 1024 * 1024)
    gpu_count = options.get("gpu", 0)

    # Only request the generic resource when at least one GPU is wanted.
    generic_resources = {"NVIDIA-GPU": gpu_count} if gpu_count > 0 else None

    # The previous code read `self.config.root_url`, but this is a plain
    # function with no `self`; the daemon URL is now an explicit argument.
    if base_url is None:
        client = docker.from_env()
    else:
        client = docker.DockerClient(base_url=base_url)

    try:
        kwargs = {
            "name": options["name"],
            "restart_policy": RestartPolicy(
                condition="on-failure",
                # NOTE(review): the Engine API expresses this delay in
                # nanoseconds, which would make it 10 s - confirm.
                delay=10000000000,
                max_attempts=10,
            ),
            "resources": Resources(
                cpu_limit=cpu_limit,
                mem_limit=mem_limit,
                generic_resources=generic_resources,
            ),
        }

        if "published_port" in options and "target_port" in options:
            kwargs["endpoint_spec"] = EndpointSpec(
                ports={options["published_port"]: options["target_port"]},
            )

        # Second positional argument is the command; None keeps the
        # image's default command.
        service = client.services.create(options["image"], None, **kwargs)

        return {
            "id": service.id,
            "short_id": service.short_id,
            "name": service.name,
            "version": service.version,
            "attrs": service.attrs,
        }
    finally:
        # Release the connection pool held by the client.
        client.close()