npu环境docker部署vllm
npu环境docker部署vllm,并推理部署Qwen3-0.6B
·
系统环境
cat /etc/os-release
NAME="EulerOS"
VERSION="2.0 (SP10)"
ID="euleros"
VERSION_ID="2.0"
PRETTY_NAME="EulerOS 2.0 (SP10)"
ANSI_COLOR="0;31"
uname -m
aarch64
npu-smi info
# 8卡 ...
下载模型
- 安装git git-lfs
yum install git
wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.0/git-lfs-linux-arm64-v3.7.0.tar.gz
tar -xzvf git-lfs-linux-arm64-v3.7.0.tar.gz
cd git-lfs-3.7.0/
./install.sh
- 下载模型
GIT_LFS_SKIP_SMUDGE=1 git clone https://gitcode.com/hf_mirrors/Qwen/Qwen3-0.6B.git
cd Qwen3-0.6B
git lfs install
nohup git lfs pull > /dev/null 2>&1 &
下载镜像
- 修改源
vim /etc/docker/daemon.json
{
  ...
  "registry-mirrors": [
    "https://docker.xuanyuan.me",
    "https://docker.1ms.run",
    "https://mirror.ccs.tencentyun.com",
    "https://docker-0.unsee.tech",
    "https://docker.m.daocloud.io"
  ],
  ...
  "max-concurrent-downloads": 1,
  "data-root": "/data2/develop/docker/default-work"
}
说明:"data-root" 用于把 Docker 数据放到大容量数据盘;daemon.json 必须是合法 JSON,不支持 # 注释,上面的 ... 表示省略的其他配置项,实际文件中不要保留。
- 重新加载 systemd 配置
systemctl daemon-reload
- 重启 Docker
systemctl restart docker
- 查看源
docker info | grep -i Mirror
Registry Mirrors:
 https://xxx.mirror.aliyuncs.com/
 https://mirror.ccs.tencentyun.com/
- 下载vllm镜像
docker pull quay.io/ascend/vllm-ascend:v0.11.0rc0
vllm 推理部署 Qwen3-0.6B
- docker-compose.yaml
version: '3.8'
services:
  vllm-ascend:
    image: quay.io/ascend/vllm-ascend:v0.11.0rc0
    container_name: vllm-Qwen3-0.6B
    devices:
      # 配置第8张卡单独运行
      - /dev/davinci7
      - /dev/davinci_manager
      - /dev/devmm_svm
      - /dev/hisi_hdc
    volumes:
      - /usr/local/dcmi:/usr/local/dcmi
      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
      - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
      - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
      - /etc/ascend_install.info:/etc/ascend_install.info
      - /data2/models/Qwen3-0.6B:/data/model
    ports:
      - "8100:8000"
    restart: unless-stopped
    stdin_open: true
    tty: true
    command: >
      vllm serve /data/model
      --served-model-name Qwen3-0.6B
      --tensor-parallel-size 1
      --dtype float16
      --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
      --max-num-seqs 4
      --max-model-len 2048
      --gpu-memory-utilization 0.8
      --trust_remote_code
- chat/completions
curl --location 'http://localhost:8100/v1/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "Qwen3-0.6B",
    "messages": [
        {
            "role": "user",
            "content": "你好,你是谁,简单自我介绍一下"
        }
    ],
    "top_p": 0.95,
    "stream": true,
    "stream_options": {
        "include_usage": true,
        "continuous_usage_stats": true
    }
}'
更多推荐
所有评论(0)