# Install the vllm package with pip; the `vllm serve` command used below comes from it.
pip install vllm
使用以下启动命令,生成速度约可达 25 tokens/s:
# Launch the vLLM API server for the local Qwen3.6-27B-FP8 checkpoint.
# The flags are collected in an array so each group can carry a comment;
# the argument list handed to `vllm serve` is the same, in the same order.

# Allocator setting exported for the server process.
# NOTE(review): presumably chosen to limit CUDA memory fragmentation — confirm.
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'

serve_args=(
  # Listen address and port.
  --host 0.0.0.0
  --port 8000

  # Name that clients pass in the request's "model" field.
  --served-model-name Qwen3.6-27B-FP8
  --trust-remote-code

  # GPU memory fraction, maximum context length, maximum concurrent sequences.
  --gpu-memory-utilization 0.75
  --max-model-len 65536
  --max-num-seqs 2

  # Tool calling.
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder

  # Prefill and prefix-cache options.
  --enable-chunked-prefill
  --enable-prefix-caching

  # Speculative decoding: "mtp" method with 4 speculative tokens.
  --speculative-config '{"method": "mtp", "num_speculative_tokens": 4}'

  # Default chat-template kwargs: enable_thinking=false.
  --default-chat-template-kwargs '{"enable_thinking": false}'
  --reasoning-parser qwen3

  --async-scheduling
)

vllm serve /workspace/Qwen3.6-27B-FP8 "${serve_args[@]}"
测试
# Smoke test: send one user message (in Chinese: "Hello, please introduce
# yourself in Chinese") to the chat-completions endpoint on localhost:8000,
# with max_tokens set to 512. The JSON payload is kept byte-for-byte.
curl 'http://localhost:8000/v1/chat/completions' \
  --header 'Content-Type: application/json' \
  --data '{
"model": "Qwen3.6-27B-FP8",
"messages": [
{"role": "user", "content": "你好,请用中文介绍一下你自己"}
],
"max_tokens": 512
}'
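以下是测试期间的服务端日志(注意:日志中 Per-position acceptance rate 只有 3 项,且 Drafted 数均为 3 的倍数,说明采集这些日志时 num_speculative_tokens 应为 3,与上面启动命令中的 4 不一致,待确认):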
(APIServer pid=6203) INFO 05-15 03:03:33 [loggers.py:271] Engine 000: Avg prompt throughput: 4.0 tokens/s, Avg generation throughput: 0.4 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=6203) INFO 05-15 03:03:33 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 4.00, Accepted throughput: 0.12 tokens/s, Drafted throughput: 0.12 tokens/s, Accepted: 6 tokens, Drafted: 6 tokens, Per-position acceptance rate: 1.000, 1.000, 1.000, Avg Draft acceptance rate: 100.0%
(APIServer pid=6203) INFO 05-15 03:03:43 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=6203) INFO: 172.17.0.1:44149 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO: 172.17.0.1:55991 - "POST /v1/messages HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:03 [loggers.py:271] Engine 000: Avg prompt throughput: 2689.6 tokens/s, Avg generation throughput: 2.4 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=6203) INFO 05-15 03:04:04 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 3.12, Accepted throughput: 0.57 tokens/s, Drafted throughput: 0.80 tokens/s, Accepted: 17 tokens, Drafted: 24 tokens, Per-position acceptance rate: 0.875, 0.625, 0.625, Avg Draft acceptance rate: 70.8%
(APIServer pid=6203) INFO: 172.17.0.1:56001 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:14 [loggers.py:271] Engine 000: Avg prompt throughput: 113.2 tokens/s, Avg generation throughput: 11.9 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 35.7%, Prefix cache hit rate: 47.4%
(APIServer pid=6203) INFO 05-15 03:04:14 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 2.68, Accepted throughput: 7.40 tokens/s, Drafted throughput: 13.20 tokens/s, Accepted: 74 tokens, Drafted: 132 tokens, Per-position acceptance rate: 0.795, 0.591, 0.295, Avg Draft acceptance rate: 56.1%
(APIServer pid=6203) INFO 05-15 03:04:24 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 2.4 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 47.4%
(APIServer pid=6203) INFO 05-15 03:04:24 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 1.60, Accepted throughput: 0.90 tokens/s, Drafted throughput: 4.50 tokens/s, Accepted: 9 tokens, Drafted: 45 tokens, Per-position acceptance rate: 0.467, 0.067, 0.067, Avg Draft acceptance rate: 20.0%
(APIServer pid=6203) INFO: 172.17.0.1:58803 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:34 [loggers.py:271] Engine 000: Avg prompt throughput: 129.1 tokens/s, Avg generation throughput: 3.4 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 35.7%, Prefix cache hit rate: 63.3%
(APIServer pid=6203) INFO 05-15 03:04:34 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 2.36, Accepted throughput: 1.90 tokens/s, Drafted throughput: 4.20 tokens/s, Accepted: 19 tokens, Drafted: 42 tokens, Per-position acceptance rate: 0.643, 0.429, 0.286, Avg Draft acceptance rate: 45.2%
(APIServer pid=6203) INFO: 172.17.0.1:58813 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO: 172.17.0.1:58825 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:44 [loggers.py:271] Engine 000: Avg prompt throughput: 288.6 tokens/s, Avg generation throughput: 18.1 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 75.9%
(APIServer pid=6203) INFO 05-15 03:04:44 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 3.54, Accepted throughput: 13.20 tokens/s, Drafted throughput: 15.60 tokens/s, Accepted: 132 tokens, Drafted: 156 tokens, Per-position acceptance rate: 0.942, 0.846, 0.750, Avg Draft acceptance rate: 84.6%
(APIServer pid=6203) INFO: 172.17.0.1:38793 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO: 172.17.0.1:38803 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO: 172.17.0.1:40051 - "POST /v1/messages HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:54 [loggers.py:271] Engine 000: Avg prompt throughput: 324.4 tokens/s, Avg generation throughput: 21.1 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 81.2%
(APIServer pid=6203) INFO 05-15 03:04:54 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 2.92, Accepted throughput: 13.80 tokens/s, Drafted throughput: 21.60 tokens/s, Accepted: 138 tokens, Drafted: 216 tokens, Per-position acceptance rate: 0.750, 0.653, 0.514, Avg Draft acceptance rate: 63.9%
(APIServer pid=6203) INFO 05-15 03:05:04 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 81.2%