logo
0
0
WeChat Login

# Install the vllm package with pip; the `vllm serve` command used later in
# this document comes from it.
pip install vllm

使用以下启动命令,生成速度可达 25 tokens/s:

# Set the PyTorch CUDA allocator option `expandable_segments` to True for every
# process started from this shell (the vLLM server below inherits it).
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Options for `vllm serve`, kept in an array so they can be grouped and
# commented; the flags and values are passed through unchanged.
serve_args=(
  # Network endpoint: listen on all interfaces, port 8000.
  --host 0.0.0.0
  --port 8000

  # Model name that API clients put in the "model" field of a request.
  --served-model-name Qwen3.6-27B-FP8
  --trust-remote-code

  # Memory and batching limits.
  --gpu-memory-utilization 0.75
  --max-model-len 65536
  --max-num-seqs 2

  # Tool calling.
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder

  # Prefill and cache behaviour.
  --enable-chunked-prefill
  --enable-prefix-caching

  # Speculative decoding configuration, passed as a JSON string.
  # NOTE(review): the sample server logs in this document list only three
  # per-position acceptance rates, which looks like a run with 3 speculative
  # tokens rather than 4 -- confirm which value the quoted throughput
  # refers to.
  --speculative-config '{"method": "mtp", "num_speculative_tokens": 4}'

  # Default chat-template arguments (thinking disabled) and reasoning parser.
  --default-chat-template-kwargs '{"enable_thinking": false}'
  --reasoning-parser qwen3

  --async-scheduling
)

# Serve the model stored at /workspace/Qwen3.6-27B-FP8 with the options above.
vllm serve /workspace/Qwen3.6-27B-FP8 "${serve_args[@]}"

测试

# Smoke test: POST one chat-completion request (a Chinese-language prompt,
# max_tokens 512) to the server on localhost:8000. The "model" value matches
# the --served-model-name given to `vllm serve`.
curl \
  --header "Content-Type: application/json" \
  --data '{
    "model": "Qwen3.6-27B-FP8",
    "messages": [
      {"role": "user", "content": "你好,请用中文介绍一下你自己"}
    ],
    "max_tokens": 512
  }' \
  http://localhost:8000/v1/chat/completions

以下是测试期间的服务端日志输出:

(APIServer pid=6203) INFO 05-15 03:03:33 [loggers.py:271] Engine 000: Avg prompt throughput: 4.0 tokens/s, Avg generation throughput: 0.4 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=6203) INFO 05-15 03:03:33 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 4.00, Accepted throughput: 0.12 tokens/s, Drafted throughput: 0.12 tokens/s, Accepted: 6 tokens, Drafted: 6 tokens, Per-position acceptance rate: 1.000, 1.000, 1.000, Avg Draft acceptance rate: 100.0%
(APIServer pid=6203) INFO 05-15 03:03:43 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=6203) INFO:     172.17.0.1:44149 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO:     172.17.0.1:55991 - "POST /v1/messages HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:03 [loggers.py:271] Engine 000: Avg prompt throughput: 2689.6 tokens/s, Avg generation throughput: 2.4 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=6203) INFO 05-15 03:04:04 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 3.12, Accepted throughput: 0.57 tokens/s, Drafted throughput: 0.80 tokens/s, Accepted: 17 tokens, Drafted: 24 tokens, Per-position acceptance rate: 0.875, 0.625, 0.625, Avg Draft acceptance rate: 70.8%
(APIServer pid=6203) INFO:     172.17.0.1:56001 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:14 [loggers.py:271] Engine 000: Avg prompt throughput: 113.2 tokens/s, Avg generation throughput: 11.9 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 35.7%, Prefix cache hit rate: 47.4%
(APIServer pid=6203) INFO 05-15 03:04:14 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 2.68, Accepted throughput: 7.40 tokens/s, Drafted throughput: 13.20 tokens/s, Accepted: 74 tokens, Drafted: 132 tokens, Per-position acceptance rate: 0.795, 0.591, 0.295, Avg Draft acceptance rate: 56.1%
(APIServer pid=6203) INFO 05-15 03:04:24 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 2.4 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 47.4%
(APIServer pid=6203) INFO 05-15 03:04:24 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 1.60, Accepted throughput: 0.90 tokens/s, Drafted throughput: 4.50 tokens/s, Accepted: 9 tokens, Drafted: 45 tokens, Per-position acceptance rate: 0.467, 0.067, 0.067, Avg Draft acceptance rate: 20.0%
(APIServer pid=6203) INFO:     172.17.0.1:58803 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:34 [loggers.py:271] Engine 000: Avg prompt throughput: 129.1 tokens/s, Avg generation throughput: 3.4 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 35.7%, Prefix cache hit rate: 63.3%
(APIServer pid=6203) INFO 05-15 03:04:34 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 2.36, Accepted throughput: 1.90 tokens/s, Drafted throughput: 4.20 tokens/s, Accepted: 19 tokens, Drafted: 42 tokens, Per-position acceptance rate: 0.643, 0.429, 0.286, Avg Draft acceptance rate: 45.2%
(APIServer pid=6203) INFO:     172.17.0.1:58813 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO:     172.17.0.1:58825 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:44 [loggers.py:271] Engine 000: Avg prompt throughput: 288.6 tokens/s, Avg generation throughput: 18.1 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 75.9%
(APIServer pid=6203) INFO 05-15 03:04:44 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 3.54, Accepted throughput: 13.20 tokens/s, Drafted throughput: 15.60 tokens/s, Accepted: 132 tokens, Drafted: 156 tokens, Per-position acceptance rate: 0.942, 0.846, 0.750, Avg Draft acceptance rate: 84.6%
(APIServer pid=6203) INFO:     172.17.0.1:38793 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO:     172.17.0.1:38803 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=6203) INFO:     172.17.0.1:40051 - "POST /v1/messages HTTP/1.1" 200 OK
(APIServer pid=6203) INFO 05-15 03:04:54 [loggers.py:271] Engine 000: Avg prompt throughput: 324.4 tokens/s, Avg generation throughput: 21.1 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 81.2%
(APIServer pid=6203) INFO 05-15 03:04:54 [metrics.py:101] SpecDecoding metrics: Mean acceptance length: 2.92, Accepted throughput: 13.80 tokens/s, Drafted throughput: 21.60 tokens/s, Accepted: 138 tokens, Drafted: 216 tokens, Per-position acceptance rate: 0.750, 0.653, 0.514, Avg Draft acceptance rate: 63.9%
(APIServer pid=6203) INFO 05-15 03:05:04 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 81.2%

About

No description, topics, or website provided.
Language
Jinja 92.3%
Shell 7.8%