Skip to content

Commit c5308dc

Browse files
author
Andrey Cheptsov
committed
Update Qwen 3.6 service docs and examples
1 parent c222325 commit c5308dc

4 files changed

Lines changed: 140 additions & 152 deletions

File tree

docs/docs/concepts/services.md

Lines changed: 45 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
2121

2222
```yaml
2323
type: service
24-
name: qwen397
24+
name: qwen36
2525

2626
image: lmsysorg/sglang:v0.5.10.post1
2727

2828
commands:
2929
- |
3030
sglang serve \
31-
--model-path Qwen/Qwen3.5-397B-A17B-FP8 \
31+
--model-path Qwen/Qwen3.6-27B \
32+
--host 0.0.0.0 \
3233
--port 30000 \
3334
--tp $DSTACK_GPUS_NUM \
34-
--reasoning-parser qwen3 \
35-
--tool-call-parser qwen3_coder \
36-
--enable-flashinfer-allreduce-fusion \
37-
--mem-fraction-static 0.8
35+
--mem-fraction-static 0.8 \
36+
--context-length 262144 \
37+
--reasoning-parser qwen3
3838

3939
port: 30000
40-
model: Qwen/Qwen3.5-397B-A17B-FP8
40+
model: Qwen/Qwen3.6-27B
4141

4242
volumes:
4343
# Optional instance volume for model and runtime caches
@@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
4646
optional: true
4747

4848
resources:
49-
cpu: x86:96..
50-
memory: 512GB..
5149
shm_size: 16GB
52-
disk: 500GB..
53-
gpu: H100:80GB:8
50+
gpu: H100:4
5451
```
5552

5653
</div>
@@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
6158

6259
```yaml
6360
type: service
64-
name: qwen397
61+
name: qwen36
6562

66-
image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
67-
68-
env:
69-
- HIP_FORCE_DEV_KERNARG=1
70-
- SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
71-
- SGLANG_DISABLE_CUDNN_CHECK=1
72-
- SGLANG_INT4_WEIGHT=0
73-
- SGLANG_MOE_PADDING=1
74-
- SGLANG_ROCM_DISABLE_LINEARQUANT=0
75-
- SGLANG_ROCM_FUSED_DECODE_MLA=1
76-
- SGLANG_SET_CPU_AFFINITY=1
77-
- SGLANG_USE_AITER=1
78-
- SGLANG_USE_ROCM700A=1
63+
image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
7964

8065
commands:
8166
- |
8267
sglang serve \
83-
--model-path Qwen/Qwen3.5-397B-A17B-FP8 \
68+
--model-path Qwen/Qwen3.6-27B \
69+
--host 0.0.0.0 \
70+
--port 30000 \
8471
--tp $DSTACK_GPUS_NUM \
85-
--reasoning-parser qwen3 \
86-
--tool-call-parser qwen3_coder \
8772
--mem-fraction-static 0.8 \
8873
--context-length 262144 \
89-
--attention-backend triton \
90-
--disable-cuda-graph \
91-
--fp8-gemm-backend aiter \
92-
--port 30000
74+
--reasoning-parser qwen3
9375

9476
port: 30000
95-
model: Qwen/Qwen3.5-397B-A17B-FP8
77+
model: Qwen/Qwen3.6-27B
9678

9779
volumes:
9880
# Optional instance volume for model and runtime caches
@@ -101,30 +83,33 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
10183
optional: true
10284

10385
resources:
104-
cpu: x86:52..
105-
memory: 700GB..
86+
cpu: 52..
87+
memory: 896GB..
10688
shm_size: 16GB
107-
disk: 600GB..
108-
gpu: MI300X:192GB:4
89+
disk: 450GB..
90+
gpu: MI300X:4
10991
```
11092

11193
</div>
11294

95+
The first startup on MI300X can take longer while SGLang compiles ROCm
96+
kernels.
97+
11398
To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):
11499

115100
<div class="termy">
116101

117102
```shell
118103
$ dstack apply -f .dstack.yml
119104

120-
Submit the run qwen397? [y/n]: y
105+
Submit the run qwen36? [y/n]: y
121106

122107
Provisioning...
123108
---> 100%
124109

125110
Service is published at:
126-
http://localhost:3000/proxy/services/main/qwen397/
127-
Model Qwen/Qwen3.5-397B-A17B-FP8 is published at:
111+
http://localhost:3000/proxy/services/main/qwen36/
112+
Model Qwen/Qwen3.6-27B is published at:
128113
http://localhost:3000/proxy/models/main/
129114
```
130115

@@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
138123
<div class="termy">
139124

140125
```shell
141-
$ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \
126+
$ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \
142127
-H 'Content-Type: application/json' \
143128
-H 'Authorization: Bearer &lt;dstack token&gt;' \
144129
-d '{
145-
"model": "Qwen/Qwen3.5-397B-A17B-FP8",
130+
"model": "Qwen/Qwen3.6-27B",
146131
"messages": [
147132
{
148133
"role": "user",
@@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules.
213198

214199
```yaml
215200
type: service
216-
name: qwen397-service
201+
name: qwen36-service
217202

218203
image: lmsysorg/sglang:v0.5.10.post1
219204

220205
commands:
221206
- |
222207
sglang serve \
223-
--model-path Qwen/Qwen3.5-397B-A17B-FP8 \
208+
--model-path Qwen/Qwen3.6-27B \
209+
--host 0.0.0.0 \
224210
--port 30000 \
225211
--tp $DSTACK_GPUS_NUM \
226212
--reasoning-parser qwen3 \
227-
--tool-call-parser qwen3_coder \
228-
--enable-flashinfer-allreduce-fusion \
229-
--mem-fraction-static 0.8
213+
--mem-fraction-static 0.8 \
214+
--context-length 262144
230215

231216
port: 30000
232-
model: Qwen/Qwen3.5-397B-A17B-FP8
217+
model: Qwen/Qwen3.6-27B
233218

234219
volumes:
235220
# Optional instance volume for model and runtime caches
@@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules.
238223
optional: true
239224

240225
resources:
241-
cpu: x86:96..
242-
memory: 512GB..
243226
shm_size: 16GB
244-
disk: 500GB..
245-
gpu: H100:80GB:8
227+
gpu: H100:4
246228

247229
replicas: 1..2
248230
scaling:
@@ -258,38 +240,23 @@ You can configure the number of replicas as well as the auto-scaling rules.
258240

259241
```yaml
260242
type: service
261-
name: qwen397-service
262-
263-
image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
243+
name: qwen36-service
264244

265-
env:
266-
- HIP_FORCE_DEV_KERNARG=1
267-
- SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
268-
- SGLANG_DISABLE_CUDNN_CHECK=1
269-
- SGLANG_INT4_WEIGHT=0
270-
- SGLANG_MOE_PADDING=1
271-
- SGLANG_ROCM_DISABLE_LINEARQUANT=0
272-
- SGLANG_ROCM_FUSED_DECODE_MLA=1
273-
- SGLANG_SET_CPU_AFFINITY=1
274-
- SGLANG_USE_AITER=1
275-
- SGLANG_USE_ROCM700A=1
245+
image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
276246

277247
commands:
278248
- |
279249
sglang serve \
280-
--model-path Qwen/Qwen3.5-397B-A17B-FP8 \
250+
--model-path Qwen/Qwen3.6-27B \
251+
--host 0.0.0.0 \
252+
--port 30000 \
281253
--tp $DSTACK_GPUS_NUM \
282254
--reasoning-parser qwen3 \
283-
--tool-call-parser qwen3_coder \
284255
--mem-fraction-static 0.8 \
285-
--context-length 262144 \
286-
--attention-backend triton \
287-
--disable-cuda-graph \
288-
--fp8-gemm-backend aiter \
289-
--port 30000
256+
--context-length 262144
290257

291258
port: 30000
292-
model: Qwen/Qwen3.5-397B-A17B-FP8
259+
model: Qwen/Qwen3.6-27B
293260

294261
volumes:
295262
# Optional instance volume for model and runtime caches
@@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules.
298265
optional: true
299266

300267
resources:
301-
cpu: x86:52..
302-
memory: 700GB..
268+
cpu: 52..
269+
memory: 896GB..
303270
shm_size: 16GB
304-
disk: 600GB..
305-
gpu: MI300X:192GB:4
271+
disk: 450GB..
272+
gpu: MI300X:4
306273

307274
replicas: 1..2
308275
scaling:

docs/examples.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ hide:
153153
SGLang
154154
</h3>
155155
<p>
156-
Deploy DeepSeek distilled models with SGLang
156+
Deploy Qwen3.6-27B with SGLang
157157
</p>
158158
</a>
159159
<a href="/examples/inference/vllm"
@@ -162,7 +162,7 @@ hide:
162162
vLLM
163163
</h3>
164164
<p>
165-
Deploy Llama 3.1 with vLLM
165+
Deploy Qwen3.6-27B with vLLM
166166
</p>
167167
</a>
168168
<a href="/examples/inference/nim"

0 commit comments

Comments
 (0)