@@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
2121
2222 ```yaml
2323 type: service
24- name: qwen397
24+ name: qwen36
2525
2626 image: lmsysorg/sglang:v0.5.10.post1
2727
2828 commands:
2929 - |
3030 sglang serve \
31- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
31+ --model-path Qwen/Qwen3.6-27B \
32+ --host 0.0.0.0 \
3233 --port 30000 \
3334 --tp $DSTACK_GPUS_NUM \
34- --reasoning-parser qwen3 \
35- --tool-call-parser qwen3_coder \
36- --enable-flashinfer-allreduce-fusion \
37- --mem-fraction-static 0.8
35+ --mem-fraction-static 0.8 \
36+ --context-length 262144 \
37+ --reasoning-parser qwen3
3838
3939 port: 30000
40- model: Qwen/Qwen3.5-397B-A17B-FP8
40+ model: Qwen/Qwen3.6-27B
4141
4242 volumes:
4343 # Optional instance volume for model and runtime caches
@@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
4646 optional: true
4747
4848 resources:
49- cpu: x86:96..
50- memory: 512GB..
5149 shm_size: 16GB
52- disk: 500GB..
53- gpu: H100:80GB:8
50+ gpu: H100:4
5451 ```
5552
5653 </div>
@@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
6158
6259 ```yaml
6360 type: service
64- name: qwen397
61+ name: qwen36
6562
66- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
67-
68- env:
69- - HIP_FORCE_DEV_KERNARG=1
70- - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
71- - SGLANG_DISABLE_CUDNN_CHECK=1
72- - SGLANG_INT4_WEIGHT=0
73- - SGLANG_MOE_PADDING=1
74- - SGLANG_ROCM_DISABLE_LINEARQUANT=0
75- - SGLANG_ROCM_FUSED_DECODE_MLA=1
76- - SGLANG_SET_CPU_AFFINITY=1
77- - SGLANG_USE_AITER=1
78- - SGLANG_USE_ROCM700A=1
63+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
7964
8065 commands:
8166 - |
8267 sglang serve \
83- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
68+ --model-path Qwen/Qwen3.6-27B \
69+ --host 0.0.0.0 \
70+ --port 30000 \
8471 --tp $DSTACK_GPUS_NUM \
85- --reasoning-parser qwen3 \
86- --tool-call-parser qwen3_coder \
8772 --mem-fraction-static 0.8 \
8873 --context-length 262144 \
89- --attention-backend triton \
90- --disable-cuda-graph \
91- --fp8-gemm-backend aiter \
92- --port 30000
74+ --reasoning-parser qwen3
9375
9476 port: 30000
95- model: Qwen/Qwen3.5-397B-A17B-FP8
77+ model: Qwen/Qwen3.6-27B
9678
9779 volumes:
9880 # Optional instance volume for model and runtime caches
@@ -101,30 +83,33 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
10183 optional: true
10284
10385 resources:
104- cpu: x86:52..
105- memory: 700GB..
86+ cpu: 52..
87+ memory: 896GB..
10688 shm_size: 16GB
107- disk: 600GB..
108- gpu: MI300X:192GB:4
89+ disk: 450GB..
90+ gpu: MI300X:4
10991 ```
11092
11193 </div>
11294
95+ The first startup on MI300X can take longer while SGLang compiles ROCm
96+ kernels.
97+
11398To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):
11499
115100<div class="termy">
116101
117102```shell
118103$ dstack apply -f .dstack.yml
119104
120- Submit the run qwen397? [y/n]: y
105+ Submit the run qwen36? [y/n]: y
121106
122107Provisioning...
123108---> 100%
124109
125110Service is published at:
126- http://localhost:3000/proxy/services/main/qwen397/
127- Model Qwen/Qwen3.5-397B-A17B-FP8 is published at:
111+ http://localhost:3000/proxy/services/main/qwen36/
112+ Model Qwen/Qwen3.6-27B is published at:
128113 http://localhost:3000/proxy/models/main/
129114```
130115
@@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
138123<div class="termy">
139124
140125```shell
141- $ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \
126+ $ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \
142127 -H 'Content-Type: application/json' \
143128 -H 'Authorization: Bearer <dstack token>' \
144129 -d '{
145- "model": "Qwen/Qwen3.5-397B-A17B-FP8",
130+ "model": "Qwen/Qwen3.6-27B",
146131 "messages": [
147132 {
148133 "role": "user",
@@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules.
213198
214199 ```yaml
215200 type: service
216- name: qwen397-service
201+ name: qwen36-service
217202
218203 image: lmsysorg/sglang:v0.5.10.post1
219204
220205 commands:
221206 - |
222207 sglang serve \
223- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
208+ --model-path Qwen/Qwen3.6-27B \
209+ --host 0.0.0.0 \
224210 --port 30000 \
225211 --tp $DSTACK_GPUS_NUM \
226212 --reasoning-parser qwen3 \
227- --tool-call-parser qwen3_coder \
228- --enable-flashinfer-allreduce-fusion \
229- --mem-fraction-static 0.8
213+ --mem-fraction-static 0.8 \
214+ --context-length 262144
230215
231216 port: 30000
232- model: Qwen/Qwen3.5-397B-A17B-FP8
217+ model: Qwen/Qwen3.6-27B
233218
234219 volumes:
235220 # Optional instance volume for model and runtime caches
@@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules.
238223 optional: true
239224
240225 resources:
241- cpu: x86:96..
242- memory: 512GB..
243226 shm_size: 16GB
244- disk: 500GB..
245- gpu: H100:80GB:8
227+ gpu: H100:4
246228
247229 replicas: 1..2
248230 scaling:
@@ -258,38 +240,23 @@ You can configure the number of replicas as well as the auto-scaling rules.
258240
259241 ```yaml
260242 type: service
261- name: qwen397-service
262-
263- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
243+ name: qwen36-service
264244
265- env:
266- - HIP_FORCE_DEV_KERNARG=1
267- - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
268- - SGLANG_DISABLE_CUDNN_CHECK=1
269- - SGLANG_INT4_WEIGHT=0
270- - SGLANG_MOE_PADDING=1
271- - SGLANG_ROCM_DISABLE_LINEARQUANT=0
272- - SGLANG_ROCM_FUSED_DECODE_MLA=1
273- - SGLANG_SET_CPU_AFFINITY=1
274- - SGLANG_USE_AITER=1
275- - SGLANG_USE_ROCM700A=1
245+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
276246
277247 commands:
278248 - |
279249 sglang serve \
280- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
250+ --model-path Qwen/Qwen3.6-27B \
251+ --host 0.0.0.0 \
252+ --port 30000 \
281253 --tp $DSTACK_GPUS_NUM \
282254 --reasoning-parser qwen3 \
283- --tool-call-parser qwen3_coder \
284255 --mem-fraction-static 0.8 \
285- --context-length 262144 \
286- --attention-backend triton \
287- --disable-cuda-graph \
288- --fp8-gemm-backend aiter \
289- --port 30000
256+ --context-length 262144
290257
291258 port: 30000
292- model: Qwen/Qwen3.5-397B-A17B-FP8
259+ model: Qwen/Qwen3.6-27B
293260
294261 volumes:
295262 # Optional instance volume for model and runtime caches
@@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules.
298265 optional: true
299266
300267 resources:
301- cpu: x86:52..
302- memory: 700GB..
268+ cpu: 52..
269+ memory: 896GB..
303270 shm_size: 16GB
304- disk: 600GB..
305- gpu: MI300X:192GB:4
271+ disk: 450GB..
272+ gpu: MI300X:4
306273
307274 replicas: 1..2
308275 scaling:
0 commit comments