vllm-project
diff --git a/‎.github/workflows/vllm_ascend_test.yaml‎
Lines changed: 13 additions & 3 deletions b/‎.github/workflows/vllm_ascend_test.yaml‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎docs/source/index.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/index.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/user_guide/additional_config.md‎
Lines changed: 19 additions & 15 deletions b/‎docs/source/user_guide/additional_config.md‎
Lines changed: 19 additions & 15 deletions
diff --git a/‎docs/source/user_guide/graph_mode.md‎
Lines changed: 82 additions & 0 deletions b/‎docs/source/user_guide/graph_mode.md‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎examples/offline_dualbatch_overlap_npu.py‎
Lines changed: 51 additions & 0 deletions b/‎examples/offline_dualbatch_overlap_npu.py‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎examples/prompt_embedding_inference.py‎
Lines changed: 83 additions & 0 deletions b/‎examples/prompt_embedding_inference.py‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎requirements-dev.txt‎
Lines changed: 0 additions & 2 deletions b/‎requirements-dev.txt‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎requirements.txt‎
Lines changed: 3 additions & 0 deletions b/‎requirements.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py‎
Lines changed: 1 addition & 2 deletions b/‎tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py‎
Lines changed: 1 addition & 2 deletions
@@ -114,14 +114,20 @@ jobs:
             # pytest -sv tests/singlecard/test_guided_decoding.py
             # test_ascend_config.py should be run separately because it will regenerate the global config many times.
             pytest -sv tests/singlecard/test_ascend_config.py
+            pytest -sv tests/singlecard/test_camem.py
             pytest -sv tests/singlecard/ \
             --ignore=tests/singlecard/test_offline_inference.py \
             --ignore=tests/singlecard/test_scheduler.py \
             --ignore=tests/singlecard/test_guided_decoding.py \
-            --ignore=tests/singlecard/test_ascend_config.py
+            --ignore=tests/singlecard/test_ascend_config.py \
+            --ignore=tests/singlecard/test_camem.py
           else
             pytest -sv tests/multicard/test_ilama_lora_tp2.py
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py
+            # To avoid oom, we need to run the test in a single process.
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
           fi
 
       - name: Run vllm-project/vllm-ascend test on V0 engine
@@ -136,16 +142,20 @@ jobs:
             pytest -sv tests/singlecard/test_camem.py
             # test_ascend_config.py should be run separately because it will regenerate the global config many times.
             pytest -sv tests/singlecard/test_ascend_config.py
+            pytest -sv tests/singlecard/test_prompt_embedding.py
             pytest -sv tests/singlecard/ \
               --ignore=tests/singlecard/test_offline_inference.py \
               --ignore=tests/singlecard/test_scheduler.py \
               --ignore=tests/singlecard/test_guided_decoding.py \
               --ignore=tests/singlecard/test_camem.py \
-              --ignore=tests/singlecard/test_ascend_config.py
+              --ignore=tests/singlecard/test_ascend_config.py \
+              --ignore=tests/singlecard/test_prompt_embedding.py
           else
             pytest -sv tests/multicard/test_ilama_lora_tp2.py
             # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
+            # To avoid oom, we need to run the test in a single process.
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
           fi
@@ -47,6 +47,7 @@ user_guide/suppoted_features
 user_guide/supported_models
 user_guide/env_vars
 user_guide/additional_config
+user_guide/graph_mode
 user_guide/release_notes
 :::
 
 
@@ -24,23 +24,25 @@ LLM(model="Qwen/Qwen3-8B", additional_config={"config_key":"config_value"})
 
 The following table lists the additional configuration options available in vLLM Ascend:
 
-| Name                          | Type | Default | Description                                   |
-| ----------------------------- | ---- | ------- | --------------------------------------------- |
-| `torchair_graph_config`       | dict | `{}`    | The config options for torchair graph mode    |
-| `ascend_scheduler_config`     | dict | `{}`    | The config options for ascend scheduler       |
-| `expert_tensor_parallel_size` | str  | `1`     | Expert tensor parallel size the model to use. |
+| Name | Type | Default | Description |
+| ---- | ---- | ------- | ----------- |
+| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
+| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler  |
+| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
+| `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. |
 
 The details of each config option are as follows:
 
 **torchair_graph_config**
-
-| Name                     | Type      | Default | Description                                                       |
-| ------------------------ | --------- | ------- | ----------------------------------------------------------------- |
-| `enabled`                | bool      | `False` | Whether to enable torchair graph mode                             |
-| `use_cached_graph`       | bool      | `False` | Whether to use cached graph                                       |
-| `graph_batch_sizes`      | list[int] | `[]`    | The batch size for torchair graph cache                           |
-| `graph_batch_sizes_init` | bool      | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
-| `enable_kv_nz`           | bool      | `False` | Whether to enable kvcache NZ layout                               |
+| Name | Type | Default | Description |
+| ---- | ---- | ------- | ----------- |
+| `enabled` | bool | `False` | Whether to enable torchair graph mode |
+| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
+| `use_cached_graph` | bool | `False` | Whether to use cached graph |
+| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
+| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
+| `enable_multistream_shared_expert`| bool | `False` | Whether to enable multistream shared expert |
+| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout |
 
 **ascend_scheduler_config**
 
@@ -60,13 +62,15 @@ A full example of additional configuration is as follows:
         "enabled": true,
         "use_cached_graph": true,
         "graph_batch_sizes": [1, 2, 4, 8],
-        "graph_batch_sizes_init": true,
+        "graph_batch_sizes_init": false,
+        "enable_multistream_shared_expert": false,
         "enable_kv_nz": false
     },
     "ascend_scheduler_config": {
         "enabled": true,
         "chunked_prefill_enabled": true,
     },
-    "expert_tensor_parallel_size": 1
+    "expert_tensor_parallel_size": 1,
+    "refresh": false,
 }
 ```
@@ -0,0 +1,82 @@
+# Graph Mode Guide
+
+
+This feature is currently experimental. In future versions, there may be behavioral changes around configuration, coverage, and performance improvement.
+
+This guide provides instructions for using Ascend Graph Mode with vLLM Ascend. Please note that graph mode is only available on the V1 Engine, and only Qwen and DeepSeek series models are well tested in 0.9.0rc1. We'll make it stable and more general in the next release.
+
+## Getting Started
+
+From v0.9.0rc1 with V1 Engine, vLLM Ascend will run models in graph mode by default to keep the same behavior with vLLM. If you hit any issues, please feel free to open an issue on GitHub and fall back to eager mode temporarily by setting `enforce_eager=True` when initializing the model.
+
+There are two kinds of graph mode supported by vLLM Ascend:
+- **ACLGraph**: This is the default graph mode supported by vLLM Ascend. In v0.9.0rc1, only Qwen series models are well tested.
+- **TorchAirGraph**: This is the GE graph mode. In v0.9.0rc1, only DeepSeek series models are supported.
+
+## Using ACLGraph
+ACLGraph is enabled by default. Taking Qwen series models as an example, enabling the V1 Engine is all that is required.
+
+offline example:
+
+```python
+import os
+
+from vllm import LLM
+
+os.environ["VLLM_USE_V1"] = "1"
+
+model = LLM(model="Qwen/Qwen2-7B-Instruct")
+outputs = model.generate("Hello, how are you?")
+```
+
+online example:
+
+```shell
+vllm serve Qwen/Qwen2-7B-Instruct
+```
+
+## Using TorchAirGraph
+
+If you want to run DeepSeek series models with graph mode, you should use [TorchAirGraph](https://www.hiascend.com/document/detail/zh/Pytorch/700/modthirdparty/torchairuseguide/torchair_0002.html). In this case, additional config is required.
+
+offline example:
+
+```python
+import os
+from vllm import LLM
+
+os.environ["VLLM_USE_V1"] = "1"
+
+model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}})
+outputs = model.generate("Hello, how are you?")
+```
+
+online example:
+
+```shell
+vllm serve deepseek-ai/DeepSeek-R1-0528 --additional-config='{"torchair_graph_config": {"enabled": true}}'
+```
+
+You can find more details about additional config [here](./additional_config.md).
+
+## Fallback to Eager Mode
+
+If both `ACLGraph` and `TorchAirGraph` fail to run, you should fall back to eager mode.
+
+offline example:
+
+```python
+import os
+from vllm import LLM
+
+os.environ["VLLM_USE_V1"] = "1"
+
+model = LLM(model="someother_model_weight", enforce_eager=True)
+outputs = model.generate("Hello, how are you?")
+```
+
+online example:
+
+```shell
+vllm serve Qwen/Qwen2-7B-Instruct --enforce-eager
+```
@@ -0,0 +1,51 @@
+import os
+import time
+
+from vllm import LLM, SamplingParams
+
+# enable dual-batch overlap for vllm ascend
+os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"
+os.environ["VLLM_USE_V1"] = "1"
+
+# Sample prompts.
+prompts = ["The president of the United States is"] * 41
+# Create a sampling params object.
+sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(model="deepseek-ai/DeepSeek-V3-Lite-base-latest-w8a8-dynamic",
+              enforce_eager=True,
+              tensor_parallel_size=2,
+              max_model_len=4096,
+              trust_remote_code=True,
+              additional_config={
+                  "torchair_graph_config": {
+                      "enabled": False
+                  },
+                  "ascend_scheduler_config": {
+                      "enabled": True
+                  },
+                  "expert_tensor_parallel_size": 1
+              })
+
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    # Add a buffer to wait for profiler in the background process
+    # (in case MP is on) to finish writing profiling output.
+    time.sleep(10)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,83 @@
+import torch
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizer)
+from vllm import LLM
+
+
+def init_tokenizer_and_llm(model_name: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
+    embedding_layer = transformers_model.get_input_embeddings()
+    llm = LLM(model=model_name, enable_prompt_embeds=True)
+    return tokenizer, embedding_layer, llm
+
+
+def get_prompt_embeds(chat: list[dict[str,
+                                      str]], tokenizer: PreTrainedTokenizer,
+                      embedding_layer: torch.nn.Module):
+    token_ids = tokenizer.apply_chat_template(chat,
+                                              add_generation_prompt=True,
+                                              return_tensors='pt')
+    prompt_embeds = embedding_layer(token_ids).squeeze(0)
+    return prompt_embeds
+
+
+def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
+                            embedding_layer: torch.nn.Module):
+    chat = [{
+        "role": "user",
+        "content": "Please tell me about the capital of France."
+    }]
+    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
+
+    outputs = llm.generate({
+        "prompt_embeds": prompt_embeds,
+    })
+
+    print("\n[Single Inference Output]")
+    print("-" * 30)
+    for o in outputs:
+        print(o.outputs[0].text)
+    print("-" * 30)
+
+
+def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
+                           embedding_layer: torch.nn.Module):
+    chats = [[{
+        "role": "user",
+        "content": "Please tell me about the capital of France."
+    }],
+             [{
+                 "role": "user",
+                 "content": "When is the day longest during the year?"
+             }],
+             [{
+                 "role": "user",
+                 "content": "Where is bigger, the moon or the sun?"
+             }]]
+
+    prompt_embeds_list = [
+        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
+    ]
+
+    outputs = llm.generate([{
+        "prompt_embeds": embeds
+    } for embeds in prompt_embeds_list])
+
+    print("\n[Batch Inference Outputs]")
+    print("-" * 30)
+    for i, o in enumerate(outputs):
+        print(f"Q{i+1}: {chats[i][0]['content']}")
+        print(f"A{i+1}: {o.outputs[0].text}\n")
+    print("-" * 30)
+
+
+def main():
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
+    single_prompt_inference(llm, tokenizer, embedding_layer)
+    batch_prompt_inference(llm, tokenizer, embedding_layer)
+
+
+if __name__ == "__main__":
+    main()
@@ -16,5 +16,8 @@ requires = [
     "torch>=2.5.1",
     "torchvision<0.21.0",
     "wheel",
+    "msgpack",
+    "quart",
+    "numba",
 ]
 build-backend = "setuptools.build_meta"
@@ -9,6 +9,4 @@ ray
 types-jsonschema
 xgrammar
 zmq
-numba
-quart
 types-psutil
@@ -18,3 +18,6 @@ wheel
 # requirements for disaggregated prefill
 msgpack
 quart
+
+# Required for N-gram speculative decoding
+numba
@@ -34,8 +34,7 @@
 # 3% relative tolerance for numerical accuracy.
 RTOL = 0.03
 # Baseline accuracy after VLLM optimization.
-# FIXME: fix the accuracy issue
-EXPECTED_VALUE = 0.000758150113722517
+EXPECTED_VALUE = 0.3843821076573162
 
 
 def run_test(model_name, queue, more_args=None):
Original file line number	Diff line number	Diff line change
`@@ -16,5 +16,8 @@ requires = [`
`16`	`16`	`"torch>=2.5.1",`
`17`	`17`	`"torchvision<0.21.0",`
`18`	`18`	`"wheel",`
	`19`	`+ "msgpack",`
	`20`	`+ "quart",`
	`21`	`+ "numba",`
`19`	`22`	`]`
`20`	`23`	`build-backend = "setuptools.build_meta"`