
Commit c894ff7

Merge remote-tracking branch 'upstream/main' into flashcomm2_new
2 parents: eef2d73 + eab3635


44 files changed (+1116, -574 lines)

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -258,4 +258,4 @@ jobs:
 VLLM_WORKER_MULTIPROC_METHOD: spawn
 VLLM_USE_MODELSCOPE: True
 run: |
-pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
+pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP

.github/workflows/vllm_ascend_test_full.yaml

Lines changed: 2 additions & 0 deletions
@@ -226,6 +226,8 @@ jobs:
 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
+pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
+pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight

 #pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
 pytest -sv tests/e2e/multicard/test_prefix_caching.py

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 1 addition & 0 deletions
@@ -108,4 +108,5 @@ jobs:

 - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
   run: |
+    git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
     bash tests/e2e/pd_disaggreate/run_edge_case_test.sh

examples/disaggregated_prefill_v1/README.md

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5559
+export VLLM_ASCEND_LLMDD_RPC_PORT=5559

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
@@ -85,7 +85,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5659
+export VLLM_ASCEND_LLMDD_RPC_PORT=5659

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
@@ -131,7 +131,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5759
+export VLLM_ASCEND_LLMDD_RPC_PORT=5759

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
@@ -173,7 +173,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5859
+export VLLM_ASCEND_LLMDD_RPC_PORT=5859

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
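The only functional change in these four examples is the RPC-port variable rename. A minimal sketch of the updated launch prologue, assuming (as this README change implies) that the connector now reads the VLLM_ASCEND_-prefixed name:

export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_ASCEND_LLMDD_RPC_PORT=5559   # was: export VLLM_LLMDD_RPC_PORT=5559

vllm serve /models/deepseek_r1_w8a8 \
    --host 0.0.0.0 \
    ...   # remaining flags unchanged from the README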

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ openai
 pytest >= 6.0
 pytest-asyncio
 pytest-mock
-lm-eval==0.4.8
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
 types-jsonschema
 xgrammar
 zmq
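For reference, the pinned harness can be installed through the requirements file as usual, or pulled directly with pip's direct-reference syntax; a sketch, with the commit hash copied verbatim from the requirement line above:

# install all dev requirements, including the git-pinned lm-eval[api]
pip install -r requirements-dev.txt

# or install just the pinned harness
pip install "lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d"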

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 27 additions & 2 deletions
@@ -31,7 +31,9 @@

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

-QWEN_DENSE_MODELS = ["Qwen/QwQ-32B", "Qwen/Qwen-32B"]
+QWEN_DENSE_MODELS = [
+    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+]


 def test_models_distributed_QwQ():
@@ -170,6 +172,29 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
             max_model_len=8192,
             enforce_eager=enforce_eager,
             dtype="auto",
-            tensor_parallel_size=4,
+            tensor_parallel_size=2,
+            quantization="ascend",
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
+def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
+        model, enforce_eager):
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(
+            snapshot_download(model),
+            max_model_len=8192,
+            enforce_eager=enforce_eager,
+            dtype="auto",
+            tensor_parallel_size=2,
+            quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
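These are the two cases wired into vllm_ascend_test_full.yaml above; to run them locally, a sketch assuming a multi-NPU host, since both cases use tensor_parallel_size=2:

# new dense-model e2e cases added by this commit
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight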

tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 0 additions & 1 deletion
@@ -66,7 +66,6 @@ def test_models_distributed_Qwen3_MOE_W8A8():
             max_model_len=8192,
             tensor_parallel_size=2,
             quantization="ascend",
-            enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

tests/e2e/singlecard/ops/test_fused_moe.py

Lines changed: 82 additions & 0 deletions
@@ -29,6 +29,7 @@
 from vllm.model_executor.layers.activation import SiluAndMul

 from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
 from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather

 NUM_EXPERTS = [8, 64]
@@ -165,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
     torch.npu.reset_peak_memory_stats()


+@pytest.mark.parametrize("m", [1, 33, 64])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("ep_size", EP_SIZE)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("device", DEVICE)
+def test_token_dispatcher_with_all_gather_quant(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    ep_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    context_mock = MagicMock()
+    context_mock.fused_moe_state = 0
+    with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
+               return_value=context_mock):
+        a = torch.randn((m, k), device=device, dtype=dtype) / 10
+        w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
+        w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
+        w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
+        w2_scale = torch.empty((e, k), device=device, dtype=dtype)
+
+        score = torch.randn((m, e), device=device, dtype=dtype)
+        expert_map = None
+        local_e = e
+
+        score = torch.softmax(score, dim=-1, dtype=dtype)
+        topk_weights, topk_ids = torch.topk(score, topk)
+        topk_ids = topk_ids.to(torch.int32)
+        row_idx = (torch.arange(
+            0,
+            m * topk,
+            device=device,
+            dtype=torch.int32,
+        ).view(topk, -1).permute(1, 0).contiguous())
+
+        dispatcher_kwargs = {
+            "num_experts": e,
+            "top_k": topk,
+            "num_local_experts": local_e,
+        }
+        dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
+
+        apply_router_weight_on_input = False
+        dispatch_output = dispatcher.token_dispatch(
+            hidden_states=a,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            with_quant=True)
+
+        sorted_hidden_states = dispatch_output["hidden_states"]
+        group_list = dispatch_output["group_list"]
+        group_list_type = dispatch_output.get("group_list_type", 1)
+        dynamic_scale = dispatch_output["dynamic_scale"]
+
+        expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
+                                          w1=w1,
+                                          w1_scale=w1_scale,
+                                          w2=w2,
+                                          w2_scale=w2_scale,
+                                          group_list=group_list,
+                                          group_list_type=group_list_type,
+                                          dynamic_scale=dynamic_scale,
+                                          with_quant=True)
+        combined_output = dispatcher.token_combine(hidden_states=expert_output,
+                                                   bias=None)
+        assert combined_output.shape == (m, k)
+        gc.collect()
+        torch.npu.empty_cache()
+        torch.npu.reset_peak_memory_stats()
+
+
 @pytest.mark.parametrize("m", [1, 33, 64])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
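A sketch for running only the new quantized dispatch/MLP case locally, assuming a single Ascend NPU as the singlecard directory implies:

pytest -sv tests/e2e/singlecard/ops/test_fused_moe.py::test_token_dispatcher_with_all_gather_quant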

tests/e2e/singlecard/test_ascend_scheduler.py

Lines changed: 23 additions & 0 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
@@ -86,3 +87,25 @@ def test_chunked_prefill_with_ascend_scheduler(
         name_0="vllm_output",
         name_1="chunked_prefill_output",
     )
+
+
+def test_async_scheduling() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner(
+            "Qwen/Qwen2.5-0.5B-Instruct",
+            max_model_len=4096,
+            max_num_seqs=50,
+            dtype="bfloat16",
+            gpu_memory_utilization=0.9,
+            async_scheduling=True,
+    ) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
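Likewise, a sketch for exercising the new async-scheduling case on its own (single-card environment assumed):

pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py::test_async_scheduling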

tests/ut/distributed/test_distributed_tensor_parallel.py

Lines changed: 0 additions & 139 deletions
This file was deleted.
