Closed
Changes from 5 commits (22 commits in total)
Commits
3fa0c35
[Refactor][MOE] remove redundant code.
Aug 28, 2025
8d3331e
[Refactor][MOE] remove redundant code.
Aug 28, 2025
dcde977
[Refactor][MOE] remove redundant code.
Aug 28, 2025
ce1db63
[Refactor][MOE] remove redundant code.
Aug 28, 2025
93d0cdb
[Refactor][MOE] remove redundant code.
Aug 28, 2025
b6ccdea
[Refactor][MOE] remove redundant code.
Aug 28, 2025
bd34a97
[Refactor][MOE] remove redundant code.
Aug 29, 2025
b2902a6
[Refactor][MOE] remove redundant code.
Aug 29, 2025
9e5b59e
[Refactor][MOE] remove redundant code.
Aug 29, 2025
6eea19b
[Refactor][MOE] remove redundant code.
Aug 29, 2025
3a2ec56
[Refactor][MOE] remove redundant code.
Aug 29, 2025
d2195c7
[Bugfix] Fix aclgraph not enabled by default (#2590)
MengqingCao Aug 28, 2025
297ea45
Support v0.10.1 (#2584)
Yikun Aug 28, 2025
29d32ea
[Fix] Fix DP-related padding logic (#2582)
yiz-liu Aug 28, 2025
98d3ce4
[CI] Add e2e ci test for A3 (#2573)
zhangxinyuehfad Aug 29, 2025
5cbb7a6
[Feat]: Add custom lmhead tensor model parallel (#2309)
lidenghui1110 Aug 29, 2025
ab13da5
[Fix] Resolve data-parallel (DP) assertion errors in TorchAir (#2626)
yiz-liu Aug 29, 2025
3d1cb62
[main] [bugfix] Fix misjudging quantized/unquantized scenarios (#2627)
Pr0Wh1teGivee Aug 29, 2025
dc4b209
[MAIN][BUGFIX] BugFix: Resolve the issue of waiting queue accumulatio…
wangxiaoteng888 Aug 29, 2025
3137dfc
[Refactor][MOE] remove redundant code.
Aug 28, 2025
85694b3
[Refactor][MOE] remove redundant code.
Aug 29, 2025
5d0528f
[Refactor][MOE] remove redundant code.
Aug 29, 2025
1 change: 0 additions & 1 deletion docs/source/user_guide/release_notes.md
@@ -73,7 +73,6 @@ This is the 1st release candidate of v0.10.0 for vLLM Ascend. Please follow the
* `expert_tensor_parallel_size` in `additional_config` is removed now, and the EP and TP is aligned with vLLM now. [#1681](https://github.com/vllm-project/vllm-ascend/pull/1681)
* Add `VLLM_ASCEND_MLA_PA` in environ variables, use this to enable mla paged attention operator for deepseek mla decode.
* Add `VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE` in environ variables, enable `MatmulAllReduce` fusion kernel when tensor parallel is enabled. This feature is supported in A2, and eager mode will get better performance.
* Add `VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ` in environ variables, Whether to enable moe all2all seq, this provides a basic framework on the basis of alltoall for easy expansion.
Collaborator:
do not remove release note

Collaborator Author:
ok

- UT coverage reached 76.34% after a batch of prs followed by this rfc: [#1298](https://github.com/vllm-project/vllm-ascend/issues/1298)
- Sequence Parallelism works for Qwen3 MoE. [#2209](https://github.com/vllm-project/vllm-ascend/issues/2209)
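Not part of the diff: a minimal sketch of how the flags listed in this release note are read at runtime, assuming `vllm_ascend.envs` resolves attribute access to the `os.getenv`-backed lambdas defined in `vllm_ascend/envs.py` (the same access pattern the removed `ascend_forward_context.py` branch later in this PR relied on).

```python
# Sketch only, not part of the diff. Assumption: attribute access on vllm_ascend.envs
# evaluates the corresponding lambda from vllm_ascend/envs.py.
import vllm_ascend.envs as envs_ascend

if envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE:
    # Enabled by exporting VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE=1 before launching vLLM.
    print("MatmulAllReduce fusion kernel enabled for tensor parallelism")
```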
22 changes: 0 additions & 22 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -141,28 +141,6 @@ def test_models_distributed_topk() -> None:
vllm_model.generate(example_prompts, sampling_params)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"})
def test_models_distributed_alltoallv() -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)

with VllmRunner(
"deepseek-ai/DeepSeek-V2-Lite",
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


def test_models_distributed_Qwen3_W8A8():
example_prompts = [
"Hello, my name is",
14 changes: 5 additions & 9 deletions tests/ut/ops/test_fused_ops.py
@@ -27,9 +27,9 @@
from vllm_ascend.ascend_forward_context import (FusedMoEState,
_get_fused_moe_state)
from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
AscendUnquantizedFusedMoEMethod,
unified_apply_mlp)
AscendUnquantizedFusedMoEMethod)
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
from vllm_ascend.utils import AscendSocVersion, adapt_patch

adapt_patch(True)
@@ -442,11 +442,11 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
assert result.shape == expected_shape

@pytest.mark.parametrize("others_param",
[[16, False], [1, True], [1, False], [4, False]])
[[16], [1], [4]])
def test_apply_with_expert_map(self, moe_method, mock_dist_env,
mock_moe_env, others_param):

ep_size, alltoall_buffer = others_param
ep_size = others_param
is_prefill = False

if ep_size == 1:
@@ -464,9 +464,7 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
with_quant=False,
token_dispatcher=selected_token_dispatcher)

with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
alltoall_buffer), \
patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
with patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3):

expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
@@ -475,8 +473,6 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
if ep_size == 1:
x = x.view(-1, 2)
router_logits = torch.randn(8, 8)
if alltoall_buffer:
moe_method.max_model_len = 1
layer = MagicMock()

local_num_experts = 2
44 changes: 1 addition & 43 deletions tests/ut/ops/test_token_dispatcher.py
@@ -23,53 +23,11 @@

from tests.ut.base import PytestBase, TestBase
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
AscendSocVersion, MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig,
TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
AscendSocVersion, TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
TokenDispatcherWithMC2, _Dispatchers, _register_token_dispatcher,
get_token_dispatcher, setup_token_dispatchers)


class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase):

@pytest.fixture
def config(self):
config = MoEDispatcherConfig()
config.set_num_local_experts(2)
config.set_num_moe_experts(4)
config.set_moe_pad_expert_input_to_capacity(False)
config.set_moe_expert_capacity_factor(None)
config.set_moe_router_topk(2)
config.set_moe_grouped_gemm(False)
config.set_group_topk(0)
config.set_num_groups(1)
config.set_is_fused(False)
return config.build()

def mock_ep_group(self, mocker):
mock_group = mocker.MagicMock()
mock_group.rank_in_group = 0
mock_group.world_size = 2
mock_group.device_group = "mock_group"
return mock_group

@pytest.fixture
def dispatcher(self, config, mocker: MockerFixture):
mocker.patch(
"vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
return_value=self.mock_ep_group(mocker))
mocker.patch("torch.npu.current_device", return_value="cpu")
mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock)
return MoEAlltoAllSeqOverLapDispatcher(config)

def test_initialization(self, dispatcher, config):
assert dispatcher.num_local_experts == config.num_local_experts
assert dispatcher.num_experts == config.num_moe_experts
assert dispatcher.local_expert_indices == [0, 1]
assert dispatcher.ep_rank == 0
assert dispatcher.ep_size == 2
assert dispatcher.overlap_stream is not None


class TestTokenDispatcherWithMC2(TestBase):

def setUp(self):
10 changes: 3 additions & 7 deletions tests/ut/torchair/ops/test_torchair_fused_moe.py
@@ -354,7 +354,7 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
assert result.shape == x.shape

@pytest.mark.parametrize("others_param",
[[16, False], [1, True], [1, False], [4, False]])
[[16], [1], [4]])
def test_apply_with_expert_map(self, moe_method, mock_dist_env,
mock_moe_env, others_param):
"""
@@ -363,22 +363,18 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
3 test use_select_experts and fused_experts_with_all2all
4 test use_select_experts and fused_experts
"""
ep_size, alltoall_buffer = others_param
ep_size = others_param
is_prefill = False
forward_context = MagicMock(
fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.MOE_ALL2ALL_BUFFER",
alltoall_buffer), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
moe_method.ep_size = ep_size
x = torch.randn(8, 2, 2)
if ep_size == 1:
x = x.view(-1, 2)
router_logits = torch.randn(8, 8)
if alltoall_buffer:
moe_method.max_model_len = 1
layer = MagicMock()
layer.w13_weight = torch.randn(8, 16, 1)
layer.w2_weight = torch.randn(16, 8, 1)
4 changes: 0 additions & 4 deletions vllm_ascend/ascend_forward_context.py
@@ -35,10 +35,6 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
return FusedMoEState.NaiveMulticast
else:
return FusedMoEState.AllGather
elif envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ:
# MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage.
return (FusedMoEState.All2AllSeq if
(ep_size < 16 or with_prefill) else FusedMoEState.MC2)
# NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
elif ep_size < 16 or with_prefill:
return FusedMoEState.All2All
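Not part of the diff: a minimal, self-contained sketch of the state selection that remains after the `All2AllSeq` branch above is removed. The `ep_size == 1` condition, the `NaiveMulticast` branch, and the final `MC2` fallback are assumptions based on surrounding context rather than lines shown in this hunk.

```python
from enum import Enum


class FusedMoEState(Enum):
    # Member names taken from the hunk above; values are illustrative.
    AllGather = 0
    NaiveMulticast = 1
    All2All = 2
    MC2 = 3


def select_fused_moe_state(ep_size: int, with_prefill: bool) -> FusedMoEState:
    """Hypothetical simplification of _get_fused_moe_state after this PR."""
    if ep_size == 1:
        # The NaiveMulticast-vs-AllGather condition is not visible in the hunk,
        # so only the AllGather default is modeled here.
        return FusedMoEState.AllGather
    # NOTE: MC2 needs ep_size >= 16, and all2all can't be used in the torchair graph.
    if ep_size < 16 or with_prefill:
        return FusedMoEState.All2All
    return FusedMoEState.MC2
```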
10 changes: 0 additions & 10 deletions vllm_ascend/envs.py
@@ -90,11 +90,6 @@
"VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE":
lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0'))
),
# MOE_ALL2ALL_BUFFER:
# 0: default, normal init.
# 1: enable moe_all2all_buffer.
"MOE_ALL2ALL_BUFFER":
lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))),
# Some models are optimized by vllm ascend. While in some case, e.g. rlhf
# training, the optimized model may not be suitable. In this case, set this
# value to False to disable the optimized model.
@@ -136,11 +131,6 @@
# this feature is supported in A2, and eager mode will get better performance.
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))),
# Whether to enable the alltoall_seq flag, this provides a basic framework on the basis of alltoall for easy expansion.
# 0: default, normal init.
# 1: enable moe all2all seq.
"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ":
lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ', '0'))),
# Whether to enable mlp optimize when tensor parallel is enabled.
# this feature in eager mode will get better performance.
"VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
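Not part of the diff: a minimal sketch of the flag-definition pattern this file uses, so the two deletions above read in context. Each entry maps a name to a zero-argument lambda over `os.getenv`, with booleans encoded as "0"/"1"; the flag name below is hypothetical.

```python
import os

# Hypothetical flag, for illustration only; real entries in envs.py follow the same shape.
env_variables = {
    "VLLM_ASCEND_EXAMPLE_FLAG":
    lambda: bool(int(os.getenv("VLLM_ASCEND_EXAMPLE_FLAG", "0"))),
}

enabled = env_variables["VLLM_ASCEND_EXAMPLE_FLAG"]()  # False unless exported as "1"
```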
98 changes: 96 additions & 2 deletions vllm_ascend/ops/common_fused_moe.py
@@ -18,20 +18,23 @@
from typing import Any, Callable, Optional

import torch
import torch_npu
from vllm.config import CompilationLevel, get_current_vllm_config
from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod)
from vllm.model_executor.layers.fused_moe.config import \
FusedMoEParallelConfig # isort: skip

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
DummyCommImpl,
MC2CommImpl,
MoECommMethod)
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.fused_moe import apply_mlp, fused_experts_moge
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.layers.moe_mlp import unquant_apply_mlp
from vllm_ascend.utils import is_310p

original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
@@ -81,7 +84,7 @@ def fused_experts(

permuted_hidden_states, expert_tokens, group_list_type = moe_comm_method.permute(
hidden_states, topk_ids, topk_weights, expert_map, num_experts)
mlp_output = apply_mlp(
mlp_output = unquant_apply_mlp(
permuted_hidden_states,
w1,
w2,
@@ -93,6 +96,97 @@ def fused_experts(
return hidden_states


def fused_experts_moge(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
moe_parallel_config: FusedMoEParallelConfig,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
top_k: int,
global_num_experts: int,
expert_map: torch.Tensor = None,
apply_router_weight_on_input: bool = False,
) -> torch.Tensor:
"""

Args:
hidden_states: Hidden states of shape (num_tokens, hidden_size).
w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
topk_weights: Routing weights of shape (num_tokens, top_k).
topk_ids: Selected expert IDs of shape (num_tokens, top_k).
top_k: Number of experts to select.
expert_map: Expert mapping of shape (num_experts,).

Returns:
hidden_states: Hidden states after routing.
"""
ep_size = moe_parallel_config.ep_size
local_num_experts = global_num_experts // ep_size
local_num_group = top_k // ep_size

if apply_router_weight_on_input:
assert (topk_weights.dim() == 2
), "`topk_weights` should be in shape (num_tokens, topk)"
_, topk = topk_weights.shape
assert (
topk == 1
), "Only support topk=1 when `apply_router_weight_on_input` is True"
hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)

bsz, _ = hidden_states.shape
flatten_topk_ids = topk_ids.view(-1)
sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
sorted_topk_ids = sorted_topk_ids.to(torch.int32)
sorted_hidden_states = hidden_states.index_select(
0, sorted_topk_ids // local_num_group)

experts_id = torch.arange(0,
local_num_experts,
dtype=topk_ids.dtype,
device=topk_ids.device)
num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
torch.float32).sum(0)
topk_scales = topk_weights.view(-1).index_select(
0, sorted_topk_ids).unsqueeze(-1)
group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)

w1 = w1.transpose(1, 2)
gate_up_out = torch_npu.npu_grouped_matmul(
x=[sorted_hidden_states],
weight=[w1],
split_item=2,
group_list_type=0,
group_type=0,
group_list=group_list,
)[0]

if is_310p():
gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
torch.float16)
else:
gate_up_out = torch_npu.npu_swiglu(gate_up_out)
gate_up_out *= topk_scales

w2 = w2.transpose(1, 2)
down_out_list = torch_npu.npu_grouped_matmul(
x=[gate_up_out],
weight=[w2],
split_item=2,
group_list_type=0,
group_type=0,
group_list=group_list,
)[0]

unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids)
final_hidden_states = unsorted_hidden_states.reshape(
bsz, top_k // ep_size, -1).sum(1)

return final_hidden_states

Comment on lines +98 to +187
Contributor (high):
This function `fused_experts_moge` is identical to `torchair_fused_experts_moge` in vllm_ascend/torchair/ops/torchair_fused_moe.py. To avoid code duplication and improve maintainability, consider using this common function in the torchair implementation as well. This would involve removing `torchair_fused_experts_moge` and updating its call site to use this function.
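Not part of the diff: a sketch of the deduplication this comment suggests. Only `fused_experts_moge` and its signature come from this PR; the wrapper name and its placement in the torchair module are hypothetical.

```python
# Hypothetical call-site change in vllm_ascend/torchair/ops/torchair_fused_moe.py:
# delegate to the shared implementation instead of keeping a private copy.
import torch
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig

from vllm_ascend.ops.common_fused_moe import fused_experts_moge


def torchair_apply_moge(hidden_states: torch.Tensor,
                        w1: torch.Tensor,
                        w2: torch.Tensor,
                        moe_parallel_config: FusedMoEParallelConfig,
                        topk_weights: torch.Tensor,
                        topk_ids: torch.Tensor,
                        top_k: int,
                        global_num_experts: int,
                        expert_map: torch.Tensor = None,
                        apply_router_weight_on_input: bool = False) -> torch.Tensor:
    # Thin wrapper so the torchair path and the common path stay in sync.
    return fused_experts_moge(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        moe_parallel_config=moe_parallel_config,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        top_k=top_k,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        apply_router_weight_on_input=apply_router_weight_on_input)
```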


def unquantized_fused_moe_init_func(self, *args, **kwargs):
original_unquantized_fused_moe_init_func(self, *args, **kwargs)
vllm_config = get_current_vllm_config()