@@ -23,10 +23,8 @@
 from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
 
 from vllm_ascend.ascend_forward_context import _get_fused_moe_state
-# from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
-#                                        AscendUnquantizedFusedMoEMethod)
-from vllm_ascend.torchair.ops.torchair_fused_moe import (TorchairAscendFusedMoE,
-                                                         TorchairAscendUnquantizedFusedMoEMethod)
+from vllm_ascend.torchair.ops.torchair_fused_moe import (
+    TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
 from vllm_ascend.utils import AscendSocVersion, adapt_patch  # noqa E402
 
 adapt_patch(True)

@@ -57,33 +55,33 @@ def mock_dist_env(mocker: MockerFixture):
 
     with patch('torch.distributed.get_rank', return_value=0), \
         patch('torch.distributed.get_world_size', return_value=4), \
-        patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('torch.distributed.all_gather', return_value=MagicMock(return_value=torch.randn(10,32))), \
         patch('torch.distributed.all_to_all_single', return_value=torch.randn(8, 32)), \
-        patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce',
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.tensor_model_parallel_all_reduce',
              return_value=torch.randn(5, 32)), \
-        patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter',
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.data_parallel_reduce_scatter',
              return_value=torch.randn(5, 32)), \
         patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
              return_value=mock_dp_and_tp_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_ascend_config',
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config',
              return_value=MagicMock(
                  torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
                  expert_map_path=None
              )), \
-        patch('vllm_ascend.ops.fused_moe.determine_expert_map',
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.determine_expert_map',
              return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
-        patch('vllm_ascend.ops.fused_moe.get_forward_context',
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context',
              return_value=MagicMock(
                  max_tokens_across_dp=10,
                  dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
              )), \
-        patch('vllm_ascend.ops.fused_moe.get_current_vllm_config',
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config',
              return_value=MagicMock(
                  parallel_config=MagicMock(tensor_parallel_size=2),
                  scheduler_config=MagicMock(max_num_seqs=4),

@@ -196,7 +194,7 @@ def apply(self, hidden_states: torch.Tensor,
         pass
 
 
-class TestAscendFusedMoe:
+class TestTorchairAscendFusedMoe:
 
     def test_init_no_quant(self, mock_dist_env, default_moe_config):
         layer = TorchairAscendFusedMoE(**default_moe_config)

@@ -233,7 +231,7 @@ def test_init_with_quant(self, mock_dist_env, default_moe_config):
         mock_quant_config.get_quant_method.return_value = mock_quant_method
 
         moe = TorchairAscendFusedMoE(**default_moe_config,
-                             quant_config=mock_quant_config)
+                                     quant_config=mock_quant_config)
 
         assert moe.quant_method is not None
         assert moe.quant_method == mock_quant_method

@@ -266,7 +264,7 @@ def test_forward(self, mock_dist_env, default_moe_config, others_param):
         forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens,
                                                          dtype=torch.bool),
                                     padded_num_tokens=num_tokens)
-        with patch("vllm_ascend.ops.fused_moe.get_forward_context",
+        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context",
                    return_value=forward_context):
             output = moe.forward(inputs,
                                  router_logits,

@@ -299,7 +297,7 @@ def test_forward_ms_fused_moe_comp(self, mock_dist_env,
         assert output.shape == (5, 32)
 
 
-class TestAscendUnquantizedFusedMoEMethod:
+class TestTorchairAscendUnquantizedFusedMoEMethod:
 
     def test_process_weights_after_loading(self, moe_method, mock_dist_env):
         layer = MagicMock()

@@ -328,7 +326,7 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
         is_deepseek_v3_r1 = global_num_experts == 256
         forward_context = MagicMock(fused_moe_state=_get_fused_moe_state(
             ep_size, is_prefill, is_deepseek_v3_r1))
-        with patch("vllm_ascend.ops.fused_moe.get_forward_context",
+        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context",
                    return_value=forward_context):
             moe_method.ep_size = ep_size
             x = torch.randn(8, 2, 2)

@@ -363,10 +361,10 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
         is_prefill = False
         forward_context = MagicMock(
             fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
-        with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
+        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.MOE_ALL2ALL_BUFFER",
                    alltoall_buffer), \
-            patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
-            patch("vllm_ascend.ops.fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
+            patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
+            patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
             expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
             moe_method.ep_size = ep_size
             x = torch.randn(8, 2, 2)
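
A note on the retargeted patch paths: unittest.mock.patch replaces a name in the namespace where it is looked up, not where it is defined. Because torchair_fused_moe binds helpers such as get_ascend_config and get_forward_context into its own namespace at import time, the fixture must patch 'vllm_ascend.torchair.ops.torchair_fused_moe.<name>' once the tests target that module. Below is a minimal sketch of the rule; the in-memory modules toy_defs and toy_consumer are hypothetical stand-ins, not part of this repo.

import sys
from types import ModuleType
from unittest.mock import patch

# Hypothetical "defining" module, standing in for the module that provides
# a helper like get_ascend_config.
defs = ModuleType('toy_defs')
defs.get_config = lambda: 'real'
sys.modules['toy_defs'] = defs

# Hypothetical "consuming" module, standing in for torchair_fused_moe, which
# binds the helper into its own namespace via a from-import at load time.
consumer = ModuleType('toy_consumer')
consumer.get_config = defs.get_config
consumer.use = lambda: consumer.get_config()
sys.modules['toy_consumer'] = consumer

# Patching the defining module leaves the consumer's binding untouched,
# so the code under test never sees the mock:
with patch('toy_defs.get_config', return_value='mocked'):
    assert consumer.use() == 'real'

# Patching the name where it is looked up is what the fixture above does:
with patch('toy_consumer.get_config', return_value='mocked'):
    assert consumer.use() == 'mocked'

This is why moving the tests from vllm_ascend.ops.fused_moe to the torchair module requires every patch target in the fixture to move with it.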
|
|