@@ -129,36 +129,38 @@ def capture_register(dispatcher_instance):
         with_quant=False)
 
     with patch('torch.distributed.get_rank', return_value=0), \
-            patch('torch.distributed.get_world_size', return_value=4), \
-            patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
-            patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
-            patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-            patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-            patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
-            patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
-            patch('torch.distributed.all_gather'), \
-            patch('torch.distributed.all_to_all_single'), \
-            patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \
-            patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \
-            patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
-                  return_value=mock_dp_and_tp_group(mocker)), \
-            patch('vllm_ascend.ops.fused_moe.get_ascend_config',
-                  return_value=MagicMock(
-                      torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
-                      expert_map_path=None
-                  )), \
-            patch('vllm_ascend.ops.fused_moe.determine_expert_map',
-                  return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
-            patch('vllm_ascend.ops.fused_moe.get_forward_context',
-                  return_value=mock_forward_context_obj), \
+            patch('torch.distributed.get_world_size', return_value=4), \
+            patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
+            patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
+            patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+            patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+            patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+            patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+            patch('torch.distributed.all_gather'), \
+            patch('torch.distributed.all_to_all_single'), \
+            patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \
+            patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \
+            patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
+                  return_value=mock_dp_and_tp_group(mocker)), \
+            patch('vllm_ascend.ops.fused_moe.get_ascend_config',
+                  return_value=MagicMock(
+                      torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
+                      expert_map_path=None
+                  )), \
+            patch('vllm_ascend.ops.fused_moe.determine_expert_map',
+                  return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
+            patch('vllm_ascend.ops.fused_moe.get_forward_context',
+                  return_value=mock_forward_context_obj), \
             patch('vllm_ascend.ops.fused_moe.get_current_vllm_config',
-                  return_value=MagicMock(
-                      parallel_config=MagicMock(tensor_parallel_size=2),
-                      scheduler_config=MagicMock(max_num_seqs=4),
-                      model_config=MagicMock(max_model_len=2048)
-                  )), \
+                  return_value=MagicMock(
+                      parallel_config=MagicMock(tensor_parallel_size=2),
+                      scheduler_config=MagicMock(max_num_seqs=4),
+                      model_config=MagicMock(max_model_len=2048)
+                  )), \
             patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
-            patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers):
+            patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers), \
+            patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context',
+                  return_value=mock_forward_context_obj):
 
         yield {
             'mock_forward_context_obj': mock_forward_context_obj,
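
Note on the added patch: unittest.mock.patch replaces a name in the namespace where it is looked up, not where it is defined, so a module that bound get_forward_context at import time keeps its own reference. The fixture therefore also patches the name inside vllm_ascend.ops.layers.moe_mlp, handing it the same mock_forward_context_obj. A minimal, self-contained sketch of the pitfall (helpers, pkg_a, and pkg_b are hypothetical stand-in modules, not vllm code):

    import sys
    import types
    from unittest.mock import MagicMock, patch

    helpers = types.ModuleType('helpers')
    helpers.get_forward_context = lambda: 'real'
    pkg_a = types.ModuleType('pkg_a')
    pkg_b = types.ModuleType('pkg_b')
    # Both modules bind the name at import time, as with
    # "from helpers import get_forward_context".
    pkg_a.get_forward_context = helpers.get_forward_context
    pkg_b.get_forward_context = helpers.get_forward_context
    sys.modules.update(helpers=helpers, pkg_a=pkg_a, pkg_b=pkg_b)

    mock_ctx = MagicMock(name='forward_context')
    with patch('pkg_a.get_forward_context', return_value=mock_ctx):
        assert pkg_a.get_forward_context() is mock_ctx  # patched binding
        assert pkg_b.get_forward_context() == 'real'    # untouched binding

Patching only pkg_a (here, fused_moe) would leave pkg_b (here, moe_mlp) calling the real function, which is why both targets receive the same mock.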
@@ -524,27 +526,22 @@ def test_select_experts(self, mock_dist_env, mock_moe_env,
 
 class TestUnifiedApplyMLP(TestBase):
 
-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
-    @patch('vllm_ascend.ops.fused_moe.get_mc2_group')
-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_dynamic_quant')
     @patch('torch_npu.npu_dequant_swiglu_quant')
     def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,
                                                      mock_npu_dynamic_quant,
                                                      mock_npu_grouped_matmul,
                                                      mock_is_310p,
-                                                     mock_get_mc2_group,
                                                      mock_get_forward_context):
 
         mock_forward_context = MagicMock()
         mock_forward_context.with_quant = True
         mock_forward_context.fused_moe_state = FusedMoEState.MC2
         mock_get_forward_context.return_value = mock_forward_context
 
-        mock_mc2_group = MagicMock()
-        mock_get_mc2_group.return_value = mock_mc2_group
-
         mock_is_310p.return_value = False
 
         mock_npu_dynamic_quant.return_value = (torch.randint(-128,
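
The get_mc2_group mock is removed in three places at once because stacked @patch decorators inject mocks bottom-up: the decorator closest to the def supplies the first mock parameter, so deleting one decorator must delete exactly one argument, plus any body lines that configured it. A runnable sketch of the mapping, using a hypothetical demo module:

    import sys
    import types
    from unittest.mock import patch

    demo = types.ModuleType('demo')
    demo.get_ctx = lambda: 'ctx'
    demo.is_310p = lambda: True
    sys.modules['demo'] = demo

    @patch('demo.get_ctx')   # outermost decorator -> last mock argument
    @patch('demo.is_310p')   # innermost decorator -> first mock argument
    def check(mock_is_310p, mock_get_ctx):
        mock_is_310p.return_value = False
        assert demo.is_310p() is False                   # patched value
        assert demo.get_ctx() is mock_get_ctx.return_value

    check()  # both patches are active only for the duration of the call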
@@ -597,8 +594,8 @@ def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,
 
         self.assertEqual(result.dtype, torch.bfloat16)
 
-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
@@ -645,7 +642,7 @@ def test_unified_apply_mlp_without_quantization(
         self.assertEqual(result.shape, hidden_states.shape)
         self.assertEqual(result.dtype, torch.float16)
 
-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
@@ -705,8 +702,8 @@ def test_unified_apply_mlp_with_quantization_and_dynamic_scale(
         self.assertEqual(result.shape, hidden_states.shape)
         self.assertEqual(result.dtype, torch.bfloat16)
 
-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
@@ -755,4 +752,4 @@ def test_unified_apply_mlp_without_quantization_310p(
         mock_npu_swiglu.assert_called_once()
 
         self.assertEqual(result.shape, hidden_states.shape)
-        self.assertEqual(result.dtype, torch.float16)
+        self.assertEqual(result.dtype, torch.float16)
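
The assertion itself is unchanged in this last hunk: the 310P test still expects the unquantized MLP path to return float16. A toy sketch (toy_mlp is a hypothetical module, not the real moe_mlp; the dtype branch is invented purely to make the behavior observable) of the module-local pattern these tests now rely on, where the helper resolves is_310p from its own namespace:

    import sys
    import types
    from unittest.mock import patch

    import torch

    toy_mlp = types.ModuleType('toy_mlp')

    def _apply_mlp(hidden_states):
        # Branch on the SoC check looked up in toy_mlp's own namespace,
        # so a test can intercept it with patch('toy_mlp.is_310p', ...).
        dtype = torch.float16 if toy_mlp.is_310p() else torch.bfloat16
        return hidden_states.to(dtype)

    toy_mlp.apply_mlp = _apply_mlp
    toy_mlp.is_310p = lambda: False
    sys.modules['toy_mlp'] = toy_mlp

    with patch('toy_mlp.is_310p', return_value=True):
        out = toy_mlp.apply_mlp(torch.ones(2, 4, dtype=torch.float16))
    assert out.dtype == torch.float16  # float16 preserved on the 310P path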