import copy
from unittest.mock import Mock, patch

import torch

from tests.ut.base import TestBase
from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import (
    TorchairAscendW4A8DynamicFusedMoEMethod,
    TorchairAscendW4A8DynamicLinearMethod)

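# Unit tests for the torchair W4A8 dynamic quantization methods: the linear
# method's weight and per-group parameter creation, and the fused-MoE
# method's parameter creation and post-load weight processing.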
class TestAscendW4A8DynamicLinearMethod(TestBase):

    def setUp(self):
        self.method = TorchairAscendW4A8DynamicLinearMethod()
        self.method.group_size = 8

    def test_get_weight(self):
        # int4 weights are materialized as an int8 tensor of shape
        # (output_size, input_size).
        weight = self.method.get_weight(8, 32, torch.bfloat16)
        self.assertEqual(weight["weight"].dtype, torch.int8)
        self.assertEqual(weight["weight"].shape, (32, 8))

    def test_get_pergroup_param(self):
        params = self.method.get_pergroup_param(8, 32, torch.bfloat16)
        # First-stage (per-channel) scale and offset.
        self.assertEqual(params["weight_scale"].dtype, torch.bfloat16)
        self.assertEqual(params["weight_scale"].shape, (32, 1))
        self.assertEqual(params["weight_offset"].dtype, torch.bfloat16)
        self.assertEqual(params["weight_offset"].shape, (32, 1))
        # Second-stage (per-group) scale and offset; with input_size equal to
        # group_size there is exactly one group per output channel.
        self.assertEqual(params["weight_scale_second"].dtype, torch.bfloat16)
        self.assertEqual(params["weight_scale_second"].shape, (32, 1))
        self.assertEqual(params["weight_offset_second"].dtype, torch.bfloat16)
        self.assertEqual(params["weight_offset_second"].shape, (32, 1))


class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
    experts = 8
    input_size = 16
    output_size = 56
    group_size = 2

    @patch(
        'vllm_ascend.torchair.quantization.torchair_w4a8_dynamic.get_current_vllm_config'
    )
    @patch(
        'vllm_ascend.torchair.quantization.torchair_w4a8_dynamic.get_ep_group')
    @patch('vllm_ascend.ascend_config.get_ascend_config')
    @patch(
        'vllm_ascend.torchair.quantization.torchair_w4a8_dynamic.get_mc2_group'
    )
    @patch('torch.distributed.get_rank', return_value=0)
    def setUp(self, mock_get_rank, mock_get_mc2_group, mock_get_ascend_config,
              mock_get_ep_group, mock_get_current_vllm_config):
        mock_ascend_config = Mock()
        mock_ascend_config.torchair_graph_config = Mock(enabled=False)
        mock_get_ascend_config.return_value = mock_ascend_config
        mock_vllm_config = Mock()
        mock_vllm_config.quant_config = Mock(quant_description={
            "group_size": self.group_size,
            "version": "0.0.0"
        })
        mock_vllm_config.parallel_config = Mock(enable_expert_parallel=True)
        mock_get_current_vllm_config.return_value = mock_vllm_config
        self.quant_method = TorchairAscendW4A8DynamicFusedMoEMethod()
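        # NOTE: quant_description version "0.0.0" presumably selects the
        # legacy quant format, so new_quant_version starts out False;
        # individual tests flip it to cover the packed-int4 path.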

    def test_get_weight(self):
        # Old quant version: w13 weights are allocated unpacked as int8.
        param_dict = self.quant_method.get_weight(self.experts,
                                                  self.input_size,
                                                  self.output_size,
                                                  torch.bfloat16)
        self.assertEqual(param_dict["w13_weight"].dtype, torch.int8)
        self.assertEqual(param_dict["w13_weight"].shape,
                         (self.experts, 2 * self.input_size, self.output_size))
        # New quant version: the first weight dimension is halved, consistent
        # with two int4 values packed into each int8 byte.
        self.quant_method.new_quant_version = True
        param_dict = self.quant_method.get_weight(self.experts,
                                                  self.input_size,
                                                  self.output_size,
                                                  torch.bfloat16)
        self.assertEqual(param_dict["w13_weight"].dtype, torch.int8)
        self.assertEqual(param_dict["w13_weight"].shape,
                         (self.experts, self.input_size, self.output_size))
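        # Packing arithmetic for these constants: unpacked w13 is
        # (8, 32, 56), while the packed int8 view is (8, 16, 56), i.e. two
        # int4 values per int8 byte (an assumption about the storage layout).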

    def test_get_dynamic_quant_param(self):
        # Old quant version: per-channel first-stage scales plus per-group
        # second-stage scales for both the w13 and w2 projections.
        param_dict = self.quant_method.get_dynamic_quant_param(
            self.experts, self.input_size, self.output_size, torch.bfloat16)
        self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
        self.assertEqual(param_dict["w13_weight_scale"].shape,
                         (self.experts, 2 * self.input_size, 1))
        self.assertEqual(param_dict["w13_weight_scale_second"].dtype,
                         torch.bfloat16)
        self.assertEqual(param_dict["w13_weight_scale_second"].shape,
                         (self.experts, 2 * self.input_size,
                          self.output_size // self.group_size))
        self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16)
        self.assertEqual(param_dict["w2_weight_scale"].shape,
                         (self.experts, self.output_size, 1))
        self.assertEqual(param_dict["w2_weight_scale_second"].dtype,
                         torch.bfloat16)
        self.assertEqual(param_dict["w2_weight_scale_second"].shape,
                         (self.experts, self.output_size,
                          self.input_size // self.group_size))
        # New quant version: an extra float32 scale-bias tensor is created,
        # sharded across tensor-parallel ranks.
        self.quant_method.new_quant_version = True
        param_dict = self.quant_method.get_dynamic_quant_param(
            self.experts, self.input_size, self.output_size, torch.bfloat16)
        self.assertEqual(param_dict["w2_scale_bias"].dtype, torch.float32)
        self.assertEqual(
            param_dict["w2_scale_bias"].shape,
            (self.experts, self.output_size, 16 // self.quant_method.tp_size))
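    # process_weights_after_loading moves weights to device and repacks them
    # via torch_npu.npu_quantize; both NPU entry points are mocked below, so
    # only the host-side shape and dtype handling is exercised.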
    @patch('torch_npu.npu_quantize')
    @patch('torch.Tensor.npu')
    def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize):
        # Old quant version: start from unpacked int8 weights with bfloat16
        # scales; float32 scale-bias tensors should be derived.
        layer = torch.nn.Module()
        layer.w13_weight = torch.nn.Parameter(torch.zeros(
            (self.experts, 2 * self.input_size, self.output_size),
            dtype=torch.int8),
                                              requires_grad=False)
        layer.w2_weight = torch.nn.Parameter(torch.zeros(
            (self.experts, self.output_size, self.input_size),
            dtype=torch.int8),
                                             requires_grad=False)
        layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
            (self.experts, 2 * self.input_size, 1), dtype=torch.bfloat16),
                                                    requires_grad=False)
        layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones(
            (self.experts, 2 * self.input_size,
             self.output_size // self.group_size),
            dtype=torch.bfloat16),
                                                           requires_grad=False)
        layer.w2_weight_scale = torch.nn.Parameter(torch.ones(
            (self.experts, self.output_size, 1), dtype=torch.bfloat16),
                                                   requires_grad=False)
        layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones(
            (self.experts, self.output_size,
             self.input_size // self.group_size),
            dtype=torch.bfloat16),
                                                          requires_grad=False)
        # Keep an untouched copy so the new-version path below starts from a
        # pristine layer.
        new_layer = copy.deepcopy(layer)

        mock_npu.return_value = torch.Tensor()
        mock_npu_quantize.return_value = torch.Tensor()
        self.quant_method.process_weights_after_loading(layer)
        self.assertTrue(hasattr(layer, "w13_scale_bias"))
        self.assertEqual(layer.w13_scale_bias.data.shape,
                         (self.experts, 2 * self.input_size))
        self.assertEqual(layer.w13_scale_bias.data.dtype, torch.float32)
        self.assertTrue(hasattr(layer, "w2_scale_bias"))
        self.assertEqual(layer.w2_scale_bias.data.shape,
                         (self.experts, self.output_size))
        self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32)
        # New quant version: weights arrive already packed (halved dims) and
        # scale-bias tensors are preloaded; processing should flatten them to
        # (experts, channels).
        self.quant_method.new_quant_version = True
        new_layer.w13_weight.data = torch.zeros(
            (self.experts, self.input_size, self.output_size),
            dtype=torch.int8)
        new_layer.w2_weight.data = torch.zeros(
            (self.experts, self.output_size // 2, self.input_size),
            dtype=torch.int8)
        w13_scale_bias = torch.zeros((self.experts, 2 * self.input_size, 1),
                                     dtype=torch.float32)
        new_layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias,
                                                      requires_grad=False)
        w2_scale_bias = torch.zeros(
            (self.experts, self.output_size, 16 // self.quant_method.tp_size),
            dtype=torch.float32)
        new_layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias,
                                                     requires_grad=False)
        self.quant_method.process_weights_after_loading(new_layer)
        self.assertEqual(new_layer.w13_scale_bias.data.shape,
                         (self.experts, 2 * self.input_size))
        self.assertEqual(new_layer.w2_scale_bias.data.shape,
                         (self.experts, self.output_size))