Skip to content

Commit 3166dde

Browse files
author
weijinqian_v1
committed
[Refactor] Integrate the sequence-parallelism features of the MoE and Dense models into a single solution.
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
1 parent e18e620 commit 3166dde

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

vllm_ascend/ops/linear_op.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -91,8 +91,6 @@ def apply_impl(self, input_):
91 91
# Replace layer.forward to customize the layer computation process.
92 92
def apply(self, input_):
93 93
output, output_bias = self.apply_impl(input_)
94-
if dense_optim_enable():
95-
torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix)
96 94
if not self.return_bias:
97 95
return output
98 96
return output, output_bias
@@ -123,6 +121,14 @@ def update_attrs(self):
123 121
self.reduce_results = self.layer.reduce_results
124 122
self.input_size_per_partition = self.layer.input_size_per_partition
125 123

124+
def apply(self, input_):
125+
output, output_bias = self.apply_impl(input_)
126+
if dense_optim_enable():
127+
torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix)
128+
if not self.return_bias:
129+
return output
130+
return output, output_bias
131+
126 132

127 133
class MLPColumnParallelOp(CustomColumnParallelOp):
128 134

0 commit comments

Comments
 (0)