Engine V1: support pipeline parallelism

weiguihua2 · weiguihua2 · commit 35c7f37da40f · 2025-07-10T18:53:20.000+08:00
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -1404,9 +1404,6 @@ def execute_model(
                  scheduler_output, intermediate_tensors))
 
         with ProfileExecuteDuration().capture_async("post process"):
-            if self.input_batch.pooling_params:
-                return self._pool(hidden_states, num_scheduled_tokens,
-                                  num_scheduled_tokens_np)
             # Broadcast PP output for external_launcher (torchrun)
             # to make sure we are synced across pp ranks
             # TODO: Support overlapping mirco-batches
@@ -1423,6 +1420,9 @@ def execute_model(
                     hidden_states.tensors, all_gather_group=get_tp_group())
                 logits = None
             else:
+                if self.input_batch.pooling_params:
+                    return self._pool(hidden_states, num_scheduled_tokens,
+                                      num_scheduled_tokens_np)
                 sample_hidden_states = hidden_states[logits_indices]
                 logits = self.model.compute_logits(sample_hidden_states, None)
             if broadcast_pp_output: