Commit c621f0c

BugFix: qwen model sequence parallel can not get batch size (#11147)
* feat(model): add support for Qwen model in RL PipelineParallel
* feat(model): add support for QwenMoe model in RL PipelineParallel
* feat(model): add support for QwenMoe model in RL PipelineParallel
* BugFix: qwen model sequence parallel can not get batch size
1 parent 3f7eafc commit c621f0c

4 files changed: +18 -1 lines changed


paddlenlp/transformers/qwen2/modeling.py

Lines changed: 6 additions & 1 deletion
@@ -1490,7 +1490,12 @@ def forward(self, hidden_states, tensor_parallel_output=None, batch_size=None):

        if self.config.sequence_parallel:
            hidden_states = GatherOp.apply(hidden_states)
-           hidden_states = paddle.reshape_(hidden_states, [batch_size, -1, self.config.hidden_size])
+           if batch_size is not None:
+               hidden_states = paddle.reshape_(hidden_states, [batch_size, -1, self.config.hidden_size])
+           else:
+               hidden_states = paddle.reshape_(
+                   hidden_states, [-1, self.config.max_sequence_length, self.config.hidden_size]
+               )

        if tensor_parallel_output is None:
            tensor_parallel_output = self.config.tensor_parallel_output
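Why the reshape needs batch_size: with sequence_parallel enabled, GatherOp hands back hidden_states flattened to [batch_size * seq_len, hidden_size], so the batch dimension cannot be inferred from the tensor alone. The sketch below mirrors the patched branching with hypothetical sizes; the helper name reshape_for_lm_head and the literal dimensions are illustrative, not part of this commit.

import paddle

batch_size, seq_len, hidden_size = 2, 8, 16   # hypothetical sizes
max_sequence_length = 8                       # stand-in for config.max_sequence_length

# After the sequence-parallel all-gather the activations arrive flattened.
flat = paddle.randn([batch_size * seq_len, hidden_size])

def reshape_for_lm_head(hidden_states, batch_size=None):
    # Mirror of the patched branch: use the batch_size threaded down from the
    # caller when available, otherwise fall back to max_sequence_length.
    if batch_size is not None:
        return paddle.reshape_(hidden_states, [batch_size, -1, hidden_size])
    return paddle.reshape_(hidden_states, [-1, max_sequence_length, hidden_size])

print(reshape_for_lm_head(flat, batch_size=batch_size).shape)                        # [2, 8, 16]
print(reshape_for_lm_head(paddle.randn([batch_size * seq_len, hidden_size])).shape)  # [2, 8, 16]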

paddlenlp/transformers/qwen2/modeling_pp.py

Lines changed: 4 additions & 0 deletions
@@ -172,6 +172,7 @@ def forward(self, args):
        elif attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.dtype == paddle.int64:
            attn_mask_startend_row_indices, position_ids = None, attn_mask_startend_row_indices

+       batch_size = position_ids.shape[0]
        if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
            recompute_fn = rr_recompute if any(self.skip_recompute_ops.values()) else recompute
            if attention_mask is not None or attn_mask_startend_row_indices is not None:

@@ -182,6 +183,7 @@ def forward(self, args):
                    attention_mask=attention_mask,
                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                    use_reentrant=False,
+                   batch_size=batch_size,
                )
            else:
                # for pretrain

@@ -191,13 +193,15 @@ def forward(self, args):
                    position_ids=position_ids,
                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                    use_reentrant=self.config.recompute_use_reentrant,
+                   batch_size=batch_size,
                )
        else:
            hidden_states = super().forward(
                hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+               batch_size=batch_size,
            )

        return return_args(hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids)
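The modeling_pp change works because position_ids still arrives with shape [batch_size, seq_len] even when hidden_states has been flattened for sequence parallelism, so the pipeline layer can read the batch size there and thread it down to the reshape above. A minimal sketch of that plumbing, assuming a generic layer_fn callable and toy tensors (all names here are illustrative, not part of the PR):

import paddle

def call_decoder_layer(layer_fn, hidden_states, position_ids, **kwargs):
    # position_ids keeps its leading batch dimension, unlike the flattened
    # sequence-parallel hidden_states, so it is a reliable source of batch_size.
    batch_size = position_ids.shape[0]
    return layer_fn(hidden_states, position_ids=position_ids, batch_size=batch_size, **kwargs)

# Toy layer that just reports what it received.
def toy_layer(hidden_states, position_ids=None, batch_size=None):
    return hidden_states.shape, batch_size

hidden_states = paddle.randn([2 * 8, 32])       # flattened, as under sequence parallel
position_ids = paddle.arange(8).tile([2, 1])    # shape [2, 8]
print(call_decoder_layer(toy_layer, hidden_states, position_ids))  # ([16, 32], 2)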

paddlenlp/transformers/qwen3/modeling_pp.py

Lines changed: 4 additions & 0 deletions
@@ -172,6 +172,7 @@ def forward(self, args):
        elif attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.dtype == paddle.int64:
            attn_mask_startend_row_indices, position_ids = None, attn_mask_startend_row_indices

+       batch_size = position_ids.shape[0]
        if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
            recompute_fn = rr_recompute if any(self.skip_recompute_ops.values()) else recompute
            if attention_mask is not None or attn_mask_startend_row_indices is not None:

@@ -182,6 +183,7 @@ def forward(self, args):
                    attention_mask=attention_mask,
                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                    use_reentrant=False,
+                   batch_size=batch_size,
                )
            else:
                # for pretrain

@@ -191,13 +193,15 @@ def forward(self, args):
                    position_ids=position_ids,
                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                    use_reentrant=self.config.recompute_use_reentrant,
+                   batch_size=batch_size,
                )
        else:
            hidden_states = super().forward(
                hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+               batch_size=batch_size,
            )

        return return_args(hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids)

paddlenlp/transformers/qwen3_moe/modeling_pp.py

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,7 @@ def forward(self, args):
        elif attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.dtype == paddle.int64:
            attn_mask_startend_row_indices, position_ids = None, attn_mask_startend_row_indices

+       batch_size = position_ids.shape[0]
        if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
            if attention_mask is not None or attn_mask_startend_row_indices is not None:
                hidden_states = recompute(

@@ -77,6 +78,7 @@ def forward(self, args):
                    attention_mask=attention_mask,
                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                    use_reentrant=False,
+                   batch_size=batch_size,
                )
            else:
                # for pretrain

@@ -86,13 +88,15 @@ def forward(self, args):
                    position_ids=position_ids,
                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                    use_reentrant=self.config.recompute_use_reentrant,
+                   batch_size=batch_size,
                )
        else:
            hidden_states = super().forward(
                hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+               batch_size=batch_size,
            )

        return return_args(hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids)
