From c3f859da571918d69200648144dcf2431505b99a Mon Sep 17 00:00:00 2001 From: hjh Date: Mon, 7 Apr 2025 14:30:44 +0800 Subject: [PATCH 01/68] liger grpo loss --- swift/llm/argument/rlhf_args.py | 9 ++- swift/trainers/arguments.py | 2 + swift/trainers/rlhf_trainer/grpo_trainer.py | 76 ++++++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index d9a8acb7b5..2836263643 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -5,7 +5,7 @@ from swift.llm import MODEL_MAPPING from swift.trainers.arguments import GRPOArgumentsMixin -from swift.utils import get_logger, set_default_ddp_config +from swift.utils import get_logger, is_liger_available, set_default_ddp_config from .train_args import TrainArguments logger = get_logger() @@ -235,3 +235,10 @@ def _check_grpo(self): if self.mini_batch_size: assert self.per_device_train_batch_size % self.mini_batch_size == 0,\ 'per_device_train_batch_size needs be divisible by mini_batch_size' + + if self.use_liger_loss: + assert self.mini_batch_size is None, 'liger loss is not compatible with mini batch currently' + try: + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + except ImportError: + raise ImportError('liger_kernel is not available. Run `pip install liger-kernel>=0.5.6`.') diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index 5b81f64ba9..84771a439a 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -172,6 +172,8 @@ class GRPOArgumentsMixin: # mini-batch mini_batch_size: Optional[int] = None + use_liger_loss: bool = False + @dataclass class TrainingArguments(SwiftArgumentsMixin, HfTrainingArguments): diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 430fa3a348..44b98bb29b 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -29,7 +29,8 @@ from swift.plugin import orms from swift.plugin.multi_turn import multi_turns from swift.utils import (JsonlWriter, gc_collect, get_device, get_device_count, get_dist_setting, get_logger, - get_node_setting, is_lmdeploy_available, is_vllm_available, is_wandb_available) + get_node_setting, is_liger_available, is_lmdeploy_available, is_vllm_available, + is_wandb_available) from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin from .utils import patch_lora_merge, patch_lora_unmerge, round_robin @@ -39,6 +40,9 @@ except ImportError: raise ImportError('Please install trl from source using: `pip install -U trl`') +if is_liger_available(): + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + del HFGRPOTrainer.__init__ logger = get_logger() @@ -176,6 +180,21 @@ def __init__(self, super().__init__(model, ref_model, *_args, **kwargs) + if self.use_liger_loss: + if not is_liger_available(): + raise ImportError( + 'Liger is required to use `liger_loss` as the GRPO loss. 
Run `pip install liger-kernel`.') + if is_peft_model(model): + raise ValueError('Liger loss is not supported with a PEFT model.') + + self.liger_grpo_loss = LigerFusedLinearGRPOLoss( + beta=self.beta, + epsilon_low=self.epsilon_low, + epsilon_high=self.epsilon_high, + temperature=self.temperature, + use_ref_model=self.ref_model is not None, + ) + num_processes = self.accelerator.num_processes global_batch_size = args.per_device_train_batch_size * num_processes possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0] @@ -920,6 +939,9 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N if isinstance(inputs, list): assert len(inputs) == 1 inputs = inputs[0] + if self.use_liger_loss: + return self.compute_liger_loss(model, inputs) + completion_mask = inputs['completion_mask'] per_token_logps = self._get_per_token_logps(model, inputs) @@ -986,6 +1008,58 @@ def _get_per_token_logps(self, model, inputs): input_ids = input_ids[:, -logits_to_keep:] return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens + @profiling_decorator + def _get_last_hidden_state(self, model, inputs): + # unwrap the model to access the model.model + logits_to_keep = inputs['logits_to_keep'] + unwrapped_model = self.accelerator.unwrap_model(model) + if not unwrapped_model.model_meta.is_multimodal: + last_hidden_state = unwrapped_model.model( + input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).last_hidden_state + else: + inputs = { + k: v + for k, v in inputs.items() if k not in + ['logits_to_keep', 'completion_mask', 'ref_per_token_logps', 'advantages', 'old_per_token_logps'] + } + with self._template_context(self.template): + last_hidden_state = unwrapped_model.model(**inputs).last_hidden_state + last_hidden_state = last_hidden_state[:, :-1, :] # (B, L-1, H) + if logits_to_keep is not None: + last_hidden_state = last_hidden_state[:, -logits_to_keep:, :] # (B, logits_to_keep, H) + return last_hidden_state + + def compute_liger_loss(self, model, inputs): + # Compute the per-token log probabilities for the model + input_ids = inputs['input_ids'] + completion_ids = [input_ids[:logits_to_keep] for logits_to_keep in inputs['logits_to_keep']] + completion_mask = inputs['completion_mask'] + + # get the last hidden state of the model + last_hidden_state = self._get_last_hidden_state(model, inputs) + unwrapped_model = self.accelerator.unwrap_model(model) + # compute loss and metrics using liger grpo loss + loss, metrics = self.liger_grpo_loss( + _input=last_hidden_state, + lin_weight=unwrapped_model.lm_head.weight, + selected_token_ids=completion_ids, + attention_mask=completion_mask, + advantages=inputs['advantages'], + bias=unwrapped_model.lm_head.bias, + ref_per_token_logps=inputs['ref_per_token_logps'], + old_per_token_logps=inputs['old_per_token_logps'], + ) + # Extract metrics from the liger_grpo_loss output + # KL divergence is the first metric when beta is non-zero + mean_kl = metrics[0] if self.beta != 0.0 else None + clip_ratio = metrics[-1] + + mode = 'eval' if self.control.should_evaluate else 'train' + if self.beta != 0.0: + self._metrics[mode]['kl'].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + self._metrics[mode]['clip_ratio'].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item()) + return loss + def evaluation_loop(self, dataloader, *args, **kwargs): # set mini_batch_size None in evaluation mini_batch_size = self.args.mini_batch_size From 
bbce4b231078df84388668c331228c4619d78c33 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 18:03:18 +0800 Subject: [PATCH 02/68] update --- swift/trainers/arguments.py | 6 ++---- swift/trainers/rlhf_trainer/grpo_trainer.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index c57e2f1304..6a3c2fee5d 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -178,9 +178,6 @@ class GRPOArgumentsMixin: # mini-batch mini_batch_size: Optional[int] = None -<<<<<<< HEAD - use_liger_loss: bool = False -======= # DAPO, https://arxiv.org/abs/2503.14476 dynamic_sample: bool = False max_resample_times: int = 3 @@ -193,7 +190,8 @@ class GRPOArgumentsMixin: # compatible with trl main branch(0.17.0.dev0) wandb_log_unique_prompts: Optional[bool] = None ->>>>>>> origin + + use_liger_loss: bool = False @dataclass diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 81f59c0e5f..699656f455 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -46,8 +46,6 @@ from trl.extras.profiling import profiling_decorator except ImportError: raise ImportError('Please install trl: `pip install -U trl`') -if is_liger_available(): - from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss del HFGRPOTrainer.__init__ del HFGRPOTrainer.log @@ -188,7 +186,21 @@ def __init__(self, self.group = group super().__init__(model, ref_model, *_args, **kwargs) - + if self.use_liger_loss: + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + if not is_liger_available(): + raise ImportError( + 'Liger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`.') + if is_peft_model(model): + raise ValueError('Liger loss is not supported with a PEFT model.') + + self.liger_grpo_loss = LigerFusedLinearGRPOLoss( + beta=self.beta, + epsilon_low=self.epsilon_low, + epsilon_high=self.epsilon_high, + temperature=self.temperature, + use_ref_model=self.ref_model is not None, + ) self._metrics = {'train': defaultdict(list), 'eval': defaultdict(list)} self.log_completions = args.log_completions self.jsonl_writer = JsonlWriter(os.path.join(self.args.output_dir, 'completions.jsonl')) From 63fdcea309cc403435b5fbafa8b9c90a04380543 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 19:14:50 +0800 Subject: [PATCH 03/68] fix --- swift/llm/argument/rlhf_args.py | 2 +- swift/trainers/rlhf_trainer/grpo_trainer.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 1522551341..d427bd3312 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -247,4 +247,4 @@ def _check_grpo(self): try: from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss except ImportError: - raise ImportError('liger_kernel is not available. Run `pip install liger-kernel>=0.5.6`.') + raise ImportError('liger_kernel is not available. 
Run `pip install -U liger-kernel`.') diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 699656f455..836e952b99 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -186,6 +186,7 @@ def __init__(self, self.group = group super().__init__(model, ref_model, *_args, **kwargs) + self.use_liger_loss = self.args.use_liger_loss if self.use_liger_loss: from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss if not is_liger_available(): From 59159012b7e76ddf46ff0019e92033376dbb4188 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 19:23:19 +0800 Subject: [PATCH 04/68] move args --- swift/trainers/rlhf_trainer/grpo_trainer.py | 48 +++++++++++---------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 836e952b99..ffc9808b7a 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -171,22 +171,13 @@ def __init__(self, self.num_generations = args.num_generations self.temperature = args.temperature - model.warnings_issued['estimate_tokens'] = True - kwargs['data_collator'] = lambda features: features - - use_vllm = args.use_vllm - use_lmdeploy = args.use_lmdeploy - - if self.args.tensor_parallel_size > 1 and self.multi_turn_func: - import torch.distributed as dist - rank, _, _, _ = get_dist_setting() - for tp_group in self.tp_group_ranks(): - group = dist.new_group(tp_group) - if rank in tp_group: - self.group = group - - super().__init__(model, ref_model, *_args, **kwargs) + self.use_vllm = args.use_vllm + self.use_lmdeploy = args.use_lmdeploy + self.epsilon_low = args.epsilon + self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon self.use_liger_loss = self.args.use_liger_loss + self.log_completions = args.log_completions + if self.use_liger_loss: from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss if not is_liger_available(): @@ -202,8 +193,21 @@ def __init__(self, temperature=self.temperature, use_ref_model=self.ref_model is not None, ) + + if self.args.tensor_parallel_size > 1 and self.multi_turn_func: + import torch.distributed as dist + rank, _, _, _ = get_dist_setting() + for tp_group in self.tp_group_ranks(): + group = dist.new_group(tp_group) + if rank in tp_group: + self.group = group + + model.warnings_issued['estimate_tokens'] = True + kwargs['data_collator'] = lambda features: features + + super().__init__(model, ref_model, *_args, **kwargs) + self._metrics = {'train': defaultdict(list), 'eval': defaultdict(list)} - self.log_completions = args.log_completions self.jsonl_writer = JsonlWriter(os.path.join(self.args.output_dir, 'completions.jsonl')) # maxlen is set to the total number of forward passes per step. This value of `maxlen` ensures we log only the # final optimization step. @@ -237,7 +241,7 @@ def __init__(self, set_seed(args.seed, device_specific=True) self.parameter_groups, self.parameter_groups_no_lora = self.split_batches() self.infer_device = None - self.use_fast_infer = use_vllm or use_lmdeploy # whether to use the PT backend + self.use_fast_infer = self.use_vllm or self.use_lmdeploy # whether to use the PT backend if self.use_fast_infer: if self.infer_rank >= 0: fast_infer_device = self.args.vllm_device or self.args.lmdeploy_device @@ -260,13 +264,13 @@ def __init__(self, 'reducing it by one is sufficient. 
' f'In your case: `--num_processes {get_device_count() - 1}`.') - if use_vllm: + if self.use_vllm: if not is_vllm_available(): raise ImportError('vLLM is not available and `use_vllm` is set to True. ' 'Please install vLLM with `pip install vllm -U` to use it.') self.prepare_vllm(model, fast_infer_device) self.infer_device = fast_infer_device[self.local_infer_rank] - elif use_lmdeploy: + elif self.use_lmdeploy: if not is_lmdeploy_available(): raise ImportError('LMDeploy is not available and `use_lmdeploy` is set to True.' 'Please install LMDeploy with `pip install lmdeploy -U` to use it.') @@ -321,8 +325,6 @@ def __init__(self, # Multi-step self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper - self.epsilon_low = args.epsilon - self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle. # noqa self._step = 0 @@ -562,7 +564,7 @@ def _move_model_to_vllm_lmdeploy(self): if self.infer_rank >= 0: if self.args.async_generate: self._wait_queue() - if self.args.use_vllm: + if self.use_vllm: llm_model = self.engine.inner_model else: llm_model = self.engine.engine.engine @@ -575,7 +577,7 @@ def _move_model_to_vllm_lmdeploy(self): with patch_lora_unmerge(unwrapped_model): unwrapped_model.unmerge_adapter() - if self.infer_rank >= 0 and self.args.use_vllm and self.args.vllm_enable_prefix_caching: + if self.infer_rank >= 0 and self.use_vllm and self.args.vllm_enable_prefix_caching: self.engine.engine.reset_prefix_cache() def _wait_queue(self): From d0c290cf7ff3ef29c927020e9a04bd521137cff7 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 19:31:22 +0800 Subject: [PATCH 05/68] fix --- swift/trainers/rlhf_trainer/grpo_trainer.py | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index ffc9808b7a..0f47099404 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -175,9 +175,22 @@ def __init__(self, self.use_lmdeploy = args.use_lmdeploy self.epsilon_low = args.epsilon self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon - self.use_liger_loss = self.args.use_liger_loss self.log_completions = args.log_completions + if self.args.tensor_parallel_size > 1 and self.multi_turn_func: + import torch.distributed as dist + rank, _, _, _ = get_dist_setting() + for tp_group in self.tp_group_ranks(): + group = dist.new_group(tp_group) + if rank in tp_group: + self.group = group + + model.warnings_issued['estimate_tokens'] = True + kwargs['data_collator'] = lambda features: features + + super().__init__(model, ref_model, *_args, **kwargs) + + self.use_liger_loss = self.args.use_liger_loss if self.use_liger_loss: from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss if not is_liger_available(): @@ -194,19 +207,6 @@ def __init__(self, use_ref_model=self.ref_model is not None, ) - if self.args.tensor_parallel_size > 1 and self.multi_turn_func: - import torch.distributed as dist - rank, _, _, _ = get_dist_setting() - for tp_group in self.tp_group_ranks(): - group = dist.new_group(tp_group) - if rank in tp_group: - self.group = group - - model.warnings_issued['estimate_tokens'] = True - kwargs['data_collator'] = lambda features: features - - super().__init__(model, ref_model, *_args, **kwargs) - self._metrics = {'train': 
defaultdict(list), 'eval': defaultdict(list)} self.jsonl_writer = JsonlWriter(os.path.join(self.args.output_dir, 'completions.jsonl')) # maxlen is set to the total number of forward passes per step. This value of `maxlen` ensures we log only the From 0a3794f5f35a88ef6c573e013a2ed1f84f397998 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 20:03:57 +0800 Subject: [PATCH 06/68] fix --- swift/trainers/rlhf_trainer/grpo_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 0f47099404..ba3b86a031 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -1138,9 +1138,8 @@ def _get_per_token_logps(self, model, inputs): return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens @profiling_decorator - def _get_last_hidden_state(self, model, inputs): + def _get_last_hidden_state(self, model, inputs, logits_to_keep): # unwrap the model to access the model.model - logits_to_keep = inputs['logits_to_keep'] unwrapped_model = self.accelerator.unwrap_model(model) if not unwrapped_model.model_meta.is_multimodal: last_hidden_state = unwrapped_model.model( @@ -1161,11 +1160,12 @@ def _get_last_hidden_state(self, model, inputs): def compute_liger_loss(self, model, inputs): # Compute the per-token log probabilities for the model input_ids = inputs['input_ids'] - completion_ids = [input_ids[:logits_to_keep] for logits_to_keep in inputs['logits_to_keep']] + logits_to_keep = inputs['logits_to_keep'] + completion_ids = input_ids[:, :logits_to_keep] completion_mask = inputs['completion_mask'] # get the last hidden state of the model - last_hidden_state = self._get_last_hidden_state(model, inputs) + last_hidden_state = self._get_last_hidden_state(model, inputs, logits_to_keep) unwrapped_model = self.accelerator.unwrap_model(model) # compute loss and metrics using liger grpo loss loss, metrics = self.liger_grpo_loss( From 3b9ee6d45fe6914b769cb294363b1d0f61d3e4b6 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 20:19:41 +0800 Subject: [PATCH 07/68] fix --- swift/trainers/rlhf_trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index ba3b86a031..c4b7b89464 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -1161,7 +1161,7 @@ def compute_liger_loss(self, model, inputs): # Compute the per-token log probabilities for the model input_ids = inputs['input_ids'] logits_to_keep = inputs['logits_to_keep'] - completion_ids = input_ids[:, :logits_to_keep] + completion_ids = input_ids[:, -logits_to_keep:] completion_mask = inputs['completion_mask'] # get the last hidden state of the model From d643ab981ada997525bc59f759771a0a67602e80 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 14 Apr 2025 20:56:04 +0800 Subject: [PATCH 08/68] fix --- swift/trainers/rlhf_trainer/grpo_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index c4b7b89464..e9fc7f3a9e 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -1147,11 +1147,13 @@ def _get_last_hidden_state(self, model, inputs, logits_to_keep): else: inputs = { k: v - for k, v in inputs.items() if 
k not in - ['logits_to_keep', 'completion_mask', 'ref_per_token_logps', 'advantages', 'old_per_token_logps'] + for k, v in inputs.items() if k not in [ + 'logits_to_keep', 'completion_mask', 'ref_per_token_logps', 'advantages', 'old_per_token_logps', + 'truncated_mask' + ] } with self._template_context(self.template): - last_hidden_state = unwrapped_model.model(**inputs).last_hidden_state + last_hidden_state = model(**inputs).last_hidden_state last_hidden_state = last_hidden_state[:, :-1, :] # (B, L-1, H) if logits_to_keep is not None: last_hidden_state = last_hidden_state[:, -logits_to_keep:, :] # (B, logits_to_keep, H) From 93fdb7113e1081c35dbf2dbaf350dd97526a99c6 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 15 Apr 2025 14:55:15 +0800 Subject: [PATCH 09/68] require --- requirements/install_all.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/install_all.sh b/requirements/install_all.sh index 07501035a4..5efa1ecf62 100644 --- a/requirements/install_all.sh +++ b/requirements/install_all.sh @@ -9,4 +9,6 @@ pip install timm -U pip install deepspeed -U pip install qwen_vl_utils qwen_omni_utils decord librosa pyav icecream soundfile -U pip install liger_kernel nvitop pre-commit -U +pip install wandb +pip install math_verify==0.5.2 # flash-attn: https://github.com/Dao-AILab/flash-attention/releases From f87b0423162c4dc56a20d791848b5ba432079350 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 15 Apr 2025 16:58:37 +0800 Subject: [PATCH 10/68] compatible with zero3 --- swift/llm/template/base.py | 3 +- swift/trainers/rlhf_trainer/__init__.py | 2 +- swift/trainers/rlhf_trainer/grpo_trainer.py | 18 +++++---- swift/trainers/rlhf_trainer/utils.py | 41 +++++++++++++++++++++ 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index dc041083b2..45ae7b04b0 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -1047,7 +1047,8 @@ def pre_forward_hook(self, model: nn.Module, args, kwargs): old_kwargs = to_device(kwargs, model.device) kwargs = to_device(self._post_encode(model, old_kwargs), model.device) for k, v in old_kwargs.items(): - if k in {'input_ids', 'attention_mask', 'labels', 'position_ids'} and k not in kwargs: + if k in {'input_ids', 'attention_mask', 'labels', 'position_ids', 'output_hidden_states' + } and k not in kwargs: kwargs[k] = v if 'inputs_embeds' in kwargs: kwargs.pop('input_ids', None) diff --git a/swift/trainers/rlhf_trainer/__init__.py b/swift/trainers/rlhf_trainer/__init__.py index 3b6d6a7fa3..24e8f9d08c 100644 --- a/swift/trainers/rlhf_trainer/__init__.py +++ b/swift/trainers/rlhf_trainer/__init__.py @@ -12,7 +12,7 @@ from .ppo_trainer import PPOTrainer from .reward_trainer import RewardTrainer from .rlhf_mixin import RLHFTrainerMixin - from .utils import _split_into_mini_batches, patch_lora_merge, patch_lora_unmerge, round_robin + from .utils import _split_into_mini_batches, patch_lora_merge, patch_lora_unmerge, round_robin, _ForwardRedirection else: _import_structure = { 'cpo_trainer': ['CPOTrainer'], diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index e9fc7f3a9e..66603f9b1f 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -40,7 +40,7 @@ is_wandb_available) from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin -from .utils import _split_into_mini_batches, patch_lora_merge, patch_lora_unmerge, round_robin +from .utils import 
_ForwardRedirection, _split_into_mini_batches, patch_lora_merge, patch_lora_unmerge, round_robin try: from trl.extras.profiling import profiling_decorator @@ -206,6 +206,7 @@ def __init__(self, temperature=self.temperature, use_ref_model=self.ref_model is not None, ) + self._forward_redirection = _ForwardRedirection() self._metrics = {'train': defaultdict(list), 'eval': defaultdict(list)} self.jsonl_writer = JsonlWriter(os.path.join(self.args.output_dir, 'completions.jsonl')) @@ -1052,7 +1053,8 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N assert len(inputs) == 1 inputs = inputs[0] if self.use_liger_loss: - return self.compute_liger_loss(model, inputs) + unwrapped_model = self.accelerator.unwrap_model(model) + return self.compute_liger_loss(unwrapped_model, inputs) completion_mask = inputs['completion_mask'] truncated_mask = inputs['truncated_mask'] @@ -1138,9 +1140,8 @@ def _get_per_token_logps(self, model, inputs): return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens @profiling_decorator - def _get_last_hidden_state(self, model, inputs, logits_to_keep): + def _get_last_hidden_state(self, unwrapped_model, inputs, logits_to_keep): # unwrap the model to access the model.model - unwrapped_model = self.accelerator.unwrap_model(model) if not unwrapped_model.model_meta.is_multimodal: last_hidden_state = unwrapped_model.model( input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).last_hidden_state @@ -1153,13 +1154,15 @@ def _get_last_hidden_state(self, model, inputs, logits_to_keep): ] } with self._template_context(self.template): - last_hidden_state = model(**inputs).last_hidden_state + outputs = unwrapped_model(**inputs, output_hidden_states=True) + last_hidden_state = outputs.hidden_states[-1] + last_hidden_state = last_hidden_state[:, :-1, :] # (B, L-1, H) if logits_to_keep is not None: last_hidden_state = last_hidden_state[:, -logits_to_keep:, :] # (B, logits_to_keep, H) return last_hidden_state - def compute_liger_loss(self, model, inputs): + def compute_liger_loss(self, unwrapped_model, inputs): # Compute the per-token log probabilities for the model input_ids = inputs['input_ids'] logits_to_keep = inputs['logits_to_keep'] @@ -1167,8 +1170,7 @@ def compute_liger_loss(self, model, inputs): completion_mask = inputs['completion_mask'] # get the last hidden state of the model - last_hidden_state = self._get_last_hidden_state(model, inputs, logits_to_keep) - unwrapped_model = self.accelerator.unwrap_model(model) + last_hidden_state = self._get_last_hidden_state(unwrapped_model, inputs, logits_to_keep) # compute loss and metrics using liger grpo loss loss, metrics = self.liger_grpo_loss( _input=last_hidden_state, diff --git a/swift/trainers/rlhf_trainer/utils.py b/swift/trainers/rlhf_trainer/utils.py index 3caca00a31..17e26a0d1b 100644 --- a/swift/trainers/rlhf_trainer/utils.py +++ b/swift/trainers/rlhf_trainer/utils.py @@ -5,6 +5,7 @@ import torch from peft.tuners import lora from peft.tuners.lora import LoraLayer +from torch import nn def round_robin(num_reqs, num_workers): @@ -153,3 +154,43 @@ def _split_into_mini_batches(batch: List, mini_batch_size: int) -> List[List]: mini_batch = batch[i:i + mini_batch_size] mini_batches.append(mini_batch) return mini_batches + + +class _ForwardRedirection: + # Code adapted from https://github.com/huggingface/trl/pull/3260 + def __call__(self, wrapper_module: nn.Module, original_module: nn.Module, method: callable, *args: Any, + **kwargs: Any): + """Reroutes a 
method call through the `wrapper_module`'s `forward` method. + Args: + wrapper_module: The module that has `original_module` wrapped. + original_module: The module that was wrapped inside `wrapper_module`. + method_name: The name of the method that should be called on the `original_module` after inputs get + redirected through the `wrapper_module`'s `forward` method. + *args: The positional arguments to the method `method_name`. They will get passed to a patched + `forward` method instead. + **kwargs: The keyword arguments to the method `method_name`. They will get passed to a patched + `forward` method instead. + """ + original_forward = original_module.forward + + def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any: + # Unpatch ourselves immediately before calling the method `method_name` + # because itself may want to call the real `forward` + original_module.forward = original_forward # type: ignore[method-assign] + # Call the actual method e.g. `.training_step(...)` + out = method(*_args, **_kwargs) + self.on_after_inner_forward(wrapper_module, original_module) + return out + + # Patch the original_module's forward so we can redirect the arguments back to the real method + original_module.forward = wrapped_forward # type: ignore[method-assign] + + wrapper_output = wrapper_module(*args, **kwargs) + self.on_after_outer_forward(wrapper_module, original_module) + return wrapper_output + + def on_after_inner_forward(self, wrapper_module: nn.Module, original_module: nn.Module) -> None: + pass + + def on_after_outer_forward(self, wrapper_module: nn.Module, original_module: nn.Module) -> None: + pass From b82cbf4289d27d3838edbf938e72602e5761e106 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 15 Apr 2025 17:00:24 +0800 Subject: [PATCH 11/68] fix --- swift/trainers/rlhf_trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 66603f9b1f..f61e5888fc 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -1054,7 +1054,7 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N inputs = inputs[0] if self.use_liger_loss: unwrapped_model = self.accelerator.unwrap_model(model) - return self.compute_liger_loss(unwrapped_model, inputs) + return self._forward_redirection(model, unwrapped_model, self.compute_liger_loss, unwrapped_model, inputs) completion_mask = inputs['completion_mask'] truncated_mask = inputs['truncated_mask'] From fc7fabe414d13b990dc67a0b034b0cf2807dfa2a Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 1 May 2025 12:20:10 +0800 Subject: [PATCH 12/68] wip --- swift/llm/argument/rlhf_args.py | 16 ++++++++-------- swift/trainers/rlhf_trainer/grpo_trainer.py | 7 ++++++- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 8c07c90129..e3832814a8 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -211,8 +211,15 @@ def _check_rlhf(self): def _check_grpo(self): if self.rlhf_type != 'grpo': return - from packaging import version + + if self.use_liger_loss: + import liger_kernel + liger_kernel_version = version.parse(liger_kernel.__version__) + assert liger_kernel_version >= version.parse('0.5.8'), ( + 'Your current version of `liger-kernel` is outdated. 
' + 'Please update it by running: pip install -U liger-kernel') + import trl trl_version = version.parse(trl.__version__) assert trl_version >= version.parse('0.17'), ('Your current version of `trl` is outdated. ' @@ -278,10 +285,3 @@ def _external_vllm_warning(self): "Configuration conflict: 'vllm_max_model_len=%s' is ignored for external vLLM. " 'Please specify it when launching the inference service: ' '`swift deploy --max_model_len `', self.vllm_max_model_len) - - if self.use_liger_loss: - assert self.mini_batch_size is None, 'liger loss is not compatible with mini batch currently' - try: - from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss - except ImportError: - raise ImportError('liger_kernel is not available. Run `pip install -U liger-kernel`.') diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 5bf782c5a5..314ccfee06 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -166,7 +166,8 @@ def __init__(self, else: self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32) - self.num_generations = args.num_generations + self.num_generations = args.num_generations # = G in the GRPO paper + self.max_completion_length = args.max_completion_length # = |o_i| in the GRPO paper self.temperature = args.temperature self.loss_type = args.loss_type model.warnings_issued['estimate_tokens'] = True @@ -175,6 +176,8 @@ def __init__(self, use_vllm = args.use_vllm use_lmdeploy = args.use_lmdeploy + + # we initialize vllm_client in RLHFArguments._init_external_vllm (swift/llm/rlhf_args) vllm_client = kwargs.pop('vllm_client') # for external vllm self.use_vllm = args.use_vllm self.use_lmdeploy = args.use_lmdeploy @@ -209,6 +212,8 @@ def __init__(self, epsilon_high=self.epsilon_high, temperature=self.temperature, use_ref_model=self.ref_model is not None, + loss_type=self.loss_type, + max_completion_length=self.max_completion_length, ) self._forward_redirection = _ForwardRedirection() From 8f67b13b22d687e1897de3101b6b9e1df251d85a Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 1 May 2025 16:55:36 +0800 Subject: [PATCH 13/68] update liger loss --- swift/trainers/rlhf_trainer/grpo_trainer.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 314ccfee06..d1629b6165 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -1158,7 +1158,10 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N if self.use_liger_loss: unwrapped_model = self.accelerator.unwrap_model(model) return self._forward_redirection(model, unwrapped_model, self.compute_liger_loss, unwrapped_model, inputs) + else: + return self._compute_loss(model, inputs) + def _compute_loss(self, model, inputs): completion_mask = inputs['completion_mask'] truncated_mask = inputs['truncated_mask'] # apply the completion_mask to exclude loss and metrics for overlong completions @@ -1278,6 +1281,16 @@ def compute_liger_loss(self, unwrapped_model, inputs): completion_ids = input_ids[:, -logits_to_keep:] completion_mask = inputs['completion_mask'] + # Compute the KL divergence between the model and the reference model + ref_per_token_logps = None + if self.beta != 0.0: + with torch.no_grad(): + if self.ref_model is not None: + ref_per_token_logps = self._get_per_token_logps(self.ref_model, inputs) + else: + with 
self.accelerator.unwrap_model(self.model).disable_adapter(): + ref_per_token_logps = self._get_per_token_logps(self.model, inputs) + # get the last hidden state of the model last_hidden_state = self._get_last_hidden_state(unwrapped_model, inputs, logits_to_keep) # compute loss and metrics using liger grpo loss @@ -1288,8 +1301,8 @@ def compute_liger_loss(self, unwrapped_model, inputs): attention_mask=completion_mask, advantages=inputs['advantages'], bias=unwrapped_model.lm_head.bias, - ref_per_token_logps=inputs['ref_per_token_logps'], old_per_token_logps=inputs['old_per_token_logps'], + ref_per_token_logps=ref_per_token_logps, ) # Extract metrics from the liger_grpo_loss output # KL divergence is the first metric when beta is non-zero From 8b4e34653c9b4852fab0926ec2a669b093dc8025 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 1 May 2025 17:36:35 +0800 Subject: [PATCH 14/68] liger&peft --- swift/llm/argument/rlhf_args.py | 12 +++++------ swift/trainers/rlhf_trainer/grpo_trainer.py | 23 +++++++++------------ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index e3832814a8..30515763f0 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -213,18 +213,16 @@ def _check_grpo(self): return from packaging import version - if self.use_liger_loss: - import liger_kernel - liger_kernel_version = version.parse(liger_kernel.__version__) - assert liger_kernel_version >= version.parse('0.5.8'), ( - 'Your current version of `liger-kernel` is outdated. ' - 'Please update it by running: pip install -U liger-kernel') - import trl trl_version = version.parse(trl.__version__) assert trl_version >= version.parse('0.17'), ('Your current version of `trl` is outdated. ' 'Please update it by running: pip install -U trl') + if self.use_liger_loss: + from trl.import_utils import is_liger_kernel_available + assert is_liger_kernel_available(), ( + 'Please install/update liger-kernel by running: pip install -U liger-kernel') + if self.num_generations < 2: raise ValueError( 'GRPO requires at least 2 generations per prompt to calculate the advantages. You provided ' diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index d1629b6165..f781004e50 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -203,15 +203,13 @@ def __init__(self, if not is_liger_available(): raise ImportError( 'Liger is required to use `liger_loss` as the GRPO loss. 
Run `pip install liger-kernel`.') - if is_peft_model(model): - raise ValueError('Liger loss is not supported with a PEFT model.') self.liger_grpo_loss = LigerFusedLinearGRPOLoss( beta=self.beta, epsilon_low=self.epsilon_low, epsilon_high=self.epsilon_high, temperature=self.temperature, - use_ref_model=self.ref_model is not None, + use_ref_model=self.beta != 0.0, loss_type=self.loss_type, max_completion_length=self.max_completion_length, ) @@ -1086,15 +1084,6 @@ def _prepare_batch_inputs(self, inputs: InputsType, rewards: torch.Tensor) -> Li batch_encoded_inputs['old_per_token_logps'] = ( self._get_per_token_logps(self.model, batch_encoded_inputs) if self.old_policy else None) - if self.beta == 0.0: - ref_per_token_logps = None - elif self.ref_model is not None: - ref_per_token_logps = self._get_per_token_logps(self.ref_model, batch_encoded_inputs) - else: - with self.accelerator.unwrap_model(self.model).disable_adapter(): - ref_per_token_logps = self._get_per_token_logps(self.model, batch_encoded_inputs) - batch_encoded_inputs['ref_per_token_logps'] = ref_per_token_logps - ga_batch_encoded_inputs.append(batch_encoded_inputs) return ga_batch_encoded_inputs @@ -1175,7 +1164,13 @@ def _compute_loss(self, model, inputs): # Compute the KL divergence between the model and the reference model if self.beta != 0.0: - ref_per_token_logps = inputs['ref_per_token_logps'] + with torch.no_grad(): + if self.ref_model is not None: + ref_per_token_logps = self._get_per_token_logps(self.ref_model, inputs) + else: + with self.accelerator.unwrap_model(self.model).disable_adapter(): + ref_per_token_logps = self._get_per_token_logps(self.model, inputs) + per_token_kl = ( torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1) @@ -1254,6 +1249,8 @@ def _get_per_token_logps(self, model, inputs): @profiling_decorator def _get_last_hidden_state(self, unwrapped_model, inputs, logits_to_keep): # unwrap the model to access the model.model + if is_peft_model(unwrapped_model): + unwrapped_model = unwrapped_model.base_model.model if not unwrapped_model.model_meta.is_multimodal: last_hidden_state = unwrapped_model.model( input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).last_hidden_state From edc1fd14354a3991737c0c5649e428848ae91a7a Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 6 May 2025 17:41:54 +0800 Subject: [PATCH 15/68] init --- swift/llm/argument/rlhf_args.py | 25 +- swift/trainers/arguments.py | 23 +- swift/trainers/rlhf_trainer/__init__.py | 4 +- swift/trainers/rlhf_trainer/grpo_trainer.py | 508 ++++++++------------ swift/trainers/rlhf_trainer/utils.py | 27 ++ 5 files changed, 277 insertions(+), 310 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index ab5e2c7ea2..3eef44e701 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
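# --- Editorial reading aid (not part of the patch): before the vLLM refactor that begins in the
# next commit, a sketch of the liger loss call contract introduced in the patches above. The
# keyword names mirror `compute_liger_loss` in this series; the shape annotations are assumptions
# for orientation only.
#   last_hidden_state: (B, logits_to_keep, H)  -- from _get_last_hidden_state
#   completion_ids:    (B, logits_to_keep)     -- input_ids[:, -logits_to_keep:]
#   completion_mask:   (B, logits_to_keep)
#   advantages:        (B,)                    -- assumed one scalar advantage per sequence
# loss, metrics = self.liger_grpo_loss(
#     _input=last_hidden_state,
#     lin_weight=unwrapped_model.lm_head.weight,
#     selected_token_ids=completion_ids,
#     attention_mask=completion_mask,
#     advantages=advantages,
#     bias=unwrapped_model.lm_head.bias,
#     old_per_token_logps=old_per_token_logps,
#     ref_per_token_logps=ref_per_token_logps,
# )
# metrics[0] is the mean KL (only meaningful when beta != 0); metrics[-1] is the clip ratio.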
import os +import warnings from dataclasses import dataclass, field from typing import Any, Dict, List, Literal, Optional @@ -103,6 +104,7 @@ def _prepare_training_args(self, training_args: Dict[str, Any]) -> None: training_args['world_size'] = self.global_world_size def __post_init__(self): + self._deprecated_warning() self._init_grpo() self._init_rm() self._init_simpo() @@ -236,7 +238,7 @@ def _check_grpo(self): 'If you encounter an Out-of-Memory (OOM) error, it is recommended to set the `sleep_level`, ' '`offload_model`, and `offload_optimizer` parameters.') assert not self.async_generate, 'async_generate requires async mode, but you are under colocate mode' - if self.use_lmdeploy and self.tensor_parallel_size > 1: + if self.use_lmdeploy and self.vllm_tensor_parallel_size > 1: raise ValueError('Currently LMDeploy do not support tensor parallel') if self.use_vllm and self.sleep_level: logger.warning('It is highly recommended to use `sleep_level==1` in colocate mode,' @@ -252,7 +254,7 @@ def _check_grpo(self): logger.warning('You are using different GPUs for training and rollout, ' 'so you do not need to use sleep_level > 0') - assert self.tensor_parallel_size == 1, ('async mode do not support tensor parallel right now') + assert self.vllm_tensor_parallel_size == 1, ('async mode do not support tensor parallel right now') def _external_vllm_warning(self): if self.rlhf_type != 'grpo' or not self.vllm_server_host: @@ -273,3 +275,22 @@ def _external_vllm_warning(self): "Configuration conflict: 'vllm_max_model_len=%s' is ignored for external vLLM. " 'Please specify it when launching the inference service: ' '`swift deploy --max_model_len `', self.vllm_max_model_len) + + def _deprecated_warning(self): + if self.rlhf_type != 'grpo': + return + + if self.tensor_parallel_size is not None: + warnings.warn( + "The parameter 'tensor_parallel_size' has been deprecated and will be removed in version 3.6. " + "It is recommended to use 'vllm_tensor_parallel_size' instead.", DeprecationWarning) + self.vllm_tensor_parallel_size = self.tensor_parallel_size + + if self.vllm_device is not None: + warnings.warn("The parameter 'vllm_device' has been deprecated and will be removed in version 3.6. ", + DeprecationWarning) + + if self.vllm_enable_prefix_caching is not None: + warnings.warn( + "The parameter 'vllm_enable_prefix_caching' has been deprecated and will be removed in version 3.6. ", + DeprecationWarning) diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index e6b6f93dbd..cc88752972 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -145,13 +145,21 @@ class GRPOArgumentsMixin: repetition_penalty: float = 1. 
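# --- Editorial note (not part of the patch): a summary of the argument migration implied by the
# `_deprecated_warning` method above. The replacement behaviour is inferred from the surrounding
# diffs and is a sketch, not a compatibility guarantee.
#   tensor_parallel_size       -> vllm_tensor_parallel_size (the old value is copied over when set)
#   vllm_device                -> deprecated; colocate mode runs the engine on the training GPUs
#   vllm_enable_prefix_caching -> deprecated; in colocate mode the prefix cache is reset after each weight sync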
num_infer_workers: int = 1 # vllm - vllm_device: List[str] = field(default_factory=lambda: ['auto']) + vllm_mode: Literal['server', 'colocate'] + # internal vllm (colocate) + vllm_device: Optional[List[str]] = None # deprecated vllm_gpu_memory_utilization: float = 0.9 vllm_max_model_len: Optional[int] = None - vllm_max_num_seqs: int = 256 vllm_enforce_eager: bool = False vllm_limit_mm_per_prompt: Optional[Union[dict, str]] = None # '{"image": 5, "video": 2}' - vllm_enable_prefix_caching: bool = True + vllm_enable_prefix_caching: Optional[bool] = None # deprecated + vllm_tensor_parallel_size: int = 1 + # external vllm (server) + vllm_server_host: Optional[str] = None + vllm_server_port: int = 8000 + vllm_server_timeout: float = 240.0 + vllm_client = None + # reward function args, see details in swift/plugin/orm.py # cosine reward, https://arxiv.org/abs/2502.03373 cosine_min_len_value_wrong: float = -0.5 # r^w_0 in paper, Reward for wrong answers with zero completion length. @@ -170,7 +178,8 @@ class GRPOArgumentsMixin: lmdeploy_cache_max_entry_count: float = 0.8 async_generate: bool = False - tensor_parallel_size: int = 1 + tensor_parallel_size: Optional[int] = None # deprecated + sleep_level: int = 0 move_model_batches: Optional[int] = None offload_optimizer: bool = False @@ -191,12 +200,6 @@ class GRPOArgumentsMixin: # compatible with trl main branch(0.17.0.dev0) wandb_log_unique_prompts: Optional[bool] = None - # external vllm - vllm_server_host: Optional[str] = None - vllm_server_port: int = 8000 - vllm_server_timeout: float = 240.0 - vllm_client = None - # dataset dataset_shuffle: Optional[bool] = True diff --git a/swift/trainers/rlhf_trainer/__init__.py b/swift/trainers/rlhf_trainer/__init__.py index 3b6d6a7fa3..eca9ba382d 100644 --- a/swift/trainers/rlhf_trainer/__init__.py +++ b/swift/trainers/rlhf_trainer/__init__.py @@ -12,7 +12,7 @@ from .ppo_trainer import PPOTrainer from .reward_trainer import RewardTrainer from .rlhf_mixin import RLHFTrainerMixin - from .utils import _split_into_mini_batches, patch_lora_merge, patch_lora_unmerge, round_robin + from .utils import patch_lora_merge, patch_lora_unmerge, round_robin else: _import_structure = { 'cpo_trainer': ['CPOTrainer'], @@ -23,7 +23,7 @@ 'ppo_trainer': ['PPOTrainer'], 'reward_trainer': ['RewardTrainer'], 'rlhf_mixin': ['RLHFTrainerMixin'], - 'utils': ['_split_into_mini_batches', 'patch_lora_merge', 'patch_lora_unmerge', 'round_robin'], + 'utils': ['patch_lora_merge', 'patch_lora_unmerge', 'round_robin'], } import sys diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 4c03bfa74f..80d5da85ef 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -20,7 +20,7 @@ import torch import torch.nn as nn import transformers -from accelerate.utils import gather, gather_object, is_peft_model, set_seed +from accelerate.utils import broadcast_object_list, gather, gather_object, is_peft_model, set_seed from packaging import version from torch.nn import ModuleList from torch.utils.data import DataLoader @@ -41,7 +41,7 @@ get_node_setting, is_lmdeploy_available, is_vllm_available, is_wandb_available) from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin -from .utils import patch_lora_merge, patch_lora_unmerge, round_robin +from .utils import patch_lora_merge, patch_lora_unmerge, round_robin, unwrap_model_for_generation del HFGRPOTrainer.__init__ del HFGRPOTrainer.log @@ -54,33 +54,6 @@ OutputsType = 
List[List[Tuple[List[Dict], str]]] -@contextmanager -def unwrap_model_for_generation( - model, - accelerator, - gather_deepspeed3_params=True, - gather_parameters: List = None, -): - unwrapped_model = accelerator.unwrap_model(model) - if accelerator.state.deepspeed_plugin is not None and accelerator.state.deepspeed_plugin.zero_stage == 3: - if not gather_deepspeed3_params: - yield accelerator.unwrap_model(model) - else: - import deepspeed - parameters = [ - parameter for name, parameter in model.named_parameters() - if not gather_parameters or name in gather_parameters - ] - with deepspeed.zero.GatheredParameters(parameters): - from trl.models.utils import remove_hooks - remove_hooks(model) - yield accelerator.unwrap_model(model) - from trl.models.utils import add_hooks - add_hooks(model) - else: - yield unwrapped_model - - class GRPOCallback(TrainerCallback): def __init__(self, trainer): @@ -113,12 +86,15 @@ def __init__(self, from swift.trainers.rlhf_arguments import GRPOConfig args: GRPOConfig = kwargs['args'] self.args = args + # for async generate self.train_queue = Queue() self.eval_queue = Queue() + self.processing_class = kwargs.get('template').tokenizer + + # for offload model/optimizer self.offload_modules = {} self.offload_states = {} - _, _, _, local_world_size = get_dist_setting() if not isinstance(reward_funcs, list): reward_funcs = [reward_funcs] @@ -167,21 +143,17 @@ def __init__(self, self.num_generations = args.num_generations self.temperature = args.temperature + self.vllm_mode = args.vllm_mode + self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization # only applies to colocation mode + self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size # only applies to colocation mode self.loss_type = args.loss_type model.warnings_issued['estimate_tokens'] = True kwargs['data_collator'] = lambda features: features self.shuffle_dataset = args.dataset_shuffle - use_vllm = args.use_vllm - use_lmdeploy = args.use_lmdeploy + self.use_vllm = args.use_vllm + self.use_lmdeploy = args.use_lmdeploy vllm_client = kwargs.pop('vllm_client') # for external vllm - if self.args.tensor_parallel_size > 1 and self.multi_turn_func: - import torch.distributed as dist - rank, _, _, _ = get_dist_setting() - for tp_group in self.tp_group_ranks(): - group = dist.new_group(tp_group) - if rank in tp_group: - self.group = group super().__init__(model, ref_model, *_args, **kwargs) @@ -199,6 +171,7 @@ def __init__(self, 'rewards': defaultdict(lambda: deque(maxlen=maxlen)), } + # num_generation check num_processes = self.accelerator.num_processes self.effective_train_batch_size = effective_batch_size = \ args.per_device_train_batch_size * num_processes * args.gradient_accumulation_steps @@ -228,74 +201,61 @@ def __init__(self, set_seed(args.seed, device_specific=True) self.parameter_groups, self.parameter_groups_no_lora = self.split_batches() self.infer_device = None - self.use_fast_infer = use_vllm or use_lmdeploy # whether to use the PT backend - self.is_external_vllm = use_vllm and args.vllm_server_host is not None - if self.use_fast_infer: - if self.infer_rank >= 0: - fast_infer_device = self.args.vllm_device or self.args.lmdeploy_device - if fast_infer_device[0] == 'auto': - if get_device_count() == 1: - fast_infer_device = [get_device()] # particular case when training with only 1 GPU: share it - else: - fast_infer_device = [] - for idx in range(get_device_count() - self.args.num_infer_workers, get_device_count()): - fast_infer_device.append(get_device(idx)) - - for _device in 
fast_infer_device: - # Check that the requested device is available - if _device.split(':')[0] in {'cuda', 'npu'} and int(_device.split(':')[1]) >= get_device_count(): - raise ValueError(f'The requested device for vllm ({_device}) is not available. ' - f'You are likely using vLLM ' - 'without restricting the number of GPUs for training. ' - 'Set the `--num_processes` argument to a ' - 'value lower than the number of GPUs available on your machine—typically, ' - 'reducing it by one is sufficient. ' - f'In your case: `--num_processes {get_device_count() - 1}`.') - - if use_vllm: - if not is_vllm_available(): - raise ImportError('vLLM is not available and `use_vllm` is set to True. ' - 'Please install vLLM with `pip install vllm -U` to use it.') - if self.is_external_vllm: - self.vllm_client = vllm_client - else: - self.engine = self.prepare_vllm(model, fast_infer_device) - self.infer_device = fast_infer_device[self.local_infer_rank] - elif use_lmdeploy: - if not is_lmdeploy_available(): - raise ImportError('LMDeploy is not available and `use_lmdeploy` is set to True.' - 'Please install LMDeploy with `pip install lmdeploy -U` to use it.') - from swift.llm import LmdeployEngine - from swift.tuners import Swift - with Swift.grpo_context(model, self.template.processor): - fast_infer_device = int(fast_infer_device[self.local_infer_rank].split(':')[1]) - self.engine = LmdeployEngine( - model.model_dir, - model.model_info.torch_dtype, - model_type=model.model_meta.model_type, - devices=[fast_infer_device], - session_len=args.lmdeploy_session_len, - cache_max_entry_count=args.lmdeploy_cache_max_entry_count, - reload_weights=True) - self.infer_device = fast_infer_device - from lmdeploy.turbomind.turbomind import TurboMind - lmdeploy_engine = self.engine.engine.engine - assert isinstance(lmdeploy_engine, TurboMind), ( - "Currently only LMDeploy's TurboMind backend is supported. " - 'The current model is incompatible - please use vLLM or PyTorch backend instead.') - if not self.is_external_vllm: - self.engine.default_template = copy(self.template) # Avoid thread-unsafe modifications of the mode. - self._last_loaded_step = -1 # tag to avoid useless loading during grad accumulation - - # When using vLLM, the main process is responsible for loading the model weights. This can cause process - # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we - # synchronize all processes after vLLM has been fully initialized. - self.accelerator.wait_for_everyone() + self.use_fast_infer = self.use_vllm or self.use_lmdeploy # whether to use the PT backend + # self.is_external_vllm = use_vllm and args.vllm_server_host is not None + if self.use_vllm: + if not is_vllm_available(): + raise ImportError('vLLM is not available and `use_vllm` is set to True. ' + 'Please install vLLM with `pip install vllm -U` to use it.') + if self.vllm_mode == 'server': + self.vllm_client = vllm_client + elif self.vllm_mode == 'colocate': + if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0: + raise ValueError( + f'vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size ' + f'({self.accelerator.num_processes}) evenly.') + + if self.vllm_tensor_parallel_size > 1: + # Create subgroups of ranks for TP, each group with `vllm_tensor_parallel_size` ranks. 
+ # For example, if world_size=8 and vllm_tensor_parallel_size=2 → groups: [0,1], [2,3], [4,5], [6,7] + self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration([ + list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size)) + for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size) + ]) + + self.engine = self.prepare_vllm(model) + # Avoid thread-unsafe modifications of the mode. + self.engine.default_template = copy(self.template) # Avoid thread-unsafe modifications of the mode. + + elif self.use_lmdeploy: + if not is_lmdeploy_available(): + raise ImportError('LMDeploy is not available and `use_lmdeploy` is set to True.' + 'Please install LMDeploy with `pip install lmdeploy -U` to use it.') + from swift.llm import LmdeployEngine + from swift.tuners import Swift + with Swift.grpo_context(model, self.template.processor): + self.engine = LmdeployEngine( + model.model_dir, + model.model_info.torch_dtype, + model_type=model.model_meta.model_type, + session_len=args.lmdeploy_session_len, + cache_max_entry_count=args.lmdeploy_cache_max_entry_count, + reload_weights=True) + from lmdeploy.turbomind.turbomind import TurboMind + lmdeploy_engine = self.engine.engine.engine + assert isinstance( + lmdeploy_engine, + TurboMind), ("Currently only LMDeploy's TurboMind backend is supported. " + 'The current model is incompatible - please use vLLM or PyTorch backend instead.') + # Avoid thread-unsafe modifications of the mode. + self.engine.default_template = copy(self.template) # Avoid thread-unsafe modifications of the mode. else: from swift.llm import PtEngine self.engine = PtEngine.from_model_template(self.model, copy(self.template), max_batch_size=0) # 0: no limit - # Avoid thread-unsafe modifications of the mode. + + self._last_loaded_step = -1 # tag to avoid useless loading during grad accumulation self.request_config = RequestConfig( + n=1, max_tokens=args.max_completion_length, temperature=args.temperature, top_p=args.top_p, @@ -304,11 +264,12 @@ def __init__(self, stop=args.stop_words, ) - if local_world_size == self.args.num_infer_workers == get_device_count() and local_world_size > 1: - self.request_config.n = self.args.tensor_parallel_size - if self.infer_rank >= 0: - self.request_config.seed = self.infer_rank // self.args.tensor_parallel_size + # if local_world_size == self.args.num_infer_workers == get_device_count() and local_world_size > 1: + # self.request_config.n = self.vllm_tensor_parallel_size + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. 
self.model_accepts_loss_kwargs = False for i, reward_func in enumerate(self.reward_funcs): if isinstance(reward_func, PreTrainedModel) and is_deepspeed_zero3_enabled(): @@ -440,35 +401,32 @@ def split_llm(name): parameters_no_lora = [remove_lora_and_prefix(p_list) for p_list in parameters] return parameters, parameters_no_lora - def prepare_vllm(self, model, fast_infer_device): + def prepare_vllm(self, model): from swift.tuners import Swift from swift.llm import VllmEngine from swift.llm.infer.infer_engine import GRPOVllmEngine - _, _, _, local_world_size = get_dist_setting() - if self.args.tensor_parallel_size > 1: + if self.vllm_tensor_parallel_size > 1: vllm_kwargs = {'distributed_executor_backend': 'external_launcher'} else: vllm_kwargs = {} - if local_world_size == self.args.num_infer_workers == get_device_count() and local_world_size > 1: # Compatibility with TP - cls = GRPOVllmEngine - engine_kwargs = {'seed': 0} - else: - cls = VllmEngine - engine_kwargs = {} + cls = GRPOVllmEngine + engine_kwargs = {'seed': self.accelerator.process_index // self.vllm_tensor_parallel_size} + + max_num_seqs = self.args.per_device_train_batch_size \ + * self.vllm_tensor_parallel_size \ + * self.args.gradient_accumulation_steps, with Swift.grpo_context(model, self.template.processor): engine = cls( model.model_dir, model.model_info.torch_dtype, model_type=model.model_meta.model_type, - device=fast_infer_device[self.local_infer_rank], - tensor_parallel_size=self.args.tensor_parallel_size, - gpu_memory_utilization=self.args.vllm_gpu_memory_utilization, + vllm_tensor_parallel_size=self.vllm_tensor_parallel_size, + gpu_memory_utilization=self.vllm_gpu_memory_utilization, enable_prefix_caching=self.args.vllm_enable_prefix_caching, - max_num_seqs=self.args.vllm_max_num_seqs, + max_num_seqs=max_num_seqs, enforce_eager=self.args.vllm_enforce_eager, limit_mm_per_prompt=self.args.vllm_limit_mm_per_prompt, - num_infer_workers=self.args.num_infer_workers, enable_sleep_mode=self.args.sleep_level > 0, use_async_engine=False, max_model_len=self.args.vllm_max_model_len, @@ -477,49 +435,6 @@ def prepare_vllm(self, model, fast_infer_device): engine.default_template = self.template return engine - @property - def infer_rank(self): - if self.is_external_vllm: - # When using external vLLM, only the main process (rank=0) acts as the client. 
- return 0 if self.accelerator.is_main_process else -1 - rank, local_rank, world_size, local_world_size = get_dist_setting() - node_rank = get_node_setting()[0] - for _vllm_rank in range(self.args.num_infer_workers): - if local_rank == _vllm_rank: - return node_rank * self.args.num_infer_workers + _vllm_rank - if local_rank == -1: - return 0 - return -1 - - @property - def infer_rank_tp_0(self): - # whether is tp rank0, get data from this rank - # vllm needs all tp ranks inputs and sampling params are the same - rank, local_rank, world_size, local_world_size = get_dist_setting() - node_rank = get_node_setting()[0] - for _vllm_rank in range(self.args.num_infer_workers): - if local_rank == _vllm_rank and _vllm_rank % self.args.tensor_parallel_size == 0: - return (node_rank * self.args.num_infer_workers + _vllm_rank // self.args.tensor_parallel_size) - if local_rank == -1: - return 0 - return -1 - - @property - def local_infer_rank(self): - rank, local_rank, world_size, local_world_size = get_dist_setting() - for _vllm_rank in range(self.args.num_infer_workers): - if local_rank == _vllm_rank: - return _vllm_rank - - return -1 - - def tp_group_ranks(self): - rank, local_rank, world_size, local_world_size = get_dist_setting() - return [ - list(range(0, world_size))[i:i + self.args.tensor_parallel_size] - for i in range(0, world_size, self.args.tensor_parallel_size) - ] - @contextmanager def _template_context(self, template): # The max_length for prompt and completion has already been restricted, so there is no need for max_length here. @@ -540,7 +455,7 @@ def _template_context(self, template): @profiling_decorator def _move_model_to_vllm_lmdeploy(self): - if self.is_external_vllm: + if self.vllm_mode == 'server': return super()._move_model_to_vllm() from accelerate.utils.other import is_compiled_module @@ -577,7 +492,7 @@ def _move_model_to_vllm_lmdeploy(self): parameter_group_no_lora = [n.replace('base_model.model.', '') for n in parameter_group_no_lora] state_dict = {k: v for k, v in state_dict.items() if k in parameter_group_no_lora} assert len(state_dict) > 0 and all([state.shape != torch.Size([0]) for state in state_dict.values()]) - if self.infer_rank >= 0: + if self.use_fast_infer: if self.args.async_generate: self._wait_queue() if self.args.use_vllm: @@ -592,8 +507,8 @@ def _move_model_to_vllm_lmdeploy(self): if is_peft_model(unwrapped_model): with patch_lora_unmerge(unwrapped_model): unwrapped_model.unmerge_adapter() - - if self.infer_rank >= 0 and self.args.use_vllm and self.args.vllm_enable_prefix_caching: + if self.use_vllm and self.vllm_mode == 'colocate': + # since update weights, we should reset the prefix cache self.engine.engine.reset_prefix_cache() def _wait_queue(self): @@ -611,112 +526,143 @@ def reorder_outputs(outputs, distributed_idx): return [index_to_output[idx] for idx in sorted(index_to_output.keys())] - def _infer_multi_turn(self, inputs_slice: np.ndarray, request_config: RequestConfig) -> Union[OutputsType, List]: + def _get_first_turn_results(self, inputs: InputsType, request_config: RequestConfig) -> Union[OutputsType, List]: + # inputs: local inputs + from swift.llm.infer.protocol import ChatCompletionResponse + request_config = copy(request_config) + if self.vllm_mode == 'server': + # for server mode, we gather all the inputs and send to remote vllm server in main process + all_inputs = gather_object(inputs) + if self.accelerator.is_main_process: + results = List[ChatCompletionResponse] = self._engine_infer( + infer_requests=all_inputs, 
request_config=request_config, use_tqdm=False) + else: + results = [None] * len(all_inputs) + # Broadcast the results from the main process to all processes, + # ensuring each process receives its corresponding slice. + results = broadcast_object_list(results, from_process=0) + process_slice = slice( + self.accelerator.process_index * len(inputs), + (self.accelerator.process_index + 1) * len(inputs), + ) + results = results[process_slice] + else: + # pt / lmdeploy / vllm + if self.vllm_tensor_parallel_size > 1: + # Gather prompts from all ranks in the TP group and flatten. + # Each rank starts with its own prompts; after gathering, all ranks see the full group set. + orig_size = len(inputs) + gathered_inputs = [None for _ in range(self.vllm_tensor_parallel_size)] + torch.distributed.all_gather_object(gathered_inputs, inputs, group=self.tp_group) + inputs = [p for sublist in gathered_inputs for p in sublist] + + results: List[ChatCompletionResponse] = self._engine_infer( + infer_requests=inputs, request_config=request_config, use_tqdm=False) + + if self.vllm_tensor_parallel_size > 1: + # Slice completions for this rank within its TP group. + # Each rank generates all outputs — we keep only our share. + local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) + tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size) + results = results[tp_slice] + + return results + + def _infer_single_or_multi_turn(self, inputs: InputsType, + request_config: RequestConfig) -> Union[OutputsType, List]: """Perform multi-turn or single-turn inference with support for tensor parallelism. Args: - inputs_slice: Array of input requests + inputs: list of input requests request_config: Inference configuration parameters Returns: List of outputs where each entry contains: - - List of responses per prompt (length = tensor_parallel_size) + - List of responses per prompt (length = vllm_tensor_parallel_size) - Each response is a tuple of (message_history, finish_reason) """ - from swift.llm.infer.protocol import ChatCompletionResponse - rank, _, _, _ = get_dist_setting() - request_config = copy(request_config) - results: List[ChatCompletionResponse] = self._engine_infer( - infer_requests=inputs_slice, request_config=request_config, use_tqdm=False) - prompt_lens = len(inputs_slice) - messages_list = [None] * (len(inputs_slice) * self.args.tensor_parallel_size) - if self.multi_turn_func: - remove_response = True - while len(inputs_slice) > 0: - request_config.n = 1 - if self.infer_rank_tp_0 >= 0 or not self.use_fast_infer: - inputs = [] - cnt = 0 - for i, output in enumerate(results): - for choice in output.choices: - _input: Dict = deepcopy(inputs_slice[i]) - if remove_response or _input['messages'][-1]['role'] != 'assistant' or not \ - _input['messages'][-1]['content']: - InferRequest.remove_response(_input['messages']) - _input['messages'].append({'role': 'assistant', 'content': choice.message.content}) - else: - _input['messages'][-1]['content'] += choice.message.content - if 'index' not in _input: - _input['index'] = cnt - _input['finish_reason'] = choice.finish_reason - cnt += 1 - inputs.append(_input) - results: List[Dict] = self.multi_turn_func(inputs) # noqa - else: - length = sum([len(results[i].choices) for i in range(len(results))]) - results = [None] * length - - if self.args.tensor_parallel_size > 1: - # avoid duplicate calling in the same tensor parallel group - import torch.distributed as dist - if 'group_src' in 
inspect.signature(dist.broadcast_object_list).parameters: - dist.broadcast_object_list(results, group_src=0, group=self.group) - else: - global_src = dist.get_global_rank(self.group, 0) - dist.broadcast_object_list(results, src=global_src, group=self.group) - inputs_slice = [r for r in results if not r['finished']] - for idx, r in enumerate(results): - if r['finished'] or r['finish_reason'] == 'length': - messages_list[r['index']] = (r['messages'], r['finish_reason']) - if len(inputs_slice) > 0: - _input_std = [] - for _input in inputs_slice: - _input_std.append(StdTemplateInputs.from_dict(_input)) - # StdTemplateInputs will not remove responses in infer - results = self._engine_infer( - infer_requests=_input_std, request_config=request_config, use_tqdm=False) - # concat responses from the second loop - remove_response = False + results = self._get_first_turn_results(inputs, request_config) - outputs = [] - assert not any([m is None for m in messages_list]) - for i in range(0, len(messages_list), self.args.tensor_parallel_size): - # reformat to [[x, x, x, x] [x, x, x, x]] - # this is the same format of sampling_params.n > 1 - outputs.append(messages_list[i:i + self.args.tensor_parallel_size]) - assert len(outputs) == prompt_lens - assert all([len(o) == self.args.tensor_parallel_size for o in outputs]) + if self.multi_turn_func: + pass + # TODO + # we remove response in first turn + # messages_list = [None] * (len(inputs) * self.vllm_tensor_parallel_size) + # remove_response = True + # while len(inputs) > 0: + # if not self.use_fast_infer: + # inputs = [] + # cnt = 0 + # for i, output in enumerate(results): + # for choice in output.choices: + # _input: Dict = deepcopy(inputs[i]) + # if remove_response or _input['messages'][-1]['role'] != 'assistant' or not \ + # _input['messages'][-1]['content']: + # InferRequest.remove_response(_input['messages']) + # _input['messages'].append({'role': 'assistant', 'content': choice.message.content}) + # else: + # _input['messages'][-1]['content'] += choice.message.content + # if 'index' not in _input: + # _input['index'] = cnt + # _input['finish_reason'] = choice.finish_reason + # cnt += 1 + # inputs.append(_input) + # results: List[Dict] = self.multi_turn_func(inputs) # noqa + # else: + # length = sum([len(results[i].choices) for i in range(len(results))]) + # results = [None] * length + + # if self.vllm_tensor_parallel_size > 1: + # # avoid duplicate calling in the same tensor parallel group + # import torch.distributed as dist + # if 'group_src' in inspect.signature(dist.broadcast_object_list).parameters: + # dist.broadcast_object_list(results, group_src=0, group=self.group) + # else: + # global_src = dist.get_global_rank(self.group, 0) + # dist.broadcast_object_list(results, src=global_src, group=self.group) + # inputs = [r for r in results if not r['finished']] + # for idx, r in enumerate(results): + # if r['finished'] or r['finish_reason'] == 'length': + # messages_list[r['index']] = (r['messages'], r['finish_reason']) + # if len(inputs) > 0: + # _input_std = [] + # for _input in inputs: + # _input_std.append(StdTemplateInputs.from_dict(_input)) + # # StdTemplateInputs will not remove responses in infer + # results = self._engine_infer( + # infer_requests=_input_std, request_config=request_config, use_tqdm=False) + # # concat responses from the second loop + # remove_response = False + + # outputs = [] + # assert not any([m is None for m in messages_list]) + # for i in range(0, len(messages_list), self.vllm_tensor_parallel_size): + # # reformat to 
[[x, x, x, x] [x, x, x, x]] + # # this is the same format of sampling_params.n > 1 + # outputs.append(messages_list[i:i + self.vllm_tensor_parallel_size]) + # assert len(outputs) == prompt_lens + # assert all([len(o) == self.vllm_tensor_parallel_size for o in outputs]) else: # single turn outputs = [] for i, output in enumerate(results): _choices = [] for choice in output.choices: - _input: Dict = deepcopy(inputs_slice[i]) + _input: Dict = deepcopy(inputs[i]) InferRequest.remove_response(_input['messages']) _input['messages'].append({'role': 'assistant', 'content': choice.message.content}) _choices.append((_input['messages'], choice.finish_reason)) outputs.append(_choices) - assert len(outputs) == prompt_lens - assert all([len(o) == self.args.tensor_parallel_size for o in outputs]) - - if self.args.tensor_parallel_size > 1: - if self.infer_rank_tp_0 < 0: - outputs = [] - else: - _outputs = [] - for tp_idx in range(self.args.tensor_parallel_size): - for prompt_idx in range(len(outputs)): - _outputs.append(outputs[prompt_idx][tp_idx]) - outputs = [_outputs] + assert len(outputs) == len(inputs) + assert all([len(o) == self.vllm_tensor_parallel_size for o in outputs]) return outputs def async_infer(self, inputs, inputs_slice, distributed_idx): - + # TODO: compatible with external server def infer_task(): with set_device_context(self.infer_device), self.multi_turn_completion_length_context(): - return self._infer_multi_turn(inputs_slice, self.request_config) + return self._infer_single_or_multi_turn(inputs_slice, self.request_config) future: Future = self.executor.submit(infer_task) # pre-fetch the queue to avoid switching back to eval_queue at the end of training sample sampling @@ -728,6 +674,7 @@ def done(_self): future.add_done_callback(done) def _prefetch(self, dataloader: DataLoader): + # TODO: compatible with external server inputs = next(iter(dataloader)) all_inputs = gather_object(inputs) nnodes = get_node_setting()[1] @@ -735,7 +682,7 @@ def _prefetch(self, dataloader: DataLoader): if self.infer_rank >= 0: _input_slice = np.array(all_inputs)[distributed_idx[self.infer_rank]] with self.multi_turn_completion_length_context(): - outputs = self._infer_multi_turn(_input_slice, self.request_config) + outputs = self._infer_single_or_multi_turn(_input_slice, self.request_config) self._queue.put(DataCache(inputs, outputs, distributed_idx)) else: self._queue.put(DataCache(inputs, [], distributed_idx)) @@ -751,7 +698,7 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: inputs: local inputs """ - if not self.is_external_vllm and self.args.sleep_level > 0 and self.infer_rank >= 0: + if self.vllm_mode == 'colocate' and self.args.sleep_level > 0: if self.args.offload_model: self.offload_model() if self.args.offload_optimizer: @@ -765,44 +712,19 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: if self.state.global_step != self._last_loaded_step: self._move_model_to_vllm_lmdeploy() self._last_loaded_step = self.state.global_step - all_inputs = gather_object(inputs) - # Generate completions using vLLM: gather all prompts and use them in a single call in the main process - # Distribute inputs to different workers - # for example, 2 workers, 6 inputs, 0/2/4 dispatch to the first worker - # 1/3/5 dispatch to the second worker - # trying to shuffle and average the length - nnodes = get_node_setting()[1] - num_workers = 1 if self.is_external_vllm else nnodes - distributed_idx = round_robin(len(all_inputs), num_workers * self.args.num_infer_workers) 
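The comment being removed just above describes the round-robin dispatch (2 workers, 6 inputs: 0/2/4 to the first worker, 1/3/5 to the second). A minimal standalone sketch of that assignment, with an assumed signature rather than the actual `round_robin` helper:

def round_robin_indices(num_items: int, num_workers: int) -> list:
    # Item i goes to worker i % num_workers, so consecutive (often similar-length)
    # prompts are spread evenly across workers.
    groups = [[] for _ in range(num_workers)]
    for i in range(num_items):
        groups[i % num_workers].append(i)
    return groups

# 6 inputs over 2 workers -> [[0, 2, 4], [1, 3, 5]], as described in the removed comment.
print(round_robin_indices(6, 2))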
- if self.infer_rank >= 0: - _input_slice = np.array(all_inputs)[distributed_idx[self.infer_rank]] - if self.args.async_generate: - self.async_infer(inputs, _input_slice, distributed_idx) - data_cache = self._queue.get() - inputs = data_cache.inputs - outputs = data_cache.outputs - distributed_idx = data_cache.distributed_idx - else: - with set_device_context(self.infer_device): - request_config = copy(self.request_config) - if self.args.tensor_parallel_size > 1: - request_config.seed += self.state.global_step - with self.multi_turn_completion_length_context(): - outputs = self._infer_multi_turn(_input_slice, self.request_config) + if self.args.async_generate: + # TODO + # self.async_infer(inputs, _input_slice, distributed_idx) + # data_cache = self._queue.get() + # inputs = data_cache.inputs + # outputs = data_cache.outputs + # distributed_idx = data_cache.distributed_idx + pass else: - if self.args.async_generate: - # using old model to generate, which will ignore the `clip` of advantages. - self._queue.put(DataCache(inputs, [], distributed_idx)) - data_cache = self._queue.get() - inputs = data_cache.inputs - distributed_idx = data_cache.distributed_idx - outputs = [] - outputs = gather_object(outputs) - if self.args.tensor_parallel_size > 1: - outputs = [[item] for output in outputs for item in output] - if not self.is_external_vllm: - outputs = self.reorder_outputs(outputs, distributed_idx) - if not self.is_external_vllm and self.args.sleep_level > 0 and self.infer_rank >= 0: + with self.multi_turn_completion_length_context(): + outputs = self._infer_single_or_multi_turn(inputs, self.request_config) + + if self.vllm_mode == 'colocate' and self.args.sleep_level > 0: self.engine.engine.sleep(level=self.args.sleep_level) if self.args.gc_collect_after_offload: gc_collect() @@ -825,12 +747,6 @@ def _generate_completions(self, inputs: InputsType) -> InputsType: mode = 'train' if self.model.training else 'eval' if self.use_fast_infer: inputs, outputs = self._fast_infer(inputs) - # Slice to keep only the local part of the data - process_slice = slice( - self.accelerator.process_index * len(inputs), - (self.accelerator.process_index + 1) * len(inputs), - ) - outputs = outputs[process_slice] else: # pt infer is_multimodal = self.model.model_meta.is_multimodal @@ -839,7 +755,7 @@ def _generate_completions(self, inputs: InputsType) -> InputsType: with unwrap_model_for_generation( self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation ), self.multi_turn_completion_length_context(): - outputs = self._infer_multi_turn(inputs, self.request_config) + outputs = self._infer_single_or_multi_turn(inputs, self.request_config) if mode == 'train': # In training mode, ensure the model is returned to train() mode after inference # This is necessary as pt engines set the model to eval mode during generation @@ -1211,7 +1127,7 @@ def _engine_infer( *, use_tqdm: Optional[bool] = None, ): - if self.is_external_vllm: + if self.vllm_mode == 'server': self._process_infer_requests_images(infer_requests) return self.vllm_client.infer(infer_requests.tolist(), asdict(request_config), use_tqdm=use_tqdm) else: @@ -1305,7 +1221,7 @@ def multi_turn_completion_length_context(self): Ensures the total sequence length (prompt + completion) never exceeds: min(original_max_len, prompt_tokens + max_completion_length) """ - if not (self.multi_turn_func and self.infer_rank >= 0) or self.is_external_vllm: + if not (self.multi_turn_func and self.use_fast_infer) or self.vllm_mode == 'server': 
yield return diff --git a/swift/trainers/rlhf_trainer/utils.py b/swift/trainers/rlhf_trainer/utils.py index eb00f2c6fa..612a62dd50 100644 --- a/swift/trainers/rlhf_trainer/utils.py +++ b/swift/trainers/rlhf_trainer/utils.py @@ -130,3 +130,30 @@ def unmerge_patched(self): del module.unmerge_origin module._cache_pop = module._cache_pop_origin del module._cache_pop_origin + + +@contextmanager +def unwrap_model_for_generation( + model, + accelerator, + gather_deepspeed3_params=True, + gather_parameters: List = None, +): + unwrapped_model = accelerator.unwrap_model(model) + if accelerator.state.deepspeed_plugin is not None and accelerator.state.deepspeed_plugin.zero_stage == 3: + if not gather_deepspeed3_params: + yield accelerator.unwrap_model(model) + else: + import deepspeed + parameters = [ + parameter for name, parameter in model.named_parameters() + if not gather_parameters or name in gather_parameters + ] + with deepspeed.zero.GatheredParameters(parameters): + from trl.models.utils import remove_hooks + remove_hooks(model) + yield accelerator.unwrap_model(model) + from trl.models.utils import add_hooks + add_hooks(model) + else: + yield unwrapped_model From 07a104068cb032ed13220fc944c896f5e76ff1ee Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 6 May 2025 19:29:00 +0800 Subject: [PATCH 16/68] fix default --- swift/llm/argument/rlhf_args.py | 49 ++++++--------------- swift/trainers/arguments.py | 4 +- swift/trainers/rlhf_trainer/grpo_trainer.py | 7 +-- 3 files changed, 19 insertions(+), 41 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 3eef44e701..632faeb722 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -219,42 +219,9 @@ def _check_grpo(self): raise ValueError( 'GRPO requires at least 2 generations per prompt to calculate the advantages. You provided ' f'{self.num_generations}, which is less than the minimum required.') - from swift.utils import get_device_count, get_dist_setting - device_count = get_device_count() - _, _, _, local_world_size = get_dist_setting() - num_infer_workers = self.num_infer_workers - fast_infer = self.use_vllm or self.use_lmdeploy - if fast_infer and self.vllm_server_host is None: - is_colocate_mode = (device_count == num_infer_workers) - - if is_colocate_mode: - # colocate mode - assert device_count == local_world_size, ( - f'Colocate mode requires device_count({device_count}) == num_infer_workers({num_infer_workers}). ' - 'Please check if your device count matches NPROC_PER_NODE setting.') - logger.info( - 'You are using colocate mode because you have set num_infer_workers to be the same as ' - 'NPROC_PER_NODE, where model training and sampling will be performed on a single GPU. ' - 'If you encounter an Out-of-Memory (OOM) error, it is recommended to set the `sleep_level`, ' - '`offload_model`, and `offload_optimizer` parameters.') - assert not self.async_generate, 'async_generate requires async mode, but you are under colocate mode' - if self.use_lmdeploy and self.vllm_tensor_parallel_size > 1: - raise ValueError('Currently LMDeploy do not support tensor parallel') - if self.use_vllm and self.sleep_level: - logger.warning('It is highly recommended to use `sleep_level==1` in colocate mode,' - 'otherwise it may lead to an OOM (Out of Memory) error.') - else: - # async mode - assert device_count == (local_world_size + num_infer_workers), ( - f'Async mode requires total GPUs({device_count}) = training GPUs({local_world_size}) + ' - f'inference workers({num_infer_workers}). 
Please adjust your GPU allocation.') - logger.info( - 'You are using async mode, where model training and sampling will be performed on different GPUs.') - if self.sleep_level > 0: - logger.warning('You are using different GPUs for training and rollout, ' - 'so you do not need to use sleep_level > 0') - - assert self.vllm_tensor_parallel_size == 1, ('async mode do not support tensor parallel right now') + + if self.vllm_mode == 'server': + assert not self.use_vllm or self.vllm_server_host is not None def _external_vllm_warning(self): if self.rlhf_type != 'grpo' or not self.vllm_server_host: @@ -294,3 +261,13 @@ def _deprecated_warning(self): warnings.warn( "The parameter 'vllm_enable_prefix_caching' has been deprecated and will be removed in version 3.6. ", DeprecationWarning) + + if self.num_infer_workers is not None: + warnings.warn( + "The parameter 'num_infer_workers' has been deprecated and will be removed in version 3.6. " + 'If you wish to use colocate mode, please use `vllm_mode colocate` instead. ' + 'If you wish to use async mode, please use `vllm_mode server` and external vLLM server instead.', + DeprecationWarning) + if self.use_vllm and self.vllm_server_host is None: + logger.info('set vllm_mode to colocate since vllm_server_host is not provided') + self.vllm_mode = 'colocate' diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index cc88752972..99532a419e 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -143,9 +143,9 @@ class GRPOArgumentsMixin: top_k: int = 50 top_p: float = 0.9 repetition_penalty: float = 1. - num_infer_workers: int = 1 + num_infer_workers: Optional[int] = None # deprecated # vllm - vllm_mode: Literal['server', 'colocate'] + vllm_mode: Literal['server', 'colocate'] = 'colocate' # internal vllm (colocate) vllm_device: Optional[List[str]] = None # deprecated vllm_gpu_memory_utilization: float = 0.9 diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 80d5da85ef..3f877cb946 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -413,9 +413,10 @@ def prepare_vllm(self, model): cls = GRPOVllmEngine engine_kwargs = {'seed': self.accelerator.process_index // self.vllm_tensor_parallel_size} - max_num_seqs = self.args.per_device_train_batch_size \ - * self.vllm_tensor_parallel_size \ - * self.args.gradient_accumulation_steps, + max_num_seqs = ( + self.args.per_device_train_batch_size * self.vllm_tensor_parallel_size + * self.args.gradient_accumulation_steps) + with Swift.grpo_context(model, self.template.processor): engine = cls( model.model_dir, From 0303461f3048cfac6ca13b6c036605c0bb76b628 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 15:00:02 +0800 Subject: [PATCH 17/68] fix --- swift/llm/infer/infer_engine/__init__.py | 4 ++-- swift/llm/infer/infer_engine/grpo_vllm_engine.py | 6 +++--- swift/llm/infer/infer_engine/utils.py | 10 ---------- swift/trainers/rlhf_trainer/grpo_trainer.py | 12 +++--------- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/swift/llm/infer/infer_engine/__init__.py b/swift/llm/infer/infer_engine/__init__.py index 668e0a726c..49a54005ea 100644 --- a/swift/llm/infer/infer_engine/__init__.py +++ b/swift/llm/infer/infer_engine/__init__.py @@ -11,7 +11,7 @@ from .infer_client import InferClient from .infer_engine import InferEngine from .base import BaseInferEngine - from .utils import prepare_generation_config, AdapterRequest, set_device_context, 
patch_vllm_memory_leak + from .utils import prepare_generation_config, AdapterRequest, patch_vllm_memory_leak else: _import_structure = { 'vllm_engine': ['VllmEngine'], @@ -21,7 +21,7 @@ 'infer_client': ['InferClient'], 'infer_engine': ['InferEngine'], 'base': ['BaseInferEngine'], - 'utils': ['prepare_generation_config', 'AdapterRequest', 'set_device_context', 'patch_vllm_memory_leak'], + 'utils': ['prepare_generation_config', 'AdapterRequest', 'patch_vllm_memory_leak'], } import sys diff --git a/swift/llm/infer/infer_engine/grpo_vllm_engine.py b/swift/llm/infer/infer_engine/grpo_vllm_engine.py index 0e5b4e7363..ed5b4a1c0a 100644 --- a/swift/llm/infer/infer_engine/grpo_vllm_engine.py +++ b/swift/llm/infer/infer_engine/grpo_vllm_engine.py @@ -138,11 +138,11 @@ def infer( prompts.append(llm_inputs) generation_configs = [] - seed = request_config.seed - assert seed >= 0, 'Seed is needed for GRPOVllmEngine.' + # seed = request_config.seed + # assert seed >= 0, 'Seed is needed for GRPOVllmEngine.' for i, _ in enumerate(prompts): request_config = copy(request_config) - request_config.seed = seed + i + # request_config.seed = seed + i generation_config = self._prepare_generation_config(request_config) self._add_stop_words(generation_config, request_config, template.template_meta) generation_configs.append(generation_config) diff --git a/swift/llm/infer/infer_engine/utils.py b/swift/llm/infer/infer_engine/utils.py index c2e8f0bd79..ff94de9399 100644 --- a/swift/llm/infer/infer_engine/utils.py +++ b/swift/llm/infer/infer_engine/utils.py @@ -472,16 +472,6 @@ def new_group_context(): return new_group_context() if device_type == 'npu' else nullcontext() -@contextmanager -def set_device_context(device: Union[str, int]): - origin_device = get_current_device() - set_device(device) - try: - yield - finally: - set_device(origin_device) - - @contextmanager def restore_torch_device_after_vllm_init(): """ diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 3f877cb946..70443753de 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -33,12 +33,10 @@ from trl.trainer.grpo_trainer import nanmax, nanmin from swift.llm import InferRequest, MultiModelKeys, RequestConfig, RowPreprocessor, get_model_arch, to_device -from swift.llm.infer.infer_engine import set_device_context from swift.llm.template.template_inputs import StdTemplateInputs from swift.plugin import orms from swift.plugin.multi_turn import multi_turns -from swift.utils import (JsonlWriter, gc_collect, get_device, get_device_count, get_dist_setting, get_logger, - get_node_setting, is_lmdeploy_available, is_vllm_available, is_wandb_available) +from swift.utils import (JsonlWriter, gc_collect, get_logger, get_node_setting, is_lmdeploy_available, is_vllm_available, is_wandb_available) from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin from .utils import patch_lora_merge, patch_lora_unmerge, round_robin, unwrap_model_for_generation @@ -200,7 +198,6 @@ def __init__(self, # it's safer to set it in all cases. 
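For orientation before the TP-group seed fix in the following hunk: the gather-and-slice pattern that `_infer` uses around that seed can be condensed roughly as below. This is an illustrative sketch assuming an already-initialized `torch.distributed` process group, not the trainer's exact code:

import torch.distributed as dist

def gather_and_slice(local_inputs, tp_group):
    # Every rank contributes its prompts; afterwards each rank holds the whole
    # group's prompts, runs inference on all of them, then keeps only its own slice.
    tp_size = dist.get_world_size(group=tp_group)
    gathered = [None] * tp_size
    dist.all_gather_object(gathered, local_inputs, group=tp_group)
    flat = [x for chunk in gathered for x in chunk]
    rank_in_group = dist.get_rank(group=tp_group)
    orig = len(local_inputs)
    return flat, slice(rank_in_group * orig, (rank_in_group + 1) * orig)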
set_seed(args.seed, device_specific=True) self.parameter_groups, self.parameter_groups_no_lora = self.split_batches() - self.infer_device = None self.use_fast_infer = self.use_vllm or self.use_lmdeploy # whether to use the PT backend # self.is_external_vllm = use_vllm and args.vllm_server_host is not None if self.use_vllm: @@ -264,9 +261,6 @@ def __init__(self, stop=args.stop_words, ) - # if local_world_size == self.args.num_infer_workers == get_device_count() and local_world_size > 1: - # self.request_config.n = self.vllm_tensor_parallel_size - # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set # self.model_accepts_loss_kwargs to False to enable scaling. @@ -422,7 +416,7 @@ def prepare_vllm(self, model): model.model_dir, model.model_info.torch_dtype, model_type=model.model_meta.model_type, - vllm_tensor_parallel_size=self.vllm_tensor_parallel_size, + tensor_parallel_size=self.vllm_tensor_parallel_size, gpu_memory_utilization=self.vllm_gpu_memory_utilization, enable_prefix_caching=self.args.vllm_enable_prefix_caching, max_num_seqs=max_num_seqs, @@ -662,7 +656,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, def async_infer(self, inputs, inputs_slice, distributed_idx): # TODO: compatible with external server def infer_task(): - with set_device_context(self.infer_device), self.multi_turn_completion_length_context(): + with self.multi_turn_completion_length_context(): return self._infer_single_or_multi_turn(inputs_slice, self.request_config) future: Future = self.executor.submit(infer_task) From 854f3571f93e8f6847a9815e58f348619ae4877b Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 16:01:31 +0800 Subject: [PATCH 18/68] fix seed --- swift/trainers/rlhf_trainer/grpo_trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 70443753de..5f630dc254 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -36,7 +36,8 @@ from swift.llm.template.template_inputs import StdTemplateInputs from swift.plugin import orms from swift.plugin.multi_turn import multi_turns -from swift.utils import (JsonlWriter, gc_collect, get_logger, get_node_setting, is_lmdeploy_available, is_vllm_available, is_wandb_available) +from swift.utils import (JsonlWriter, gc_collect, get_device, get_logger, get_node_setting, is_lmdeploy_available, + is_vllm_available, is_wandb_available) from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin from .utils import patch_lora_merge, patch_lora_unmerge, round_robin, unwrap_model_for_generation @@ -410,7 +411,7 @@ def prepare_vllm(self, model): max_num_seqs = ( self.args.per_device_train_batch_size * self.vllm_tensor_parallel_size * self.args.gradient_accumulation_steps) - + current_device = get_device() with Swift.grpo_context(model, self.template.processor): engine = cls( model.model_dir, @@ -424,6 +425,7 @@ def prepare_vllm(self, model): limit_mm_per_prompt=self.args.vllm_limit_mm_per_prompt, enable_sleep_mode=self.args.sleep_level > 0, use_async_engine=False, + device=current_device, max_model_len=self.args.vllm_max_model_len, engine_kwargs=engine_kwargs, **vllm_kwargs) @@ -550,7 +552,8 @@ def _get_first_turn_results(self, inputs: InputsType, request_config: RequestCon gathered_inputs = [None for _ in 
range(self.vllm_tensor_parallel_size)] torch.distributed.all_gather_object(gathered_inputs, inputs, group=self.tp_group) inputs = [p for sublist in gathered_inputs for p in sublist] - + # confirm that the seed is same in tp group + request_config.seed = self.accelerator.process_index // self.vllm_tensor_parallel_size results: List[ChatCompletionResponse] = self._engine_infer( infer_requests=inputs, request_config=request_config, use_tqdm=False) @@ -649,7 +652,6 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, _choices.append((_input['messages'], choice.finish_reason)) outputs.append(_choices) assert len(outputs) == len(inputs) - assert all([len(o) == self.vllm_tensor_parallel_size for o in outputs]) return outputs From 7df2b5dd9094ec614c4aa6224e54e5f4bdc23b89 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 16:15:34 +0800 Subject: [PATCH 19/68] fix --- swift/llm/infer/infer_engine/grpo_vllm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/swift/llm/infer/infer_engine/grpo_vllm_engine.py b/swift/llm/infer/infer_engine/grpo_vllm_engine.py index ed5b4a1c0a..0e5b4e7363 100644 --- a/swift/llm/infer/infer_engine/grpo_vllm_engine.py +++ b/swift/llm/infer/infer_engine/grpo_vllm_engine.py @@ -138,11 +138,11 @@ def infer( prompts.append(llm_inputs) generation_configs = [] - # seed = request_config.seed - # assert seed >= 0, 'Seed is needed for GRPOVllmEngine.' + seed = request_config.seed + assert seed >= 0, 'Seed is needed for GRPOVllmEngine.' for i, _ in enumerate(prompts): request_config = copy(request_config) - # request_config.seed = seed + i + request_config.seed = seed + i generation_config = self._prepare_generation_config(request_config) self._add_stop_words(generation_config, request_config, template.template_meta) generation_configs.append(generation_config) From fda82ee02b922b851c443370a8aef6f43e2f645d Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 16:51:19 +0800 Subject: [PATCH 20/68] wip --- swift/trainers/rlhf_trainer/grpo_trainer.py | 148 ++++++++++++-------- 1 file changed, 88 insertions(+), 60 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 5f630dc254..2e0104b36f 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -581,66 +581,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, """ results = self._get_first_turn_results(inputs, request_config) - if self.multi_turn_func: - pass - # TODO - # we remove response in first turn - # messages_list = [None] * (len(inputs) * self.vllm_tensor_parallel_size) - # remove_response = True - # while len(inputs) > 0: - # if not self.use_fast_infer: - # inputs = [] - # cnt = 0 - # for i, output in enumerate(results): - # for choice in output.choices: - # _input: Dict = deepcopy(inputs[i]) - # if remove_response or _input['messages'][-1]['role'] != 'assistant' or not \ - # _input['messages'][-1]['content']: - # InferRequest.remove_response(_input['messages']) - # _input['messages'].append({'role': 'assistant', 'content': choice.message.content}) - # else: - # _input['messages'][-1]['content'] += choice.message.content - # if 'index' not in _input: - # _input['index'] = cnt - # _input['finish_reason'] = choice.finish_reason - # cnt += 1 - # inputs.append(_input) - # results: List[Dict] = self.multi_turn_func(inputs) # noqa - # else: - # length = sum([len(results[i].choices) for i in range(len(results))]) - # results = [None] * length - - # if 
self.vllm_tensor_parallel_size > 1: - # # avoid duplicate calling in the same tensor parallel group - # import torch.distributed as dist - # if 'group_src' in inspect.signature(dist.broadcast_object_list).parameters: - # dist.broadcast_object_list(results, group_src=0, group=self.group) - # else: - # global_src = dist.get_global_rank(self.group, 0) - # dist.broadcast_object_list(results, src=global_src, group=self.group) - # inputs = [r for r in results if not r['finished']] - # for idx, r in enumerate(results): - # if r['finished'] or r['finish_reason'] == 'length': - # messages_list[r['index']] = (r['messages'], r['finish_reason']) - # if len(inputs) > 0: - # _input_std = [] - # for _input in inputs: - # _input_std.append(StdTemplateInputs.from_dict(_input)) - # # StdTemplateInputs will not remove responses in infer - # results = self._engine_infer( - # infer_requests=_input_std, request_config=request_config, use_tqdm=False) - # # concat responses from the second loop - # remove_response = False - - # outputs = [] - # assert not any([m is None for m in messages_list]) - # for i in range(0, len(messages_list), self.vllm_tensor_parallel_size): - # # reformat to [[x, x, x, x] [x, x, x, x]] - # # this is the same format of sampling_params.n > 1 - # outputs.append(messages_list[i:i + self.vllm_tensor_parallel_size]) - # assert len(outputs) == prompt_lens - # assert all([len(o) == self.vllm_tensor_parallel_size for o in outputs]) - else: + if not self.multi_turn_func: # single turn outputs = [] for i, output in enumerate(results): @@ -652,7 +593,94 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, _choices.append((_input['messages'], choice.finish_reason)) outputs.append(_choices) assert len(outputs) == len(inputs) + else: + # multi turn + orig_size = len(inputs) + messages_list = [None] * orig_size + # we remove origin response in first turn + first_turn = True + while len(inputs) > 0: + if not self.use_fast_infer: + inputs = [] + cnt = 0 + for i, output in enumerate(results): + for choice in output.choices: + current_input = deepcopy(inputs[i]) + messages = current_input['messages'] + last_message = messages[-1] + + # Determine if we need to append or update the last message + if first_turn or last_message['role'] != 'assistant' or not last_message['content']: + InferRequest.remove_response(messages) + messages.append({ + 'role': 'assistant', + 'content': choice.message.content + }) + else: + last_message['content'] += choice.message.content + + if 'index' not in current_input: + current_input['index'] = cnt + _input['finish_reason'] = choice.finish_reason + cnt += 1 + inputs.append(_input) + + for i, output in enumerate(results): + for choice_idx, choice in enumerate(output.choices): + # Create a deep copy of the input + current_input = deepcopy(inputs[i]) + messages = current_input['messages'] + last_message = messages[-1] + + # Determine if we need to append or update the last message + if first_turn or last_message['role'] != 'assistant' or not last_message['content']: + InferRequest.remove_response(messages) + messages.append({ + 'role': 'assistant', + 'content': choice.message.content + }) + else: + last_message['content'] += choice.message.content + + # Set additional fields + current_input.setdefault('index', len(inputs)) + current_input['finish_reason'] = choice.finish_reason + + inputs.append(current_input) + results: List[Dict] = self.multi_turn_func(inputs) + else: + length = sum([len(results[i].choices) for i in range(len(results))]) + results = [None] * length + if 
self.vllm_tensor_parallel_size > 1: + # avoid duplicate calling in the same tensor parallel group + import torch.distributed as dist + if 'group_src' in inspect.signature(dist.broadcast_object_list).parameters: + dist.broadcast_object_list(results, group_src=0, group=self.group) + else: + global_src = dist.get_global_rank(self.group, 0) + dist.broadcast_object_list(results, src=global_src, group=self.group) + inputs = [r for r in results if not r['finished']] + for idx, r in enumerate(results): + if r['finished'] or r['finish_reason'] == 'length': + messages_list[r['index']] = (r['messages'], r['finish_reason']) + if len(inputs) > 0: + _input_std = [] + for _input in inputs: + _input_std.append(StdTemplateInputs.from_dict(_input)) + # StdTemplateInputs will not remove responses in infer + results = self._engine_infer( + infer_requests=_input_std, request_config=request_config, use_tqdm=False) + # concat responses from the second loop + first_turn = False + + outputs = [] + assert not any([m is None for m in messages_list]) + for i in range(0, len(messages_list), self.vllm_tensor_parallel_size): + # reformat to [[x, x, x, x] [x, x, x, x]] + # this is the same format of sampling_params.n > 1 + outputs.append(messages_list[i:i + self.vllm_tensor_parallel_size]) + assert len(outputs) == len(inputs) return outputs def async_infer(self, inputs, inputs_slice, distributed_idx): From 5d8d4a2310e80d76d6c24a6a357f3e9be2677ddd Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 16:52:09 +0800 Subject: [PATCH 21/68] wip multi turn --- swift/trainers/rlhf_trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 2e0104b36f..2f3ab6d622 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -621,7 +621,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, if 'index' not in current_input: current_input['index'] = cnt - _input['finish_reason'] = choice.finish_reason + current_input['finish_reason'] = choice.finish_reason cnt += 1 inputs.append(_input) From ac52340d49b97b372e2c5696ba5ceaa9aff41901 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 18:02:33 +0800 Subject: [PATCH 22/68] multi turn --- swift/trainers/rlhf_trainer/grpo_trainer.py | 135 ++++++-------------- 1 file changed, 41 insertions(+), 94 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 2f3ab6d622..7a9355f6d9 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -523,7 +523,7 @@ def reorder_outputs(outputs, distributed_idx): return [index_to_output[idx] for idx in sorted(index_to_output.keys())] - def _get_first_turn_results(self, inputs: InputsType, request_config: RequestConfig) -> Union[OutputsType, List]: + def _infer(self, inputs: InputsType, request_config: RequestConfig) -> Union[OutputsType, List]: # inputs: local inputs from swift.llm.infer.protocol import ChatCompletionResponse request_config = copy(request_config) @@ -579,10 +579,10 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, - List of responses per prompt (length = vllm_tensor_parallel_size) - Each response is a tuple of (message_history, finish_reason) """ - results = self._get_first_turn_results(inputs, request_config) + results = self._infer(inputs, request_config) if not self.multi_turn_func: - # single turn + # Single-turn: 
combine completions with messages and retain the finish reason. outputs = [] for i, output in enumerate(results): _choices = [] @@ -594,93 +594,56 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, outputs.append(_choices) assert len(outputs) == len(inputs) else: - # multi turn + # Multi-turn: continue to rollout until finished." orig_size = len(inputs) - messages_list = [None] * orig_size + outputs = [None] * orig_size # we remove origin response in first turn first_turn = True while len(inputs) > 0: - if not self.use_fast_infer: - inputs = [] - cnt = 0 - for i, output in enumerate(results): - for choice in output.choices: - current_input = deepcopy(inputs[i]) - messages = current_input['messages'] - last_message = messages[-1] - - # Determine if we need to append or update the last message - if first_turn or last_message['role'] != 'assistant' or not last_message['content']: - InferRequest.remove_response(messages) - messages.append({ - 'role': 'assistant', - 'content': choice.message.content - }) - else: - last_message['content'] += choice.message.content - - if 'index' not in current_input: - current_input['index'] = cnt - current_input['finish_reason'] = choice.finish_reason - cnt += 1 - inputs.append(_input) - - for i, output in enumerate(results): - for choice_idx, choice in enumerate(output.choices): - # Create a deep copy of the input - current_input = deepcopy(inputs[i]) - messages = current_input['messages'] - last_message = messages[-1] - - # Determine if we need to append or update the last message - if first_turn or last_message['role'] != 'assistant' or not last_message['content']: - InferRequest.remove_response(messages) - messages.append({ - 'role': 'assistant', - 'content': choice.message.content - }) - else: - last_message['content'] += choice.message.content - - # Set additional fields - current_input.setdefault('index', len(inputs)) - current_input['finish_reason'] = choice.finish_reason - - inputs.append(current_input) - results: List[Dict] = self.multi_turn_func(inputs) - else: - length = sum([len(results[i].choices) for i in range(len(results))]) - results = [None] * length - - if self.vllm_tensor_parallel_size > 1: - # avoid duplicate calling in the same tensor parallel group - import torch.distributed as dist - if 'group_src' in inspect.signature(dist.broadcast_object_list).parameters: - dist.broadcast_object_list(results, group_src=0, group=self.group) - else: - global_src = dist.get_global_rank(self.group, 0) - dist.broadcast_object_list(results, src=global_src, group=self.group) + # inputs for current turn + current_inputs = [] + cnt = 0 + # combine completions from results with messages + for i, output in enumerate(results): + for choice in output.choices: + current_input = deepcopy(inputs[i]) + messages = current_input['messages'] + last_message = messages[-1] + + # Determine if we need to append or update the last message + if first_turn or last_message['role'] != 'assistant' or not last_message['content']: + InferRequest.remove_response(messages) + messages.append({'role': 'assistant', 'content': choice.message.content}) + else: + last_message['content'] += choice.message.content + + if 'index' not in current_input: + current_input['index'] = cnt + current_input['finish_reason'] = choice.finish_reason + cnt += 1 + current_inputs.append(_input) + + # Process messages in the multi-turn function + results: List[Dict] = self.multi_turn_func(inputs) + + # Retain messages that are not yet finished for the next round of rollout inputs = [r for r in 
results if not r['finished']] + + # Save the finished messages to the results for idx, r in enumerate(results): if r['finished'] or r['finish_reason'] == 'length': - messages_list[r['index']] = (r['messages'], r['finish_reason']) + outputs[r['index']] = (r['messages'], r['finish_reason']) + if len(inputs) > 0: _input_std = [] for _input in inputs: _input_std.append(StdTemplateInputs.from_dict(_input)) # StdTemplateInputs will not remove responses in infer - results = self._engine_infer( - infer_requests=_input_std, request_config=request_config, use_tqdm=False) + results = self._infer(infer_requests=_input_std, request_config=request_config, use_tqdm=False) # concat responses from the second loop first_turn = False - outputs = [] - assert not any([m is None for m in messages_list]) - for i in range(0, len(messages_list), self.vllm_tensor_parallel_size): - # reformat to [[x, x, x, x] [x, x, x, x]] - # this is the same format of sampling_params.n > 1 - outputs.append(messages_list[i:i + self.vllm_tensor_parallel_size]) - assert len(outputs) == len(inputs) + assert not any([o is None for o in outputs]) return outputs def async_infer(self, inputs, inputs_slice, distributed_idx): @@ -715,14 +678,6 @@ def _prefetch(self, dataloader: DataLoader): self.accelerator.wait_for_everyone() def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: - """ - This function performs fast inference by managing model and optimizer offloading, - loading weights if necessary, distributing inputs among workers, and generating - completions using the vLLM/LMDeploy framework. It supports both synchronous and asynchronous - inference modes. - inputs: local inputs - """ - if self.vllm_mode == 'colocate' and self.args.sleep_level > 0: if self.args.offload_model: self.offload_model() @@ -737,17 +692,9 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: if self.state.global_step != self._last_loaded_step: self._move_model_to_vllm_lmdeploy() self._last_loaded_step = self.state.global_step - if self.args.async_generate: - # TODO - # self.async_infer(inputs, _input_slice, distributed_idx) - # data_cache = self._queue.get() - # inputs = data_cache.inputs - # outputs = data_cache.outputs - # distributed_idx = data_cache.distributed_idx - pass - else: - with self.multi_turn_completion_length_context(): - outputs = self._infer_single_or_multi_turn(inputs, self.request_config) + + with self.multi_turn_completion_length_context(): + outputs = self._infer_single_or_multi_turn(inputs, self.request_config) if self.vllm_mode == 'colocate' and self.args.sleep_level > 0: self.engine.engine.sleep(level=self.args.sleep_level) From 578a365efe7d1df2f2b04feacd87f0ff9798c267 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 18:04:46 +0800 Subject: [PATCH 23/68] fix comment --- swift/trainers/rlhf_trainer/grpo_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 7a9355f6d9..76f43812ad 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -568,7 +568,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> Union[Out def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: RequestConfig) -> Union[OutputsType, List]: - """Perform multi-turn or single-turn inference with support for tensor parallelism. 
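The multi-turn rollout introduced above keeps generating until every conversation is finished: append the new completion, let `multi_turn_func` decide which conversations are done, then re-infer the rest. A condensed sketch of that control flow, with an assumed `infer_fn` that returns one completion string per request and a `multi_turn_func` that sets 'finished'/'finish_reason' and preserves 'index':

import copy

def rollout_multi_turn(inputs, infer_fn, multi_turn_func):
    # Condensed sketch: loop until every conversation is marked finished.
    outputs = [None] * len(inputs)
    pending = [dict(copy.deepcopy(x), index=i) for i, x in enumerate(inputs)]
    completions = infer_fn(pending)          # one completion string per pending request
    first_turn = True
    while pending:
        for req, text in zip(pending, completions):
            last = req['messages'][-1]
            if first_turn or last['role'] != 'assistant' or not last['content']:
                if last['role'] == 'assistant':
                    req['messages'].pop()    # drop the dataset's reference/dummy response
                req['messages'].append({'role': 'assistant', 'content': text})
            else:
                last['content'] += text      # continue a partially generated assistant turn
        results = multi_turn_func(pending)   # marks each request 'finished' with a 'finish_reason'
        pending = []
        for r in results:
            if r['finished'] or r.get('finish_reason') == 'length':
                outputs[r['index']] = (r['messages'], r.get('finish_reason'))
            else:
                pending.append(r)
        completions = infer_fn(pending) if pending else []
        first_turn = False
    return outputs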
+ """Perform multi-turn or single-turn inference Args: inputs: list of input requests @@ -576,7 +576,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, Returns: List of outputs where each entry contains: - - List of responses per prompt (length = vllm_tensor_parallel_size) + - List of responses per prompt - Each response is a tuple of (message_history, finish_reason) """ results = self._infer(inputs, request_config) @@ -594,7 +594,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, outputs.append(_choices) assert len(outputs) == len(inputs) else: - # Multi-turn: continue to rollout until finished." + # Multi-turn: continue to rollout until finished. orig_size = len(inputs) outputs = [None] * orig_size # we remove origin response in first turn @@ -630,7 +630,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, inputs = [r for r in results if not r['finished']] # Save the finished messages to the results - for idx, r in enumerate(results): + for r in results: if r['finished'] or r['finish_reason'] == 'length': outputs[r['index']] = (r['messages'], r['finish_reason']) From 9a49fb578e13df4fee9f3bce4764436f57ea2640 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 18:08:10 +0800 Subject: [PATCH 24/68] fix peft model inspect and labels --- swift/trainers/rlhf_trainer/grpo_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 76f43812ad..58e3388c78 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -200,7 +200,6 @@ def __init__(self, set_seed(args.seed, device_specific=True) self.parameter_groups, self.parameter_groups_no_lora = self.split_batches() self.use_fast_infer = self.use_vllm or self.use_lmdeploy # whether to use the PT backend - # self.is_external_vllm = use_vllm and args.vllm_server_host is not None if self.use_vllm: if not is_vllm_available(): raise ImportError('vLLM is not available and `use_vllm` is set to True. 
' @@ -896,7 +895,7 @@ def _prepare_batch_inputs(self, inputs: InputsType, rewards: torch.Tensor) -> Li batch_encoded_inputs = to_device(template.data_collator(batch_encoded_inputs), self.model.device) # Process labels and masks - labels = batch_encoded_inputs['labels'] + labels = batch_encoded_inputs.pop('labels') logits_to_keep = (labels.shape[-1] - (torch.ne(labels, -100).int().argmax(-1))).max().item() batch_encoded_inputs.update({ 'completion_mask': @@ -1052,7 +1051,10 @@ def _get_per_token_logps(self, model, inputs): logits_to_keep = inputs['logits_to_keep'] input_ids = inputs['input_ids'] unwrapped_model = self.accelerator.unwrap_model(model) - parameters = inspect.signature(unwrapped_model.forward).parameters + if is_peft_model(unwrapped_model): + parameters = inspect.signature(unwrapped_model.base_model.model.forward).parameters + else: + parameters = inspect.signature(unwrapped_model.forward).parameters if not unwrapped_model.model_meta.is_multimodal and 'logits_to_keep' in parameters: # save memory return super()._get_per_token_logps(model, input_ids, inputs['attention_mask'], logits_to_keep) From 5579c3e0e11f25e09b174fdbd608e3b7801ef4cd Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 19:05:50 +0800 Subject: [PATCH 25/68] fix multi turn --- swift/trainers/rlhf_trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 58e3388c78..ddcee4c92e 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -620,7 +620,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, current_input['index'] = cnt current_input['finish_reason'] = choice.finish_reason cnt += 1 - current_inputs.append(_input) + current_inputs.append(current_input) # Process messages in the multi-turn function results: List[Dict] = self.multi_turn_func(inputs) From 7de8aab9394a5eaeaa27f1bea40189e087f0c0fc Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 7 May 2025 20:44:55 +0800 Subject: [PATCH 26/68] update multi turn --- swift/trainers/rlhf_trainer/grpo_trainer.py | 33 +++++++++++---------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index ddcee4c92e..a7fcd62078 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -50,7 +50,7 @@ import wandb InputsType = List[Dict[str, Union[torch.Tensor, Any]]] -OutputsType = List[List[Tuple[List[Dict], str]]] +OutputsType = List[Tuple[List[Dict], str]] class GRPOCallback(TrainerCallback): @@ -522,7 +522,7 @@ def reorder_outputs(outputs, distributed_idx): return [index_to_output[idx] for idx in sorted(index_to_output.keys())] - def _infer(self, inputs: InputsType, request_config: RequestConfig) -> Union[OutputsType, List]: + def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsType: # inputs: local inputs from swift.llm.infer.protocol import ChatCompletionResponse request_config = copy(request_config) @@ -565,8 +565,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> Union[Out return results - def _infer_single_or_multi_turn(self, inputs: InputsType, - request_config: RequestConfig) -> Union[OutputsType, List]: + def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: RequestConfig) -> OutputsType: """Perform multi-turn or single-turn inference Args: @@ -578,6 
+577,8 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, - List of responses per prompt - Each response is a tuple of (message_history, finish_reason) """ + + # infer first turn results = self._infer(inputs, request_config) if not self.multi_turn_func: @@ -591,6 +592,8 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, _input['messages'].append({'role': 'assistant', 'content': choice.message.content}) _choices.append((_input['messages'], choice.finish_reason)) outputs.append(_choices) + # flatten 2D list to 1D list + outputs = [item for sublist in outputs for item in sublist] assert len(outputs) == len(inputs) else: # Multi-turn: continue to rollout until finished. @@ -598,7 +601,8 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, outputs = [None] * orig_size # we remove origin response in first turn first_turn = True - while len(inputs) > 0: + next_turn_inputs = inputs.copy() + while len(next_turn_inputs) > 0: # inputs for current turn current_inputs = [] cnt = 0 @@ -623,26 +627,23 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, current_inputs.append(current_input) # Process messages in the multi-turn function - results: List[Dict] = self.multi_turn_func(inputs) + results: List[Dict] = self.multi_turn_func(current_inputs) # Retain messages that are not yet finished for the next round of rollout - inputs = [r for r in results if not r['finished']] - - # Save the finished messages to the results + next_turn_inputs = [] for r in results: if r['finished'] or r['finish_reason'] == 'length': outputs[r['index']] = (r['messages'], r['finish_reason']) + else: + next_turn_inputs.append(r) + if next_turn_inputs: + # TODO: StdTemplateInputs will not remove responses in infer + results = self._infer(infer_requests=next_turn_inputs, request_config=request_config, use_tqdm=False) - if len(inputs) > 0: - _input_std = [] - for _input in inputs: - _input_std.append(StdTemplateInputs.from_dict(_input)) - # StdTemplateInputs will not remove responses in infer - results = self._infer(infer_requests=_input_std, request_config=request_config, use_tqdm=False) # concat responses from the second loop first_turn = False - assert not any([o is None for o in outputs]) + assert not any([o is None for o in outputs]) return outputs def async_infer(self, inputs, inputs_slice, distributed_idx): From 438f1f70924e39aa5e727bc58c4149b150711332 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 11:14:13 +0800 Subject: [PATCH 27/68] multi turn not remove response --- swift/trainers/rlhf_trainer/grpo_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index a7fcd62078..075020568a 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -635,10 +635,13 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques if r['finished'] or r['finish_reason'] == 'length': outputs[r['index']] = (r['messages'], r['finish_reason']) else: + if r['messages'][-1]['content'] == 'assistant': + # infer will remove response, so we add dummy response here + r['messages'].append({'role': 'assistant', 'content': None}) next_turn_inputs.append(r) if next_turn_inputs: - # TODO: StdTemplateInputs will not remove responses in infer - results = self._infer(infer_requests=next_turn_inputs, request_config=request_config, use_tqdm=False) + results = self._infer( + infer_requests=next_turn_inputs, 
request_config=request_config, use_tqdm=False) # concat responses from the second loop first_turn = False From d69a9ae8a22f5d935447caa107f767b0b769c0f0 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 11:58:01 +0800 Subject: [PATCH 28/68] fix --- swift/llm/argument/rlhf_args.py | 17 +++++++---------- swift/trainers/rlhf_trainer/grpo_trainer.py | 13 +++++++------ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 632faeb722..e8bcb286f7 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -248,26 +248,23 @@ def _deprecated_warning(self): return if self.tensor_parallel_size is not None: - warnings.warn( + logger.warning( "The parameter 'tensor_parallel_size' has been deprecated and will be removed in version 3.6. " - "It is recommended to use 'vllm_tensor_parallel_size' instead.", DeprecationWarning) + "It is recommended to use 'vllm_tensor_parallel_size' instead.") self.vllm_tensor_parallel_size = self.tensor_parallel_size if self.vllm_device is not None: - warnings.warn("The parameter 'vllm_device' has been deprecated and will be removed in version 3.6. ", - DeprecationWarning) + logger.warning("The parameter 'vllm_device' has been deprecated and will be removed in version 3.6. ") if self.vllm_enable_prefix_caching is not None: - warnings.warn( - "The parameter 'vllm_enable_prefix_caching' has been deprecated and will be removed in version 3.6. ", - DeprecationWarning) + logger.warning( + "The parameter 'vllm_enable_prefix_caching' has been deprecated and will be removed in version 3.6. ") if self.num_infer_workers is not None: - warnings.warn( + logger.warning( "The parameter 'num_infer_workers' has been deprecated and will be removed in version 3.6. " 'If you wish to use colocate mode, please use `vllm_mode colocate` instead. 
' - 'If you wish to use async mode, please use `vllm_mode server` and external vLLM server instead.', - DeprecationWarning) + 'If you wish to use async mode, please use `vllm_mode server` and external vLLM server instead.') if self.use_vllm and self.vllm_server_host is None: logger.info('set vllm_mode to colocate since vllm_server_host is not provided') self.vllm_mode = 'colocate' diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 075020568a..e34c205367 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -602,14 +602,15 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques # we remove origin response in first turn first_turn = True next_turn_inputs = inputs.copy() + last_turn_results = results while len(next_turn_inputs) > 0: # inputs for current turn current_inputs = [] cnt = 0 # combine completions from results with messages - for i, output in enumerate(results): + for i, output in enumerate(last_turn_results): for choice in output.choices: - current_input = deepcopy(inputs[i]) + current_input = deepcopy(next_turn_inputs[i]) messages = current_input['messages'] last_message = messages[-1] @@ -627,11 +628,11 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques current_inputs.append(current_input) # Process messages in the multi-turn function - results: List[Dict] = self.multi_turn_func(current_inputs) + current_results: List[Dict] = self.multi_turn_func(current_inputs) # Retain messages that are not yet finished for the next round of rollout next_turn_inputs = [] - for r in results: + for r in current_results: if r['finished'] or r['finish_reason'] == 'length': outputs[r['index']] = (r['messages'], r['finish_reason']) else: @@ -640,9 +641,9 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques r['messages'].append({'role': 'assistant', 'content': None}) next_turn_inputs.append(r) if next_turn_inputs: - results = self._infer( - infer_requests=next_turn_inputs, request_config=request_config, use_tqdm=False) + current_results = self._infer(next_turn_inputs, request_config) + last_turn_results = current_results # concat responses from the second loop first_turn = False From 451fd02d1ac46f79f8bcd43e6893da03f3557da0 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 14:31:51 +0800 Subject: [PATCH 29/68] fix multi turn concate response --- swift/trainers/rlhf_trainer/grpo_trainer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index e34c205367..8659924c90 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -614,12 +614,16 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques messages = current_input['messages'] last_message = messages[-1] - # Determine if we need to append or update the last message - if first_turn or last_message['role'] != 'assistant' or not last_message['content']: + # Determine whether to append a new message or update the last one based on the current state + if first_turn or not last_message['content']: + # If it's the first turn or the last message content is empty(dummy), remove the response InferRequest.remove_response(messages) - messages.append({'role': 'assistant', 'content': choice.message.content}) - else: + if last_message['role'] == 
'assistant': + # If the last message was assistant, concatenate the new content to it last_message['content'] += choice.message.content + else: + # append a new message from the assistant + messages.append({'role': 'assistant', 'content': choice.message.content}) if 'index' not in current_input: current_input['index'] = cnt @@ -636,7 +640,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques if r['finished'] or r['finish_reason'] == 'length': outputs[r['index']] = (r['messages'], r['finish_reason']) else: - if r['messages'][-1]['content'] == 'assistant': + if r['messages'][-1]['role'] == 'assistant': # infer will remove response, so we add dummy response here r['messages'].append({'role': 'assistant', 'content': None}) next_turn_inputs.append(r) From c3a1aa954b712a24377dc49c55ea36b845ce11ed Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 14:37:53 +0800 Subject: [PATCH 30/68] fix multi turn message check --- swift/trainers/rlhf_trainer/grpo_trainer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 8659924c90..38e33072ea 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -612,15 +612,14 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques for choice in output.choices: current_input = deepcopy(next_turn_inputs[i]) messages = current_input['messages'] - last_message = messages[-1] # Determine whether to append a new message or update the last one based on the current state - if first_turn or not last_message['content']: + if first_turn or not messages[-1]['content']: # If it's the first turn or the last message content is empty(dummy), remove the response InferRequest.remove_response(messages) - if last_message['role'] == 'assistant': + if messages[-1]['role'] == 'assistant': # If the last message was assistant, concatenate the new content to it - last_message['content'] += choice.message.content + messages[-1]['content'] += choice.message.content else: # append a new message from the assistant messages.append({'role': 'assistant', 'content': choice.message.content}) From 300610e47e791f58c1f16800f75ae96e5ec21858 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 15:40:24 +0800 Subject: [PATCH 31/68] fix infer --- swift/trainers/rlhf_trainer/grpo_trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 38e33072ea..71f54aa3da 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -50,6 +50,7 @@ import wandb InputsType = List[Dict[str, Union[torch.Tensor, Any]]] +# tuple: (messages, finish_reason) OutputsType = List[Tuple[List[Dict], str]] @@ -745,8 +746,8 @@ def _generate_completions(self, inputs: InputsType) -> InputsType: outputs = [output[0] for output in outputs] for i, output in enumerate(outputs): - inputs[i]['messages'] = output[0][0] - inputs[i]['is_truncated'] = output[0][1] == 'length' + inputs[i]['messages'] = output[0] + inputs[i]['is_truncated'] = output[1] == 'length' return inputs From fd08ccd738739caecac55473d888cc69accce080 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 16:08:52 +0800 Subject: [PATCH 32/68] external async generate --- swift/llm/argument/rlhf_args.py | 3 ++ swift/trainers/rlhf_trainer/grpo_trainer.py | 38 
++++++++++++--------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index e8bcb286f7..5a2d2f288a 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -223,6 +223,9 @@ def _check_grpo(self): if self.vllm_mode == 'server': assert not self.use_vllm or self.vllm_server_host is not None + if self.async_generate: + assert self.vllm_mode == 'server', 'async generate require vllm_mode == server' + def _external_vllm_warning(self): if self.rlhf_type != 'grpo' or not self.vllm_server_host: return diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 71f54aa3da..e41b8808ed 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -70,7 +70,6 @@ def on_train_begin(self, args, state, control, **kwargs): class DataCache: inputs: List[Dict] = field(default_factory=list) outputs: List[Dict] = field(default_factory=list) - distributed_idx: List[List] = field(default_factory=list) class GRPOTrainer(RLHFTrainerMixin, SwiftMixin, HFGRPOTrainer): @@ -153,6 +152,7 @@ def __init__(self, self.use_vllm = args.use_vllm self.use_lmdeploy = args.use_lmdeploy + self.async_generate = args.async_generate vllm_client = kwargs.pop('vllm_client') # for external vllm super().__init__(model, ref_model, *_args, **kwargs) @@ -281,7 +281,7 @@ def __init__(self, # Buffer the batch to reuse generated outputs across multiple updates. For more details, see # `_get_train_sampler` and `_prepare_inputs`. self._buffered_inputs = None - if self.args.async_generate: + if self.async_generate: self.add_callback(GRPOCallback(self)) if self.args.dynamic_sample: @@ -491,6 +491,7 @@ def _move_model_to_vllm_lmdeploy(self): assert len(state_dict) > 0 and all([state.shape != torch.Size([0]) for state in state_dict.values()]) if self.use_fast_infer: if self.args.async_generate: + # before sync weight, we should wait async generate finish self._wait_queue() if self.args.use_vllm: llm_model = self.engine.inner_model @@ -654,34 +655,31 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques assert not any([o is None for o in outputs]) return outputs - def async_infer(self, inputs, inputs_slice, distributed_idx): - # TODO: compatible with external server + def async_infer(self, inputs): + def infer_task(): with self.multi_turn_completion_length_context(): - return self._infer_single_or_multi_turn(inputs_slice, self.request_config) + return self._infer_single_or_multi_turn(inputs, self.request_config) future: Future = self.executor.submit(infer_task) # pre-fetch the queue to avoid switching back to eval_queue at the end of training sample sampling current_queue = self._queue def done(_self): - current_queue.put(DataCache(inputs, _self.result(), distributed_idx)) + current_queue.put(DataCache(inputs, _self.result())) future.add_done_callback(done) def _prefetch(self, dataloader: DataLoader): - # TODO: compatible with external server + # TODO: asyncio inputs = next(iter(dataloader)) all_inputs = gather_object(inputs) - nnodes = get_node_setting()[1] - distributed_idx = round_robin(len(all_inputs), nnodes * self.args.num_infer_workers) - if self.infer_rank >= 0: - _input_slice = np.array(all_inputs)[distributed_idx[self.infer_rank]] + if self.accelerator.is_main_process: with self.multi_turn_completion_length_context(): - outputs = self._infer_single_or_multi_turn(_input_slice, self.request_config) - 
self._queue.put(DataCache(inputs, outputs, distributed_idx)) + outputs = self._infer_single_or_multi_turn(all_inputs, self.request_config) + self._queue.put(DataCache(inputs, outputs)) else: - self._queue.put(DataCache(inputs, [], distributed_idx)) + self._queue.put(DataCache(inputs, [])) if self.accelerator.num_processes > 1: self.accelerator.wait_for_everyone() @@ -701,8 +699,16 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: self._move_model_to_vllm_lmdeploy() self._last_loaded_step = self.state.global_step - with self.multi_turn_completion_length_context(): - outputs = self._infer_single_or_multi_turn(inputs, self.request_config) + if self.async_generate: + # send this step data to server + self.async_infer(inputs) + # get last step data from cache + data_cache = self._queue.get() + inputs = data_cache.inputs + outputs = data_cache.outputs + else: + with self.multi_turn_completion_length_context(): + outputs = self._infer_single_or_multi_turn(inputs, self.request_config) if self.vllm_mode == 'colocate' and self.args.sleep_level > 0: self.engine.engine.sleep(level=self.args.sleep_level) From 9da6242711c65645ce3824f1e113ad9f808f4a7b Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 16:20:00 +0800 Subject: [PATCH 33/68] clean argument check --- swift/llm/argument/rlhf_args.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 5a2d2f288a..f84ef58a60 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -158,6 +158,16 @@ def _init_grpo(self): if self.soft_max_length is None: self.soft_max_length = self.max_completion_length logger.info(f'Auto-configured soft_max_length = max_completion_length {self.max_completion_length}') + if self.use_vllm: + # set vllm mode + if self.vllm_server_host is not None: + if self.vllm_mode != 'server': + self.vllm_mode = 'server' + logger.warning('set vllm_mode to `server` since vllm_server_host is provided') + else: + if self.vllm_mode != 'colocate': + self.vllm_mode = 'colocate' + logger.warning('set vllm_mode to `colocate` since vllm_server_host is not provided') def _init_ppo(self): if self.rlhf_type == 'ppo': @@ -224,7 +234,9 @@ def _check_grpo(self): assert not self.use_vllm or self.vllm_server_host is not None if self.async_generate: - assert self.vllm_mode == 'server', 'async generate require vllm_mode == server' + assert self.vllm_mode == 'server', 'async generate require vllm_mode == server, ' + 'please deploy vLLM server by `swift rollout` and assign with `vllm_server_host` ' + 'for more infomations, please check https://swift.readthedocs.io/en/latest/Instruction/GRPO.html' def _external_vllm_warning(self): if self.rlhf_type != 'grpo' or not self.vllm_server_host: @@ -268,6 +280,3 @@ def _deprecated_warning(self): "The parameter 'num_infer_workers' has been deprecated and will be removed in version 3.6. " 'If you wish to use colocate mode, please use `vllm_mode colocate` instead. 
' 'If you wish to use async mode, please use `vllm_mode server` and external vLLM server instead.') - if self.use_vllm and self.vllm_server_host is None: - logger.info('set vllm_mode to colocate since vllm_server_host is not provided') - self.vllm_mode = 'colocate' From 8a22c9b2e66617399bd616d3ef5951c6e5f321c1 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 16:29:44 +0800 Subject: [PATCH 34/68] fix async generate --- swift/trainers/rlhf_trainer/grpo_trainer.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index e41b8808ed..6639fa81a2 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -671,17 +671,9 @@ def done(_self): future.add_done_callback(done) def _prefetch(self, dataloader: DataLoader): - # TODO: asyncio inputs = next(iter(dataloader)) - all_inputs = gather_object(inputs) - if self.accelerator.is_main_process: - with self.multi_turn_completion_length_context(): - outputs = self._infer_single_or_multi_turn(all_inputs, self.request_config) - self._queue.put(DataCache(inputs, outputs)) - else: - self._queue.put(DataCache(inputs, [])) - if self.accelerator.num_processes > 1: - self.accelerator.wait_for_everyone() + outputs = self._infer_single_or_multi_turn(inputs, self.request_config) + self._queue.put(DataCache(inputs, outputs)) def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: if self.vllm_mode == 'colocate' and self.args.sleep_level > 0: From 8ba033090fb8fad7b68ea4b0b054408a1564f03c Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 16:37:03 +0800 Subject: [PATCH 35/68] fix server infer to list --- swift/trainers/rlhf_trainer/grpo_trainer.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 6639fa81a2..46e3be598b 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -16,7 +16,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import datasets -import numpy as np import torch import torch.nn as nn import transformers @@ -33,14 +32,13 @@ from trl.trainer.grpo_trainer import nanmax, nanmin from swift.llm import InferRequest, MultiModelKeys, RequestConfig, RowPreprocessor, get_model_arch, to_device -from swift.llm.template.template_inputs import StdTemplateInputs from swift.plugin import orms from swift.plugin.multi_turn import multi_turns -from swift.utils import (JsonlWriter, gc_collect, get_device, get_logger, get_node_setting, is_lmdeploy_available, - is_vllm_available, is_wandb_available) +from swift.utils import (JsonlWriter, gc_collect, get_device, get_logger, is_lmdeploy_available, is_vllm_available, + is_wandb_available) from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin -from .utils import patch_lora_merge, patch_lora_unmerge, round_robin, unwrap_model_for_generation +from .utils import patch_lora_merge, patch_lora_unmerge, unwrap_model_for_generation del HFGRPOTrainer.__init__ del HFGRPOTrainer.log @@ -533,7 +531,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy all_inputs = gather_object(inputs) if self.accelerator.is_main_process: results = List[ChatCompletionResponse] = self._engine_infer( - infer_requests=all_inputs, request_config=request_config, use_tqdm=False) + 
infer_requests=all_inputs, request_config=request_config) else: results = [None] * len(all_inputs) # Broadcast the results from the main process to all processes, @@ -556,7 +554,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy # confirm that the seed is same in tp group request_config.seed = self.accelerator.process_index // self.vllm_tensor_parallel_size results: List[ChatCompletionResponse] = self._engine_infer( - infer_requests=inputs, request_config=request_config, use_tqdm=False) + infer_requests=inputs, request_config=request_config) if self.vllm_tensor_parallel_size > 1: # Slice completions for this rank within its TP group. @@ -1106,11 +1104,11 @@ def _engine_infer( infer_requests: List[InferRequest], request_config: Optional[RequestConfig] = None, *, - use_tqdm: Optional[bool] = None, + use_tqdm: Optional[bool] = False, ): if self.vllm_mode == 'server': self._process_infer_requests_images(infer_requests) - return self.vllm_client.infer(infer_requests.tolist(), asdict(request_config), use_tqdm=use_tqdm) + return self.vllm_client.infer(infer_requests, asdict(request_config), use_tqdm=use_tqdm) else: return self.engine.infer(infer_requests, request_config, use_tqdm=use_tqdm) From 0926a3c239579e7ef7258a0ee1b6f5cddecf31bd Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 16:43:24 +0800 Subject: [PATCH 36/68] fix server infer --- swift/trainers/rlhf_trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 46e3be598b..e4c0bfc2b8 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -530,7 +530,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy # for server mode, we gather all the inputs and send to remote vllm server in main process all_inputs = gather_object(inputs) if self.accelerator.is_main_process: - results = List[ChatCompletionResponse] = self._engine_infer( + results: List[ChatCompletionResponse] = self._engine_infer( infer_requests=all_inputs, request_config=request_config) else: results = [None] * len(all_inputs) From 0c3827a33035d25b87377f51af99ed33f88b90e9 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 20:25:56 +0800 Subject: [PATCH 37/68] catch async generate error --- swift/trainers/rlhf_trainer/grpo_trainer.py | 27 ++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index e4c0bfc2b8..7112371fba 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -144,6 +144,7 @@ def __init__(self, self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization # only applies to colocation mode self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size # only applies to colocation mode self.loss_type = args.loss_type + self.max_completion_length = args.max_completion_length model.warnings_issued['estimate_tokens'] = True kwargs['data_collator'] = lambda features: features self.shuffle_dataset = args.dataset_shuffle @@ -526,9 +527,14 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy # inputs: local inputs from swift.llm.infer.protocol import ChatCompletionResponse request_config = copy(request_config) + # keys from InferRequest + infer_inputs = { + k: v + for k, v in inputs.items() if k in 
['messages', 'images', 'audios', 'videos', 'tools', 'objects'] + } if self.vllm_mode == 'server': # for server mode, we gather all the inputs and send to remote vllm server in main process - all_inputs = gather_object(inputs) + all_inputs = gather_object(infer_inputs) if self.accelerator.is_main_process: results: List[ChatCompletionResponse] = self._engine_infer( infer_requests=all_inputs, request_config=request_config) @@ -654,17 +660,26 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques return outputs def async_infer(self, inputs): + current_queue = self._queue def infer_task(): - with self.multi_turn_completion_length_context(): - return self._infer_single_or_multi_turn(inputs, self.request_config) + try: + with self.multi_turn_completion_length_context(): + return self._infer_single_or_multi_turn(inputs, self.request_config) + except Exception as e: + logger.error('Inference task failed: %s', str(e)) + raise future: Future = self.executor.submit(infer_task) + # pre-fetch the queue to avoid switching back to eval_queue at the end of training sample sampling - current_queue = self._queue - def done(_self): - current_queue.put(DataCache(inputs, _self.result())) + def done(future): + try: + result = future.result() + current_queue.put(DataCache(inputs, result)) + except Exception as e: + logger.error('Error in async_infer callback: %s', str(e)) future.add_done_callback(done) From fbc2b5475e7c26e8ff02fd33ac16346772dce5f9 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 20:58:07 +0800 Subject: [PATCH 38/68] fix infer inputs --- swift/trainers/rlhf_trainer/grpo_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 7112371fba..88f50c3cba 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -528,10 +528,10 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy from swift.llm.infer.protocol import ChatCompletionResponse request_config = copy(request_config) # keys from InferRequest - infer_inputs = { + infer_inputs = [{ k: v - for k, v in inputs.items() if k in ['messages', 'images', 'audios', 'videos', 'tools', 'objects'] - } + for k, v in inp.items() if k in ['messages', 'images', 'audios', 'videos', 'tools', 'objects'] + } for inp in inputs] if self.vllm_mode == 'server': # for server mode, we gather all the inputs and send to remote vllm server in main process all_inputs = gather_object(infer_inputs) From 57445b49d04854ddee519e32f81babebe51b915a Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 21:19:32 +0800 Subject: [PATCH 39/68] fix async generate --- swift/trainers/rlhf_trainer/grpo_trainer.py | 28 ++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 88f50c3cba..6fdfd05586 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -523,18 +523,24 @@ def reorder_outputs(outputs, distributed_idx): return [index_to_output[idx] for idx in sorted(index_to_output.keys())] - def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsType: + def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_inputs: bool = False) -> OutputsType: # inputs: local inputs from swift.llm.infer.protocol import ChatCompletionResponse 
request_config = copy(request_config) # keys from InferRequest + per_device_size = len(inputs) + if is_global_inputs: + per_device_size / self.accelerator.num_processes infer_inputs = [{ k: v for k, v in inp.items() if k in ['messages', 'images', 'audios', 'videos', 'tools', 'objects'] } for inp in inputs] if self.vllm_mode == 'server': # for server mode, we gather all the inputs and send to remote vllm server in main process - all_inputs = gather_object(infer_inputs) + if is_global_inputs: + all_inputs = infer_inputs + else: + all_inputs = gather_object(infer_inputs) if self.accelerator.is_main_process: results: List[ChatCompletionResponse] = self._engine_infer( infer_requests=all_inputs, request_config=request_config) @@ -544,8 +550,8 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy # ensuring each process receives its corresponding slice. results = broadcast_object_list(results, from_process=0) process_slice = slice( - self.accelerator.process_index * len(inputs), - (self.accelerator.process_index + 1) * len(inputs), + self.accelerator.process_index * per_device_size, + (self.accelerator.process_index + 1) * per_device_size, ) results = results[process_slice] else: @@ -571,7 +577,10 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig) -> OutputsTy return results - def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: RequestConfig) -> OutputsType: + def _infer_single_or_multi_turn(self, + inputs: InputsType, + request_config: RequestConfig, + is_global_inputs: bool = False) -> OutputsType: """Perform multi-turn or single-turn inference Args: @@ -585,7 +594,7 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques """ # infer first turn - results = self._infer(inputs, request_config) + results = self._infer(inputs, request_config, is_global_inputs) if not self.multi_turn_func: # Single-turn: combine completions with messages and retain the finish reason. 
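# Illustrative sketch (hypothetical helper, not from the swift codebase): the
# server-mode branch above gathers inputs, lets the main process query the
# remote vLLM server, broadcasts the results, and then slices out each rank's
# equal share. A minimal stand-alone rendering of that slicing:
def slice_results_for_rank(results, process_index, per_device_size):
    # Equal shares per process: rank k keeps results[k * n : (k + 1) * n].
    process_slice = slice(process_index * per_device_size,
                          (process_index + 1) * per_device_size)
    return results[process_slice]

# Example: 8 gathered completions split across 4 processes, 2 per device.
assert slice_results_for_rank(list(range(8)), 1, 2) == [2, 3]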
@@ -659,13 +668,13 @@ def _infer_single_or_multi_turn(self, inputs: InputsType, request_config: Reques assert not any([o is None for o in outputs]) return outputs - def async_infer(self, inputs): + def async_infer(self, all_inputs): current_queue = self._queue def infer_task(): try: with self.multi_turn_completion_length_context(): - return self._infer_single_or_multi_turn(inputs, self.request_config) + return self._infer_single_or_multi_turn(all_inputs, self.request_config, is_global_inputs=True) except Exception as e: logger.error('Inference task failed: %s', str(e)) raise @@ -706,7 +715,8 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: if self.async_generate: # send this step data to server - self.async_infer(inputs) + all_inputs = gather_object(inputs) + self.async_infer(all_inputs) # get last step data from cache data_cache = self._queue.get() inputs = data_cache.inputs From e2330f928c735c2211f7ca378090edfc4b265740 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Thu, 8 May 2025 21:25:35 +0800 Subject: [PATCH 40/68] fix size --- swift/trainers/rlhf_trainer/grpo_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 6fdfd05586..4babbc5ec3 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -530,7 +530,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in # keys from InferRequest per_device_size = len(inputs) if is_global_inputs: - per_device_size / self.accelerator.num_processes + per_device_size /= self.accelerator.num_processes infer_inputs = [{ k: v for k, v in inp.items() if k in ['messages', 'images', 'audios', 'videos', 'tools', 'objects'] @@ -715,6 +715,7 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: if self.async_generate: # send this step data to server + # we gather inputs outside the thread for prevent potential gather deadlock all_inputs = gather_object(inputs) self.async_infer(all_inputs) # get last step data from cache From 37a06f9f698e50fae035931ab2943a442d7f409a Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Fri, 9 May 2025 11:11:12 +0800 Subject: [PATCH 41/68] remove vllm context --- swift/llm/infer/infer_engine/vllm_engine.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py index 4e968086d6..639ef6f533 100644 --- a/swift/llm/infer/infer_engine/vllm_engine.py +++ b/swift/llm/infer/infer_engine/vllm_engine.py @@ -10,6 +10,7 @@ from packaging import version from tqdm import tqdm from transformers import GenerationConfig +from transformers.utils import is_torch_npu_available from swift.llm import InferRequest, Template, TemplateMeta, get_model_tokenizer from swift.plugin import Metric @@ -98,12 +99,10 @@ def __init__( quantization=quantization, engine_kwargs=engine_kwargs, ) - nnodes = get_node_setting()[1] - total_infer_workers = num_infer_workers * nnodes - context, npu_context = patch_vllm(world_size=total_infer_workers), nullcontext() - if tensor_parallel_size == 1 or pipeline_parallel_size == 1: - npu_context = patch_npu_vllm(self.engine_args.device) - with context, npu_context: + context = nullcontext() + if is_torch_npu_available() and (tensor_parallel_size == 1 or pipeline_parallel_size == 1): + context = patch_npu_vllm(self.engine_args.device) + with context: self._prepare_engine() 
self._load_generation_config() self._fix_vllm_bug() From 66ad138e9473cef429612a25a49aa294d50a3f46 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Fri, 9 May 2025 11:30:14 +0800 Subject: [PATCH 42/68] reward model prepare ds --- swift/trainers/rlhf_trainer/grpo_trainer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 4babbc5ec3..7fe63708b4 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -29,6 +29,7 @@ from transformers.trainer_utils import seed_worker from trl import GRPOTrainer as HFGRPOTrainer from trl.extras.profiling import profiling_decorator +from trl.models import prepare_deepspeed from trl.trainer.grpo_trainer import nanmax, nanmin from swift.llm import InferRequest, MultiModelKeys, RequestConfig, RowPreprocessor, get_model_arch, to_device @@ -266,9 +267,12 @@ def __init__(self, # self.model_accepts_loss_kwargs to False to enable scaling. self.model_accepts_loss_kwargs = False for i, reward_func in enumerate(self.reward_funcs): - if isinstance(reward_func, PreTrainedModel) and is_deepspeed_zero3_enabled(): - from trl.models.utils import prepare_deepspeed - prepare_deepspeed(reward_func, self.accelerator) # Does not wrap DeepSpeedEngine + if isinstance(reward_func, PreTrainedModel): + if self.is_deepspeed_enabled: + self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator) + else: + self.reward_funcs[i] = self.accelerator.prepare_model( + reward_func, evaluation_mode=True, device_placement=True) # Multi-step self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper From f4a05d3bb40b74895d7c4db115a7dbd8d29b6a88 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 11:23:33 +0800 Subject: [PATCH 43/68] lint --- swift/llm/argument/rlhf_args.py | 1 - swift/trainers/rlhf_trainer/grpo_trainer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index f84ef58a60..ef33c113be 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
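# Illustrative sketch (hypothetical helper, not from the swift codebase): the
# preceding `reward model prepare ds` commit wraps PreTrainedModel reward
# models either with DeepSpeed or with the Accelerate engine. The dispatch
# shown there, as a standalone function; `prepare_deepspeed` and
# `Accelerator.prepare_model` are the calls used in that diff.
def prepare_reward_funcs(reward_funcs, accelerator, is_deepspeed_enabled):
    from transformers import PreTrainedModel
    prepared = []
    for reward_func in reward_funcs:
        if isinstance(reward_func, PreTrainedModel):
            if is_deepspeed_enabled:
                from trl.models import prepare_deepspeed
                # Let DeepSpeed manage the frozen reward model for inference.
                reward_func = prepare_deepspeed(reward_func, accelerator)
            else:
                # Reward models only score outputs, so prepare them in eval mode.
                reward_func = accelerator.prepare_model(
                    reward_func, evaluation_mode=True, device_placement=True)
        prepared.append(reward_func)
    return prepared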
import os -import warnings from dataclasses import dataclass, field from typing import Any, Dict, List, Literal, Optional diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 3e8dfdb65d..4db70dfe8a 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -34,7 +34,6 @@ from swift.llm import InferRequest, MultiModelKeys, RequestConfig, RowPreprocessor, get_model_arch, to_device from swift.plugin import multi_turns, orms, rm_plugins - from swift.utils import (JsonlWriter, gc_collect, get_device, get_logger, is_lmdeploy_available, is_vllm_available, is_wandb_available) from ..mixin import SwiftMixin From 2b5198e6ac182aa37e85d8441c7a6940df426f6a Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 15:09:08 +0800 Subject: [PATCH 44/68] fix multi turn + TP --- swift/trainers/rlhf_trainer/grpo_trainer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 4db70dfe8a..e89f4048de 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -588,7 +588,16 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in if self.vllm_tensor_parallel_size > 1: # Gather prompts from all ranks in the TP group and flatten. # Each rank starts with its own prompts; after gathering, all ranks see the full group set. - orig_size = len(inputs) + # Note: The input sizes may differ across ranks (e.g., in multi-turn scenarios, + # the amount of data each rank continues to process may vary). + local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) + local_input_length = len(inputs) + all_input_lengths = [None] * self.vllm_tensor_parallel_size + torch.distributed.all_gather_object(all_input_lengths, local_input_length, group=self.tp_group) + start_idx = sum(all_input_lengths[:local_rank_in_group]) + end_idx = start_idx + all_input_lengths[local_rank_in_group] + + # orig_size = len(inputs)/ gathered_inputs = [None for _ in range(self.vllm_tensor_parallel_size)] torch.distributed.all_gather_object(gathered_inputs, inputs, group=self.tp_group) inputs = [p for sublist in gathered_inputs for p in sublist] @@ -600,9 +609,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in if self.vllm_tensor_parallel_size > 1: # Slice completions for this rank within its TP group. # Each rank generates all outputs — we keep only our share. 
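# Illustrative sketch (hypothetical helper, not from the swift codebase): with
# tensor parallelism plus multi-turn rollout, ranks in a TP group can hold
# different numbers of unfinished samples, so each rank's slice of the gathered
# results must come from the gathered lengths rather than an equal split. A
# plain-list rendering of the offset math in the hunk above, where the lengths
# stand in for the values collected via all_gather_object on the TP group:
def tp_slice_bounds(all_input_lengths, local_rank_in_group):
    # Prefix-sum the lengths contributed by the ranks before this one.
    start_idx = sum(all_input_lengths[:local_rank_in_group])
    end_idx = start_idx + all_input_lengths[local_rank_in_group]
    return start_idx, end_idx

# Example: ranks hold 3, 1 and 2 pending samples; rank 2 keeps results[4:6].
assert tp_slice_bounds([3, 1, 2], 2) == (4, 6)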
- local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) - tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size) - results = results[tp_slice] + results = results[start_idx:end_idx] return results @@ -793,8 +800,6 @@ def _generate_completions(self, inputs: InputsType) -> InputsType: self.model.train() if is_multimodal: self.template.register_post_encode_hook(models) - if isinstance(outputs[0][0], list): - outputs = [output[0] for output in outputs] for i, output in enumerate(outputs): inputs[i]['messages'] = output[0] From a4794656fea58cd959ee61acbaad6d5a426b26fa Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 16:36:53 +0800 Subject: [PATCH 45/68] external path image --- swift/trainers/rlhf_trainer/grpo_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index e89f4048de..3088deba2d 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -24,7 +24,6 @@ from torch.nn import ModuleList from torch.utils.data import DataLoader from transformers import PreTrainedModel, TrainerCallback -from transformers.integrations import is_deepspeed_zero3_enabled from transformers.trainer import Trainer from transformers.trainer_utils import seed_worker from trl import GRPOTrainer as HFGRPOTrainer @@ -39,6 +38,7 @@ from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin from .utils import patch_lora_merge, patch_lora_unmerge, unwrap_model_for_generation +from .vllm_client import VLLMClient del HFGRPOTrainer.__init__ del HFGRPOTrainer.log @@ -230,7 +230,7 @@ def __init__(self, raise ImportError('vLLM is not available and `use_vllm` is set to True. 
' 'Please install vLLM with `pip install vllm -U` to use it.') if self.vllm_mode == 'server': - self.vllm_client = vllm_client + self.vllm_client: VLLMClient = vllm_client elif self.vllm_mode == 'colocate': if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0: raise ValueError( @@ -1154,6 +1154,7 @@ def _engine_infer( return self.engine.infer(infer_requests, request_config, use_tqdm=use_tqdm) def _process_infer_requests_images(self, infer_requests: List[InferRequest]): + # Process image format into a format that session.post can accept import base64 if not any('images' in request for request in infer_requests): return @@ -1163,6 +1164,8 @@ def _process_infer_requests_images(self, infer_requests: List[InferRequest]): for i, img in enumerate(request['images']): if 'bytes' in img and img['bytes']: request['images'][i] = base64.b64encode(img['bytes']).decode('utf-8') + elif 'path' in img and img['path']: + request['images'][i] = img['path'] return @property From 1fb25db0235c9c9a790e1d0ac94420930fb018bf Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 17:28:24 +0800 Subject: [PATCH 46/68] fix async generate and doc --- docs/source/Instruction/GRPO.md | 64 +++++++++++++------- docs/source_en/Instruction/GRPO.md | 65 ++++++++++++++------- swift/trainers/rlhf_trainer/grpo_trainer.py | 4 +- 3 files changed, 91 insertions(+), 42 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index df2d170cbf..efafaa97fe 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -11,7 +11,7 @@ pip install -U trl ``` **更新日志** - +- **2025-05-13** — Internal部分重构,支持vLLM>=0.8 - **2025-05-11** — 支持生成式奖励模型,通过 reward_model_plugin 自定义奖励模型逻辑。有关更多详细信息,请参阅[自定义奖励模型](#自定义奖励模型)部分。 - **2025-04-30** — external vllm server 的启动命令改为 `swift rollout` @@ -27,31 +27,47 @@ pip install -U trl GRPO 训练框架支持集成高性能推理引擎(如 vLLM)来加速采样过程,提供以下两种部署模式: -### 1. 内部集成模式 (Internal) +### 1. Colocate Mode + +- 训练与推理共享GPU资源,在 Trainer 内部启动推理服务, -- 在Trainer内部直接启动推理服务 -- 提供两种资源分配策略: - - **协同模式 (Colocate)**: 训练与推理共享GPU资源 - - **异步模式 (Async)**: 训练与推理使用独立GPU资源 +启动参数 +```bash +--vllm_mode colocate +``` -### GRPO训练资源配置方案 -| 配置场景 | NPROC_PER_NODE | num_infer_workers | 资源分配说明 | -|--------------------------|----------------|------------------|------------------------| -| **Colocate** | =总GPU数 | =总GPU数 | 训练和推理共享全部GPU资源 | -| **Async** | =训练卡数 | =推理卡数 | 必须满足:训练卡数 + 推理卡数 = 总GPU数 | +#### Colocate 模式下的显存优化方案 +在 Colocate 模式下运行时,容易出现显存不足(OOM)的情况。以下是几种有效的显存优化方法和参数配置: -**注:** -1. 在Colocate模式下推荐设置`sleep_level=1`, 在模型训练时释放vLLM占用显存 -2. 总GPU数指可见的GPU设备总数 +1. 在训练阶段,释放 vLLM 占用的显存: -### 2. 外部服务模式 (External) -连接外部的 vLLM 推理服务器 -使用时,使用以下参数配置外部 vLLM 服务器 ```bash ---vllm_server_host <服务器IP> \ ---vllm_server_port <服务端口> \ ---vllm_server_timeout <超时时间> \ +--sleep_level 1 +``` + +2. 在vLLM 推理阶段,释放训练模型和优化器占用的显存: + +```bash +--offload_optimizer true \ +--offload_model true \ +--gc_collect_after_offload true \ +``` + +3. 在vLLM中使用 Tensor Parallel 技术: + +```bash +--tensor_parallel_size [tp_size] +``` + +4. 分批 Gather 模型权重(zero3下同步 vLLM 权重时): +```bash +--move_model_batches [批次数量] ``` + +### 2. 
Async Mode + +- 训练与推理资源分离,在外面启动单独的推理服务器 + 使用`swift rollout`命令部署vLLM 服务器, 现仅支持vLLM backend ```bash CUDA_VISIBLE_DEVICES=2 \ @@ -59,6 +75,14 @@ swift rollout \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --tensor_parallel_size 2 \ ``` + +训练使用以下参数配置外部 vLLM 服务器 +```bash +--vllm_server_host <服务器IP> \ +--vllm_server_port <服务端口> \ +--vllm_server_timeout <超时时间> \ +``` + 完整脚本可以参考[这里](../../../examples/train/grpo/multi_node/Qwen2_5_32B_full.sh) diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 153dfe1706..f6c661645d 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -29,42 +29,67 @@ pip install -U trl The GRPO training framework supports the integration of high-performance inference engines (such as vLLM) to accelerate the sampling process, offering the following two deployment modes: -### 1. Internal Integration Mode +### 1. Colocate Mode -- Launch the inference service directly within the Trainer. -- Provides two resource allocation strategies: - - **Colocate Mode**: Training and inference share GPU resources. - - **Async Mode**: Training and inference use separate GPU resources. +Training and inference share GPU resources; the inference service is started internally within the Trainer. -### GRPO Training Resource Allocation Scheme +Launch Parameters +```bash +--vllm_mode colocate +``` -| Configuration Scenario | NPROC_PER_NODE | num_infer_workers | Resource Allocation Description | -|-------------------------|----------------|-------------------|---------------------------------------| -| **Colocate** | = Total GPUs | = Total GPUs | Training and inference share all GPU resources. | -| **Async** | = Training GPUs| = Inference GPUs | Must satisfy: Training GPUs + Inference GPUs = Total GPUs. | +#### Memory Optimization Strategies in Colocate Mode +When running in Colocate Mode , out-of-memory (OOM) errors are common due to simultaneous training and inference workloads. Below are effective memory optimization strategies and configuration parameters: -**Note:** -1. In Colocate mode, it is recommended to set `sleep_level=1` to release the GPU memory occupied by vLLM during model training. -2. Total GPUs refers to the total number of visible GPU devices. +1. Release vLLM memory during training: -### 2. External Service Mode +```bash +--sleep_level 1 +``` -Connect to an external vLLM inference server. -When using this mode, configure the external vLLM server with the following parameters: +2. Offload training model and optimizer memory during vLLM inference: ```bash ---vllm_server_host \ ---vllm_server_port \ ---vllm_server_timeout \ +--offload_optimizer true \ +--offload_model true \ +--gc_collect_after_offload true \ ``` -Deploy the vLLM server using the `swift rollout` command. Currently, only the vLLM backend is supported. +3. Use Tensor Parallelism in vLLM: + +```bash +--tensor_parallel_size [tp_size] +``` + +4. Batched gathering of model weights (when synchronizing vLLM weights under ZeRO-3): + +```bash +--move_model_batches [number_of_batches] +``` + + +### 2. Async Mode + +Training and inference use separate resources; a dedicated inference server is launched externally. + +Deploy the vLLM server using the swift rollout command. 
Currently, only the vLLM backend is supported: + ```bash CUDA_VISIBLE_DEVICES=2 \ swift rollout \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --tensor_parallel_size 2 \ ``` + +Use the following parameters in training to connect to an external vLLM server: + +```bash +--vllm_mode server \ +--vllm_server_host \ +--vllm_server_port \ +--vllm_server_timeout \ +``` + The complete script can be found [here](../../../examples/train/grpo/multi_node/Qwen2_5_32B_full.sh) . ## Reward Functions diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 3088deba2d..90c211889b 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -559,7 +559,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in # keys from InferRequest per_device_size = len(inputs) if is_global_inputs: - per_device_size /= self.accelerator.num_processes + per_device_size //= self.accelerator.num_processes infer_inputs = [{ k: v for k, v in inp.items() if k in ['messages', 'images', 'audios', 'videos', 'tools', 'objects'] @@ -722,7 +722,7 @@ def infer_task(): def done(future): try: result = future.result() - current_queue.put(DataCache(inputs, result)) + current_queue.put(DataCache(all_inputs, result)) except Exception as e: logger.error('Error in async_infer callback: %s', str(e)) From 7394dc90e6feff2b49bc8488821e25aa387b95ce Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 18:09:14 +0800 Subject: [PATCH 47/68] update doc --- docs/source/Instruction/GRPO.md | 217 +----------------- ...44\350\241\214\345\217\202\346\225\260.md" | 4 +- docs/source_en/Instruction/GRPO.md | 15 +- swift/llm/argument/rlhf_args.py | 6 +- swift/trainers/arguments.py | 5 +- 5 files changed, 24 insertions(+), 223 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index efafaa97fe..a8c2dd3be0 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -11,7 +11,7 @@ pip install -U trl ``` **更新日志** -- **2025-05-13** — Internal部分重构,支持vLLM>=0.8 +- **2025-05-13** — Internal部分代码重构,支持vLLM>=0.8 - **2025-05-11** — 支持生成式奖励模型,通过 reward_model_plugin 自定义奖励模型逻辑。有关更多详细信息,请参阅[自定义奖励模型](#自定义奖励模型)部分。 - **2025-04-30** — external vllm server 的启动命令改为 `swift rollout` @@ -60,6 +60,7 @@ GRPO 训练框架支持集成高性能推理引擎(如 vLLM)来加速采样 ``` 4. 分批 Gather 模型权重(zero3下同步 vLLM 权重时): + ```bash --move_model_batches [批次数量] ``` @@ -202,12 +203,11 @@ A conversation between User and Assistant. The user asks a question, and the Ass - num_iterations: 每个批次代更新次数,默认为1. - epsilon: clip 系数,默认为0.2. - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围. -- async_generate: 异步rollout以提高训练速度,默认`false`. +- async_generate: 异步rollout以提高训练速度,仅支持async mode,默认`false`. - sleep_level: vllm特有参数,在训练和rollout复用卡的时候,可以选择vllm进行offload. -- move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个 +- move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个. 
- offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False - - 注意:若该参数设置为True,训练时grad_norm一直为0,请安装`vllm==0.7.3` - gc_collect_after_offload: 是否在offload结束时进行gc(python gc和GPU gc),默认为False - multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现 - dynamic_sample:筛除group内奖励标准差为0的数据,额外采样新数据,默认为False。 @@ -220,165 +220,7 @@ A conversation between User and Assistant. The user asks a question, and the Ass 奖励函数参数,见[内置奖励函数](#内置奖励函数) -可以使用vLLM、LMDeploy作为采样后端加速训练 -多卡vLLM -```bash -# async mode -# 要求 num_infer_workers(部署) + NPROC_PER_NODE(训练) = device_count -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=7 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --reward_funcs accuracy format \ - --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ - --num_infer_workers 1 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#5000' \ - --max_completion_length 2048 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --learning_rate 1e-6 \ - --gradient_accumulation_steps 2 \ - --eval_steps 200 \ - --save_steps 200 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 4096 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 7 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero2 \ - --log_completions true - -# colocate mode -# 要求 num_infer_workers(部署) = NPROC_PER_NODE(训练) = device_count -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=8 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-1.5B \ - --reward_funcs accuracy format \ - --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.5 \ - --vllm_max_model_len 8192 \ - --num_infer_workers 8 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#5000' \ - --max_completion_length 2048 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --learning_rate 1e-6 \ - --gradient_accumulation_steps 2 \ - --eval_steps 200 \ - --save_steps 200 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 4096 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 8 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero2 \ - --log_completions true \ - --sleep_level 1 \ - --offload_model true \ - --offload_optimizer true \ - --gc_collect_after_offload true \ - --log_completions true -``` - - -单卡 -```bash -# PT backend -CUDA_VISIBLE_DEVICES=0 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --reward_funcs accuracy format \ - --train_type lora \ - --lora_rank 8 \ - --lora_alpha 32 \ - --target_modules all-linear \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#1000' \ - --max_completion_length 1024 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-5 \ - --gradient_accumulation_steps 1 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 4 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - 
--log_completions true - -# vLLM backend -CUDA_VISIBLE_DEVICES=0 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --vllm_gpu_memory_utilization 0.5 \ - --use_vllm true \ - --sleep_level 1 \ - --offload_model true \ - --offload_optimizer true \ - --gc_collect_after_offload true \ - --reward_funcs accuracy format \ - --train_type lora \ - --lora_rank 8 \ - --lora_alpha 32 \ - --target_modules all-linear \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#1000' \ - --max_completion_length 1024 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-5 \ - --gradient_accumulation_steps 1 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 4 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --log_completions true -``` -多机训练参考[这里](../../../examples/train/grpo/multi_node/) - -注:内部集成模式下,需要不同节点的GPU配置以及训练参数相同 +运行脚本参考[这里](../../../examples/train/grpo/) ## 自定义奖励模型 默认情况下,奖励模型指的是包含数值头的分类模型(通常称为输出奖励模型(ORM))。这些模型对其他模型的输出进行评分,产生一个标量值,表示模型响应的质量。 @@ -408,7 +250,6 @@ swift rlhf \ --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \ --reward_model_plugin genrm my_rmplugin \ --reward_weights 0.1 1 1 \ - --num_infer_workers 8 \ --vllm_gpu_memory_utilization 0.5 \ --sleep_level 1 \ --offload_model true \ @@ -431,55 +272,15 @@ swift rlhf \ - Token level Loss - Soft Overlong Punishment -其中Token level Loss是默认实现,不用额外设置。对于其余trick,我们可以基于GRPOTrainer,设置以下参数实现。 +以上trick,我们可以基于GRPOTrainer,设置以下参数实现。 + +其中Token level Loss是通过使用参数 loss type `bnpo` 实现 | 参数 | 类型 | 值 | |----------------------|-----------|-------------| +|`--loss_type` | `str` | `bnpo` | | `--epsilon_high` | `float` | `0.28` | | `--dynamic_sample` | `bool` | `true` | | `--overlong_filter` | `bool` | `true` | | `--reward_funcs` | `str` | `soft_overlong`| | `--max_resample_times` | `int` | `3` | - -参考训练脚本(八卡colocate mode) -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=8 \ -WANDB_API_KEY=xxx \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-1.5B \ - --reward_funcs accuracy soft_overlong \ - --max_completion_length 4096 \ - --soft_cache_length 819 \ - --epsilon 0.2 \ - --epsilon_high 0.28 \ - --dynamic_sample true \ - --overlong_filter true \ - --max_resample_times 3 \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.6 \ - --num_infer_workers 8 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset AI-MO/NuminaMath-TIR#5000 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-6 \ - --eval_steps 1000 \ - --save_steps 1000 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 8 \ - --temperature 1.0 \ - --top_p 1.0 \ - --deepspeed zero2 \ - --log_completions true \ - --num_iterations 1 \ - --report_to tensorboard wandb \ - --beta 0.0 \ -``` diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index d4cb475f58..254fef34ad 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ 
"b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -408,9 +408,7 @@ reward模型参数将在PPO、GRPO中使用。 - loss_type: loss 归一化的类型,可选项为['grpo', 'bnpo', 'dr_grpo'], 默认为'grpo', 具体查看该[pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348)。 - log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb` 使用。默认为False。 - 提示:若没有设置`--report_to wandb`,则会在checkpoint中创建`completions.jsonl`来存储生成内容。 -- use_vllm: 是否使用vLLM作为GRPO生成的infer_backend,默认为False。 -- num_infer_workers: 每个node上推理worker数量,仅对vllm或者lmdeploy时有效。 -- vllm_device: 设置vLLM部署的设备,可以设置为`auto`,代表按照num_infer_workers数量使用最后的几张卡,否则请传入和num_infer_workers相等数量的设备,例如`--vllm_device cuda:1 cuda:2`。 +- use_vllm: 是否使用 vLLM 作为 GRPO 生成的 infer_backend,默认为False。 - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。 - vllm_max_model_len: vllm透传参数,默认为None。 - vllm_max_num_seqs: vllm透传参数,默认为256。 diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index f6c661645d..477ae6d860 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -35,6 +35,7 @@ Training and inference share GPU resources; the inference service is started int Launch Parameters ```bash +--use_vllm true \ --vllm_mode colocate ``` @@ -52,13 +53,13 @@ When running in Colocate Mode , out-of-memory (OOM) errors are common due to sim ```bash --offload_optimizer true \ --offload_model true \ ---gc_collect_after_offload true \ +--gc_collect_after_offload true ``` 3. Use Tensor Parallelism in vLLM: ```bash ---tensor_parallel_size [tp_size] +--vllm_tensor_parallel_size [tp_size] ``` 4. Batched gathering of model weights (when synchronizing vLLM weights under ZeRO-3): @@ -84,6 +85,7 @@ swift rollout \ Use the following parameters in training to connect to an external vLLM server: ```bash +--use_vllm true \ --vllm_mode server \ --vllm_server_host \ --vllm_server_port \ @@ -194,17 +196,16 @@ Arguments - loss_type: The type of loss normalization. Options are ['grpo', 'bnpo', 'dr_grpo'], default is 'grpo'. For details, see this [pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348) - log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb`, default is False. - Note: If `--report_to wandb` is not set, a `completions.jsonl` will be created in the checkpoint to store the generated content. -- use_vllm: Whether to use vLLM as the back-end for sampling generation; default is False, using it is recommended to speed up training. -- vllm_device: Device for deploying vLLM, default is auto, meaning the first unused GPU. Use cuda:x to specify a particular card. +- use_vllm: Whether to use vLLM as the back-end for sampling generation; default is False, using pt(pytorch) engine to rollout. +- vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or `"colocate"` - vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. -- vllm_max_model_len: vLLM passthrough parameter, default is None. -- vllm_max_num_seqs: vLLM passthrough parameter, default is 256. +- vllm_max_model_len: used in colocate mode, vLLM passthrough parameter, the total length limit of model, default is None. - vllm_enforce_eager: vLLM passthrough parameter, default is False. - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. -- vllm_enable_prefix_caching: vLLM passthrough parameter, default is True. - vllm_server_host: The host address of the vLLM server. Default is None. 
This is used when connecting to an external vLLM server. - vllm_server_port: The service port of the vLLM server. Default is 8000. - vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds. +- vllm_tensor_parallel_size: used in colocate mode, the tensor parallel size of vLLM engine, default is 1. - num_iterations: number of iterations per batch. Default is 1. - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index ef33c113be..e0a6d01505 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -270,9 +270,9 @@ def _deprecated_warning(self): if self.vllm_device is not None: logger.warning("The parameter 'vllm_device' has been deprecated and will be removed in version 3.6. ") - if self.vllm_enable_prefix_caching is not None: - logger.warning( - "The parameter 'vllm_enable_prefix_caching' has been deprecated and will be removed in version 3.6. ") + if self.vllm_max_num_seqs is not None: + logger.warning("The parameter 'vllm_max_num_seqs' is automatically set, " + 'and has been deprecated and will be removed in version 3.6. ') if self.num_infer_workers is not None: logger.warning( diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index b1bfe3bc81..ccb932a957 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -151,15 +151,16 @@ class GRPOArgumentsMixin: vllm_device: Optional[List[str]] = None # deprecated vllm_gpu_memory_utilization: float = 0.9 vllm_max_model_len: Optional[int] = None + vllm_max_num_seqs: Optional[int] = None # deprecated vllm_enforce_eager: bool = False vllm_limit_mm_per_prompt: Optional[Union[dict, str]] = None # '{"image": 5, "video": 2}' - vllm_enable_prefix_caching: Optional[bool] = None # deprecated + vllm_enable_prefix_caching: bool = True vllm_tensor_parallel_size: int = 1 # external vllm (server) vllm_server_host: Optional[str] = None vllm_server_port: int = 8000 vllm_server_timeout: float = 240.0 - vllm_client = None + vllm_client = None # Not required to set, used for client instantiation # reward function args, see details in swift/plugin/orm.py # cosine reward, https://arxiv.org/abs/2502.03373 From 4160ad3d2ae0ac843d26cd793db8baf2eb9b3951 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 18:16:42 +0800 Subject: [PATCH 48/68] remove async mode script --- examples/train/grpo/external/README.md | 2 +- examples/train/grpo/internal/README.md | 13 +----- examples/train/grpo/internal/full_lmdeploy.sh | 41 ------------------- examples/train/grpo/internal/full_vllm.sh | 38 ----------------- .../train/grpo/internal/full_vllm_qwenvl.sh | 41 ------------------- 5 files changed, 2 insertions(+), 133 deletions(-) delete mode 100644 examples/train/grpo/internal/full_lmdeploy.sh delete mode 100644 examples/train/grpo/internal/full_vllm.sh delete mode 100644 examples/train/grpo/internal/full_vllm_qwenvl.sh diff --git a/examples/train/grpo/external/README.md b/examples/train/grpo/external/README.md index e71c0807af..97bf7f4125 100644 --- a/examples/train/grpo/external/README.md +++ b/examples/train/grpo/external/README.md @@ -7,7 +7,7 @@ ## **Introduction** -The GRPO (Gradient-based Reinforcement Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. 
The **External Mode** allows you to connect to an external vLLM inference server, separating the inference service from the training process. This mode is ideal for scenarios where you want to offload inference to dedicated hardware or servers, improving resource utilization and scalability. +The GRPO (Group Relative Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. The **External Mode** allows you to connect to an external vLLM inference server, separating the inference service from the training process. This mode is ideal for scenarios where you want to offload inference to dedicated hardware or servers, improving resource utilization and scalability. This folder contains scripts and instructions for running GRPO in **External Mode**, enabling integration with an external vLLM server. diff --git a/examples/train/grpo/internal/README.md b/examples/train/grpo/internal/README.md index d15220fc6c..ba25514d08 100644 --- a/examples/train/grpo/internal/README.md +++ b/examples/train/grpo/internal/README.md @@ -2,20 +2,9 @@ --- -## Known Issues -Bugs in **vLLM >= 0.8** -1. DeepSpeed ZeRO-3 Mode : - When using DeepSpeed's ZeRO-3 configuration, gradients may become zero during training. - -2. Async Mode - In certain scenarios, the asynchronous mode (Async Mode) may hang, causing the program to become unresponsive. - -To ensure stability and compatibility, it is recommended to use **vLLM 0.7.3** to avoid the above issues. - - ## **Introduction** -The GRPO (Gradient-based Reinforcement Policy Optimization) training framework supports integrating high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows the inference service to be directly launched within the Trainer, reducing external dependencies and simplifying deployment. +The GRPO (Group Relative Policy Optimization) training framework supports integrating high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows the inference service to be directly launched within the Trainer, reducing external dependencies and simplifying deployment. This folder contains scripts and instructions for running GRPO in **Internal Mode**, where the model training and inference are tightly integrated with flexible resource allocation strategies. 
diff --git a/examples/train/grpo/internal/full_lmdeploy.sh b/examples/train/grpo/internal/full_lmdeploy.sh deleted file mode 100644 index feda75e9b4..0000000000 --- a/examples/train/grpo/internal/full_lmdeploy.sh +++ /dev/null @@ -1,41 +0,0 @@ -# A800 * 8 -# pip install lmdeploy==0.7.1 -# exp link: https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz -# In exp no `--system 'examples/train/grpo/prompt.txt'`, so the format reward is not correct and there are speed diffs with this script -# important args: --num_infer_workers 2 --num_iterations 2 --use_lmdeploy true --async_generate true -# if forward/backward error: pip install deepspeed==0.14.5 -# and change deepspeed zero3.json stage3_prefetch_bucket_size=0 -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=6 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --reward_funcs accuracy format \ - --use_lmdeploy true \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset AI-MO/NuminaMath-TIR#5000 \ - --max_completion_length 1536 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 10 \ - --per_device_eval_batch_size 10 \ - --learning_rate 1e-6 \ - --eval_steps 1000 \ - --save_steps 1000 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 60 \ - --temperature 1.0 \ - --top_p 0.9 \ - --top_k 50 \ - --async_generate true \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero3 \ - --log_completions true \ - --num_iterations 2 \ - --num_infer_workers 2 \ diff --git a/examples/train/grpo/internal/full_vllm.sh b/examples/train/grpo/internal/full_vllm.sh deleted file mode 100644 index df5c59fcdd..0000000000 --- a/examples/train/grpo/internal/full_vllm.sh +++ /dev/null @@ -1,38 +0,0 @@ -# One GPU is left for vLLM inference acceleration. -# pip install math_verify # reward function -# pip install -U trl -# GPU memory: 8 * 80GiB - -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=7 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B-Instruct \ - --reward_funcs accuracy format \ - --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#5000' \ - --max_completion_length 2048 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --learning_rate 1e-6 \ - --gradient_accumulation_steps 2 \ - --eval_steps 200 \ - --save_steps 200 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 4096 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 7 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero2 \ - --log_completions true diff --git a/examples/train/grpo/internal/full_vllm_qwenvl.sh b/examples/train/grpo/internal/full_vllm_qwenvl.sh deleted file mode 100644 index 1dcfca57a9..0000000000 --- a/examples/train/grpo/internal/full_vllm_qwenvl.sh +++ /dev/null @@ -1,41 +0,0 @@ -# Two GPUs are left for vLLM inference acceleration. 
-# pip install math_verify # reward function -# pip install -U trl -# GPU memory: 8 * 60GiB - -MAX_PIXELS=602112 \ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=6 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-VL-3B-Instruct \ - --external_plugins examples/train/grpo/plugin/plugin.py \ - --reward_funcs external_r1v_acc format \ - --use_vllm true \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset lmms-lab/multimodal-open-r1-8k-verified \ - --max_completion_length 1536 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-7 \ - --eval_steps 1000 \ - --save_steps 1000 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 24 \ - --temperature 1.0 \ - --top_p 0.9 \ - --top_k 50 \ - --async_generate true \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero2 \ - --log_completions true \ - --num_iterations 1 \ - --num_infer_workers 2 \ - --report_to tensorboard wandb From 47bb902c4b313a1461b592fa1511cc12d16b725b Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 23:36:17 +0800 Subject: [PATCH 49/68] doc wip and deprecate patch --- ...43\347\240\201\350\256\255\347\273\203.md" | 2 - ...41\346\200\201\350\256\255\347\273\203.md" | 1 - docs/source/Instruction/GRPO.md | 4 +- ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- .../GRPO-Multi-Modal-Training.md | 9 +- .../Instruction/Command-line-parameters.md | 5 +- docs/source_en/Instruction/GRPO.md | 160 +----------------- examples/train/grpo/external/grpo.sh | 3 +- examples/train/grpo/internal/README.md | 37 ---- .../grpo/internal/multi_gpu_mp_colocate.sh | 2 - .../train/grpo/internal/train_72b_4gpu.sh | 1 - .../train/grpo/internal/train_multi_round.sh | 1 - .../train/grpo/multi_node/Qwen2_5_32B_full.sh | 1 - examples/train/grpo/multi_node/multi_node1.sh | 4 +- examples/train/grpo/multi_node/multi_node2.sh | 6 +- examples/train/grpo/multi_node/train_dlc.sh | 1 - ...rnal_rm.sh => run_external_reward_func.sh} | 0 .../grpo/plugin/run_external_reward_model.sh | 19 +++ swift/llm/argument/rlhf_args.py | 6 - swift/llm/infer/infer_engine/utils.py | 116 ------------- swift/llm/infer/infer_engine/vllm_engine.py | 3 +- 21 files changed, 34 insertions(+), 349 deletions(-) delete mode 100644 examples/train/grpo/internal/README.md rename examples/train/grpo/plugin/{run_external_rm.sh => run_external_reward_func.sh} (100%) create mode 100644 examples/train/grpo/plugin/run_external_reward_model.sh diff --git "a/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" "b/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" index 40cf9ef874..78e892a585 100644 --- "a/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" +++ "b/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" @@ -73,7 +73,6 @@ swift rlhf \ --dataset_num_proc 4 \ --num_generations 14 \ --temperature 0.9 \ - --num_infer_workers 1 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero2 \ --log_completions true \ @@ -123,7 +122,6 @@ swift rlhf \ --dataset_num_proc 4 \ --num_generations 14 \ --temperature 0.9 \ - --num_infer_workers 1 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero2 \ --log_completions true \ diff --git "a/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" 
"b/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" index ffbd1e4e5c..6199f6bd13 100644 --- "a/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" +++ "b/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" @@ -143,7 +143,6 @@ swift rlhf \ --log_completions true \ --report_to wandb \ --num_iterations 1 \ - --num_infer_workers 2 \ --async_generate false \ --beta 0.001 \ diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index a8c2dd3be0..837cfd1061 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -102,7 +102,7 @@ orms['dummy']= DummyLengthRewardFunction 可以在`swift/examples/train/grpo/plugin/plugin.py`中加入该奖励函数,使用参数`--external_plugins examples/train/grpo/plugin/plugin.py`进行注册,并通过 reward_funcs 参数进行指定 -执行脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_rm.sh) +执行脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_reward_func.sh) ### 内置奖励函数 swift内置了五种基于规则的奖励函数(代码见swift/plugin/orm.py) @@ -278,7 +278,7 @@ swift rlhf \ | 参数 | 类型 | 值 | |----------------------|-----------|-------------| -|`--loss_type` | `str` | `bnpo` | +|`--loss_type` | `str` | `bnpo` | | `--epsilon_high` | `float` | `0.28` | | `--dynamic_sample` | `bool` | `true` | | `--overlong_filter` | `bool` | `true` | diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 254fef34ad..fdb953551f 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -12,7 +12,7 @@ - 🔥tuner_backend: 可选为'peft','unsloth'。默认为'peft'。 - 🔥train_type: 可选为: 'lora'、'full'、'longlora'、'adalora'、'llamapro'、'adapter'、'vera'、'boft'、'fourierft'、'reft'。默认为'lora'。 - 🔥adapters: 用于指定adapter的id/path的list,默认为`[]`。 -- external_plugins: 外部plugin py文件列表,这些文件会被注册进plugin模块中,例子请参见[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_rm.sh)。 +- external_plugins: 外部plugin py文件列表,这些文件会被注册进plugin模块中,例子请参见[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_reward_func.sh)。 - seed: 默认为42。 - model_kwargs: 特定模型可传入的额外参数,该参数列表会在训练推理时打印日志进行提示。例如`--model_kwargs '{"fps_max_frames": 12}'`。 - load_args: 当指定`--resume_from_checkpoint`、`--model`、`--adapters`会读取保存文件中的`args.json`,将默认为None的`基本参数`(除去数据参数和生成参数)进行赋值(可通过手动传入进行覆盖)。推理和导出时默认为True,训练时默认为False。 diff --git a/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md b/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md index 4f37a07f04..7a584e56f6 100644 --- a/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md +++ b/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md @@ -112,13 +112,13 @@ orms['external_r1v_acc'] = MultiModalAccuracyORM #### **Training Parameters** -We selected `Qwen2.5-VL-3B-Instruct` as the base model for training. The main reason for choosing the `Instruct` model over the base model is to rapidly achieve format rewards. Experiments were conducted on 8 GPUs. SWIFT GRPO training supports multi-GPU deployment to accelerate rollouts, so we set `num_infer_workers` to 2 and processes to 6 (2 GPUs for deployment, 6 GPUs for training). 
If you encounter deployment errors for `qwen2.5-vl` on `vllm`, refer to [this issue](https://github.com/vllm-project/vllm/issues/13285). +We selected `Qwen2.5-VL-3B-Instruct` as the base model for training. The main reason for choosing the `Instruct` model over the base model is to rapidly achieve format rewards. Experiments were conducted on 8 GPUs. SWIFT GRPO training supports multi-GPU deployment to accelerate rollouts. If you encounter deployment errors for `qwen2.5-vl` on `vllm`, refer to [this issue](https://github.com/vllm-project/vllm/issues/13285). Since the task is simple, we set `max_completion_length` to 1024 and selected `external_r1v_acc` and `format` as reward functions. The learning rate and beta are set to `1e-6` and `0.001`, respectively. Other configurations are as follows. The settings for `batch_size` and `num_generations` can be referenced from [GRPO Full Workflow](./GRPO完整流程.md). ```shell WANDB_API_KEY=your_wandb_api_key \ -NPROC_PER_NODE=6 \ +NPROC_PER_NODE=8 \ swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-VL-3B-Instruct \ @@ -146,14 +146,13 @@ swift rlhf \ --output_dir output/GRPO_CLEVR_COUNTDOWN \ --warmup_ratio 0.01 \ --dataloader_num_workers 4 \ - --num_generations 24 \ + --num_generations 32 \ --temperature 1.0 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero3 \ --log_completions true \ --report_to wandb \ --num_iterations 1 \ - --num_infer_workers 2 \ --async_generate false \ --beta 0.001 \ ``` @@ -232,7 +231,6 @@ swift rlhf \ --log_completions true \ --report_to wandb \ --num_iterations 2 \ - --num_infer_workers 2 \ --async_generate false \ --beta 0.001 \ --max_grad_norm 0.5 \ @@ -321,7 +319,6 @@ swift rlhf \ --log_completions true \ --report_to wandb \ --num_iterations 2 \ - --num_infer_workers 2 \ --async_generate false \ --beta 0.001 \ --max_grad_norm 0.5 \ diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 2cdaafa4f1..6b36ece8a4 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -13,7 +13,7 @@ Hints: - 🔥tuner_backend: Options are 'peft', 'unsloth'. Default is 'peft'. - 🔥train_type: Options are: 'lora', 'full', 'longlora', 'adalora', 'llamapro', 'adapter', 'vera', 'boft', 'fourierft', 'reft'. Default is 'lora'. - 🔥adapters: A list used to specify the id/path of the adapter. Default is `[]`. -- external_plugins: A list of external plugin py files which will be registered into the plugin mappings,please check [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_rm.sh). +- external_plugins: A list of external plugin py files which will be registered into the plugin mappings,please check [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_reward_func.sh). - seed: Default is 42. - model_kwargs: Additional parameters specific to the model that can be passed in. This list of parameters will log a message during training and inference for reference. For example, `--model_kwargs '{"fps_max_frames": 12}'`. - load_args: When specifying `--resume_from_checkpoint`, `--model`, or `--adapters`, it will read the `args.json` file saved in the checkpoint, assigning values to the default None `basic arguments` (excluding data and generation arguments) which can be overridden by manually passing them in. The default is True for inference and export, and False for training. 
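
The `external_plugins` argument mentioned above points at a plain Python file whose reward classes are registered into the plugin mappings and then selected by name through `--reward_funcs`. A minimal sketch of such a file is shown below; the class name, reward rule and the `ORM`/`orms` import are assumptions modeled on the registration pattern quoted in these docs, not a copy of `swift/plugin/orm.py`.

```python
# plugin.py: a hypothetical external plugin file, passed via `--external_plugins path/to/plugin.py`.
# Import path and base class are assumptions following the `orms[...] = ...` pattern in these docs.
from swift.plugin import ORM, orms


class DummyLengthReward(ORM):
    """Toy rule-based reward: favor completions longer than 50 characters."""

    def __call__(self, completions, **kwargs):
        # Return one float per sampled completion; GRPO turns these into group-relative advantages.
        return [1.0 if len(completion) > 50 else 0.0 for completion in completions]


# Register under a name that can then be selected with `--reward_funcs dummy_length`.
orms['dummy_length'] = DummyLengthReward
```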
@@ -421,11 +421,8 @@ The meanings of the following parameters can be referenced [here](https://huggin - log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb`, default is False. - Note: If `--report_to wandb` is not set, a `completions.jsonl` will be created in the checkpoint to store the generated content. - use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False. -- num_infer_workers: The number of inference workers per node. This setting is only effective when using vLLM or lmdeploy. -- vllm_device: Configures the devices for deploying vLLM. You can set it to auto, which will allocate the last few GPUs based on the value of num_infer_workers. Alternatively, specify a number of devices equal to num_infer_workers. For example: --vllm_device cuda:1 cuda:2. - vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. - vllm_max_model_len: vLLM passthrough parameter, default is None. -- vllm_max_num_seqs: vLLM passthrough parameter, default is 256. - vllm_enforce_eager: vLLM passthrough parameter, default is False. - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_enable_prefix_caching: vLLM passthrough parameter, default is True. diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 477ae6d860..3207ce7ff2 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -109,7 +109,7 @@ orms['dummy']= DummyLengthRewardFunction ``` You can add this reward function in `swift/examples/train/grpo/plugin/plugin.py` and register it using the parameter `--external_plugins examples/train/grpo/plugin/plugin.py`, then specify it using the reward_funcs parameter. -For an example of how to execute the script, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_rm.sh). +For an example of how to execute the script, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_reward_func.sh). @@ -224,161 +224,6 @@ The hyperparameters for the reward function can be found in the [Built-in Reward You can use vLLM and LMDeploy as sampling backends to accelerate training. -Multi-GPU vLLM -```bash -# async mode -# The requirement is that num_infer_workers (deployment) + NPROC_PER_NODE (training) = device_count. -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=7 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --reward_funcs accuracy format cosine repetition\ - --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ - --num_infer_workers 1 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#5000' \ - --max_completion_length 2048 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --learning_rate 1e-6 \ - --gradient_accumulation_steps 2 \ - --eval_steps 200 \ - --save_steps 200 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 4096 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 7 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero2 \ - --log_completions true - -# colocate mode -# The requirement is that num_infer_workers (deployment) = NPROC_PER_NODE (training) = device_count. 
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=8 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-1.5B \ - --reward_funcs accuracy format \ - --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ - --num_infer_workers 8 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#5000' \ - --max_completion_length 2048 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --learning_rate 1e-6 \ - --gradient_accumulation_steps 2 \ - --eval_steps 200 \ - --save_steps 200 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 4096 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 8 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero2 \ - --log_completions true \ - --sleep_level 1 \ - --offload_model true \ - --offload_optimizer true \ - --gc_collect_after_offload true \ - --log_completions true \ -``` - -Single-GPU -```bash -# PT backend -CUDA_VISIBLE_DEVICES=0 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --reward_funcs accuracy format cosine repetition\ - --train_type lora \ - --lora_rank 8 \ - --lora_alpha 32 \ - --target_modules all-linear \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#1000' \ - --max_completion_length 1024 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-5 \ - --gradient_accumulation_steps 1 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 4 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --log_completions true - -# vLLM backend -CUDA_VISIBLE_DEVICES=0 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --vllm_gpu_memory_utilization 0.5 \ - --use_vllm true \ - --sleep_level 1 \ - --offload_model true \ - --offload_optimizer true \ - --gc_collect_after_offload true \ - --reward_funcs accuracy format \ - --train_type lora \ - --lora_rank 8 \ - --lora_alpha 32 \ - --target_modules all-linear \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#1000' \ - --max_completion_length 1024 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-5 \ - --gradient_accumulation_steps 1 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 4 \ - --temperature 0.9 \ - --system 'examples/train/grpo/prompt.txt' \ - --log_completions true -``` - For multi-node training, refer to [here](../../../examples/train/grpo/multi_node/) . Note : In the internal integration mode, the GPU configurations and training parameters must be identical across different nodes. 
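
With the inline examples removed in favour of the scripts directory, the server (external) mode they point to reduces to a two-step launch: start a rollout server, then connect the GRPO trainer to it. The sketch below is illustrative; the model, host and port are placeholders taken from the defaults used throughout these docs.

```bash
# Step 1: start the rollout (vLLM) server on dedicated GPUs; the model is a placeholder.
CUDA_VISIBLE_DEVICES=7 \
swift rollout \
    --model Qwen/Qwen2.5-7B-Instruct

# Step 2: point the GRPO trainer at that server from the training GPUs.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \
NPROC_PER_NODE=7 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-7B-Instruct \
    --reward_funcs accuracy format \
    --use_vllm true \
    --vllm_mode server \
    --vllm_server_host 127.0.0.1 \
    --vllm_server_port 8000 \
    --train_type lora \
    --dataset 'AI-MO/NuminaMath-TIR#5000' \
    --per_device_train_batch_size 4 \
    --num_generations 14 \
    --deepspeed zero2 \
    --log_completions true
```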
@@ -412,7 +257,6 @@ swift rlhf \ --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \ --reward_model_plugin genrm my_rmplugin \ --reward_weights 0.1 1 1 \ - --num_infer_workers 8 \ --vllm_gpu_memory_utilization 0.5 \ --sleep_level 1 \ --offload_model true \ @@ -441,6 +285,7 @@ Among these, Token level Loss is implemented by default and does not require add | Parameter | Type | Value | |----------------------|-----------|-------------| +|`--loss_type` | `str` | `bnpo` | | `--epsilon_high` | `float` | `0.28` | | `--dynamic_sample` | `bool` | `true` | | `--overlong_filter` | `bool` | `true` | @@ -466,7 +311,6 @@ swift rlhf \ --max_resample_times 3 \ --use_vllm true \ --vllm_gpu_memory_utilization 0.6 \ - --num_infer_workers 8 \ --train_type full \ --torch_dtype bfloat16 \ --dataset AI-MO/NuminaMath-TIR#5000 \ diff --git a/examples/train/grpo/external/grpo.sh b/examples/train/grpo/external/grpo.sh index c5bc954036..64294ec79b 100644 --- a/examples/train/grpo/external/grpo.sh +++ b/examples/train/grpo/external/grpo.sh @@ -5,7 +5,7 @@ swift rlhf \ --model Qwen/Qwen2.5-32B-Instruct \ --reward_funcs accuracy \ --use_vllm true \ - --vllm_server_host xxx \ + --vllm_server_host 127.0.0.1 \ --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ @@ -28,6 +28,5 @@ swift rlhf \ --deepspeed zero3 \ --log_completions true \ --num_iterations 1 \ - --num_infer_workers 1 \ --report_to tensorboard wandb \ --beta 0.0 diff --git a/examples/train/grpo/internal/README.md b/examples/train/grpo/internal/README.md deleted file mode 100644 index ba25514d08..0000000000 --- a/examples/train/grpo/internal/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# README: GRPO Internal Mode Execution Scripts - ---- - -## **Introduction** - -The GRPO (Group Relative Policy Optimization) training framework supports integrating high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows the inference service to be directly launched within the Trainer, reducing external dependencies and simplifying deployment. - -This folder contains scripts and instructions for running GRPO in **Internal Mode**, where the model training and inference are tightly integrated with flexible resource allocation strategies. - - -## **Resource Allocation Strategies** - -GRPO provides two resource allocation strategies under the Internal mode: - -### 1. **Colocate Mode** - -- **Description**: Training and inference share GPU resources. -- **Recommended Setting**: - - Set `sleep_level=1` to release vLLM memory during training steps. -- **Resource Allocation Rules**: - ```plaintext - NPROC_PER_NODE = Total number of GPUs - num_infer_workers = Total number of GPUs - ``` - -### 2. **Async Mode** - -- **Description**: Training and inference use independent GPU resources. -- **Recommended Setting**: - - Set `sleep_level=1` to release vLLM memory during training steps. 
-- **Resource Allocation Rules**: - ```plaintext - NPROC_PER_NODE = Number of training GPUs - num_infer_workers = Number of inference GPUs - Must satisfy: Number of training GPUs + Number of inference GPUs = Total GPU count - ``` diff --git a/examples/train/grpo/internal/multi_gpu_mp_colocate.sh b/examples/train/grpo/internal/multi_gpu_mp_colocate.sh index 5f17a47ec4..dd29bc183f 100644 --- a/examples/train/grpo/internal/multi_gpu_mp_colocate.sh +++ b/examples/train/grpo/internal/multi_gpu_mp_colocate.sh @@ -24,8 +24,6 @@ swift rlhf \ --use_vllm true \ --vllm_gpu_memory_utilization 0.5 \ --sleep_level 1 \ - --deepspeed zero3 \ - --num_infer_workers 8 \ --tensor_parallel_size 4 \ --temperature 1.0 \ --top_p 0.85 diff --git a/examples/train/grpo/internal/train_72b_4gpu.sh b/examples/train/grpo/internal/train_72b_4gpu.sh index 3461db00e1..a050db3bdf 100644 --- a/examples/train/grpo/internal/train_72b_4gpu.sh +++ b/examples/train/grpo/internal/train_72b_4gpu.sh @@ -36,7 +36,6 @@ swift rlhf \ --top_p 1.0 \ --top_k 80 \ --log_completions true \ - --num_infer_workers 4 \ --tensor_parallel_size 4 \ --async_generate false \ --move_model_batches 16 \ diff --git a/examples/train/grpo/internal/train_multi_round.sh b/examples/train/grpo/internal/train_multi_round.sh index e9d5042f03..6beb6eef12 100644 --- a/examples/train/grpo/internal/train_multi_round.sh +++ b/examples/train/grpo/internal/train_multi_round.sh @@ -34,7 +34,6 @@ swift rlhf \ --top_p 1.0 \ --top_k 80 \ --log_completions true \ - --num_infer_workers 8 \ --tensor_parallel_size 4 \ --async_generate false \ --offload_optimizer true \ diff --git a/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh b/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh index 7ba4efa3d8..244967c95d 100644 --- a/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh +++ b/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh @@ -43,6 +43,5 @@ swift rlhf \ --deepspeed zero3 \ --log_completions true \ --num_iterations 1 \ - --num_infer_workers 1 \ --report_to tensorboard wandb \ --beta 0.0 diff --git a/examples/train/grpo/multi_node/multi_node1.sh b/examples/train/grpo/multi_node/multi_node1.sh index 8e895a7f67..00758b266f 100755 --- a/examples/train/grpo/multi_node/multi_node1.sh +++ b/examples/train/grpo/multi_node/multi_node1.sh @@ -8,7 +8,7 @@ export NNODES=2 export NODE_RANK=0 export MASTER_ADDR=127.0.0.1 export MASTER_PORT=29500 -export NPROC_PER_NODE=3 +export NPROC_PER_NODE=4 swift rlhf \ --rlhf_type grpo \ @@ -37,7 +37,7 @@ swift rlhf \ --warmup_ratio 0.05 \ --dataloader_num_workers 4 \ --dataset_num_proc 4 \ - --num_generations 7 \ + --num_generations 8 \ --temperature 0.9 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero2 \ diff --git a/examples/train/grpo/multi_node/multi_node2.sh b/examples/train/grpo/multi_node/multi_node2.sh index c3786b7ca3..50766dd624 100755 --- a/examples/train/grpo/multi_node/multi_node2.sh +++ b/examples/train/grpo/multi_node/multi_node2.sh @@ -3,17 +3,15 @@ export NNODES=2 export NODE_RANK=1 export MASTER_ADDR=xxx.xxx.xxx.xxx export MASTER_PORT=29500 -export NPROC_PER_NODE=3 +export NPROC_PER_NODE=4 swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-Math-7B \ --reward_funcs accuracy format \ --use_vllm true \ - --vllm_device auto \ --vllm_gpu_memory_utilization 0.5 \ --vllm_max_model_len 4096 \ - --num_infer_workers 1 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'AI-MO/NuminaMath-TIR#5000' \ @@ -32,7 +30,7 @@ swift rlhf \ --warmup_ratio 0.05 \ --dataloader_num_workers 4 \ --dataset_num_proc 4 \ - 
--num_generations 7 \ + --num_generations 8 \ --temperature 0.9 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero2 \ diff --git a/examples/train/grpo/multi_node/train_dlc.sh b/examples/train/grpo/multi_node/train_dlc.sh index a020dd24ff..07233f3d8d 100644 --- a/examples/train/grpo/multi_node/train_dlc.sh +++ b/examples/train/grpo/multi_node/train_dlc.sh @@ -33,7 +33,6 @@ torchrun \ --vllm_gpu_memory_utilization 0.3 \ --sleep_level 1 \ --deepspeed zero3_offload \ - --num_infer_workers 8 \ --tensor_parallel_size 4 \ --temperature 1.0 \ --top_p 0.85 diff --git a/examples/train/grpo/plugin/run_external_rm.sh b/examples/train/grpo/plugin/run_external_reward_func.sh similarity index 100% rename from examples/train/grpo/plugin/run_external_rm.sh rename to examples/train/grpo/plugin/run_external_reward_func.sh diff --git a/examples/train/grpo/plugin/run_external_reward_model.sh b/examples/train/grpo/plugin/run_external_reward_model.sh new file mode 100644 index 0000000000..45b0435459 --- /dev/null +++ b/examples/train/grpo/plugin/run_external_reward_model.sh @@ -0,0 +1,19 @@ +# see rm_plugin example in swift/plugin/rm_plugin.py +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +NPROC_PER_NODE=8 \ +swift rlhf \ + --rlhf_type grpo \ + --model Qwen/Qwen2.5-7B \ + --dataset AI-MO/NuminaMath-TIR#5000 \ + --external_plugins examples/train/grpo/plugin/plugin.py \ + --reward_funcs format \ + --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \ + --reward_model_plugin genrm my_rmplugin \ + --reward_weights 0.1 1 1 \ + --vllm_gpu_memory_utilization 0.5 \ + --sleep_level 1 \ + --offload_model true \ + --offload_optimizer true \ + --gc_collect_after_offload true \ + --log_completions true \ + --deepspeed zero2 \ No newline at end of file diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index e0a6d01505..dfedd3e397 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -245,12 +245,6 @@ def _external_vllm_warning(self): logger.warning("Configuration conflict: External vLLM engine detected, but 'vllm_device' is set to '%s'. ", self.vllm_device) - if self.num_infer_workers != 1: - logger.warning( - "Auto-adjustment: Changing 'num_infer_workers' from %s to 1 because external vLLM engine is detected", - self.num_infer_workers) - self.num_infer_workers = 1 - if self.vllm_max_model_len is not None: logger.warning( "Configuration conflict: 'vllm_max_model_len=%s' is ignored for external vLLM. 
" diff --git a/swift/llm/infer/infer_engine/utils.py b/swift/llm/infer/infer_engine/utils.py index ff94de9399..338e642962 100644 --- a/swift/llm/infer/infer_engine/utils.py +++ b/swift/llm/infer/infer_engine/utils.py @@ -358,102 +358,6 @@ def _create_model_instance(self, device_id): TurboMindInstance._create_model_instance = _create_model_instance -def patch_vllm(world_size=1): - - @contextmanager - def _get_context(): - from vllm.distributed.parallel_state import GroupCoordinator - from unittest.mock import patch - try: - from vllm.worker.worker import Worker - getattr(Worker, '_assert_memory_footprint_increased_during_profiling') - profiling_patch = patch( - 'vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling', return_value=None) - except (ImportError, AttributeError): - profiling_patch = nullcontext() - - __origin_init__ = GroupCoordinator.__init__ - - def get_world_size(group=None) -> int: - if not group: - # Given size - return world_size - else: - return torch.distributed.get_world_size_origin(group) - - def __init__(self, group_ranks, local_rank, *args, **kwargs): - node_rank, nnodes = get_node_setting() - device_count = get_device_count() - num_infer_workers = world_size // nnodes - - def map_rank_to_real_device(obj): - # Use the last devices - # world_size=4 gpus=8 [0,1,2,3] will map to [4,5,6,7] - diff = device_count - num_infer_workers - if diff < 0: - diff = 0 - if isinstance(obj, list): - return [map_rank_to_real_device(o) for o in obj] - elif isinstance(obj, int): - return obj + diff - else: - raise ValueError(f'Unsupported type: {obj}') - - if kwargs.get('group_name') == 'world': - local_rank = local_rank + node_rank * num_infer_workers - else: - local_rank = map_rank_to_real_device(local_rank - node_rank * num_infer_workers) - rank = dist.get_rank() - if world_size == 1 and [rank] not in group_ranks: - # for ddp inference - group_ranks = [[rank]] - if nnodes > 1 and num_infer_workers < device_count: - """ - Map group_ranks to global ranks - - Example: - - Number of nodes (nnodes): 2 - - Devices per node (device_count): 4 - - Inference workers per node (num_infer_workers): 1 - - Initial group_ranks: - [[0, 1]] - - After mapping to global ranks: - [[0, 3]] # Global ranks corresponding to the local ranks - """ - train_device_count = device_count - num_infer_workers - # vllm.worker.init_distributed_environment - if len(group_ranks) == 1: - group_ranks = group_ranks[0] - for i in range(nnodes): - group_ranks[i * num_infer_workers:(i + 1) * num_infer_workers] = [ - train_device_count * i + j for j in range(num_infer_workers) - ] - group_ranks = [group_ranks] - # vllm.worker.ensure_model_parallel_initialized - else: - for i in range(nnodes): - for j in range(num_infer_workers): - group_ranks[i * num_infer_workers + j] = [train_device_count * i + j] - - return __origin_init__(self, group_ranks, local_rank, *args, **kwargs) - - GroupCoordinator.__init__ = __init__ - - try: - with profiling_patch, restore_torch_device_after_vllm_init(): - torch.distributed.get_world_size_origin = torch.distributed.get_world_size - torch.distributed.get_world_size = get_world_size - yield - torch.distributed.get_world_size = torch.distributed.get_world_size_origin - del torch.distributed.get_world_size_origin - finally: - GroupCoordinator.__init__ = __origin_init__ - - return _get_context() if dist.is_initialized() else nullcontext() - - def patch_npu_vllm(vllm_device: str): if isinstance(vllm_device, int): vllm_device = get_device(vllm_device) @@ -472,26 +376,6 @@ def 
new_group_context(): return new_group_context() if device_type == 'npu' else nullcontext() -@contextmanager -def restore_torch_device_after_vllm_init(): - """ - A context manager to restore the original CUDA device after potential modifications. - - This is specifically designed to address an issue in Distributed Data Parallel (DDP) - scenarios where the initialization of the vLLM engine may inadvertently modify the - default CUDA device. The context manager saves the current device at the start and - ensures it is restored upon exit, even if the device is modified within the context. - - """ - origin_device = get_current_device() - try: - yield - finally: - current_device = get_current_device() - if origin_device != current_device: - set_device(origin_device) - - def patch_vllm_memory_leak(): import vllm if version.parse(vllm.__version__) != version.parse('0.7.3'): diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py index 639ef6f533..addb4296c9 100644 --- a/swift/llm/infer/infer_engine/vllm_engine.py +++ b/swift/llm/infer/infer_engine/vllm_engine.py @@ -19,7 +19,7 @@ ChatCompletionStreamResponse, ChatMessage, DeltaMessage, RequestConfig, random_uuid) from .infer_engine import InferEngine from .patch import patch_auto_config, patch_auto_tokenizer -from .utils import AdapterRequest, InferStreamer, patch_npu_vllm, patch_vllm +from .utils import AdapterRequest, InferStreamer, patch_npu_vllm try: # After setting the environment variables, import vllm. This way of writing allows lint to pass. @@ -62,7 +62,6 @@ def __init__( max_loras: int = 1, max_lora_rank: int = 16, enable_prefix_caching: bool = False, - num_infer_workers: int = 1, enable_sleep_mode: bool = False, distributed_executor_backend: Optional[str] = None, quantization: Optional[str] = None, From 37c68d2026f4c0bed5bb932d6ffa8ccec72b757f Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Mon, 12 May 2025 23:43:15 +0800 Subject: [PATCH 50/68] lint --- examples/train/grpo/plugin/run_external_reward_model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train/grpo/plugin/run_external_reward_model.sh b/examples/train/grpo/plugin/run_external_reward_model.sh index 45b0435459..174e19ad2a 100644 --- a/examples/train/grpo/plugin/run_external_reward_model.sh +++ b/examples/train/grpo/plugin/run_external_reward_model.sh @@ -16,4 +16,4 @@ swift rlhf \ --offload_optimizer true \ --gc_collect_after_offload true \ --log_completions true \ - --deepspeed zero2 \ No newline at end of file + --deepspeed zero2 From f7700fa6c656660b3f8ea49fec9b6fbdace1111c Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 10:33:18 +0800 Subject: [PATCH 51/68] doc and scipt wip --- ...43\347\240\201\350\256\255\347\273\203.md" | 23 ++++++++----- ...41\346\200\201\350\256\255\347\273\203.md" | 33 ++++++++++-------- ...14\346\225\264\346\265\201\347\250\213.md" | 16 ++++++--- docs/source/Instruction/GRPO.md | 1 - .../BestPractices/GRPO-Code-Training.md | 25 ++++++++------ .../GRPO-Multi-Modal-Training.md | 34 ++++++++++++------- docs/source_en/BestPractices/GRPO.md | 22 ++++++------ .../train/grpo/internal/lora_qwenvl72b.sh | 1 - examples/train/grpo/internal/lora_vllm.sh | 1 - examples/train/grpo/multi_node/multi_node1.sh | 2 -- .../infer/infer_engine/grpo_vllm_engine.py | 1 - 11 files changed, 93 insertions(+), 66 deletions(-) diff --git "a/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" 
"b/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" index 78e892a585..9f963c76c1 100644 --- "a/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" +++ "b/docs/source/BestPractices/GRPO\344\273\243\347\240\201\350\256\255\347\273\203.md" @@ -38,10 +38,17 @@ - 在[e2b](https://e2b.dev/dashboard)注册获取E2B_API_KEY,并设置为环境变量。 - `--reward_funcs`添加`external_code_reward`作为奖励函数。 - `--external_plugins`设置为plugin.py的路径。 +首先拉起 vLLM server +```bash +CUDA_VISIBLE_DEVICES=7 \ +swift rollout \ + --model Qwen/Qwen2.5-7B-Instruct +``` + ```bash E2B_API_KEY=xxx \ WANDB_API_KEY=xxx \ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ @@ -49,10 +56,10 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_code_reward external_code_format \ --reward_weights 1.0 0.1 \ + --vllm_mode server \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type lora \ --torch_dtype bfloat16 \ --dataset 'open-r1/verifiable-coding-problems-python-10k' \ @@ -90,7 +97,7 @@ swift rlhf \ JUDGE0_ENDPOINT=xxx \ JUDGE0_X_AUTH_TOKEN=xxx \ WANDB_API_KEY=xxx \ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ @@ -98,10 +105,10 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_code_reward_by_judge0 external_code_format \ --reward_weights 1.0 0.1 \ + --vllm_mode server \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type lora \ --torch_dtype bfloat16 \ --dataset 'open-r1/verifiable-coding-problems-python-10k' \ diff --git "a/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" "b/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" index 6199f6bd13..3839d0646c 100644 --- "a/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" +++ "b/docs/source/BestPractices/GRPO\345\244\232\346\250\241\346\200\201\350\256\255\347\273\203.md" @@ -102,12 +102,19 @@ orms['external_r1v_acc'] = MultiModalAccuracyORM ### GRPO训练实验记录 #### 训练参数: -我们选取 Qwen2.5-VL-3B-Instruct 作为基础模型进行训练,选取 Instruct 而不是基模的主要原因是可以更快地获取 format reward。我们在八卡 GPU 上进行实验。SWIFT GRPO训练已支持多卡部署模型以加速rollout,因此我们设置num_infer_workers为2,进程数为6,即2卡部署,6卡训练。如果遇到vllm部署qwen2.5-vl报错,可以参考[issue](https://github.com/vllm-project/vllm/issues/13285) +我们选取 Qwen2.5-VL-3B-Instruct 作为基础模型进行训练,选取 Instruct 而不是基模的主要原因是可以更快地获取 format reward。我们在八卡 GPU 上进行实验。如果遇到vllm部署qwen2.5-vl报错,可以参考[issue](https://github.com/vllm-project/vllm/issues/13285) 由于任务简单,我们设置max_completion_length为1024,奖励函数选择external_r1v_acc和format,学习率和beta分别设置为1e-6和0.001。其他设置如下所示,batch_size和num_generations的设置原则可以参考[GRPO完整流程](./GRPO完整流程.md)。 +首先拉起 external vLLM server +```bash +CUDA_VISIBLE_DEVICES=6,7 \ +swift rollout \ + --model Qwen/Qwen2.5-VL-3B-Instruct +``` ```shell WANDB_API_KEY=your_wandb_api_key \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \ NPROC_PER_NODE=6 \ swift rlhf \ --rlhf_type grpo \ @@ -115,12 +122,12 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ --use_vllm true \ - --vllm_device auto \ - 
--vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'okwinds/clevr_cogen_a_train' \ - --vllm_max_model_len 8192 \ --max_completion_length 1024 \ --num_train_epochs 1 \ --per_device_train_batch_size 8 \ @@ -145,7 +152,6 @@ swift rlhf \ --num_iterations 1 \ --async_generate false \ --beta 0.001 \ - ``` #### 实验现象 ![image.png](../../resources/grpo_clevr_count.png) @@ -181,6 +187,7 @@ step 400: ```shell WANDB_API_KEY=your_wandb_api_key \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \ MAX_PIXELS=401408 \ NPROC_PER_NODE=6 \ swift rlhf \ @@ -189,12 +196,12 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'AI-ModelScope/GEOQA_R1V_Train_8K' \ - --vllm_max_model_len 8192 \ --max_completion_length 1024 \ --num_train_epochs 1 \ --per_device_train_batch_size 8 \ @@ -218,7 +225,6 @@ swift rlhf \ --log_completions true \ --report_to wandb \ --num_iterations 2 \ - --num_infer_workers 2 \ --async_generate false \ --beta 0.001 \ --max_grad_norm 0.5 \ @@ -247,6 +253,7 @@ Assistant: 选取的模型和大部分超参数与上一个实验相似,由于训练的时候出现了OOM,我们设置`MAX_PIXELS=262144`以降低显存占用。 ```shell WANDB_API_KEY=your_wandb_api_key \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \ MAX_PIXELS=262144 \ MASTER_PORT=29600 \ NPROC_PER_NODE=6 \ @@ -256,12 +263,12 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'lmms-lab/multimodal-open-r1-8k-verified' \ - --vllm_max_model_len 8192 --max_completion_length 1024 \ --num_train_epochs 1 \ --per_device_train_batch_size 8 \ @@ -285,11 +292,9 @@ swift rlhf \ --log_completions true \ --report_to wandb \ --num_iterations 2 \ - --num_infer_workers 2 \ --async_generate false \ --beta 0.001 \ --max_grad_norm 0.5 \ - ``` #### 实验现象 diff --git "a/docs/source/BestPractices/GRPO\345\256\214\346\225\264\346\265\201\347\250\213.md" "b/docs/source/BestPractices/GRPO\345\256\214\346\225\264\346\265\201\347\250\213.md" index 44f639cb2b..97d4cbfac8 100644 --- "a/docs/source/BestPractices/GRPO\345\256\214\346\225\264\346\265\201\347\250\213.md" +++ "b/docs/source/BestPractices/GRPO\345\256\214\346\225\264\346\265\201\347\250\213.md" @@ -96,7 +96,7 @@ $$ 由于任务较为简单,我们设置 max_completion_length 和 vllm_max_model_len 为1024,如果有更复杂的任务,可以适当加大模型输出长度,但请注意,**这两个参数越大,模型训练需要的显存越多,训练速度越慢,单个step的训练时间与max_completion_length呈现线性关系**。 -在我们的实验中,总batch_size为 $num\_processes \times per\_device\_train\_batch\_size \times gradient\_accumulation\_steps = 2 \times 8 \times 8 = 128$ 而参数设置有一个限制,即:$num\_processes \times per\_device\_train\_batch\_size$ 必须整除 $num\_generations$,其中,$num\_generations$就是GRPO公式中的 $G$,故我们设置为8。 注意,这里单卡batch_size设置也与显存息息相关,请根据显存上限设置一个合适的值。 同时,还有一个公式,即总的steps数量 :$num\_steps = epochs \times len(datasets) \times num\_generations \div batch\_size $,需要根据这个来合理规划训练的学习率和warmup设置。 +在我们的实验中,总batch_size为 $num\_processes \times per\_device\_train\_batch\_size \times gradient\_accumulation\_steps = 2 \times 8 \times 8 = 128$。 注意,这里单卡batch_size设置也与显存息息相关,请根据显存上限设置一个合适的值。 同时,还有一个公式,即总的steps数量 
:$num\_steps = epochs \times len(datasets) \times num\_generations \div batch\_size $,需要根据这个来合理规划训练的学习率和warmup设置。 最后比较重要的设置是学习率和 beta,学习率比较好理解,而beta则是是以上公式的 $\beta$,即KL散度的梯度的权重。这两个参数设置的越大,模型收敛原则上更快,但训练往往会不稳定。经过实验,我们分别设置为 `5e-7` 和 `0.001`。在实际训练中,请根据是否出现不稳定的震荡情况适当调整这两个参数。 @@ -104,7 +104,13 @@ $$ 其他参数的设置,没有做太多探讨,所以这里不进行详细说明。 ```bash -CUDA_VISIBLE_DEVICES=0,1,2 \ +CUDA_VISIBLE_DEVICES=2 \ +swift rollout \ + --model Qwen/Qwen2.5-3B-Instruct +``` + +```bash +CUDA_VISIBLE_DEVICES=0,1 \ WANDB_API_KEY=your_wandb_key \ NPROC_PER_NODE=2 \ swift rlhf \ @@ -113,8 +119,9 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_countdown format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'zouxuhong/Countdown-Tasks-3to4#50000' \ @@ -137,7 +144,6 @@ swift rlhf \ --system 'You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.' \ --deepspeed zero3 \ --log_completions true \ - --vllm_max_model_len 1024 \ --report_to wandb \ --beta 0.001 \ --num_iterations 1 diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index 837cfd1061..c987ee86f2 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -190,7 +190,6 @@ A conversation between User and Assistant. The user asks a question, and the Ass - log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb` 使用。默认为False - 提示:若没有设置`--report_to wandb`,则会在checkpoint中创建`completions.jsonl`来存储生成内容 - use_vllm: 是否使用vLLM作为采样的生成后端,默认为False,建议使用加快训练速度 -- vllm_device: 设置vLLM部署的设备,默认为`auto`, 即未被使用的第一张显卡,使用`cuda:x`来设置特定的卡。 - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9 - vllm_max_model_len: vllm透传参数,默认为None - vllm_max_num_seqs: vllm透传参数,默认为256 diff --git a/docs/source_en/BestPractices/GRPO-Code-Training.md b/docs/source_en/BestPractices/GRPO-Code-Training.md index eadab66d83..d4af4ba30f 100644 --- a/docs/source_en/BestPractices/GRPO-Code-Training.md +++ b/docs/source_en/BestPractices/GRPO-Code-Training.md @@ -42,10 +42,17 @@ Note: Currently, executing code through E2B only supports the Python language. I - Add `external_code_reward` as a reward function with `--reward_funcs`. - Set `--external_plugins` to the path of plugin.py. 
+launch external vLLM server using following script +```bash +CUDA_VISIBLE_DEVICES=7 \ +swift rollout \ + --model Qwen/Qwen2.5-7B-Instruct +``` + ```bash E2B_API_KEY=xxx \ WANDB_API_KEY=xxx \ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ @@ -53,10 +60,10 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_code_reward external_code_format \ --reward_weights 1.0 0.1 \ + --vllm_mode server \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type lora \ --torch_dtype bfloat16 \ --dataset 'open-r1/verifiable-coding-problems-python-10k' \ @@ -77,7 +84,6 @@ swift rlhf \ --dataset_num_proc 4 \ --num_generations 14 \ --temperature 0.9 \ - --num_infer_workers 1 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero2 \ --log_completions true \ @@ -94,7 +100,7 @@ swift rlhf \ JUDGE0_ENDPOINT=xxx \ JUDGE0_X_AUTH_TOKEN=xxx \ WANDB_API_KEY=xxx \ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ @@ -102,10 +108,10 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_code_reward_by_judge0 external_code_format \ --reward_weights 1.0 0.1 \ + --vllm_mode server \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --vllm_max_model_len 8192 \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type lora \ --torch_dtype bfloat16 \ --dataset 'open-r1/verifiable-coding-problems-python-10k' \ @@ -126,7 +132,6 @@ swift rlhf \ --dataset_num_proc 4 \ --num_generations 14 \ --temperature 0.9 \ - --num_infer_workers 1 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero2 \ --log_completions true \ diff --git a/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md b/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md index 7a584e56f6..43ad528a54 100644 --- a/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md +++ b/docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md @@ -114,23 +114,31 @@ orms['external_r1v_acc'] = MultiModalAccuracyORM We selected `Qwen2.5-VL-3B-Instruct` as the base model for training. The main reason for choosing the `Instruct` model over the base model is to rapidly achieve format rewards. Experiments were conducted on 8 GPUs. SWIFT GRPO training supports multi-GPU deployment to accelerate rollouts. If you encounter deployment errors for `qwen2.5-vl` on `vllm`, refer to [this issue](https://github.com/vllm-project/vllm/issues/13285). -Since the task is simple, we set `max_completion_length` to 1024 and selected `external_r1v_acc` and `format` as reward functions. The learning rate and beta are set to `1e-6` and `0.001`, respectively. Other configurations are as follows. The settings for `batch_size` and `num_generations` can be referenced from [GRPO Full Workflow](./GRPO完整流程.md). +Since the task is simple, we set `max_completion_length` to 1024 and selected `external_r1v_acc` and `format` as reward functions. The learning rate and beta are set to `1e-6` and `0.001`, respectively. Other configurations are as follows. The settings for `batch_size` and `num_generations` can be referenced from [GRPO Full Workflow](./GRPO.md). 
+ +launch external vLLM server using following script +```bash +CUDA_VISIBLE_DEVICES=6,7 \ +swift rollout \ + --model Qwen/Qwen2.5-VL-3B-Instruct +``` ```shell WANDB_API_KEY=your_wandb_api_key \ -NPROC_PER_NODE=8 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \ +NPROC_PER_NODE=6 \ swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-VL-3B-Instruct \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'okwinds/clevr_cogen_a_train' \ - --vllm_max_model_len 8192 \ --max_completion_length 1024 \ --num_train_epochs 1 \ --per_device_train_batch_size 8 \ @@ -146,7 +154,7 @@ swift rlhf \ --output_dir output/GRPO_CLEVR_COUNTDOWN \ --warmup_ratio 0.01 \ --dataloader_num_workers 4 \ - --num_generations 32 \ + --num_generations 24 \ --temperature 1.0 \ --system 'examples/train/grpo/prompt.txt' \ --deepspeed zero3 \ @@ -194,6 +202,7 @@ The selected model and most hyperparameters are similar to the previous experime ```shell WANDB_API_KEY=your_wandb_api_key \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \ MAX_PIXELS=401408 \ NPROC_PER_NODE=6 \ swift rlhf \ @@ -202,12 +211,12 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'AI-ModelScope/GEOQA_R1V_Train_8K' \ - --vllm_max_model_len 8192 \ --max_completion_length 1024 \ --num_train_epochs 1 \ --per_device_train_batch_size 8 \ @@ -281,6 +290,7 @@ The selected model and most hyperparameters are similar to the previous experime ```shell WANDB_API_KEY=your_wandb_api_key \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \ MAX_PIXELS=262144 \ MASTER_PORT=29600 \ NPROC_PER_NODE=6 \ @@ -290,12 +300,12 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'lmms-lab/multimodal-open-r1-8k-verified' \ - --vllm_max_model_len 8192 \ --max_completion_length 1024 \ --num_train_epochs 1 \ --per_device_train_batch_size 8 \ diff --git a/docs/source_en/BestPractices/GRPO.md b/docs/source_en/BestPractices/GRPO.md index 2910370772..f1cd1f1491 100644 --- a/docs/source_en/BestPractices/GRPO.md +++ b/docs/source_en/BestPractices/GRPO.md @@ -109,13 +109,7 @@ $$ \text{total batch size} = \text{num\_processes} \times \text{per\_device\_train\_batch\_size} \times \text{gradient\_accumulation\_steps} = 2 \times 8 \times 8 = 128 $$ -There is a constraint: - -$$ -\text{num\_processes} \times \text{per\_device\_train\_batch\_size} \text{ must divide evenly into } \text{num\_generations}, -$$ - -where `num_generations` corresponds to $G$ in the GRPO formula. Therefore, we set it to 8. Note that the single-GPU batch size is also closely related to GPU memory capacity, so set an appropriate value based on memory limits. 
Additionally, the total number of steps can be calculated as: +Note that the single-GPU batch size is also closely related to GPU memory capacity, so set an appropriate value based on memory limits. Additionally, the total number of steps can be calculated as: $$ \text{num\_steps} = \text{epochs} \times \text{len(datasets)} \times \text{num\_generations} \div \text{batch\_size} @@ -130,7 +124,13 @@ For KL divergence, the community has extensive discussions, such as [Why GRPO Ad Other parameter settings were not explored in detail and will not be discussed here. ```bash -CUDA_VISIBLE_DEVICES=0,1,2 \ +CUDA_VISIBLE_DEVICES=2 \ +swift rollout \ + --model Qwen/Qwen2.5-3B-Instruct +``` + +```bash +CUDA_VISIBLE_DEVICES=0,1 \ WANDB_API_KEY=your_wandb_key \ NPROC_PER_NODE=2 \ swift rlhf \ @@ -139,8 +139,9 @@ swift rlhf \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_countdown format \ --use_vllm true \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.6 \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ + --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'zouxuhong/Countdown-Tasks-3to4#50000' \ @@ -163,7 +164,6 @@ swift rlhf \ --system 'You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.' \ --deepspeed zero3 \ --log_completions true \ - --vllm_max_model_len 1024 \ --report_to wandb \ --beta 0.001 \ --num_iterations 1 diff --git a/examples/train/grpo/internal/lora_qwenvl72b.sh b/examples/train/grpo/internal/lora_qwenvl72b.sh index d5401132b0..d2c1bfcaae 100755 --- a/examples/train/grpo/internal/lora_qwenvl72b.sh +++ b/examples/train/grpo/internal/lora_qwenvl72b.sh @@ -40,7 +40,6 @@ swift rlhf \ --top_p 1.0 \ --top_k 80 \ --log_completions true \ - --num_infer_workers 8 \ --tensor_parallel_size 4 \ --async_generate false \ --offload_optimizer true \ diff --git a/examples/train/grpo/internal/lora_vllm.sh b/examples/train/grpo/internal/lora_vllm.sh index d9cc6269fb..2283e9dc81 100644 --- a/examples/train/grpo/internal/lora_vllm.sh +++ b/examples/train/grpo/internal/lora_vllm.sh @@ -10,7 +10,6 @@ swift rlhf \ --reward_funcs accuracy format \ --train_type lora \ --use_vllm true \ - --vllm_device auto \ --vllm_gpu_memory_utilization 0.5 \ --vllm_max_model_len 8192 \ --lora_rank 8 \ diff --git a/examples/train/grpo/multi_node/multi_node1.sh b/examples/train/grpo/multi_node/multi_node1.sh index 00758b266f..6fb864536c 100755 --- a/examples/train/grpo/multi_node/multi_node1.sh +++ b/examples/train/grpo/multi_node/multi_node1.sh @@ -15,10 +15,8 @@ swift rlhf \ --model Qwen/Qwen2.5-Math-7B \ --reward_funcs accuracy format \ --use_vllm true \ - --vllm_device auto \ --vllm_gpu_memory_utilization 0.5 \ --vllm_max_model_len 4096 \ - --num_infer_workers 1 \ --train_type full \ --torch_dtype bfloat16 \ --dataset 'AI-MO/NuminaMath-TIR#5000' \ diff --git a/swift/llm/infer/infer_engine/grpo_vllm_engine.py b/swift/llm/infer/infer_engine/grpo_vllm_engine.py index 0e5b4e7363..0e53f14a63 100644 --- a/swift/llm/infer/infer_engine/grpo_vllm_engine.py +++ b/swift/llm/infer/infer_engine/grpo_vllm_engine.py @@ -50,7 +50,6 @@ def __init__( max_loras: int = 1, max_lora_rank: int = 16, enable_prefix_caching: bool = False, - num_infer_workers: int = 1, enable_sleep_mode: bool = False, distributed_executor_backend: Optional[str] = None, engine_kwargs: Optional[Dict[str, Any]] = None, From 6a572fa45272b0250ae98daf12888ba950e1a8c1 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 
13 May 2025 11:01:41 +0800 Subject: [PATCH 52/68] doc update --- docs/source/Instruction/GRPO.md | 22 ++++++----- ...44\350\241\214\345\217\202\346\225\260.md" | 21 +++++----- docs/source_en/BestPractices/GRPO.md | 2 +- .../Instruction/Command-line-parameters.md | 20 +++++----- docs/source_en/Instruction/GRPO.md | 23 +++++------ .../train/grpo/internal/lora_qwenvl72b.sh | 3 -- examples/train/grpo/internal/lora_vllm.sh | 39 ------------------- .../train/grpo/internal/train_72b_4gpu.sh | 3 -- .../train/grpo/internal/train_multi_round.sh | 3 -- swift/llm/infer/infer_engine/utils.py | 2 + swift/trainers/rlhf_trainer/grpo_trainer.py | 6 +-- 11 files changed, 51 insertions(+), 93 deletions(-) delete mode 100644 examples/train/grpo/internal/lora_vllm.sh diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index c987ee86f2..bb1dd81380 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -189,16 +189,18 @@ A conversation between User and Assistant. The user asks a question, and the Ass - loss_type: loss 归一化的类型,可选项为['grpo', 'bnpo', 'dr_grpo'], 默认为'grpo', 具体查看该[pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348) - log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb` 使用。默认为False - 提示:若没有设置`--report_to wandb`,则会在checkpoint中创建`completions.jsonl`来存储生成内容 -- use_vllm: 是否使用vLLM作为采样的生成后端,默认为False,建议使用加快训练速度 -- vllm_gpu_memory_utilization: vllm透传参数,默认为0.9 -- vllm_max_model_len: vllm透传参数,默认为None -- vllm_max_num_seqs: vllm透传参数,默认为256 -- vllm_enforce_eager: vllm透传参数,默认为False -- vllm_limit_mm_per_prompt: vllm透传参数,默认为None -- vllm_enable_prefix_caching: vllm透传参数,默认为True -- vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用 -- vllm_server_port vLLM server 服务端口,默认为8000 -- vllm_server_timeout 连接vLLM server的超时时间,默认为120s +- use_vllm: 是否使用 vLLM 作为 GRPO 生成的 infer_backend,默认为False。 +- vllm_mode: vLLM 集成模式,可选项为 `server` 和 `colocate`。server 模式使用 `swift rollout` 拉起的 vLLM 服务器进行采样,colocate 模式在程序内部署 vLLM。使用server端时, +- vllm_mode server 参数 + - vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用。 + - vllm_server_port vLLM server 服务端口,默认为8000。 + - vllm_server_timeout 连接vLLM server的超时时间,默认为120s。 +- vllm_mode colocate 参数 + - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。 + - vllm_max_model_len: vllm透传参数,默认为None。 + - vllm_enforce_eager: vllm透传参数,默认为False。 + - vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 + - vllm_enable_prefix_caching: vllm透传参数,默认为True。 - num_iterations: 每个批次代更新次数,默认为1. - epsilon: clip 系数,默认为0.2. - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围. 
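The `epsilon` / `epsilon_high` pair described above defines an asymmetric clipping range for the policy ratio. As a rough reference only, and assuming the standard DAPO-style "clip-higher" formulation (i.e. the effective range is $[1-\epsilon,\ 1+\epsilon_{\mathrm{high}}]$ around 1, which is what the parameter description is pointing at), the clipped term of the GRPO objective can be sketched as:

$$
\mathcal{L}_{\text{clip}} = \min\!\Big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\big(r_t(\theta),\ 1-\epsilon,\ 1+\epsilon_{\mathrm{high}}\big)\,\hat{A}_t\Big),
\qquad
r_t(\theta)=\frac{\pi_\theta(o_t\mid q,\,o_{<t})}{\pi_{\theta_{\mathrm{old}}}(o_t\mid q,\,o_{<t})}
$$

When `epsilon_high` is left unset, the range presumably stays symmetric at $[1-\epsilon,\ 1+\epsilon]$, recovering the usual PPO-style clipping.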
diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index fdb953551f..bcb0dff1c5 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -409,15 +409,17 @@ reward模型参数将在PPO、GRPO中使用。 - log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb` 使用。默认为False。 - 提示:若没有设置`--report_to wandb`,则会在checkpoint中创建`completions.jsonl`来存储生成内容。 - use_vllm: 是否使用 vLLM 作为 GRPO 生成的 infer_backend,默认为False。 -- vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。 -- vllm_max_model_len: vllm透传参数,默认为None。 -- vllm_max_num_seqs: vllm透传参数,默认为256。 -- vllm_enforce_eager: vllm透传参数,默认为False。 -- vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 -- vllm_enable_prefix_caching: vllm透传参数,默认为True。 -- vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用。 -- vllm_server_port vLLM server 服务端口,默认为8000。 -- vllm_server_timeout 连接vLLM server的超时时间,默认为120s。 +- vllm_mode: vLLM 集成模式,可选项为 `server` 和 `colocate`。server 模式使用 `swift rollout` 拉起的 vLLM 服务器进行采样,colocate 模式在程序内部署 vLLM。使用server端时, +- vllm_mode server 参数 + - vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用。 + - vllm_server_port vLLM server 服务端口,默认为8000。 + - vllm_server_timeout 连接vLLM server的超时时间,默认为120s。 +- vllm_mode colocate 参数 + - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。 + - vllm_max_model_len: vllm透传参数,默认为None。 + - vllm_enforce_eager: vllm透传参数,默认为False。 + - vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 + - vllm_enable_prefix_caching: vllm透传参数,默认为True。 - top_k: 默认为50。 - top_p: 默认为0.9。 - repetition_penalty: 重复惩罚项。默认为1.。 @@ -429,7 +431,6 @@ reward模型参数将在PPO、GRPO中使用。 - move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个。 - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False。 - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False。 - - 注意:若该参数设置为True,训练时grad_norm一直为0,请安装`vllm==0.7.3`。 - gc_collect_after_offload: 是否在offload结束时进行gc(python gc和GPU gc),默认为False。 - multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现。 - dynamic_sample:筛除group内奖励标准差为0的数据,额外采样新数据,默认为False。 diff --git a/docs/source_en/BestPractices/GRPO.md b/docs/source_en/BestPractices/GRPO.md index f1cd1f1491..271aa8263e 100644 --- a/docs/source_en/BestPractices/GRPO.md +++ b/docs/source_en/BestPractices/GRPO.md @@ -101,7 +101,7 @@ $$ We selected Qwen2.5-3B-Instruct as the base model for training, as using an instruct-tuned model allows for faster acquisition of format rewards. The experiment was conducted on three GPUs, with vLLM inference deployed on the last GPU and two processes set on the remaining GPUs for gradient updates. -Since the task is relatively simple, we set both `max_completion_length` and `vllm_max_model_len` to 1024. For more complex tasks, the model output length can be increased appropriately, but note that **the larger these parameters, the more GPU memory is required, and the slower the training speed**. The training time per step is linearly related to `max_completion_length`. +Since the task is relatively simple, we set both `max_completion_length` to 1024. For more complex tasks, the model output length can be increased appropriately, but note that **the larger these parameters, the more GPU memory is required, and the slower the training speed**. 
The training time per step is linearly related to `max_completion_length`. In our experiment, the total batch size is: diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 6b36ece8a4..c8b4361457 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -421,14 +421,17 @@ The meanings of the following parameters can be referenced [here](https://huggin - log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb`, default is False. - Note: If `--report_to wandb` is not set, a `completions.jsonl` will be created in the checkpoint to store the generated content. - use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False. -- vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. -- vllm_max_model_len: vLLM passthrough parameter, default is None. -- vllm_enforce_eager: vLLM passthrough parameter, default is False. -- vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. -- vllm_enable_prefix_caching: vLLM passthrough parameter, default is True. -- vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server. -- vllm_server_port: The service port of the vLLM server. Default is 8000. -- vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds. +- vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `server` or `colocate` +- vllm_mode server parameter + - vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server. + - vllm_server_port: The service port of the vLLM server. Default is 8000. + - vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds. +- vllm_mode colocate parameter + - vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. + - vllm_max_model_len: vLLM passthrough parameter, the total length limit of model, default is None. + - vllm_enforce_eager: vLLM passthrough parameter, default is False. + - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. + - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. - top_k: Default is 50. - top_p: Default is 0.9. - repetition_penalty: Repetition penalty term. Default is 1. @@ -440,7 +443,6 @@ The meanings of the following parameters can be referenced [here](https://huggin - move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches. - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`. - - Note: If this parameter is set to True and the grad_norm remains zero during training, please install vllm==0.7.3. - gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`. - multi_turn_func: The multi turn GRPO plugin name. 
Add your multi-turn implementation in plugin/multi_turn.py - dynamic_sample: Exclude data within the group where the reward standard deviation is 0, and additionally sample new data. Default is False. diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 3207ce7ff2..7d7e5d54d4 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -196,16 +196,18 @@ Arguments - loss_type: The type of loss normalization. Options are ['grpo', 'bnpo', 'dr_grpo'], default is 'grpo'. For details, see this [pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348) - log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb`, default is False. - Note: If `--report_to wandb` is not set, a `completions.jsonl` will be created in the checkpoint to store the generated content. -- use_vllm: Whether to use vLLM as the back-end for sampling generation; default is False, using pt(pytorch) engine to rollout. -- vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or `"colocate"` -- vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. -- vllm_max_model_len: used in colocate mode, vLLM passthrough parameter, the total length limit of model, default is None. -- vllm_enforce_eager: vLLM passthrough parameter, default is False. -- vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. -- vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server. -- vllm_server_port: The service port of the vLLM server. Default is 8000. -- vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds. -- vllm_tensor_parallel_size: used in colocate mode, the tensor parallel size of vLLM engine, default is 1. +- use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False. +- vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `server` or `colocate` +- vllm_mode server parameter + - vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server. + - vllm_server_port: The service port of the vLLM server. Default is 8000. + - vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds. +- vllm_mode colocate parameter + - vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. + - vllm_max_model_len: vLLM passthrough parameter, the total length limit of model, default is None. + - vllm_enforce_eager: vLLM passthrough parameter, default is False. + - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. + - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. - num_iterations: number of iterations per batch. Default is 1. - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. @@ -214,7 +216,6 @@ Arguments - move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches. 
- offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`. - - Note: If this parameter is set to True and the grad_norm remains zero during training, please install vllm==0.7.3. - gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`. - multi_turn_func: The multi turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py - dynamic_sample: Exclude data within the group where the reward standard deviation is 0, and additionally sample new data. Default is False. diff --git a/examples/train/grpo/internal/lora_qwenvl72b.sh b/examples/train/grpo/internal/lora_qwenvl72b.sh index d2c1bfcaae..d8941be25e 100755 --- a/examples/train/grpo/internal/lora_qwenvl72b.sh +++ b/examples/train/grpo/internal/lora_qwenvl72b.sh @@ -1,9 +1,6 @@ # pip install math_verify # reward function # GPU memory: 8 * 80GiB -# Note: If the grad_norm remains zero during training, -# please remove the `--offload_model true` parameter, or use `vllm==0.7.3`. - MAX_PIXELS=602112 \ WANDB_API_KEY=xxx \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ diff --git a/examples/train/grpo/internal/lora_vllm.sh b/examples/train/grpo/internal/lora_vllm.sh deleted file mode 100644 index 2283e9dc81..0000000000 --- a/examples/train/grpo/internal/lora_vllm.sh +++ /dev/null @@ -1,39 +0,0 @@ -# pip install math_verify # reward function -# pip install -U trl -# GPU memory: 2 * 80GiB - -MASTER_PORT=29501 \ -CUDA_VISIBLE_DEVICES=0,1 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ - --reward_funcs accuracy format \ - --train_type lora \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.5 \ - --vllm_max_model_len 8192 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --target_modules all-linear \ - --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#1000' \ - --max_completion_length 1024 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 16 \ - --learning_rate 1e-5 \ - --gradient_accumulation_steps 1 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 16 \ - --temperature 0.9 \ - --deepspeed zero2 \ - --system 'examples/train/grpo/prompt.txt' \ - --log_completions true diff --git a/examples/train/grpo/internal/train_72b_4gpu.sh b/examples/train/grpo/internal/train_72b_4gpu.sh index a050db3bdf..58dcc788a8 100644 --- a/examples/train/grpo/internal/train_72b_4gpu.sh +++ b/examples/train/grpo/internal/train_72b_4gpu.sh @@ -1,8 +1,5 @@ # 4*80G GPU -# Note: If the grad_norm remains zero during training, -# please remove the `--offload_model true` parameter, or use `vllm==0.7.3`. - CUDA_VISIBLE_DEVICES=0,1,2,3 \ NPROC_PER_NODE=4 \ swift rlhf \ diff --git a/examples/train/grpo/internal/train_multi_round.sh b/examples/train/grpo/internal/train_multi_round.sh index 6beb6eef12..8c467dbc69 100644 --- a/examples/train/grpo/internal/train_multi_round.sh +++ b/examples/train/grpo/internal/train_multi_round.sh @@ -1,6 +1,3 @@ -# Note: If the grad_norm remains zero during training, -# please remove the `--offload_model true` parameter, or use `vllm==0.7.3`. 
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ NPROC_PER_NODE=8 \ swift rlhf \ diff --git a/swift/llm/infer/infer_engine/utils.py b/swift/llm/infer/infer_engine/utils.py index 338e642962..77c849d1c2 100644 --- a/swift/llm/infer/infer_engine/utils.py +++ b/swift/llm/infer/infer_engine/utils.py @@ -377,6 +377,8 @@ def new_group_context(): def patch_vllm_memory_leak(): + # fix vllm 0.7.3 memory leak + # https://github.com/vllm-project/vllm/pull/14326 import vllm if version.parse(vllm.__version__) != version.parse('0.7.3'): return diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 90c211889b..693e1e552f 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -426,14 +426,12 @@ def split_llm(name): def prepare_vllm(self, model): from swift.tuners import Swift - from swift.llm import VllmEngine from swift.llm.infer.infer_engine import GRPOVllmEngine if self.vllm_tensor_parallel_size > 1: vllm_kwargs = {'distributed_executor_backend': 'external_launcher'} else: vllm_kwargs = {} - # Compatibility with TP - cls = GRPOVllmEngine + engine_kwargs = {'seed': self.accelerator.process_index // self.vllm_tensor_parallel_size} max_num_seqs = ( @@ -441,7 +439,7 @@ def prepare_vllm(self, model): * self.args.gradient_accumulation_steps) current_device = get_device() with Swift.grpo_context(model, self.template.processor): - engine = cls( + engine = GRPOVllmEngine( model.model_dir, model.model_info.torch_dtype, model_type=model.model_meta.model_type, From 4afbdc3dda62b29a141f0c326c1ab2c07322d738 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 11:16:23 +0800 Subject: [PATCH 53/68] doc --- docs/source/Instruction/GRPO.md | 11 ++++++----- docs/source_en/Instruction/GRPO.md | 7 ++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index bb1dd81380..f0cb319ca0 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -9,11 +9,12 @@ pip install math_verify==0.5.2 # reward function pip install -U trl ``` +GRPOTrainer在swift3.6.dev进行了代码重构,如果你使用的swift版本<=3.5, 请参考[stable文档](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) **更新日志** -- **2025-05-13** — Internal部分代码重构,支持vLLM>=0.8 +- **2025-05-13** — 为了代码的可读性和维护性, GRPOTrainer代码重构,Internal mode 支持vLLM>=0.8。 - **2025-05-11** — 支持生成式奖励模型,通过 reward_model_plugin 自定义奖励模型逻辑。有关更多详细信息,请参阅[自定义奖励模型](#自定义奖励模型)部分。 -- **2025-04-30** — external vllm server 的启动命令改为 `swift rollout` +- **2025-04-30** — external vllm server 的启动命令改为 `swift rollout`。 **FAQ** 1. 训练过程中 loss 接近0 是正常情况, 参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) @@ -27,7 +28,7 @@ pip install -U trl GRPO 训练框架支持集成高性能推理引擎(如 vLLM)来加速采样过程,提供以下两种部署模式: -### 1. Colocate Mode +### 1. Colocate(Internal) Mode - 训练与推理共享GPU资源,在 Trainer 内部启动推理服务, @@ -65,9 +66,9 @@ GRPO 训练框架支持集成高性能推理引擎(如 vLLM)来加速采样 --move_model_batches [批次数量] ``` -### 2. Async Mode +### 2. Async(External) Mode -- 训练与推理资源分离,在外面启动单独的推理服务器 +- 训练与推理资源分离,启动单独的推理服务器 使用`swift rollout`命令部署vLLM 服务器, 现仅支持vLLM backend ```bash diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 7d7e5d54d4..7c6f335168 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -10,9 +10,10 @@ environments pip install math_verify # reward function pip install -U trl ``` +The GRPOTrainer has been refactored in swift 3.6.dev. 
If you are using a version of Swift ≤ 3.5 , please refer to the[stable doc](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) **Dev Log** - +- **2025-05-13** — The GRPOTrainer code has been refactored to improve code readability and maintainability. Internal mode now supports vLLM ≥ 0.8. - **2025-05-11** — Implemented support for the **Generative Reward Model** and enabled customized reward model processing logic through the reward plugin. For more details, refer to the [Customized Reward Models](#customized-reward-models) section. - **2025-04-30** — The startup command for the external vLLM server has been changed to swift rollout. @@ -29,7 +30,7 @@ pip install -U trl The GRPO training framework supports the integration of high-performance inference engines (such as vLLM) to accelerate the sampling process, offering the following two deployment modes: -### 1. Colocate Mode +### 1. Colocate(Internal) Mode Training and inference share GPU resources; the inference service is started internally within the Trainer. @@ -69,7 +70,7 @@ When running in Colocate Mode , out-of-memory (OOM) errors are common due to sim ``` -### 2. Async Mode +### 2. Async(External) Mode Training and inference use separate resources; a dedicated inference server is launched externally. From df2ce3d952aa612c1cbc75e9048c7dcb91d092e9 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 13:40:08 +0800 Subject: [PATCH 54/68] doc update --- docs/source/Instruction/GRPO.md | 77 +++++++++++++++++++++++------- docs/source_en/Instruction/GRPO.md | 48 ++++++++++++++++++- 2 files changed, 106 insertions(+), 19 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index f0cb319ca0..9a19790dd1 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -9,19 +9,14 @@ pip install math_verify==0.5.2 # reward function pip install -U trl ``` -GRPOTrainer在swift3.6.dev进行了代码重构,如果你使用的swift版本<=3.5, 请参考[stable文档](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) + +GRPOTrainer在swift3.5.dev进行了代码重构,如果你使用的swift版本<3.5, 请参考[stable文档](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) **更新日志** - **2025-05-13** — 为了代码的可读性和维护性, GRPOTrainer代码重构,Internal mode 支持vLLM>=0.8。 - **2025-05-11** — 支持生成式奖励模型,通过 reward_model_plugin 自定义奖励模型逻辑。有关更多详细信息,请参阅[自定义奖励模型](#自定义奖励模型)部分。 - **2025-04-30** — external vllm server 的启动命令改为 `swift rollout`。 -**FAQ** -1. 训练过程中 loss 接近0 是正常情况, 参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) -2. 训练的steps怎么计算? 参考[issue](https://github.com/modelscope/ms-swift/issues/3912) -3. clip_ratio为什么总是1? 参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) - - ## 集群支持 ![](../../resources/grpo.png) @@ -34,6 +29,7 @@ GRPO 训练框架支持集成高性能推理引擎(如 vLLM)来加速采样 启动参数 ```bash +--use_vllm true \ --vllm_mode colocate ``` @@ -80,6 +76,8 @@ swift rollout \ 训练使用以下参数配置外部 vLLM 服务器 ```bash +--use_vllm true \ +--vllm_mode server \ --vllm_server_host <服务器IP> \ --vllm_server_port <服务端口> \ --vllm_server_timeout <超时时间> \ @@ -191,21 +189,21 @@ A conversation between User and Assistant. 
The user asks a question, and the Ass - log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb` 使用。默认为False - 提示:若没有设置`--report_to wandb`,则会在checkpoint中创建`completions.jsonl`来存储生成内容 - use_vllm: 是否使用 vLLM 作为 GRPO 生成的 infer_backend,默认为False。 -- vllm_mode: vLLM 集成模式,可选项为 `server` 和 `colocate`。server 模式使用 `swift rollout` 拉起的 vLLM 服务器进行采样,colocate 模式在程序内部署 vLLM。使用server端时, +- vllm_mode: vLLM 集成模式,可选项为 `server` 和 `colocate`。server 模式使用 `swift rollout` 拉起的 vLLM 服务器进行采样,colocate 模式在程序内部署 vLLM。 - vllm_mode server 参数 - - vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用。 - - vllm_server_port vLLM server 服务端口,默认为8000。 - - vllm_server_timeout 连接vLLM server的超时时间,默认为120s。 + - vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用. + - vllm_server_port vLLM server 服务端口,默认为8000. + - vllm_server_timeout 连接vLLM server的超时时间,默认为120s. + - async_generate: 异步rollout以提高训练速度,默认`false`. - vllm_mode colocate 参数 - - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。 - - vllm_max_model_len: vllm透传参数,默认为None。 - - vllm_enforce_eager: vllm透传参数,默认为False。 - - vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 - - vllm_enable_prefix_caching: vllm透传参数,默认为True。 + - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9. + - vllm_max_model_len: vllm透传参数,默认为None. + - vllm_enforce_eager: vllm透传参数,默认为False. + - vllm_limit_mm_per_prompt: vllm透传参数,默认为None. + - vllm_enable_prefix_caching: vllm透传参数,默认为True. - num_iterations: 每个批次代更新次数,默认为1. - epsilon: clip 系数,默认为0.2. - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围. -- async_generate: 异步rollout以提高训练速度,仅支持async mode,默认`false`. - sleep_level: vllm特有参数,在训练和rollout复用卡的时候,可以选择vllm进行offload. - move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个. - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False @@ -286,3 +284,48 @@ swift rlhf \ | `--overlong_filter` | `bool` | `true` | | `--reward_funcs` | `str` | `soft_overlong`| | `--max_resample_times` | `int` | `3` | + + +## FAQ +**1. 训练过程中 loss 等于0 / 接近0 / 小于0** + +正常情况, 参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) + +**2. num_generations / 批量大小相关** + +在 GRPO 中,batch_size 以 completion(模型生成结果) 为单位。例如,设置 per_device_train_batch_size=8 表示每张 GPU 在训练过程中会同时处理 8 个 completion 的 loss 计算。 + +训练阶段,在一次完整的梯度累计 batch 中,总的 completion 数量等于: + +``` +num_processes * per_device_train_batch_size * gradient_accumulation_steps +``` + +在评估阶段,completion 的数量等于: +``` +num_processes * per_device_eval_batch_size +``` + +参数 `num_generations` 必须能够被以上两个值整除,以保证生成任务可以均匀分配到各个设备上。 + +**示例** + +在 8 卡的环境下,若设置 `num_generations = 16`,则要求: + +- per_device_train_batch_size * gradient_accumulation_steps +- per_device_eval_batch_size +这两个值都应大于或等于 2,以满足整除条件。 + +**3. 为什么 KL 出现了NaN** + +开启 overlong_filter 后,某一卡上的所有 completion 都被截断 + +**4. 训练的steps怎么计算?** + +参考[issue](https://github.com/modelscope/ms-swift/issues/3912) + +**5. clip_ratio为什么总是1?** + +num_iterations = 1,async_generate = False 下为 on-policy RL,old_policy此时等于policy + +参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 7c6f335168..c02561e978 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -10,7 +10,7 @@ environments pip install math_verify # reward function pip install -U trl ``` -The GRPOTrainer has been refactored in swift 3.6.dev. 
If you are using a version of Swift ≤ 3.5 , please refer to the[stable doc](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) +The GRPOTrainer has been refactored in swift 3.5.dev. If you are using a version of Swift < 3.5 , please refer to the[stable doc](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) **Dev Log** - **2025-05-13** — The GRPOTrainer code has been refactored to improve code readability and maintainability. Internal mode now supports vLLM ≥ 0.8. @@ -203,6 +203,7 @@ Arguments - vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server. - vllm_server_port: The service port of the vLLM server. Default is 8000. - vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds. + - async_generate: Use async rollout to improve train speed,default `false`. - vllm_mode colocate parameter - vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9. - vllm_max_model_len: vLLM passthrough parameter, the total length limit of model, default is None. @@ -212,7 +213,6 @@ Arguments - num_iterations: number of iterations per batch. Default is 1. - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. -- async_generate: Use async rollout to improve train speed,default `false`. - sleep_level: vllm specific,when both actor and rollout in the same GPU,you can make vllm sleep when model is training. - move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches. - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. @@ -341,3 +341,47 @@ swift rlhf \ --gc_collect_after_offload true \ --log_completions true ``` + +## FAQ +**1. Loss equals zero / close to zero / negative during training** + +This is normal in certain cases. +See reference: [issue](https://github.com/huggingface/open-r1/issues/239 #issuecomment-2646297851) + +**2. num_generations / Batch size calculation** + +In GRPO, the batch size is defined in terms of completions (i.e., model generation outputs). For example, setting per_device_train_batch_size=8 means that each GPU processes 8 completions for loss computation during training. + +During training, within a single gradient accumulation batch, the total number of completions is given by: + +``` +num_processes * per_device_train_batch_size * gradient_accumulation_steps +``` + +During evaluation, the number of completions is: +``` +num_processes * per_device_eval_batch_size +``` +The parameter num_generations must be divisible by both of these values to ensure even distribution of generation tasks across devices. + +**Example** +In an 8-GPU setup, if you set num_generations = 16, then both: + +- per_device_train_batch_size * gradient_accumulation_steps +- per_device_eval_batch_size + +should be at least 2 to satisfy the divisibility condition. + +**3. Why does KL become NaN?** + +After enabling overlong_filter, all completions on one GPU may have been truncated. + +**4. 
How are the training steps calculated?** + +See reference: [issue](https://github.com/modelscope/ms-swift/issues/3912) + +**5. Why is clip_ratio always 1?** + +When num_iterations = 1 and async_generate = False, it's on-policy RL, and old_policy is equal to policy. + +See reference: [issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) From b101e4b2a9f6471e4c5b9072f849e9d8474b54d4 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 13:56:45 +0800 Subject: [PATCH 55/68] doc update --- examples/train/grpo/external/README.md | 15 +++++++++++---- .../multi_gpu_agent.sh => external/agent.sh} | 11 +++++++++-- examples/train/grpo/external/grpo.sh | 7 ++++++- examples/train/grpo/internal/README.md | 19 +++++++++++++++++++ .../grpo/internal/multi_gpu_mp_colocate.sh | 7 ++++--- 5 files changed, 49 insertions(+), 10 deletions(-) rename examples/train/grpo/{internal/multi_gpu_agent.sh => external/agent.sh} (85%) create mode 100644 examples/train/grpo/internal/README.md diff --git a/examples/train/grpo/external/README.md b/examples/train/grpo/external/README.md index 97bf7f4125..8e4bbaca48 100644 --- a/examples/train/grpo/external/README.md +++ b/examples/train/grpo/external/README.md @@ -1,9 +1,11 @@ -# README: GRPO External Mode Execution Scripts +# README: GRPO External(Async) Mode Execution Scripts --- -> **Note**: External mode requires vLLM version 0.8.3 or higher. +> **Note**: External mode requires +1. vLLM version 0.8.3 or higher. +2. trl version 0.17.0 or higher ## **Introduction** @@ -37,10 +39,15 @@ swift rollout \ ``` ## Training with External vLLM Server +Configuration Parameters + ```bash +--use_vllm true \ +--vllm_mode server \ --vllm_server_host \ --vllm_server_port \ --vllm_server_timeout \ ``` -Configuration Parameters -When using an external vLLM server, configure the following parameters: + +## Multi-Node Training +On each node, execute the original single-node training script, using the environment variables `NNODES` and `NODE_RANK`, and ensure consistent use of configuration parameters across all nodes. 
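As a concrete illustration of the multi-node note above, a two-node server-mode launch might look like the sketch below. Only `NNODES`, `NODE_RANK` and the `--vllm_mode server` flags are what the README actually prescribes; the addresses, model, dataset and batch settings are illustrative placeholders and should be kept identical on every node.

```bash
# Node 0; on node 1 set NODE_RANK=1 and keep every other argument identical.
# Addresses, model and dataset below are illustrative placeholders.
NNODES=2 \
NODE_RANK=0 \
MASTER_ADDR=192.168.0.1 \
MASTER_PORT=29500 \
NPROC_PER_NODE=8 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset 'AI-MO/NuminaMath-TIR#5000' \
    --reward_funcs accuracy format \
    --use_vllm true \
    --vllm_mode server \
    --vllm_server_host 192.168.0.100 \
    --vllm_server_port 8000 \
    --train_type full \
    --torch_dtype bfloat16 \
    --num_generations 16 \
    --max_completion_length 2048 \
    --per_device_train_batch_size 4 \
    --learning_rate 1e-6 \
    --deepspeed zero3 \
    --log_completions true
```

The vLLM server itself is still started separately with `swift rollout`, as shown earlier in this README; the training processes only connect to it via `--vllm_server_host` / `--vllm_server_port`.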
diff --git a/examples/train/grpo/internal/multi_gpu_agent.sh b/examples/train/grpo/external/agent.sh similarity index 85% rename from examples/train/grpo/internal/multi_gpu_agent.sh rename to examples/train/grpo/external/agent.sh index 7210dfe424..b4ca720113 100644 --- a/examples/train/grpo/internal/multi_gpu_agent.sh +++ b/examples/train/grpo/external/agent.sh @@ -1,12 +1,21 @@ # wandb result link: https://wandb.ai/tastelikefeet/tastelikefeet?nw=nwuseryuzezyz # model link: https://www.modelscope.cn/models/swift/Qwen2-7B-Agent-GRPO # WANDB_API_KEY=xxx \ + +# CUDA_VISIBLE_DEVICES=7 \ +# swift rollout \ +# --model Qwen/Qwen2.5-7B + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-7B \ --train_type full \ --dataset LLM-Research/xlam-function-calling-60k \ + --use_vllm true \ + --vllm_mode server \ + --vllm_server_host 127.0.0.1 \ --torch_dtype bfloat16 \ --num_train_epochs 1 \ --max_length 2048 \ @@ -23,8 +32,6 @@ swift rlhf \ --max_completion_length 1024 \ --reward_funcs toolbench react_format \ --num_generations 49 \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.7 \ --deepspeed zero3 \ --temperature 1.0 \ --stop_words Observation: \ diff --git a/examples/train/grpo/external/grpo.sh b/examples/train/grpo/external/grpo.sh index 64294ec79b..84413aa778 100644 --- a/examples/train/grpo/external/grpo.sh +++ b/examples/train/grpo/external/grpo.sh @@ -1,3 +1,8 @@ +# CUDA_VISIBLE_DEVICES=0,1 \ +# swift rollout \ +# --model Qwen/Qwen2.5-32B-Instruct \ +# --tensor_parallel_size 2 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ NPROC_PER_NODE=8 \ swift rlhf \ @@ -5,7 +10,7 @@ swift rlhf \ --model Qwen/Qwen2.5-32B-Instruct \ --reward_funcs accuracy \ --use_vllm true \ - --vllm_server_host 127.0.0.1 \ + --vllm_server_host xxx \ --vllm_server_port 8000 \ --train_type full \ --torch_dtype bfloat16 \ diff --git a/examples/train/grpo/internal/README.md b/examples/train/grpo/internal/README.md new file mode 100644 index 0000000000..1423f91c09 --- /dev/null +++ b/examples/train/grpo/internal/README.md @@ -0,0 +1,19 @@ +# README: GRPO Internal(Colocate) Mode Execution Scripts + +--- + +## **Introduction** + +The GRPO (Group Relative Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows you 使用相同的GPU资源部署vLLM和训练 + +This folder contains scripts and instructions for running GRPO in **Internal Mode** + +## Training with Internal mode +```bash +--use_vllm true \ +--vllm_mode colocate \ +--vllm_gpu_memory_utilization [ut_ratio] \ +``` + +## Multi-Node Training +On each node, execute the original single-node training script, using the environment variables `NNODES` and `NODE_RANK`, and ensure consistent use of configuration parameters across all nodes. 
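Because colocate mode shares the same GPUs between training and rollout, the flag fragment shown in the internal README is usually combined with the memory-relief options documented in the GRPO guide. The sketch below shows one such combination; the specific values (memory fraction, tensor-parallel size, batch count) are illustrative and should be tuned to the hardware.

```bash
# Illustrative colocate-mode settings; tune the values for your GPUs.
--use_vllm true \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.5 \
--vllm_tensor_parallel_size 4 \
--sleep_level 1 \
--offload_model true \
--offload_optimizer true \
--gc_collect_after_offload true \
--move_model_batches 16 \
```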
diff --git a/examples/train/grpo/internal/multi_gpu_mp_colocate.sh b/examples/train/grpo/internal/multi_gpu_mp_colocate.sh index dd29bc183f..60f790954a 100644 --- a/examples/train/grpo/internal/multi_gpu_mp_colocate.sh +++ b/examples/train/grpo/internal/multi_gpu_mp_colocate.sh @@ -5,6 +5,10 @@ swift rlhf \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --train_type lora \ --dataset AI-ModelScope/chartqa_digit_r1v_format \ + --use_vllm true \ + --vllm_mode colocate \ + --vllm_gpu_memory_utilization 0.5 \ + --vllm_tensor_parallel_size 4 \ --torch_dtype bfloat16 \ --system examples/train/grpo/prompt.txt \ --num_train_epochs 1 \ @@ -21,9 +25,6 @@ swift rlhf \ --max_completion_length 1024 \ --reward_funcs accuracy format \ --num_generations 8 \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.5 \ --sleep_level 1 \ - --tensor_parallel_size 4 \ --temperature 1.0 \ --top_p 0.85 From 1939873420d7f956db2a307e9487fc6c496d9643 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 14:05:04 +0800 Subject: [PATCH 56/68] update doc and readme --- examples/train/grpo/external/README.md | 7 +++++++ examples/train/grpo/external/grpo.sh | 1 + examples/train/grpo/internal/README.md | 12 ++++++++++-- .../internal/{grpo.sh => grpo_pt_without_vllm.sh} | 0 examples/train/grpo/internal/lora_qwenvl72b.sh | 9 +++++---- examples/train/grpo/internal/train_72b_4gpu.sh | 9 +++++---- examples/train/grpo/internal/train_multi_round.sh | 9 +++++---- examples/train/grpo/multi_node/Qwen2_5_32B_full.sh | 1 + examples/train/grpo/multi_node/multi_node1.sh | 1 + examples/train/grpo/multi_node/multi_node2.sh | 1 + examples/train/grpo/multi_node/train_dlc.sh | 7 ++++--- 11 files changed, 40 insertions(+), 17 deletions(-) rename examples/train/grpo/internal/{grpo.sh => grpo_pt_without_vllm.sh} (100%) diff --git a/examples/train/grpo/external/README.md b/examples/train/grpo/external/README.md index 8e4bbaca48..f80097961c 100644 --- a/examples/train/grpo/external/README.md +++ b/examples/train/grpo/external/README.md @@ -6,6 +6,13 @@ 1. vLLM version 0.8.3 or higher. 2. trl version 0.17.0 or higher +3. ms-swift source code version + +``` +git clone https://github.com/modelscope/ms-swift.git +cd ms-swift +pip install -e . +``` ## **Introduction** diff --git a/examples/train/grpo/external/grpo.sh b/examples/train/grpo/external/grpo.sh index 84413aa778..c1c9cbceeb 100644 --- a/examples/train/grpo/external/grpo.sh +++ b/examples/train/grpo/external/grpo.sh @@ -10,6 +10,7 @@ swift rlhf \ --model Qwen/Qwen2.5-32B-Instruct \ --reward_funcs accuracy \ --use_vllm true \ + --vllm_mode server \ --vllm_server_host xxx \ --vllm_server_port 8000 \ --train_type full \ diff --git a/examples/train/grpo/internal/README.md b/examples/train/grpo/internal/README.md index 1423f91c09..23da7a0126 100644 --- a/examples/train/grpo/internal/README.md +++ b/examples/train/grpo/internal/README.md @@ -1,14 +1,22 @@ # README: GRPO Internal(Colocate) Mode Execution Scripts --- +**NOTE** +The scripts in this folder require the source code version of ms-swift. + +``` +git clone https://github.com/modelscope/ms-swift.git +cd ms-swift +pip install -e . +``` ## **Introduction** -The GRPO (Group Relative Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. The **Internal Mode** allows you 使用相同的GPU资源部署vLLM和训练 +The GRPO (Group Relative Policy Optimization) training framework supports high-performance inference engines like vLLM to accelerate the sampling process. 
The **Internal Mode** allows you to deploy vLLM and perform training using the same GPU resources. This folder contains scripts and instructions for running GRPO in **Internal Mode** -## Training with Internal mode +## Training with Internal mode ```bash --use_vllm true \ --vllm_mode colocate \ diff --git a/examples/train/grpo/internal/grpo.sh b/examples/train/grpo/internal/grpo_pt_without_vllm.sh similarity index 100% rename from examples/train/grpo/internal/grpo.sh rename to examples/train/grpo/internal/grpo_pt_without_vllm.sh diff --git a/examples/train/grpo/internal/lora_qwenvl72b.sh b/examples/train/grpo/internal/lora_qwenvl72b.sh index d8941be25e..787d386110 100755 --- a/examples/train/grpo/internal/lora_qwenvl72b.sh +++ b/examples/train/grpo/internal/lora_qwenvl72b.sh @@ -9,6 +9,11 @@ swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-VL-72B-Instruct \ --train_type lora \ + --use_vllm true \ + --vllm_mode colocate \ + --vllm_gpu_memory_utilization 0.5 \ + --vllm_max_model_len 8192 \ + --vllm_tensor_parallel_size 4 \ --dataset lmms-lab/multimodal-open-r1-8k-verified#1000 \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs external_r1v_acc format \ @@ -29,15 +34,11 @@ swift rlhf \ --dataloader_num_workers 4 \ --max_completion_length 2048 \ --num_generations 8 \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.5 \ - --vllm_max_model_len 8192 \ --deepspeed zero3 \ --temperature 1.1 \ --top_p 1.0 \ --top_k 80 \ --log_completions true \ - --tensor_parallel_size 4 \ --async_generate false \ --offload_optimizer true \ --offload_model true \ diff --git a/examples/train/grpo/internal/train_72b_4gpu.sh b/examples/train/grpo/internal/train_72b_4gpu.sh index 58dcc788a8..cdea6d383a 100644 --- a/examples/train/grpo/internal/train_72b_4gpu.sh +++ b/examples/train/grpo/internal/train_72b_4gpu.sh @@ -6,6 +6,11 @@ swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-72B-Instruct \ --train_type lora \ + --use_vllm true \ + --vllm_mode colocate \ + --vllm_gpu_memory_utilization 0.5 \ + --vllm_max_model_len 2048 \ + --vllm_tensor_parallel_size 4 \ --dataset AI-MO/NuminaMath-TIR#10000 \ --torch_dtype bfloat16 \ --num_train_epochs 1 \ @@ -25,15 +30,11 @@ swift rlhf \ --reward_funcs accuracy format \ --num_generations 4 \ --system examples/train/grpo/prompt.txt \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.5 \ - --vllm_max_model_len 2048 \ --deepspeed zero3_offload \ --temperature 1.0 \ --top_p 1.0 \ --top_k 80 \ --log_completions true \ - --tensor_parallel_size 4 \ --async_generate false \ --move_model_batches 16 \ --offload_optimizer true \ diff --git a/examples/train/grpo/internal/train_multi_round.sh b/examples/train/grpo/internal/train_multi_round.sh index 8c467dbc69..afc88b6ca4 100644 --- a/examples/train/grpo/internal/train_multi_round.sh +++ b/examples/train/grpo/internal/train_multi_round.sh @@ -6,6 +6,11 @@ swift rlhf \ --train_type full \ --dataset AI-MO/NuminaMath-TIR#10000 \ --torch_dtype bfloat16 \ + --use_vllm true \ + --vllm_mode colocate \ + --vllm_gpu_memory_utilization 0.5 \ + --vllm_max_model_len 2048 \ + --vllm_tensor_parallel_size 4 \ --num_train_epochs 1 \ --max_length 2048 \ --per_device_train_batch_size 4 \ @@ -23,15 +28,11 @@ swift rlhf \ --reward_funcs accuracy format \ --num_generations 32 \ --system examples/train/grpo/prompt.txt \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.5 \ - --vllm_max_model_len 2048 \ --deepspeed zero3 \ --temperature 1.0 \ --top_p 1.0 \ --top_k 80 \ --log_completions true \ - --tensor_parallel_size 4 \ 
--async_generate false \ --offload_optimizer true \ --offload_model true \ diff --git a/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh b/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh index 244967c95d..8be0012482 100644 --- a/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh +++ b/examples/train/grpo/multi_node/Qwen2_5_32B_full.sh @@ -20,6 +20,7 @@ swift rlhf \ --model Qwen/Qwen2.5-32B-Instruct \ --reward_funcs accuracy \ --use_vllm true \ + --vllm_mode colocate \ --vllm_server_host xxx \ --vllm_server_port 8000 \ --train_type full \ diff --git a/examples/train/grpo/multi_node/multi_node1.sh b/examples/train/grpo/multi_node/multi_node1.sh index 6fb864536c..99fd68d011 100755 --- a/examples/train/grpo/multi_node/multi_node1.sh +++ b/examples/train/grpo/multi_node/multi_node1.sh @@ -15,6 +15,7 @@ swift rlhf \ --model Qwen/Qwen2.5-Math-7B \ --reward_funcs accuracy format \ --use_vllm true \ + --vllm_mode colocate \ --vllm_gpu_memory_utilization 0.5 \ --vllm_max_model_len 4096 \ --train_type full \ diff --git a/examples/train/grpo/multi_node/multi_node2.sh b/examples/train/grpo/multi_node/multi_node2.sh index 50766dd624..4f69543938 100755 --- a/examples/train/grpo/multi_node/multi_node2.sh +++ b/examples/train/grpo/multi_node/multi_node2.sh @@ -10,6 +10,7 @@ swift rlhf \ --model Qwen/Qwen2.5-Math-7B \ --reward_funcs accuracy format \ --use_vllm true \ + --vllm_mode colocate \ --vllm_gpu_memory_utilization 0.5 \ --vllm_max_model_len 4096 \ --train_type full \ diff --git a/examples/train/grpo/multi_node/train_dlc.sh b/examples/train/grpo/multi_node/train_dlc.sh index 07233f3d8d..f3fd512501 100644 --- a/examples/train/grpo/multi_node/train_dlc.sh +++ b/examples/train/grpo/multi_node/train_dlc.sh @@ -16,7 +16,11 @@ torchrun \ --system examples/train/grpo/prompt.txt \ --num_train_epochs 1 \ --max_length 2048 \ + --use_vllm true \ + --vllm_mode colocate \ --vllm_max_model_len 2048 \ + --vllm_gpu_memory_utilization 0.3 \ + --vllm_tensor_parallel_size 4 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ --learning_rate 1e-6 \ @@ -29,10 +33,7 @@ torchrun \ --max_completion_length 2048 \ --reward_funcs accuracy format \ --num_generations 48 \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.3 \ --sleep_level 1 \ --deepspeed zero3_offload \ - --tensor_parallel_size 4 \ --temperature 1.0 \ --top_p 0.85 From dae81c1d63497759a7eeb85b8d5756d285afc682 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 14:16:11 +0800 Subject: [PATCH 57/68] update grpo doc --- docs/source/Instruction/GRPO.md | 4 +- ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- .../Instruction/Command-line-parameters.md | 3 +- docs/source_en/Instruction/GRPO.md | 55 ++----------------- 4 files changed, 12 insertions(+), 52 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index 9a19790dd1..b1897bae63 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -74,6 +74,8 @@ swift rollout \ --tensor_parallel_size 2 \ ``` +对于更多 vLLM 参数,你可以参考[vLLM参数](./命令行参数.md#vllm参数) + 训练使用以下参数配置外部 vLLM 服务器 ```bash --use_vllm true \ @@ -201,10 +203,10 @@ A conversation between User and Assistant. The user asks a question, and the Ass - vllm_enforce_eager: vllm透传参数,默认为False. - vllm_limit_mm_per_prompt: vllm透传参数,默认为None. - vllm_enable_prefix_caching: vllm透传参数,默认为True. + - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 - num_iterations: 每个批次代更新次数,默认为1. - epsilon: clip 系数,默认为0.2. 
- epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围. -- sleep_level: vllm特有参数,在训练和rollout复用卡的时候,可以选择vllm进行offload. - move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个. - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index bcb0dff1c5..4bb10f7c4f 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -420,6 +420,7 @@ reward模型参数将在PPO、GRPO中使用。 - vllm_enforce_eager: vllm透传参数,默认为False。 - vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 - vllm_enable_prefix_caching: vllm透传参数,默认为True。 + - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 - top_k: 默认为50。 - top_p: 默认为0.9。 - repetition_penalty: 重复惩罚项。默认为1.。 @@ -427,7 +428,6 @@ reward模型参数将在PPO、GRPO中使用。 - epsilon: clip 系数,默认为0.2。 - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。 - async_generate: 异步rollout以提高训练速度,默认`false`。 -- sleep_level: vllm特有参数,在训练和rollout复用卡的时候,可以选择vllm进行offload。 - move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个。 - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False。 - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index c8b4361457..e37e336c74 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -432,6 +432,8 @@ The meanings of the following parameters can be referenced [here](https://huggin - vllm_enforce_eager: vLLM passthrough parameter, default is False. - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. + - sleep_level: make vllm sleep when model is training. Options are 0 or 1, default is 0, no sleep + - top_k: Default is 50. - top_p: Default is 0.9. - repetition_penalty: Repetition penalty term. Default is 1. @@ -439,7 +441,6 @@ The meanings of the following parameters can be referenced [here](https://huggin - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. - async_generate: Use async rollout to improve train speed,default `false`. -- sleep_level: vllm specific,when both actor and rollout in the same GPU,you can make vllm sleep when model is training. - move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches. - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. 
The default is `False`. diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index c02561e978..c0ffe0ebaf 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -83,6 +83,8 @@ swift rollout \ --tensor_parallel_size 2 \ ``` +For more vLLM parameters, you can refer to [vLLM arguments](./Command-line-parameters.md#vllm-arguments) + Use the following parameters in training to connect to an external vLLM server: ```bash @@ -210,10 +212,10 @@ Arguments - vllm_enforce_eager: vLLM passthrough parameter, default is False. - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. + - sleep_level: make vllm sleep when model is training. Options are 0 or 1, default is 0, no sleep - num_iterations: number of iterations per batch. Default is 1. - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. -- sleep_level: vllm specific,when both actor and rollout in the same GPU,you can make vllm sleep when model is training. - move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches. - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`. @@ -282,7 +284,9 @@ Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) introduces severa - Token level Loss - Soft Overlong Punishment -Among these, Token level Loss is implemented by default and does not require additional settings. For the other tricks, we can achieve the desired setup based on GRPOTrainer by configuring the following parameters. +For the above tricks, we can achieve the desired setup based on GRPOTrainer by configuring the following parameters. + +The token-level loss is implemented by using the loss type bnpo. 
| Parameter | Type | Value | @@ -294,53 +298,6 @@ Among these, Token level Loss is implemented by default and does not require add | `--reward_funcs` | `str` | `soft_overlong`| | `--max_resample_times` | `int` | `3` | -Reference training script (for 8-card colocate mode): - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=8 \ -WANDB_API_KEY=xxx \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-1.5B \ - --reward_funcs accuracy soft_overlong \ - --max_completion_length 4096 \ - --soft_cache_length 819 \ - --epsilon 0.2 \ - --epsilon_high 0.28 \ - --dynamic_sample true \ - --overlong_filter true \ - --max_resample_times 3 \ - --use_vllm true \ - --vllm_gpu_memory_utilization 0.6 \ - --train_type full \ - --torch_dtype bfloat16 \ - --dataset AI-MO/NuminaMath-TIR#5000 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --learning_rate 1e-6 \ - --eval_steps 1000 \ - --save_steps 1000 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 8 \ - --temperature 1.0 \ - --top_p 1.0 \ - --deepspeed zero2 \ - --log_completions true \ - --num_iterations 1 \ - --report_to tensorboard wandb \ - --beta 0.0 \ - --sleep_level 1 \ - --offload_model true \ - --offload_optimizer true \ - --gc_collect_after_offload true \ - --log_completions true -``` ## FAQ **1. Loss equals zero / close to zero / negative during training** From 05054d058c7e6693caf9b084bed89a41a7bcb67d Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 14:51:52 +0800 Subject: [PATCH 58/68] update scripts --- docs/source/Instruction/GRPO.md | 2 + ...44\350\241\214\345\217\202\346\225\260.md" | 2 + .../Instruction/Command-line-parameters.md | 3 +- docs/source_en/Instruction/GRPO.md | 2 + examples/train/grpo/internal/lmdeploy.sh | 40 +++++++++++++++++++ .../{grpo_pt_without_vllm.sh => pt.sh} | 2 + ...lti_gpu_mp_colocate.sh => pt_devicemap.sh} | 0 .../{train_72b_4gpu.sh => vllm_72b_4gpu.sh} | 0 ...ra_qwenvl72b.sh => vllm_lora_qwenvl72b.sh} | 0 ...ain_multi_round.sh => vllm_multi_round.sh} | 0 .../grpo/plugin/run_external_reward_func.sh | 1 + .../grpo/plugin/run_external_reward_model.sh | 6 ++- 12 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 examples/train/grpo/internal/lmdeploy.sh rename examples/train/grpo/internal/{grpo_pt_without_vllm.sh => pt.sh} (96%) rename examples/train/grpo/internal/{multi_gpu_mp_colocate.sh => pt_devicemap.sh} (100%) rename examples/train/grpo/internal/{train_72b_4gpu.sh => vllm_72b_4gpu.sh} (100%) rename examples/train/grpo/internal/{lora_qwenvl72b.sh => vllm_lora_qwenvl72b.sh} (100%) rename examples/train/grpo/internal/{train_multi_round.sh => vllm_multi_round.sh} (100%) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index b1897bae63..4d723c2413 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -204,6 +204,8 @@ A conversation between User and Assistant. The user asks a question, and the Ass - vllm_limit_mm_per_prompt: vllm透传参数,默认为None. - vllm_enable_prefix_caching: vllm透传参数,默认为True. - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 +- use_lmdeploy: 是否使用 LMDeoloy 作为 GRPO 生成的 infer_backend,默认为False。 +- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8 - num_iterations: 每个批次代更新次数,默认为1. - epsilon: clip 系数,默认为0.2. - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围. 
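For the LMDeploy backend introduced by the two parameters above, the switch mirrors the vLLM flags. A minimal sketch follows; the 0.7 cache ratio is an illustrative override of the 0.8 default, and the complete script added in this series is examples/train/grpo/internal/lmdeploy.sh.

```bash
# Use LMDeploy instead of vLLM as the rollout backend
# (internal/colocate-style deployment, matching examples/train/grpo/internal/lmdeploy.sh).
--use_lmdeploy true \
--lmdeploy_cache_max_entry_count 0.7 \
```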
diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 4bb10f7c4f..67939390c5 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -421,6 +421,8 @@ reward模型参数将在PPO、GRPO中使用。 - vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 - vllm_enable_prefix_caching: vllm透传参数,默认为True。 - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 +- use_lmdeploy: 是否使用 LMDeoloy 作为 GRPO 生成的 infer_backend,默认为False。 +- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8 - top_k: 默认为50。 - top_p: 默认为0.9。 - repetition_penalty: 重复惩罚项。默认为1.。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index e37e336c74..7be4e75d13 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -433,7 +433,8 @@ The meanings of the following parameters can be referenced [here](https://huggin - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. - sleep_level: make vllm sleep when model is training. Options are 0 or 1, default is 0, no sleep - +- use_lmdeploy: Whether to use LMDeoloy as the infer_backend for GRPO generation, default is False. +- lmdeploy_cache_max_entry_count: LMDeploy passthrough parameter, default is 0.8 - top_k: Default is 50. - top_p: Default is 0.9. - repetition_penalty: Repetition penalty term. Default is 1. diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index c0ffe0ebaf..6380d60c75 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -213,6 +213,8 @@ Arguments - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. - sleep_level: make vllm sleep when model is training. Options are 0 or 1, default is 0, no sleep +- use_lmdeploy: Whether to use LMDeoloy as the infer_backend for GRPO generation, default is False. +- lmdeploy_cache_max_entry_count: LMDeploy passthrough parameter, default is 0.8 - num_iterations: number of iterations per batch. Default is 1. - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. 
diff --git a/examples/train/grpo/internal/lmdeploy.sh b/examples/train/grpo/internal/lmdeploy.sh new file mode 100644 index 0000000000..76bff5f353 --- /dev/null +++ b/examples/train/grpo/internal/lmdeploy.sh @@ -0,0 +1,40 @@ +# A800 * 8 +CUDA_LAUNCH_BLOCKING=1 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +NPROC_PER_NODE=8 \ +swift rlhf \ + --rlhf_type grpo \ + --model Qwen/Qwen2.5-1.5B \ + --reward_funcs accuracy format \ + --use_lmdeploy true \ + --lmdeploy_cache_max_entry_count 0.7 \ + --train_type lora \ + --torch_dtype bfloat16 \ + --dataset AI-MO/NuminaMath-TIR#5000 \ + --max_completion_length 1536 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 8 \ + --learning_rate 1e-6 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --save_total_limit 2 \ + --logging_steps 5 \ + --max_length 2048 \ + --output_dir output \ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --dataset_num_proc 4 \ + --num_generations 32 \ + --temperature 1.0 \ + --top_p 0.9 \ + --top_k 50 \ + --system 'examples/train/grpo/prompt.txt' \ + --deepspeed zero3 \ + --log_completions true \ + --num_iterations 2 \ + --move_model_batches 16 \ + --offload_optimizer true \ + --offload_model true \ + --gc_collect_after_offload true \ No newline at end of file diff --git a/examples/train/grpo/internal/grpo_pt_without_vllm.sh b/examples/train/grpo/internal/pt.sh similarity index 96% rename from examples/train/grpo/internal/grpo_pt_without_vllm.sh rename to examples/train/grpo/internal/pt.sh index ec464ca0c8..04472c1979 100644 --- a/examples/train/grpo/internal/grpo_pt_without_vllm.sh +++ b/examples/train/grpo/internal/pt.sh @@ -2,6 +2,8 @@ # pip install -U trl # GPU memory: 80GiB # You can set `--reward_model` to use a reward model to provide rewards. 
+# PTEngine(pytorch) to rollout + CUDA_VISIBLE_DEVICES=0 \ swift rlhf \ --rlhf_type grpo \ diff --git a/examples/train/grpo/internal/multi_gpu_mp_colocate.sh b/examples/train/grpo/internal/pt_devicemap.sh similarity index 100% rename from examples/train/grpo/internal/multi_gpu_mp_colocate.sh rename to examples/train/grpo/internal/pt_devicemap.sh diff --git a/examples/train/grpo/internal/train_72b_4gpu.sh b/examples/train/grpo/internal/vllm_72b_4gpu.sh similarity index 100% rename from examples/train/grpo/internal/train_72b_4gpu.sh rename to examples/train/grpo/internal/vllm_72b_4gpu.sh diff --git a/examples/train/grpo/internal/lora_qwenvl72b.sh b/examples/train/grpo/internal/vllm_lora_qwenvl72b.sh similarity index 100% rename from examples/train/grpo/internal/lora_qwenvl72b.sh rename to examples/train/grpo/internal/vllm_lora_qwenvl72b.sh diff --git a/examples/train/grpo/internal/train_multi_round.sh b/examples/train/grpo/internal/vllm_multi_round.sh similarity index 100% rename from examples/train/grpo/internal/train_multi_round.sh rename to examples/train/grpo/internal/vllm_multi_round.sh diff --git a/examples/train/grpo/plugin/run_external_reward_func.sh b/examples/train/grpo/plugin/run_external_reward_func.sh index e2fcac2c42..0ee5b2aa09 100644 --- a/examples/train/grpo/plugin/run_external_reward_func.sh +++ b/examples/train/grpo/plugin/run_external_reward_func.sh @@ -1,6 +1,7 @@ # pip install math_verify # reward function # pip install -U trl # GPU memory: 80GiB +# register customized plugin in external_plugins file CUDA_VISIBLE_DEVICES=0 \ swift rlhf \ diff --git a/examples/train/grpo/plugin/run_external_reward_model.sh b/examples/train/grpo/plugin/run_external_reward_model.sh index 174e19ad2a..5afcbe576f 100644 --- a/examples/train/grpo/plugin/run_external_reward_model.sh +++ b/examples/train/grpo/plugin/run_external_reward_model.sh @@ -1,16 +1,20 @@ # see rm_plugin example in swift/plugin/rm_plugin.py +# register customized plugin in external_plugins file + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ NPROC_PER_NODE=8 \ swift rlhf \ --rlhf_type grpo \ --model Qwen/Qwen2.5-7B \ --dataset AI-MO/NuminaMath-TIR#5000 \ + --use_vllm true \ + --vllm_mode colocate \ + --vllm_gpu_memory_utilization 0.5 \ --external_plugins examples/train/grpo/plugin/plugin.py \ --reward_funcs format \ --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \ --reward_model_plugin genrm my_rmplugin \ --reward_weights 0.1 1 1 \ - --vllm_gpu_memory_utilization 0.5 \ --sleep_level 1 \ --offload_model true \ --offload_optimizer true \ From 11307be29e556bbd73fcc077dca120c52049c183 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 15:05:44 +0800 Subject: [PATCH 59/68] rm script --- examples/train/grpo/internal/lmdeploy.sh | 40 ------------------------ 1 file changed, 40 deletions(-) delete mode 100644 examples/train/grpo/internal/lmdeploy.sh diff --git a/examples/train/grpo/internal/lmdeploy.sh b/examples/train/grpo/internal/lmdeploy.sh deleted file mode 100644 index 76bff5f353..0000000000 --- a/examples/train/grpo/internal/lmdeploy.sh +++ /dev/null @@ -1,40 +0,0 @@ -# A800 * 8 -CUDA_LAUNCH_BLOCKING=1 \ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -NPROC_PER_NODE=8 \ -swift rlhf \ - --rlhf_type grpo \ - --model Qwen/Qwen2.5-1.5B \ - --reward_funcs accuracy format \ - --use_lmdeploy true \ - --lmdeploy_cache_max_entry_count 0.7 \ - --train_type lora \ - --torch_dtype bfloat16 \ - --dataset AI-MO/NuminaMath-TIR#5000 \ - --max_completion_length 1536 \ - --num_train_epochs 1 \ - 
--per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --learning_rate 1e-6 \ - --eval_steps 1000 \ - --save_steps 1000 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --dataset_num_proc 4 \ - --num_generations 32 \ - --temperature 1.0 \ - --top_p 0.9 \ - --top_k 50 \ - --system 'examples/train/grpo/prompt.txt' \ - --deepspeed zero3 \ - --log_completions true \ - --num_iterations 2 \ - --move_model_batches 16 \ - --offload_optimizer true \ - --offload_model true \ - --gc_collect_after_offload true \ No newline at end of file From 7bbed3f2d7a534fedd54e63fd9838db72a8b52c6 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 15:24:05 +0800 Subject: [PATCH 60/68] update completion_length_limit_scope argument --- docs/source/Instruction/GRPO.md | 27 +++++++++---------- ...44\350\241\214\345\217\202\346\225\260.md" | 10 +++---- .../Instruction/Command-line-parameters.md | 6 ++++- docs/source_en/Instruction/GRPO.md | 6 ++++- swift/trainers/arguments.py | 1 + swift/trainers/rlhf_trainer/grpo_trainer.py | 7 ++--- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index 4d723c2413..0e7083a1ce 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -203,24 +203,23 @@ A conversation between User and Assistant. The user asks a question, and the Ass - vllm_enforce_eager: vllm透传参数,默认为False. - vllm_limit_mm_per_prompt: vllm透传参数,默认为None. - vllm_enable_prefix_caching: vllm透传参数,默认为True. - - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 + - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放. - use_lmdeploy: 是否使用 LMDeoloy 作为 GRPO 生成的 infer_backend,默认为False。 -- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8 -- num_iterations: 每个批次代更新次数,默认为1. -- epsilon: clip 系数,默认为0.2. -- epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围. -- move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个. -- offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False -- offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False -- gc_collect_after_offload: 是否在offload结束时进行gc(python gc和GPU gc),默认为False -- multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现 +- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8。 +- num_iterations: 每个批次代更新次数,默认为1。 +- epsilon: clip 系数,默认为0.2。 +- epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。 +- move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 
默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个。 +- offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False。 +- offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False。 +- gc_collect_after_offload: 是否在offload结束时进行gc(python gc和GPU gc),默认为False。 +- multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现。 +- completion_length_limit_scope: 在多轮对话中,`max_completion_length` 的限制范围。 +`total`限制所有对话轮次的总输出长度不超过`max_completion_length`, `per_round`限制每一轮的输出长度。 +默认为`per_round`, 当前仅对 colocate mode 生效。 - dynamic_sample:筛除group内奖励标准差为0的数据,额外采样新数据,默认为False。 - max_resample_times:dynamic_sample设置下限制重采样次数,默认3次。 - overlong_filter:跳过超长截断的样本,不参与loss计算,默认为False。 -- vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用 \ -- vllm_server_port vLLM server 服务端口,默认为8000 \ -- vllm_server_timeout 连接vLLM server的超时时间,默认为120s \ - 奖励函数参数,见[内置奖励函数](#内置奖励函数) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 67939390c5..e7707616a5 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -414,6 +414,7 @@ reward模型参数将在PPO、GRPO中使用。 - vllm_server_host:vLLM server host地址,默认为None,使用外部vLLM server时使用。 - vllm_server_port vLLM server 服务端口,默认为8000。 - vllm_server_timeout 连接vLLM server的超时时间,默认为120s。 + - async_generate: 异步rollout以提高训练速度,默认`false`。 - vllm_mode colocate 参数 - vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。 - vllm_max_model_len: vllm透传参数,默认为None。 @@ -422,19 +423,18 @@ reward模型参数将在PPO、GRPO中使用。 - vllm_enable_prefix_caching: vllm透传参数,默认为True。 - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 - use_lmdeploy: 是否使用 LMDeoloy 作为 GRPO 生成的 infer_backend,默认为False。 -- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8 -- top_k: 默认为50。 -- top_p: 默认为0.9。 -- repetition_penalty: 重复惩罚项。默认为1.。 +- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8。 - num_iterations: 每个批次代更新次数,默认为1。 - epsilon: clip 系数,默认为0.2。 - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。 -- async_generate: 异步rollout以提高训练速度,默认`false`。 - move_model_batches: 在模型向vLLM/LMDeploy等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个。 - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数,默认为False。 - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身,默认为False。 - gc_collect_after_offload: 是否在offload结束时进行gc(python gc和GPU gc),默认为False。 - multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现。 +- completion_length_limit_scope: 在多轮对话中,`max_completion_length` 的限制范围。 +`total`限制所有对话轮次的总输出长度不超过`max_completion_length`, `per_round`限制每一轮的输出长度。 +默认为`per_round`, 当前仅对 colocate mode 生效。 - dynamic_sample:筛除group内奖励标准差为0的数据,额外采样新数据,默认为False。 - max_resample_times:dynamic_sample设置下限制重采样次数,默认3次。 - overlong_filter:跳过超长截断的样本,不参与loss计算,默认为False。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 7be4e75d13..5ce3d823d3 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -446,7 +446,11 @@ The meanings of the following parameters can be referenced [here](https://huggin - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. 
- offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`. - gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`. -- multi_turn_func: The multi turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py +- multi_turn_func: The multi turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py. +- completion_length_limit_scope: Specifies the scope of the `max_completion_length` limit in multi-turn conversations. +When set to `total`, the total output length across all turns must not exceed `max_completion_length`. +When set to `per_round`, each individual turn's output length is limited separately. +Defaults to `per_round`. Currently only takes effect in colocate mode. - dynamic_sample: Exclude data within the group where the reward standard deviation is 0, and additionally sample new data. Default is False. - max_resample_times: Under the dynamic_sample setting, limit the number of resampling attempts to a maximum of 3. Default is 3 times. - overlong_filter: Skip overlong truncated samples, which will not be included in loss calculation. Default is False. diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 6380d60c75..eabe3f0666 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -222,7 +222,11 @@ Arguments - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`. - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`. - gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`. -- multi_turn_func: The multi turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py +- multi_turn_func: The multi turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py. +- completion_length_limit_scope: Specifies the scope of the `max_completion_length` limit in multi-turn conversations. +When set to `total`, the total output length across all turns must not exceed `max_completion_length`. +When set to `per_round`, each individual turn's output length is limited separately. +Defaults to `per_round`. Currently only takes effect in colocate mode. - dynamic_sample: Exclude data within the group where the reward standard deviation is 0, and additionally sample new data. Default is False. - max_resample_times: Under the dynamic_sample setting, limit the number of resampling attempts to a maximum of 3. Default is 3 times. - overlong_filter: Skip overlong truncated samples, which will not be included in loss calculation. Default is False. 
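A small sketch of how the two `completion_length_limit_scope` settings could budget each turn of a multi-turn rollout; the helper and its names are illustrative only and not part of the swift API.

```python
# Rough sketch: `per_round` gives every turn the full budget, while `total` makes
# all turns share one budget. `generated_so_far` is a hypothetical running count.
def next_turn_budget(max_completion_length: int, generated_so_far: int,
                     scope: str = 'per_round') -> int:
    if scope == 'per_round':
        # Each turn is limited independently.
        return max_completion_length
    if scope == 'total':
        # All turns share one budget; stop once it is exhausted.
        return max(max_completion_length - generated_so_far, 0)
    raise ValueError(f'unknown completion_length_limit_scope: {scope}')

print(next_turn_budget(4096, 3000, scope='total'))      # 1096 tokens left for this turn
print(next_turn_budget(4096, 3000, scope='per_round'))  # 4096, full per-turn limit
```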
diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index ccb932a957..2fc4ca6961 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -190,6 +190,7 @@ class GRPOArgumentsMixin: offload_model: bool = False gc_collect_after_offload: bool = False multi_turn_func: Optional[str] = None + completion_length_limit_scope: Literal['total', 'per_round'] = 'per_round' # DAPO, https://arxiv.org/abs/2503.14476 dynamic_sample: bool = False diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 693e1e552f..3cdcbc35d7 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -170,6 +170,7 @@ def __init__(self, self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size # only applies to colocation mode self.loss_type = args.loss_type self.max_completion_length = args.max_completion_length + self.completion_length_limit_scope = args.completion_length_limit_scope model.warnings_issued['estimate_tokens'] = True kwargs['data_collator'] = lambda features: features self.shuffle_dataset = args.dataset_shuffle @@ -532,7 +533,7 @@ def _move_model_to_vllm_lmdeploy(self): with patch_lora_unmerge(unwrapped_model): unwrapped_model.unmerge_adapter() if self.use_vllm and self.vllm_mode == 'colocate': - # since update weights, we should reset the prefix cache + # since vLLM model weights has been updated, we should reset the prefix cache self.engine.engine.reset_prefix_cache() def _wait_queue(self): @@ -551,7 +552,6 @@ def reorder_outputs(outputs, distributed_idx): return [index_to_output[idx] for idx in sorted(index_to_output.keys())] def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_inputs: bool = False) -> OutputsType: - # inputs: local inputs from swift.llm.infer.protocol import ChatCompletionResponse request_config = copy(request_config) # keys from InferRequest @@ -1242,7 +1242,8 @@ def multi_turn_completion_length_context(self): Ensures the total sequence length (prompt + completion) never exceeds: min(original_max_len, prompt_tokens + max_completion_length) """ - if not (self.multi_turn_func and self.use_fast_infer) or self.vllm_mode == 'server': + if not (self.multi_turn_func and + self.use_fast_infer) or self.vllm_mode == 'server' or self.completion_length_limit_scope == 'per_round': yield return From 829a7eae764b320a434f3e633f4f4046b90527fa Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 16:12:10 +0800 Subject: [PATCH 61/68] fix epsilon --- swift/trainers/rlhf_trainer/grpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 061f4240f3..d85af35c2e 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -320,6 +320,8 @@ def __init__(self, # Multi-step self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper + self.epsilon_low = args.epsilon + self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle. 
# noqa self._step = 0 From f2b4aac4b2b35e5510ad9dfdf137fd71ac1022fd Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 16:32:20 +0800 Subject: [PATCH 62/68] update stable doc reference --- docs/source/Instruction/GRPO.md | 2 +- docs/source_en/Instruction/GRPO.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index 0e7083a1ce..a46a30558c 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -10,7 +10,7 @@ pip install math_verify==0.5.2 # reward function pip install -U trl ``` -GRPOTrainer在swift3.5.dev进行了代码重构,如果你使用的swift版本<3.5, 请参考[stable文档](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) +GRPOTrainer在swift3.5.dev进行了代码重构,如果你使用的swift版本<3.5, 请参考[stable文档](https://github.com/modelscope/ms-swift/blob/v3.4.1/docs/source/Instruction/GRPO.md) **更新日志** - **2025-05-13** — 为了代码的可读性和维护性, GRPOTrainer代码重构,Internal mode 支持vLLM>=0.8。 diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index eabe3f0666..2b5ccca667 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -10,7 +10,7 @@ environments pip install math_verify # reward function pip install -U trl ``` -The GRPOTrainer has been refactored in swift 3.5.dev. If you are using a version of Swift < 3.5 , please refer to the[stable doc](https://swift.readthedocs.io/zh-cn/stable/Instruction/GRPO.html) +The GRPOTrainer has been refactored in swift 3.5.dev. If you are using a version of Swift < 3.5 , please refer to the[stable doc](https://github.com/modelscope/ms-swift/blob/v3.4.1/docs/source_en/Instruction/GRPO.md) **Dev Log** - **2025-05-13** — The GRPOTrainer code has been refactored to improve code readability and maintainability. Internal mode now supports vLLM ≥ 0.8. From cb7ff52dbcbd6a4f79fdb23bfaeaa368d535d977 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 16:40:41 +0800 Subject: [PATCH 63/68] remove lmdeploy --- docs/source/Instruction/GRPO.md | 2 -- ...44\350\241\214\345\217\202\346\225\260.md" | 2 -- .../Instruction/Command-line-parameters.md | 2 -- docs/source_en/Instruction/GRPO.md | 2 -- swift/llm/argument/rlhf_args.py | 2 +- swift/trainers/arguments.py | 5 --- swift/trainers/rlhf_trainer/grpo_trainer.py | 35 +++---------------- 7 files changed, 6 insertions(+), 44 deletions(-) diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index a46a30558c..b04dc5891e 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -204,8 +204,6 @@ A conversation between User and Assistant. The user asks a question, and the Ass - vllm_limit_mm_per_prompt: vllm透传参数,默认为None. - vllm_enable_prefix_caching: vllm透传参数,默认为True. - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放. 
-- use_lmdeploy: 是否使用 LMDeoloy 作为 GRPO 生成的 infer_backend,默认为False。 -- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8。 - num_iterations: 每个批次代更新次数,默认为1。 - epsilon: clip 系数,默认为0.2。 - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。 diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index e7707616a5..53cc35cf73 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -422,8 +422,6 @@ reward模型参数将在PPO、GRPO中使用。 - vllm_limit_mm_per_prompt: vllm透传参数,默认为None。 - vllm_enable_prefix_caching: vllm透传参数,默认为True。 - sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1], 默认为0,不释放 -- use_lmdeploy: 是否使用 LMDeoloy 作为 GRPO 生成的 infer_backend,默认为False。 -- lmdeploy_cache_max_entry_count: LMDeploy 透传参数, 默认为0.8。 - num_iterations: 每个批次代更新次数,默认为1。 - epsilon: clip 系数,默认为0.2。 - epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 5ce3d823d3..fce0f89cdf 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -433,8 +433,6 @@ The meanings of the following parameters can be referenced [here](https://huggin - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. - sleep_level: make vllm sleep when model is training. Options are 0 or 1, default is 0, no sleep -- use_lmdeploy: Whether to use LMDeoloy as the infer_backend for GRPO generation, default is False. -- lmdeploy_cache_max_entry_count: LMDeploy passthrough parameter, default is 0.8 - top_k: Default is 50. - top_p: Default is 0.9. - repetition_penalty: Repetition penalty term. Default is 1. diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 2b5ccca667..78824f6380 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -213,8 +213,6 @@ Arguments - vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None. - vllm_tensor_parallel_size: the tensor parallel size of vLLM engine, default is 1. - sleep_level: make vllm sleep when model is training. Options are 0 or 1, default is 0, no sleep -- use_lmdeploy: Whether to use LMDeoloy as the infer_backend for GRPO generation, default is False. -- lmdeploy_cache_max_entry_count: LMDeploy passthrough parameter, default is 0.8 - num_iterations: number of iterations per batch. Default is 1. - epsilon: epsilon value for clipping. Default is 0.2. - epsilon_high: Upper clip coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon. 
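For context on the `sleep_level` option that remains in these lists after the LMDeploy flags are dropped, the following is a hedged sketch of the colocate sleep/wake cycle using vLLM's sleep-mode API; availability, the `enable_sleep_mode` flag, and the exact semantics depend on the installed vLLM version, so this is a sketch rather than swift's implementation.

```python
# Sketch of the sleep_level=1 pattern: release vLLM GPU memory while the policy
# model trains, then wake the engine before the next rollout. Assumes a recent
# vLLM build with sleep mode enabled.
from vllm import LLM, SamplingParams

llm = LLM(model='Qwen/Qwen2.5-1.5B', enable_sleep_mode=True, gpu_memory_utilization=0.6)

for _ in range(3):  # toy loop standing in for the GRPO train step
    outputs = llm.generate(['1 + 1 ='], SamplingParams(max_tokens=16))
    llm.sleep(level=1)   # free vLLM GPU memory during the optimizer step
    # ... run the training / optimizer step on the policy model here ...
    llm.wake_up()        # restore the engine before the next rollout
```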
diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index dfedd3e397..7dba932c53 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -132,7 +132,7 @@ def __post_init__(self): def _init_grpo(self): if self.rlhf_type == 'grpo': - if self.use_vllm or self.use_lmdeploy: + if self.use_vllm: os.environ['USE_FAST_INFERENCE'] = '1' set_default_ddp_config() if self.async_generate or not self.use_vllm: diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index 2fc4ca6961..b5d039dba8 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -175,11 +175,6 @@ class GRPOArgumentsMixin: reward_model: Optional[List[str]] = None reward_model_plugin: Optional[List[str]] = None - # LMDeploy in GRPO - use_lmdeploy: bool = False - lmdeploy_device: Optional[str] = 'auto' - lmdeploy_session_len: Optional[int] = None - lmdeploy_cache_max_entry_count: float = 0.8 async_generate: bool = False tensor_parallel_size: Optional[int] = None # deprecated diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 3cdcbc35d7..6f375816f0 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -33,8 +33,7 @@ from swift.llm import InferRequest, MultiModelKeys, RequestConfig, RowPreprocessor, get_model_arch, to_device from swift.plugin import multi_turns, orms, rm_plugins -from swift.utils import (JsonlWriter, gc_collect, get_device, get_logger, is_lmdeploy_available, is_vllm_available, - is_wandb_available) +from swift.utils import JsonlWriter, gc_collect, get_device, get_logger, is_vllm_available, is_wandb_available from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin from .utils import patch_lora_merge, patch_lora_unmerge, unwrap_model_for_generation @@ -176,7 +175,6 @@ def __init__(self, self.shuffle_dataset = args.dataset_shuffle self.use_vllm = args.use_vllm - self.use_lmdeploy = args.use_lmdeploy self.async_generate = args.async_generate vllm_client = kwargs.pop('vllm_client') # for external vllm @@ -225,7 +223,7 @@ def __init__(self, # it's safer to set it in all cases. set_seed(args.seed, device_specific=True) self.parameter_groups, self.parameter_groups_no_lora = self.split_batches() - self.use_fast_infer = self.use_vllm or self.use_lmdeploy # whether to use the PT backend + self.use_fast_infer = self.use_vllm # whether to use the PT backend if self.use_vllm: if not is_vllm_available(): raise ImportError('vLLM is not available and `use_vllm` is set to True. ' @@ -249,29 +247,6 @@ def __init__(self, self.engine = self.prepare_vllm(model) # Avoid thread-unsafe modifications of the mode. self.engine.default_template = copy(self.template) # Avoid thread-unsafe modifications of the mode. - - elif self.use_lmdeploy: - if not is_lmdeploy_available(): - raise ImportError('LMDeploy is not available and `use_lmdeploy` is set to True.' 
- 'Please install LMDeploy with `pip install lmdeploy -U` to use it.') - from swift.llm import LmdeployEngine - from swift.tuners import Swift - with Swift.grpo_context(model, self.template.processor): - self.engine = LmdeployEngine( - model.model_dir, - model.model_info.torch_dtype, - model_type=model.model_meta.model_type, - session_len=args.lmdeploy_session_len, - cache_max_entry_count=args.lmdeploy_cache_max_entry_count, - reload_weights=True) - from lmdeploy.turbomind.turbomind import TurboMind - lmdeploy_engine = self.engine.engine.engine - assert isinstance( - lmdeploy_engine, - TurboMind), ("Currently only LMDeploy's TurboMind backend is supported. " - 'The current model is incompatible - please use vLLM or PyTorch backend instead.') - # Avoid thread-unsafe modifications of the mode. - self.engine.default_template = copy(self.template) # Avoid thread-unsafe modifications of the mode. else: from swift.llm import PtEngine self.engine = PtEngine.from_model_template(self.model, copy(self.template), max_batch_size=0) # 0: no limit @@ -478,7 +453,7 @@ def _template_context(self, template): template.max_length = max_length @profiling_decorator - def _move_model_to_vllm_lmdeploy(self): + def _move_model_to_vllm(self): if self.vllm_mode == 'server': return super()._move_model_to_vllm() @@ -582,7 +557,7 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in ) results = results[process_slice] else: - # pt / lmdeploy / vllm + # pt / vllm if self.vllm_tensor_parallel_size > 1: # Gather prompts from all ranks in the TP group and flatten. # Each rank starts with its own prompts; after gathering, all ranks see the full group set. @@ -744,7 +719,7 @@ def _fast_infer(self, inputs: InputsType) -> Tuple[InputsType, OutputsType]: self.engine.engine.wake_up() # First, have main process load weights if needed if self.state.global_step != self._last_loaded_step: - self._move_model_to_vllm_lmdeploy() + self._move_model_to_vllm() self._last_loaded_step = self.state.global_step if self.async_generate: From 5e9e3b5699de1bb24979bd07f985aff5ad78fce1 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 19:05:32 +0800 Subject: [PATCH 64/68] set different seed bewteen processes --- swift/trainers/rlhf_trainer/grpo_trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 6f375816f0..71483d1eda 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -575,7 +575,10 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in torch.distributed.all_gather_object(gathered_inputs, inputs, group=self.tp_group) inputs = [p for sublist in gathered_inputs for p in sublist] # confirm that the seed is same in tp group - request_config.seed = self.accelerator.process_index // self.vllm_tensor_parallel_size + mode = 'train' if self.model.training else 'eval' + batch_size = ( + self.args.per_device_train_batch_size if mode == 'train' else self.args.per_device_eval_batch_size) + request_config.seed = batch_size * self.accelerator.process_index // self.vllm_tensor_parallel_size results: List[ChatCompletionResponse] = self._engine_infer( infer_requests=inputs, request_config=request_config) From 25ac346c7469d9fb5ab42b331f988a9f5037e248 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 19:14:29 +0800 Subject: [PATCH 65/68] fix seed --- swift/trainers/rlhf_trainer/grpo_trainer.py | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 71483d1eda..ddcdc9b3fe 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -577,7 +577,8 @@ def _infer(self, inputs: InputsType, request_config: RequestConfig, is_global_in # confirm that the seed is same in tp group mode = 'train' if self.model.training else 'eval' batch_size = ( - self.args.per_device_train_batch_size if mode == 'train' else self.args.per_device_eval_batch_size) + self.args.per_device_train_batch_size + * self.args.gradient_accumulation_steps if mode == 'train' else self.args.per_device_eval_batch_size) request_config.seed = batch_size * self.accelerator.process_index // self.vllm_tensor_parallel_size results: List[ChatCompletionResponse] = self._engine_infer( infer_requests=inputs, request_config=request_config) From 346396f85c3031609c5db69bb4490522eb612601 Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 20:04:06 +0800 Subject: [PATCH 66/68] remove liger check --- swift/trainers/rlhf_trainer/grpo_trainer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 7cffec1932..b808a47ae8 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -33,8 +33,7 @@ from swift.llm import InferRequest, MultiModelKeys, RequestConfig, RowPreprocessor, get_model_arch, to_device from swift.plugin import multi_turns, orms, rm_plugins -from swift.utils import (JsonlWriter, gc_collect, get_device, get_logger, is_liger_available, is_vllm_available, - is_wandb_available) +from swift.utils import JsonlWriter, gc_collect, get_device, get_logger, is_vllm_available, is_wandb_available from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin from .utils import _ForwardRedirection, patch_lora_merge, patch_lora_unmerge, unwrap_model_for_generation @@ -184,9 +183,6 @@ def __init__(self, self.use_liger_loss = self.args.use_liger_loss if self.use_liger_loss: from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss - if not is_liger_available(): - raise ImportError( - 'Liger is required to use `liger_loss` as the GRPO loss. 
Run `pip install liger-kernel`.') self.liger_grpo_loss = LigerFusedLinearGRPOLoss( beta=self.beta, From 3045802a4d20cb2dab6d94b884ad84d47f9ccd0b Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Tue, 13 May 2025 20:31:28 +0800 Subject: [PATCH 67/68] fix epsilon --- swift/trainers/rlhf_trainer/grpo_trainer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index b808a47ae8..7f014eec06 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -179,6 +179,10 @@ def __init__(self, vllm_client = kwargs.pop('vllm_client') # for external vllm super().__init__(model, ref_model, *_args, **kwargs) + # Multi-step + self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper + self.epsilon_low = args.epsilon + self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon self.use_liger_loss = self.args.use_liger_loss if self.use_liger_loss: @@ -290,11 +294,6 @@ def __init__(self, self.reward_funcs[i] = self.accelerator.prepare_model( reward_func, evaluation_mode=True, device_placement=True) - # Multi-step - self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper - self.epsilon_low = args.epsilon - self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon - # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle. # noqa self._step = 0 # Buffer the batch to reuse generated outputs across multiple updates. For more details, see From 4bf7996b4fee3726c8504bfcd70e589c6818e46c Mon Sep 17 00:00:00 2001 From: hjh0119 Date: Wed, 14 May 2025 11:47:40 +0800 Subject: [PATCH 68/68] remvoe unused import --- swift/llm/argument/rlhf_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 3036da26a2..552d49f9c3 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -5,7 +5,7 @@ from swift.llm import MODEL_MAPPING from swift.trainers.arguments import GRPOArgumentsMixin -from swift.utils import get_logger, is_liger_available, is_master, set_default_ddp_config +from swift.utils import get_logger, is_master, set_default_ddp_config from .train_args import TrainArguments logger = get_logger()
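Closing note on the `num_iterations` (𝜇) bookkeeping touched by the last patches: generations are produced once per batch, buffered, and then reused for 𝜇 optimisation steps. The toy sketch below illustrates that reuse pattern; `generate_group`, `grpo_step`, and the buffer layout are stand-ins, not the trainer's actual implementation.

```python
# Toy sketch of reusing one rollout buffer for 𝜇 (num_iterations) updates.
from collections import deque

num_iterations = 2          # 𝜇 in the GRPO paper
buffer = deque()

def generate_group(step):   # stand-in for the (expensive) rollout
    return {'step': step, 'completions': [f'sample-{step}-{i}' for i in range(4)]}

def grpo_step(batch):       # stand-in for one clipped policy update
    print('updating on rollouts from step', batch['step'])

for step in range(4):
    if not buffer:
        batch = generate_group(step)
        # reuse the same rollouts (and their old log-probs) for 𝜇 updates
        buffer.extend([batch] * num_iterations)
    grpo_step(buffer.popleft())
```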