
Commit cd35db0

Merge remote-tracking branch 'origin/vllm_client_custom_url' into vllm_client_custom_url

2 parents: a232a1a + c3408cd

File tree

6 files changed (+70, -20 lines)


docs/source/sft_trainer.md (+2, -2)

@@ -424,9 +424,9 @@ Below are some numbers you can get in terms of speedup and memory efficiency, us
 
 | use_flash_attn_1 | model_name        | max_seq_len | batch_size | time per training step |
 | ---------------- | ----------------- | ----------- | ---------- | ---------------------- |
-| x                | facebook/opt-350m | 2048        | 8          | ~59.1s                 |
+|                  | facebook/opt-350m | 2048        | 8          | ~59.1s                 |
 |                  | facebook/opt-350m | 2048        | 8          | **OOM**                |
-| x                | facebook/opt-350m | 2048        | 4          | ~30.3s                 |
+|                  | facebook/opt-350m | 2048        | 4          | ~30.3s                 |
 |                  | facebook/opt-350m | 2048        | 4          | ~148.9s                |
 
 ### Using Flash Attention-2

docs/source/text_environments.md (+1, -1)

@@ -157,7 +157,7 @@ The `TextHistory` object stores the interactions between the model and the text
 
 ### Attributes
 
-The following table summarises the available attributes of the `TextEnvironment` class:
+The following table summarises the available attributes of the `TextHistory` class:
 
 | Attribute | Description |
 |:-------------------|:----------------|

tests/test_grpo_trainer.py (+15, -8)

@@ -24,7 +24,7 @@
 from transformers.utils import is_peft_available
 
 from trl import GRPOConfig, GRPOTrainer
-from trl.trainer.grpo_trainer import RepeatRandomSampler
+from trl.trainer.grpo_trainer import RepeatSampler
 
 from .testing_utils import require_vllm
 
@@ -36,7 +36,7 @@
 class RepeatRandomSamplerTester(unittest.TestCase):
     def test_sampler(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=2)
+        sampler = RepeatSampler(dataset, mini_repeat_count=2)
         # Should output something like [4, 4, 3, 3, 0, 0, 1, 1, 2, 2, 6, 6, 5, 5]
         sampled = list(sampler)
         # Check that the length is doubled
@@ -46,9 +46,16 @@ def test_sampler(self):
         # Check that each element is repeated twice
         assert all(sampled[i] == sampled[i + 1] for i in range(0, len(sampled), 2))
 
+    def test_sampler_no_shuffle(self):
+        dataset = ["a", "b", "c", "d", "e", "f", "g"]
+        sampler = RepeatSampler(dataset, mini_repeat_count=2, shuffle=False)
+        sampled = list(sampler)
+        expected = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]
+        self.assertEqual(sampled, expected)
+
     def test_sampler_no_repeat(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=1)
+        sampler = RepeatSampler(dataset, mini_repeat_count=1)
         # Should output something like [4, 3, 0, 1, 2, 6, 5]
         sampled = list(sampler)
         # Check that the length is the same
@@ -58,7 +65,7 @@ def test_sampler_no_repeat(self):
 
     def test_sampler_with_batch_size(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g", "h"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=1, batch_size=2, repeat_count=2)
+        sampler = RepeatSampler(dataset, mini_repeat_count=1, batch_size=2, repeat_count=2)
         # Should output something like [4, 3, 4, 3, 0, 1, 0, 1, 2, 6, 2, 6, 5, 7, 5, 7]
         sampled = list(sampler)
         # Check that the length is doubled
@@ -70,7 +77,7 @@ def test_sampler_with_batch_size(self):
 
     def test_sampler_with_batch_size_and_drop(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=1, batch_size=2, repeat_count=2)
+        sampler = RepeatSampler(dataset, mini_repeat_count=1, batch_size=2, repeat_count=2)
         # Should output something like [4, 3, 4, 3, 0, 1, 0, 1, 2, 6, 2, 6]
         sampled = list(sampler)
         # Check that the length is doubled
@@ -84,7 +91,7 @@ def test_sampler_with_batch_size_and_drop(self):
 
     def test_sampler_with_mini_repeat_count_and_batch_size_1(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=2, batch_size=3, repeat_count=2)
+        sampler = RepeatSampler(dataset, mini_repeat_count=2, batch_size=3, repeat_count=2)
         # Should output something like [4, 4, 3, 3, 0, 0, 4, 4, 3, 3, 0, 0,
         #                               1, 1, 2, 2, 6, 6, 1, 1, 2, 2, 6, 6]
         sampled = list(sampler)
@@ -100,7 +107,7 @@ def test_sampler_with_mini_repeat_count_and_batch_size_1(self):
 
     def test_sampler_with_mini_repeat_count_and_batch_size_2(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=3, batch_size=2, repeat_count=2)
+        sampler = RepeatSampler(dataset, mini_repeat_count=3, batch_size=2, repeat_count=2)
         # Should output something like [4, 4, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3,
         #                               0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
         #                               2, 2, 2, 6, 6, 6, 2, 2, 2, 6, 6, 6]
@@ -118,7 +125,7 @@ def test_sampler_with_mini_repeat_count_and_batch_size_2(self):
 
     def test_sampler_with_mini_repeat_count_and_batch_size_3(self):
         dataset = ["a", "b", "c", "d", "e", "f", "g"]
-        sampler = RepeatRandomSampler(dataset, mini_repeat_count=2, batch_size=2, repeat_count=3)
+        sampler = RepeatSampler(dataset, mini_repeat_count=2, batch_size=2, repeat_count=3)
         # Should output something like [4, 4, 3, 3, 4, 4, 3, 3, 4, 4, 3, 3,
         #                               0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
         #                               2, 2, 6, 6, 2, 2, 6, 6, 2, 2, 6, 6]
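Note: a minimal sketch of the behavior the new test covers, assuming this branch of trl is installed so that RepeatSampler is importable from trl.trainer.grpo_trainer:

    # Sketch only: RepeatSampler is the renamed sampler introduced in this commit.
    from trl.trainer.grpo_trainer import RepeatSampler

    dataset = ["a", "b", "c", "d", "e", "f", "g"]

    # shuffle=False yields indices in dataset order, each repeated mini_repeat_count times.
    sampler = RepeatSampler(dataset, mini_repeat_count=2, shuffle=False)
    print(list(sampler))  # [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]

    # shuffle=True (the default) keeps the same repeat structure but randomizes the order;
    # pass seed for reproducibility.
    sampler = RepeatSampler(dataset, mini_repeat_count=2, seed=42)
    print(list(sampler))  # e.g. [4, 4, 3, 3, 0, 0, 1, 1, 2, 2, 6, 6, 5, 5]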

trl/scripts/vllm_serve.py (+12, -0)

@@ -174,6 +174,9 @@ class ScriptArguments:
         enable_prefix_caching (`bool` or `None`, *optional*, defaults to `None`):
             Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support
             this feature.
+        enforce_eager (`bool` or `None`, *optional*, defaults to `None`):
+            Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the
+            model in eager mode. If `False` (default behavior), we will use CUDA graph and eager execution in hybrid.
     """
 
     model: str = field(metadata={"help": "Model name or path to load the model from."})
@@ -224,6 +227,14 @@ class ScriptArguments:
             "hardware support this feature."
         },
     )
+    enforce_eager: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always "
+            "execute the model in eager mode. If `False` (default behavior), we will use CUDA graph and eager "
+            "execution in hybrid."
+        },
+    )
 
 
 def main(script_args: ScriptArguments):
@@ -250,6 +261,7 @@ def main(script_args: ScriptArguments):
         revision=script_args.revision,
         tensor_parallel_size=script_args.tensor_parallel_size,
         gpu_memory_utilization=script_args.gpu_memory_utilization,
+        enforce_eager=script_args.enforce_eager,
         dtype=script_args.dtype,
         # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
         # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
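Note: a rough sketch of how the new option could be used programmatically, assuming the module is importable as shown and the remaining ScriptArguments fields keep their defaults; the model name is a placeholder:

    # Sketch only: enforce_eager is forwarded to the vLLM engine constructor inside main().
    from trl.scripts.vllm_serve import ScriptArguments, main

    args = ScriptArguments(
        model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model
        enforce_eager=True,                  # disable CUDA graphs; always run the model eagerly
    )
    main(args)  # starts the vLLM server with eager execution enforced

When the script is launched from the command line, the new field should surface as an --enforce_eager flag alongside the existing ScriptArguments options.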

trl/trainer/grpo_config.py (+6, -0)

@@ -59,6 +59,8 @@ class GRPOConfig(TrainingArguments):
             improving generation speed. However, disabling this option allows training models that exceed the VRAM
             capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
             with vLLM generation.
+        shuffle_dataset (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training dataset.
 
     > Parameters that control generation
 
@@ -225,6 +227,10 @@ class GRPOConfig(TrainingArguments):
             "is not compatible with vLLM generation."
         },
     )
+    shuffle_dataset: Optional[bool] = field(
+        default=True,
+        metadata={"help": "Whether to shuffle the training dataset."},
+    )
 
     # Parameters that control generation
     temperature: float = field(
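Note: a minimal sketch of the new config field in use; the output directory is a placeholder:

    # Sketch only: shuffle_dataset=False keeps the training data in its original order.
    from trl import GRPOConfig

    training_args = GRPOConfig(
        output_dir="grpo-output",  # placeholder path
        shuffle_dataset=False,
    )

GRPOTrainer reads this value and passes it as shuffle= to the train sampler (see trl/trainer/grpo_trainer.py below).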

trl/trainer/grpo_trainer.py (+34, -9)

@@ -78,7 +78,7 @@
 RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]]
 
 
-class RepeatRandomSampler(Sampler):
+class RepeatSampler(Sampler):
     """
     Sampler that repeats the indices of a dataset in a structured manner.
 
@@ -91,6 +91,8 @@ class RepeatRandomSampler(Sampler):
             Number of unique indices per batch.
         repeat_count (`int`, *optional*, defaults to `1`):
             Number of times to repeat the full sampling process.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the dataset.
         seed (`int` or `None`, *optional*, defaults to `None`):
             Random seed for reproducibility (only affects this sampler).
 
@@ -132,21 +134,28 @@ def __init__(
         mini_repeat_count: int,
         batch_size: int = 1,
         repeat_count: int = 1,
+        shuffle: bool = True,
         seed: Optional[int] = None,
     ):
         self.data_source = data_source
         self.mini_repeat_count = mini_repeat_count
         self.batch_size = batch_size
         self.repeat_count = repeat_count
         self.num_samples = len(data_source)
+        self.shuffle = shuffle
         self.seed = seed
-        self.generator = torch.Generator()  # Create a local random generator
-        if seed is not None:
-            self.generator.manual_seed(seed)
+
+        if shuffle:
+            self.generator = torch.Generator()  # Create a local random generator
+            if seed is not None:
+                self.generator.manual_seed(seed)
 
     def __iter__(self):
-        # E.g., [2, 4, 3, 1, 0, 6, 5] (num_samples = 7)
-        indexes = torch.randperm(self.num_samples, generator=self.generator).tolist()
+        if self.shuffle:
+            # E.g., [2, 4, 3, 1, 0, 6, 5] (num_samples = 7)
+            indexes = torch.randperm(self.num_samples, generator=self.generator).tolist()
+        else:
+            indexes = list(range(self.num_samples))
 
         # [2, 4, 3, 1, 0, 6, 5]
         # -> [[2, 4, 3], [1, 0, 6], [5]] (batch_size = 3)
@@ -166,6 +175,15 @@ def __len__(self) -> int:
         return self.num_samples * self.mini_repeat_count * self.repeat_count
 
 
+class RepeatRandomSampler(RepeatSampler):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "RepeatRandomSampler is deprecated and will be removed in version 0.18. Use RepeatSampler instead.",
+            DeprecationWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
 # torch.nanstd doesn't exist, so we define it here
 def nanstd(tensor: torch.Tensor) -> torch.Tensor:
     """
@@ -312,7 +330,9 @@ def reward_func(completions, **kwargs):
             Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
         processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
             Processing class used to process the data. The padding side must be set to "left". If `None`, the
-            processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`].
+            processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`]. A
+            padding token, `processing_class.pad_token`, must be set. If the processing class has not set a padding
+            token, `processing_class.eos_token` will be used as the default.
         reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
             Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
 
@@ -418,6 +438,8 @@ def __init__(
         # Processing class
         if processing_class is None:
             processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left")
+        if processing_class.pad_token is None:
+            processing_class.pad_token = processing_class.eos_token
 
         # Reward functions
         if not isinstance(reward_funcs, list):
@@ -481,6 +503,8 @@ def data_collator(features):  # No data collation is needed in GRPO
         self.mask_truncated_completions = args.mask_truncated_completions
 
         # Datasets
+        self.shuffle_dataset = args.shuffle_dataset
+
         if (
             isinstance(train_dataset, IterableDataset)
             or isinstance(eval_dataset, IterableDataset)
@@ -734,17 +758,18 @@ def _get_train_sampler(self) -> Sampler:
             * self.accelerator.num_processes
             * self.args.gradient_accumulation_steps
         )
-        return RepeatRandomSampler(
+        return RepeatSampler(
             data_source=self.train_dataset,
             mini_repeat_count=self.num_generations,
             batch_size=effective_batch_size // self.num_generations,
             repeat_count=self.num_iterations * self.args.gradient_accumulation_steps,
+            shuffle=self.shuffle_dataset,
            seed=self.args.seed,
         )
 
     def _get_eval_sampler(self, eval_dataset) -> Sampler:
         # See _get_train_sampler for an explanation of the sampler.
-        return RepeatRandomSampler(
+        return RepeatSampler(
             data_source=eval_dataset,
             mini_repeat_count=self.num_generations,
             seed=self.args.seed,
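Note: the old class name remains importable as a thin deprecation shim; a minimal sketch of what that implies for existing callers:

    # Sketch only: RepeatRandomSampler now subclasses RepeatSampler and warns on construction.
    import warnings
    from trl.trainer.grpo_trainer import RepeatRandomSampler, RepeatSampler

    dataset = ["a", "b", "c"]
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        sampler = RepeatRandomSampler(dataset, mini_repeat_count=1)

    assert isinstance(sampler, RepeatSampler)
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)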
