Commit de8a625

make e2e tests a bit faster by reducing test split size (axolotl-ai-cloud#2522) [skip ci]
* [ci] make e2e tests a bit faster by reducing test split size
* use 10% split of alpaca dataset to speed up dataset loading/tokenization
* reduce gas 4->2 for most e2e tests
* increase val set size for packing
1 parent 51267de commit de8a625
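
For context, every change in this commit is a tweak to the dict-style configs these e2e tests build before training. A minimal sketch of the resulting pattern (illustrative only, not an actual test from the repo; the keys and values mirror the diffs below):

    # Illustrative config fragment; keys and values mirror the diffs below.
    cfg = {
        "datasets": [
            {
                "path": "tatsu-lab/alpaca",
                "type": "alpaca",
                # load/tokenize only the first 10% of the train split
                "split": "train[:10%]",
            },
        ],
        "micro_batch_size": 2,
        # reduced from 4 to 2 for most e2e tests
        "gradient_accumulation_steps": 2,
        # smaller eval split for unpacked tests; packed tests move to 0.05
        "val_set_size": 0.02,
    }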

24 files changed (+54 −44 lines)

tests/e2e/integrations/test_cut_cross_entropy.py

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ def min_cfg(temp_dir):
         ],
         "cut_cross_entropy": True,
         "sequence_len": 1024,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -79,7 +79,7 @@ def test_qwen2_w_cce(self, temp_dir):
         ],
         "cut_cross_entropy": True,
         "sequence_len": 1024,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },

tests/e2e/multigpu/solo/test_flex.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def test_loss_llama(self, temp_dir):
         ],
         "num_epochs": 1,
         "micro_batch_size": 2,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_torch_fused",

tests/e2e/multigpu/test_llama.py

Lines changed: 13 additions & 6 deletions
@@ -58,12 +58,13 @@ def test_lora_ddp(self, temp_dir):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
         "max_steps": 2,
         "micro_batch_size": 1,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
@@ -201,7 +202,7 @@ def test_dpo_lora_ddp(self, temp_dir):
         "num_epochs": 1,
         "max_steps": 2,
         "micro_batch_size": 2,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "warmup_steps": 0,
@@ -279,7 +280,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
         "num_epochs": 1,
         "max_steps": 2,
         "micro_batch_size": 2,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "warmup_steps": 0,
@@ -335,6 +336,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -398,14 +400,15 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
         "sample_packing": True,
         "pad_to_sequence_len": True,
         "sequence_len": 1024,
-        "val_set_size": 0.01,
+        "val_set_size": 0.05,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
         "datasets": [
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -484,6 +487,7 @@ def test_fsdp2_packed(
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -565,7 +569,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
-                "split": "train[:25%]",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -660,14 +664,15 @@ def test_ds_zero3_packed(
         "sample_packing": True,
         "pad_to_sequence_len": True,
         "sequence_len": 1024,
-        "val_set_size": 0.01,
+        "val_set_size": 0.05,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
         "datasets": [
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -741,6 +746,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -814,6 +820,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,

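The gradient_accumulation_steps reduction above halves the work per optimizer step, and the packed FSDP/DeepSpeed tests raise val_set_size from 0.01 to 0.05, presumably so the now-smaller 10% Alpaca slice still yields a usable packed eval set. A rough sketch of the effective-batch arithmetic (the two-GPU world size is an assumption for illustration):

    # Hypothetical numbers to show the effective-batch arithmetic; world_size=2 is assumed.
    world_size = 2
    micro_batch_size = 2
    for gas in (4, 2):  # before -> after this commit
        effective_batch = micro_batch_size * gas * world_size
        print(f"gradient_accumulation_steps={gas} -> effective batch {effective_batch}")
    # gradient_accumulation_steps=4 -> effective batch 16
    # gradient_accumulation_steps=2 -> effective batch 8
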
tests/e2e/multigpu/test_ray.py

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,7 @@ def test_lora_ddp(self, temp_dir):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,
@@ -103,6 +104,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,

tests/e2e/multigpu/test_sp.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ def test_sequence_parallel_training(self, temp_dir):
             {
                 "path": "tatsu-lab/alpaca",
                 "type": "alpaca",
+                "split": "train[:10%]",
             },
         ],
         "num_epochs": 1,

tests/e2e/patched/test_llama_s2_attention.py

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ def test_lora_s2_attn(self, temp_dir):
         "lora_alpha": 16,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {},
         "datasets": [
             {
@@ -83,7 +83,7 @@ def test_fft_s2_attn(self, temp_dir):
         "sample_packing": False,
         "flash_attention": True,
         "s2_attention": True,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {},
         "datasets": [
             {

tests/e2e/patched/test_model_patches.py

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ def test_mixtral_multipack(self, temp_dir):
         "flash_attention": True,
         "sample_packing": True,
         "sequence_len": 2048,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {},
         "datasets": [
             {
@@ -59,7 +59,7 @@ def test_mistral_multipack(self, temp_dir):
         "flash_attention": True,
         "sample_packing": True,
         "sequence_len": 2048,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {},
         "datasets": [
             {

tests/e2e/patched/test_phi_multipack.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def test_qlora_packed(self, temp_dir):
         "lora_alpha": 32,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },

tests/e2e/solo/test_flex.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def test_loss_llama(self, temp_dir):
         ],
         "num_epochs": 1,
         "micro_batch_size": 2,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_torch_fused",

tests/e2e/test_deepseekv3.py

Lines changed: 2 additions & 2 deletions
@@ -65,7 +65,7 @@ def test_lora_deepseekv3(self, temp_dir, sample_packing):
         "chat_template": "deepseek_v3",
         "num_epochs": 1,
         "micro_batch_size": 1,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_bnb_8bit",
@@ -115,7 +115,7 @@ def test_fft_deepseekv3(self, temp_dir, sample_packing):
         },
         "num_epochs": 1,
         "micro_batch_size": 1,
-        "gradient_accumulation_steps": 4,
+        "gradient_accumulation_steps": 2,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_bnb_8bit",

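The "split": "train[:10%]" values added throughout use Hugging Face datasets slice notation, so each test loads and tokenizes only about a tenth of Alpaca instead of the full train split. A standalone sketch of what that split spec resolves to (row counts are approximate):

    # Minimal sketch of the slice syntax the new "split" values rely on.
    from datasets import load_dataset

    subset = load_dataset("tatsu-lab/alpaca", split="train[:10%]")
    print(len(subset))  # roughly 5.2k rows instead of the full ~52k-row train split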