7 files changed: +2014 -3 lines changed. The two new YAML files shown below configure training runs with periodic in-loop evaluations; the first starts from Qwen2-1.5B-Instruct and the second from SmolLM2-135M-Instruct.
training:
  train_iters: 100_000
  logs:
    interval: 10
  evaluations:
    gsm8k:
      run_interval:
        interval: 10
      evaluator:
        type: lm_eval
        cli_args:
          - --tasks
          - gsm8k
          - --output_path
          - /mnt/checkpoints/test/denis/smol_eval_experiment/lm_eval
    stack_3b:
      run_interval:
        interval: 10
      evaluator:
        type: loss
        iterations: 10
        dataset_name: stack_3b
    fineweb:
      run_interval:
        interval: 10
      evaluator:
        iterations: 10
        dataset_name: fineweb
  checkpoint:
    interval: 1000
    keep: 5
  test_iters: 0
  export: # (1)!
    format: llama
    interval: 20_000
batch:
  micro_batch_size: 16
  sequence_length: 4096
  batch_size: 32
data:
  tokenizer:
    path: /mnt/checkpoints/pretrained_models/Qwen2-1.5B-Instruct
    bos_token: "<|endoftext|>"
  datasets:
    # Bad datasets: they are tokenized with a different tokenizer than Llama
    training:
      type: file
      path: /mnt/datasets/test/denis/fineweb_the_stack_3b.yaml
    stack_3b:
      type: memmap
      path: /mnt/datasets/data_collections/the_stack_3b/tokens/stack_3b/default/train/99
    fineweb:
      type: memmap
      path: /mnt/datasets/data_collections/standalone_datasets/tokens/HuggingFaceFW/fineweb/default/train/9_1000
optimizer:
  weight_decay: 0.1
  beta_1: 0.9
  beta_2: 0.95
  learning_rate:
    base: 1.0e-04 # (3)!
    minimum: 1.0e-05
    decay_style: cosine
    decay_iterations: 100_000
    warmup_iterations: 2000
pretrained: # (4)!
  format: qwen2
  path: /mnt/checkpoints/pretrained_models/Qwen2-1.5B-Instruct
  model_weights: yes # (5)!
model:
  base_model:
    transformer:
      use_flash_attention: yes
    cross_entropy_impl: fused
  multi_stage:
    zero_stage: 2
  distributed:
    training_dtype: bf16

run:
  experiment_dir: "/mnt/checkpoints/test/denis/qwen_eval_experiment"

# training:
#   logs:
#     interval: 10
#   wandb:
#     project_name: ${job.project_name}
#     group_name: ${job.project_version}
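The `gsm8k` evaluation above is backed by an external harness: `type: lm_eval` selects it, and the `cli_args` list (`--tasks gsm8k`, `--output_path …`) is passed through to it; the flag names match EleutherAI's lm-evaluation-harness. As a rough, non-authoritative sketch of what such a run amounts to (not the framework's actual integration code), the harness's Python API can evaluate a Hugging Face-format checkpoint directly; the checkpoint path below is a placeholder:

```python
# Hedged sketch: approximates a gsm8k evaluation via lm-evaluation-harness.
# Assumes `pip install lm_eval`; the checkpoint path is illustrative only.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                                            # Hugging Face transformers backend
    model_args="pretrained=/path/to/exported-checkpoint",  # placeholder, not from this config
    tasks=["gsm8k"],                                       # mirrors the --tasks cli_arg above
    batch_size=8,
)
print(results["results"]["gsm8k"])  # per-metric scores for the task
```

The second new config below is the same experiment pointed at SmolLM2-135M-Instruct: the tokenizer (no `bos_token` override), the `pretrained` section (`format: llama`), and the `experiment_dir` change, while the evaluation, batch, and optimizer settings are identical.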
training:
  train_iters: 100_000
  logs:
    interval: 10
  evaluations:
    gsm8k:
      run_interval:
        interval: 10
      evaluator:
        type: lm_eval
        cli_args:
          - --tasks
          - gsm8k
          - --output_path
          - /mnt/checkpoints/test/denis/smol_eval_experiment/lm_eval
    stack_3b:
      run_interval:
        interval: 10
      evaluator:
        type: loss
        iterations: 10
        dataset_name: stack_3b
    fineweb:
      run_interval:
        interval: 10
      evaluator:
        iterations: 10
        dataset_name: fineweb
  checkpoint:
    interval: 1000
    keep: 5
  test_iters: 0
  export: # (1)!
    format: llama
    interval: 20_000
batch:
  micro_batch_size: 16
  sequence_length: 4096
  batch_size: 32
data:
  tokenizer:
    path: /mnt/checkpoints/pretrained_models/SmolLM2-135M-Instruct
  datasets:
    # Bad datasets: they are tokenized with a different tokenizer than Llama
    training:
      type: file
      path: /mnt/datasets/test/denis/fineweb_the_stack_3b.yaml
    stack_3b:
      type: memmap
      path: /mnt/datasets/data_collections/the_stack_3b/tokens/stack_3b/default/train/99
    fineweb:
      type: memmap
      path: /mnt/datasets/data_collections/standalone_datasets/tokens/HuggingFaceFW/fineweb/default/train/9_1000
optimizer:
  weight_decay: 0.1
  beta_1: 0.9
  beta_2: 0.95
  learning_rate:
    base: 1.0e-04 # (3)!
    minimum: 1.0e-05
    decay_style: cosine
    decay_iterations: 100_000
    warmup_iterations: 2000
pretrained: # (4)!
  format: llama
  path: /mnt/checkpoints/pretrained_models/SmolLM2-135M-Instruct/
  model_weights: yes # (5)!
model:
  base_model:
    transformer:
      use_flash_attention: yes
    cross_entropy_impl: fused
  multi_stage:
    zero_stage: 2
  distributed:
    training_dtype: bf16

run:
  experiment_dir: "/mnt/checkpoints/test/denis/smol_eval_experiment"

# training:
#   logs:
#     interval: 10
#   wandb:
#     project_name: ${job.project_name}
#     group_name: ${job.project_version}
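For a quick sanity check of the batch settings shared by both configs, the numbers below are just arithmetic on the values above (not output from a training run):

```python
# Back-of-the-envelope figures implied by the batch/training sections above.
batch_size = 32          # sequences per optimizer step
micro_batch_size = 16    # so one step is 2 micro-batches, before accounting for data parallelism
sequence_length = 4096   # tokens per sequence
train_iters = 100_000

tokens_per_step = batch_size * sequence_length
total_tokens = tokens_per_step * train_iters

print(f"{tokens_per_step:,} tokens per optimizer step")        # 131,072
print(f"~{total_tokens / 1e9:.1f}B tokens over the full run")  # ~13.1B
```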