Commit cb744b2

copy from sandbox
1 parent 0f77750 commit cb744b2

File tree

7 files changed: +2014, −3 lines


examples/qwen_evaluate.yaml

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
training:
  train_iters: 100_000
  logs:
    interval: 10
  evaluations:
    gsm8k:
      run_interval:
        interval: 10
      evaluator:
        type: lm_eval
        cli_args:
          - --tasks
          - gsm8k
          - --output_path
          - /mnt/checkpoints/test/denis/qwen_eval_experiment/lm_eval
    stack_3b:
      run_interval:
        interval: 10
      evaluator:
        type: loss
        iterations: 10
        dataset_name: stack_3b
    fineweb:
      run_interval:
        interval: 10
      evaluator:
        iterations: 10
        dataset_name: fineweb
  checkpoint:
    interval: 1000
    keep: 5
  test_iters: 0
  export: # (1)!
    format: llama
    interval: 20_000
batch:
  micro_batch_size: 16
  sequence_length: 4096
  batch_size: 32
data:
  tokenizer:
    path: /mnt/checkpoints/pretrained_models/Qwen2-1.5B-Instruct
    bos_token: "<|endoftext|>"
  datasets:
    # Bad datasets: they were tokenized with a different tokenizer than llama
    training:
      type: file
      path: /mnt/datasets/test/denis/fineweb_the_stack_3b.yaml
    stack_3b:
      type: memmap
      path: /mnt/datasets/data_collections/the_stack_3b/tokens/stack_3b/default/train/99
    fineweb:
      type: memmap
      path: /mnt/datasets/data_collections/standalone_datasets/tokens/HuggingFaceFW/fineweb/default/train/9_1000
optimizer:
  weight_decay: 0.1
  beta_1: 0.9
  beta_2: 0.95
  learning_rate:
    base: 1.0e-04 # (3)!
    minimum: 1.0e-05
    decay_style: cosine
    decay_iterations: 100_000
    warmup_iterations: 2000
pretrained: # (4)!
  format: qwen2
  path: /mnt/checkpoints/pretrained_models/Qwen2-1.5B-Instruct
  model_weights: yes # (5)!
model:
  base_model:
    transformer:
      use_flash_attention: yes
    cross_entropy_impl: fused
  multi_stage:
    zero_stage: 2
  distributed:
    training_dtype: bf16

run:
  experiment_dir: "/mnt/checkpoints/test/denis/qwen_eval_experiment"

# training:
#   logs:
#     interval: 10
#   wandb:
#     project_name: ${job.project_name}
#     group_name: ${job.project_version}
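To sanity-check a config like this without launching a run, here is a minimal sketch, assuming PyYAML is installed and the script runs from the repository root, that parses the file and lists each evaluator with its run interval. The default-type fallback for the fineweb entry is an assumption, not something the commit specifies.

import yaml  # PyYAML, assumed available

# Parse the example config and summarize the evaluation schedule.
with open("examples/qwen_evaluate.yaml") as f:
    config = yaml.safe_load(f)

for name, spec in config["training"]["evaluations"].items():
    interval = spec["run_interval"]["interval"]
    # The fineweb entry omits `type`; treating it as a loss evaluator
    # here is an assumption about the config's defaults.
    ev_type = spec["evaluator"].get("type", "loss")
    print(f"{name}: type={ev_type}, runs every {interval} iterations")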

examples/smol_evaluate.yaml

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
training:
  train_iters: 100_000
  logs:
    interval: 10
  evaluations:
    gsm8k:
      run_interval:
        interval: 10
      evaluator:
        type: lm_eval
        cli_args:
          - --tasks
          - gsm8k
          - --output_path
          - /mnt/checkpoints/test/denis/smol_eval_experiment/lm_eval
    stack_3b:
      run_interval:
        interval: 10
      evaluator:
        type: loss
        iterations: 10
        dataset_name: stack_3b
    fineweb:
      run_interval:
        interval: 10
      evaluator:
        iterations: 10
        dataset_name: fineweb
  checkpoint:
    interval: 1000
    keep: 5
  test_iters: 0
  export: # (1)!
    format: llama
    interval: 20_000
batch:
  micro_batch_size: 16
  sequence_length: 4096
  batch_size: 32
data:
  tokenizer:
    path: /mnt/checkpoints/pretrained_models/SmolLM2-135M-Instruct
  datasets:
    # Bad datasets: they were tokenized with a different tokenizer than llama
    training:
      type: file
      path: /mnt/datasets/test/denis/fineweb_the_stack_3b.yaml
    stack_3b:
      type: memmap
      path: /mnt/datasets/data_collections/the_stack_3b/tokens/stack_3b/default/train/99
    fineweb:
      type: memmap
      path: /mnt/datasets/data_collections/standalone_datasets/tokens/HuggingFaceFW/fineweb/default/train/9_1000
optimizer:
  weight_decay: 0.1
  beta_1: 0.9
  beta_2: 0.95
  learning_rate:
    base: 1.0e-04 # (3)!
    minimum: 1.0e-05
    decay_style: cosine
    decay_iterations: 100_000
    warmup_iterations: 2000
pretrained: # (4)!
  format: llama
  path: /mnt/checkpoints/pretrained_models/SmolLM2-135M-Instruct/
  model_weights: yes # (5)!
model:
  base_model:
    transformer:
      use_flash_attention: yes
    cross_entropy_impl: fused
  multi_stage:
    zero_stage: 2
  distributed:
    training_dtype: bf16

run:
  experiment_dir: "/mnt/checkpoints/test/denis/smol_eval_experiment"

# training:
#   logs:
#     interval: 10
#   wandb:
#     project_name: ${job.project_name}
#     group_name: ${job.project_version}
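The two examples are intentionally near-identical, so a quick stdlib sketch (paths assumed relative to the repository root) makes the model-specific differences easy to audit: the tokenizer and pretrained checkpoint paths, the pretrained format, the experiment and lm_eval output directories, and the Qwen-only bos_token override.

import difflib

# Print only the lines that differ between the two example configs.
paths = ["examples/qwen_evaluate.yaml", "examples/smol_evaluate.yaml"]
old, new = (open(p).read().splitlines() for p in paths)
for line in difflib.unified_diff(old, new, *paths, lineterm="", n=0):
    print(line)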
