Commit 82a9400 (verified)
kweinmeister committed · Parent(s): e6d1033

Training in progress, step 154
README.md CHANGED
@@ -1,8 +1,7 @@
 ---
-base_model: google/gemma-2-27b-it
-license: gemma
 library_name: peft
-pipeline_tag: text-generation
+license: gemma
+base_model: google/gemma-2-27b-it
 tags:
 - axolotl
 - generated_from_trainer
@@ -21,7 +20,121 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.6.0`
 ```yaml
+# base_model: meta-llama/Llama-3.2-1B-Instruct
+# # Automatically upload checkpoint and final model to HF
+# # hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-MetaMathQA
+# hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-gsm8k
+
+# load_in_8bit: false
+# load_in_4bit: true
+# strict: false
+
+
+# datasets:
+# - path: openai/gsm8k
+# type: alpaca_chat.load_qa
+# name: "main"
+# train_on_split: "train"
+
+
+# # datasets:
+# # - path: meta-math/MetaMathQA
+# # type:
+# # field_instruction: query
+# # field_output: response
+
+# val_set_size: 0.1
+# # output_dir: "/mnt/disks/gcs/axolotl/outputs/out"
+# output_dir: "/mnt/disks/gcs/axolotl/outputs/gsm8k-out"
+# # output_dir: "/mnt/disks/gcs/axolotl/outputs/MetaMathQA-out"
+
+# adapter: qlora
+# lora_model_dir:
+
+# sequence_len: 2048
+# sample_packing: true
+# eval_sample_packing: true
+# pad_to_sequence_len: true
+
+# lora_r: 32
+# lora_alpha: 16
+# lora_dropout: 0.05
+# lora_fan_in_fan_out:
+# lora_target_modules:
+# - gate_proj
+# - down_proj
+# - up_proj
+# - q_proj
+# - v_proj
+# - k_proj
+# - o_proj
+
+# wandb_project:
+# wandb_entity:
+# wandb_watch:
+# wandb_name:
+# wandb_log_model:
+
+# gradient_accumulation_steps: 4
+# micro_batch_size: 2
+# num_epochs: 3
+# # optimizer: adamw_bnb_8bit
+# optimizer: adamw_torch
+# lr_scheduler: cosine
+# learning_rate: 2e-5
+
+# train_on_inputs: false
+# group_by_length: false
+# bf16: auto
+# fp16:
+# tf32: false
+
+# # gradient_checkpointing: true
+# gradient_checkpointing: false
+# early_stopping_patience:
+# resume_from_checkpoint:
+# local_rank:
+# logging_steps: 1
+# xformers_attention:
+# flash_attention: true
+
+# loss_watchdog_threshold: 5.0
+# loss_watchdog_patience: 3
+
+# warmup_steps: 10
+# evals_per_epoch: 4
+# eval_table_size:
+# eval_max_new_tokens: 128
+# saves_per_epoch: 1
+# debug:
+# deepspeed:
+# weight_decay: 0.0
+# # fsdp:
+# # fsdp_config:
+# fsdp:
+# - full_shard
+# - auto_wrap
+# fsdp_config:
+# fsdp_limit_all_gathers: true
+# fsdp_sync_module_states: true
+# fsdp_offload_params: true
+# fsdp_use_orig_params: false
+# fsdp_cpu_ram_efficient_loading: true
+# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+# fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+# fsdp_state_dict_type: FULL_STATE_DICT
+# fsdp_sharding_strategy: FULL_SHARD
+# fsdp_activation_checkpointing: true
+# special_tokens:
+# # pad_token: "<|end_of_text|>"
+# special_tokens:
+# bos_token: "<|begin_of_text|>"
+# eos_token: "<|eot_id|>"
+# pad_token: "<|finetune_right_pad_id|>"
+
 base_model: google/gemma-2-27b-it
+# model_type: AutoModelForCausalLM
+# tokenizer_type: AutoTokenizer
 hub_model_id: kweinmeister/gemma-2-27b-it-dolly-15k
 
 load_in_8bit: false
@@ -39,6 +152,7 @@ val_set_size: 0.1
 output_dir: "/mnt/disks/gcs/axolotl/outputs/dolly-15k-out"
 
 adapter: qlora
+
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
@@ -46,21 +160,26 @@ lora_target_linear: true
 
 sequence_len: 2048
 sample_packing: true
+# eval_sample_packing: true
 pad_to_sequence_len: true
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 3
+# optimizer: adamw_bnb_8bit
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 2e-5
 
+
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 
+
+# gradient_checkpointing: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
@@ -69,16 +188,56 @@ logging_steps: 1
 xformers_attention:
 flash_attention: false
 
+# loss_watchdog_threshold: 5.0
+# loss_watchdog_patience: 3
+
+
 warmup_ratio: 0.1
 evals_per_epoch: 4
 eval_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
-deepspeed: deepspeed_configs/zero1.json
+# deepspeed:
 weight_decay: 0.0
 
+deepspeed: deepspeed_configs/zero1.json
+
 fsdp:
 fsdp_config:
+# fsdp:
+# - full_shard
+# - auto_wrap
+
+# fsdp_config:
+# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+# fsdp_backward_prefetch: BACKWARD_PRE
+# fsdp_cpu_ram_efficient_loading: true
+# fsdp_forward_prefetch: false
+# fsdp_offload_params: true
+# fsdp_sharding_strategy: FULL_SHARD
+# fsdp_state_dict_type: SHARDED_STATE_DICT
+# fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
+# fsdp_sync_module_states: true
+# fsdp_use_orig_params: true
+
+
+# fsdp_config:
+# fsdp_limit_all_gathers: true
+# fsdp_sync_module_states: true
+# fsdp_offload_params: true
+# fsdp_use_orig_params: false
+# fsdp_cpu_ram_efficient_loading: true
+# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+# fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
+# fsdp_state_dict_type: FULL_STATE_DICT
+# fsdp_sharding_strategy: FULL_SHARD
+# fsdp_activation_checkpointing: true
+# special_tokens:
+# # pad_token: "<|end_of_text|>"
+# special_tokens:
+# bos_token: "<|begin_of_text|>"
+# eos_token: "<|eot_id|>"
+# pad_token: "<|finetune_right_pad_id|>"
 ```
 
 </details><br>
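The config above fine-tunes `google/gemma-2-27b-it` with a QLoRA adapter and pushes it to `kweinmeister/gemma-2-27b-it-dolly-15k`. As a hedged sketch (not part of this commit), the resulting adapter could be loaded for inference roughly as follows; the prompt string is purely illustrative, and the `transformers`, `peft`, and `bitsandbytes` packages are assumed.

```python
# Minimal inference sketch, assuming transformers + peft + bitsandbytes.
# It mirrors the 4-bit NF4 settings recorded in config.json and attaches the
# LoRA adapter produced by the axolotl config above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

BASE = "google/gemma-2-27b-it"
ADAPTER = "kweinmeister/gemma-2-27b-it-dolly-15k"

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForCausalLM.from_pretrained(
    BASE, quantization_config=bnb, device_map="auto"
)
model = PeftModel.from_pretrained(model, ADAPTER)  # attach the trained LoRA weights

# Hypothetical prompt; max_new_tokens mirrors eval_max_new_tokens: 128 above.
inputs = tokenizer("Summarize what this model was fine-tuned on.", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```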
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "o_proj",
     "gate_proj",
     "v_proj",
-    "k_proj",
-    "o_proj",
-    "q_proj",
     "down_proj",
-    "up_proj"
+    "k_proj",
+    "up_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5709a09cb3224a0ce11a226d4f596d5351cb9762a1b2a0eb00e00519c8b9f431
+oid sha256:58b08d330dcb589f67d630845c8d7b4e9398a45427d9eac63f91ba0b7f03406c
 size 456807968
config.json CHANGED
@@ -27,7 +27,7 @@
     "_load_in_4bit": true,
     "_load_in_8bit": false,
     "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "nf4",
     "bnb_4bit_use_double_quant": true,
     "llm_int8_enable_fp32_cpu_offload": false,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8d1596f14357a7da8bd1d96e2ae56deec6db9b25132673be4df1cd5d1a18bc8e
+oid sha256:0a6b229a80811553a8917ed8e0f624c6e8ad3674fe3237b402de0aa44dd4c70d
 size 7992