Training in progress, step 154

Files changed:
- README.md (+163, -4)
- adapter_config.json (+4, -4)
- adapter_model.safetensors (+1, -1)
- config.json (+1, -1)
- training_args.bin (+1, -1)
README.md CHANGED

````diff
@@ -1,8 +1,7 @@
 ---
-base_model: google/gemma-2-27b-it
-license: gemma
 library_name: peft
-
+license: gemma
+base_model: google/gemma-2-27b-it
 tags:
 - axolotl
 - generated_from_trainer
@@ -21,7 +20,121 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.6.0`
 ```yaml
+# base_model: meta-llama/Llama-3.2-1B-Instruct
+# # Automatically upload checkpoint and final model to HF
+# # hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-MetaMathQA
+# hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-gsm8k
+
+# load_in_8bit: false
+# load_in_4bit: true
+# strict: false
+
+
+# datasets:
+#   - path: openai/gsm8k
+#     type: alpaca_chat.load_qa
+#     name: "main"
+#     train_on_split: "train"
+
+
+# # datasets:
+# #   - path: meta-math/MetaMathQA
+# #     type:
+# #       field_instruction: query
+# #       field_output: response
+
+# val_set_size: 0.1
+# # output_dir: "/mnt/disks/gcs/axolotl/outputs/out"
+# output_dir: "/mnt/disks/gcs/axolotl/outputs/gsm8k-out"
+# # output_dir: "/mnt/disks/gcs/axolotl/outputs/MetaMathQA-out"
+
+# adapter: qlora
+# lora_model_dir:
+
+# sequence_len: 2048
+# sample_packing: true
+# eval_sample_packing: true
+# pad_to_sequence_len: true
+
+# lora_r: 32
+# lora_alpha: 16
+# lora_dropout: 0.05
+# lora_fan_in_fan_out:
+# lora_target_modules:
+#   - gate_proj
+#   - down_proj
+#   - up_proj
+#   - q_proj
+#   - v_proj
+#   - k_proj
+#   - o_proj
+
+# wandb_project:
+# wandb_entity:
+# wandb_watch:
+# wandb_name:
+# wandb_log_model:
+
+# gradient_accumulation_steps: 4
+# micro_batch_size: 2
+# num_epochs: 3
+# # optimizer: adamw_bnb_8bit
+# optimizer: adamw_torch
+# lr_scheduler: cosine
+# learning_rate: 2e-5
+
+# train_on_inputs: false
+# group_by_length: false
+# bf16: auto
+# fp16:
+# tf32: false
+
+# # gradient_checkpointing: true
+# gradient_checkpointing: false
+# early_stopping_patience:
+# resume_from_checkpoint:
+# local_rank:
+# logging_steps: 1
+# xformers_attention:
+# flash_attention: true
+
+# loss_watchdog_threshold: 5.0
+# loss_watchdog_patience: 3
+
+# warmup_steps: 10
+# evals_per_epoch: 4
+# eval_table_size:
+# eval_max_new_tokens: 128
+# saves_per_epoch: 1
+# debug:
+# deepspeed:
+# weight_decay: 0.0
+# # fsdp:
+# # fsdp_config:
+# fsdp:
+#   - full_shard
+#   - auto_wrap
+# fsdp_config:
+#   fsdp_limit_all_gathers: true
+#   fsdp_sync_module_states: true
+#   fsdp_offload_params: true
+#   fsdp_use_orig_params: false
+#   fsdp_cpu_ram_efficient_loading: true
+#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+#   fsdp_state_dict_type: FULL_STATE_DICT
+#   fsdp_sharding_strategy: FULL_SHARD
+#   fsdp_activation_checkpointing: true
+# special_tokens:
+# #   pad_token: "<|end_of_text|>"
+# special_tokens:
+#   bos_token: "<|begin_of_text|>"
+#   eos_token: "<|eot_id|>"
+#   pad_token: "<|finetune_right_pad_id|>"
+
 base_model: google/gemma-2-27b-it
+# model_type: AutoModelForCausalLM
+# tokenizer_type: AutoTokenizer
 hub_model_id: kweinmeister/gemma-2-27b-it-dolly-15k
 
 load_in_8bit: false
@@ -39,6 +152,7 @@ val_set_size: 0.1
 output_dir: "/mnt/disks/gcs/axolotl/outputs/dolly-15k-out"
 
 adapter: qlora
+
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
@@ -46,21 +160,26 @@ lora_target_linear: true
 
 sequence_len: 2048
 sample_packing: true
+# eval_sample_packing: true
 pad_to_sequence_len: true
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 3
+# optimizer: adamw_bnb_8bit
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 2e-5
 
+
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 
+
+# gradient_checkpointing: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
@@ -69,16 +188,56 @@ logging_steps: 1
 xformers_attention:
 flash_attention: false
 
+# loss_watchdog_threshold: 5.0
+# loss_watchdog_patience: 3
+
+
 warmup_ratio: 0.1
 evals_per_epoch: 4
 eval_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
-deepspeed:
+# deepspeed:
 weight_decay: 0.0
 
+deepspeed: deepspeed_configs/zero1.json
+
 fsdp:
 fsdp_config:
+# fsdp:
+#   - full_shard
+#   - auto_wrap
+
+# fsdp_config:
+#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#   fsdp_backward_prefetch: BACKWARD_PRE
+#   fsdp_cpu_ram_efficient_loading: true
+#   fsdp_forward_prefetch: false
+#   fsdp_offload_params: true
+#   fsdp_sharding_strategy: FULL_SHARD
+#   fsdp_state_dict_type: SHARDED_STATE_DICT
+#   fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
+#   fsdp_sync_module_states: true
+#   fsdp_use_orig_params: true
+
+
+# fsdp_config:
+#   fsdp_limit_all_gathers: true
+#   fsdp_sync_module_states: true
+#   fsdp_offload_params: true
+#   fsdp_use_orig_params: false
+#   fsdp_cpu_ram_efficient_loading: true
+#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#   fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
+#   fsdp_state_dict_type: FULL_STATE_DICT
+#   fsdp_sharding_strategy: FULL_SHARD
+#   fsdp_activation_checkpointing: true
+# special_tokens:
+# #   pad_token: "<|end_of_text|>"
+# special_tokens:
+#   bos_token: "<|begin_of_text|>"
+#   eos_token: "<|eot_id|>"
+#   pad_token: "<|finetune_right_pad_id|>"
 ```
 
 </details><br>
````
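The config above trains a QLoRA adapter for `google/gemma-2-27b-it` (effective batch size per device: `gradient_accumulation_steps` 4 × `micro_batch_size` 2 = 8) and pushes checkpoints to `kweinmeister/gemma-2-27b-it-dolly-15k`. As a usage note, a minimal sketch of loading the published adapter for inference with `transformers` and `peft`; the model and adapter IDs come from the config, while the prompt and the 4-bit loading details are illustrative assumptions:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

BASE = "google/gemma-2-27b-it"                     # base_model in the config
ADAPTER = "kweinmeister/gemma-2-27b-it-dolly-15k"  # hub_model_id in the config

# Load the base model in 4-bit, mirroring the QLoRA setup used for training.
model = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(model, ADAPTER)  # attach the LoRA weights
tokenizer = AutoTokenizer.from_pretrained(BASE)

prompt = "List three uses of an instruction-tuned model."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)  # eval_max_new_tokens: 128
print(tokenizer.decode(out[0], skip_special_tokens=True))
```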
adapter_config.json CHANGED

```diff
@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "o_proj",
     "gate_proj",
     "v_proj",
-    "k_proj",
-    "o_proj",
-    "q_proj",
     "down_proj",
-    "up_proj"
+    "k_proj",
+    "up_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
```
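The change here only reorders `target_modules`; with `lora_target_linear: true` in the config above, the adapter still covers the same set of linear projection layers. For reference, a rough sketch of the equivalent `peft.LoraConfig`, with `r`, `lora_alpha`, and `lora_dropout` taken from the axolotl config (the explicit module list is an assumption standing in for `lora_target_linear`):

```python
from peft import LoraConfig

# Roughly the adapter configuration stored in adapter_config.json;
# r/alpha/dropout mirror lora_r, lora_alpha, lora_dropout from the config.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type="CAUSAL_LM",
)
print(lora_config)
```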
adapter_model.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:58b08d330dcb589f67d630845c8d7b4e9398a45427d9eac63f91ba0b7f03406c
 size 456807968
```
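`adapter_model.safetensors` is stored via Git LFS, so the commit only rewrites the pointer file (a new `oid` hash at the same 456807968-byte size). A small standard-library sketch of checking a downloaded blob against such a pointer; the file paths are placeholders:

```python
import hashlib
from pathlib import Path

def lfs_pointer_matches(pointer_path: str, blob_path: str) -> bool:
    """Compare a local file against the oid/size recorded in a Git LFS pointer."""
    fields = dict(
        line.split(" ", 1)
        for line in Path(pointer_path).read_text().splitlines()
        if " " in line
    )
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    blob = Path(blob_path)
    digest = hashlib.sha256(blob.read_bytes()).hexdigest()
    return digest == expected_oid and blob.stat().st_size == expected_size

# Example with hypothetical local paths:
# print(lfs_pointer_matches("adapter_model.pointer", "adapter_model.safetensors"))
```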
config.json CHANGED

```diff
@@ -27,7 +27,7 @@
   "_load_in_4bit": true,
   "_load_in_8bit": false,
   "bnb_4bit_compute_dtype": "bfloat16",
-  "bnb_4bit_quant_storage": "
+  "bnb_4bit_quant_storage": "uint8",
   "bnb_4bit_quant_type": "nf4",
   "bnb_4bit_use_double_quant": true,
   "llm_int8_enable_fp32_cpu_offload": false,
```
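The quantization block now records `bnb_4bit_quant_storage` explicitly; this field sets the storage dtype of the packed 4-bit weights (uint8 is the usual default) and matters mainly when sharding quantized models, for example with FSDP. A sketch of a `transformers.BitsAndBytesConfig` matching the values shown, assuming a transformers version recent enough to accept `bnb_4bit_quant_storage`:

```python
import torch
from transformers import BitsAndBytesConfig

# Mirrors the quantization settings serialized into config.json above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.uint8,  # the field added in this commit
)
```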
training_args.bin CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0a6b229a80811553a8917ed8e0f624c6e8ad3674fe3237b402de0aa44dd4c70d
 size 7992
```
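`training_args.bin` is likewise an LFS pointer; the underlying 7992-byte file is the pickled `TrainingArguments` object the trainer ran with. A hedged sketch of inspecting it from a local checkout (`weights_only=False` is needed on recent PyTorch because this is a pickled Python object, and `transformers` must be importable to unpickle it):

```python
import torch

# Load the serialized TrainingArguments from a local copy of the repo.
args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)  # expected: TrainingArguments
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```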