rayonlabs RoyJoy committed
Commit e3d9c97 · verified · 0 parent(s)

Duplicate from sn56a2/0217a21b-6782-4e2b-80a2-f08c9efc8d57


Co-authored-by: Roy Joy <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED
@@ -0,0 +1,175 @@
+ ---
+ library_name: peft
+ license: apache-2.0
+ base_model: unsloth/tinyllama
+ tags:
+ - axolotl
+ - generated_from_trainer
+ model-index:
+ - name: 0217a21b-6782-4e2b-80a2-f08c9efc8d57
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.4.1`
+ ```yaml
+ adapter: lora
+ base_model: unsloth/tinyllama
+ bf16: auto
+ chat_template: llama3
+ cosine_min_lr_ratio: 0.1
+ data_processes: 16
+ dataset_prepared_path: null
+ datasets:
+ - data_files:
+   - 19c4a843ffacedb8_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/19c4a843ffacedb8_train_data.json
+   type:
+     field_input: input
+     field_instruction: instruction
+     field_output: output
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ debug: null
+ deepspeed: null
+ device_map: '{'''':torch.cuda.current_device()}'
+ do_eval: true
+ early_stopping_patience: 1
+ eval_batch_size: 6
+ eval_sample_packing: false
+ eval_steps: 25
+ evaluation_strategy: steps
+ flash_attention: true
+ fp16: null
+ fsdp: null
+ fsdp_config: null
+ gradient_accumulation_steps: 5
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: sn56a2/0217a21b-6782-4e2b-80a2-f08c9efc8d57
+ hub_repo: stevemonite
+ hub_strategy: checkpoint
+ hub_token: null
+ learning_rate: 0.0001
+ load_in_4bit: false
+ load_in_8bit: false
+ local_rank: null
+ logging_steps: 1
+ lora_alpha: 64
+ lora_dropout: 0.05
+ lora_fan_in_fan_out: null
+ lora_model_dir: null
+ lora_r: 32
+ lora_target_linear: true
+ lora_target_modules:
+ - q_proj
+ - v_proj
+ lr_scheduler: cosine
+ max_grad_norm: 1.0
+ max_memory:
+   0: 70GiB
+ max_steps: 142
+ micro_batch_size: 6
+ mlflow_experiment_name: /tmp/19c4a843ffacedb8_train_data.json
+ model_type: AutoModelForCausalLM
+ num_epochs: 3
+ optim_args:
+   adam_beta1: 0.9
+   adam_beta2: 0.95
+   adam_epsilon: 1e-5
+ optimizer: adamw_torch
+ output_dir: miner_id_24
+ pad_to_sequence_len: true
+ resume_from_checkpoint: null
+ s2_attention: null
+ sample_packing: false
+ save_steps: 50
+ save_strategy: steps
+ sequence_len: 2048
+ strict: false
+ tf32: false
+ tokenizer_type: AutoTokenizer
+ torch_compile: false
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 50
+ wandb_entity: sn56-miner
+ wandb_mode: disabled
+ wandb_name: 0217a21b-6782-4e2b-80a2-f08c9efc8d57
+ wandb_project: god
+ wandb_run: a9i6
+ wandb_runid: 0217a21b-6782-4e2b-80a2-f08c9efc8d57
+ warmup_raio: 0.03
+ warmup_ratio: 0.04
+ weight_decay: 0.01
+ xformers_attention: null
+
+ ```
+
+ </details><br>
+
+ # 0217a21b-6782-4e2b-80a2-f08c9efc8d57
+
+ This model is a fine-tuned version of [unsloth/tinyllama](https://huggingface.co/unsloth/tinyllama) on an unspecified dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.1068
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 6
+ - eval_batch_size: 6
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 5
+ - total_train_batch_size: 120 (see the quick check after this list)
+ - total_eval_batch_size: 24
+ - optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-05 (set via optimizer_args, overriding the defaults betas=(0.9, 0.999) and epsilon=1e-08)
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 5
+ - training_steps: 142
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 0.8413 | 0.0051 | 1 | 0.8489 |
+ | 0.1495 | 0.1287 | 25 | 0.2523 |
+ | 0.0913 | 0.2575 | 50 | 0.1599 |
+ | 0.0638 | 0.3862 | 75 | 0.1326 |
+ | 0.0568 | 0.5149 | 100 | 0.1187 |
+ | 0.0471 | 0.6437 | 125 | 0.1068 |
+
+
+ ### Framework versions
+
+ - PEFT 0.13.2
+ - Transformers 4.46.0
+ - Pytorch 2.5.0+cu124
+ - Datasets 3.0.1
+ - Tokenizers 0.20.1

adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "unsloth/tinyllama",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 64,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "down_proj",
+     "v_proj",
+     "q_proj",
+     "o_proj",
+     "gate_proj",
+     "k_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }

adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bde4994cef3b0dba69854e5fdb1e66529ab9485d38ada030a2c2e21f1bf49738
+ size 101036698

adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1638e49f16369ebdd49a26a3cccb66a9226474a8a995be95c371f2ee1deb1505
+ size 100966336

config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "unsloth/tinyllama",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5632,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 22,
+   "num_key_value_heads": 4,
+   "pad_token_id": 0,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.0",
+   "unsloth_version": "2024.9",
+   "use_cache": false,
+   "vocab_size": 32000
+ }

last-checkpoint/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: unsloth/tinyllama
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
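+
+ As a placeholder while the card is incomplete, here is a minimal PEFT loading sketch (hypothetical usage; `path/to/last-checkpoint` is a stand-in for wherever this checkpoint is stored):
+
+ ```python
+ from transformers import AutoModelForCausalLM
+ from peft import PeftModel
+
+ # Attach this LoRA checkpoint to its base model.
+ base = AutoModelForCausalLM.from_pretrained("unsloth/tinyllama")
+ model = PeftModel.from_pretrained(base, "path/to/last-checkpoint")
+ ```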
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.13.2

last-checkpoint/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "unsloth/tinyllama",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 64,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "down_proj",
+     "v_proj",
+     "q_proj",
+     "o_proj",
+     "gate_proj",
+     "k_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }

last-checkpoint/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09b5f3b1ecd7bfb6a60fd897ee28be8c994d694ecb061b58bc8eeda9221ef11a
+ size 100966336

last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fb4b23c8a19d5116b9398aad6c3c77553d8efedb6eb6278df55604ba8d309ef
+ size 202110330

last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f8f27267318b261cd046500398b77ce37072279aa787191b373ca1227e2cefe
+ size 15024

last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64bb2f27184a144321829e28b2736f9eb1224d3c35e0294dd4ed8f980f6fb581
+ size 15024

last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd4d6c07f6fea5f7278d4e050c69c8e7ba3d265203ac7f7df6d67c91505b7cc2
+ size 15024

last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce24024ebbc538599742941099f555a69b891ffbcb1908771967b6160a67269a
+ size 15024

last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c0c82ef6504cec15b93a476d07a29c3a58a42bf2d87f279f10681f1a866d2a5
+ size 1064

last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }

last-checkpoint/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723

last-checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<unk>",
+   "padding_side": "left",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }

last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,1084 @@
+ {
+   "best_metric": 0.11873143911361694,
+   "best_model_checkpoint": "miner_id_24/checkpoint-100",
+   "epoch": 0.7312049433573635,
+   "eval_steps": 25,
+   "global_step": 142,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.005149330587023687,
+       "grad_norm": 1.9118549823760986,
+       "learning_rate": 2e-05,
+       "loss": 0.8413,
+       "step": 1
+     },
+     {
+       "epoch": 0.005149330587023687,
+       "eval_loss": 0.8488772511482239,
+       "eval_runtime": 0.4861,
+       "eval_samples_per_second": 102.863,
+       "eval_steps_per_second": 6.172,
+       "step": 1
+     },
+     {
+       "epoch": 0.010298661174047374,
+       "grad_norm": 1.8143242597579956,
+       "learning_rate": 4e-05,
+       "loss": 0.7566,
+       "step": 2
+     },
+     {
+       "epoch": 0.015447991761071062,
+       "grad_norm": 1.861998200416565,
+       "learning_rate": 6e-05,
+       "loss": 0.7849,
+       "step": 3
+     },
+     {
+       "epoch": 0.02059732234809475,
+       "grad_norm": 1.755989670753479,
+       "learning_rate": 8e-05,
+       "loss": 0.7278,
+       "step": 4
+     },
+     {
+       "epoch": 0.025746652935118436,
+       "grad_norm": 1.1633100509643555,
+       "learning_rate": 0.0001,
+       "loss": 0.651,
+       "step": 5
+     },
+     {
+       "epoch": 0.030895983522142123,
+       "grad_norm": 2.0457730293273926,
+       "learning_rate": 9.99881689824633e-05,
+       "loss": 0.5963,
+       "step": 6
+     },
+     {
+       "epoch": 0.03604531410916581,
+       "grad_norm": 1.874700665473938,
+       "learning_rate": 9.995268215087426e-05,
+       "loss": 0.5782,
+       "step": 7
+     },
+     {
+       "epoch": 0.0411946446961895,
+       "grad_norm": 1.2453465461730957,
+       "learning_rate": 9.989355816502525e-05,
+       "loss": 0.4915,
+       "step": 8
+     },
+     {
+       "epoch": 0.046343975283213185,
+       "grad_norm": 1.1409831047058105,
+       "learning_rate": 9.981082811366797e-05,
+       "loss": 0.4244,
+       "step": 9
+     },
+     {
+       "epoch": 0.05149330587023687,
+       "grad_norm": 0.8969383835792542,
+       "learning_rate": 9.970453549816632e-05,
+       "loss": 0.3625,
+       "step": 10
+     },
+     {
+       "epoch": 0.05664263645726056,
+       "grad_norm": 1.0627210140228271,
+       "learning_rate": 9.957473620962246e-05,
+       "loss": 0.3223,
+       "step": 11
+     },
+     {
+       "epoch": 0.061791967044284246,
+       "grad_norm": 0.9921000599861145,
+       "learning_rate": 9.94214984994879e-05,
+       "loss": 0.2614,
+       "step": 12
+     },
+     {
+       "epoch": 0.06694129763130793,
+       "grad_norm": 1.0099488496780396,
+       "learning_rate": 9.924490294367533e-05,
+       "loss": 0.393,
+       "step": 13
+     },
+     {
+       "epoch": 0.07209062821833162,
+       "grad_norm": 0.9516873359680176,
+       "learning_rate": 9.904504240019e-05,
+       "loss": 0.3875,
+       "step": 14
+     },
+     {
+       "epoch": 0.07723995880535531,
+       "grad_norm": 0.7377267479896545,
+       "learning_rate": 9.88220219603028e-05,
+       "loss": 0.3633,
+       "step": 15
+     },
+     {
+       "epoch": 0.082389289392379,
+       "grad_norm": 0.6139414310455322,
+       "learning_rate": 9.85759588932908e-05,
+       "loss": 0.3318,
+       "step": 16
+     },
+     {
+       "epoch": 0.08753861997940268,
+       "grad_norm": 0.5736819505691528,
+       "learning_rate": 9.830698258477458e-05,
+       "loss": 0.3028,
+       "step": 17
+     },
+     {
+       "epoch": 0.09268795056642637,
+       "grad_norm": 0.5561637282371521,
+       "learning_rate": 9.801523446868399e-05,
+       "loss": 0.3008,
+       "step": 18
+     },
+     {
+       "epoch": 0.09783728115345006,
+       "grad_norm": 0.5371466279029846,
+       "learning_rate": 9.770086795288913e-05,
+       "loss": 0.2838,
+       "step": 19
+     },
+     {
+       "epoch": 0.10298661174047374,
+       "grad_norm": 0.5292208194732666,
+       "learning_rate": 9.736404833853502e-05,
+       "loss": 0.2504,
+       "step": 20
+     },
+     {
+       "epoch": 0.10813594232749743,
+       "grad_norm": 0.43221810460090637,
+       "learning_rate": 9.700495273312223e-05,
+       "loss": 0.2371,
+       "step": 21
+     },
+     {
+       "epoch": 0.11328527291452112,
+       "grad_norm": 0.4533340334892273,
+       "learning_rate": 9.662376995737989e-05,
+       "loss": 0.2223,
+       "step": 22
+     },
+     {
+       "epoch": 0.1184346035015448,
+       "grad_norm": 0.43220922350883484,
+       "learning_rate": 9.622070044597935e-05,
+       "loss": 0.2191,
+       "step": 23
+     },
+     {
+       "epoch": 0.12358393408856849,
+       "grad_norm": 0.46570438146591187,
+       "learning_rate": 9.579595614214087e-05,
+       "loss": 0.1742,
+       "step": 24
+     },
+     {
+       "epoch": 0.12873326467559218,
+       "grad_norm": 1.1133004426956177,
+       "learning_rate": 9.534976038618931e-05,
+       "loss": 0.1495,
+       "step": 25
+     },
+     {
+       "epoch": 0.12873326467559218,
+       "eval_loss": 0.25227421522140503,
+       "eval_runtime": 0.4861,
+       "eval_samples_per_second": 102.86,
+       "eval_steps_per_second": 6.172,
+       "step": 25
+     },
+     {
+       "epoch": 0.13388259526261587,
+       "grad_norm": 0.7691997289657593,
+       "learning_rate": 9.488234779811635e-05,
+       "loss": 0.2972,
+       "step": 26
+     },
+     {
+       "epoch": 0.13903192584963955,
+       "grad_norm": 1.2182092666625977,
+       "learning_rate": 9.439396415421204e-05,
+       "loss": 0.2965,
+       "step": 27
+     },
+     {
+       "epoch": 0.14418125643666324,
+       "grad_norm": 0.8559730648994446,
+       "learning_rate": 9.388486625782995e-05,
+       "loss": 0.2557,
+       "step": 28
+     },
+     {
+       "epoch": 0.14933058702368693,
+       "grad_norm": 0.47647181153297424,
+       "learning_rate": 9.335532180435412e-05,
+       "loss": 0.2372,
+       "step": 29
+     },
+     {
+       "epoch": 0.15447991761071062,
+       "grad_norm": 0.3898983895778656,
+       "learning_rate": 9.280560924043858e-05,
+       "loss": 0.2253,
+       "step": 30
+     },
+     {
+       "epoch": 0.1596292481977343,
+       "grad_norm": 0.42367926239967346,
+       "learning_rate": 9.223601761759367e-05,
+       "loss": 0.2083,
+       "step": 31
+     },
+     {
+       "epoch": 0.164778578784758,
+       "grad_norm": 0.4791017472743988,
+       "learning_rate": 9.164684644019624e-05,
+       "loss": 0.2071,
+       "step": 32
+     },
+     {
+       "epoch": 0.16992790937178168,
+       "grad_norm": 0.5360159277915955,
+       "learning_rate": 9.103840550800329e-05,
+       "loss": 0.2034,
+       "step": 33
+     },
+     {
+       "epoch": 0.17507723995880536,
+       "grad_norm": 0.45532089471817017,
+       "learning_rate": 9.041101475325209e-05,
+       "loss": 0.1807,
+       "step": 34
+     },
+     {
+       "epoch": 0.18022657054582905,
+       "grad_norm": 0.38769960403442383,
+       "learning_rate": 8.976500407243247e-05,
+       "loss": 0.1728,
+       "step": 35
+     },
+     {
+       "epoch": 0.18537590113285274,
+       "grad_norm": 0.3485439419746399,
+       "learning_rate": 8.910071315281975e-05,
+       "loss": 0.1418,
+       "step": 36
+     },
+     {
+       "epoch": 0.19052523171987643,
+       "grad_norm": 0.43333378434181213,
+       "learning_rate": 8.841849129385921e-05,
+       "loss": 0.1204,
+       "step": 37
+     },
+     {
+       "epoch": 0.1956745623069001,
+       "grad_norm": 0.5296416878700256,
+       "learning_rate": 8.771869722349651e-05,
+       "loss": 0.24,
+       "step": 38
+     },
+     {
+       "epoch": 0.2008238928939238,
+       "grad_norm": 0.3325974643230438,
+       "learning_rate": 8.700169890955027e-05,
+       "loss": 0.2236,
+       "step": 39
+     },
+     {
+       "epoch": 0.2059732234809475,
+       "grad_norm": 0.42638543248176575,
+       "learning_rate": 8.626787336622607e-05,
+       "loss": 0.2151,
+       "step": 40
+     },
+     {
+       "epoch": 0.21112255406797117,
+       "grad_norm": 0.4510488212108612,
+       "learning_rate": 8.55176064558738e-05,
+       "loss": 0.1921,
+       "step": 41
+     },
+     {
+       "epoch": 0.21627188465499486,
+       "grad_norm": 0.422617107629776,
+       "learning_rate": 8.475129268609227e-05,
+       "loss": 0.1846,
+       "step": 42
+     },
+     {
+       "epoch": 0.22142121524201855,
+       "grad_norm": 0.44664841890335083,
+       "learning_rate": 8.396933500228808e-05,
+       "loss": 0.1709,
+       "step": 43
+     },
+     {
+       "epoch": 0.22657054582904224,
+       "grad_norm": 0.4472566246986389,
+       "learning_rate": 8.317214457579773e-05,
+       "loss": 0.1765,
+       "step": 44
+     },
+     {
+       "epoch": 0.23171987641606592,
+       "grad_norm": 0.3076079487800598,
+       "learning_rate": 8.23601405876841e-05,
+       "loss": 0.1503,
+       "step": 45
+     },
+     {
+       "epoch": 0.2368692070030896,
+       "grad_norm": 0.3404317796230316,
+       "learning_rate": 8.153375000832157e-05,
+       "loss": 0.1423,
+       "step": 46
+     },
+     {
+       "epoch": 0.2420185375901133,
+       "grad_norm": 0.3234133720397949,
+       "learning_rate": 8.069340737288512e-05,
+       "loss": 0.1369,
+       "step": 47
+     },
+     {
+       "epoch": 0.24716786817713698,
+       "grad_norm": 0.38141271471977234,
+       "learning_rate": 7.98395545528617e-05,
+       "loss": 0.1283,
+       "step": 48
+     },
+     {
+       "epoch": 0.25231719876416064,
+       "grad_norm": 0.559747576713562,
+       "learning_rate": 7.897264052370409e-05,
+       "loss": 0.1243,
+       "step": 49
+     },
+     {
+       "epoch": 0.25746652935118436,
+       "grad_norm": 0.7120130062103271,
+       "learning_rate": 7.809312112874924e-05,
+       "loss": 0.0913,
+       "step": 50
+     },
+     {
+       "epoch": 0.25746652935118436,
+       "eval_loss": 0.15993443131446838,
+       "eval_runtime": 0.4874,
+       "eval_samples_per_second": 102.589,
+       "eval_steps_per_second": 6.155,
+       "step": 50
+     },
+     {
+       "epoch": 0.262615859938208,
+       "grad_norm": 0.6971345543861389,
+       "learning_rate": 7.720145883952544e-05,
+       "loss": 0.3077,
+       "step": 51
+     },
+     {
+       "epoch": 0.26776519052523173,
+       "grad_norm": 0.4453997313976288,
+       "learning_rate": 7.629812251257401e-05,
+       "loss": 0.1954,
+       "step": 52
+     },
+     {
+       "epoch": 0.2729145211122554,
+       "grad_norm": 0.3085014224052429,
+       "learning_rate": 7.53835871429139e-05,
+       "loss": 0.1709,
+       "step": 53
+     },
+     {
+       "epoch": 0.2780638516992791,
+       "grad_norm": 0.34796738624572754,
+       "learning_rate": 7.445833361427828e-05,
+       "loss": 0.1641,
+       "step": 54
+     },
+     {
+       "epoch": 0.28321318228630277,
+       "grad_norm": 0.4266407787799835,
+       "learning_rate": 7.352284844625481e-05,
+       "loss": 0.1585,
+       "step": 55
+     },
+     {
+       "epoch": 0.2883625128733265,
+       "grad_norm": 0.5182251930236816,
+       "learning_rate": 7.257762353846257e-05,
+       "loss": 0.1609,
+       "step": 56
+     },
+     {
+       "epoch": 0.29351184346035014,
+       "grad_norm": 0.4957202672958374,
+       "learning_rate": 7.162315591189978e-05,
+       "loss": 0.1558,
+       "step": 57
+     },
+     {
+       "epoch": 0.29866117404737386,
+       "grad_norm": 0.4565785527229309,
+       "learning_rate": 7.065994744759879e-05,
+       "loss": 0.1437,
+       "step": 58
+     },
+     {
+       "epoch": 0.3038105046343975,
+       "grad_norm": 0.36737626791000366,
+       "learning_rate": 6.96885046227255e-05,
+       "loss": 0.1326,
+       "step": 59
+     },
+     {
+       "epoch": 0.30895983522142123,
+       "grad_norm": 0.2986319363117218,
+       "learning_rate": 6.8709338244262e-05,
+       "loss": 0.1241,
+       "step": 60
+     },
+     {
+       "epoch": 0.3141091658084449,
+       "grad_norm": 0.31537097692489624,
+       "learning_rate": 6.772296318041253e-05,
+       "loss": 0.119,
+       "step": 61
+     },
+     {
+       "epoch": 0.3192584963954686,
+       "grad_norm": 0.3971193730831146,
+       "learning_rate": 6.672989808987385e-05,
+       "loss": 0.102,
+       "step": 62
+     },
+     {
+       "epoch": 0.32440782698249226,
+       "grad_norm": 0.4525837004184723,
+       "learning_rate": 6.573066514911273e-05,
+       "loss": 0.2455,
+       "step": 63
+     },
+     {
+       "epoch": 0.329557157569516,
+       "grad_norm": 0.31870609521865845,
+       "learning_rate": 6.472578977779339e-05,
+       "loss": 0.1692,
+       "step": 64
+     },
+     {
+       "epoch": 0.33470648815653964,
+       "grad_norm": 0.43375498056411743,
+       "learning_rate": 6.371580036249985e-05,
+       "loss": 0.1665,
+       "step": 65
+     },
+     {
+       "epoch": 0.33985581874356335,
+       "grad_norm": 0.5092880129814148,
+       "learning_rate": 6.270122797889806e-05,
+       "loss": 0.1618,
+       "step": 66
+     },
+     {
+       "epoch": 0.345005149330587,
+       "grad_norm": 0.415477454662323,
+       "learning_rate": 6.168260611248417e-05,
+       "loss": 0.1537,
+       "step": 67
+     },
+     {
+       "epoch": 0.35015447991761073,
+       "grad_norm": 0.37228500843048096,
+       "learning_rate": 6.066047037806549e-05,
+       "loss": 0.1468,
+       "step": 68
+     },
+     {
+       "epoch": 0.3553038105046344,
+       "grad_norm": 0.35446012020111084,
+       "learning_rate": 5.9635358238121954e-05,
+       "loss": 0.1484,
+       "step": 69
+     },
+     {
+       "epoch": 0.3604531410916581,
+       "grad_norm": 0.2916114032268524,
+       "learning_rate": 5.860780872019601e-05,
+       "loss": 0.1268,
+       "step": 70
+     },
+     {
+       "epoch": 0.36560247167868176,
+       "grad_norm": 0.3278571367263794,
+       "learning_rate": 5.7578362133459494e-05,
+       "loss": 0.1241,
+       "step": 71
+     },
+     {
+       "epoch": 0.3707518022657055,
+       "grad_norm": 0.30740585923194885,
+       "learning_rate": 5.6547559784606675e-05,
+       "loss": 0.1152,
+       "step": 72
+     },
+     {
+       "epoch": 0.37590113285272914,
+       "grad_norm": 0.27809590101242065,
+       "learning_rate": 5.551594369322271e-05,
+       "loss": 0.1017,
+       "step": 73
+     },
+     {
+       "epoch": 0.38105046343975285,
+       "grad_norm": 0.36339494585990906,
+       "learning_rate": 5.44840563067773e-05,
+       "loss": 0.0868,
+       "step": 74
+     },
+     {
+       "epoch": 0.3861997940267765,
+       "grad_norm": 0.3924078643321991,
+       "learning_rate": 5.3452440215393315e-05,
+       "loss": 0.0638,
+       "step": 75
+     },
+     {
+       "epoch": 0.3861997940267765,
+       "eval_loss": 0.13260947167873383,
+       "eval_runtime": 0.4867,
+       "eval_samples_per_second": 102.736,
+       "eval_steps_per_second": 6.164,
+       "step": 75
+     },
+     {
+       "epoch": 0.3913491246138002,
+       "grad_norm": 0.6179808378219604,
+       "learning_rate": 5.242163786654051e-05,
+       "loss": 0.2107,
+       "step": 76
+     },
+     {
+       "epoch": 0.3964984552008239,
+       "grad_norm": 0.43608739972114563,
+       "learning_rate": 5.139219127980399e-05,
+       "loss": 0.1587,
+       "step": 77
+     },
+     {
+       "epoch": 0.4016477857878476,
+       "grad_norm": 0.3661559224128723,
+       "learning_rate": 5.036464176187806e-05,
+       "loss": 0.1553,
+       "step": 78
+     },
+     {
+       "epoch": 0.40679711637487126,
+       "grad_norm": 0.41823190450668335,
+       "learning_rate": 4.933952962193452e-05,
+       "loss": 0.1505,
+       "step": 79
+     },
+     {
+       "epoch": 0.411946446961895,
+       "grad_norm": 0.41759902238845825,
+       "learning_rate": 4.831739388751584e-05,
+       "loss": 0.1402,
+       "step": 80
+     },
+     {
+       "epoch": 0.41709577754891863,
+       "grad_norm": 0.37053796648979187,
+       "learning_rate": 4.729877202110195e-05,
+       "loss": 0.1305,
+       "step": 81
+     },
+     {
+       "epoch": 0.42224510813594235,
+       "grad_norm": 0.3314639627933502,
+       "learning_rate": 4.628419963750016e-05,
+       "loss": 0.1211,
+       "step": 82
+     },
+     {
+       "epoch": 0.427394438722966,
+       "grad_norm": 0.2696942389011383,
+       "learning_rate": 4.527421022220663e-05,
+       "loss": 0.1101,
+       "step": 83
+     },
+     {
+       "epoch": 0.4325437693099897,
+       "grad_norm": 0.32313790917396545,
+       "learning_rate": 4.426933485088729e-05,
+       "loss": 0.1098,
+       "step": 84
+     },
+     {
+       "epoch": 0.4376930998970134,
+       "grad_norm": 0.289420485496521,
+       "learning_rate": 4.327010191012617e-05,
+       "loss": 0.1033,
+       "step": 85
+     },
+     {
+       "epoch": 0.4428424304840371,
+       "grad_norm": 0.25251471996307373,
+       "learning_rate": 4.227703681958749e-05,
+       "loss": 0.0839,
+       "step": 86
+     },
+     {
+       "epoch": 0.44799176107106076,
+       "grad_norm": 0.2926040291786194,
+       "learning_rate": 4.1290661755738e-05,
+       "loss": 0.0633,
+       "step": 87
+     },
+     {
+       "epoch": 0.45314109165808447,
+       "grad_norm": 0.42090505361557007,
+       "learning_rate": 4.03114953772745e-05,
+       "loss": 0.1778,
+       "step": 88
+     },
+     {
+       "epoch": 0.45829042224510813,
+       "grad_norm": 0.3921765983104706,
+       "learning_rate": 3.934005255240122e-05,
+       "loss": 0.1554,
+       "step": 89
+     },
+     {
+       "epoch": 0.46343975283213185,
+       "grad_norm": 0.22954587638378143,
+       "learning_rate": 3.837684408810023e-05,
+       "loss": 0.1447,
+       "step": 90
+     },
+     {
+       "epoch": 0.4685890834191555,
+       "grad_norm": 0.32881876826286316,
+       "learning_rate": 3.7422376461537435e-05,
+       "loss": 0.132,
+       "step": 91
+     },
+     {
+       "epoch": 0.4737384140061792,
+       "grad_norm": 0.3419657051563263,
+       "learning_rate": 3.647715155374519e-05,
+       "loss": 0.1329,
+       "step": 92
+     },
+     {
+       "epoch": 0.4788877445932029,
+       "grad_norm": 0.30170801281929016,
+       "learning_rate": 3.554166638572175e-05,
+       "loss": 0.1174,
+       "step": 93
+     },
+     {
+       "epoch": 0.4840370751802266,
+       "grad_norm": 0.3186221718788147,
+       "learning_rate": 3.461641285708611e-05,
+       "loss": 0.1174,
+       "step": 94
+     },
+     {
+       "epoch": 0.48918640576725025,
+       "grad_norm": 0.2956952452659607,
+       "learning_rate": 3.370187748742601e-05,
+       "loss": 0.1085,
+       "step": 95
+     },
+     {
+       "epoch": 0.49433573635427397,
+       "grad_norm": 0.25691288709640503,
+       "learning_rate": 3.279854116047457e-05,
+       "loss": 0.096,
+       "step": 96
+     },
+     {
+       "epoch": 0.49948506694129763,
+       "grad_norm": 0.25471076369285583,
+       "learning_rate": 3.190687887125077e-05,
+       "loss": 0.0961,
+       "step": 97
+     },
+     {
+       "epoch": 0.5046343975283213,
+       "grad_norm": 0.24956409633159637,
+       "learning_rate": 3.102735947629594e-05,
+       "loss": 0.0912,
+       "step": 98
+     },
+     {
+       "epoch": 0.509783728115345,
+       "grad_norm": 0.2942916452884674,
+       "learning_rate": 3.0160445447138308e-05,
+       "loss": 0.0773,
+       "step": 99
+     },
+     {
+       "epoch": 0.5149330587023687,
+       "grad_norm": 0.42962557077407837,
+       "learning_rate": 2.9306592627114883e-05,
+       "loss": 0.0568,
+       "step": 100
+     },
+     {
+       "epoch": 0.5149330587023687,
+       "eval_loss": 0.11873143911361694,
+       "eval_runtime": 0.4851,
+       "eval_samples_per_second": 103.066,
+       "eval_steps_per_second": 6.184,
+       "step": 100
+     },
+     {
+       "epoch": 0.5200823892893924,
+       "grad_norm": 0.32270699739456177,
+       "learning_rate": 2.846624999167843e-05,
+       "loss": 0.1637,
+       "step": 101
+     },
+     {
+       "epoch": 0.525231719876416,
+       "grad_norm": 0.3115319609642029,
+       "learning_rate": 2.7639859412315917e-05,
+       "loss": 0.1418,
+       "step": 102
+     },
+     {
+       "epoch": 0.5303810504634398,
+       "grad_norm": 0.26950860023498535,
+       "learning_rate": 2.682785542420229e-05,
+       "loss": 0.1285,
+       "step": 103
+     },
+     {
+       "epoch": 0.5355303810504635,
+       "grad_norm": 0.24307739734649658,
+       "learning_rate": 2.603066499771192e-05,
+       "loss": 0.1333,
+       "step": 104
+     },
+     {
+       "epoch": 0.5406797116374872,
+       "grad_norm": 0.2244795262813568,
+       "learning_rate": 2.5248707313907747e-05,
+       "loss": 0.1211,
+       "step": 105
+     },
+     {
+       "epoch": 0.5458290422245108,
+       "grad_norm": 0.2730483114719391,
+       "learning_rate": 2.4482393544126215e-05,
+       "loss": 0.1246,
+       "step": 106
+     },
+     {
+       "epoch": 0.5509783728115345,
+       "grad_norm": 0.24573828279972076,
+       "learning_rate": 2.3732126633773928e-05,
+       "loss": 0.1104,
+       "step": 107
+     },
+     {
+       "epoch": 0.5561277033985582,
+       "grad_norm": 0.24852481484413147,
+       "learning_rate": 2.2998301090449738e-05,
+       "loss": 0.0935,
+       "step": 108
+     },
+     {
+       "epoch": 0.5612770339855818,
+       "grad_norm": 0.2701282799243927,
+       "learning_rate": 2.2281302776503497e-05,
+       "loss": 0.1032,
+       "step": 109
+     },
+     {
+       "epoch": 0.5664263645726055,
+       "grad_norm": 0.33753320574760437,
+       "learning_rate": 2.1581508706140802e-05,
+       "loss": 0.1071,
+       "step": 110
+     },
+     {
+       "epoch": 0.5715756951596292,
+       "grad_norm": 0.36751407384872437,
+       "learning_rate": 2.0899286847180243e-05,
+       "loss": 0.0919,
+       "step": 111
+     },
+     {
+       "epoch": 0.576725025746653,
+       "grad_norm": 0.3343943953514099,
+       "learning_rate": 2.0234995927567523e-05,
+       "loss": 0.0607,
+       "step": 112
+     },
+     {
+       "epoch": 0.5818743563336766,
+       "grad_norm": 0.30756524205207825,
+       "learning_rate": 1.9588985246747925e-05,
+       "loss": 0.173,
+       "step": 113
+     },
+     {
+       "epoch": 0.5870236869207003,
+       "grad_norm": 0.251208633184433,
+       "learning_rate": 1.896159449199672e-05,
+       "loss": 0.1496,
+       "step": 114
+     },
+     {
+       "epoch": 0.592173017507724,
+       "grad_norm": 0.22547647356987,
+       "learning_rate": 1.835315355980376e-05,
+       "loss": 0.1253,
+       "step": 115
+     },
+     {
+       "epoch": 0.5973223480947477,
+       "grad_norm": 0.2488126903772354,
+       "learning_rate": 1.7763982382406352e-05,
+       "loss": 0.1204,
+       "step": 116
+     },
+     {
+       "epoch": 0.6024716786817713,
+       "grad_norm": 0.2688787579536438,
+       "learning_rate": 1.7194390759561453e-05,
+       "loss": 0.1151,
+       "step": 117
+     },
+     {
+       "epoch": 0.607621009268795,
+       "grad_norm": 0.24907900393009186,
+       "learning_rate": 1.664467819564588e-05,
+       "loss": 0.1098,
+       "step": 118
+     },
+     {
+       "epoch": 0.6127703398558187,
+       "grad_norm": 0.25353243947029114,
+       "learning_rate": 1.6115133742170053e-05,
+       "loss": 0.1112,
+       "step": 119
+     },
+     {
+       "epoch": 0.6179196704428425,
+       "grad_norm": 0.21995492279529572,
+       "learning_rate": 1.5606035845787987e-05,
+       "loss": 0.0976,
+       "step": 120
+     },
+     {
+       "epoch": 0.6230690010298661,
+       "grad_norm": 0.22032539546489716,
+       "learning_rate": 1.511765220188367e-05,
+       "loss": 0.087,
+       "step": 121
+     },
+     {
+       "epoch": 0.6282183316168898,
+       "grad_norm": 0.2590661942958832,
+       "learning_rate": 1.4650239613810693e-05,
+       "loss": 0.088,
+       "step": 122
+     },
+     {
+       "epoch": 0.6333676622039135,
+       "grad_norm": 0.2637864351272583,
+       "learning_rate": 1.4204043857859129e-05,
+       "loss": 0.0884,
+       "step": 123
+     },
+     {
+       "epoch": 0.6385169927909372,
+       "grad_norm": 0.2572005093097687,
+       "learning_rate": 1.3779299554020672e-05,
+       "loss": 0.0735,
+       "step": 124
+     },
+     {
+       "epoch": 0.6436663233779608,
+       "grad_norm": 0.26129183173179626,
+       "learning_rate": 1.3376230042620109e-05,
+       "loss": 0.0471,
+       "step": 125
+     },
+     {
+       "epoch": 0.6436663233779608,
+       "eval_loss": 0.10684148967266083,
+       "eval_runtime": 0.4867,
+       "eval_samples_per_second": 102.737,
+       "eval_steps_per_second": 6.164,
+       "step": 125
+     },
+     {
+       "epoch": 0.6488156539649845,
+       "grad_norm": 0.26877304911613464,
+       "learning_rate": 1.2995047266877775e-05,
+       "loss": 0.1827,
+       "step": 126
+     },
+     {
+       "epoch": 0.6539649845520082,
+       "grad_norm": 0.2039293497800827,
+       "learning_rate": 1.2635951661464995e-05,
+       "loss": 0.1372,
+       "step": 127
+     },
+     {
+       "epoch": 0.659114315139032,
+       "grad_norm": 0.20893456041812897,
+       "learning_rate": 1.2299132047110876e-05,
+       "loss": 0.1187,
+       "step": 128
+     },
+     {
+       "epoch": 0.6642636457260556,
+       "grad_norm": 0.2243366241455078,
+       "learning_rate": 1.1984765531316038e-05,
+       "loss": 0.118,
+       "step": 129
+     },
+     {
+       "epoch": 0.6694129763130793,
+       "grad_norm": 0.2364204078912735,
+       "learning_rate": 1.1693017415225432e-05,
+       "loss": 0.1196,
+       "step": 130
+     },
+     {
+       "epoch": 0.674562306900103,
+       "grad_norm": 0.2659465968608856,
+       "learning_rate": 1.1424041106709194e-05,
+       "loss": 0.1104,
+       "step": 131
+     },
+     {
+       "epoch": 0.6797116374871267,
+       "grad_norm": 0.22487813234329224,
+       "learning_rate": 1.1177978039697217e-05,
+       "loss": 0.1002,
+       "step": 132
+     },
+     {
+       "epoch": 0.6848609680741503,
+       "grad_norm": 0.22705155611038208,
+       "learning_rate": 1.0954957599810003e-05,
+       "loss": 0.1075,
+       "step": 133
+     },
+     {
+       "epoch": 0.690010298661174,
+       "grad_norm": 0.2651651203632355,
+       "learning_rate": 1.0755097056324672e-05,
+       "loss": 0.1095,
+       "step": 134
+     },
+     {
+       "epoch": 0.6951596292481977,
+       "grad_norm": 0.25540125370025635,
+       "learning_rate": 1.0578501500512109e-05,
+       "loss": 0.0937,
+       "step": 135
+     },
+     {
+       "epoch": 0.7003089598352215,
+       "grad_norm": 0.2261316478252411,
+       "learning_rate": 1.042526379037754e-05,
+       "loss": 0.0741,
+       "step": 136
+     },
+     {
+       "epoch": 0.7054582904222451,
+       "grad_norm": 0.21159891784191132,
+       "learning_rate": 1.0295464501833682e-05,
+       "loss": 0.0481,
+       "step": 137
+     },
+     {
+       "epoch": 0.7106076210092688,
+       "grad_norm": 0.26530131697654724,
+       "learning_rate": 1.0189171886332038e-05,
+       "loss": 0.1346,
+       "step": 138
+     },
+     {
+       "epoch": 0.7157569515962925,
+       "grad_norm": 0.3140278458595276,
+       "learning_rate": 1.0106441834974748e-05,
+       "loss": 0.1296,
+       "step": 139
+     },
+     {
+       "epoch": 0.7209062821833162,
+       "grad_norm": 0.24784407019615173,
+       "learning_rate": 1.0047317849125743e-05,
+       "loss": 0.1294,
+       "step": 140
+     },
+     {
+       "epoch": 0.7260556127703398,
+       "grad_norm": 0.19761891663074493,
+       "learning_rate": 1.0011831017536722e-05,
+       "loss": 0.1212,
+       "step": 141
+     },
+     {
+       "epoch": 0.7312049433573635,
+       "grad_norm": 0.2117881029844284,
+       "learning_rate": 1e-05,
+       "loss": 0.1272,
+       "step": 142
+     }
+   ],
+   "logging_steps": 1,
+   "max_steps": 142,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 50,
+   "stateful_callbacks": {
+     "EarlyStoppingCallback": {
+       "args": {
+         "early_stopping_patience": 1,
+         "early_stopping_threshold": 0.0
+       },
+       "attributes": {
+         "early_stopping_patience_counter": 0
+       }
+     },
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 2.22678433233109e+17,
+   "train_batch_size": 6,
+   "trial_name": null,
+   "trial_params": null
+ }

last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1376ca051303a2cb84d62afc96a2a367e5ee17bca984d21714c1bf522e427658
+ size 6840

special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }

tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723

tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<unk>",
+   "padding_side": "left",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }

training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1376ca051303a2cb84d62afc96a2a367e5ee17bca984d21714c1bf522e427658
+ size 6840