alexredna commited on
Commit
be86fd3
1 Parent(s): b2d35fb

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.2478
24
 
25
  ## Model description
26
 
@@ -39,13 +39,13 @@ More information needed
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
- - learning_rate: 2e-05
43
- - train_batch_size: 2
44
- - eval_batch_size: 2
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
- - gradient_accumulation_steps: 25
48
- - total_train_batch_size: 50
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 1
@@ -54,8 +54,8 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.3274 | 0.49 | 20 | 1.2587 |
58
- | 1.3066 | 0.99 | 40 | 1.2478 |
59
 
60
 
61
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.0244
24
 
25
  ## Model description
26
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
+ - learning_rate: 4e-05
43
+ - train_batch_size: 3
44
+ - eval_batch_size: 3
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - gradient_accumulation_steps: 40
48
+ - total_train_batch_size: 120
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 1
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.9699 | 0.38 | 80 | 1.0432 |
58
+ | 0.9576 | 0.77 | 160 | 1.0250 |
59
 
60
 
61
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8eaa4c9a40159f160b328d27da9d86690717d2d35b3e1f6d30319e24afd9f86
3
  size 210609288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:407c48e68f34756106e83912eb76013711e3a8468b6ae862cb64b761489b70e9
3
  size 210609288
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.99,
3
- "eval_loss": 1.2477926015853882,
4
- "eval_runtime": 2.1689,
5
- "eval_samples": 91,
6
- "eval_samples_per_second": 4.611,
7
- "eval_steps_per_second": 2.305,
8
- "train_loss": 1.3475643575191498,
9
- "train_runtime": 1594.7957,
10
- "train_samples": 15296,
11
- "train_samples_per_second": 1.268,
12
- "train_steps_per_second": 0.025
13
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_loss": 1.0243600606918335,
4
+ "eval_runtime": 26.1874,
5
+ "eval_samples": 783,
6
+ "eval_samples_per_second": 4.773,
7
+ "eval_steps_per_second": 1.604,
8
+ "train_loss": 0.5025384334856243,
9
+ "train_runtime": 8965.5847,
10
+ "train_samples": 115131,
11
+ "train_samples_per_second": 2.798,
12
+ "train_steps_per_second": 0.023
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.99,
3
- "eval_loss": 1.2477926015853882,
4
- "eval_runtime": 2.1689,
5
- "eval_samples": 91,
6
- "eval_samples_per_second": 4.611,
7
- "eval_steps_per_second": 2.305
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_loss": 1.0243600606918335,
4
+ "eval_runtime": 26.1874,
5
+ "eval_samples": 783,
6
+ "eval_samples_per_second": 4.773,
7
+ "eval_steps_per_second": 1.604
8
  }
runs/Jan12_21-29-42_98f107f1aa39/events.out.tfevents.1705095280.98f107f1aa39.697537.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c50b15d601b11a72f8fde357b8245cdee96b6d388fe16f5acd13bf343710759
3
- size 6409
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872a0925e3d193138cf11a01bc340141614694654f8faf810786ffd587802069
3
+ size 8761
runs/Jan12_21-29-42_98f107f1aa39/events.out.tfevents.1705104272.98f107f1aa39.697537.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b866ad253b68036412319354ea040cd28cadcc63287db4b77957a58f3d03ab8
3
+ size 359
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.99,
3
- "train_loss": 1.3475643575191498,
4
- "train_runtime": 1594.7957,
5
- "train_samples": 15296,
6
- "train_samples_per_second": 1.268,
7
- "train_steps_per_second": 0.025
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.5025384334856243,
4
+ "train_runtime": 8965.5847,
5
+ "train_samples": 115131,
6
+ "train_samples_per_second": 2.798,
7
+ "train_steps_per_second": 0.023
8
  }
trainer_state.json CHANGED
@@ -1,100 +1,298 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9891196834817013,
5
- "eval_steps": 20,
6
- "global_step": 40,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
- "learning_rate": 1.9969173337331283e-05,
14
- "loss": 1.6723,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.12,
19
- "learning_rate": 1.9238795325112867e-05,
20
- "loss": 1.4829,
21
  "step": 5
22
  },
23
  {
24
- "epoch": 0.25,
25
- "learning_rate": 1.7071067811865477e-05,
26
- "loss": 1.3734,
27
  "step": 10
28
  },
29
  {
30
- "epoch": 0.37,
31
- "learning_rate": 1.3826834323650899e-05,
32
- "loss": 1.3486,
33
  "step": 15
34
  },
35
  {
36
- "epoch": 0.49,
37
- "learning_rate": 1e-05,
38
- "loss": 1.3274,
39
- "step": 20
40
- },
41
- {
42
- "epoch": 0.49,
43
- "eval_loss": 1.258691668510437,
44
- "eval_runtime": 2.1716,
45
- "eval_samples_per_second": 4.605,
46
- "eval_steps_per_second": 2.302,
47
  "step": 20
48
  },
49
  {
50
- "epoch": 0.62,
51
- "learning_rate": 6.173165676349103e-06,
52
- "loss": 1.2978,
53
  "step": 25
54
  },
55
  {
56
- "epoch": 0.74,
57
- "learning_rate": 2.9289321881345257e-06,
58
- "loss": 1.3259,
59
  "step": 30
60
  },
61
  {
62
- "epoch": 0.87,
63
- "learning_rate": 7.612046748871327e-07,
64
- "loss": 1.2801,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.99,
69
- "learning_rate": 0.0,
70
- "loss": 1.3066,
71
  "step": 40
72
  },
73
  {
74
- "epoch": 0.99,
75
- "eval_loss": 1.2477926015853882,
76
- "eval_runtime": 2.1661,
77
- "eval_samples_per_second": 4.617,
78
- "eval_steps_per_second": 2.308,
79
- "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  },
81
  {
82
- "epoch": 0.99,
83
- "step": 40,
84
- "total_flos": 2.6717900760940544e+16,
85
- "train_loss": 1.3475643575191498,
86
- "train_runtime": 1594.7957,
87
- "train_samples_per_second": 1.268,
88
- "train_steps_per_second": 0.025
89
  }
90
  ],
91
  "logging_steps": 5,
92
- "max_steps": 40,
93
  "num_input_tokens_seen": 0,
94
  "num_train_epochs": 1,
95
  "save_steps": 50,
96
- "total_flos": 2.6717900760940544e+16,
97
- "train_batch_size": 2,
98
  "trial_name": null,
99
  "trial_params": null
100
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9996412770536889,
5
+ "eval_steps": 80,
6
+ "global_step": 209,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "learning_rate": 3.9997740569453936e-05,
14
+ "loss": 1.3377,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.02,
19
+ "learning_rate": 3.9943539757443494e-05,
20
+ "loss": 1.1879,
21
  "step": 5
22
  },
23
  {
24
+ "epoch": 0.05,
25
+ "learning_rate": 3.97744778056729e-05,
26
+ "loss": 1.1238,
27
  "step": 10
28
  },
29
  {
30
+ "epoch": 0.07,
31
+ "learning_rate": 3.949376867256863e-05,
32
+ "loss": 1.0839,
33
  "step": 15
34
  },
35
  {
36
+ "epoch": 0.1,
37
+ "learning_rate": 3.9102997248704994e-05,
38
+ "loss": 1.0787,
 
 
 
 
 
 
 
 
39
  "step": 20
40
  },
41
  {
42
+ "epoch": 0.12,
43
+ "learning_rate": 3.8604369839019515e-05,
44
+ "loss": 1.0363,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 0.14,
49
+ "learning_rate": 3.800070170596182e-05,
50
+ "loss": 1.0079,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.17,
55
+ "learning_rate": 3.729540117445352e-05,
56
+ "loss": 1.0173,
57
  "step": 35
58
  },
59
  {
60
+ "epoch": 0.19,
61
+ "learning_rate": 3.6492450388403034e-05,
62
+ "loss": 1.0162,
63
  "step": 40
64
  },
65
  {
66
+ "epoch": 0.22,
67
+ "learning_rate": 3.559638282742449e-05,
68
+ "loss": 1.0247,
69
+ "step": 45
70
+ },
71
+ {
72
+ "epoch": 0.24,
73
+ "learning_rate": 3.461225771070188e-05,
74
+ "loss": 1.0066,
75
+ "step": 50
76
+ },
77
+ {
78
+ "epoch": 0.26,
79
+ "learning_rate": 3.354563143251483e-05,
80
+ "loss": 0.9969,
81
+ "step": 55
82
+ },
83
+ {
84
+ "epoch": 0.29,
85
+ "learning_rate": 3.2402526190701667e-05,
86
+ "loss": 1.0205,
87
+ "step": 60
88
+ },
89
+ {
90
+ "epoch": 0.31,
91
+ "learning_rate": 3.1189395985184464e-05,
92
+ "loss": 0.9882,
93
+ "step": 65
94
+ },
95
+ {
96
+ "epoch": 0.33,
97
+ "learning_rate": 2.9913090178528815e-05,
98
+ "loss": 0.9749,
99
+ "step": 70
100
+ },
101
+ {
102
+ "epoch": 0.36,
103
+ "learning_rate": 2.858081482427673e-05,
104
+ "loss": 0.9849,
105
+ "step": 75
106
+ },
107
+ {
108
+ "epoch": 0.38,
109
+ "learning_rate": 2.7200091981393524e-05,
110
+ "loss": 0.9699,
111
+ "step": 80
112
+ },
113
+ {
114
+ "epoch": 0.38,
115
+ "eval_loss": 1.043211817741394,
116
+ "eval_runtime": 26.2196,
117
+ "eval_samples_per_second": 4.767,
118
+ "eval_steps_per_second": 1.602,
119
+ "step": 80
120
+ },
121
+ {
122
+ "epoch": 0.41,
123
+ "learning_rate": 2.577871724454045e-05,
124
+ "loss": 0.9874,
125
+ "step": 85
126
+ },
127
+ {
128
+ "epoch": 0.43,
129
+ "learning_rate": 2.4324715729958146e-05,
130
+ "loss": 0.9723,
131
+ "step": 90
132
+ },
133
+ {
134
+ "epoch": 0.45,
135
+ "learning_rate": 2.2846296765465708e-05,
136
+ "loss": 0.9884,
137
+ "step": 95
138
+ },
139
+ {
140
+ "epoch": 0.48,
141
+ "learning_rate": 2.1351807540396666e-05,
142
+ "loss": 0.9596,
143
+ "step": 100
144
+ },
145
+ {
146
+ "epoch": 0.5,
147
+ "learning_rate": 1.9849685977165566e-05,
148
+ "loss": 0.9784,
149
+ "step": 105
150
+ },
151
+ {
152
+ "epoch": 0.53,
153
+ "learning_rate": 1.8348413090553356e-05,
154
+ "loss": 0.9715,
155
+ "step": 110
156
+ },
157
+ {
158
+ "epoch": 0.55,
159
+ "learning_rate": 1.6856465103692203e-05,
160
+ "loss": 0.9627,
161
+ "step": 115
162
+ },
163
+ {
164
+ "epoch": 0.57,
165
+ "learning_rate": 1.5382265591104088e-05,
166
+ "loss": 0.96,
167
+ "step": 120
168
+ },
169
+ {
170
+ "epoch": 0.6,
171
+ "learning_rate": 1.3934137918994753e-05,
172
+ "loss": 0.9743,
173
+ "step": 125
174
+ },
175
+ {
176
+ "epoch": 0.62,
177
+ "learning_rate": 1.2520258251326212e-05,
178
+ "loss": 0.9661,
179
+ "step": 130
180
+ },
181
+ {
182
+ "epoch": 0.65,
183
+ "learning_rate": 1.1148609386996692e-05,
184
+ "loss": 0.9676,
185
+ "step": 135
186
+ },
187
+ {
188
+ "epoch": 0.67,
189
+ "learning_rate": 9.826935688764434e-06,
190
+ "loss": 0.9842,
191
+ "step": 140
192
+ },
193
+ {
194
+ "epoch": 0.69,
195
+ "learning_rate": 8.562699358387723e-06,
196
+ "loss": 0.9628,
197
+ "step": 145
198
+ },
199
+ {
200
+ "epoch": 0.72,
201
+ "learning_rate": 7.3630383048527255e-06,
202
+ "loss": 0.9603,
203
+ "step": 150
204
+ },
205
+ {
206
+ "epoch": 0.74,
207
+ "learning_rate": 6.234725843566269e-06,
208
+ "loss": 0.9626,
209
+ "step": 155
210
+ },
211
+ {
212
+ "epoch": 0.77,
213
+ "learning_rate": 5.184132454052731e-06,
214
+ "loss": 0.9576,
215
+ "step": 160
216
+ },
217
+ {
218
+ "epoch": 0.77,
219
+ "eval_loss": 1.0250179767608643,
220
+ "eval_runtime": 26.1866,
221
+ "eval_samples_per_second": 4.773,
222
+ "eval_steps_per_second": 1.604,
223
+ "step": 160
224
+ },
225
+ {
226
+ "epoch": 0.79,
227
+ "learning_rate": 4.217189812072131e-06,
228
+ "loss": 0.9659,
229
+ "step": 165
230
+ },
231
+ {
232
+ "epoch": 0.81,
233
+ "learning_rate": 3.3393572992349156e-06,
234
+ "loss": 0.9655,
235
+ "step": 170
236
+ },
237
+ {
238
+ "epoch": 0.84,
239
+ "learning_rate": 2.5555911792009624e-06,
240
+ "loss": 0.9501,
241
+ "step": 175
242
+ },
243
+ {
244
+ "epoch": 0.86,
245
+ "learning_rate": 1.8703166144947427e-06,
246
+ "loss": 0.9754,
247
+ "step": 180
248
+ },
249
+ {
250
+ "epoch": 0.88,
251
+ "learning_rate": 1.2874026819303698e-06,
252
+ "loss": 0.9497,
253
+ "step": 185
254
+ },
255
+ {
256
+ "epoch": 0.91,
257
+ "learning_rate": 8.101405277100549e-07,
258
+ "loss": 0.9477,
259
+ "step": 190
260
+ },
261
+ {
262
+ "epoch": 0.93,
263
+ "learning_rate": 4.412247855328322e-07,
264
+ "loss": 0.9624,
265
+ "step": 195
266
+ },
267
+ {
268
+ "epoch": 0.96,
269
+ "learning_rate": 1.8273836262732824e-07,
270
+ "loss": 0.959,
271
+ "step": 200
272
+ },
273
+ {
274
+ "epoch": 0.98,
275
+ "learning_rate": 3.614067960701961e-08,
276
+ "loss": 0.9523,
277
+ "step": 205
278
  },
279
  {
280
+ "epoch": 1.0,
281
+ "step": 209,
282
+ "total_flos": 3.512657938470666e+17,
283
+ "train_loss": 0.5025384334856243,
284
+ "train_runtime": 8965.5847,
285
+ "train_samples_per_second": 2.798,
286
+ "train_steps_per_second": 0.023
287
  }
288
  ],
289
  "logging_steps": 5,
290
+ "max_steps": 209,
291
  "num_input_tokens_seen": 0,
292
  "num_train_epochs": 1,
293
  "save_steps": 50,
294
+ "total_flos": 3.512657938470666e+17,
295
+ "train_batch_size": 3,
296
  "trial_name": null,
297
  "trial_params": null
298
  }