alexredna committed on
Commit
b530618
1 Parent(s): 87dc7dc

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.0244
24
 
25
  ## Model description
26
 
@@ -40,12 +40,12 @@ More information needed
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 4e-05
43
- - train_batch_size: 3
44
- - eval_batch_size: 3
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
- - gradient_accumulation_steps: 40
48
- - total_train_batch_size: 120
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 1
@@ -54,8 +54,9 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 0.9699 | 0.38 | 80 | 1.0432 |
58
- | 0.9576 | 0.77 | 160 | 1.0250 |
 
59
 
60
 
61
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.0546
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 4e-05
43
+ - train_batch_size: 6
44
+ - eval_batch_size: 4
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - gradient_accumulation_steps: 36
48
+ - total_train_batch_size: 216
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 1
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 1.1912 | 0.28 | 10 | 1.1099 |
58
+ | 1.1238 | 0.55 | 20 | 1.0655 |
59
+ | 1.1258 | 0.83 | 30 | 1.0550 |
60
 
61
 
62
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57de9f65961ee02019b9e09abe2f8108d672f1d339c0c27346ccf40f27ed3e4e
3
  size 26362152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36f835eb9195a1de5a20430109f1d198bee36ccd393dc5dbe881b49aea860788
3
  size 26362152
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.0243600606918335,
4
- "eval_runtime": 26.1874,
5
- "eval_samples": 783,
6
- "eval_samples_per_second": 4.773,
7
- "eval_steps_per_second": 1.604,
8
- "train_loss": 0.5025384334856243,
9
- "train_runtime": 8965.5847,
10
- "train_samples": 115131,
11
- "train_samples_per_second": 2.798,
12
- "train_steps_per_second": 0.023
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 1.0545984506607056,
4
+ "eval_runtime": 9.2861,
5
+ "eval_samples": 130,
6
+ "eval_samples_per_second": 5.061,
7
+ "eval_steps_per_second": 1.292,
8
+ "train_loss": 1.1544433269235823,
9
+ "train_runtime": 5078.5964,
10
+ "train_samples": 29726,
11
+ "train_samples_per_second": 1.537,
12
+ "train_steps_per_second": 0.007
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.0243600606918335,
4
- "eval_runtime": 26.1874,
5
- "eval_samples": 783,
6
- "eval_samples_per_second": 4.773,
7
- "eval_steps_per_second": 1.604
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 1.0545984506607056,
4
+ "eval_runtime": 9.2861,
5
+ "eval_samples": 130,
6
+ "eval_samples_per_second": 5.061,
7
+ "eval_steps_per_second": 1.292
8
  }
runs/Jan13_19-31-59_98f107f1aa39/events.out.tfevents.1705174497.98f107f1aa39.154347.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd69b424d6c6d97ea528767f2e0f4baedd3db13474839bd4f0a724058db48e10
3
- size 6159
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b23ffb1193103c8874ec5d6b80f396589fdaf1ea2fab93a29bb79b481a657f
3
+ size 7235
runs/Jan13_19-31-59_98f107f1aa39/events.out.tfevents.1705179585.98f107f1aa39.154347.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:771da7e33030b7c506cff2ebedb68fd5813ed7af9fee67bbca483a22096bc161
3
+ size 354
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.5025384334856243,
4
- "train_runtime": 8965.5847,
5
- "train_samples": 115131,
6
- "train_samples_per_second": 2.798,
7
- "train_steps_per_second": 0.023
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 1.1544433269235823,
4
+ "train_runtime": 5078.5964,
5
+ "train_samples": 29726,
6
+ "train_samples_per_second": 1.537,
7
+ "train_steps_per_second": 0.007
8
  }
trainer_state.json CHANGED
@@ -1,298 +1,102 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9996412770536889,
5
- "eval_steps": 80,
6
- "global_step": 209,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0,
13
- "learning_rate": 3.9997740569453936e-05,
14
- "loss": 1.3377,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.02,
19
- "learning_rate": 3.9943539757443494e-05,
20
- "loss": 1.1879,
21
  "step": 5
22
  },
23
  {
24
- "epoch": 0.05,
25
- "learning_rate": 3.97744778056729e-05,
26
- "loss": 1.1238,
27
  "step": 10
28
  },
29
  {
30
- "epoch": 0.07,
31
- "learning_rate": 3.949376867256863e-05,
32
- "loss": 1.0839,
33
- "step": 15
34
- },
35
- {
36
- "epoch": 0.1,
37
- "learning_rate": 3.9102997248704994e-05,
38
- "loss": 1.0787,
39
- "step": 20
40
- },
41
- {
42
- "epoch": 0.12,
43
- "learning_rate": 3.8604369839019515e-05,
44
- "loss": 1.0363,
45
- "step": 25
46
- },
47
- {
48
- "epoch": 0.14,
49
- "learning_rate": 3.800070170596182e-05,
50
- "loss": 1.0079,
51
- "step": 30
52
- },
53
- {
54
- "epoch": 0.17,
55
- "learning_rate": 3.729540117445352e-05,
56
- "loss": 1.0173,
57
- "step": 35
58
- },
59
- {
60
- "epoch": 0.19,
61
- "learning_rate": 3.6492450388403034e-05,
62
- "loss": 1.0162,
63
- "step": 40
64
- },
65
- {
66
- "epoch": 0.22,
67
- "learning_rate": 3.559638282742449e-05,
68
- "loss": 1.0247,
69
- "step": 45
70
- },
71
- {
72
- "epoch": 0.24,
73
- "learning_rate": 3.461225771070188e-05,
74
- "loss": 1.0066,
75
- "step": 50
76
- },
77
- {
78
- "epoch": 0.26,
79
- "learning_rate": 3.354563143251483e-05,
80
- "loss": 0.9969,
81
- "step": 55
82
- },
83
- {
84
- "epoch": 0.29,
85
- "learning_rate": 3.2402526190701667e-05,
86
- "loss": 1.0205,
87
- "step": 60
88
- },
89
- {
90
- "epoch": 0.31,
91
- "learning_rate": 3.1189395985184464e-05,
92
- "loss": 0.9882,
93
- "step": 65
94
- },
95
- {
96
- "epoch": 0.33,
97
- "learning_rate": 2.9913090178528815e-05,
98
- "loss": 0.9749,
99
- "step": 70
100
- },
101
- {
102
- "epoch": 0.36,
103
- "learning_rate": 2.858081482427673e-05,
104
- "loss": 0.9849,
105
- "step": 75
106
- },
107
- {
108
- "epoch": 0.38,
109
- "learning_rate": 2.7200091981393524e-05,
110
- "loss": 0.9699,
111
- "step": 80
112
- },
113
- {
114
- "epoch": 0.38,
115
- "eval_loss": 1.043211817741394,
116
- "eval_runtime": 26.2196,
117
- "eval_samples_per_second": 4.767,
118
- "eval_steps_per_second": 1.602,
119
- "step": 80
120
- },
121
- {
122
- "epoch": 0.41,
123
- "learning_rate": 2.577871724454045e-05,
124
- "loss": 0.9874,
125
- "step": 85
126
- },
127
- {
128
- "epoch": 0.43,
129
- "learning_rate": 2.4324715729958146e-05,
130
- "loss": 0.9723,
131
- "step": 90
132
- },
133
- {
134
- "epoch": 0.45,
135
- "learning_rate": 2.2846296765465708e-05,
136
- "loss": 0.9884,
137
- "step": 95
138
- },
139
- {
140
- "epoch": 0.48,
141
- "learning_rate": 2.1351807540396666e-05,
142
- "loss": 0.9596,
143
- "step": 100
144
- },
145
- {
146
- "epoch": 0.5,
147
- "learning_rate": 1.9849685977165566e-05,
148
- "loss": 0.9784,
149
- "step": 105
150
  },
151
  {
152
- "epoch": 0.53,
153
- "learning_rate": 1.8348413090553356e-05,
154
- "loss": 0.9715,
155
- "step": 110
156
  },
157
  {
158
  "epoch": 0.55,
159
- "learning_rate": 1.6856465103692203e-05,
160
- "loss": 0.9627,
161
- "step": 115
162
- },
163
- {
164
- "epoch": 0.57,
165
- "learning_rate": 1.5382265591104088e-05,
166
- "loss": 0.96,
167
- "step": 120
168
- },
169
- {
170
- "epoch": 0.6,
171
- "learning_rate": 1.3934137918994753e-05,
172
- "loss": 0.9743,
173
- "step": 125
174
- },
175
- {
176
- "epoch": 0.62,
177
- "learning_rate": 1.2520258251326212e-05,
178
- "loss": 0.9661,
179
- "step": 130
180
- },
181
- {
182
- "epoch": 0.65,
183
- "learning_rate": 1.1148609386996692e-05,
184
- "loss": 0.9676,
185
- "step": 135
186
  },
187
  {
188
- "epoch": 0.67,
189
- "learning_rate": 9.826935688764434e-06,
190
- "loss": 0.9842,
191
- "step": 140
 
 
192
  },
193
  {
194
  "epoch": 0.69,
195
- "learning_rate": 8.562699358387723e-06,
196
- "loss": 0.9628,
197
- "step": 145
198
- },
199
- {
200
- "epoch": 0.72,
201
- "learning_rate": 7.3630383048527255e-06,
202
- "loss": 0.9603,
203
- "step": 150
204
- },
205
- {
206
- "epoch": 0.74,
207
- "learning_rate": 6.234725843566269e-06,
208
- "loss": 0.9626,
209
- "step": 155
210
- },
211
- {
212
- "epoch": 0.77,
213
- "learning_rate": 5.184132454052731e-06,
214
- "loss": 0.9576,
215
- "step": 160
216
- },
217
- {
218
- "epoch": 0.77,
219
- "eval_loss": 1.0250179767608643,
220
- "eval_runtime": 26.1866,
221
- "eval_samples_per_second": 4.773,
222
- "eval_steps_per_second": 1.604,
223
- "step": 160
224
- },
225
- {
226
- "epoch": 0.79,
227
- "learning_rate": 4.217189812072131e-06,
228
- "loss": 0.9659,
229
- "step": 165
230
- },
231
- {
232
- "epoch": 0.81,
233
- "learning_rate": 3.3393572992349156e-06,
234
- "loss": 0.9655,
235
- "step": 170
236
- },
237
- {
238
- "epoch": 0.84,
239
- "learning_rate": 2.5555911792009624e-06,
240
- "loss": 0.9501,
241
- "step": 175
242
- },
243
- {
244
- "epoch": 0.86,
245
- "learning_rate": 1.8703166144947427e-06,
246
- "loss": 0.9754,
247
- "step": 180
248
- },
249
- {
250
- "epoch": 0.88,
251
- "learning_rate": 1.2874026819303698e-06,
252
- "loss": 0.9497,
253
- "step": 185
254
- },
255
- {
256
- "epoch": 0.91,
257
- "learning_rate": 8.101405277100549e-07,
258
- "loss": 0.9477,
259
- "step": 190
260
  },
261
  {
262
- "epoch": 0.93,
263
- "learning_rate": 4.412247855328322e-07,
264
- "loss": 0.9624,
265
- "step": 195
266
  },
267
  {
268
- "epoch": 0.96,
269
- "learning_rate": 1.8273836262732824e-07,
270
- "loss": 0.959,
271
- "step": 200
 
 
272
  },
273
  {
274
- "epoch": 0.98,
275
- "learning_rate": 3.614067960701961e-08,
276
- "loss": 0.9523,
277
- "step": 205
278
  },
279
  {
280
  "epoch": 1.0,
281
- "step": 209,
282
- "total_flos": 3.512657938470666e+17,
283
- "train_loss": 0.5025384334856243,
284
- "train_runtime": 8965.5847,
285
- "train_samples_per_second": 2.798,
286
- "train_steps_per_second": 0.023
287
  }
288
  ],
289
  "logging_steps": 5,
290
- "max_steps": 209,
291
  "num_input_tokens_seen": 0,
292
  "num_train_epochs": 1,
293
- "save_steps": 50,
294
- "total_flos": 3.512657938470666e+17,
295
- "train_batch_size": 3,
296
  "trial_name": null,
297
  "trial_params": null
298
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9961568024596464,
5
+ "eval_steps": 10,
6
+ "global_step": 36,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 3.9923893961834914e-05,
14
+ "loss": 1.3268,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.14,
19
+ "learning_rate": 3.812615574073301e-05,
20
+ "loss": 1.2548,
21
  "step": 5
22
  },
23
  {
24
+ "epoch": 0.28,
25
+ "learning_rate": 3.285575219373079e-05,
26
+ "loss": 1.1912,
27
  "step": 10
28
  },
29
  {
30
+ "epoch": 0.28,
31
+ "eval_loss": 1.109934687614441,
32
+ "eval_runtime": 9.2946,
33
+ "eval_samples_per_second": 5.057,
34
+ "eval_steps_per_second": 1.291,
35
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  },
37
  {
38
+ "epoch": 0.42,
39
+ "learning_rate": 2.5176380902050418e-05,
40
+ "loss": 1.1433,
41
+ "step": 15
42
  },
43
  {
44
  "epoch": 0.55,
45
+ "learning_rate": 1.6527036446661396e-05,
46
+ "loss": 1.1238,
47
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  },
49
  {
50
+ "epoch": 0.55,
51
+ "eval_loss": 1.065536379814148,
52
+ "eval_runtime": 9.2887,
53
+ "eval_samples_per_second": 5.06,
54
+ "eval_steps_per_second": 1.292,
55
+ "step": 20
56
  },
57
  {
58
  "epoch": 0.69,
59
+ "learning_rate": 8.528471272979083e-06,
60
+ "loss": 1.1102,
61
+ "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  },
63
  {
64
+ "epoch": 0.83,
65
+ "learning_rate": 2.679491924311226e-06,
66
+ "loss": 1.1258,
67
+ "step": 30
68
  },
69
  {
70
+ "epoch": 0.83,
71
+ "eval_loss": 1.055001974105835,
72
+ "eval_runtime": 9.2909,
73
+ "eval_samples_per_second": 5.059,
74
+ "eval_steps_per_second": 1.292,
75
+ "step": 30
76
  },
77
  {
78
+ "epoch": 0.97,
79
+ "learning_rate": 7.61060381650891e-08,
80
+ "loss": 1.1272,
81
+ "step": 35
82
  },
83
  {
84
  "epoch": 1.0,
85
+ "step": 36,
86
+ "total_flos": 1.0010669722946765e+17,
87
+ "train_loss": 1.1544433269235823,
88
+ "train_runtime": 5078.5964,
89
+ "train_samples_per_second": 1.537,
90
+ "train_steps_per_second": 0.007
91
  }
92
  ],
93
  "logging_steps": 5,
94
+ "max_steps": 36,
95
  "num_input_tokens_seen": 0,
96
  "num_train_epochs": 1,
97
+ "save_steps": 20,
98
+ "total_flos": 1.0010669722946765e+17,
99
+ "train_batch_size": 6,
100
  "trial_name": null,
101
  "trial_params": null
102
  }