alexredna committed on
Commit
b8d192d
1 Parent(s): 5017998

Model save

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: Tukan-1.1B-Chat-v0.1
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # Tukan-1.1B-Chat-v0.1
17
+
18
+ This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on an unspecified dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 1.5518
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
+ More information needed
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 4e-05
40
+ - train_batch_size: 1
41
+ - eval_batch_size: 1
42
+ - seed: 42
43
+ - distributed_type: multi-GPU
44
+ - gradient_accumulation_steps: 40
45
+ - total_train_batch_size: 40
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: cosine
48
+ - num_epochs: 3
49
+
50
+ ### Training results
51
+
52
+ | Training Loss | Epoch | Step | Validation Loss |
53
+ |:-------------:|:-----:|:----:|:---------------:|
54
+ | 1.4434 | 1.0 | 366 | 1.3898 |
55
+ | 0.9304 | 2.0 | 733 | 1.4106 |
56
+ | 0.5651 | 2.99 | 1098 | 1.5518 |
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.36.2
62
+ - Pytorch 2.2.0a0+gitd925d94
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.15.0
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.99,
3
+ "eval_loss": 1.5518141984939575,
4
+ "eval_runtime": 10.9934,
5
+ "eval_samples": 300,
6
+ "eval_samples_per_second": 27.289,
7
+ "eval_steps_per_second": 27.289,
8
+ "train_loss": 0.9967951453231506,
9
+ "train_runtime": 11703.2983,
10
+ "train_samples": 14671,
11
+ "train_samples_per_second": 3.761,
12
+ "train_steps_per_second": 0.094
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.99,
3
+ "eval_loss": 1.5518141984939575,
4
+ "eval_runtime": 10.9934,
5
+ "eval_samples": 300,
6
+ "eval_samples_per_second": 27.289,
7
+ "eval_steps_per_second": 27.289
8
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.36.2"
7
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16f9bbc587efbdc0a4d114917554e2b930ed083c4ff6b84674c36b5cefb750f6
3
  size 4400216536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3a23ff254360aacb0f6fd5c0a766b83ebf9a1d74403ec965c63c52d2a8f645d
3
  size 4400216536
runs/Jan11_11-07-49_98f107f1aa39/events.out.tfevents.1704971327.98f107f1aa39.6703.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12e245cc2ddd26c7c30be490ab524d705444e64e5e3dfae5d12a6549e6a9398b
3
- size 12238
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d879a733676237bc27275058f22c93b18ddad27bfb12bbf858505e80310f58e
3
+ size 14276
runs/Jan11_11-07-49_98f107f1aa39/events.out.tfevents.1704983878.98f107f1aa39.6703.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a48fb69286c88bd80a48cd8017d3bb3959741aa11d6c0270874e9decc0cd644
3
+ size 359
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.99,
3
+ "train_loss": 0.9967951453231506,
4
+ "train_runtime": 11703.2983,
5
+ "train_samples": 14671,
6
+ "train_samples_per_second": 3.761,
7
+ "train_steps_per_second": 0.094
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.993660963806148,
5
+ "eval_steps": 500,
6
+ "global_step": 1098,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 3.999991813565924e-05,
14
+ "loss": 2.2897,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.05,
19
+ "learning_rate": 3.996726317608652e-05,
20
+ "loss": 1.6172,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.11,
25
+ "learning_rate": 3.986915987431006e-05,
26
+ "loss": 1.5144,
27
+ "step": 40
28
+ },
29
+ {
30
+ "epoch": 0.16,
31
+ "learning_rate": 3.970601125372218e-05,
32
+ "loss": 1.5003,
33
+ "step": 60
34
+ },
35
+ {
36
+ "epoch": 0.22,
37
+ "learning_rate": 3.947835141108928e-05,
38
+ "loss": 1.4788,
39
+ "step": 80
40
+ },
41
+ {
42
+ "epoch": 0.27,
43
+ "learning_rate": 3.9186925632429396e-05,
44
+ "loss": 1.4834,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.33,
49
+ "learning_rate": 3.883268795318252e-05,
50
+ "loss": 1.4782,
51
+ "step": 120
52
+ },
53
+ {
54
+ "epoch": 0.38,
55
+ "learning_rate": 3.8416798035001545e-05,
56
+ "loss": 1.4776,
57
+ "step": 140
58
+ },
59
+ {
60
+ "epoch": 0.44,
61
+ "learning_rate": 3.794061736938837e-05,
62
+ "loss": 1.4813,
63
+ "step": 160
64
+ },
65
+ {
66
+ "epoch": 0.49,
67
+ "learning_rate": 3.740570482060311e-05,
68
+ "loss": 1.4974,
69
+ "step": 180
70
+ },
71
+ {
72
+ "epoch": 0.55,
73
+ "learning_rate": 3.681381152243763e-05,
74
+ "loss": 1.4778,
75
+ "step": 200
76
+ },
77
+ {
78
+ "epoch": 0.6,
79
+ "learning_rate": 3.6166875145559684e-05,
80
+ "loss": 1.5029,
81
+ "step": 220
82
+ },
83
+ {
84
+ "epoch": 0.65,
85
+ "learning_rate": 3.54670135541946e-05,
86
+ "loss": 1.5029,
87
+ "step": 240
88
+ },
89
+ {
90
+ "epoch": 0.71,
91
+ "learning_rate": 3.4716517872910405e-05,
92
+ "loss": 1.4741,
93
+ "step": 260
94
+ },
95
+ {
96
+ "epoch": 0.76,
97
+ "learning_rate": 3.391784498620369e-05,
98
+ "loss": 1.4563,
99
+ "step": 280
100
+ },
101
+ {
102
+ "epoch": 0.82,
103
+ "learning_rate": 3.307360949544012e-05,
104
+ "loss": 1.4634,
105
+ "step": 300
106
+ },
107
+ {
108
+ "epoch": 0.87,
109
+ "learning_rate": 3.2186575159479966e-05,
110
+ "loss": 1.4616,
111
+ "step": 320
112
+ },
113
+ {
114
+ "epoch": 0.93,
115
+ "learning_rate": 3.1259645847009384e-05,
116
+ "loss": 1.4308,
117
+ "step": 340
118
+ },
119
+ {
120
+ "epoch": 0.98,
121
+ "learning_rate": 3.0295856030196618e-05,
122
+ "loss": 1.4434,
123
+ "step": 360
124
+ },
125
+ {
126
+ "epoch": 1.0,
127
+ "eval_loss": 1.3897957801818848,
128
+ "eval_runtime": 11.4488,
129
+ "eval_samples_per_second": 26.204,
130
+ "eval_steps_per_second": 26.204,
131
+ "step": 366
132
+ },
133
+ {
134
+ "epoch": 1.04,
135
+ "learning_rate": 2.9298360850793944e-05,
136
+ "loss": 1.1296,
137
+ "step": 380
138
+ },
139
+ {
140
+ "epoch": 1.09,
141
+ "learning_rate": 2.827042579120562e-05,
142
+ "loss": 0.9657,
143
+ "step": 400
144
+ },
145
+ {
146
+ "epoch": 1.15,
147
+ "learning_rate": 2.721541598433567e-05,
148
+ "loss": 0.9303,
149
+ "step": 420
150
+ },
151
+ {
152
+ "epoch": 1.2,
153
+ "learning_rate": 2.613678519721155e-05,
154
+ "loss": 0.9411,
155
+ "step": 440
156
+ },
157
+ {
158
+ "epoch": 1.25,
159
+ "learning_rate": 2.5038064524447827e-05,
160
+ "loss": 0.9468,
161
+ "step": 460
162
+ },
163
+ {
164
+ "epoch": 1.31,
165
+ "learning_rate": 2.392285082856394e-05,
166
+ "loss": 0.938,
167
+ "step": 480
168
+ },
169
+ {
170
+ "epoch": 1.36,
171
+ "learning_rate": 2.2794794964998705e-05,
172
+ "loss": 0.938,
173
+ "step": 500
174
+ },
175
+ {
176
+ "epoch": 1.42,
177
+ "learning_rate": 2.1657589830369113e-05,
178
+ "loss": 0.9383,
179
+ "step": 520
180
+ },
181
+ {
182
+ "epoch": 1.47,
183
+ "learning_rate": 2.0514958273099778e-05,
184
+ "loss": 0.9431,
185
+ "step": 540
186
+ },
187
+ {
188
+ "epoch": 1.53,
189
+ "learning_rate": 1.93706409059995e-05,
190
+ "loss": 0.937,
191
+ "step": 560
192
+ },
193
+ {
194
+ "epoch": 1.58,
195
+ "learning_rate": 1.82283838606831e-05,
196
+ "loss": 0.9408,
197
+ "step": 580
198
+ },
199
+ {
200
+ "epoch": 1.64,
201
+ "learning_rate": 1.7091926523926205e-05,
202
+ "loss": 0.9567,
203
+ "step": 600
204
+ },
205
+ {
206
+ "epoch": 1.69,
207
+ "learning_rate": 1.5964989296100682e-05,
208
+ "loss": 0.9302,
209
+ "step": 620
210
+ },
211
+ {
212
+ "epoch": 1.74,
213
+ "learning_rate": 1.4851261411765414e-05,
214
+ "loss": 0.9309,
215
+ "step": 640
216
+ },
217
+ {
218
+ "epoch": 1.8,
219
+ "learning_rate": 1.375438886228411e-05,
220
+ "loss": 0.9354,
221
+ "step": 660
222
+ },
223
+ {
224
+ "epoch": 1.85,
225
+ "learning_rate": 1.2677962460007555e-05,
226
+ "loss": 0.9429,
227
+ "step": 680
228
+ },
229
+ {
230
+ "epoch": 1.91,
231
+ "learning_rate": 1.162550608309446e-05,
232
+ "loss": 0.9209,
233
+ "step": 700
234
+ },
235
+ {
236
+ "epoch": 1.96,
237
+ "learning_rate": 1.060046513945361e-05,
238
+ "loss": 0.9304,
239
+ "step": 720
240
+ },
241
+ {
242
+ "epoch": 2.0,
243
+ "eval_loss": 1.4105572700500488,
244
+ "eval_runtime": 11.4541,
245
+ "eval_samples_per_second": 26.191,
246
+ "eval_steps_per_second": 26.191,
247
+ "step": 733
248
+ },
249
+ {
250
+ "epoch": 2.02,
251
+ "learning_rate": 9.606195287572577e-06,
252
+ "loss": 0.7909,
253
+ "step": 740
254
+ },
255
+ {
256
+ "epoch": 2.07,
257
+ "learning_rate": 8.645951451157741e-06,
258
+ "loss": 0.5917,
259
+ "step": 760
260
+ },
261
+ {
262
+ "epoch": 2.13,
263
+ "learning_rate": 7.72287716354776e-06,
264
+ "loss": 0.5678,
265
+ "step": 780
266
+ },
267
+ {
268
+ "epoch": 2.18,
269
+ "learning_rate": 6.8399942767839075e-06,
270
+ "loss": 0.5837,
271
+ "step": 800
272
+ },
273
+ {
274
+ "epoch": 2.24,
275
+ "learning_rate": 6.000193069026181e-06,
276
+ "loss": 0.5701,
277
+ "step": 820
278
+ },
279
+ {
280
+ "epoch": 2.29,
281
+ "learning_rate": 5.206222782700667e-06,
282
+ "loss": 0.5467,
283
+ "step": 840
284
+ },
285
+ {
286
+ "epoch": 2.34,
287
+ "learning_rate": 4.460682624352952e-06,
288
+ "loss": 0.5695,
289
+ "step": 860
290
+ },
291
+ {
292
+ "epoch": 2.4,
293
+ "learning_rate": 3.766013255671479e-06,
294
+ "loss": 0.5557,
295
+ "step": 880
296
+ },
297
+ {
298
+ "epoch": 2.45,
299
+ "learning_rate": 3.1244888035362875e-06,
300
+ "loss": 0.5468,
301
+ "step": 900
302
+ },
303
+ {
304
+ "epoch": 2.51,
305
+ "learning_rate": 2.5382094152499705e-06,
306
+ "loss": 0.5793,
307
+ "step": 920
308
+ },
309
+ {
310
+ "epoch": 2.56,
311
+ "learning_rate": 2.009094383322356e-06,
312
+ "loss": 0.5462,
313
+ "step": 940
314
+ },
315
+ {
316
+ "epoch": 2.62,
317
+ "learning_rate": 1.5388758623164802e-06,
318
+ "loss": 0.5617,
319
+ "step": 960
320
+ },
321
+ {
322
+ "epoch": 2.67,
323
+ "learning_rate": 1.1290931983246334e-06,
324
+ "loss": 0.5574,
325
+ "step": 980
326
+ },
327
+ {
328
+ "epoch": 2.73,
329
+ "learning_rate": 7.810878896382101e-07,
330
+ "loss": 0.5632,
331
+ "step": 1000
332
+ },
333
+ {
334
+ "epoch": 2.78,
335
+ "learning_rate": 4.959991951083498e-07,
336
+ "loss": 0.57,
337
+ "step": 1020
338
+ },
339
+ {
340
+ "epoch": 2.84,
341
+ "learning_rate": 2.747604045743102e-07,
342
+ "loss": 0.5498,
343
+ "step": 1040
344
+ },
345
+ {
346
+ "epoch": 2.89,
347
+ "learning_rate": 1.180957835689478e-07,
348
+ "loss": 0.5369,
349
+ "step": 1060
350
+ },
351
+ {
352
+ "epoch": 2.94,
353
+ "learning_rate": 2.651820230338942e-08,
354
+ "loss": 0.5651,
355
+ "step": 1080
356
+ },
357
+ {
358
+ "epoch": 2.99,
359
+ "eval_loss": 1.5518141984939575,
360
+ "eval_runtime": 11.4288,
361
+ "eval_samples_per_second": 26.25,
362
+ "eval_steps_per_second": 26.25,
363
+ "step": 1098
364
+ },
365
+ {
366
+ "epoch": 2.99,
367
+ "step": 1098,
368
+ "total_flos": 6.035394717233971e+16,
369
+ "train_loss": 0.9967951453231506,
370
+ "train_runtime": 11703.2983,
371
+ "train_samples_per_second": 3.761,
372
+ "train_steps_per_second": 0.094
373
+ }
374
+ ],
375
+ "logging_steps": 20,
376
+ "max_steps": 1098,
377
+ "num_input_tokens_seen": 0,
378
+ "num_train_epochs": 3,
379
+ "save_steps": 20,
380
+ "total_flos": 6.035394717233971e+16,
381
+ "train_batch_size": 1,
382
+ "trial_name": null,
383
+ "trial_params": null
384
+ }