mtzig committed
Commit 7a863fb · verified · 1 Parent(s): 3e41e46

Training in progress, step 800, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:15078b350dceb966b20c8709542ebf0e64b3e9a4c0e2319cdaec4f9c5530bac6
+ oid sha256:a652c5fae11ea2adca853738ad01096d957c3c9179219d103154b7cc54bf3ad3
  size 13648688
last-checkpoint/global_step800/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f877dddd46433a4a3b3134d1597d0e501858c53ef38529470af634bdc6d0189
+ size 20450800
last-checkpoint/global_step800/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18b9212355f3f0b018d7ec9e3ebfd82e6e3d995128421c7d94a2b4c511641a43
+ size 20450800
last-checkpoint/global_step800/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afeace15bd4996a7dec661fffefc77e9c270e4d2aeea76a6fb2f7d0689fb6eb5
+ size 20450800
last-checkpoint/global_step800/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dfd918412cd9bd19e74611b73cd176c7b11f534edfa3c6db14479fe30fcb1d4
+ size 20450800
last-checkpoint/global_step800/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40c25e909a08fdf653f7b456b957ed4d1cda89dfb424aba1b9daf57b7ae1863d
+ size 152238
last-checkpoint/global_step800/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2eb3c53ba5f4d34c3f070d040498034275d76c2a55a2308aa8e3d26e8d57ece
+ size 152238
last-checkpoint/global_step800/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89896592980174dd3b06a9cba6578075ff9f19bc98fcc455acad08cbfcdc859d
+ size 152238
last-checkpoint/global_step800/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd22d993a562918152c931db50c1e239c8ab00cedb5b9ecb46e9ee8a798b231d
+ size 152238
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step700
+ global_step800
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:08ee93655f035f40cef98d94e21df0215201bfd9c2fd009c63503f74d4bd0676
+ oid sha256:87cb02e8cd64657a53dc5b5e254ac5f48ae8d194a60165370a55a19b22db6f41
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9f9350b4bfefd5190b618e0103ff8128fab616f2df08e300e5789f194a7e25b8
+ oid sha256:e24d98488b8ee4c9ae33c4ceab244d33a0b8840dd5bba055edd1d451f13ad848
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fd0a399dabcc87f1904a1f24d9d7781d4c2d3c109c95dd2958fca743902bd75c
+ oid sha256:1d2cd121bedf7b6b76721153b7cef441f373f23cfeb4d690fa2511614d4a9fde
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9e90189ce66cbbdd26dcd499b49b05660c650805c2cfc5e25340f61c20bbb952
+ oid sha256:e241a5faad1af0a2c7b86616994b8a556b97dd87a2d34edfc821f52d2592e9ec
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ed875039ee3baaee6a245c8988a3754c26fb7f9e800cc58167646a8642969266
+ oid sha256:6a7ed681055e4d6c45b04422cd95ffb42b03b2ad7470799d0056fdac3772ffaa
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.8607439286812173,
+ "epoch": 0.9837073470642483,
  "eval_steps": 40,
- "global_step": 700,
+ "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -5123,6 +5123,742 @@
  "learning_rate": 1.1562177270067766e-06,
  "loss": 0.2128,
  "step": 700
5126
+ },
5127
+ {
5128
+ "epoch": 0.8619735628650477,
5129
+ "grad_norm": 0.4252111135080987,
5130
+ "learning_rate": 1.1362392103604536e-06,
5131
+ "loss": 0.1746,
5132
+ "step": 701
5133
+ },
5134
+ {
5135
+ "epoch": 0.8632031970488779,
5136
+ "grad_norm": 0.45840370226979704,
5137
+ "learning_rate": 1.1164244064062101e-06,
5138
+ "loss": 0.2294,
5139
+ "step": 702
5140
+ },
5141
+ {
5142
+ "epoch": 0.8644328312327083,
5143
+ "grad_norm": 0.4054117908741426,
5144
+ "learning_rate": 1.0967736811213048e-06,
5145
+ "loss": 0.2092,
5146
+ "step": 703
5147
+ },
5148
+ {
5149
+ "epoch": 0.8656624654165386,
5150
+ "grad_norm": 0.39243641295826015,
5151
+ "learning_rate": 1.0772873974524833e-06,
5152
+ "loss": 0.2066,
5153
+ "step": 704
5154
+ },
5155
+ {
5156
+ "epoch": 0.8668920996003688,
5157
+ "grad_norm": 0.354816693403516,
5158
+ "learning_rate": 1.0579659153092759e-06,
5159
+ "loss": 0.1898,
5160
+ "step": 705
5161
+ },
5162
+ {
5163
+ "epoch": 0.8681217337841992,
5164
+ "grad_norm": 0.3898934913509522,
5165
+ "learning_rate": 1.0388095915573427e-06,
5166
+ "loss": 0.1295,
5167
+ "step": 706
5168
+ },
5169
+ {
5170
+ "epoch": 0.8693513679680295,
5171
+ "grad_norm": 0.5938780642245854,
5172
+ "learning_rate": 1.0198187800118842e-06,
5173
+ "loss": 0.2201,
5174
+ "step": 707
5175
+ },
5176
+ {
5177
+ "epoch": 0.8705810021518599,
5178
+ "grad_norm": 0.4475510981424918,
5179
+ "learning_rate": 1.0009938314311186e-06,
5180
+ "loss": 0.1617,
5181
+ "step": 708
5182
+ },
5183
+ {
5184
+ "epoch": 0.8718106363356901,
5185
+ "grad_norm": 0.6094389342778289,
5186
+ "learning_rate": 9.82335093509782e-07,
5187
+ "loss": 0.2703,
5188
+ "step": 709
5189
+ },
5190
+ {
5191
+ "epoch": 0.8730402705195205,
5192
+ "grad_norm": 0.3623911413159473,
5193
+ "learning_rate": 9.638429108727232e-07,
5194
+ "loss": 0.1882,
5195
+ "step": 710
5196
+ },
5197
+ {
5198
+ "epoch": 0.8742699047033508,
5199
+ "grad_norm": 0.4538866026680618,
5200
+ "learning_rate": 9.455176250685338e-07,
5201
+ "loss": 0.2013,
5202
+ "step": 711
5203
+ },
5204
+ {
5205
+ "epoch": 0.875499538887181,
5206
+ "grad_norm": 0.3194360166861715,
5207
+ "learning_rate": 9.27359574563238e-07,
5208
+ "loss": 0.1851,
5209
+ "step": 712
5210
+ },
5211
+ {
5212
+ "epoch": 0.8767291730710114,
5213
+ "grad_norm": 0.38338959194718086,
5214
+ "learning_rate": 9.093690947340406e-07,
5215
+ "loss": 0.1927,
5216
+ "step": 713
5217
+ },
5218
+ {
5219
+ "epoch": 0.8779588072548417,
5220
+ "grad_norm": 0.3879298822682774,
5221
+ "learning_rate": 8.915465178631344e-07,
5222
+ "loss": 0.1238,
5223
+ "step": 714
5224
+ },
5225
+ {
5226
+ "epoch": 0.879188441438672,
5227
+ "grad_norm": 0.34738835843437105,
5228
+ "learning_rate": 8.738921731315686e-07,
5229
+ "loss": 0.1616,
5230
+ "step": 715
5231
+ },
5232
+ {
5233
+ "epoch": 0.8804180756225023,
5234
+ "grad_norm": 0.42388132636624687,
5235
+ "learning_rate": 8.564063866131567e-07,
5236
+ "loss": 0.1793,
5237
+ "step": 716
5238
+ },
5239
+ {
5240
+ "epoch": 0.8816477098063327,
5241
+ "grad_norm": 0.5595604732568557,
5242
+ "learning_rate": 8.390894812684602e-07,
5243
+ "loss": 0.2188,
5244
+ "step": 717
5245
+ },
5246
+ {
5247
+ "epoch": 0.8828773439901629,
5248
+ "grad_norm": 0.4324638530503791,
5249
+ "learning_rate": 8.219417769388316e-07,
5250
+ "loss": 0.1984,
5251
+ "step": 718
5252
+ },
5253
+ {
5254
+ "epoch": 0.8841069781739932,
5255
+ "grad_norm": 0.5485490850760583,
5256
+ "learning_rate": 8.049635903404907e-07,
5257
+ "loss": 0.1978,
5258
+ "step": 719
5259
+ },
5260
+ {
5261
+ "epoch": 0.8853366123578236,
5262
+ "grad_norm": 0.34294438952407474,
5263
+ "learning_rate": 7.881552350586863e-07,
5264
+ "loss": 0.1937,
5265
+ "step": 720
5266
+ },
5267
+ {
5268
+ "epoch": 0.8853366123578236,
5269
+ "eval_accuracy": 0.8021390374331551,
5270
+ "eval_f1": 0.5066666666666667,
5271
+ "eval_loss": 0.42445313930511475,
5272
+ "eval_precision": 0.76,
5273
+ "eval_recall": 0.38,
5274
+ "eval_runtime": 23.5507,
5275
+ "eval_samples_per_second": 2.123,
5276
+ "eval_steps_per_second": 0.17,
5277
+ "step": 720
5278
+ },
5279
+ {
5280
+ "epoch": 0.8865662465416538,
5281
+ "grad_norm": 0.3459770483320799,
5282
+ "learning_rate": 7.715170215419043e-07,
5283
+ "loss": 0.1599,
5284
+ "step": 721
5285
+ },
5286
+ {
5287
+ "epoch": 0.8877958807254842,
5288
+ "grad_norm": 0.3720600631339784,
5289
+ "learning_rate": 7.550492570961243e-07,
5290
+ "loss": 0.1482,
5291
+ "step": 722
5292
+ },
5293
+ {
5294
+ "epoch": 0.8890255149093145,
5295
+ "grad_norm": 0.3994271301853499,
5296
+ "learning_rate": 7.387522458791552e-07,
5297
+ "loss": 0.1792,
5298
+ "step": 723
5299
+ },
5300
+ {
5301
+ "epoch": 0.8902551490931447,
5302
+ "grad_norm": 0.3640144769183845,
5303
+ "learning_rate": 7.226262888950153e-07,
5304
+ "loss": 0.1466,
5305
+ "step": 724
5306
+ },
5307
+ {
5308
+ "epoch": 0.8914847832769751,
5309
+ "grad_norm": 0.37455447234355393,
5310
+ "learning_rate": 7.066716839883592e-07,
5311
+ "loss": 0.1584,
5312
+ "step": 725
5313
+ },
5314
+ {
5315
+ "epoch": 0.8927144174608054,
5316
+ "grad_norm": 0.333702811711787,
5317
+ "learning_rate": 6.908887258389974e-07,
5318
+ "loss": 0.186,
5319
+ "step": 726
5320
+ },
5321
+ {
5322
+ "epoch": 0.8939440516446358,
5323
+ "grad_norm": 0.4224450151535308,
5324
+ "learning_rate": 6.752777059564431e-07,
5325
+ "loss": 0.2272,
5326
+ "step": 727
5327
+ },
5328
+ {
5329
+ "epoch": 0.895173685828466,
5330
+ "grad_norm": 0.3600232430765048,
5331
+ "learning_rate": 6.598389126745209e-07,
5332
+ "loss": 0.2266,
5333
+ "step": 728
5334
+ },
5335
+ {
5336
+ "epoch": 0.8964033200122964,
5337
+ "grad_norm": 0.5012149171996233,
5338
+ "learning_rate": 6.445726311460553e-07,
5339
+ "loss": 0.2236,
5340
+ "step": 729
5341
+ },
5342
+ {
5343
+ "epoch": 0.8976329541961267,
5344
+ "grad_norm": 0.39536858077842535,
5345
+ "learning_rate": 6.29479143337598e-07,
5346
+ "loss": 0.145,
5347
+ "step": 730
5348
+ },
5349
+ {
5350
+ "epoch": 0.8988625883799569,
5351
+ "grad_norm": 0.2991426563682339,
5352
+ "learning_rate": 6.145587280242138e-07,
5353
+ "loss": 0.175,
5354
+ "step": 731
5355
+ },
5356
+ {
5357
+ "epoch": 0.9000922225637873,
5358
+ "grad_norm": 0.3676759279938805,
5359
+ "learning_rate": 5.99811660784344e-07,
5360
+ "loss": 0.1776,
5361
+ "step": 732
5362
+ },
5363
+ {
5364
+ "epoch": 0.9013218567476176,
5365
+ "grad_norm": 0.3708523019855745,
5366
+ "learning_rate": 5.852382139947077e-07,
5367
+ "loss": 0.1736,
5368
+ "step": 733
5369
+ },
5370
+ {
5371
+ "epoch": 0.9025514909314479,
5372
+ "grad_norm": 0.43018990463756845,
5373
+ "learning_rate": 5.708386568252688e-07,
5374
+ "loss": 0.1436,
5375
+ "step": 734
5376
+ },
5377
+ {
5378
+ "epoch": 0.9037811251152782,
5379
+ "grad_norm": 0.4802111636137182,
5380
+ "learning_rate": 5.566132552342784e-07,
5381
+ "loss": 0.2092,
5382
+ "step": 735
5383
+ },
5384
+ {
5385
+ "epoch": 0.9050107592991086,
5386
+ "grad_norm": 0.2782286925340434,
5387
+ "learning_rate": 5.425622719633428e-07,
5388
+ "loss": 0.1204,
5389
+ "step": 736
5390
+ },
5391
+ {
5392
+ "epoch": 0.9062403934829388,
5393
+ "grad_norm": 0.42051906131253536,
5394
+ "learning_rate": 5.286859665325905e-07,
5395
+ "loss": 0.2128,
5396
+ "step": 737
5397
+ },
5398
+ {
5399
+ "epoch": 0.9074700276667691,
5400
+ "grad_norm": 0.5089981907905707,
5401
+ "learning_rate": 5.149845952358589e-07,
5402
+ "loss": 0.2393,
5403
+ "step": 738
5404
+ },
5405
+ {
5406
+ "epoch": 0.9086996618505995,
5407
+ "grad_norm": 0.41726717657714646,
5408
+ "learning_rate": 5.014584111359811e-07,
5409
+ "loss": 0.1784,
5410
+ "step": 739
5411
+ },
5412
+ {
5413
+ "epoch": 0.9099292960344297,
5414
+ "grad_norm": 0.4603202083257241,
5415
+ "learning_rate": 4.881076640600979e-07,
5416
+ "loss": 0.1913,
5417
+ "step": 740
5418
+ },
5419
+ {
5420
+ "epoch": 0.9111589302182601,
5421
+ "grad_norm": 0.4130280700492162,
5422
+ "learning_rate": 4.7493260059504497e-07,
5423
+ "loss": 0.238,
5424
+ "step": 741
5425
+ },
5426
+ {
5427
+ "epoch": 0.9123885644020904,
5428
+ "grad_norm": 0.5036196124164085,
5429
+ "learning_rate": 4.6193346408280216e-07,
5430
+ "loss": 0.1926,
5431
+ "step": 742
5432
+ },
5433
+ {
5434
+ "epoch": 0.9136181985859206,
5435
+ "grad_norm": 0.36883928715030756,
5436
+ "learning_rate": 4.491104946160052e-07,
5437
+ "loss": 0.1659,
5438
+ "step": 743
5439
+ },
5440
+ {
5441
+ "epoch": 0.914847832769751,
5442
+ "grad_norm": 0.4945934881547631,
5443
+ "learning_rate": 4.3646392903348823e-07,
5444
+ "loss": 0.1546,
5445
+ "step": 744
5446
+ },
5447
+ {
5448
+ "epoch": 0.9160774669535813,
5449
+ "grad_norm": 0.33228048042062874,
5450
+ "learning_rate": 4.2399400091594154e-07,
5451
+ "loss": 0.174,
5452
+ "step": 745
5453
+ },
5454
+ {
5455
+ "epoch": 0.9173071011374117,
5456
+ "grad_norm": 0.3446047000937293,
5457
+ "learning_rate": 4.117009405815686e-07,
5458
+ "loss": 0.1422,
5459
+ "step": 746
5460
+ },
5461
+ {
5462
+ "epoch": 0.9185367353212419,
5463
+ "grad_norm": 0.4483830076000804,
5464
+ "learning_rate": 3.9958497508185036e-07,
5465
+ "loss": 0.2321,
5466
+ "step": 747
5467
+ },
5468
+ {
5469
+ "epoch": 0.9197663695050723,
5470
+ "grad_norm": 0.6123522355957743,
5471
+ "learning_rate": 3.8764632819734526e-07,
5472
+ "loss": 0.2343,
5473
+ "step": 748
5474
+ },
5475
+ {
5476
+ "epoch": 0.9209960036889026,
5477
+ "grad_norm": 0.4578579557499873,
5478
+ "learning_rate": 3.758852204335539e-07,
5479
+ "loss": 0.2546,
5480
+ "step": 749
5481
+ },
5482
+ {
5483
+ "epoch": 0.9222256378727328,
5484
+ "grad_norm": 0.5396510643509638,
5485
+ "learning_rate": 3.643018690168487e-07,
5486
+ "loss": 0.2142,
5487
+ "step": 750
5488
+ },
5489
+ {
5490
+ "epoch": 0.9234552720565632,
5491
+ "grad_norm": 0.4439335617291929,
5492
+ "learning_rate": 3.5289648789046616e-07,
5493
+ "loss": 0.1763,
5494
+ "step": 751
5495
+ },
5496
+ {
5497
+ "epoch": 0.9246849062403935,
5498
+ "grad_norm": 0.3529798858955445,
5499
+ "learning_rate": 3.4166928771054653e-07,
5500
+ "loss": 0.2423,
5501
+ "step": 752
5502
+ },
5503
+ {
5504
+ "epoch": 0.9259145404242238,
5505
+ "grad_norm": 0.44409731091250987,
5506
+ "learning_rate": 3.3062047584224934e-07,
5507
+ "loss": 0.2102,
5508
+ "step": 753
5509
+ },
5510
+ {
5511
+ "epoch": 0.9271441746080541,
5512
+ "grad_norm": 0.5097303302568637,
5513
+ "learning_rate": 3.197502563559185e-07,
5514
+ "loss": 0.157,
5515
+ "step": 754
5516
+ },
5517
+ {
5518
+ "epoch": 0.9283738087918845,
5519
+ "grad_norm": 0.312449073047315,
5520
+ "learning_rate": 3.0905883002332213e-07,
5521
+ "loss": 0.1459,
5522
+ "step": 755
5523
+ },
5524
+ {
5525
+ "epoch": 0.9296034429757147,
5526
+ "grad_norm": 0.5047008458688232,
5527
+ "learning_rate": 2.985463943139322e-07,
5528
+ "loss": 0.1738,
5529
+ "step": 756
5530
+ },
5531
+ {
5532
+ "epoch": 0.930833077159545,
5533
+ "grad_norm": 0.5557205879162439,
5534
+ "learning_rate": 2.882131433912883e-07,
5535
+ "loss": 0.2488,
5536
+ "step": 757
5537
+ },
5538
+ {
5539
+ "epoch": 0.9320627113433754,
5540
+ "grad_norm": 0.43487988536317196,
5541
+ "learning_rate": 2.7805926810940297e-07,
5542
+ "loss": 0.1683,
5543
+ "step": 758
5544
+ },
5545
+ {
5546
+ "epoch": 0.9332923455272056,
5547
+ "grad_norm": 0.41756639561772235,
5548
+ "learning_rate": 2.6808495600924355e-07,
5549
+ "loss": 0.1387,
5550
+ "step": 759
5551
+ },
5552
+ {
5553
+ "epoch": 0.934521979711036,
5554
+ "grad_norm": 0.3700165943051797,
5555
+ "learning_rate": 2.582903913152612e-07,
5556
+ "loss": 0.164,
5557
+ "step": 760
5558
+ },
5559
+ {
5560
+ "epoch": 0.934521979711036,
5561
+ "eval_accuracy": 0.8074866310160428,
5562
+ "eval_f1": 0.5263157894736842,
5563
+ "eval_loss": 0.41820311546325684,
5564
+ "eval_precision": 0.7692307692307693,
5565
+ "eval_recall": 0.4,
5566
+ "eval_runtime": 23.5561,
5567
+ "eval_samples_per_second": 2.123,
5568
+ "eval_steps_per_second": 0.17,
5569
+ "step": 760
5570
+ },
5571
+ {
5572
+ "epoch": 0.9357516138948663,
5573
+ "grad_norm": 0.375235976764425,
5574
+ "learning_rate": 2.4867575493199515e-07,
5575
+ "loss": 0.1546,
5576
+ "step": 761
5577
+ },
5578
+ {
5579
+ "epoch": 0.9369812480786965,
5580
+ "grad_norm": 0.43981480426316444,
5581
+ "learning_rate": 2.392412244407294e-07,
5582
+ "loss": 0.1917,
5583
+ "step": 762
5584
+ },
5585
+ {
5586
+ "epoch": 0.9382108822625269,
5587
+ "grad_norm": 0.6482748347069781,
5588
+ "learning_rate": 2.2998697409620573e-07,
5589
+ "loss": 0.2225,
5590
+ "step": 763
5591
+ },
5592
+ {
5593
+ "epoch": 0.9394405164463572,
5594
+ "grad_norm": 0.42061880501285365,
5595
+ "learning_rate": 2.2091317482342056e-07,
5596
+ "loss": 0.23,
5597
+ "step": 764
5598
+ },
5599
+ {
5600
+ "epoch": 0.9406701506301876,
5601
+ "grad_norm": 0.33119896763642076,
5602
+ "learning_rate": 2.1201999421445074e-07,
5603
+ "loss": 0.1174,
5604
+ "step": 765
5605
+ },
5606
+ {
5607
+ "epoch": 0.9418997848140178,
5608
+ "grad_norm": 0.3630099682147168,
5609
+ "learning_rate": 2.0330759652536835e-07,
5610
+ "loss": 0.1809,
5611
+ "step": 766
5612
+ },
5613
+ {
5614
+ "epoch": 0.9431294189978482,
5615
+ "grad_norm": 0.5867052799544186,
5616
+ "learning_rate": 1.9477614267320867e-07,
5617
+ "loss": 0.258,
5618
+ "step": 767
5619
+ },
5620
+ {
5621
+ "epoch": 0.9443590531816785,
5622
+ "grad_norm": 0.4296071075770851,
5623
+ "learning_rate": 1.8642579023298913e-07,
5624
+ "loss": 0.2059,
5625
+ "step": 768
5626
+ },
5627
+ {
5628
+ "epoch": 0.9455886873655087,
5629
+ "grad_norm": 0.49292796300633096,
5630
+ "learning_rate": 1.7825669343480624e-07,
5631
+ "loss": 0.2087,
5632
+ "step": 769
5633
+ },
5634
+ {
5635
+ "epoch": 0.9468183215493391,
5636
+ "grad_norm": 0.4595855430483603,
5637
+ "learning_rate": 1.7026900316098217e-07,
5638
+ "loss": 0.1969,
5639
+ "step": 770
5640
+ },
5641
+ {
5642
+ "epoch": 0.9480479557331694,
5643
+ "grad_norm": 0.35921138730577007,
5644
+ "learning_rate": 1.6246286694328594e-07,
5645
+ "loss": 0.1353,
5646
+ "step": 771
5647
+ },
5648
+ {
5649
+ "epoch": 0.9492775899169997,
5650
+ "grad_norm": 0.41891935550025844,
5651
+ "learning_rate": 1.5483842896019675e-07,
5652
+ "loss": 0.144,
5653
+ "step": 772
5654
+ },
5655
+ {
5656
+ "epoch": 0.95050722410083,
5657
+ "grad_norm": 0.3608543106060588,
5658
+ "learning_rate": 1.473958300342504e-07,
5659
+ "loss": 0.1887,
5660
+ "step": 773
5661
+ },
5662
+ {
5663
+ "epoch": 0.9517368582846603,
5664
+ "grad_norm": 0.29802095126527695,
5665
+ "learning_rate": 1.401352076294371e-07,
5666
+ "loss": 0.114,
5667
+ "step": 774
5668
+ },
5669
+ {
5670
+ "epoch": 0.9529664924684906,
5671
+ "grad_norm": 0.4605125584516114,
5672
+ "learning_rate": 1.3305669584865565e-07,
5673
+ "loss": 0.226,
5674
+ "step": 775
5675
+ },
5676
+ {
5677
+ "epoch": 0.9541961266523209,
5678
+ "grad_norm": 0.4384971669173196,
5679
+ "learning_rate": 1.261604254312454e-07,
5680
+ "loss": 0.2221,
5681
+ "step": 776
5682
+ },
5683
+ {
5684
+ "epoch": 0.9554257608361513,
5685
+ "grad_norm": 0.5042450469158528,
5686
+ "learning_rate": 1.1944652375056597e-07,
5687
+ "loss": 0.1903,
5688
+ "step": 777
5689
+ },
5690
+ {
5691
+ "epoch": 0.9566553950199815,
5692
+ "grad_norm": 0.34864348698758946,
5693
+ "learning_rate": 1.1291511481164807e-07,
5694
+ "loss": 0.1514,
5695
+ "step": 778
5696
+ },
5697
+ {
5698
+ "epoch": 0.9578850292038119,
5699
+ "grad_norm": 0.39006673270014214,
5700
+ "learning_rate": 1.0656631924889749e-07,
5701
+ "loss": 0.1849,
5702
+ "step": 779
5703
+ },
5704
+ {
5705
+ "epoch": 0.9591146633876422,
5706
+ "grad_norm": 0.37108179891285153,
5707
+ "learning_rate": 1.0040025432387801e-07,
5708
+ "loss": 0.1699,
5709
+ "step": 780
5710
+ },
5711
+ {
5712
+ "epoch": 0.9603442975714724,
5713
+ "grad_norm": 0.4840858020566464,
5714
+ "learning_rate": 9.441703392313095e-08,
5715
+ "loss": 0.1975,
5716
+ "step": 781
5717
+ },
5718
+ {
5719
+ "epoch": 0.9615739317553028,
5720
+ "grad_norm": 0.4069300865488938,
5721
+ "learning_rate": 8.861676855608237e-08,
5722
+ "loss": 0.1635,
5723
+ "step": 782
5724
+ },
5725
+ {
5726
+ "epoch": 0.9628035659391331,
5727
+ "grad_norm": 0.43469366649519353,
5728
+ "learning_rate": 8.299956535300135e-08,
5729
+ "loss": 0.2255,
5730
+ "step": 783
5731
+ },
5732
+ {
5733
+ "epoch": 0.9640332001229635,
5734
+ "grad_norm": 0.4875682606006585,
5735
+ "learning_rate": 7.756552806301498e-08,
5736
+ "loss": 0.2556,
5737
+ "step": 784
5738
+ },
5739
+ {
5740
+ "epoch": 0.9652628343067937,
5741
+ "grad_norm": 0.37118130700190005,
5742
+ "learning_rate": 7.23147570521987e-08,
5743
+ "loss": 0.1849,
5744
+ "step": 785
5745
+ },
5746
+ {
5747
+ "epoch": 0.9664924684906241,
5748
+ "grad_norm": 0.40788113934634646,
5749
+ "learning_rate": 6.724734930171561e-08,
5750
+ "loss": 0.132,
5751
+ "step": 786
5752
+ },
5753
+ {
5754
+ "epoch": 0.9677221026744544,
5755
+ "grad_norm": 0.40100916877522913,
5756
+ "learning_rate": 6.236339840603677e-08,
5757
+ "loss": 0.1737,
5758
+ "step": 787
5759
+ },
5760
+ {
5761
+ "epoch": 0.9689517368582846,
5762
+ "grad_norm": 0.288332597781413,
5763
+ "learning_rate": 5.766299457119817e-08,
5764
+ "loss": 0.1717,
5765
+ "step": 788
5766
+ },
5767
+ {
5768
+ "epoch": 0.970181371042115,
5769
+ "grad_norm": 0.3844917502681006,
5770
+ "learning_rate": 5.3146224613144225e-08,
5771
+ "loss": 0.2025,
5772
+ "step": 789
5773
+ },
5774
+ {
5775
+ "epoch": 0.9714110052259453,
5776
+ "grad_norm": 0.39531701055460605,
5777
+ "learning_rate": 4.8813171956123565e-08,
5778
+ "loss": 0.1683,
5779
+ "step": 790
5780
+ },
5781
+ {
5782
+ "epoch": 0.9726406394097756,
5783
+ "grad_norm": 0.4042396380118405,
5784
+ "learning_rate": 4.4663916631143554e-08,
5785
+ "loss": 0.1624,
5786
+ "step": 791
5787
+ },
5788
+ {
5789
+ "epoch": 0.9738702735936059,
5790
+ "grad_norm": 0.2908881244181945,
5791
+ "learning_rate": 4.069853527449596e-08,
5792
+ "loss": 0.1629,
5793
+ "step": 792
5794
+ },
5795
+ {
5796
+ "epoch": 0.9750999077774362,
5797
+ "grad_norm": 0.4056597153568874,
5798
+ "learning_rate": 3.691710112634139e-08,
5799
+ "loss": 0.2174,
5800
+ "step": 793
5801
+ },
5802
+ {
5803
+ "epoch": 0.9763295419612665,
5804
+ "grad_norm": 0.45435351641201505,
5805
+ "learning_rate": 3.3319684029354815e-08,
5806
+ "loss": 0.1799,
5807
+ "step": 794
5808
+ },
5809
+ {
5810
+ "epoch": 0.9775591761450968,
5811
+ "grad_norm": 0.300609635946807,
5812
+ "learning_rate": 2.9906350427435505e-08,
5813
+ "loss": 0.1744,
5814
+ "step": 795
5815
+ },
5816
+ {
5817
+ "epoch": 0.9787888103289272,
5818
+ "grad_norm": 0.3475286198999752,
5819
+ "learning_rate": 2.667716336448356e-08,
5820
+ "loss": 0.1171,
5821
+ "step": 796
5822
+ },
5823
+ {
5824
+ "epoch": 0.9800184445127574,
5825
+ "grad_norm": 0.3272831675072718,
5826
+ "learning_rate": 2.3632182483228628e-08,
5827
+ "loss": 0.1556,
5828
+ "step": 797
5829
+ },
5830
+ {
5831
+ "epoch": 0.9812480786965878,
5832
+ "grad_norm": 0.48277032481078636,
5833
+ "learning_rate": 2.077146402413521e-08,
5834
+ "loss": 0.2464,
5835
+ "step": 798
5836
+ },
5837
+ {
5838
+ "epoch": 0.9824777128804181,
5839
+ "grad_norm": 0.32237492877791696,
5840
+ "learning_rate": 1.80950608243613e-08,
5841
+ "loss": 0.1548,
5842
+ "step": 799
5843
+ },
5844
+ {
5845
+ "epoch": 0.9837073470642483,
5846
+ "grad_norm": 0.4196152906906351,
5847
+ "learning_rate": 1.5603022316780235e-08,
5848
+ "loss": 0.2185,
5849
+ "step": 800
5850
+ },
5851
+ {
5852
+ "epoch": 0.9837073470642483,
5853
+ "eval_accuracy": 0.8128342245989305,
5854
+ "eval_f1": 0.5454545454545454,
5855
+ "eval_loss": 0.4195312559604645,
5856
+ "eval_precision": 0.7777777777777778,
5857
+ "eval_recall": 0.42,
5858
+ "eval_runtime": 23.0969,
5859
+ "eval_samples_per_second": 2.165,
5860
+ "eval_steps_per_second": 0.173,
5861
+ "step": 800
  }
  ],
  "logging_steps": 1,
@@ -5142,7 +5878,7 @@
  "attributes": {}
  }
  },
- "total_flos": 687762207244288.0,
+ "total_flos": 785744118972416.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null