{
"best_metric": 0.0022250961046665907,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.03170577045022194,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001585288522511097,
"grad_norm": 12.08696174621582,
"learning_rate": 1.008e-05,
"loss": 4.2706,
"step": 1
},
{
"epoch": 0.0001585288522511097,
"eval_loss": 4.04376220703125,
"eval_runtime": 285.4706,
"eval_samples_per_second": 9.304,
"eval_steps_per_second": 2.326,
"step": 1
},
{
"epoch": 0.0003170577045022194,
"grad_norm": 15.856035232543945,
"learning_rate": 2.016e-05,
"loss": 3.9804,
"step": 2
},
{
"epoch": 0.0004755865567533291,
"grad_norm": 15.65820026397705,
"learning_rate": 3.024e-05,
"loss": 3.9763,
"step": 3
},
{
"epoch": 0.0006341154090044388,
"grad_norm": 14.708057403564453,
"learning_rate": 4.032e-05,
"loss": 3.2287,
"step": 4
},
{
"epoch": 0.0007926442612555486,
"grad_norm": 10.415152549743652,
"learning_rate": 5.04e-05,
"loss": 2.0174,
"step": 5
},
{
"epoch": 0.0009511731135066582,
"grad_norm": 12.43994140625,
"learning_rate": 6.048e-05,
"loss": 1.2254,
"step": 6
},
{
"epoch": 0.0011097019657577679,
"grad_norm": 10.96854019165039,
"learning_rate": 7.055999999999999e-05,
"loss": 0.3292,
"step": 7
},
{
"epoch": 0.0012682308180088776,
"grad_norm": 36.675350189208984,
"learning_rate": 8.064e-05,
"loss": 0.2307,
"step": 8
},
{
"epoch": 0.0014267596702599874,
"grad_norm": 2.064239501953125,
"learning_rate": 9.072e-05,
"loss": 0.0138,
"step": 9
},
{
"epoch": 0.0015852885225110971,
"grad_norm": 0.1330491453409195,
"learning_rate": 0.0001008,
"loss": 0.001,
"step": 10
},
{
"epoch": 0.0017438173747622067,
"grad_norm": 0.01124438177794218,
"learning_rate": 0.00010026947368421052,
"loss": 0.0001,
"step": 11
},
{
"epoch": 0.0019023462270133164,
"grad_norm": 0.004640920553356409,
"learning_rate": 9.973894736842104e-05,
"loss": 0.0,
"step": 12
},
{
"epoch": 0.002060875079264426,
"grad_norm": 0.007322155870497227,
"learning_rate": 9.920842105263157e-05,
"loss": 0.0,
"step": 13
},
{
"epoch": 0.0022194039315155357,
"grad_norm": 0.012465717270970345,
"learning_rate": 9.86778947368421e-05,
"loss": 0.0001,
"step": 14
},
{
"epoch": 0.0023779327837666455,
"grad_norm": 0.0506725050508976,
"learning_rate": 9.814736842105264e-05,
"loss": 0.0001,
"step": 15
},
{
"epoch": 0.0025364616360177552,
"grad_norm": 0.00856021698564291,
"learning_rate": 9.761684210526316e-05,
"loss": 0.0,
"step": 16
},
{
"epoch": 0.002694990488268865,
"grad_norm": 0.001643276889808476,
"learning_rate": 9.708631578947368e-05,
"loss": 0.0,
"step": 17
},
{
"epoch": 0.0028535193405199747,
"grad_norm": 0.0009701295639388263,
"learning_rate": 9.655578947368421e-05,
"loss": 0.0,
"step": 18
},
{
"epoch": 0.0030120481927710845,
"grad_norm": 0.0005847598076798022,
"learning_rate": 9.602526315789473e-05,
"loss": 0.0,
"step": 19
},
{
"epoch": 0.0031705770450221942,
"grad_norm": 0.0006212879670783877,
"learning_rate": 9.549473684210525e-05,
"loss": 0.0,
"step": 20
},
{
"epoch": 0.0033291058972733036,
"grad_norm": 0.0008075927617028356,
"learning_rate": 9.496421052631579e-05,
"loss": 0.0,
"step": 21
},
{
"epoch": 0.0034876347495244133,
"grad_norm": 0.0006020912551321089,
"learning_rate": 9.443368421052631e-05,
"loss": 0.0,
"step": 22
},
{
"epoch": 0.003646163601775523,
"grad_norm": 0.0005198498256504536,
"learning_rate": 9.390315789473683e-05,
"loss": 0.0,
"step": 23
},
{
"epoch": 0.003804692454026633,
"grad_norm": 0.0016288729384541512,
"learning_rate": 9.337263157894737e-05,
"loss": 0.0,
"step": 24
},
{
"epoch": 0.003963221306277742,
"grad_norm": 0.0007833559648133814,
"learning_rate": 9.28421052631579e-05,
"loss": 0.0,
"step": 25
},
{
"epoch": 0.004121750158528852,
"grad_norm": 0.0014727013185620308,
"learning_rate": 9.231157894736842e-05,
"loss": 0.0,
"step": 26
},
{
"epoch": 0.004280279010779962,
"grad_norm": 0.0016179227968677878,
"learning_rate": 9.178105263157895e-05,
"loss": 0.0,
"step": 27
},
{
"epoch": 0.004438807863031071,
"grad_norm": 0.0005032668123021722,
"learning_rate": 9.125052631578948e-05,
"loss": 0.0,
"step": 28
},
{
"epoch": 0.004597336715282181,
"grad_norm": 0.0005934814107604325,
"learning_rate": 9.072e-05,
"loss": 0.0,
"step": 29
},
{
"epoch": 0.004755865567533291,
"grad_norm": 0.00047520003863610327,
"learning_rate": 9.018947368421052e-05,
"loss": 0.0,
"step": 30
},
{
"epoch": 0.004914394419784401,
"grad_norm": 0.0007378292502835393,
"learning_rate": 8.965894736842104e-05,
"loss": 0.0,
"step": 31
},
{
"epoch": 0.0050729232720355105,
"grad_norm": 0.0007299358258023858,
"learning_rate": 8.912842105263157e-05,
"loss": 0.0,
"step": 32
},
{
"epoch": 0.00523145212428662,
"grad_norm": 0.0005630860105156898,
"learning_rate": 8.85978947368421e-05,
"loss": 0.0,
"step": 33
},
{
"epoch": 0.00538998097653773,
"grad_norm": 0.0009994081920012832,
"learning_rate": 8.806736842105264e-05,
"loss": 0.0,
"step": 34
},
{
"epoch": 0.00554850982878884,
"grad_norm": 0.0004544692055787891,
"learning_rate": 8.753684210526316e-05,
"loss": 0.0,
"step": 35
},
{
"epoch": 0.0057070386810399495,
"grad_norm": 0.00069584691664204,
"learning_rate": 8.700631578947369e-05,
"loss": 0.0,
"step": 36
},
{
"epoch": 0.005865567533291059,
"grad_norm": 0.0019399194279685616,
"learning_rate": 8.647578947368421e-05,
"loss": 0.0,
"step": 37
},
{
"epoch": 0.006024096385542169,
"grad_norm": 0.0005930989282205701,
"learning_rate": 8.594526315789473e-05,
"loss": 0.0,
"step": 38
},
{
"epoch": 0.006182625237793279,
"grad_norm": 0.0007997532375156879,
"learning_rate": 8.541473684210525e-05,
"loss": 0.0,
"step": 39
},
{
"epoch": 0.0063411540900443885,
"grad_norm": 0.0013047147076576948,
"learning_rate": 8.488421052631578e-05,
"loss": 0.0,
"step": 40
},
{
"epoch": 0.006499682942295497,
"grad_norm": 0.00036864462890662253,
"learning_rate": 8.435368421052631e-05,
"loss": 0.0,
"step": 41
},
{
"epoch": 0.006658211794546607,
"grad_norm": 0.0003495727141853422,
"learning_rate": 8.382315789473684e-05,
"loss": 0.0,
"step": 42
},
{
"epoch": 0.006816740646797717,
"grad_norm": 0.0003555091971065849,
"learning_rate": 8.329263157894737e-05,
"loss": 0.0,
"step": 43
},
{
"epoch": 0.006975269499048827,
"grad_norm": 0.0002377888304181397,
"learning_rate": 8.27621052631579e-05,
"loss": 0.0,
"step": 44
},
{
"epoch": 0.007133798351299936,
"grad_norm": 0.0002825877454597503,
"learning_rate": 8.223157894736842e-05,
"loss": 0.0,
"step": 45
},
{
"epoch": 0.007292327203551046,
"grad_norm": 0.0002043453569058329,
"learning_rate": 8.170105263157894e-05,
"loss": 0.0,
"step": 46
},
{
"epoch": 0.007450856055802156,
"grad_norm": 0.0002539000706747174,
"learning_rate": 8.117052631578946e-05,
"loss": 0.0,
"step": 47
},
{
"epoch": 0.007609384908053266,
"grad_norm": 0.00020019305520690978,
"learning_rate": 8.064e-05,
"loss": 0.0,
"step": 48
},
{
"epoch": 0.0077679137603043754,
"grad_norm": 0.00020320792100392282,
"learning_rate": 8.010947368421052e-05,
"loss": 0.0,
"step": 49
},
{
"epoch": 0.007926442612555484,
"grad_norm": 0.00017714353452902287,
"learning_rate": 7.957894736842105e-05,
"loss": 0.0,
"step": 50
},
{
"epoch": 0.007926442612555484,
"eval_loss": 0.0037753561045974493,
"eval_runtime": 285.8239,
"eval_samples_per_second": 9.292,
"eval_steps_per_second": 2.323,
"step": 50
},
{
"epoch": 0.008084971464806594,
"grad_norm": 11.202030181884766,
"learning_rate": 7.904842105263158e-05,
"loss": 0.3948,
"step": 51
},
{
"epoch": 0.008243500317057704,
"grad_norm": 4.6145090891513973e-05,
"learning_rate": 7.85178947368421e-05,
"loss": 0.0,
"step": 52
},
{
"epoch": 0.008402029169308814,
"grad_norm": 5.335342575563118e-05,
"learning_rate": 7.798736842105263e-05,
"loss": 0.0,
"step": 53
},
{
"epoch": 0.008560558021559923,
"grad_norm": 6.71695961500518e-05,
"learning_rate": 7.745684210526315e-05,
"loss": 0.0,
"step": 54
},
{
"epoch": 0.008719086873811033,
"grad_norm": 7.083545642672107e-05,
"learning_rate": 7.692631578947369e-05,
"loss": 0.0,
"step": 55
},
{
"epoch": 0.008877615726062143,
"grad_norm": 0.00011363301746314391,
"learning_rate": 7.639578947368421e-05,
"loss": 0.0,
"step": 56
},
{
"epoch": 0.009036144578313253,
"grad_norm": 0.00015090873057488352,
"learning_rate": 7.586526315789473e-05,
"loss": 0.0,
"step": 57
},
{
"epoch": 0.009194673430564362,
"grad_norm": 0.00017162153380922973,
"learning_rate": 7.533473684210526e-05,
"loss": 0.0,
"step": 58
},
{
"epoch": 0.009353202282815472,
"grad_norm": 0.00037030354724265635,
"learning_rate": 7.480421052631578e-05,
"loss": 0.0,
"step": 59
},
{
"epoch": 0.009511731135066582,
"grad_norm": 0.00045603603939525783,
"learning_rate": 7.427368421052632e-05,
"loss": 0.0,
"step": 60
},
{
"epoch": 0.009670259987317692,
"grad_norm": 0.000606843619607389,
"learning_rate": 7.374315789473685e-05,
"loss": 0.0,
"step": 61
},
{
"epoch": 0.009828788839568801,
"grad_norm": 0.0012902193702757359,
"learning_rate": 7.321263157894737e-05,
"loss": 0.0,
"step": 62
},
{
"epoch": 0.009987317691819911,
"grad_norm": 0.0015680071664974093,
"learning_rate": 7.26821052631579e-05,
"loss": 0.0,
"step": 63
},
{
"epoch": 0.010145846544071021,
"grad_norm": 0.0017102680867537856,
"learning_rate": 7.215157894736842e-05,
"loss": 0.0,
"step": 64
},
{
"epoch": 0.01030437539632213,
"grad_norm": 0.001034354092553258,
"learning_rate": 7.162105263157894e-05,
"loss": 0.0,
"step": 65
},
{
"epoch": 0.01046290424857324,
"grad_norm": 0.0008957475074566901,
"learning_rate": 7.109052631578947e-05,
"loss": 0.0,
"step": 66
},
{
"epoch": 0.01062143310082435,
"grad_norm": 0.000811917707324028,
"learning_rate": 7.055999999999999e-05,
"loss": 0.0,
"step": 67
},
{
"epoch": 0.01077996195307546,
"grad_norm": 0.0010698516853153706,
"learning_rate": 7.002947368421052e-05,
"loss": 0.0,
"step": 68
},
{
"epoch": 0.01093849080532657,
"grad_norm": 0.0007929237326607108,
"learning_rate": 6.949894736842105e-05,
"loss": 0.0,
"step": 69
},
{
"epoch": 0.01109701965757768,
"grad_norm": 0.0005297662573866546,
"learning_rate": 6.896842105263158e-05,
"loss": 0.0,
"step": 70
},
{
"epoch": 0.01125554850982879,
"grad_norm": 0.0004970860900357366,
"learning_rate": 6.843789473684211e-05,
"loss": 0.0,
"step": 71
},
{
"epoch": 0.011414077362079899,
"grad_norm": 0.00042307560215704143,
"learning_rate": 6.790736842105263e-05,
"loss": 0.0,
"step": 72
},
{
"epoch": 0.011572606214331009,
"grad_norm": 0.0003402529109735042,
"learning_rate": 6.737684210526315e-05,
"loss": 0.0,
"step": 73
},
{
"epoch": 0.011731135066582118,
"grad_norm": 0.00037735735531896353,
"learning_rate": 6.684631578947368e-05,
"loss": 0.0,
"step": 74
},
{
"epoch": 0.011889663918833228,
"grad_norm": 0.0004949852591380477,
"learning_rate": 6.631578947368421e-05,
"loss": 0.0,
"step": 75
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.00039665098302066326,
"learning_rate": 6.578526315789473e-05,
"loss": 0.0,
"step": 76
},
{
"epoch": 0.012206721623335448,
"grad_norm": 0.0003759284154511988,
"learning_rate": 6.525473684210526e-05,
"loss": 0.0,
"step": 77
},
{
"epoch": 0.012365250475586557,
"grad_norm": 0.0007349163061007857,
"learning_rate": 6.47242105263158e-05,
"loss": 0.0,
"step": 78
},
{
"epoch": 0.012523779327837667,
"grad_norm": 0.00038908098940737545,
"learning_rate": 6.419368421052632e-05,
"loss": 0.0,
"step": 79
},
{
"epoch": 0.012682308180088777,
"grad_norm": 0.00039207786903716624,
"learning_rate": 6.366315789473684e-05,
"loss": 0.0,
"step": 80
},
{
"epoch": 0.012840837032339885,
"grad_norm": 0.00031621192465536296,
"learning_rate": 6.313263157894736e-05,
"loss": 0.0,
"step": 81
},
{
"epoch": 0.012999365884590995,
"grad_norm": 0.001207337947562337,
"learning_rate": 6.26021052631579e-05,
"loss": 0.0,
"step": 82
},
{
"epoch": 0.013157894736842105,
"grad_norm": 0.0003833776863757521,
"learning_rate": 6.207157894736842e-05,
"loss": 0.0,
"step": 83
},
{
"epoch": 0.013316423589093214,
"grad_norm": 0.0002807167184073478,
"learning_rate": 6.154105263157894e-05,
"loss": 0.0,
"step": 84
},
{
"epoch": 0.013474952441344324,
"grad_norm": 0.00025957514299079776,
"learning_rate": 6.1010526315789474e-05,
"loss": 0.0,
"step": 85
},
{
"epoch": 0.013633481293595434,
"grad_norm": 0.00022323857410810888,
"learning_rate": 6.048e-05,
"loss": 0.0,
"step": 86
},
{
"epoch": 0.013792010145846544,
"grad_norm": 0.00020459384541027248,
"learning_rate": 5.994947368421052e-05,
"loss": 0.0,
"step": 87
},
{
"epoch": 0.013950538998097653,
"grad_norm": 0.00021896703401580453,
"learning_rate": 5.941894736842104e-05,
"loss": 0.0,
"step": 88
},
{
"epoch": 0.014109067850348763,
"grad_norm": 0.0003069478552788496,
"learning_rate": 5.888842105263158e-05,
"loss": 0.0,
"step": 89
},
{
"epoch": 0.014267596702599873,
"grad_norm": 0.0003965249052271247,
"learning_rate": 5.835789473684211e-05,
"loss": 0.0,
"step": 90
},
{
"epoch": 0.014426125554850983,
"grad_norm": 0.00017867452697828412,
"learning_rate": 5.782736842105263e-05,
"loss": 0.0,
"step": 91
},
{
"epoch": 0.014584654407102092,
"grad_norm": 0.00016691711789462715,
"learning_rate": 5.7296842105263154e-05,
"loss": 0.0,
"step": 92
},
{
"epoch": 0.014743183259353202,
"grad_norm": 0.0001635922526475042,
"learning_rate": 5.676631578947368e-05,
"loss": 0.0,
"step": 93
},
{
"epoch": 0.014901712111604312,
"grad_norm": 0.00017515213403385133,
"learning_rate": 5.623578947368421e-05,
"loss": 0.0,
"step": 94
},
{
"epoch": 0.015060240963855422,
"grad_norm": 0.00022856853320263326,
"learning_rate": 5.570526315789474e-05,
"loss": 0.0,
"step": 95
},
{
"epoch": 0.015218769816106531,
"grad_norm": 0.00013485149247571826,
"learning_rate": 5.5174736842105266e-05,
"loss": 0.0,
"step": 96
},
{
"epoch": 0.015377298668357641,
"grad_norm": 0.00022567079577129334,
"learning_rate": 5.464421052631579e-05,
"loss": 0.0,
"step": 97
},
{
"epoch": 0.015535827520608751,
"grad_norm": 0.00031975016463547945,
"learning_rate": 5.411368421052631e-05,
"loss": 0.0,
"step": 98
},
{
"epoch": 0.01569435637285986,
"grad_norm": 0.00018277288472745568,
"learning_rate": 5.358315789473684e-05,
"loss": 0.0,
"step": 99
},
{
"epoch": 0.01585288522511097,
"grad_norm": 0.00018042400188278407,
"learning_rate": 5.3052631578947364e-05,
"loss": 0.0,
"step": 100
},
{
"epoch": 0.01585288522511097,
"eval_loss": 0.0034066014923155308,
"eval_runtime": 285.7064,
"eval_samples_per_second": 9.296,
"eval_steps_per_second": 2.324,
"step": 100
},
{
"epoch": 0.01601141407736208,
"grad_norm": 0.0001420114713255316,
"learning_rate": 5.252210526315789e-05,
"loss": 0.0,
"step": 101
},
{
"epoch": 0.016169942929613188,
"grad_norm": 0.00012088767834939063,
"learning_rate": 5.199157894736842e-05,
"loss": 0.0,
"step": 102
},
{
"epoch": 0.0163284717818643,
"grad_norm": 0.00011178933345945552,
"learning_rate": 5.1461052631578946e-05,
"loss": 0.0,
"step": 103
},
{
"epoch": 0.016487000634115408,
"grad_norm": 0.00010531868610996753,
"learning_rate": 5.0930526315789476e-05,
"loss": 0.0,
"step": 104
},
{
"epoch": 0.01664552948636652,
"grad_norm": 0.00011901999096153304,
"learning_rate": 5.04e-05,
"loss": 0.0,
"step": 105
},
{
"epoch": 0.016804058338617627,
"grad_norm": 9.147950186161324e-05,
"learning_rate": 4.986947368421052e-05,
"loss": 0.0,
"step": 106
},
{
"epoch": 0.01696258719086874,
"grad_norm": 7.989244477357715e-05,
"learning_rate": 4.933894736842105e-05,
"loss": 0.0,
"step": 107
},
{
"epoch": 0.017121116043119847,
"grad_norm": 7.46723817428574e-05,
"learning_rate": 4.880842105263158e-05,
"loss": 0.0,
"step": 108
},
{
"epoch": 0.017279644895370958,
"grad_norm": 8.380914368899539e-05,
"learning_rate": 4.8277894736842103e-05,
"loss": 0.0,
"step": 109
},
{
"epoch": 0.017438173747622066,
"grad_norm": 7.211839692899957e-05,
"learning_rate": 4.7747368421052626e-05,
"loss": 0.0,
"step": 110
},
{
"epoch": 0.017596702599873178,
"grad_norm": 7.725647446932271e-05,
"learning_rate": 4.7216842105263156e-05,
"loss": 0.0,
"step": 111
},
{
"epoch": 0.017755231452124286,
"grad_norm": 8.656168211018667e-05,
"learning_rate": 4.6686315789473686e-05,
"loss": 0.0,
"step": 112
},
{
"epoch": 0.017913760304375397,
"grad_norm": 7.190388714661822e-05,
"learning_rate": 4.615578947368421e-05,
"loss": 0.0,
"step": 113
},
{
"epoch": 0.018072289156626505,
"grad_norm": 7.352729880949482e-05,
"learning_rate": 4.562526315789474e-05,
"loss": 0.0,
"step": 114
},
{
"epoch": 0.018230818008877617,
"grad_norm": 6.0772359574912116e-05,
"learning_rate": 4.509473684210526e-05,
"loss": 0.0,
"step": 115
},
{
"epoch": 0.018389346861128725,
"grad_norm": 5.866353239980526e-05,
"learning_rate": 4.4564210526315784e-05,
"loss": 0.0,
"step": 116
},
{
"epoch": 0.018547875713379836,
"grad_norm": 6.619851046707481e-05,
"learning_rate": 4.403368421052632e-05,
"loss": 0.0,
"step": 117
},
{
"epoch": 0.018706404565630944,
"grad_norm": 6.62534948787652e-05,
"learning_rate": 4.350315789473684e-05,
"loss": 0.0,
"step": 118
},
{
"epoch": 0.018864933417882056,
"grad_norm": 6.304805719992146e-05,
"learning_rate": 4.2972631578947366e-05,
"loss": 0.0,
"step": 119
},
{
"epoch": 0.019023462270133164,
"grad_norm": 6.0336358728818595e-05,
"learning_rate": 4.244210526315789e-05,
"loss": 0.0,
"step": 120
},
{
"epoch": 0.019181991122384275,
"grad_norm": 5.7237852161051705e-05,
"learning_rate": 4.191157894736842e-05,
"loss": 0.0,
"step": 121
},
{
"epoch": 0.019340519974635383,
"grad_norm": 5.5000280553940684e-05,
"learning_rate": 4.138105263157895e-05,
"loss": 0.0,
"step": 122
},
{
"epoch": 0.019499048826886495,
"grad_norm": 6.333758938126266e-05,
"learning_rate": 4.085052631578947e-05,
"loss": 0.0,
"step": 123
},
{
"epoch": 0.019657577679137603,
"grad_norm": 6.331897748168558e-05,
"learning_rate": 4.032e-05,
"loss": 0.0,
"step": 124
},
{
"epoch": 0.019816106531388714,
"grad_norm": 5.774655437562615e-05,
"learning_rate": 3.978947368421052e-05,
"loss": 0.0,
"step": 125
},
{
"epoch": 0.019974635383639822,
"grad_norm": 5.671928738593124e-05,
"learning_rate": 3.925894736842105e-05,
"loss": 0.0,
"step": 126
},
{
"epoch": 0.020133164235890934,
"grad_norm": 6.89297157805413e-05,
"learning_rate": 3.8728421052631575e-05,
"loss": 0.0,
"step": 127
},
{
"epoch": 0.020291693088142042,
"grad_norm": 6.829660560470074e-05,
"learning_rate": 3.8197894736842105e-05,
"loss": 0.0,
"step": 128
},
{
"epoch": 0.02045022194039315,
"grad_norm": 5.680310641764663e-05,
"learning_rate": 3.766736842105263e-05,
"loss": 0.0,
"step": 129
},
{
"epoch": 0.02060875079264426,
"grad_norm": 4.718761192634702e-05,
"learning_rate": 3.713684210526316e-05,
"loss": 0.0,
"step": 130
},
{
"epoch": 0.02076727964489537,
"grad_norm": 5.037297523813322e-05,
"learning_rate": 3.660631578947369e-05,
"loss": 0.0,
"step": 131
},
{
"epoch": 0.02092580849714648,
"grad_norm": 5.18505803484004e-05,
"learning_rate": 3.607578947368421e-05,
"loss": 0.0,
"step": 132
},
{
"epoch": 0.02108433734939759,
"grad_norm": 5.0042519433191046e-05,
"learning_rate": 3.554526315789473e-05,
"loss": 0.0,
"step": 133
},
{
"epoch": 0.0212428662016487,
"grad_norm": 5.112058715894818e-05,
"learning_rate": 3.501473684210526e-05,
"loss": 0.0,
"step": 134
},
{
"epoch": 0.02140139505389981,
"grad_norm": 5.2210583817213774e-05,
"learning_rate": 3.448421052631579e-05,
"loss": 0.0,
"step": 135
},
{
"epoch": 0.02155992390615092,
"grad_norm": 5.472183329402469e-05,
"learning_rate": 3.3953684210526315e-05,
"loss": 0.0,
"step": 136
},
{
"epoch": 0.021718452758402028,
"grad_norm": 5.2417293773032725e-05,
"learning_rate": 3.342315789473684e-05,
"loss": 0.0,
"step": 137
},
{
"epoch": 0.02187698161065314,
"grad_norm": 4.7179757530102506e-05,
"learning_rate": 3.289263157894737e-05,
"loss": 0.0,
"step": 138
},
{
"epoch": 0.022035510462904247,
"grad_norm": 5.0374997954349965e-05,
"learning_rate": 3.23621052631579e-05,
"loss": 0.0,
"step": 139
},
{
"epoch": 0.02219403931515536,
"grad_norm": 5.104386946186423e-05,
"learning_rate": 3.183157894736842e-05,
"loss": 0.0,
"step": 140
},
{
"epoch": 0.022352568167406467,
"grad_norm": 4.212348358123563e-05,
"learning_rate": 3.130105263157895e-05,
"loss": 0.0,
"step": 141
},
{
"epoch": 0.02251109701965758,
"grad_norm": 6.773701898055151e-05,
"learning_rate": 3.077052631578947e-05,
"loss": 0.0,
"step": 142
},
{
"epoch": 0.022669625871908686,
"grad_norm": 4.5040869736112654e-05,
"learning_rate": 3.024e-05,
"loss": 0.0,
"step": 143
},
{
"epoch": 0.022828154724159798,
"grad_norm": 4.242918294039555e-05,
"learning_rate": 2.970947368421052e-05,
"loss": 0.0,
"step": 144
},
{
"epoch": 0.022986683576410906,
"grad_norm": 7.274608651641756e-05,
"learning_rate": 2.9178947368421054e-05,
"loss": 0.0,
"step": 145
},
{
"epoch": 0.023145212428662017,
"grad_norm": 7.292564259842038e-05,
"learning_rate": 2.8648421052631577e-05,
"loss": 0.0,
"step": 146
},
{
"epoch": 0.023303741280913125,
"grad_norm": 8.248071389971301e-05,
"learning_rate": 2.8117894736842103e-05,
"loss": 0.0,
"step": 147
},
{
"epoch": 0.023462270133164237,
"grad_norm": 7.596130308229476e-05,
"learning_rate": 2.7587368421052633e-05,
"loss": 0.0,
"step": 148
},
{
"epoch": 0.023620798985415345,
"grad_norm": 8.240217721322551e-05,
"learning_rate": 2.7056842105263156e-05,
"loss": 0.0,
"step": 149
},
{
"epoch": 0.023779327837666456,
"grad_norm": 0.0001047488913172856,
"learning_rate": 2.6526315789473682e-05,
"loss": 0.0,
"step": 150
},
{
"epoch": 0.023779327837666456,
"eval_loss": 0.003376348875463009,
"eval_runtime": 285.9072,
"eval_samples_per_second": 9.29,
"eval_steps_per_second": 2.322,
"step": 150
},
{
"epoch": 0.023937856689917564,
"grad_norm": 5.650425434112549,
"learning_rate": 2.599578947368421e-05,
"loss": 0.1783,
"step": 151
},
{
"epoch": 0.024096385542168676,
"grad_norm": 0.00012569170212373137,
"learning_rate": 2.5465263157894738e-05,
"loss": 0.0,
"step": 152
},
{
"epoch": 0.024254914394419784,
"grad_norm": 0.00018095099949277937,
"learning_rate": 2.493473684210526e-05,
"loss": 0.0,
"step": 153
},
{
"epoch": 0.024413443246670895,
"grad_norm": 0.0001806170621421188,
"learning_rate": 2.440421052631579e-05,
"loss": 0.0,
"step": 154
},
{
"epoch": 0.024571972098922003,
"grad_norm": 0.00021117427968420088,
"learning_rate": 2.3873684210526313e-05,
"loss": 0.0,
"step": 155
},
{
"epoch": 0.024730500951173115,
"grad_norm": 0.00030483287991955876,
"learning_rate": 2.3343157894736843e-05,
"loss": 0.0,
"step": 156
},
{
"epoch": 0.024889029803424223,
"grad_norm": 0.00029481080127879977,
"learning_rate": 2.281263157894737e-05,
"loss": 0.0,
"step": 157
},
{
"epoch": 0.025047558655675334,
"grad_norm": 0.0004884671652689576,
"learning_rate": 2.2282105263157892e-05,
"loss": 0.0,
"step": 158
},
{
"epoch": 0.025206087507926443,
"grad_norm": 0.0002942118444480002,
"learning_rate": 2.175157894736842e-05,
"loss": 0.0,
"step": 159
},
{
"epoch": 0.025364616360177554,
"grad_norm": 0.000287515576928854,
"learning_rate": 2.1221052631578944e-05,
"loss": 0.0,
"step": 160
},
{
"epoch": 0.025523145212428662,
"grad_norm": 0.00042692935676313937,
"learning_rate": 2.0690526315789474e-05,
"loss": 0.0,
"step": 161
},
{
"epoch": 0.02568167406467977,
"grad_norm": 0.0005435315542854369,
"learning_rate": 2.016e-05,
"loss": 0.0,
"step": 162
},
{
"epoch": 0.02584020291693088,
"grad_norm": 0.00035297736758366227,
"learning_rate": 1.9629473684210526e-05,
"loss": 0.0,
"step": 163
},
{
"epoch": 0.02599873176918199,
"grad_norm": 0.0005012881010770798,
"learning_rate": 1.9098947368421053e-05,
"loss": 0.0,
"step": 164
},
{
"epoch": 0.0261572606214331,
"grad_norm": 0.00044648078619502485,
"learning_rate": 1.856842105263158e-05,
"loss": 0.0,
"step": 165
},
{
"epoch": 0.02631578947368421,
"grad_norm": 0.0004886375973001122,
"learning_rate": 1.8037894736842105e-05,
"loss": 0.0,
"step": 166
},
{
"epoch": 0.02647431832593532,
"grad_norm": 0.0005168926436454058,
"learning_rate": 1.750736842105263e-05,
"loss": 0.0,
"step": 167
},
{
"epoch": 0.02663284717818643,
"grad_norm": 0.0004425595107022673,
"learning_rate": 1.6976842105263157e-05,
"loss": 0.0,
"step": 168
},
{
"epoch": 0.02679137603043754,
"grad_norm": 0.0005173716926947236,
"learning_rate": 1.6446315789473684e-05,
"loss": 0.0,
"step": 169
},
{
"epoch": 0.026949904882688648,
"grad_norm": 0.00048417490324936807,
"learning_rate": 1.591578947368421e-05,
"loss": 0.0,
"step": 170
},
{
"epoch": 0.02710843373493976,
"grad_norm": 0.0006713734474033117,
"learning_rate": 1.5385263157894736e-05,
"loss": 0.0,
"step": 171
},
{
"epoch": 0.027266962587190868,
"grad_norm": 0.0004995199851691723,
"learning_rate": 1.485473684210526e-05,
"loss": 0.0,
"step": 172
},
{
"epoch": 0.02742549143944198,
"grad_norm": 0.0004266517935320735,
"learning_rate": 1.4324210526315789e-05,
"loss": 0.0,
"step": 173
},
{
"epoch": 0.027584020291693087,
"grad_norm": 0.0004872908757533878,
"learning_rate": 1.3793684210526316e-05,
"loss": 0.0,
"step": 174
},
{
"epoch": 0.0277425491439442,
"grad_norm": 0.00042201511678285897,
"learning_rate": 1.3263157894736841e-05,
"loss": 0.0,
"step": 175
},
{
"epoch": 0.027901077996195307,
"grad_norm": 0.0004310183576308191,
"learning_rate": 1.2732631578947369e-05,
"loss": 0.0,
"step": 176
},
{
"epoch": 0.028059606848446418,
"grad_norm": 0.00046641906374134123,
"learning_rate": 1.2202105263157895e-05,
"loss": 0.0,
"step": 177
},
{
"epoch": 0.028218135700697526,
"grad_norm": 0.0004791380779352039,
"learning_rate": 1.1671578947368421e-05,
"loss": 0.0,
"step": 178
},
{
"epoch": 0.028376664552948638,
"grad_norm": 0.000661531463265419,
"learning_rate": 1.1141052631578946e-05,
"loss": 0.0,
"step": 179
},
{
"epoch": 0.028535193405199746,
"grad_norm": 0.0005801509832963347,
"learning_rate": 1.0610526315789472e-05,
"loss": 0.0,
"step": 180
},
{
"epoch": 0.028693722257450857,
"grad_norm": 0.00038415539893321693,
"learning_rate": 1.008e-05,
"loss": 0.0,
"step": 181
},
{
"epoch": 0.028852251109701965,
"grad_norm": 0.00040587177500128746,
"learning_rate": 9.549473684210526e-06,
"loss": 0.0,
"step": 182
},
{
"epoch": 0.029010779961953077,
"grad_norm": 0.00045143821625970304,
"learning_rate": 9.018947368421052e-06,
"loss": 0.0,
"step": 183
},
{
"epoch": 0.029169308814204185,
"grad_norm": 0.0005063486169092357,
"learning_rate": 8.488421052631579e-06,
"loss": 0.0,
"step": 184
},
{
"epoch": 0.029327837666455296,
"grad_norm": 0.0005101272254250944,
"learning_rate": 7.957894736842105e-06,
"loss": 0.0,
"step": 185
},
{
"epoch": 0.029486366518706404,
"grad_norm": 0.00046848724014125764,
"learning_rate": 7.42736842105263e-06,
"loss": 0.0,
"step": 186
},
{
"epoch": 0.029644895370957516,
"grad_norm": 0.000389144872315228,
"learning_rate": 6.896842105263158e-06,
"loss": 0.0,
"step": 187
},
{
"epoch": 0.029803424223208624,
"grad_norm": 0.00045255443546921015,
"learning_rate": 6.3663157894736845e-06,
"loss": 0.0,
"step": 188
},
{
"epoch": 0.029961953075459735,
"grad_norm": 0.00038248361670412123,
"learning_rate": 5.835789473684211e-06,
"loss": 0.0,
"step": 189
},
{
"epoch": 0.030120481927710843,
"grad_norm": 0.00043523870408535004,
"learning_rate": 5.305263157894736e-06,
"loss": 0.0,
"step": 190
},
{
"epoch": 0.030279010779961955,
"grad_norm": 0.00042310234857723117,
"learning_rate": 4.774736842105263e-06,
"loss": 0.0,
"step": 191
},
{
"epoch": 0.030437539632213063,
"grad_norm": 0.00040010226075537503,
"learning_rate": 4.244210526315789e-06,
"loss": 0.0,
"step": 192
},
{
"epoch": 0.030596068484464174,
"grad_norm": 0.0003844479797407985,
"learning_rate": 3.713684210526315e-06,
"loss": 0.0,
"step": 193
},
{
"epoch": 0.030754597336715282,
"grad_norm": 0.0004438844043761492,
"learning_rate": 3.1831578947368422e-06,
"loss": 0.0,
"step": 194
},
{
"epoch": 0.03091312618896639,
"grad_norm": 0.0004899434861727059,
"learning_rate": 2.652631578947368e-06,
"loss": 0.0,
"step": 195
},
{
"epoch": 0.031071655041217502,
"grad_norm": 0.0006046644994057715,
"learning_rate": 2.1221052631578947e-06,
"loss": 0.0,
"step": 196
},
{
"epoch": 0.03123018389346861,
"grad_norm": 0.0007566219428554177,
"learning_rate": 1.5915789473684211e-06,
"loss": 0.0,
"step": 197
},
{
"epoch": 0.03138871274571972,
"grad_norm": 0.0006015272228978574,
"learning_rate": 1.0610526315789473e-06,
"loss": 0.0,
"step": 198
},
{
"epoch": 0.03154724159797083,
"grad_norm": 0.0004844815412070602,
"learning_rate": 5.305263157894737e-07,
"loss": 0.0,
"step": 199
},
{
"epoch": 0.03170577045022194,
"grad_norm": 0.0004923184169456363,
"learning_rate": 0.0,
"loss": 0.0,
"step": 200
},
{
"epoch": 0.03170577045022194,
"eval_loss": 0.0022250961046665907,
"eval_runtime": 285.0318,
"eval_samples_per_second": 9.318,
"eval_steps_per_second": 2.33,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.162764252413952e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
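
A minimal sketch of how this state can be inspected, assuming it has been saved locally as trainer_state.json (the filename and path are assumptions, not stated above). It loads the Trainer state with the standard-library json module and prints the evaluation records from "log_history" together with the best metric and checkpoint recorded in this file.

import json

# Load the Trainer state dump shown above (path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Entries containing "eval_loss" are the evaluation records written every
# "eval_steps" (50) steps; the others are per-step training logs.
eval_records = [e for e in state["log_history"] if "eval_loss" in e]
for e in eval_records:
    print(f"step {e['step']:>3}: eval_loss = {e['eval_loss']:.6f}")

# Summary fields tracked by the Trainer itself.
print("best_metric:", state["best_metric"])
print("best_model_checkpoint:", state["best_model_checkpoint"])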