Training in progress, step 150, checkpoint
f37bc59 verified
{
"best_metric": 10.3283109664917,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 0.16703786191536749,
"eval_steps": 25,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011135857461024498,
"grad_norm": 0.021220365539193153,
"learning_rate": 2.9999999999999997e-05,
"loss": 10.3743,
"step": 1
},
{
"epoch": 0.0011135857461024498,
"eval_loss": 10.371726036071777,
"eval_runtime": 0.0507,
"eval_samples_per_second": 985.944,
"eval_steps_per_second": 39.438,
"step": 1
},
{
"epoch": 0.0022271714922048997,
"grad_norm": 0.02089685946702957,
"learning_rate": 5.9999999999999995e-05,
"loss": 10.3748,
"step": 2
},
{
"epoch": 0.0033407572383073497,
"grad_norm": 0.022434862330555916,
"learning_rate": 8.999999999999999e-05,
"loss": 10.3751,
"step": 3
},
{
"epoch": 0.004454342984409799,
"grad_norm": 0.022230561822652817,
"learning_rate": 0.00011999999999999999,
"loss": 10.3744,
"step": 4
},
{
"epoch": 0.005567928730512249,
"grad_norm": 0.022404590621590614,
"learning_rate": 0.00015,
"loss": 10.3725,
"step": 5
},
{
"epoch": 0.0066815144766146995,
"grad_norm": 0.023340720683336258,
"learning_rate": 0.00017999999999999998,
"loss": 10.3725,
"step": 6
},
{
"epoch": 0.0077951002227171495,
"grad_norm": 0.02542109414935112,
"learning_rate": 0.00020999999999999998,
"loss": 10.3724,
"step": 7
},
{
"epoch": 0.008908685968819599,
"grad_norm": 0.027613935992121696,
"learning_rate": 0.00023999999999999998,
"loss": 10.3733,
"step": 8
},
{
"epoch": 0.01002227171492205,
"grad_norm": 0.027721751481294632,
"learning_rate": 0.00027,
"loss": 10.3718,
"step": 9
},
{
"epoch": 0.011135857461024499,
"grad_norm": 0.031006429344415665,
"learning_rate": 0.0003,
"loss": 10.3713,
"step": 10
},
{
"epoch": 0.012249443207126948,
"grad_norm": 0.03272189199924469,
"learning_rate": 0.0002999794957488703,
"loss": 10.3694,
"step": 11
},
{
"epoch": 0.013363028953229399,
"grad_norm": 0.03870847821235657,
"learning_rate": 0.0002999179886011389,
"loss": 10.3682,
"step": 12
},
{
"epoch": 0.014476614699331848,
"grad_norm": 0.026634380221366882,
"learning_rate": 0.0002998154953722457,
"loss": 10.3716,
"step": 13
},
{
"epoch": 0.015590200445434299,
"grad_norm": 0.030647795647382736,
"learning_rate": 0.00029967204408281613,
"loss": 10.3724,
"step": 14
},
{
"epoch": 0.01670378619153675,
"grad_norm": 0.03259807825088501,
"learning_rate": 0.00029948767395100045,
"loss": 10.3729,
"step": 15
},
{
"epoch": 0.017817371937639197,
"grad_norm": 0.03251082822680473,
"learning_rate": 0.0002992624353817517,
"loss": 10.3723,
"step": 16
},
{
"epoch": 0.01893095768374165,
"grad_norm": 0.03374587371945381,
"learning_rate": 0.0002989963899530457,
"loss": 10.3709,
"step": 17
},
{
"epoch": 0.0200445434298441,
"grad_norm": 0.03536859527230263,
"learning_rate": 0.00029868961039904624,
"loss": 10.3695,
"step": 18
},
{
"epoch": 0.021158129175946547,
"grad_norm": 0.04094082862138748,
"learning_rate": 0.00029834218059022024,
"loss": 10.3676,
"step": 19
},
{
"epoch": 0.022271714922048998,
"grad_norm": 0.04486091434955597,
"learning_rate": 0.00029795419551040833,
"loss": 10.3681,
"step": 20
},
{
"epoch": 0.02338530066815145,
"grad_norm": 0.04667678475379944,
"learning_rate": 0.00029752576123085736,
"loss": 10.3666,
"step": 21
},
{
"epoch": 0.024498886414253896,
"grad_norm": 0.049042511731386185,
"learning_rate": 0.0002970569948812214,
"loss": 10.3668,
"step": 22
},
{
"epoch": 0.025612472160356347,
"grad_norm": 0.05766143649816513,
"learning_rate": 0.0002965480246175399,
"loss": 10.364,
"step": 23
},
{
"epoch": 0.026726057906458798,
"grad_norm": 0.063766248524189,
"learning_rate": 0.0002959989895872009,
"loss": 10.362,
"step": 24
},
{
"epoch": 0.02783964365256125,
"grad_norm": 0.06413163244724274,
"learning_rate": 0.0002954100398908995,
"loss": 10.3616,
"step": 25
},
{
"epoch": 0.02783964365256125,
"eval_loss": 10.362093925476074,
"eval_runtime": 0.0468,
"eval_samples_per_second": 1068.689,
"eval_steps_per_second": 42.748,
"step": 25
},
{
"epoch": 0.028953229398663696,
"grad_norm": 0.05297641456127167,
"learning_rate": 0.0002947813365416023,
"loss": 10.3688,
"step": 26
},
{
"epoch": 0.030066815144766147,
"grad_norm": 0.059603746980428696,
"learning_rate": 0.0002941130514205272,
"loss": 10.3657,
"step": 27
},
{
"epoch": 0.031180400890868598,
"grad_norm": 0.05739077180624008,
"learning_rate": 0.0002934053672301536,
"loss": 10.3648,
"step": 28
},
{
"epoch": 0.03229398663697105,
"grad_norm": 0.06212243810296059,
"learning_rate": 0.00029265847744427303,
"loss": 10.3629,
"step": 29
},
{
"epoch": 0.0334075723830735,
"grad_norm": 0.0610235333442688,
"learning_rate": 0.00029187258625509513,
"loss": 10.3626,
"step": 30
},
{
"epoch": 0.034521158129175944,
"grad_norm": 0.05962621048092842,
"learning_rate": 0.00029104790851742417,
"loss": 10.3607,
"step": 31
},
{
"epoch": 0.035634743875278395,
"grad_norm": 0.06733676046133041,
"learning_rate": 0.0002901846696899191,
"loss": 10.3592,
"step": 32
},
{
"epoch": 0.036748329621380846,
"grad_norm": 0.06607528775930405,
"learning_rate": 0.00028928310577345606,
"loss": 10.3575,
"step": 33
},
{
"epoch": 0.0378619153674833,
"grad_norm": 0.06120593473315239,
"learning_rate": 0.0002883434632466077,
"loss": 10.3574,
"step": 34
},
{
"epoch": 0.03897550111358575,
"grad_norm": 0.06261507421731949,
"learning_rate": 0.00028736599899825856,
"loss": 10.3548,
"step": 35
},
{
"epoch": 0.0400890868596882,
"grad_norm": 0.05739546939730644,
"learning_rate": 0.00028635098025737434,
"loss": 10.3533,
"step": 36
},
{
"epoch": 0.04120267260579064,
"grad_norm": 0.05693826824426651,
"learning_rate": 0.00028529868451994384,
"loss": 10.3503,
"step": 37
},
{
"epoch": 0.042316258351893093,
"grad_norm": 0.05451498553156853,
"learning_rate": 0.0002842093994731145,
"loss": 10.3581,
"step": 38
},
{
"epoch": 0.043429844097995544,
"grad_norm": 0.055710602551698685,
"learning_rate": 0.00028308342291654174,
"loss": 10.3566,
"step": 39
},
{
"epoch": 0.044543429844097995,
"grad_norm": 0.05021951347589493,
"learning_rate": 0.00028192106268097334,
"loss": 10.3548,
"step": 40
},
{
"epoch": 0.045657015590200446,
"grad_norm": 0.04118210822343826,
"learning_rate": 0.00028072263654409154,
"loss": 10.3563,
"step": 41
},
{
"epoch": 0.0467706013363029,
"grad_norm": 0.03563275188207626,
"learning_rate": 0.0002794884721436361,
"loss": 10.3531,
"step": 42
},
{
"epoch": 0.04788418708240535,
"grad_norm": 0.036302387714385986,
"learning_rate": 0.00027821890688783083,
"loss": 10.3524,
"step": 43
},
{
"epoch": 0.04899777282850779,
"grad_norm": 0.03217688947916031,
"learning_rate": 0.0002769142878631403,
"loss": 10.3522,
"step": 44
},
{
"epoch": 0.05011135857461024,
"grad_norm": 0.035605840384960175,
"learning_rate": 0.00027557497173937923,
"loss": 10.3514,
"step": 45
},
{
"epoch": 0.051224944320712694,
"grad_norm": 0.03646039217710495,
"learning_rate": 0.000274201324672203,
"loss": 10.3483,
"step": 46
},
{
"epoch": 0.052338530066815145,
"grad_norm": 0.03975389897823334,
"learning_rate": 0.00027279372220300385,
"loss": 10.3468,
"step": 47
},
{
"epoch": 0.053452115812917596,
"grad_norm": 0.04297318309545517,
"learning_rate": 0.0002713525491562421,
"loss": 10.3462,
"step": 48
},
{
"epoch": 0.05456570155902005,
"grad_norm": 0.05375898629426956,
"learning_rate": 0.00026987819953423867,
"loss": 10.3476,
"step": 49
},
{
"epoch": 0.0556792873051225,
"grad_norm": 0.06830257922410965,
"learning_rate": 0.00026837107640945905,
"loss": 10.3447,
"step": 50
},
{
"epoch": 0.0556792873051225,
"eval_loss": 10.347752571105957,
"eval_runtime": 0.0506,
"eval_samples_per_second": 988.519,
"eval_steps_per_second": 39.541,
"step": 50
},
{
"epoch": 0.05679287305122494,
"grad_norm": 0.036631014198064804,
"learning_rate": 0.0002668315918143169,
"loss": 10.3538,
"step": 51
},
{
"epoch": 0.05790645879732739,
"grad_norm": 0.03295717015862465,
"learning_rate": 0.00026526016662852886,
"loss": 10.3509,
"step": 52
},
{
"epoch": 0.05902004454342984,
"grad_norm": 0.033492058515548706,
"learning_rate": 0.00026365723046405023,
"loss": 10.351,
"step": 53
},
{
"epoch": 0.060133630289532294,
"grad_norm": 0.031195539981126785,
"learning_rate": 0.0002620232215476231,
"loss": 10.3488,
"step": 54
},
{
"epoch": 0.061247216035634745,
"grad_norm": 0.034528084099292755,
"learning_rate": 0.0002603585866009697,
"loss": 10.348,
"step": 55
},
{
"epoch": 0.062360801781737196,
"grad_norm": 0.03689737245440483,
"learning_rate": 0.00025866378071866334,
"loss": 10.3479,
"step": 56
},
{
"epoch": 0.06347438752783964,
"grad_norm": 0.040240008383989334,
"learning_rate": 0.00025693926724370956,
"loss": 10.346,
"step": 57
},
{
"epoch": 0.0645879732739421,
"grad_norm": 0.035155221819877625,
"learning_rate": 0.00025518551764087326,
"loss": 10.3445,
"step": 58
},
{
"epoch": 0.06570155902004454,
"grad_norm": 0.04302839934825897,
"learning_rate": 0.00025340301136778483,
"loss": 10.3426,
"step": 59
},
{
"epoch": 0.066815144766147,
"grad_norm": 0.04797535389661789,
"learning_rate": 0.00025159223574386114,
"loss": 10.3419,
"step": 60
},
{
"epoch": 0.06792873051224944,
"grad_norm": 0.05042591318488121,
"learning_rate": 0.0002497536858170772,
"loss": 10.3424,
"step": 61
},
{
"epoch": 0.06904231625835189,
"grad_norm": 0.06060624122619629,
"learning_rate": 0.00024788786422862526,
"loss": 10.3386,
"step": 62
},
{
"epoch": 0.07015590200445435,
"grad_norm": 0.03394079953432083,
"learning_rate": 0.00024599528107549745,
"loss": 10.3505,
"step": 63
},
{
"epoch": 0.07126948775055679,
"grad_norm": 0.03250078111886978,
"learning_rate": 0.00024407645377103054,
"loss": 10.3476,
"step": 64
},
{
"epoch": 0.07238307349665925,
"grad_norm": 0.03402571752667427,
"learning_rate": 0.00024213190690345018,
"loss": 10.3451,
"step": 65
},
{
"epoch": 0.07349665924276169,
"grad_norm": 0.03165813162922859,
"learning_rate": 0.00024016217209245374,
"loss": 10.3455,
"step": 66
},
{
"epoch": 0.07461024498886415,
"grad_norm": 0.033986710011959076,
"learning_rate": 0.00023816778784387094,
"loss": 10.3431,
"step": 67
},
{
"epoch": 0.0757238307349666,
"grad_norm": 0.03550755977630615,
"learning_rate": 0.0002361492994024415,
"loss": 10.3431,
"step": 68
},
{
"epoch": 0.07683741648106904,
"grad_norm": 0.03433714807033539,
"learning_rate": 0.0002341072586027509,
"loss": 10.3426,
"step": 69
},
{
"epoch": 0.0779510022271715,
"grad_norm": 0.03356612101197243,
"learning_rate": 0.00023204222371836405,
"loss": 10.3421,
"step": 70
},
{
"epoch": 0.07906458797327394,
"grad_norm": 0.03245285525918007,
"learning_rate": 0.00022995475930919905,
"loss": 10.3419,
"step": 71
},
{
"epoch": 0.0801781737193764,
"grad_norm": 0.035388920456171036,
"learning_rate": 0.00022784543606718227,
"loss": 10.3396,
"step": 72
},
{
"epoch": 0.08129175946547884,
"grad_norm": 0.038405828177928925,
"learning_rate": 0.00022571483066022657,
"loss": 10.3362,
"step": 73
},
{
"epoch": 0.08240534521158129,
"grad_norm": 0.04556138068437576,
"learning_rate": 0.0002235635255745762,
"loss": 10.3345,
"step": 74
},
{
"epoch": 0.08351893095768374,
"grad_norm": 0.05725998058915138,
"learning_rate": 0.00022139210895556104,
"loss": 10.3327,
"step": 75
},
{
"epoch": 0.08351893095768374,
"eval_loss": 10.338118553161621,
"eval_runtime": 0.0504,
"eval_samples_per_second": 991.693,
"eval_steps_per_second": 39.668,
"step": 75
},
{
"epoch": 0.08463251670378619,
"grad_norm": 0.03490576148033142,
"learning_rate": 0.00021920117444680317,
"loss": 10.3483,
"step": 76
},
{
"epoch": 0.08574610244988864,
"grad_norm": 0.03555111214518547,
"learning_rate": 0.00021699132102792097,
"loss": 10.3443,
"step": 77
},
{
"epoch": 0.08685968819599109,
"grad_norm": 0.0329761877655983,
"learning_rate": 0.0002147631528507739,
"loss": 10.3436,
"step": 78
},
{
"epoch": 0.08797327394209355,
"grad_norm": 0.03092610463500023,
"learning_rate": 0.00021251727907429355,
"loss": 10.3443,
"step": 79
},
{
"epoch": 0.08908685968819599,
"grad_norm": 0.03112075664103031,
"learning_rate": 0.0002102543136979454,
"loss": 10.3411,
"step": 80
},
{
"epoch": 0.09020044543429843,
"grad_norm": 0.028758404776453972,
"learning_rate": 0.0002079748753938678,
"loss": 10.3401,
"step": 81
},
{
"epoch": 0.09131403118040089,
"grad_norm": 0.023690655827522278,
"learning_rate": 0.0002056795873377331,
"loss": 10.3377,
"step": 82
},
{
"epoch": 0.09242761692650334,
"grad_norm": 0.02615417167544365,
"learning_rate": 0.00020336907703837748,
"loss": 10.339,
"step": 83
},
{
"epoch": 0.0935412026726058,
"grad_norm": 0.02577822096645832,
"learning_rate": 0.00020104397616624645,
"loss": 10.3357,
"step": 84
},
{
"epoch": 0.09465478841870824,
"grad_norm": 0.032399583607912064,
"learning_rate": 0.00019870492038070252,
"loss": 10.3349,
"step": 85
},
{
"epoch": 0.0957683741648107,
"grad_norm": 0.03822220861911774,
"learning_rate": 0.0001963525491562421,
"loss": 10.3314,
"step": 86
},
{
"epoch": 0.09688195991091314,
"grad_norm": 0.05163590610027313,
"learning_rate": 0.0001939875056076697,
"loss": 10.3299,
"step": 87
},
{
"epoch": 0.09799554565701558,
"grad_norm": 0.038402412086725235,
"learning_rate": 0.00019161043631427666,
"loss": 10.3455,
"step": 88
},
{
"epoch": 0.09910913140311804,
"grad_norm": 0.03179153427481651,
"learning_rate": 0.00018922199114307294,
"loss": 10.3438,
"step": 89
},
{
"epoch": 0.10022271714922049,
"grad_norm": 0.031388312578201294,
"learning_rate": 0.00018682282307111987,
"loss": 10.343,
"step": 90
},
{
"epoch": 0.10133630289532294,
"grad_norm": 0.025438381358981133,
"learning_rate": 0.00018441358800701273,
"loss": 10.3417,
"step": 91
},
{
"epoch": 0.10244988864142539,
"grad_norm": 0.025364622473716736,
"learning_rate": 0.00018199494461156203,
"loss": 10.3393,
"step": 92
},
{
"epoch": 0.10356347438752785,
"grad_norm": 0.024232415482401848,
"learning_rate": 0.000179567554117722,
"loss": 10.337,
"step": 93
},
{
"epoch": 0.10467706013363029,
"grad_norm": 0.028822433203458786,
"learning_rate": 0.00017713208014981648,
"loss": 10.3379,
"step": 94
},
{
"epoch": 0.10579064587973273,
"grad_norm": 0.022030413150787354,
"learning_rate": 0.00017468918854211007,
"loss": 10.337,
"step": 95
},
{
"epoch": 0.10690423162583519,
"grad_norm": 0.0319385826587677,
"learning_rate": 0.00017223954715677627,
"loss": 10.3355,
"step": 96
},
{
"epoch": 0.10801781737193764,
"grad_norm": 0.029396483674645424,
"learning_rate": 0.00016978382570131034,
"loss": 10.3319,
"step": 97
},
{
"epoch": 0.1091314031180401,
"grad_norm": 0.03641531616449356,
"learning_rate": 0.00016732269554543794,
"loss": 10.3319,
"step": 98
},
{
"epoch": 0.11024498886414254,
"grad_norm": 0.045224692672491074,
"learning_rate": 0.00016485682953756942,
"loss": 10.3314,
"step": 99
},
{
"epoch": 0.111358574610245,
"grad_norm": 0.05978056415915489,
"learning_rate": 0.00016238690182084986,
"loss": 10.3267,
"step": 100
},
{
"epoch": 0.111358574610245,
"eval_loss": 10.334311485290527,
"eval_runtime": 0.0448,
"eval_samples_per_second": 1116.397,
"eval_steps_per_second": 44.656,
"step": 100
},
{
"epoch": 0.11247216035634744,
"grad_norm": 0.04336141422390938,
"learning_rate": 0.0001599135876488549,
"loss": 10.3448,
"step": 101
},
{
"epoch": 0.11358574610244988,
"grad_norm": 0.029405318200588226,
"learning_rate": 0.00015743756320098332,
"loss": 10.3418,
"step": 102
},
{
"epoch": 0.11469933184855234,
"grad_norm": 0.024563252925872803,
"learning_rate": 0.0001549595053975962,
"loss": 10.3404,
"step": 103
},
{
"epoch": 0.11581291759465479,
"grad_norm": 0.032005004584789276,
"learning_rate": 0.00015248009171495378,
"loss": 10.3378,
"step": 104
},
{
"epoch": 0.11692650334075724,
"grad_norm": 0.027886446565389633,
"learning_rate": 0.00015,
"loss": 10.3384,
"step": 105
},
{
"epoch": 0.11804008908685969,
"grad_norm": 0.027392663061618805,
"learning_rate": 0.00014751990828504622,
"loss": 10.3359,
"step": 106
},
{
"epoch": 0.11915367483296214,
"grad_norm": 0.029057586565613747,
"learning_rate": 0.00014504049460240375,
"loss": 10.3357,
"step": 107
},
{
"epoch": 0.12026726057906459,
"grad_norm": 0.02602003701031208,
"learning_rate": 0.00014256243679901663,
"loss": 10.334,
"step": 108
},
{
"epoch": 0.12138084632516703,
"grad_norm": 0.025092778727412224,
"learning_rate": 0.00014008641235114508,
"loss": 10.3295,
"step": 109
},
{
"epoch": 0.12249443207126949,
"grad_norm": 0.0306704044342041,
"learning_rate": 0.00013761309817915014,
"loss": 10.3304,
"step": 110
},
{
"epoch": 0.12360801781737193,
"grad_norm": 0.03935825452208519,
"learning_rate": 0.00013514317046243058,
"loss": 10.3302,
"step": 111
},
{
"epoch": 0.12472160356347439,
"grad_norm": 0.05053841695189476,
"learning_rate": 0.00013267730445456208,
"loss": 10.3247,
"step": 112
},
{
"epoch": 0.12583518930957685,
"grad_norm": 0.03532646968960762,
"learning_rate": 0.00013021617429868963,
"loss": 10.3419,
"step": 113
},
{
"epoch": 0.12694877505567928,
"grad_norm": 0.03398346155881882,
"learning_rate": 0.00012776045284322368,
"loss": 10.341,
"step": 114
},
{
"epoch": 0.12806236080178174,
"grad_norm": 0.031128259375691414,
"learning_rate": 0.00012531081145788987,
"loss": 10.3399,
"step": 115
},
{
"epoch": 0.1291759465478842,
"grad_norm": 0.02975599467754364,
"learning_rate": 0.00012286791985018355,
"loss": 10.3378,
"step": 116
},
{
"epoch": 0.13028953229398663,
"grad_norm": 0.025660140439867973,
"learning_rate": 0.00012043244588227796,
"loss": 10.3369,
"step": 117
},
{
"epoch": 0.13140311804008908,
"grad_norm": 0.023060623556375504,
"learning_rate": 0.00011800505538843798,
"loss": 10.3345,
"step": 118
},
{
"epoch": 0.13251670378619154,
"grad_norm": 0.02799002081155777,
"learning_rate": 0.00011558641199298727,
"loss": 10.3328,
"step": 119
},
{
"epoch": 0.133630289532294,
"grad_norm": 0.024050775915384293,
"learning_rate": 0.00011317717692888012,
"loss": 10.3321,
"step": 120
},
{
"epoch": 0.13474387527839643,
"grad_norm": 0.02656245231628418,
"learning_rate": 0.00011077800885692702,
"loss": 10.3342,
"step": 121
},
{
"epoch": 0.1358574610244989,
"grad_norm": 0.02685682475566864,
"learning_rate": 0.00010838956368572334,
"loss": 10.3297,
"step": 122
},
{
"epoch": 0.13697104677060135,
"grad_norm": 0.03605301305651665,
"learning_rate": 0.0001060124943923303,
"loss": 10.3265,
"step": 123
},
{
"epoch": 0.13808463251670378,
"grad_norm": 0.04310872033238411,
"learning_rate": 0.0001036474508437579,
"loss": 10.325,
"step": 124
},
{
"epoch": 0.13919821826280623,
"grad_norm": 0.052310604602098465,
"learning_rate": 0.00010129507961929748,
"loss": 10.3216,
"step": 125
},
{
"epoch": 0.13919821826280623,
"eval_loss": 10.330706596374512,
"eval_runtime": 0.0484,
"eval_samples_per_second": 1033.941,
"eval_steps_per_second": 41.358,
"step": 125
},
{
"epoch": 0.1403118040089087,
"grad_norm": 0.03873632475733757,
"learning_rate": 9.895602383375353e-05,
"loss": 10.343,
"step": 126
},
{
"epoch": 0.14142538975501115,
"grad_norm": 0.03256813436746597,
"learning_rate": 9.663092296162251e-05,
"loss": 10.3388,
"step": 127
},
{
"epoch": 0.14253897550111358,
"grad_norm": 0.026527272537350655,
"learning_rate": 9.432041266226686e-05,
"loss": 10.3378,
"step": 128
},
{
"epoch": 0.14365256124721604,
"grad_norm": 0.022660735994577408,
"learning_rate": 9.202512460613219e-05,
"loss": 10.3365,
"step": 129
},
{
"epoch": 0.1447661469933185,
"grad_norm": 0.0257252287119627,
"learning_rate": 8.97456863020546e-05,
"loss": 10.3343,
"step": 130
},
{
"epoch": 0.14587973273942093,
"grad_norm": 0.02294372208416462,
"learning_rate": 8.748272092570646e-05,
"loss": 10.3345,
"step": 131
},
{
"epoch": 0.14699331848552338,
"grad_norm": 0.022150015458464622,
"learning_rate": 8.523684714922608e-05,
"loss": 10.33,
"step": 132
},
{
"epoch": 0.14810690423162584,
"grad_norm": 0.023904934525489807,
"learning_rate": 8.300867897207903e-05,
"loss": 10.3325,
"step": 133
},
{
"epoch": 0.1492204899777283,
"grad_norm": 0.025626808404922485,
"learning_rate": 8.079882555319684e-05,
"loss": 10.329,
"step": 134
},
{
"epoch": 0.15033407572383073,
"grad_norm": 0.031538087874650955,
"learning_rate": 7.860789104443896e-05,
"loss": 10.3266,
"step": 135
},
{
"epoch": 0.1514476614699332,
"grad_norm": 0.03154623880982399,
"learning_rate": 7.643647442542382e-05,
"loss": 10.3253,
"step": 136
},
{
"epoch": 0.15256124721603564,
"grad_norm": 0.04129767417907715,
"learning_rate": 7.428516933977347e-05,
"loss": 10.3231,
"step": 137
},
{
"epoch": 0.15367483296213807,
"grad_norm": 0.03668758645653725,
"learning_rate": 7.215456393281776e-05,
"loss": 10.3407,
"step": 138
},
{
"epoch": 0.15478841870824053,
"grad_norm": 0.03435783460736275,
"learning_rate": 7.004524069080096e-05,
"loss": 10.3408,
"step": 139
},
{
"epoch": 0.155902004454343,
"grad_norm": 0.03316853567957878,
"learning_rate": 6.795777628163599e-05,
"loss": 10.3389,
"step": 140
},
{
"epoch": 0.15701559020044542,
"grad_norm": 0.025575635954737663,
"learning_rate": 6.58927413972491e-05,
"loss": 10.3364,
"step": 141
},
{
"epoch": 0.15812917594654788,
"grad_norm": 0.021810244768857956,
"learning_rate": 6.385070059755846e-05,
"loss": 10.3354,
"step": 142
},
{
"epoch": 0.15924276169265034,
"grad_norm": 0.021209685131907463,
"learning_rate": 6.183221215612904e-05,
"loss": 10.3342,
"step": 143
},
{
"epoch": 0.1603563474387528,
"grad_norm": 0.02310410887002945,
"learning_rate": 5.983782790754623e-05,
"loss": 10.3325,
"step": 144
},
{
"epoch": 0.16146993318485522,
"grad_norm": 0.021886780858039856,
"learning_rate": 5.786809309654982e-05,
"loss": 10.3306,
"step": 145
},
{
"epoch": 0.16258351893095768,
"grad_norm": 0.025645460933446884,
"learning_rate": 5.592354622896944e-05,
"loss": 10.3291,
"step": 146
},
{
"epoch": 0.16369710467706014,
"grad_norm": 0.028998544439673424,
"learning_rate": 5.40047189245025e-05,
"loss": 10.3274,
"step": 147
},
{
"epoch": 0.16481069042316257,
"grad_norm": 0.03274320811033249,
"learning_rate": 5.211213577137469e-05,
"loss": 10.3251,
"step": 148
},
{
"epoch": 0.16592427616926503,
"grad_norm": 0.041816357523202896,
"learning_rate": 5.024631418292274e-05,
"loss": 10.3252,
"step": 149
},
{
"epoch": 0.16703786191536749,
"grad_norm": 0.05579761043190956,
"learning_rate": 4.840776425613886e-05,
"loss": 10.321,
"step": 150
},
{
"epoch": 0.16703786191536749,
"eval_loss": 10.3283109664917,
"eval_runtime": 0.0477,
"eval_samples_per_second": 1048.482,
"eval_steps_per_second": 41.939,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 69380829872128.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}