lesso's picture
Training in progress, step 200, checkpoint
7482937 verified
{
"best_metric": 0.8799543380737305,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.08983717012914093,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00044918585064570465,
"grad_norm": 1.2916557788848877,
"learning_rate": 1.0100000000000002e-05,
"loss": 1.7692,
"step": 1
},
{
"epoch": 0.00044918585064570465,
"eval_loss": 1.9869955778121948,
"eval_runtime": 205.6388,
"eval_samples_per_second": 145.863,
"eval_steps_per_second": 4.561,
"step": 1
},
{
"epoch": 0.0008983717012914093,
"grad_norm": 1.607176661491394,
"learning_rate": 2.0200000000000003e-05,
"loss": 1.8371,
"step": 2
},
{
"epoch": 0.001347557551937114,
"grad_norm": 1.799497365951538,
"learning_rate": 3.0299999999999998e-05,
"loss": 1.8926,
"step": 3
},
{
"epoch": 0.0017967434025828186,
"grad_norm": 1.7198957204818726,
"learning_rate": 4.0400000000000006e-05,
"loss": 1.9009,
"step": 4
},
{
"epoch": 0.0022459292532285235,
"grad_norm": 1.7114357948303223,
"learning_rate": 5.05e-05,
"loss": 1.8579,
"step": 5
},
{
"epoch": 0.002695115103874228,
"grad_norm": 1.4471302032470703,
"learning_rate": 6.0599999999999996e-05,
"loss": 1.6855,
"step": 6
},
{
"epoch": 0.0031443009545199328,
"grad_norm": 0.8383511304855347,
"learning_rate": 7.07e-05,
"loss": 1.4356,
"step": 7
},
{
"epoch": 0.003593486805165637,
"grad_norm": 0.8857292532920837,
"learning_rate": 8.080000000000001e-05,
"loss": 1.3539,
"step": 8
},
{
"epoch": 0.004042672655811342,
"grad_norm": 0.7878719568252563,
"learning_rate": 9.09e-05,
"loss": 1.2892,
"step": 9
},
{
"epoch": 0.004491858506457047,
"grad_norm": 0.8838219046592712,
"learning_rate": 0.000101,
"loss": 1.2533,
"step": 10
},
{
"epoch": 0.004941044357102751,
"grad_norm": 0.8204696774482727,
"learning_rate": 0.00010046842105263158,
"loss": 1.2432,
"step": 11
},
{
"epoch": 0.005390230207748456,
"grad_norm": 0.6225486397743225,
"learning_rate": 9.993684210526315e-05,
"loss": 1.184,
"step": 12
},
{
"epoch": 0.00583941605839416,
"grad_norm": 0.45047685503959656,
"learning_rate": 9.940526315789473e-05,
"loss": 1.1476,
"step": 13
},
{
"epoch": 0.0062886019090398655,
"grad_norm": 0.43591365218162537,
"learning_rate": 9.887368421052632e-05,
"loss": 1.1241,
"step": 14
},
{
"epoch": 0.00673778775968557,
"grad_norm": 0.39855462312698364,
"learning_rate": 9.83421052631579e-05,
"loss": 1.1202,
"step": 15
},
{
"epoch": 0.007186973610331274,
"grad_norm": 0.32935649156570435,
"learning_rate": 9.781052631578948e-05,
"loss": 1.0963,
"step": 16
},
{
"epoch": 0.007636159460976979,
"grad_norm": 0.3512268364429474,
"learning_rate": 9.727894736842106e-05,
"loss": 1.0864,
"step": 17
},
{
"epoch": 0.008085345311622683,
"grad_norm": 0.36069127917289734,
"learning_rate": 9.674736842105263e-05,
"loss": 1.0866,
"step": 18
},
{
"epoch": 0.00853453116226839,
"grad_norm": 0.3602248728275299,
"learning_rate": 9.621578947368421e-05,
"loss": 1.0199,
"step": 19
},
{
"epoch": 0.008983717012914094,
"grad_norm": 0.3629809021949768,
"learning_rate": 9.568421052631578e-05,
"loss": 1.0797,
"step": 20
},
{
"epoch": 0.009432902863559798,
"grad_norm": 0.3103269934654236,
"learning_rate": 9.515263157894737e-05,
"loss": 1.0546,
"step": 21
},
{
"epoch": 0.009882088714205503,
"grad_norm": 0.30035093426704407,
"learning_rate": 9.462105263157895e-05,
"loss": 1.0643,
"step": 22
},
{
"epoch": 0.010331274564851207,
"grad_norm": 0.3492095172405243,
"learning_rate": 9.408947368421054e-05,
"loss": 1.0258,
"step": 23
},
{
"epoch": 0.010780460415496912,
"grad_norm": 0.4093751609325409,
"learning_rate": 9.355789473684211e-05,
"loss": 1.0746,
"step": 24
},
{
"epoch": 0.011229646266142616,
"grad_norm": 0.5422231554985046,
"learning_rate": 9.302631578947369e-05,
"loss": 0.975,
"step": 25
},
{
"epoch": 0.01167883211678832,
"grad_norm": 0.344453364610672,
"learning_rate": 9.249473684210526e-05,
"loss": 1.0594,
"step": 26
},
{
"epoch": 0.012128017967434027,
"grad_norm": 0.3349956274032593,
"learning_rate": 9.196315789473685e-05,
"loss": 1.0283,
"step": 27
},
{
"epoch": 0.012577203818079731,
"grad_norm": 0.30147019028663635,
"learning_rate": 9.143157894736843e-05,
"loss": 1.0116,
"step": 28
},
{
"epoch": 0.013026389668725435,
"grad_norm": 0.25005072355270386,
"learning_rate": 9.09e-05,
"loss": 0.9857,
"step": 29
},
{
"epoch": 0.01347557551937114,
"grad_norm": 0.262613981962204,
"learning_rate": 9.036842105263158e-05,
"loss": 1.0245,
"step": 30
},
{
"epoch": 0.013924761370016844,
"grad_norm": 0.35343340039253235,
"learning_rate": 8.983684210526316e-05,
"loss": 0.967,
"step": 31
},
{
"epoch": 0.014373947220662549,
"grad_norm": 0.24998489022254944,
"learning_rate": 8.930526315789474e-05,
"loss": 1.0017,
"step": 32
},
{
"epoch": 0.014823133071308253,
"grad_norm": 0.24498069286346436,
"learning_rate": 8.877368421052632e-05,
"loss": 1.0075,
"step": 33
},
{
"epoch": 0.015272318921953958,
"grad_norm": 0.21549324691295624,
"learning_rate": 8.82421052631579e-05,
"loss": 0.985,
"step": 34
},
{
"epoch": 0.015721504772599662,
"grad_norm": 0.22830170392990112,
"learning_rate": 8.771052631578948e-05,
"loss": 0.9629,
"step": 35
},
{
"epoch": 0.016170690623245366,
"grad_norm": 0.2677782475948334,
"learning_rate": 8.717894736842105e-05,
"loss": 1.014,
"step": 36
},
{
"epoch": 0.01661987647389107,
"grad_norm": 0.30911344289779663,
"learning_rate": 8.664736842105263e-05,
"loss": 0.9741,
"step": 37
},
{
"epoch": 0.01706906232453678,
"grad_norm": 0.26794004440307617,
"learning_rate": 8.61157894736842e-05,
"loss": 0.972,
"step": 38
},
{
"epoch": 0.017518248175182483,
"grad_norm": 0.26757925748825073,
"learning_rate": 8.55842105263158e-05,
"loss": 1.0005,
"step": 39
},
{
"epoch": 0.017967434025828188,
"grad_norm": 0.23241755366325378,
"learning_rate": 8.505263157894737e-05,
"loss": 0.9912,
"step": 40
},
{
"epoch": 0.018416619876473892,
"grad_norm": 0.23415440320968628,
"learning_rate": 8.452105263157896e-05,
"loss": 0.96,
"step": 41
},
{
"epoch": 0.018865805727119597,
"grad_norm": 0.2970597445964813,
"learning_rate": 8.398947368421053e-05,
"loss": 1.0017,
"step": 42
},
{
"epoch": 0.0193149915777653,
"grad_norm": 0.3233031630516052,
"learning_rate": 8.345789473684211e-05,
"loss": 0.9603,
"step": 43
},
{
"epoch": 0.019764177428411005,
"grad_norm": 0.25598403811454773,
"learning_rate": 8.292631578947368e-05,
"loss": 0.9232,
"step": 44
},
{
"epoch": 0.02021336327905671,
"grad_norm": 0.28324705362319946,
"learning_rate": 8.239473684210526e-05,
"loss": 0.9762,
"step": 45
},
{
"epoch": 0.020662549129702414,
"grad_norm": 0.2578172981739044,
"learning_rate": 8.186315789473683e-05,
"loss": 0.9924,
"step": 46
},
{
"epoch": 0.02111173498034812,
"grad_norm": 0.23937109112739563,
"learning_rate": 8.133157894736842e-05,
"loss": 0.9744,
"step": 47
},
{
"epoch": 0.021560920830993823,
"grad_norm": 0.2387576401233673,
"learning_rate": 8.080000000000001e-05,
"loss": 0.9448,
"step": 48
},
{
"epoch": 0.022010106681639528,
"grad_norm": 0.3055512309074402,
"learning_rate": 8.026842105263159e-05,
"loss": 0.9624,
"step": 49
},
{
"epoch": 0.022459292532285232,
"grad_norm": 0.3693564534187317,
"learning_rate": 7.973684210526316e-05,
"loss": 0.8829,
"step": 50
},
{
"epoch": 0.022459292532285232,
"eval_loss": 0.9505019783973694,
"eval_runtime": 206.986,
"eval_samples_per_second": 144.913,
"eval_steps_per_second": 4.532,
"step": 50
},
{
"epoch": 0.022908478382930936,
"grad_norm": 0.27806970477104187,
"learning_rate": 7.920526315789474e-05,
"loss": 0.9497,
"step": 51
},
{
"epoch": 0.02335766423357664,
"grad_norm": 0.25498324632644653,
"learning_rate": 7.867368421052631e-05,
"loss": 0.9515,
"step": 52
},
{
"epoch": 0.023806850084222345,
"grad_norm": 0.2593478560447693,
"learning_rate": 7.814210526315789e-05,
"loss": 0.9594,
"step": 53
},
{
"epoch": 0.024256035934868053,
"grad_norm": 0.2518002688884735,
"learning_rate": 7.761052631578946e-05,
"loss": 0.9459,
"step": 54
},
{
"epoch": 0.024705221785513758,
"grad_norm": 0.26930755376815796,
"learning_rate": 7.707894736842105e-05,
"loss": 0.9266,
"step": 55
},
{
"epoch": 0.025154407636159462,
"grad_norm": 0.3588949143886566,
"learning_rate": 7.654736842105264e-05,
"loss": 0.9222,
"step": 56
},
{
"epoch": 0.025603593486805167,
"grad_norm": 0.2470785528421402,
"learning_rate": 7.601578947368422e-05,
"loss": 0.943,
"step": 57
},
{
"epoch": 0.02605277933745087,
"grad_norm": 0.24494914710521698,
"learning_rate": 7.548421052631579e-05,
"loss": 0.9628,
"step": 58
},
{
"epoch": 0.026501965188096575,
"grad_norm": 0.23059231042861938,
"learning_rate": 7.495263157894737e-05,
"loss": 0.9444,
"step": 59
},
{
"epoch": 0.02695115103874228,
"grad_norm": 0.2332954704761505,
"learning_rate": 7.442105263157894e-05,
"loss": 0.9433,
"step": 60
},
{
"epoch": 0.027400336889387984,
"grad_norm": 0.2612130045890808,
"learning_rate": 7.388947368421053e-05,
"loss": 0.9485,
"step": 61
},
{
"epoch": 0.02784952274003369,
"grad_norm": 0.3084039092063904,
"learning_rate": 7.335789473684211e-05,
"loss": 0.9346,
"step": 62
},
{
"epoch": 0.028298708590679393,
"grad_norm": 0.23960231244564056,
"learning_rate": 7.282631578947368e-05,
"loss": 0.955,
"step": 63
},
{
"epoch": 0.028747894441325098,
"grad_norm": 0.2437230795621872,
"learning_rate": 7.229473684210527e-05,
"loss": 0.9372,
"step": 64
},
{
"epoch": 0.029197080291970802,
"grad_norm": 0.2259586751461029,
"learning_rate": 7.176315789473685e-05,
"loss": 0.9379,
"step": 65
},
{
"epoch": 0.029646266142616506,
"grad_norm": 0.2495633065700531,
"learning_rate": 7.123157894736842e-05,
"loss": 0.9357,
"step": 66
},
{
"epoch": 0.03009545199326221,
"grad_norm": 0.27990928292274475,
"learning_rate": 7.07e-05,
"loss": 0.9611,
"step": 67
},
{
"epoch": 0.030544637843907915,
"grad_norm": 0.28341051936149597,
"learning_rate": 7.016842105263159e-05,
"loss": 0.9106,
"step": 68
},
{
"epoch": 0.030993823694553623,
"grad_norm": 0.26214614510536194,
"learning_rate": 6.963684210526316e-05,
"loss": 0.9311,
"step": 69
},
{
"epoch": 0.031443009545199324,
"grad_norm": 0.30308669805526733,
"learning_rate": 6.910526315789474e-05,
"loss": 0.94,
"step": 70
},
{
"epoch": 0.03189219539584503,
"grad_norm": 0.2610565423965454,
"learning_rate": 6.857368421052631e-05,
"loss": 0.9241,
"step": 71
},
{
"epoch": 0.03234138124649073,
"grad_norm": 0.23850691318511963,
"learning_rate": 6.80421052631579e-05,
"loss": 0.9422,
"step": 72
},
{
"epoch": 0.03279056709713644,
"grad_norm": 0.28376665711402893,
"learning_rate": 6.751052631578948e-05,
"loss": 0.9415,
"step": 73
},
{
"epoch": 0.03323975294778214,
"grad_norm": 0.34418389201164246,
"learning_rate": 6.697894736842105e-05,
"loss": 0.9373,
"step": 74
},
{
"epoch": 0.033688938798427846,
"grad_norm": 0.4302990734577179,
"learning_rate": 6.644736842105264e-05,
"loss": 0.8997,
"step": 75
},
{
"epoch": 0.03413812464907356,
"grad_norm": 0.2855401933193207,
"learning_rate": 6.591578947368422e-05,
"loss": 0.945,
"step": 76
},
{
"epoch": 0.03458731049971926,
"grad_norm": 0.3258453607559204,
"learning_rate": 6.538421052631579e-05,
"loss": 0.933,
"step": 77
},
{
"epoch": 0.035036496350364967,
"grad_norm": 0.3375171422958374,
"learning_rate": 6.485263157894737e-05,
"loss": 0.9364,
"step": 78
},
{
"epoch": 0.03548568220101067,
"grad_norm": 0.259318083524704,
"learning_rate": 6.432105263157894e-05,
"loss": 0.8854,
"step": 79
},
{
"epoch": 0.035934868051656375,
"grad_norm": 0.28955981135368347,
"learning_rate": 6.378947368421053e-05,
"loss": 0.9347,
"step": 80
},
{
"epoch": 0.03638405390230208,
"grad_norm": 0.4210306406021118,
"learning_rate": 6.32578947368421e-05,
"loss": 0.8895,
"step": 81
},
{
"epoch": 0.036833239752947784,
"grad_norm": 0.22932107746601105,
"learning_rate": 6.27263157894737e-05,
"loss": 0.9466,
"step": 82
},
{
"epoch": 0.03728242560359349,
"grad_norm": 0.22917746007442474,
"learning_rate": 6.219473684210527e-05,
"loss": 0.9374,
"step": 83
},
{
"epoch": 0.03773161145423919,
"grad_norm": 0.25457313656806946,
"learning_rate": 6.166315789473685e-05,
"loss": 0.9387,
"step": 84
},
{
"epoch": 0.0381807973048849,
"grad_norm": 0.2656180262565613,
"learning_rate": 6.113157894736842e-05,
"loss": 0.9287,
"step": 85
},
{
"epoch": 0.0386299831555306,
"grad_norm": 0.26856109499931335,
"learning_rate": 6.0599999999999996e-05,
"loss": 0.9036,
"step": 86
},
{
"epoch": 0.039079169006176306,
"grad_norm": 0.30407968163490295,
"learning_rate": 6.006842105263158e-05,
"loss": 0.8876,
"step": 87
},
{
"epoch": 0.03952835485682201,
"grad_norm": 0.2707090675830841,
"learning_rate": 5.953684210526315e-05,
"loss": 0.9105,
"step": 88
},
{
"epoch": 0.039977540707467715,
"grad_norm": 0.26852765679359436,
"learning_rate": 5.900526315789474e-05,
"loss": 0.9667,
"step": 89
},
{
"epoch": 0.04042672655811342,
"grad_norm": 0.23827865719795227,
"learning_rate": 5.847368421052632e-05,
"loss": 0.9466,
"step": 90
},
{
"epoch": 0.040875912408759124,
"grad_norm": 0.25658079981803894,
"learning_rate": 5.79421052631579e-05,
"loss": 0.912,
"step": 91
},
{
"epoch": 0.04132509825940483,
"grad_norm": 0.28569671511650085,
"learning_rate": 5.7410526315789475e-05,
"loss": 0.9243,
"step": 92
},
{
"epoch": 0.04177428411005053,
"grad_norm": 0.3140634298324585,
"learning_rate": 5.687894736842105e-05,
"loss": 0.9092,
"step": 93
},
{
"epoch": 0.04222346996069624,
"grad_norm": 0.26119399070739746,
"learning_rate": 5.6347368421052625e-05,
"loss": 0.8836,
"step": 94
},
{
"epoch": 0.04267265581134194,
"grad_norm": 0.2906387448310852,
"learning_rate": 5.5815789473684214e-05,
"loss": 0.9251,
"step": 95
},
{
"epoch": 0.043121841661987646,
"grad_norm": 0.2718961834907532,
"learning_rate": 5.5284210526315796e-05,
"loss": 0.9486,
"step": 96
},
{
"epoch": 0.04357102751263335,
"grad_norm": 0.2514139711856842,
"learning_rate": 5.475263157894737e-05,
"loss": 0.906,
"step": 97
},
{
"epoch": 0.044020213363279055,
"grad_norm": 0.26873457431793213,
"learning_rate": 5.422105263157895e-05,
"loss": 0.9222,
"step": 98
},
{
"epoch": 0.04446939921392476,
"grad_norm": 0.3117465376853943,
"learning_rate": 5.368947368421053e-05,
"loss": 0.897,
"step": 99
},
{
"epoch": 0.044918585064570464,
"grad_norm": 0.4226572513580322,
"learning_rate": 5.3157894736842104e-05,
"loss": 0.847,
"step": 100
},
{
"epoch": 0.044918585064570464,
"eval_loss": 0.9066126942634583,
"eval_runtime": 207.0827,
"eval_samples_per_second": 144.846,
"eval_steps_per_second": 4.53,
"step": 100
},
{
"epoch": 0.04536777091521617,
"grad_norm": 0.25195518136024475,
"learning_rate": 5.262631578947368e-05,
"loss": 0.9138,
"step": 101
},
{
"epoch": 0.04581695676586187,
"grad_norm": 0.2436404973268509,
"learning_rate": 5.209473684210527e-05,
"loss": 0.9067,
"step": 102
},
{
"epoch": 0.04626614261650758,
"grad_norm": 0.27484673261642456,
"learning_rate": 5.1563157894736844e-05,
"loss": 0.9343,
"step": 103
},
{
"epoch": 0.04671532846715328,
"grad_norm": 0.27864891290664673,
"learning_rate": 5.1031578947368426e-05,
"loss": 0.9083,
"step": 104
},
{
"epoch": 0.047164514317798986,
"grad_norm": 0.28357595205307007,
"learning_rate": 5.05e-05,
"loss": 0.8889,
"step": 105
},
{
"epoch": 0.04761370016844469,
"grad_norm": 0.3293287754058838,
"learning_rate": 4.9968421052631576e-05,
"loss": 0.8996,
"step": 106
},
{
"epoch": 0.0480628860190904,
"grad_norm": 0.25651612877845764,
"learning_rate": 4.943684210526316e-05,
"loss": 0.9268,
"step": 107
},
{
"epoch": 0.048512071869736106,
"grad_norm": 0.26456665992736816,
"learning_rate": 4.890526315789474e-05,
"loss": 0.9306,
"step": 108
},
{
"epoch": 0.04896125772038181,
"grad_norm": 0.24413970112800598,
"learning_rate": 4.8373684210526316e-05,
"loss": 0.9318,
"step": 109
},
{
"epoch": 0.049410443571027515,
"grad_norm": 0.24325355887413025,
"learning_rate": 4.784210526315789e-05,
"loss": 0.9073,
"step": 110
},
{
"epoch": 0.04985962942167322,
"grad_norm": 0.2712045907974243,
"learning_rate": 4.731052631578947e-05,
"loss": 0.8931,
"step": 111
},
{
"epoch": 0.050308815272318924,
"grad_norm": 0.3078647255897522,
"learning_rate": 4.6778947368421055e-05,
"loss": 0.8614,
"step": 112
},
{
"epoch": 0.05075800112296463,
"grad_norm": 0.2627927362918854,
"learning_rate": 4.624736842105263e-05,
"loss": 0.8967,
"step": 113
},
{
"epoch": 0.05120718697361033,
"grad_norm": 0.25500932335853577,
"learning_rate": 4.571578947368421e-05,
"loss": 0.9088,
"step": 114
},
{
"epoch": 0.05165637282425604,
"grad_norm": 0.275991827249527,
"learning_rate": 4.518421052631579e-05,
"loss": 0.8913,
"step": 115
},
{
"epoch": 0.05210555867490174,
"grad_norm": 0.2581343352794647,
"learning_rate": 4.465263157894737e-05,
"loss": 0.8865,
"step": 116
},
{
"epoch": 0.052554744525547446,
"grad_norm": 0.2798708975315094,
"learning_rate": 4.412105263157895e-05,
"loss": 0.9146,
"step": 117
},
{
"epoch": 0.05300393037619315,
"grad_norm": 0.3520914316177368,
"learning_rate": 4.358947368421053e-05,
"loss": 0.9002,
"step": 118
},
{
"epoch": 0.053453116226838855,
"grad_norm": 0.3063805401325226,
"learning_rate": 4.30578947368421e-05,
"loss": 0.8889,
"step": 119
},
{
"epoch": 0.05390230207748456,
"grad_norm": 0.268318772315979,
"learning_rate": 4.2526315789473685e-05,
"loss": 0.9306,
"step": 120
},
{
"epoch": 0.054351487928130264,
"grad_norm": 0.2569490075111389,
"learning_rate": 4.199473684210527e-05,
"loss": 0.9124,
"step": 121
},
{
"epoch": 0.05480067377877597,
"grad_norm": 0.24538756906986237,
"learning_rate": 4.146315789473684e-05,
"loss": 0.911,
"step": 122
},
{
"epoch": 0.05524985962942167,
"grad_norm": 0.2484789937734604,
"learning_rate": 4.093157894736842e-05,
"loss": 0.9066,
"step": 123
},
{
"epoch": 0.05569904548006738,
"grad_norm": 0.2707575261592865,
"learning_rate": 4.0400000000000006e-05,
"loss": 0.8681,
"step": 124
},
{
"epoch": 0.05614823133071308,
"grad_norm": 0.35909488797187805,
"learning_rate": 3.986842105263158e-05,
"loss": 0.8218,
"step": 125
},
{
"epoch": 0.056597417181358786,
"grad_norm": 0.23776692152023315,
"learning_rate": 3.933684210526316e-05,
"loss": 0.9211,
"step": 126
},
{
"epoch": 0.05704660303200449,
"grad_norm": 0.25755029916763306,
"learning_rate": 3.880526315789473e-05,
"loss": 0.8879,
"step": 127
},
{
"epoch": 0.057495788882650195,
"grad_norm": 0.2494814097881317,
"learning_rate": 3.827368421052632e-05,
"loss": 0.8906,
"step": 128
},
{
"epoch": 0.0579449747332959,
"grad_norm": 0.25940045714378357,
"learning_rate": 3.7742105263157896e-05,
"loss": 0.8868,
"step": 129
},
{
"epoch": 0.058394160583941604,
"grad_norm": 0.2853882908821106,
"learning_rate": 3.721052631578947e-05,
"loss": 0.8775,
"step": 130
},
{
"epoch": 0.05884334643458731,
"grad_norm": 0.32979917526245117,
"learning_rate": 3.6678947368421054e-05,
"loss": 0.8522,
"step": 131
},
{
"epoch": 0.05929253228523301,
"grad_norm": 0.255938321352005,
"learning_rate": 3.6147368421052636e-05,
"loss": 0.9059,
"step": 132
},
{
"epoch": 0.05974171813587872,
"grad_norm": 0.2526702284812927,
"learning_rate": 3.561578947368421e-05,
"loss": 0.9105,
"step": 133
},
{
"epoch": 0.06019090398652442,
"grad_norm": 0.2506902813911438,
"learning_rate": 3.508421052631579e-05,
"loss": 0.8951,
"step": 134
},
{
"epoch": 0.060640089837170126,
"grad_norm": 0.2672176957130432,
"learning_rate": 3.455263157894737e-05,
"loss": 0.8967,
"step": 135
},
{
"epoch": 0.06108927568781583,
"grad_norm": 0.2774716913700104,
"learning_rate": 3.402105263157895e-05,
"loss": 0.8891,
"step": 136
},
{
"epoch": 0.06153846153846154,
"grad_norm": 0.3068337142467499,
"learning_rate": 3.3489473684210526e-05,
"loss": 0.8719,
"step": 137
},
{
"epoch": 0.061987647389107246,
"grad_norm": 0.2592477798461914,
"learning_rate": 3.295789473684211e-05,
"loss": 0.9152,
"step": 138
},
{
"epoch": 0.06243683323975295,
"grad_norm": 0.24649055302143097,
"learning_rate": 3.242631578947368e-05,
"loss": 0.8869,
"step": 139
},
{
"epoch": 0.06288601909039865,
"grad_norm": 0.24601422250270844,
"learning_rate": 3.1894736842105265e-05,
"loss": 0.9147,
"step": 140
},
{
"epoch": 0.06333520494104436,
"grad_norm": 0.2531537115573883,
"learning_rate": 3.136315789473685e-05,
"loss": 0.9067,
"step": 141
},
{
"epoch": 0.06378439079169006,
"grad_norm": 0.27577677369117737,
"learning_rate": 3.083157894736842e-05,
"loss": 0.8926,
"step": 142
},
{
"epoch": 0.06423357664233577,
"grad_norm": 0.3049788475036621,
"learning_rate": 3.0299999999999998e-05,
"loss": 0.8536,
"step": 143
},
{
"epoch": 0.06468276249298147,
"grad_norm": 0.26886892318725586,
"learning_rate": 2.9768421052631577e-05,
"loss": 0.8746,
"step": 144
},
{
"epoch": 0.06513194834362718,
"grad_norm": 0.2674404978752136,
"learning_rate": 2.923684210526316e-05,
"loss": 0.8991,
"step": 145
},
{
"epoch": 0.06558113419427287,
"grad_norm": 0.2553546726703644,
"learning_rate": 2.8705263157894737e-05,
"loss": 0.8974,
"step": 146
},
{
"epoch": 0.06603032004491859,
"grad_norm": 0.2580017149448395,
"learning_rate": 2.8173684210526313e-05,
"loss": 0.8854,
"step": 147
},
{
"epoch": 0.06647950589556428,
"grad_norm": 0.27426302433013916,
"learning_rate": 2.7642105263157898e-05,
"loss": 0.8885,
"step": 148
},
{
"epoch": 0.06692869174621,
"grad_norm": 0.3027507960796356,
"learning_rate": 2.7110526315789473e-05,
"loss": 0.8973,
"step": 149
},
{
"epoch": 0.06737787759685569,
"grad_norm": 0.36818036437034607,
"learning_rate": 2.6578947368421052e-05,
"loss": 0.8283,
"step": 150
},
{
"epoch": 0.06737787759685569,
"eval_loss": 0.8881184458732605,
"eval_runtime": 207.2757,
"eval_samples_per_second": 144.711,
"eval_steps_per_second": 4.525,
"step": 150
},
{
"epoch": 0.0678270634475014,
"grad_norm": 0.245889350771904,
"learning_rate": 2.6047368421052634e-05,
"loss": 0.9066,
"step": 151
},
{
"epoch": 0.06827624929814712,
"grad_norm": 0.24444212019443512,
"learning_rate": 2.5515789473684213e-05,
"loss": 0.9163,
"step": 152
},
{
"epoch": 0.06872543514879281,
"grad_norm": 0.24982236325740814,
"learning_rate": 2.4984210526315788e-05,
"loss": 0.8824,
"step": 153
},
{
"epoch": 0.06917462099943852,
"grad_norm": 0.2683698534965515,
"learning_rate": 2.445263157894737e-05,
"loss": 0.8958,
"step": 154
},
{
"epoch": 0.06962380685008422,
"grad_norm": 0.27906811237335205,
"learning_rate": 2.3921052631578946e-05,
"loss": 0.8996,
"step": 155
},
{
"epoch": 0.07007299270072993,
"grad_norm": 0.3476739823818207,
"learning_rate": 2.3389473684210528e-05,
"loss": 0.8366,
"step": 156
},
{
"epoch": 0.07052217855137563,
"grad_norm": 0.26626768708229065,
"learning_rate": 2.2857894736842106e-05,
"loss": 0.9163,
"step": 157
},
{
"epoch": 0.07097136440202134,
"grad_norm": 0.26524534821510315,
"learning_rate": 2.2326315789473685e-05,
"loss": 0.895,
"step": 158
},
{
"epoch": 0.07142055025266704,
"grad_norm": 0.26299136877059937,
"learning_rate": 2.1794736842105264e-05,
"loss": 0.8973,
"step": 159
},
{
"epoch": 0.07186973610331275,
"grad_norm": 0.2615616023540497,
"learning_rate": 2.1263157894736842e-05,
"loss": 0.901,
"step": 160
},
{
"epoch": 0.07231892195395845,
"grad_norm": 0.2764776945114136,
"learning_rate": 2.073157894736842e-05,
"loss": 0.8718,
"step": 161
},
{
"epoch": 0.07276810780460416,
"grad_norm": 0.30812153220176697,
"learning_rate": 2.0200000000000003e-05,
"loss": 0.877,
"step": 162
},
{
"epoch": 0.07321729365524986,
"grad_norm": 0.2510615289211273,
"learning_rate": 1.966842105263158e-05,
"loss": 0.8739,
"step": 163
},
{
"epoch": 0.07366647950589557,
"grad_norm": 0.24000753462314606,
"learning_rate": 1.913684210526316e-05,
"loss": 0.913,
"step": 164
},
{
"epoch": 0.07411566535654127,
"grad_norm": 0.24215757846832275,
"learning_rate": 1.8605263157894736e-05,
"loss": 0.9063,
"step": 165
},
{
"epoch": 0.07456485120718698,
"grad_norm": 0.26448169350624084,
"learning_rate": 1.8073684210526318e-05,
"loss": 0.8805,
"step": 166
},
{
"epoch": 0.07501403705783267,
"grad_norm": 0.2861745357513428,
"learning_rate": 1.7542105263157897e-05,
"loss": 0.9049,
"step": 167
},
{
"epoch": 0.07546322290847839,
"grad_norm": 0.30888983607292175,
"learning_rate": 1.7010526315789475e-05,
"loss": 0.867,
"step": 168
},
{
"epoch": 0.07591240875912408,
"grad_norm": 0.27784985303878784,
"learning_rate": 1.6478947368421054e-05,
"loss": 0.8749,
"step": 169
},
{
"epoch": 0.0763615946097698,
"grad_norm": 0.2495727688074112,
"learning_rate": 1.5947368421052633e-05,
"loss": 0.8959,
"step": 170
},
{
"epoch": 0.07681078046041549,
"grad_norm": 0.24338717758655548,
"learning_rate": 1.541578947368421e-05,
"loss": 0.8821,
"step": 171
},
{
"epoch": 0.0772599663110612,
"grad_norm": 0.25039950013160706,
"learning_rate": 1.4884210526315788e-05,
"loss": 0.8733,
"step": 172
},
{
"epoch": 0.0777091521617069,
"grad_norm": 0.27508071064949036,
"learning_rate": 1.4352631578947369e-05,
"loss": 0.9056,
"step": 173
},
{
"epoch": 0.07815833801235261,
"grad_norm": 0.2863399386405945,
"learning_rate": 1.3821052631578949e-05,
"loss": 0.8748,
"step": 174
},
{
"epoch": 0.07860752386299831,
"grad_norm": 0.3646789491176605,
"learning_rate": 1.3289473684210526e-05,
"loss": 0.8383,
"step": 175
},
{
"epoch": 0.07905670971364402,
"grad_norm": 0.2623049020767212,
"learning_rate": 1.2757894736842106e-05,
"loss": 0.8969,
"step": 176
},
{
"epoch": 0.07950589556428972,
"grad_norm": 0.26813623309135437,
"learning_rate": 1.2226315789473685e-05,
"loss": 0.899,
"step": 177
},
{
"epoch": 0.07995508141493543,
"grad_norm": 0.2505576014518738,
"learning_rate": 1.1694736842105264e-05,
"loss": 0.8629,
"step": 178
},
{
"epoch": 0.08040426726558113,
"grad_norm": 0.25836649537086487,
"learning_rate": 1.1163157894736842e-05,
"loss": 0.8929,
"step": 179
},
{
"epoch": 0.08085345311622684,
"grad_norm": 0.27238643169403076,
"learning_rate": 1.0631578947368421e-05,
"loss": 0.8732,
"step": 180
},
{
"epoch": 0.08130263896687254,
"grad_norm": 0.33109569549560547,
"learning_rate": 1.0100000000000002e-05,
"loss": 0.8555,
"step": 181
},
{
"epoch": 0.08175182481751825,
"grad_norm": 0.24706503748893738,
"learning_rate": 9.56842105263158e-06,
"loss": 0.8922,
"step": 182
},
{
"epoch": 0.08220101066816396,
"grad_norm": 0.24443064630031586,
"learning_rate": 9.036842105263159e-06,
"loss": 0.9011,
"step": 183
},
{
"epoch": 0.08265019651880966,
"grad_norm": 0.24768005311489105,
"learning_rate": 8.505263157894738e-06,
"loss": 0.91,
"step": 184
},
{
"epoch": 0.08309938236945537,
"grad_norm": 0.2456647902727127,
"learning_rate": 7.973684210526316e-06,
"loss": 0.8728,
"step": 185
},
{
"epoch": 0.08354856822010107,
"grad_norm": 0.2784600555896759,
"learning_rate": 7.442105263157894e-06,
"loss": 0.8891,
"step": 186
},
{
"epoch": 0.08399775407074678,
"grad_norm": 0.3673495948314667,
"learning_rate": 6.9105263157894745e-06,
"loss": 0.8639,
"step": 187
},
{
"epoch": 0.08444693992139247,
"grad_norm": 0.253378301858902,
"learning_rate": 6.378947368421053e-06,
"loss": 0.8923,
"step": 188
},
{
"epoch": 0.08489612577203819,
"grad_norm": 0.23032832145690918,
"learning_rate": 5.847368421052632e-06,
"loss": 0.9097,
"step": 189
},
{
"epoch": 0.08534531162268388,
"grad_norm": 0.24500040709972382,
"learning_rate": 5.315789473684211e-06,
"loss": 0.9037,
"step": 190
},
{
"epoch": 0.0857944974733296,
"grad_norm": 0.2614372670650482,
"learning_rate": 4.78421052631579e-06,
"loss": 0.9011,
"step": 191
},
{
"epoch": 0.08624368332397529,
"grad_norm": 0.2819826900959015,
"learning_rate": 4.252631578947369e-06,
"loss": 0.8751,
"step": 192
},
{
"epoch": 0.086692869174621,
"grad_norm": 0.29293060302734375,
"learning_rate": 3.721052631578947e-06,
"loss": 0.8822,
"step": 193
},
{
"epoch": 0.0871420550252667,
"grad_norm": 0.2739028036594391,
"learning_rate": 3.1894736842105266e-06,
"loss": 0.8568,
"step": 194
},
{
"epoch": 0.08759124087591241,
"grad_norm": 0.23401287198066711,
"learning_rate": 2.6578947368421053e-06,
"loss": 0.8958,
"step": 195
},
{
"epoch": 0.08804042672655811,
"grad_norm": 0.23881946504116058,
"learning_rate": 2.1263157894736844e-06,
"loss": 0.8922,
"step": 196
},
{
"epoch": 0.08848961257720382,
"grad_norm": 0.2490690052509308,
"learning_rate": 1.5947368421052633e-06,
"loss": 0.8816,
"step": 197
},
{
"epoch": 0.08893879842784952,
"grad_norm": 0.26390817761421204,
"learning_rate": 1.0631578947368422e-06,
"loss": 0.8778,
"step": 198
},
{
"epoch": 0.08938798427849523,
"grad_norm": 0.27074772119522095,
"learning_rate": 5.315789473684211e-07,
"loss": 0.8248,
"step": 199
},
{
"epoch": 0.08983717012914093,
"grad_norm": 0.3729373812675476,
"learning_rate": 0.0,
"loss": 0.8088,
"step": 200
},
{
"epoch": 0.08983717012914093,
"eval_loss": 0.8799543380737305,
"eval_runtime": 207.3734,
"eval_samples_per_second": 144.642,
"eval_steps_per_second": 4.523,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1532281667584e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}