Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
yfliao's picture
Model save
98cf691 verified
raw
history blame
228 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9974102848686646,
"eval_steps": 100,
"global_step": 1011,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0029596744358120607,
"grad_norm": 2.681946039199829,
"learning_rate": 1.9607843137254904e-07,
"loss": 1.0892,
"mean_token_accuracy": 0.7134666713588034,
"step": 1
},
{
"epoch": 0.0059193488716241215,
"grad_norm": 2.483736515045166,
"learning_rate": 3.921568627450981e-07,
"loss": 1.0859,
"mean_token_accuracy": 0.7130540900903558,
"step": 2
},
{
"epoch": 0.008879023307436182,
"grad_norm": 2.5574872493743896,
"learning_rate": 5.882352941176471e-07,
"loss": 1.1083,
"mean_token_accuracy": 0.7057264272951731,
"step": 3
},
{
"epoch": 0.011838697743248243,
"grad_norm": 2.592397689819336,
"learning_rate": 7.843137254901962e-07,
"loss": 1.1251,
"mean_token_accuracy": 0.70204062618997,
"step": 4
},
{
"epoch": 0.014798372179060304,
"grad_norm": 2.5958452224731445,
"learning_rate": 9.80392156862745e-07,
"loss": 1.0616,
"mean_token_accuracy": 0.7201201840956424,
"step": 5
},
{
"epoch": 0.017758046614872364,
"grad_norm": 2.527214765548706,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.1498,
"mean_token_accuracy": 0.6991107921462223,
"step": 6
},
{
"epoch": 0.020717721050684423,
"grad_norm": 2.453611135482788,
"learning_rate": 1.3725490196078434e-06,
"loss": 1.0692,
"mean_token_accuracy": 0.7185075890374791,
"step": 7
},
{
"epoch": 0.023677395486496486,
"grad_norm": 2.2676663398742676,
"learning_rate": 1.5686274509803923e-06,
"loss": 1.1027,
"mean_token_accuracy": 0.7096105664418749,
"step": 8
},
{
"epoch": 0.026637069922308545,
"grad_norm": 2.4191880226135254,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.1374,
"mean_token_accuracy": 0.7004450719322626,
"step": 9
},
{
"epoch": 0.029596744358120607,
"grad_norm": 2.2810451984405518,
"learning_rate": 1.96078431372549e-06,
"loss": 1.0701,
"mean_token_accuracy": 0.7192182703502579,
"step": 10
},
{
"epoch": 0.032556418793932666,
"grad_norm": 2.047187566757202,
"learning_rate": 2.1568627450980393e-06,
"loss": 1.0692,
"mean_token_accuracy": 0.7168684606703121,
"step": 11
},
{
"epoch": 0.03551609322974473,
"grad_norm": 1.9986836910247803,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.0591,
"mean_token_accuracy": 0.7179799919846566,
"step": 12
},
{
"epoch": 0.03847576766555679,
"grad_norm": 1.9848605394363403,
"learning_rate": 2.549019607843137e-06,
"loss": 1.0592,
"mean_token_accuracy": 0.7186164399688223,
"step": 13
},
{
"epoch": 0.04143544210136885,
"grad_norm": 1.7683581113815308,
"learning_rate": 2.7450980392156867e-06,
"loss": 1.0286,
"mean_token_accuracy": 0.7263637707391479,
"step": 14
},
{
"epoch": 0.04439511653718091,
"grad_norm": 1.4327510595321655,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.0502,
"mean_token_accuracy": 0.718260961897349,
"step": 15
},
{
"epoch": 0.04735479097299297,
"grad_norm": 1.4091436862945557,
"learning_rate": 3.1372549019607846e-06,
"loss": 1.0816,
"mean_token_accuracy": 0.7076378775080614,
"step": 16
},
{
"epoch": 0.050314465408805034,
"grad_norm": 1.3194211721420288,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0302,
"mean_token_accuracy": 0.7220054724166985,
"step": 17
},
{
"epoch": 0.05327413984461709,
"grad_norm": 1.2913936376571655,
"learning_rate": 3.529411764705883e-06,
"loss": 1.0676,
"mean_token_accuracy": 0.7113759820986945,
"step": 18
},
{
"epoch": 0.05623381428042915,
"grad_norm": 1.236266016960144,
"learning_rate": 3.7254901960784316e-06,
"loss": 1.0571,
"mean_token_accuracy": 0.7136546795764988,
"step": 19
},
{
"epoch": 0.059193488716241215,
"grad_norm": 1.1931370496749878,
"learning_rate": 3.92156862745098e-06,
"loss": 0.9917,
"mean_token_accuracy": 0.7271706923102303,
"step": 20
},
{
"epoch": 0.06215316315205328,
"grad_norm": 1.3087286949157715,
"learning_rate": 4.11764705882353e-06,
"loss": 1.0021,
"mean_token_accuracy": 0.7239365954438801,
"step": 21
},
{
"epoch": 0.06511283758786533,
"grad_norm": 1.2562185525894165,
"learning_rate": 4.313725490196079e-06,
"loss": 0.983,
"mean_token_accuracy": 0.7273888578305255,
"step": 22
},
{
"epoch": 0.0680725120236774,
"grad_norm": 1.1378827095031738,
"learning_rate": 4.509803921568628e-06,
"loss": 0.9578,
"mean_token_accuracy": 0.7362632636857523,
"step": 23
},
{
"epoch": 0.07103218645948946,
"grad_norm": 1.0568324327468872,
"learning_rate": 4.705882352941177e-06,
"loss": 0.9564,
"mean_token_accuracy": 0.7346627849009933,
"step": 24
},
{
"epoch": 0.07399186089530152,
"grad_norm": 0.9209612011909485,
"learning_rate": 4.901960784313726e-06,
"loss": 0.9808,
"mean_token_accuracy": 0.7272476674555969,
"step": 25
},
{
"epoch": 0.07695153533111358,
"grad_norm": 0.8665790557861328,
"learning_rate": 5.098039215686274e-06,
"loss": 1.0003,
"mean_token_accuracy": 0.7212588502719087,
"step": 26
},
{
"epoch": 0.07991120976692564,
"grad_norm": 0.8994502425193787,
"learning_rate": 5.294117647058824e-06,
"loss": 0.9476,
"mean_token_accuracy": 0.7335574894521832,
"step": 27
},
{
"epoch": 0.0828708842027377,
"grad_norm": 1.0448633432388306,
"learning_rate": 5.4901960784313735e-06,
"loss": 0.9464,
"mean_token_accuracy": 0.7334208114703,
"step": 28
},
{
"epoch": 0.08583055863854976,
"grad_norm": 0.9871032238006592,
"learning_rate": 5.686274509803922e-06,
"loss": 0.9505,
"mean_token_accuracy": 0.732524444705358,
"step": 29
},
{
"epoch": 0.08879023307436182,
"grad_norm": 0.9244782328605652,
"learning_rate": 5.882352941176471e-06,
"loss": 0.9369,
"mean_token_accuracy": 0.7377869549204231,
"step": 30
},
{
"epoch": 0.09174990751017388,
"grad_norm": 0.8495871424674988,
"learning_rate": 6.07843137254902e-06,
"loss": 0.9632,
"mean_token_accuracy": 0.7259763433334542,
"step": 31
},
{
"epoch": 0.09470958194598594,
"grad_norm": 0.730097770690918,
"learning_rate": 6.274509803921569e-06,
"loss": 0.8828,
"mean_token_accuracy": 0.7483362451357691,
"step": 32
},
{
"epoch": 0.097669256381798,
"grad_norm": 0.7470875382423401,
"learning_rate": 6.470588235294119e-06,
"loss": 0.9185,
"mean_token_accuracy": 0.7392471457849514,
"step": 33
},
{
"epoch": 0.10062893081761007,
"grad_norm": 0.730536162853241,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8883,
"mean_token_accuracy": 0.7495266641186222,
"step": 34
},
{
"epoch": 0.10358860525342212,
"grad_norm": 0.6699507832527161,
"learning_rate": 6.862745098039216e-06,
"loss": 0.857,
"mean_token_accuracy": 0.7532634065825189,
"step": 35
},
{
"epoch": 0.10654827968923418,
"grad_norm": 0.6172248721122742,
"learning_rate": 7.058823529411766e-06,
"loss": 0.8762,
"mean_token_accuracy": 0.749338820444233,
"step": 36
},
{
"epoch": 0.10950795412504624,
"grad_norm": 0.6268398761749268,
"learning_rate": 7.2549019607843145e-06,
"loss": 0.8679,
"mean_token_accuracy": 0.7519748968716043,
"step": 37
},
{
"epoch": 0.1124676285608583,
"grad_norm": 0.610349178314209,
"learning_rate": 7.450980392156863e-06,
"loss": 0.8855,
"mean_token_accuracy": 0.7472919453079274,
"step": 38
},
{
"epoch": 0.11542730299667037,
"grad_norm": 0.604537308216095,
"learning_rate": 7.647058823529411e-06,
"loss": 0.8499,
"mean_token_accuracy": 0.7552782022394232,
"step": 39
},
{
"epoch": 0.11838697743248243,
"grad_norm": 0.609111487865448,
"learning_rate": 7.84313725490196e-06,
"loss": 0.8822,
"mean_token_accuracy": 0.746562312628656,
"step": 40
},
{
"epoch": 0.12134665186829449,
"grad_norm": 0.5899158716201782,
"learning_rate": 8.03921568627451e-06,
"loss": 0.8811,
"mean_token_accuracy": 0.7473791695126712,
"step": 41
},
{
"epoch": 0.12430632630410655,
"grad_norm": 0.6210097670555115,
"learning_rate": 8.23529411764706e-06,
"loss": 0.8833,
"mean_token_accuracy": 0.7444836846534346,
"step": 42
},
{
"epoch": 0.12726600073991862,
"grad_norm": 0.600689709186554,
"learning_rate": 8.43137254901961e-06,
"loss": 0.8318,
"mean_token_accuracy": 0.7609372507118015,
"step": 43
},
{
"epoch": 0.13022567517573067,
"grad_norm": 0.5491411685943604,
"learning_rate": 8.627450980392157e-06,
"loss": 0.8631,
"mean_token_accuracy": 0.750162132080428,
"step": 44
},
{
"epoch": 0.13318534961154274,
"grad_norm": 0.5706349611282349,
"learning_rate": 8.823529411764707e-06,
"loss": 0.8782,
"mean_token_accuracy": 0.7451601161887986,
"step": 45
},
{
"epoch": 0.1361450240473548,
"grad_norm": 0.5555650591850281,
"learning_rate": 9.019607843137256e-06,
"loss": 0.823,
"mean_token_accuracy": 0.7618301473100519,
"step": 46
},
{
"epoch": 0.13910469848316684,
"grad_norm": 0.5772121548652649,
"learning_rate": 9.215686274509804e-06,
"loss": 0.828,
"mean_token_accuracy": 0.7588256411868824,
"step": 47
},
{
"epoch": 0.14206437291897892,
"grad_norm": 0.611781895160675,
"learning_rate": 9.411764705882354e-06,
"loss": 0.8425,
"mean_token_accuracy": 0.7546703623296309,
"step": 48
},
{
"epoch": 0.14502404735479096,
"grad_norm": 0.5700849294662476,
"learning_rate": 9.607843137254903e-06,
"loss": 0.8695,
"mean_token_accuracy": 0.7466177841712535,
"step": 49
},
{
"epoch": 0.14798372179060304,
"grad_norm": 0.5548747777938843,
"learning_rate": 9.803921568627451e-06,
"loss": 0.8548,
"mean_token_accuracy": 0.7508958491076401,
"step": 50
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.5233455300331116,
"learning_rate": 1e-05,
"loss": 0.8778,
"mean_token_accuracy": 0.7444452874755125,
"step": 51
},
{
"epoch": 0.15390307066222716,
"grad_norm": 0.567051112651825,
"learning_rate": 1.0196078431372549e-05,
"loss": 0.8213,
"mean_token_accuracy": 0.7609767092967284,
"step": 52
},
{
"epoch": 0.1568627450980392,
"grad_norm": 0.5394188165664673,
"learning_rate": 1.03921568627451e-05,
"loss": 0.8661,
"mean_token_accuracy": 0.7484568076496121,
"step": 53
},
{
"epoch": 0.1598224195338513,
"grad_norm": 0.5241853594779968,
"learning_rate": 1.0588235294117648e-05,
"loss": 0.8621,
"mean_token_accuracy": 0.7480956260768654,
"step": 54
},
{
"epoch": 0.16278209396966334,
"grad_norm": 0.48302915692329407,
"learning_rate": 1.0784313725490196e-05,
"loss": 0.8101,
"mean_token_accuracy": 0.7638810794013436,
"step": 55
},
{
"epoch": 0.1657417684054754,
"grad_norm": 0.5048951506614685,
"learning_rate": 1.0980392156862747e-05,
"loss": 0.8164,
"mean_token_accuracy": 0.7611000331453143,
"step": 56
},
{
"epoch": 0.16870144284128746,
"grad_norm": 0.5220761299133301,
"learning_rate": 1.1176470588235295e-05,
"loss": 0.8382,
"mean_token_accuracy": 0.7542881093651161,
"step": 57
},
{
"epoch": 0.1716611172770995,
"grad_norm": 0.5163182020187378,
"learning_rate": 1.1372549019607844e-05,
"loss": 0.845,
"mean_token_accuracy": 0.7544678776426703,
"step": 58
},
{
"epoch": 0.1746207917129116,
"grad_norm": 0.5414546132087708,
"learning_rate": 1.1568627450980394e-05,
"loss": 0.8115,
"mean_token_accuracy": 0.763602548207208,
"step": 59
},
{
"epoch": 0.17758046614872364,
"grad_norm": 0.49731120467185974,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.8498,
"mean_token_accuracy": 0.7513782211298353,
"step": 60
},
{
"epoch": 0.1805401405845357,
"grad_norm": 0.48450183868408203,
"learning_rate": 1.1960784313725491e-05,
"loss": 0.8112,
"mean_token_accuracy": 0.760378165872515,
"step": 61
},
{
"epoch": 0.18349981502034776,
"grad_norm": 0.5090157985687256,
"learning_rate": 1.215686274509804e-05,
"loss": 0.8352,
"mean_token_accuracy": 0.7544511398898393,
"step": 62
},
{
"epoch": 0.1864594894561598,
"grad_norm": 0.5094890594482422,
"learning_rate": 1.235294117647059e-05,
"loss": 0.8169,
"mean_token_accuracy": 0.7596972963469578,
"step": 63
},
{
"epoch": 0.1894191638919719,
"grad_norm": 0.5052422881126404,
"learning_rate": 1.2549019607843138e-05,
"loss": 0.8397,
"mean_token_accuracy": 0.7528146247402845,
"step": 64
},
{
"epoch": 0.19237883832778394,
"grad_norm": 0.48801887035369873,
"learning_rate": 1.2745098039215686e-05,
"loss": 0.7911,
"mean_token_accuracy": 0.7666436131483815,
"step": 65
},
{
"epoch": 0.195338512763596,
"grad_norm": 0.49707359075546265,
"learning_rate": 1.2941176470588238e-05,
"loss": 0.8311,
"mean_token_accuracy": 0.7534919777308312,
"step": 66
},
{
"epoch": 0.19829818719940806,
"grad_norm": 0.47678443789482117,
"learning_rate": 1.3137254901960785e-05,
"loss": 0.7908,
"mean_token_accuracy": 0.7675227128959651,
"step": 67
},
{
"epoch": 0.20125786163522014,
"grad_norm": 0.5108245611190796,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.8136,
"mean_token_accuracy": 0.7605165307209668,
"step": 68
},
{
"epoch": 0.20421753607103219,
"grad_norm": 0.5529371500015259,
"learning_rate": 1.3529411764705885e-05,
"loss": 0.8289,
"mean_token_accuracy": 0.7556330264225892,
"step": 69
},
{
"epoch": 0.20717721050684423,
"grad_norm": 0.48820486664772034,
"learning_rate": 1.3725490196078432e-05,
"loss": 0.8322,
"mean_token_accuracy": 0.7555734646050257,
"step": 70
},
{
"epoch": 0.2101368849426563,
"grad_norm": 0.4998631775379181,
"learning_rate": 1.392156862745098e-05,
"loss": 0.7757,
"mean_token_accuracy": 0.7707598691626208,
"step": 71
},
{
"epoch": 0.21309655937846836,
"grad_norm": 0.5397401452064514,
"learning_rate": 1.4117647058823532e-05,
"loss": 0.8136,
"mean_token_accuracy": 0.7605449945205573,
"step": 72
},
{
"epoch": 0.21605623381428044,
"grad_norm": 0.5162031054496765,
"learning_rate": 1.431372549019608e-05,
"loss": 0.7805,
"mean_token_accuracy": 0.7688441270861772,
"step": 73
},
{
"epoch": 0.21901590825009248,
"grad_norm": 0.4769732654094696,
"learning_rate": 1.4509803921568629e-05,
"loss": 0.8062,
"mean_token_accuracy": 0.7610474880611428,
"step": 74
},
{
"epoch": 0.22197558268590456,
"grad_norm": 0.48078039288520813,
"learning_rate": 1.4705882352941179e-05,
"loss": 0.8152,
"mean_token_accuracy": 0.7588509310402451,
"step": 75
},
{
"epoch": 0.2249352571217166,
"grad_norm": 0.48076578974723816,
"learning_rate": 1.4901960784313726e-05,
"loss": 0.7886,
"mean_token_accuracy": 0.7669702001266795,
"step": 76
},
{
"epoch": 0.22789493155752868,
"grad_norm": 0.524426281452179,
"learning_rate": 1.5098039215686276e-05,
"loss": 0.7958,
"mean_token_accuracy": 0.7644518143592102,
"step": 77
},
{
"epoch": 0.23085460599334073,
"grad_norm": 0.48478269577026367,
"learning_rate": 1.5294117647058822e-05,
"loss": 0.822,
"mean_token_accuracy": 0.7575506383899827,
"step": 78
},
{
"epoch": 0.23381428042915278,
"grad_norm": 0.49773070216178894,
"learning_rate": 1.5490196078431373e-05,
"loss": 0.8007,
"mean_token_accuracy": 0.7629923994057785,
"step": 79
},
{
"epoch": 0.23677395486496486,
"grad_norm": 0.5387545228004456,
"learning_rate": 1.568627450980392e-05,
"loss": 0.8225,
"mean_token_accuracy": 0.7566505741674857,
"step": 80
},
{
"epoch": 0.2397336293007769,
"grad_norm": 0.4855351448059082,
"learning_rate": 1.5882352941176473e-05,
"loss": 0.775,
"mean_token_accuracy": 0.769850506922079,
"step": 81
},
{
"epoch": 0.24269330373658898,
"grad_norm": 0.47540611028671265,
"learning_rate": 1.607843137254902e-05,
"loss": 0.7937,
"mean_token_accuracy": 0.7641365526868825,
"step": 82
},
{
"epoch": 0.24565297817240103,
"grad_norm": 0.48479974269866943,
"learning_rate": 1.627450980392157e-05,
"loss": 0.8315,
"mean_token_accuracy": 0.7560415146119046,
"step": 83
},
{
"epoch": 0.2486126526082131,
"grad_norm": 0.5490248203277588,
"learning_rate": 1.647058823529412e-05,
"loss": 0.8276,
"mean_token_accuracy": 0.7542041203825852,
"step": 84
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.4909403920173645,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.8113,
"mean_token_accuracy": 0.7590975144970054,
"step": 85
},
{
"epoch": 0.25453200147983723,
"grad_norm": 0.47584831714630127,
"learning_rate": 1.686274509803922e-05,
"loss": 0.764,
"mean_token_accuracy": 0.7724685847938628,
"step": 86
},
{
"epoch": 0.25749167591564925,
"grad_norm": 0.49695855379104614,
"learning_rate": 1.7058823529411767e-05,
"loss": 0.7542,
"mean_token_accuracy": 0.7775013855023025,
"step": 87
},
{
"epoch": 0.26045135035146133,
"grad_norm": 0.5099871754646301,
"learning_rate": 1.7254901960784314e-05,
"loss": 0.7644,
"mean_token_accuracy": 0.7725364928369027,
"step": 88
},
{
"epoch": 0.2634110247872734,
"grad_norm": 0.5371332764625549,
"learning_rate": 1.7450980392156866e-05,
"loss": 0.8248,
"mean_token_accuracy": 0.7555675225937974,
"step": 89
},
{
"epoch": 0.2663706992230855,
"grad_norm": 0.5191521048545837,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.8008,
"mean_token_accuracy": 0.7618285114849587,
"step": 90
},
{
"epoch": 0.2693303736588975,
"grad_norm": 0.5234159231185913,
"learning_rate": 1.7843137254901965e-05,
"loss": 0.8007,
"mean_token_accuracy": 0.7619624657540706,
"step": 91
},
{
"epoch": 0.2722900480947096,
"grad_norm": 0.5274977087974548,
"learning_rate": 1.8039215686274513e-05,
"loss": 0.8176,
"mean_token_accuracy": 0.7581840756170707,
"step": 92
},
{
"epoch": 0.27524972253052166,
"grad_norm": 0.5195613503456116,
"learning_rate": 1.823529411764706e-05,
"loss": 0.7421,
"mean_token_accuracy": 0.7779025499948702,
"step": 93
},
{
"epoch": 0.2782093969663337,
"grad_norm": 0.5123000741004944,
"learning_rate": 1.843137254901961e-05,
"loss": 0.7924,
"mean_token_accuracy": 0.7655979691874065,
"step": 94
},
{
"epoch": 0.28116907140214575,
"grad_norm": 0.5142971277236938,
"learning_rate": 1.862745098039216e-05,
"loss": 0.7904,
"mean_token_accuracy": 0.7648081417962661,
"step": 95
},
{
"epoch": 0.28412874583795783,
"grad_norm": 0.5216192007064819,
"learning_rate": 1.8823529411764708e-05,
"loss": 0.7764,
"mean_token_accuracy": 0.7663588698907876,
"step": 96
},
{
"epoch": 0.2870884202737699,
"grad_norm": 0.533979058265686,
"learning_rate": 1.9019607843137255e-05,
"loss": 0.8085,
"mean_token_accuracy": 0.7584315215207101,
"step": 97
},
{
"epoch": 0.2900480947095819,
"grad_norm": 0.4970541000366211,
"learning_rate": 1.9215686274509807e-05,
"loss": 0.7709,
"mean_token_accuracy": 0.7712429032432324,
"step": 98
},
{
"epoch": 0.293007769145394,
"grad_norm": 0.5441746115684509,
"learning_rate": 1.9411764705882355e-05,
"loss": 0.7992,
"mean_token_accuracy": 0.7626096179397713,
"step": 99
},
{
"epoch": 0.2959674435812061,
"grad_norm": 0.5223695635795593,
"learning_rate": 1.9607843137254903e-05,
"loss": 0.8004,
"mean_token_accuracy": 0.7618210772497175,
"step": 100
},
{
"epoch": 0.2959674435812061,
"eval_loss": 0.8126489520072937,
"eval_mean_token_accuracy": 0.7551172949521177,
"eval_runtime": 24.8878,
"eval_samples_per_second": 5.183,
"eval_steps_per_second": 1.326,
"step": 100
},
{
"epoch": 0.2989271180170181,
"grad_norm": 0.5140753984451294,
"learning_rate": 1.9803921568627454e-05,
"loss": 0.8128,
"mean_token_accuracy": 0.7589419990451155,
"step": 101
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.5474939942359924,
"learning_rate": 2e-05,
"loss": 0.7814,
"mean_token_accuracy": 0.7675481741705397,
"step": 102
},
{
"epoch": 0.30484646688864225,
"grad_norm": 0.5351850390434265,
"learning_rate": 1.9999940277008807e-05,
"loss": 0.8039,
"mean_token_accuracy": 0.7606729614320974,
"step": 103
},
{
"epoch": 0.30780614132445433,
"grad_norm": 0.5160948038101196,
"learning_rate": 1.99997611087486e-05,
"loss": 0.7853,
"mean_token_accuracy": 0.7661865778379009,
"step": 104
},
{
"epoch": 0.31076581576026635,
"grad_norm": 0.5185216665267944,
"learning_rate": 1.9999462497359468e-05,
"loss": 0.7549,
"mean_token_accuracy": 0.7736576692294679,
"step": 105
},
{
"epoch": 0.3137254901960784,
"grad_norm": 0.4885355830192566,
"learning_rate": 1.9999044446408203e-05,
"loss": 0.7727,
"mean_token_accuracy": 0.769001304246102,
"step": 106
},
{
"epoch": 0.3166851646318905,
"grad_norm": 0.615883469581604,
"learning_rate": 1.9998506960888258e-05,
"loss": 0.7991,
"mean_token_accuracy": 0.7610468017188765,
"step": 107
},
{
"epoch": 0.3196448390677026,
"grad_norm": 0.520724892616272,
"learning_rate": 1.999785004721968e-05,
"loss": 0.7932,
"mean_token_accuracy": 0.7632453023136502,
"step": 108
},
{
"epoch": 0.3226045135035146,
"grad_norm": 0.5822110772132874,
"learning_rate": 1.999707371324904e-05,
"loss": 0.809,
"mean_token_accuracy": 0.7592238599169098,
"step": 109
},
{
"epoch": 0.3255641879393267,
"grad_norm": 0.5411946177482605,
"learning_rate": 1.9996177968249336e-05,
"loss": 0.738,
"mean_token_accuracy": 0.7779971005622943,
"step": 110
},
{
"epoch": 0.32852386237513875,
"grad_norm": 0.5421875715255737,
"learning_rate": 1.999516282291988e-05,
"loss": 0.8056,
"mean_token_accuracy": 0.7613603734181074,
"step": 111
},
{
"epoch": 0.3314835368109508,
"grad_norm": 0.5699617266654968,
"learning_rate": 1.999402828938618e-05,
"loss": 0.7994,
"mean_token_accuracy": 0.7613545035512745,
"step": 112
},
{
"epoch": 0.33444321124676285,
"grad_norm": 0.5206153988838196,
"learning_rate": 1.999277438119978e-05,
"loss": 0.7778,
"mean_token_accuracy": 0.7683061531083251,
"step": 113
},
{
"epoch": 0.3374028856825749,
"grad_norm": 0.5244638323783875,
"learning_rate": 1.9991401113338103e-05,
"loss": 0.8023,
"mean_token_accuracy": 0.7609702591724479,
"step": 114
},
{
"epoch": 0.340362560118387,
"grad_norm": 0.5344120860099792,
"learning_rate": 1.9989908502204295e-05,
"loss": 0.7793,
"mean_token_accuracy": 0.7666984560859803,
"step": 115
},
{
"epoch": 0.343322234554199,
"grad_norm": 0.505351185798645,
"learning_rate": 1.9988296565626988e-05,
"loss": 0.7577,
"mean_token_accuracy": 0.7727362055103163,
"step": 116
},
{
"epoch": 0.3462819089900111,
"grad_norm": 0.5267241597175598,
"learning_rate": 1.9986565322860117e-05,
"loss": 0.8223,
"mean_token_accuracy": 0.7553329490401921,
"step": 117
},
{
"epoch": 0.3492415834258232,
"grad_norm": 0.5347175002098083,
"learning_rate": 1.9984714794582682e-05,
"loss": 0.8163,
"mean_token_accuracy": 0.7553841783952017,
"step": 118
},
{
"epoch": 0.3522012578616352,
"grad_norm": 0.5740127563476562,
"learning_rate": 1.99827450028985e-05,
"loss": 0.7804,
"mean_token_accuracy": 0.7664757137429672,
"step": 119
},
{
"epoch": 0.3551609322974473,
"grad_norm": 0.5313867330551147,
"learning_rate": 1.9980655971335944e-05,
"loss": 0.81,
"mean_token_accuracy": 0.7596098693206174,
"step": 120
},
{
"epoch": 0.35812060673325935,
"grad_norm": 0.5177193284034729,
"learning_rate": 1.9978447724847655e-05,
"loss": 0.7956,
"mean_token_accuracy": 0.7617352886178098,
"step": 121
},
{
"epoch": 0.3610802811690714,
"grad_norm": 0.564724326133728,
"learning_rate": 1.9976120289810247e-05,
"loss": 0.8109,
"mean_token_accuracy": 0.7577093596124115,
"step": 122
},
{
"epoch": 0.36403995560488345,
"grad_norm": 0.539661169052124,
"learning_rate": 1.9973673694024002e-05,
"loss": 0.7858,
"mean_token_accuracy": 0.7645922340526763,
"step": 123
},
{
"epoch": 0.3669996300406955,
"grad_norm": 0.5084680318832397,
"learning_rate": 1.9971107966712518e-05,
"loss": 0.7463,
"mean_token_accuracy": 0.7753920713027525,
"step": 124
},
{
"epoch": 0.3699593044765076,
"grad_norm": 0.4952844977378845,
"learning_rate": 1.9968423138522382e-05,
"loss": 0.7739,
"mean_token_accuracy": 0.7676340081581494,
"step": 125
},
{
"epoch": 0.3729189789123196,
"grad_norm": 0.5472536087036133,
"learning_rate": 1.996561924152278e-05,
"loss": 0.8,
"mean_token_accuracy": 0.7616544988854603,
"step": 126
},
{
"epoch": 0.3758786533481317,
"grad_norm": 0.5309717059135437,
"learning_rate": 1.9962696309205146e-05,
"loss": 0.7776,
"mean_token_accuracy": 0.7678901777514975,
"step": 127
},
{
"epoch": 0.3788383277839438,
"grad_norm": 0.5029951930046082,
"learning_rate": 1.995965437648273e-05,
"loss": 0.7761,
"mean_token_accuracy": 0.766595985687639,
"step": 128
},
{
"epoch": 0.38179800221975585,
"grad_norm": 0.5340363383293152,
"learning_rate": 1.995649347969019e-05,
"loss": 0.7457,
"mean_token_accuracy": 0.7745559370999009,
"step": 129
},
{
"epoch": 0.38475767665556787,
"grad_norm": 0.5484894514083862,
"learning_rate": 1.995321365658317e-05,
"loss": 0.7997,
"mean_token_accuracy": 0.7594812381150867,
"step": 130
},
{
"epoch": 0.38771735109137995,
"grad_norm": 0.6396868228912354,
"learning_rate": 1.994981494633784e-05,
"loss": 0.7976,
"mean_token_accuracy": 0.7599872025655441,
"step": 131
},
{
"epoch": 0.390677025527192,
"grad_norm": 0.5394526124000549,
"learning_rate": 1.9946297389550433e-05,
"loss": 0.7993,
"mean_token_accuracy": 0.7608908088026568,
"step": 132
},
{
"epoch": 0.39363669996300404,
"grad_norm": 0.6235033869743347,
"learning_rate": 1.9942661028236746e-05,
"loss": 0.787,
"mean_token_accuracy": 0.7650479719064859,
"step": 133
},
{
"epoch": 0.3965963743988161,
"grad_norm": 0.5509399175643921,
"learning_rate": 1.9938905905831657e-05,
"loss": 0.7841,
"mean_token_accuracy": 0.7647842322769413,
"step": 134
},
{
"epoch": 0.3995560488346282,
"grad_norm": 0.589085578918457,
"learning_rate": 1.993503206718859e-05,
"loss": 0.7701,
"mean_token_accuracy": 0.7691342710083168,
"step": 135
},
{
"epoch": 0.4025157232704403,
"grad_norm": 0.5094689726829529,
"learning_rate": 1.9931039558578997e-05,
"loss": 0.755,
"mean_token_accuracy": 0.773621444740238,
"step": 136
},
{
"epoch": 0.4054753977062523,
"grad_norm": 0.5288008451461792,
"learning_rate": 1.9926928427691788e-05,
"loss": 0.733,
"mean_token_accuracy": 0.7798217961404127,
"step": 137
},
{
"epoch": 0.40843507214206437,
"grad_norm": 0.5860950350761414,
"learning_rate": 1.992269872363277e-05,
"loss": 0.7793,
"mean_token_accuracy": 0.7671219171893889,
"step": 138
},
{
"epoch": 0.41139474657787645,
"grad_norm": 0.5211442708969116,
"learning_rate": 1.991835049692405e-05,
"loss": 0.7589,
"mean_token_accuracy": 0.7712709984233845,
"step": 139
},
{
"epoch": 0.41435442101368847,
"grad_norm": 0.6341312527656555,
"learning_rate": 1.991388379950346e-05,
"loss": 0.7555,
"mean_token_accuracy": 0.7726431562687772,
"step": 140
},
{
"epoch": 0.41731409544950054,
"grad_norm": 0.5119423866271973,
"learning_rate": 1.9909298684723905e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.7683422766172284,
"step": 141
},
{
"epoch": 0.4202737698853126,
"grad_norm": 0.5573475956916809,
"learning_rate": 1.9904595207352736e-05,
"loss": 0.7586,
"mean_token_accuracy": 0.7709694689177727,
"step": 142
},
{
"epoch": 0.4232334443211247,
"grad_norm": 0.5152528882026672,
"learning_rate": 1.9899773423571102e-05,
"loss": 0.742,
"mean_token_accuracy": 0.776040556686583,
"step": 143
},
{
"epoch": 0.4261931187569367,
"grad_norm": 0.5058140754699707,
"learning_rate": 1.9894833390973266e-05,
"loss": 0.8094,
"mean_token_accuracy": 0.7577251436684603,
"step": 144
},
{
"epoch": 0.4291527931927488,
"grad_norm": 0.5282382965087891,
"learning_rate": 1.9889775168565942e-05,
"loss": 0.7748,
"mean_token_accuracy": 0.7683045482642854,
"step": 145
},
{
"epoch": 0.43211246762856087,
"grad_norm": 0.6103954315185547,
"learning_rate": 1.9884598816767563e-05,
"loss": 0.805,
"mean_token_accuracy": 0.7593984139774315,
"step": 146
},
{
"epoch": 0.43507214206437295,
"grad_norm": 0.530112087726593,
"learning_rate": 1.987930439740757e-05,
"loss": 0.7537,
"mean_token_accuracy": 0.7733501196092509,
"step": 147
},
{
"epoch": 0.43803181650018497,
"grad_norm": 0.5501434206962585,
"learning_rate": 1.9873891973725673e-05,
"loss": 0.752,
"mean_token_accuracy": 0.7755143180889366,
"step": 148
},
{
"epoch": 0.44099149093599704,
"grad_norm": 0.496888667345047,
"learning_rate": 1.98683616103711e-05,
"loss": 0.7624,
"mean_token_accuracy": 0.7707987778410632,
"step": 149
},
{
"epoch": 0.4439511653718091,
"grad_norm": 0.5206103324890137,
"learning_rate": 1.986271337340182e-05,
"loss": 0.7754,
"mean_token_accuracy": 0.7663099883208253,
"step": 150
},
{
"epoch": 0.44691083980762114,
"grad_norm": 0.5429675579071045,
"learning_rate": 1.9856947330283752e-05,
"loss": 0.7418,
"mean_token_accuracy": 0.7745724176097732,
"step": 151
},
{
"epoch": 0.4498705142434332,
"grad_norm": 0.515471875667572,
"learning_rate": 1.985106354988997e-05,
"loss": 0.7517,
"mean_token_accuracy": 0.7713102457643006,
"step": 152
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.5580022931098938,
"learning_rate": 1.984506210249986e-05,
"loss": 0.7372,
"mean_token_accuracy": 0.7783837879306136,
"step": 153
},
{
"epoch": 0.45578986311505737,
"grad_norm": 0.5351727604866028,
"learning_rate": 1.9838943059798305e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.7712246098769842,
"step": 154
},
{
"epoch": 0.4587495375508694,
"grad_norm": 0.5970275402069092,
"learning_rate": 1.9832706494874812e-05,
"loss": 0.7852,
"mean_token_accuracy": 0.7650099910061801,
"step": 155
},
{
"epoch": 0.46170921198668147,
"grad_norm": 0.535476803779602,
"learning_rate": 1.982635248222264e-05,
"loss": 0.8135,
"mean_token_accuracy": 0.7548096205593479,
"step": 156
},
{
"epoch": 0.46466888642249354,
"grad_norm": 0.5446284413337708,
"learning_rate": 1.9819881097737917e-05,
"loss": 0.7753,
"mean_token_accuracy": 0.766597256567756,
"step": 157
},
{
"epoch": 0.46762856085830556,
"grad_norm": 0.5779156684875488,
"learning_rate": 1.9813292418718734e-05,
"loss": 0.8178,
"mean_token_accuracy": 0.7556820545263497,
"step": 158
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.5383659601211548,
"learning_rate": 1.9806586523864212e-05,
"loss": 0.7787,
"mean_token_accuracy": 0.7652851550298655,
"step": 159
},
{
"epoch": 0.4735479097299297,
"grad_norm": 0.5274466872215271,
"learning_rate": 1.9799763493273572e-05,
"loss": 0.7451,
"mean_token_accuracy": 0.7758701051335468,
"step": 160
},
{
"epoch": 0.4765075841657418,
"grad_norm": 0.5253377556800842,
"learning_rate": 1.9792823408445173e-05,
"loss": 0.7794,
"mean_token_accuracy": 0.7660881601135704,
"step": 161
},
{
"epoch": 0.4794672586015538,
"grad_norm": 0.6184384822845459,
"learning_rate": 1.978576635227554e-05,
"loss": 0.7705,
"mean_token_accuracy": 0.7684919526843087,
"step": 162
},
{
"epoch": 0.4824269330373659,
"grad_norm": 0.5399531126022339,
"learning_rate": 1.9778592409058376e-05,
"loss": 0.7496,
"mean_token_accuracy": 0.7751951559026848,
"step": 163
},
{
"epoch": 0.48538660747317797,
"grad_norm": 0.5651612281799316,
"learning_rate": 1.9771301664483548e-05,
"loss": 0.7637,
"mean_token_accuracy": 0.770090502168717,
"step": 164
},
{
"epoch": 0.48834628190899,
"grad_norm": 0.6314195394515991,
"learning_rate": 1.976389420563607e-05,
"loss": 0.7634,
"mean_token_accuracy": 0.7709643457026975,
"step": 165
},
{
"epoch": 0.49130595634480206,
"grad_norm": 0.5370025634765625,
"learning_rate": 1.975637012099507e-05,
"loss": 0.7467,
"mean_token_accuracy": 0.7752258048770664,
"step": 166
},
{
"epoch": 0.49426563078061414,
"grad_norm": 0.5424651503562927,
"learning_rate": 1.97487295004327e-05,
"loss": 0.7933,
"mean_token_accuracy": 0.760696495825342,
"step": 167
},
{
"epoch": 0.4972253052164262,
"grad_norm": 0.5711933970451355,
"learning_rate": 1.9740972435213114e-05,
"loss": 0.7928,
"mean_token_accuracy": 0.761649293131421,
"step": 168
},
{
"epoch": 0.5001849796522383,
"grad_norm": 0.5219062566757202,
"learning_rate": 1.9733099017991342e-05,
"loss": 0.7861,
"mean_token_accuracy": 0.7628095190256412,
"step": 169
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.4978106617927551,
"learning_rate": 1.972510934281218e-05,
"loss": 0.7631,
"mean_token_accuracy": 0.7710752711114524,
"step": 170
},
{
"epoch": 0.5061043285238623,
"grad_norm": 0.6013402938842773,
"learning_rate": 1.9717003505109097e-05,
"loss": 0.7991,
"mean_token_accuracy": 0.7586557673484216,
"step": 171
},
{
"epoch": 0.5090640029596745,
"grad_norm": 0.5215644836425781,
"learning_rate": 1.9708781601703066e-05,
"loss": 0.763,
"mean_token_accuracy": 0.7695876606622123,
"step": 172
},
{
"epoch": 0.5120236773954865,
"grad_norm": 0.49007105827331543,
"learning_rate": 1.9700443730801412e-05,
"loss": 0.7644,
"mean_token_accuracy": 0.7701783635410456,
"step": 173
},
{
"epoch": 0.5149833518312985,
"grad_norm": 0.5938363075256348,
"learning_rate": 1.9691989991996663e-05,
"loss": 0.7643,
"mean_token_accuracy": 0.7680917186626302,
"step": 174
},
{
"epoch": 0.5179430262671106,
"grad_norm": 0.5483390092849731,
"learning_rate": 1.9683420486265328e-05,
"loss": 0.7651,
"mean_token_accuracy": 0.7709870011542461,
"step": 175
},
{
"epoch": 0.5209027007029227,
"grad_norm": 0.5027016997337341,
"learning_rate": 1.967473531596671e-05,
"loss": 0.7513,
"mean_token_accuracy": 0.7730452420829894,
"step": 176
},
{
"epoch": 0.5238623751387348,
"grad_norm": 0.5310905575752258,
"learning_rate": 1.966593458484168e-05,
"loss": 0.7715,
"mean_token_accuracy": 0.7680981483212205,
"step": 177
},
{
"epoch": 0.5268220495745468,
"grad_norm": 0.5523523688316345,
"learning_rate": 1.9657018398011435e-05,
"loss": 0.7674,
"mean_token_accuracy": 0.7684800548855188,
"step": 178
},
{
"epoch": 0.5297817240103588,
"grad_norm": 0.5446920394897461,
"learning_rate": 1.9647986861976246e-05,
"loss": 0.773,
"mean_token_accuracy": 0.7688905853900413,
"step": 179
},
{
"epoch": 0.532741398446171,
"grad_norm": 0.5408650636672974,
"learning_rate": 1.9638840084614182e-05,
"loss": 0.7204,
"mean_token_accuracy": 0.7827706625253021,
"step": 180
},
{
"epoch": 0.535701072881983,
"grad_norm": 0.5880627632141113,
"learning_rate": 1.9629578175179823e-05,
"loss": 0.7587,
"mean_token_accuracy": 0.7718611041296293,
"step": 181
},
{
"epoch": 0.538660747317795,
"grad_norm": 0.5494539141654968,
"learning_rate": 1.9620201244302952e-05,
"loss": 0.7487,
"mean_token_accuracy": 0.7745951212558507,
"step": 182
},
{
"epoch": 0.5416204217536071,
"grad_norm": 0.5416110754013062,
"learning_rate": 1.9610709403987248e-05,
"loss": 0.7583,
"mean_token_accuracy": 0.7723263702843611,
"step": 183
},
{
"epoch": 0.5445800961894192,
"grad_norm": 0.5187686681747437,
"learning_rate": 1.9601102767608924e-05,
"loss": 0.7727,
"mean_token_accuracy": 0.7669931715546834,
"step": 184
},
{
"epoch": 0.5475397706252312,
"grad_norm": 0.6072437763214111,
"learning_rate": 1.95913814499154e-05,
"loss": 0.7758,
"mean_token_accuracy": 0.7658226539132729,
"step": 185
},
{
"epoch": 0.5504994450610433,
"grad_norm": 0.5267654061317444,
"learning_rate": 1.95815455670239e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.7644383151164792,
"step": 186
},
{
"epoch": 0.5534591194968553,
"grad_norm": 0.5116267800331116,
"learning_rate": 1.9571595236420103e-05,
"loss": 0.765,
"mean_token_accuracy": 0.7686784858855072,
"step": 187
},
{
"epoch": 0.5564187939326674,
"grad_norm": 0.5083511471748352,
"learning_rate": 1.9561530576956703e-05,
"loss": 0.7293,
"mean_token_accuracy": 0.7782823905710549,
"step": 188
},
{
"epoch": 0.5593784683684795,
"grad_norm": 0.5557141900062561,
"learning_rate": 1.955135170885202e-05,
"loss": 0.7426,
"mean_token_accuracy": 0.7763826979033814,
"step": 189
},
{
"epoch": 0.5623381428042915,
"grad_norm": 0.5787784457206726,
"learning_rate": 1.9541058753688538e-05,
"loss": 0.7484,
"mean_token_accuracy": 0.7738303017670985,
"step": 190
},
{
"epoch": 0.5652978172401036,
"grad_norm": 0.5557724237442017,
"learning_rate": 1.9530651834411477e-05,
"loss": 0.7603,
"mean_token_accuracy": 0.7699144780244102,
"step": 191
},
{
"epoch": 0.5682574916759157,
"grad_norm": 0.5540621876716614,
"learning_rate": 1.95201310753273e-05,
"loss": 0.7224,
"mean_token_accuracy": 0.7793132883624135,
"step": 192
},
{
"epoch": 0.5712171661117277,
"grad_norm": 0.5053984522819519,
"learning_rate": 1.9509496602102253e-05,
"loss": 0.7258,
"mean_token_accuracy": 0.7800444754491836,
"step": 193
},
{
"epoch": 0.5741768405475398,
"grad_norm": 0.49898284673690796,
"learning_rate": 1.9498748541760845e-05,
"loss": 0.7396,
"mean_token_accuracy": 0.7753466916631608,
"step": 194
},
{
"epoch": 0.5771365149833518,
"grad_norm": 0.5799064040184021,
"learning_rate": 1.9487887022684336e-05,
"loss": 0.7602,
"mean_token_accuracy": 0.7701053674537776,
"step": 195
},
{
"epoch": 0.5800961894191639,
"grad_norm": 0.5606354475021362,
"learning_rate": 1.947691217460921e-05,
"loss": 0.7544,
"mean_token_accuracy": 0.77100937072039,
"step": 196
},
{
"epoch": 0.583055863854976,
"grad_norm": 0.4998267590999603,
"learning_rate": 1.946582412862562e-05,
"loss": 0.766,
"mean_token_accuracy": 0.7682667265118656,
"step": 197
},
{
"epoch": 0.586015538290788,
"grad_norm": 0.5629295110702515,
"learning_rate": 1.9454623017175814e-05,
"loss": 0.7424,
"mean_token_accuracy": 0.7752050364586516,
"step": 198
},
{
"epoch": 0.5889752127266,
"grad_norm": 0.4932561218738556,
"learning_rate": 1.9443308974052574e-05,
"loss": 0.7489,
"mean_token_accuracy": 0.7741070965947788,
"step": 199
},
{
"epoch": 0.5919348871624122,
"grad_norm": 0.6265371441841125,
"learning_rate": 1.9431882134397596e-05,
"loss": 0.7658,
"mean_token_accuracy": 0.7681866412889478,
"step": 200
},
{
"epoch": 0.5919348871624122,
"eval_loss": 0.778282105922699,
"eval_mean_token_accuracy": 0.7620499776343601,
"eval_runtime": 24.5192,
"eval_samples_per_second": 5.261,
"eval_steps_per_second": 1.346,
"step": 200
},
{
"epoch": 0.5948945615982242,
"grad_norm": 0.5446656346321106,
"learning_rate": 1.9420342634699893e-05,
"loss": 0.722,
"mean_token_accuracy": 0.7810348950987986,
"step": 201
},
{
"epoch": 0.5978542360340362,
"grad_norm": 0.5253841876983643,
"learning_rate": 1.9408690612794146e-05,
"loss": 0.7758,
"mean_token_accuracy": 0.7659725997741449,
"step": 202
},
{
"epoch": 0.6008139104698483,
"grad_norm": 0.5887268781661987,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.7107,
"mean_token_accuracy": 0.7828422379162261,
"step": 203
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.5546231269836426,
"learning_rate": 1.9385049560415794e-05,
"loss": 0.7812,
"mean_token_accuracy": 0.7646388223607241,
"step": 204
},
{
"epoch": 0.6067332593414725,
"grad_norm": 0.5595012307167053,
"learning_rate": 1.9373060812326053e-05,
"loss": 0.7368,
"mean_token_accuracy": 0.7771756648124704,
"step": 205
},
{
"epoch": 0.6096929337772845,
"grad_norm": 0.6051347255706787,
"learning_rate": 1.9360960106790645e-05,
"loss": 0.7637,
"mean_token_accuracy": 0.7687831877533422,
"step": 206
},
{
"epoch": 0.6126526082130965,
"grad_norm": 0.5045530200004578,
"learning_rate": 1.9348747588347637e-05,
"loss": 0.7633,
"mean_token_accuracy": 0.7716161599834406,
"step": 207
},
{
"epoch": 0.6156122826489087,
"grad_norm": 0.5844081044197083,
"learning_rate": 1.9336423402870655e-05,
"loss": 0.7634,
"mean_token_accuracy": 0.7698122225847835,
"step": 208
},
{
"epoch": 0.6185719570847207,
"grad_norm": 0.516323983669281,
"learning_rate": 1.932398769756714e-05,
"loss": 0.7347,
"mean_token_accuracy": 0.7758576109605254,
"step": 209
},
{
"epoch": 0.6215316315205327,
"grad_norm": 0.6504623293876648,
"learning_rate": 1.9311440620976597e-05,
"loss": 0.7375,
"mean_token_accuracy": 0.7756102635673668,
"step": 210
},
{
"epoch": 0.6244913059563448,
"grad_norm": 0.6118385195732117,
"learning_rate": 1.9298782322968817e-05,
"loss": 0.7734,
"mean_token_accuracy": 0.7640280400757476,
"step": 211
},
{
"epoch": 0.6274509803921569,
"grad_norm": 0.5381941795349121,
"learning_rate": 1.9286012954742078e-05,
"loss": 0.7426,
"mean_token_accuracy": 0.7750216295859045,
"step": 212
},
{
"epoch": 0.6304106548279689,
"grad_norm": 0.6116046905517578,
"learning_rate": 1.9273132668821363e-05,
"loss": 0.7894,
"mean_token_accuracy": 0.7624240258634218,
"step": 213
},
{
"epoch": 0.633370329263781,
"grad_norm": 0.5995723009109497,
"learning_rate": 1.9260141619056507e-05,
"loss": 0.8063,
"mean_token_accuracy": 0.7580679708807321,
"step": 214
},
{
"epoch": 0.636330003699593,
"grad_norm": 0.6060746312141418,
"learning_rate": 1.924703996062038e-05,
"loss": 0.7825,
"mean_token_accuracy": 0.7644491908483929,
"step": 215
},
{
"epoch": 0.6392896781354052,
"grad_norm": 0.4967659115791321,
"learning_rate": 1.9233827850007028e-05,
"loss": 0.7419,
"mean_token_accuracy": 0.7752207223816133,
"step": 216
},
{
"epoch": 0.6422493525712172,
"grad_norm": 0.5452144145965576,
"learning_rate": 1.9220505445029803e-05,
"loss": 0.7419,
"mean_token_accuracy": 0.7768822483190798,
"step": 217
},
{
"epoch": 0.6452090270070292,
"grad_norm": 0.5308946967124939,
"learning_rate": 1.9207072904819484e-05,
"loss": 0.7867,
"mean_token_accuracy": 0.7616907876516587,
"step": 218
},
{
"epoch": 0.6481687014428413,
"grad_norm": 0.5080918669700623,
"learning_rate": 1.9193530389822364e-05,
"loss": 0.7551,
"mean_token_accuracy": 0.7722103161394469,
"step": 219
},
{
"epoch": 0.6511283758786534,
"grad_norm": 0.5541013479232788,
"learning_rate": 1.9179878061798347e-05,
"loss": 0.7416,
"mean_token_accuracy": 0.7758964006687687,
"step": 220
},
{
"epoch": 0.6540880503144654,
"grad_norm": 0.5555444955825806,
"learning_rate": 1.9166116083819002e-05,
"loss": 0.7735,
"mean_token_accuracy": 0.7667690994886073,
"step": 221
},
{
"epoch": 0.6570477247502775,
"grad_norm": 0.5138890743255615,
"learning_rate": 1.915224462026563e-05,
"loss": 0.7689,
"mean_token_accuracy": 0.7680507811975301,
"step": 222
},
{
"epoch": 0.6600073991860895,
"grad_norm": 0.5619951486587524,
"learning_rate": 1.913826383682729e-05,
"loss": 0.7776,
"mean_token_accuracy": 0.7642446287241815,
"step": 223
},
{
"epoch": 0.6629670736219015,
"grad_norm": 0.49697887897491455,
"learning_rate": 1.912417390049882e-05,
"loss": 0.7564,
"mean_token_accuracy": 0.7708950011889235,
"step": 224
},
{
"epoch": 0.6659267480577137,
"grad_norm": 0.5893805027008057,
"learning_rate": 1.9109974979578852e-05,
"loss": 0.7347,
"mean_token_accuracy": 0.7758372095558704,
"step": 225
},
{
"epoch": 0.6688864224935257,
"grad_norm": 0.5565352439880371,
"learning_rate": 1.909566724366779e-05,
"loss": 0.7619,
"mean_token_accuracy": 0.76937331341953,
"step": 226
},
{
"epoch": 0.6718460969293377,
"grad_norm": 0.581122875213623,
"learning_rate": 1.9081250863665794e-05,
"loss": 0.7459,
"mean_token_accuracy": 0.7744230618996671,
"step": 227
},
{
"epoch": 0.6748057713651499,
"grad_norm": 0.6203576326370239,
"learning_rate": 1.9066726011770725e-05,
"loss": 0.7403,
"mean_token_accuracy": 0.7757174096012653,
"step": 228
},
{
"epoch": 0.6777654458009619,
"grad_norm": 0.5231543779373169,
"learning_rate": 1.905209286147611e-05,
"loss": 0.7291,
"mean_token_accuracy": 0.7789308093459126,
"step": 229
},
{
"epoch": 0.680725120236774,
"grad_norm": 0.5227301120758057,
"learning_rate": 1.903735158756905e-05,
"loss": 0.7267,
"mean_token_accuracy": 0.780063648206095,
"step": 230
},
{
"epoch": 0.683684794672586,
"grad_norm": 0.5774472951889038,
"learning_rate": 1.9022502366128136e-05,
"loss": 0.7626,
"mean_token_accuracy": 0.7701068030426402,
"step": 231
},
{
"epoch": 0.686644469108398,
"grad_norm": 0.5350067615509033,
"learning_rate": 1.9007545374521354e-05,
"loss": 0.7727,
"mean_token_accuracy": 0.767009637419523,
"step": 232
},
{
"epoch": 0.6896041435442102,
"grad_norm": 0.543245792388916,
"learning_rate": 1.8992480791403957e-05,
"loss": 0.7258,
"mean_token_accuracy": 0.7811048484724694,
"step": 233
},
{
"epoch": 0.6925638179800222,
"grad_norm": 0.6067213416099548,
"learning_rate": 1.897730879671634e-05,
"loss": 0.7454,
"mean_token_accuracy": 0.7739789538178186,
"step": 234
},
{
"epoch": 0.6955234924158342,
"grad_norm": 0.5219905972480774,
"learning_rate": 1.8962029571681887e-05,
"loss": 0.7094,
"mean_token_accuracy": 0.7855872005269757,
"step": 235
},
{
"epoch": 0.6984831668516464,
"grad_norm": 0.5807480216026306,
"learning_rate": 1.8946643298804794e-05,
"loss": 0.7701,
"mean_token_accuracy": 0.7658029579586856,
"step": 236
},
{
"epoch": 0.7014428412874584,
"grad_norm": 0.4960806965827942,
"learning_rate": 1.8931150161867917e-05,
"loss": 0.7285,
"mean_token_accuracy": 0.7792831489593245,
"step": 237
},
{
"epoch": 0.7044025157232704,
"grad_norm": 0.5792670249938965,
"learning_rate": 1.891555034593055e-05,
"loss": 0.7467,
"mean_token_accuracy": 0.7733710687900762,
"step": 238
},
{
"epoch": 0.7073621901590825,
"grad_norm": 0.5364589691162109,
"learning_rate": 1.8899844037326227e-05,
"loss": 0.7195,
"mean_token_accuracy": 0.7821820109931461,
"step": 239
},
{
"epoch": 0.7103218645948945,
"grad_norm": 0.5596705079078674,
"learning_rate": 1.8884031423660492e-05,
"loss": 0.7047,
"mean_token_accuracy": 0.785649852431446,
"step": 240
},
{
"epoch": 0.7132815390307066,
"grad_norm": 0.5741063356399536,
"learning_rate": 1.8868112693808664e-05,
"loss": 0.7663,
"mean_token_accuracy": 0.7678326165991625,
"step": 241
},
{
"epoch": 0.7162412134665187,
"grad_norm": 0.516858696937561,
"learning_rate": 1.8852088037913577e-05,
"loss": 0.7471,
"mean_token_accuracy": 0.7746923216355659,
"step": 242
},
{
"epoch": 0.7192008879023307,
"grad_norm": 0.5048111081123352,
"learning_rate": 1.8835957647383304e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.7870937976717415,
"step": 243
},
{
"epoch": 0.7221605623381429,
"grad_norm": 0.5660455226898193,
"learning_rate": 1.8819721714888878e-05,
"loss": 0.7795,
"mean_token_accuracy": 0.7642331478723334,
"step": 244
},
{
"epoch": 0.7251202367739549,
"grad_norm": 0.5211176872253418,
"learning_rate": 1.8803380434362e-05,
"loss": 0.7342,
"mean_token_accuracy": 0.7781391886138683,
"step": 245
},
{
"epoch": 0.7280799112097669,
"grad_norm": 0.5142192244529724,
"learning_rate": 1.878693400099269e-05,
"loss": 0.7301,
"mean_token_accuracy": 0.7786941626128209,
"step": 246
},
{
"epoch": 0.731039585645579,
"grad_norm": 0.5370232462882996,
"learning_rate": 1.877038261122699e-05,
"loss": 0.7593,
"mean_token_accuracy": 0.771669201717037,
"step": 247
},
{
"epoch": 0.733999260081391,
"grad_norm": 0.49543988704681396,
"learning_rate": 1.87537264627646e-05,
"loss": 0.7216,
"mean_token_accuracy": 0.7810789864692633,
"step": 248
},
{
"epoch": 0.7369589345172031,
"grad_norm": 0.56675785779953,
"learning_rate": 1.8736965754556527e-05,
"loss": 0.7627,
"mean_token_accuracy": 0.7688760359914193,
"step": 249
},
{
"epoch": 0.7399186089530152,
"grad_norm": 0.524047315120697,
"learning_rate": 1.8720100686802693e-05,
"loss": 0.7551,
"mean_token_accuracy": 0.7700947445179971,
"step": 250
},
{
"epoch": 0.7428782833888272,
"grad_norm": 0.5166477560997009,
"learning_rate": 1.8703131460949555e-05,
"loss": 0.7785,
"mean_token_accuracy": 0.7636579872205778,
"step": 251
},
{
"epoch": 0.7458379578246392,
"grad_norm": 0.5201772451400757,
"learning_rate": 1.86860582796877e-05,
"loss": 0.736,
"mean_token_accuracy": 0.7761137360141643,
"step": 252
},
{
"epoch": 0.7487976322604514,
"grad_norm": 0.6423028707504272,
"learning_rate": 1.866888134694942e-05,
"loss": 0.7454,
"mean_token_accuracy": 0.7750494962065552,
"step": 253
},
{
"epoch": 0.7517573066962634,
"grad_norm": 0.5888985395431519,
"learning_rate": 1.865160086790627e-05,
"loss": 0.7238,
"mean_token_accuracy": 0.7800915239288521,
"step": 254
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.5778961181640625,
"learning_rate": 1.8634217048966638e-05,
"loss": 0.7658,
"mean_token_accuracy": 0.7687186514339136,
"step": 255
},
{
"epoch": 0.7576766555678875,
"grad_norm": 0.5808703303337097,
"learning_rate": 1.861673009777325e-05,
"loss": 0.7449,
"mean_token_accuracy": 0.7729568426414184,
"step": 256
},
{
"epoch": 0.7606363300036996,
"grad_norm": 0.5731485486030579,
"learning_rate": 1.8599140223200716e-05,
"loss": 0.748,
"mean_token_accuracy": 0.7729810722706314,
"step": 257
},
{
"epoch": 0.7635960044395117,
"grad_norm": 0.5766414403915405,
"learning_rate": 1.858144763535302e-05,
"loss": 0.7782,
"mean_token_accuracy": 0.764375293579256,
"step": 258
},
{
"epoch": 0.7665556788753237,
"grad_norm": 0.5422239899635315,
"learning_rate": 1.8563652545561014e-05,
"loss": 0.7329,
"mean_token_accuracy": 0.7776419690528588,
"step": 259
},
{
"epoch": 0.7695153533111357,
"grad_norm": 0.5828793048858643,
"learning_rate": 1.8545755166379898e-05,
"loss": 0.7186,
"mean_token_accuracy": 0.7822970814680493,
"step": 260
},
{
"epoch": 0.7724750277469479,
"grad_norm": 0.5449491739273071,
"learning_rate": 1.852775571158668e-05,
"loss": 0.7711,
"mean_token_accuracy": 0.7660281761867683,
"step": 261
},
{
"epoch": 0.7754347021827599,
"grad_norm": 0.5476288795471191,
"learning_rate": 1.850965439617761e-05,
"loss": 0.7404,
"mean_token_accuracy": 0.7736120045020073,
"step": 262
},
{
"epoch": 0.7783943766185719,
"grad_norm": 0.6878018975257874,
"learning_rate": 1.8491451436365628e-05,
"loss": 0.7758,
"mean_token_accuracy": 0.7640672296658151,
"step": 263
},
{
"epoch": 0.781354051054384,
"grad_norm": 0.5300653576850891,
"learning_rate": 1.8473147049577777e-05,
"loss": 0.7666,
"mean_token_accuracy": 0.7686153435173708,
"step": 264
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.6327837705612183,
"learning_rate": 1.8454741454452604e-05,
"loss": 0.7521,
"mean_token_accuracy": 0.7717832959346983,
"step": 265
},
{
"epoch": 0.7872733999260081,
"grad_norm": 0.5409294366836548,
"learning_rate": 1.843623487083755e-05,
"loss": 0.7404,
"mean_token_accuracy": 0.7766533132408322,
"step": 266
},
{
"epoch": 0.7902330743618202,
"grad_norm": 0.5834295749664307,
"learning_rate": 1.8417627519786317e-05,
"loss": 0.7592,
"mean_token_accuracy": 0.7693419786318872,
"step": 267
},
{
"epoch": 0.7931927487976322,
"grad_norm": 0.5921277403831482,
"learning_rate": 1.839891962355624e-05,
"loss": 0.7162,
"mean_token_accuracy": 0.7820607013724311,
"step": 268
},
{
"epoch": 0.7961524232334444,
"grad_norm": 0.5238744020462036,
"learning_rate": 1.838011140560562e-05,
"loss": 0.7565,
"mean_token_accuracy": 0.770343025952529,
"step": 269
},
{
"epoch": 0.7991120976692564,
"grad_norm": 0.5569880604743958,
"learning_rate": 1.836120309059107e-05,
"loss": 0.7488,
"mean_token_accuracy": 0.7728957184836894,
"step": 270
},
{
"epoch": 0.8020717721050684,
"grad_norm": 0.5647782683372498,
"learning_rate": 1.8342194904364815e-05,
"loss": 0.7135,
"mean_token_accuracy": 0.7830927354241212,
"step": 271
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.5411779284477234,
"learning_rate": 1.8323087073971996e-05,
"loss": 0.7366,
"mean_token_accuracy": 0.775458202599469,
"step": 272
},
{
"epoch": 0.8079911209766926,
"grad_norm": 0.6045868992805481,
"learning_rate": 1.8303879827647977e-05,
"loss": 0.7544,
"mean_token_accuracy": 0.7712791582365803,
"step": 273
},
{
"epoch": 0.8109507954125046,
"grad_norm": 0.5784792304039001,
"learning_rate": 1.8284573394815596e-05,
"loss": 0.7448,
"mean_token_accuracy": 0.7737621785267094,
"step": 274
},
{
"epoch": 0.8139104698483167,
"grad_norm": 0.5260710120201111,
"learning_rate": 1.826516800608244e-05,
"loss": 0.7627,
"mean_token_accuracy": 0.7694265078345902,
"step": 275
},
{
"epoch": 0.8168701442841287,
"grad_norm": 0.5844061374664307,
"learning_rate": 1.8245663893238075e-05,
"loss": 0.7653,
"mean_token_accuracy": 0.7686764548943624,
"step": 276
},
{
"epoch": 0.8198298187199408,
"grad_norm": 0.5687382221221924,
"learning_rate": 1.8226061289251297e-05,
"loss": 0.7631,
"mean_token_accuracy": 0.7688321516962094,
"step": 277
},
{
"epoch": 0.8227894931557529,
"grad_norm": 0.5046533942222595,
"learning_rate": 1.8206360428267332e-05,
"loss": 0.6843,
"mean_token_accuracy": 0.7910775752871206,
"step": 278
},
{
"epoch": 0.8257491675915649,
"grad_norm": 0.6087561249732971,
"learning_rate": 1.8186561545605055e-05,
"loss": 0.7596,
"mean_token_accuracy": 0.7701909103269404,
"step": 279
},
{
"epoch": 0.8287088420273769,
"grad_norm": 0.5349226593971252,
"learning_rate": 1.816666487775416e-05,
"loss": 0.7453,
"mean_token_accuracy": 0.7745023453893125,
"step": 280
},
{
"epoch": 0.8316685164631891,
"grad_norm": 0.549005389213562,
"learning_rate": 1.8146670662372353e-05,
"loss": 0.7424,
"mean_token_accuracy": 0.7753068407668716,
"step": 281
},
{
"epoch": 0.8346281908990011,
"grad_norm": 0.5528567433357239,
"learning_rate": 1.8126579138282502e-05,
"loss": 0.7515,
"mean_token_accuracy": 0.7716154993402936,
"step": 282
},
{
"epoch": 0.8375878653348132,
"grad_norm": 0.47966665029525757,
"learning_rate": 1.8106390545469797e-05,
"loss": 0.7601,
"mean_token_accuracy": 0.7702052221245829,
"step": 283
},
{
"epoch": 0.8405475397706252,
"grad_norm": 0.5724716186523438,
"learning_rate": 1.8086105125078858e-05,
"loss": 0.7332,
"mean_token_accuracy": 0.7777038447981673,
"step": 284
},
{
"epoch": 0.8435072142064373,
"grad_norm": 0.5578106641769409,
"learning_rate": 1.8065723119410885e-05,
"loss": 0.7302,
"mean_token_accuracy": 0.7772090946791604,
"step": 285
},
{
"epoch": 0.8464668886422494,
"grad_norm": 0.5442110896110535,
"learning_rate": 1.804524477192075e-05,
"loss": 0.7334,
"mean_token_accuracy": 0.7762476620441784,
"step": 286
},
{
"epoch": 0.8494265630780614,
"grad_norm": 0.584141731262207,
"learning_rate": 1.8024670327214084e-05,
"loss": 0.7258,
"mean_token_accuracy": 0.7806851593884065,
"step": 287
},
{
"epoch": 0.8523862375138734,
"grad_norm": 0.598616361618042,
"learning_rate": 1.8004000031044363e-05,
"loss": 0.7793,
"mean_token_accuracy": 0.7645610353814324,
"step": 288
},
{
"epoch": 0.8553459119496856,
"grad_norm": 0.5531610250473022,
"learning_rate": 1.798323413030997e-05,
"loss": 0.7302,
"mean_token_accuracy": 0.7774894874371842,
"step": 289
},
{
"epoch": 0.8583055863854976,
"grad_norm": 0.637056291103363,
"learning_rate": 1.796237287305125e-05,
"loss": 0.7319,
"mean_token_accuracy": 0.776980981509457,
"step": 290
},
{
"epoch": 0.8612652608213096,
"grad_norm": 0.526637613773346,
"learning_rate": 1.7941416508447537e-05,
"loss": 0.737,
"mean_token_accuracy": 0.7755365142177981,
"step": 291
},
{
"epoch": 0.8642249352571217,
"grad_norm": 0.6117897033691406,
"learning_rate": 1.792036528681418e-05,
"loss": 0.7453,
"mean_token_accuracy": 0.7738772998083994,
"step": 292
},
{
"epoch": 0.8671846096929338,
"grad_norm": 0.57455974817276,
"learning_rate": 1.789921945959958e-05,
"loss": 0.7293,
"mean_token_accuracy": 0.7769571821797022,
"step": 293
},
{
"epoch": 0.8701442841287459,
"grad_norm": 0.5134701728820801,
"learning_rate": 1.7877979279382135e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.7810807816611623,
"step": 294
},
{
"epoch": 0.8731039585645579,
"grad_norm": 0.6354233026504517,
"learning_rate": 1.7856644999867264e-05,
"loss": 0.7491,
"mean_token_accuracy": 0.7724234282097991,
"step": 295
},
{
"epoch": 0.8760636330003699,
"grad_norm": 0.4881884753704071,
"learning_rate": 1.783521687588437e-05,
"loss": 0.6976,
"mean_token_accuracy": 0.7884361038620284,
"step": 296
},
{
"epoch": 0.8790233074361821,
"grad_norm": 0.6362212300300598,
"learning_rate": 1.781369516338378e-05,
"loss": 0.7398,
"mean_token_accuracy": 0.7755743850683346,
"step": 297
},
{
"epoch": 0.8819829818719941,
"grad_norm": 0.5661829710006714,
"learning_rate": 1.779208011943371e-05,
"loss": 0.734,
"mean_token_accuracy": 0.7765507531646713,
"step": 298
},
{
"epoch": 0.8849426563078061,
"grad_norm": 0.5010657906532288,
"learning_rate": 1.777037200221717e-05,
"loss": 0.7388,
"mean_token_accuracy": 0.7751515429566093,
"step": 299
},
{
"epoch": 0.8879023307436182,
"grad_norm": 0.6076653003692627,
"learning_rate": 1.77485710710289e-05,
"loss": 0.729,
"mean_token_accuracy": 0.7784857584094641,
"step": 300
},
{
"epoch": 0.8879023307436182,
"eval_loss": 0.7613943219184875,
"eval_mean_token_accuracy": 0.7661339070277478,
"eval_runtime": 24.531,
"eval_samples_per_second": 5.259,
"eval_steps_per_second": 1.345,
"step": 300
},
{
"epoch": 0.8908620051794303,
"grad_norm": 0.5315244197845459,
"learning_rate": 1.7726677586272263e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7800706307954832,
"step": 301
},
{
"epoch": 0.8938216796152423,
"grad_norm": 0.572488009929657,
"learning_rate": 1.7704691809456142e-05,
"loss": 0.7619,
"mean_token_accuracy": 0.7684274429192071,
"step": 302
},
{
"epoch": 0.8967813540510544,
"grad_norm": 0.530282735824585,
"learning_rate": 1.7682614003191807e-05,
"loss": 0.7192,
"mean_token_accuracy": 0.7826426067771499,
"step": 303
},
{
"epoch": 0.8997410284868664,
"grad_norm": 0.4633922278881073,
"learning_rate": 1.766044443118978e-05,
"loss": 0.7361,
"mean_token_accuracy": 0.7761481074845271,
"step": 304
},
{
"epoch": 0.9027007029226785,
"grad_norm": 0.5290641784667969,
"learning_rate": 1.76381833582567e-05,
"loss": 0.7347,
"mean_token_accuracy": 0.7752220969695607,
"step": 305
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.5756820440292358,
"learning_rate": 1.761583105029213e-05,
"loss": 0.7091,
"mean_token_accuracy": 0.7832374672534335,
"step": 306
},
{
"epoch": 0.9086200517943026,
"grad_norm": 0.4851895570755005,
"learning_rate": 1.7593387774285412e-05,
"loss": 0.7259,
"mean_token_accuracy": 0.7790017695040672,
"step": 307
},
{
"epoch": 0.9115797262301147,
"grad_norm": 0.5287590026855469,
"learning_rate": 1.7570853798312462e-05,
"loss": 0.7234,
"mean_token_accuracy": 0.7806430154123836,
"step": 308
},
{
"epoch": 0.9145394006659268,
"grad_norm": 0.5195660591125488,
"learning_rate": 1.7548229391532572e-05,
"loss": 0.6565,
"mean_token_accuracy": 0.7984747483843323,
"step": 309
},
{
"epoch": 0.9174990751017388,
"grad_norm": 0.4991515576839447,
"learning_rate": 1.7525514824185187e-05,
"loss": 0.7231,
"mean_token_accuracy": 0.7803891298617083,
"step": 310
},
{
"epoch": 0.9204587495375509,
"grad_norm": 0.4935111701488495,
"learning_rate": 1.750271036758669e-05,
"loss": 0.7564,
"mean_token_accuracy": 0.7712247656704234,
"step": 311
},
{
"epoch": 0.9234184239733629,
"grad_norm": 0.5220803618431091,
"learning_rate": 1.747981629412715e-05,
"loss": 0.7381,
"mean_token_accuracy": 0.7754488466026199,
"step": 312
},
{
"epoch": 0.926378098409175,
"grad_norm": 0.4899723529815674,
"learning_rate": 1.7456832877267083e-05,
"loss": 0.7147,
"mean_token_accuracy": 0.7830229071000929,
"step": 313
},
{
"epoch": 0.9293377728449871,
"grad_norm": 0.48553645610809326,
"learning_rate": 1.7433760391534166e-05,
"loss": 0.7249,
"mean_token_accuracy": 0.7801764351541252,
"step": 314
},
{
"epoch": 0.9322974472807991,
"grad_norm": 0.5421589016914368,
"learning_rate": 1.741059911251997e-05,
"loss": 0.7398,
"mean_token_accuracy": 0.7753942151228886,
"step": 315
},
{
"epoch": 0.9352571217166111,
"grad_norm": 0.5142074823379517,
"learning_rate": 1.7387349316876668e-05,
"loss": 0.7213,
"mean_token_accuracy": 0.7805064687638097,
"step": 316
},
{
"epoch": 0.9382167961524233,
"grad_norm": 0.4945102632045746,
"learning_rate": 1.7364011282313732e-05,
"loss": 0.713,
"mean_token_accuracy": 0.7815959672421611,
"step": 317
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.509762167930603,
"learning_rate": 1.7340585287594605e-05,
"loss": 0.7278,
"mean_token_accuracy": 0.778527115017442,
"step": 318
},
{
"epoch": 0.9441361450240473,
"grad_norm": 0.5061408877372742,
"learning_rate": 1.731707161253338e-05,
"loss": 0.7646,
"mean_token_accuracy": 0.7684516320654873,
"step": 319
},
{
"epoch": 0.9470958194598594,
"grad_norm": 0.4812653958797455,
"learning_rate": 1.7293470537991463e-05,
"loss": 0.7286,
"mean_token_accuracy": 0.7783584589981216,
"step": 320
},
{
"epoch": 0.9500554938956715,
"grad_norm": 0.5362148284912109,
"learning_rate": 1.7269782345874204e-05,
"loss": 0.7029,
"mean_token_accuracy": 0.785544384259824,
"step": 321
},
{
"epoch": 0.9530151683314836,
"grad_norm": 0.5306621193885803,
"learning_rate": 1.7246007319127547e-05,
"loss": 0.747,
"mean_token_accuracy": 0.774057502189317,
"step": 322
},
{
"epoch": 0.9559748427672956,
"grad_norm": 0.567263126373291,
"learning_rate": 1.7222145741734625e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.7807379482187227,
"step": 323
},
{
"epoch": 0.9589345172031076,
"grad_norm": 0.5175469517707825,
"learning_rate": 1.7198197898712402e-05,
"loss": 0.7275,
"mean_token_accuracy": 0.7786112184337877,
"step": 324
},
{
"epoch": 0.9618941916389198,
"grad_norm": 0.5404612421989441,
"learning_rate": 1.717416407610824e-05,
"loss": 0.689,
"mean_token_accuracy": 0.7877453794929681,
"step": 325
},
{
"epoch": 0.9648538660747318,
"grad_norm": 0.5193690061569214,
"learning_rate": 1.7150044560996488e-05,
"loss": 0.747,
"mean_token_accuracy": 0.7742212613379238,
"step": 326
},
{
"epoch": 0.9678135405105438,
"grad_norm": 0.4946900010108948,
"learning_rate": 1.7125839641475074e-05,
"loss": 0.7471,
"mean_token_accuracy": 0.7747309622069193,
"step": 327
},
{
"epoch": 0.9707732149463559,
"grad_norm": 0.48158422112464905,
"learning_rate": 1.7101549606662025e-05,
"loss": 0.7588,
"mean_token_accuracy": 0.7672773960785951,
"step": 328
},
{
"epoch": 0.973732889382168,
"grad_norm": 0.49433794617652893,
"learning_rate": 1.7077174746692054e-05,
"loss": 0.7086,
"mean_token_accuracy": 0.7835172366515396,
"step": 329
},
{
"epoch": 0.97669256381798,
"grad_norm": 0.529739499092102,
"learning_rate": 1.7052715352713076e-05,
"loss": 0.692,
"mean_token_accuracy": 0.7882518659447058,
"step": 330
},
{
"epoch": 0.9796522382537921,
"grad_norm": 0.49609243869781494,
"learning_rate": 1.7028171716882714e-05,
"loss": 0.727,
"mean_token_accuracy": 0.7790673878869031,
"step": 331
},
{
"epoch": 0.9826119126896041,
"grad_norm": 0.5060005784034729,
"learning_rate": 1.7003544132364847e-05,
"loss": 0.7492,
"mean_token_accuracy": 0.7722196174397824,
"step": 332
},
{
"epoch": 0.9855715871254163,
"grad_norm": 0.5200058817863464,
"learning_rate": 1.6978832893326074e-05,
"loss": 0.7274,
"mean_token_accuracy": 0.7771648765922762,
"step": 333
},
{
"epoch": 0.9885312615612283,
"grad_norm": 0.5111742615699768,
"learning_rate": 1.6954038294932215e-05,
"loss": 0.727,
"mean_token_accuracy": 0.7788486720026189,
"step": 334
},
{
"epoch": 0.9914909359970403,
"grad_norm": 0.49541163444519043,
"learning_rate": 1.692916063334479e-05,
"loss": 0.716,
"mean_token_accuracy": 0.7805707677819913,
"step": 335
},
{
"epoch": 0.9944506104328524,
"grad_norm": 0.5204536318778992,
"learning_rate": 1.690420020571747e-05,
"loss": 0.7857,
"mean_token_accuracy": 0.7611835238050416,
"step": 336
},
{
"epoch": 0.9974102848686645,
"grad_norm": 0.49425816535949707,
"learning_rate": 1.6879157310192537e-05,
"loss": 0.7237,
"mean_token_accuracy": 0.7797621176940523,
"step": 337
},
{
"epoch": 1.002959674435812,
"grad_norm": 0.9215492010116577,
"learning_rate": 1.685403224589731e-05,
"loss": 1.431,
"mean_token_accuracy": 0.781872374274613,
"step": 338
},
{
"epoch": 1.005919348871624,
"grad_norm": 0.4850497841835022,
"learning_rate": 1.6828825312940594e-05,
"loss": 0.7123,
"mean_token_accuracy": 0.7815581594577298,
"step": 339
},
{
"epoch": 1.0088790233074363,
"grad_norm": 0.5388746857643127,
"learning_rate": 1.6803536812409077e-05,
"loss": 0.6533,
"mean_token_accuracy": 0.7976729613611061,
"step": 340
},
{
"epoch": 1.0118386977432483,
"grad_norm": 0.5414032340049744,
"learning_rate": 1.6778167046363735e-05,
"loss": 0.663,
"mean_token_accuracy": 0.7950990029075803,
"step": 341
},
{
"epoch": 1.0147983721790603,
"grad_norm": 0.5482701063156128,
"learning_rate": 1.675271631783623e-05,
"loss": 0.6924,
"mean_token_accuracy": 0.7870997024486296,
"step": 342
},
{
"epoch": 1.0177580466148723,
"grad_norm": 0.5530447363853455,
"learning_rate": 1.672718493082529e-05,
"loss": 0.6957,
"mean_token_accuracy": 0.7862520808317638,
"step": 343
},
{
"epoch": 1.0207177210506844,
"grad_norm": 0.5601862072944641,
"learning_rate": 1.6701573190293076e-05,
"loss": 0.7079,
"mean_token_accuracy": 0.7811090177290159,
"step": 344
},
{
"epoch": 1.0236773954864964,
"grad_norm": 0.5983414649963379,
"learning_rate": 1.667588140216154e-05,
"loss": 0.7177,
"mean_token_accuracy": 0.7782319335787533,
"step": 345
},
{
"epoch": 1.0266370699223086,
"grad_norm": 0.5023918747901917,
"learning_rate": 1.6650109873308763e-05,
"loss": 0.6742,
"mean_token_accuracy": 0.7925658601690396,
"step": 346
},
{
"epoch": 1.0295967443581207,
"grad_norm": 0.5499829053878784,
"learning_rate": 1.6624258911565312e-05,
"loss": 0.6964,
"mean_token_accuracy": 0.7845868210400818,
"step": 347
},
{
"epoch": 1.0325564187939327,
"grad_norm": 0.6044626235961914,
"learning_rate": 1.6598328825710536e-05,
"loss": 0.7433,
"mean_token_accuracy": 0.7716598489636504,
"step": 348
},
{
"epoch": 1.0355160932297447,
"grad_norm": 0.5895024538040161,
"learning_rate": 1.6572319925468892e-05,
"loss": 0.6851,
"mean_token_accuracy": 0.7886055642998372,
"step": 349
},
{
"epoch": 1.0384757676655567,
"grad_norm": 0.4884833097457886,
"learning_rate": 1.654623252150624e-05,
"loss": 0.6874,
"mean_token_accuracy": 0.7882489689414884,
"step": 350
},
{
"epoch": 1.0414354421013687,
"grad_norm": 0.48958876729011536,
"learning_rate": 1.6520066925426146e-05,
"loss": 0.6761,
"mean_token_accuracy": 0.789869173725892,
"step": 351
},
{
"epoch": 1.044395116537181,
"grad_norm": 0.5143749713897705,
"learning_rate": 1.6493823449766137e-05,
"loss": 0.7002,
"mean_token_accuracy": 0.7832564985016889,
"step": 352
},
{
"epoch": 1.047354790972993,
"grad_norm": 0.5188062191009521,
"learning_rate": 1.6467502407993995e-05,
"loss": 0.6785,
"mean_token_accuracy": 0.7895198082299716,
"step": 353
},
{
"epoch": 1.050314465408805,
"grad_norm": 0.5853990316390991,
"learning_rate": 1.644110411450398e-05,
"loss": 0.7027,
"mean_token_accuracy": 0.7840915967094005,
"step": 354
},
{
"epoch": 1.053274139844617,
"grad_norm": 0.48951801657676697,
"learning_rate": 1.6414628884613106e-05,
"loss": 0.6905,
"mean_token_accuracy": 0.7872202318165091,
"step": 355
},
{
"epoch": 1.056233814280429,
"grad_norm": 0.5374004244804382,
"learning_rate": 1.6388077034557355e-05,
"loss": 0.7107,
"mean_token_accuracy": 0.7806436850766835,
"step": 356
},
{
"epoch": 1.0591934887162413,
"grad_norm": 0.49236002564430237,
"learning_rate": 1.6361448881487913e-05,
"loss": 0.6762,
"mean_token_accuracy": 0.7917445809376139,
"step": 357
},
{
"epoch": 1.0621531631520533,
"grad_norm": 0.4819602966308594,
"learning_rate": 1.6334744743467366e-05,
"loss": 0.6876,
"mean_token_accuracy": 0.7879321033092377,
"step": 358
},
{
"epoch": 1.0651128375878653,
"grad_norm": 0.47309836745262146,
"learning_rate": 1.6307964939465914e-05,
"loss": 0.684,
"mean_token_accuracy": 0.7893314943134146,
"step": 359
},
{
"epoch": 1.0680725120236774,
"grad_norm": 0.5006982088088989,
"learning_rate": 1.628110978935756e-05,
"loss": 0.6899,
"mean_token_accuracy": 0.7870876825021131,
"step": 360
},
{
"epoch": 1.0710321864594894,
"grad_norm": 0.5221154093742371,
"learning_rate": 1.625417961391628e-05,
"loss": 0.6475,
"mean_token_accuracy": 0.7990545634414727,
"step": 361
},
{
"epoch": 1.0739918608953016,
"grad_norm": 0.4775597155094147,
"learning_rate": 1.62271747348122e-05,
"loss": 0.6934,
"mean_token_accuracy": 0.787116997295676,
"step": 362
},
{
"epoch": 1.0769515353311137,
"grad_norm": 0.5393570065498352,
"learning_rate": 1.6200095474607753e-05,
"loss": 0.6892,
"mean_token_accuracy": 0.7863585652394626,
"step": 363
},
{
"epoch": 1.0799112097669257,
"grad_norm": 0.4533829689025879,
"learning_rate": 1.6172942156753822e-05,
"loss": 0.6737,
"mean_token_accuracy": 0.791843095021805,
"step": 364
},
{
"epoch": 1.0828708842027377,
"grad_norm": 0.462872177362442,
"learning_rate": 1.614571510558588e-05,
"loss": 0.6741,
"mean_token_accuracy": 0.7927564512367392,
"step": 365
},
{
"epoch": 1.0858305586385497,
"grad_norm": 0.5344141125679016,
"learning_rate": 1.6118414646320115e-05,
"loss": 0.678,
"mean_token_accuracy": 0.7914964738663861,
"step": 366
},
{
"epoch": 1.0887902330743617,
"grad_norm": 0.5266002416610718,
"learning_rate": 1.6091041105049542e-05,
"loss": 0.6946,
"mean_token_accuracy": 0.7852726685975778,
"step": 367
},
{
"epoch": 1.091749907510174,
"grad_norm": 0.4648328125476837,
"learning_rate": 1.6063594808740112e-05,
"loss": 0.6415,
"mean_token_accuracy": 0.8008673556038499,
"step": 368
},
{
"epoch": 1.094709581945986,
"grad_norm": 0.5501207709312439,
"learning_rate": 1.6036076085226813e-05,
"loss": 0.7327,
"mean_token_accuracy": 0.7737077885315848,
"step": 369
},
{
"epoch": 1.097669256381798,
"grad_norm": 0.49827733635902405,
"learning_rate": 1.6008485263209742e-05,
"loss": 0.6509,
"mean_token_accuracy": 0.7995274953751699,
"step": 370
},
{
"epoch": 1.10062893081761,
"grad_norm": 0.4650176465511322,
"learning_rate": 1.598082267225018e-05,
"loss": 0.7112,
"mean_token_accuracy": 0.7804922990268738,
"step": 371
},
{
"epoch": 1.103588605253422,
"grad_norm": 0.5303501486778259,
"learning_rate": 1.595308864276666e-05,
"loss": 0.7211,
"mean_token_accuracy": 0.7776063180667486,
"step": 372
},
{
"epoch": 1.106548279689234,
"grad_norm": 0.5931088924407959,
"learning_rate": 1.592528350603103e-05,
"loss": 0.6912,
"mean_token_accuracy": 0.7860275624390939,
"step": 373
},
{
"epoch": 1.1095079541250463,
"grad_norm": 0.464376300573349,
"learning_rate": 1.5897407594164468e-05,
"loss": 0.6996,
"mean_token_accuracy": 0.7857896692996122,
"step": 374
},
{
"epoch": 1.1124676285608583,
"grad_norm": 0.5060982704162598,
"learning_rate": 1.586946124013354e-05,
"loss": 0.6827,
"mean_token_accuracy": 0.7901175041980462,
"step": 375
},
{
"epoch": 1.1154273029966704,
"grad_norm": 0.5316497683525085,
"learning_rate": 1.5841444777746232e-05,
"loss": 0.6454,
"mean_token_accuracy": 0.7995927306906477,
"step": 376
},
{
"epoch": 1.1183869774324824,
"grad_norm": 0.5280824303627014,
"learning_rate": 1.5813358541647915e-05,
"loss": 0.6821,
"mean_token_accuracy": 0.7899257721771863,
"step": 377
},
{
"epoch": 1.1213466518682944,
"grad_norm": 0.4961848258972168,
"learning_rate": 1.578520286731741e-05,
"loss": 0.7106,
"mean_token_accuracy": 0.7801769327002734,
"step": 378
},
{
"epoch": 1.1243063263041067,
"grad_norm": 0.543953001499176,
"learning_rate": 1.575697809106292e-05,
"loss": 0.6922,
"mean_token_accuracy": 0.785628822049384,
"step": 379
},
{
"epoch": 1.1272660007399187,
"grad_norm": 0.5489509105682373,
"learning_rate": 1.5728684550018066e-05,
"loss": 0.6936,
"mean_token_accuracy": 0.7861259742540445,
"step": 380
},
{
"epoch": 1.1302256751757307,
"grad_norm": 0.48247000575065613,
"learning_rate": 1.570032258213783e-05,
"loss": 0.702,
"mean_token_accuracy": 0.781727569386528,
"step": 381
},
{
"epoch": 1.1331853496115427,
"grad_norm": 0.5495713949203491,
"learning_rate": 1.5671892526194515e-05,
"loss": 0.6792,
"mean_token_accuracy": 0.7919662989910665,
"step": 382
},
{
"epoch": 1.1361450240473547,
"grad_norm": 0.4841765761375427,
"learning_rate": 1.564339472177373e-05,
"loss": 0.6693,
"mean_token_accuracy": 0.7934251880120227,
"step": 383
},
{
"epoch": 1.1391046984831668,
"grad_norm": 0.5036046504974365,
"learning_rate": 1.561482950927029e-05,
"loss": 0.7035,
"mean_token_accuracy": 0.7822988951176773,
"step": 384
},
{
"epoch": 1.142064372918979,
"grad_norm": 0.550046443939209,
"learning_rate": 1.5586197229884185e-05,
"loss": 0.6558,
"mean_token_accuracy": 0.797441361172838,
"step": 385
},
{
"epoch": 1.145024047354791,
"grad_norm": 0.5752468705177307,
"learning_rate": 1.5557498225616488e-05,
"loss": 0.7081,
"mean_token_accuracy": 0.7824781572463329,
"step": 386
},
{
"epoch": 1.147983721790603,
"grad_norm": 0.4782570004463196,
"learning_rate": 1.5528732839265272e-05,
"loss": 0.7,
"mean_token_accuracy": 0.7834877131177364,
"step": 387
},
{
"epoch": 1.150943396226415,
"grad_norm": 0.5209779739379883,
"learning_rate": 1.549990141442153e-05,
"loss": 0.6823,
"mean_token_accuracy": 0.7903034725828352,
"step": 388
},
{
"epoch": 1.153903070662227,
"grad_norm": 0.510071337223053,
"learning_rate": 1.5471004295465034e-05,
"loss": 0.7337,
"mean_token_accuracy": 0.7748414033827098,
"step": 389
},
{
"epoch": 1.156862745098039,
"grad_norm": 0.5067256689071655,
"learning_rate": 1.5442041827560274e-05,
"loss": 0.6945,
"mean_token_accuracy": 0.7857010244801683,
"step": 390
},
{
"epoch": 1.1598224195338513,
"grad_norm": 0.5134366154670715,
"learning_rate": 1.5413014356652287e-05,
"loss": 0.6761,
"mean_token_accuracy": 0.7901567665550651,
"step": 391
},
{
"epoch": 1.1627820939696634,
"grad_norm": 0.49565669894218445,
"learning_rate": 1.538392222946255e-05,
"loss": 0.6992,
"mean_token_accuracy": 0.7850131511442856,
"step": 392
},
{
"epoch": 1.1657417684054754,
"grad_norm": 0.4513917565345764,
"learning_rate": 1.5354765793484834e-05,
"loss": 0.6779,
"mean_token_accuracy": 0.7922368459696144,
"step": 393
},
{
"epoch": 1.1687014428412874,
"grad_norm": 0.5351982116699219,
"learning_rate": 1.5325545396981053e-05,
"loss": 0.6937,
"mean_token_accuracy": 0.7857501806841758,
"step": 394
},
{
"epoch": 1.1716611172770994,
"grad_norm": 0.47825103998184204,
"learning_rate": 1.5296261388977107e-05,
"loss": 0.629,
"mean_token_accuracy": 0.8047603633681424,
"step": 395
},
{
"epoch": 1.1746207917129117,
"grad_norm": 0.48426443338394165,
"learning_rate": 1.52669141192587e-05,
"loss": 0.7218,
"mean_token_accuracy": 0.7786340167760629,
"step": 396
},
{
"epoch": 1.1775804661487237,
"grad_norm": 0.510691225528717,
"learning_rate": 1.5237503938367186e-05,
"loss": 0.6961,
"mean_token_accuracy": 0.7848220716497867,
"step": 397
},
{
"epoch": 1.1805401405845357,
"grad_norm": 0.4977818727493286,
"learning_rate": 1.5208031197595357e-05,
"loss": 0.6181,
"mean_token_accuracy": 0.808352793166422,
"step": 398
},
{
"epoch": 1.1834998150203477,
"grad_norm": 0.45590656995773315,
"learning_rate": 1.5178496248983254e-05,
"loss": 0.6445,
"mean_token_accuracy": 0.7992991854336597,
"step": 399
},
{
"epoch": 1.1864594894561598,
"grad_norm": 0.5166680812835693,
"learning_rate": 1.5148899445313983e-05,
"loss": 0.6391,
"mean_token_accuracy": 0.8008235442677688,
"step": 400
},
{
"epoch": 1.1864594894561598,
"eval_loss": 0.753233015537262,
"eval_mean_token_accuracy": 0.7678493271850204,
"eval_runtime": 24.4762,
"eval_samples_per_second": 5.27,
"eval_steps_per_second": 1.348,
"step": 400
},
{
"epoch": 1.189419163891972,
"grad_norm": 0.4777900278568268,
"learning_rate": 1.5119241140109466e-05,
"loss": 0.6447,
"mean_token_accuracy": 0.8008284367677996,
"step": 401
},
{
"epoch": 1.192378838327784,
"grad_norm": 0.4674142301082611,
"learning_rate": 1.5089521687626243e-05,
"loss": 0.6426,
"mean_token_accuracy": 0.8002595216069462,
"step": 402
},
{
"epoch": 1.195338512763596,
"grad_norm": 0.5119103789329529,
"learning_rate": 1.505974144285124e-05,
"loss": 0.7143,
"mean_token_accuracy": 0.7807192708136647,
"step": 403
},
{
"epoch": 1.198298187199408,
"grad_norm": 0.5238728523254395,
"learning_rate": 1.5029900761497507e-05,
"loss": 0.7459,
"mean_token_accuracy": 0.7719988622051683,
"step": 404
},
{
"epoch": 1.20125786163522,
"grad_norm": 0.5216233134269714,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.6977,
"mean_token_accuracy": 0.7839726890839798,
"step": 405
},
{
"epoch": 1.204217536071032,
"grad_norm": 0.509964108467102,
"learning_rate": 1.4970039515511303e-05,
"loss": 0.6809,
"mean_token_accuracy": 0.7893634011753464,
"step": 406
},
{
"epoch": 1.2071772105068441,
"grad_norm": 0.5653720498085022,
"learning_rate": 1.4940019665897363e-05,
"loss": 0.6897,
"mean_token_accuracy": 0.7868935096910736,
"step": 407
},
{
"epoch": 1.2101368849426564,
"grad_norm": 0.4962683618068695,
"learning_rate": 1.4909940809733223e-05,
"loss": 0.7354,
"mean_token_accuracy": 0.7726758488051101,
"step": 408
},
{
"epoch": 1.2130965593784684,
"grad_norm": 0.5176084637641907,
"learning_rate": 1.4879803306298736e-05,
"loss": 0.6964,
"mean_token_accuracy": 0.7838358295177021,
"step": 409
},
{
"epoch": 1.2160562338142804,
"grad_norm": 0.513697624206543,
"learning_rate": 1.4849607515574276e-05,
"loss": 0.6492,
"mean_token_accuracy": 0.799568203590832,
"step": 410
},
{
"epoch": 1.2190159082500924,
"grad_norm": 0.4567902684211731,
"learning_rate": 1.4819353798236427e-05,
"loss": 0.6991,
"mean_token_accuracy": 0.7838256081866393,
"step": 411
},
{
"epoch": 1.2219755826859044,
"grad_norm": 0.5139224529266357,
"learning_rate": 1.4789042515653687e-05,
"loss": 0.6946,
"mean_token_accuracy": 0.7852177162018236,
"step": 412
},
{
"epoch": 1.2249352571217167,
"grad_norm": 0.5555658936500549,
"learning_rate": 1.4758674029882152e-05,
"loss": 0.6539,
"mean_token_accuracy": 0.7970551349204403,
"step": 413
},
{
"epoch": 1.2278949315575287,
"grad_norm": 0.4890614449977875,
"learning_rate": 1.4728248703661183e-05,
"loss": 0.695,
"mean_token_accuracy": 0.7845206023697728,
"step": 414
},
{
"epoch": 1.2308546059933407,
"grad_norm": 0.47974392771720886,
"learning_rate": 1.4697766900409076e-05,
"loss": 0.669,
"mean_token_accuracy": 0.7929167835356624,
"step": 415
},
{
"epoch": 1.2338142804291528,
"grad_norm": 0.5015913248062134,
"learning_rate": 1.466722898421873e-05,
"loss": 0.7009,
"mean_token_accuracy": 0.7827139356082893,
"step": 416
},
{
"epoch": 1.2367739548649648,
"grad_norm": 0.49240073561668396,
"learning_rate": 1.4636635319853274e-05,
"loss": 0.6685,
"mean_token_accuracy": 0.792534979177688,
"step": 417
},
{
"epoch": 1.239733629300777,
"grad_norm": 0.48550987243652344,
"learning_rate": 1.4605986272741748e-05,
"loss": 0.6908,
"mean_token_accuracy": 0.7868828026774352,
"step": 418
},
{
"epoch": 1.242693303736589,
"grad_norm": 0.47983378171920776,
"learning_rate": 1.4575282208974704e-05,
"loss": 0.6831,
"mean_token_accuracy": 0.7891199345178915,
"step": 419
},
{
"epoch": 1.245652978172401,
"grad_norm": 0.49261724948883057,
"learning_rate": 1.4544523495299843e-05,
"loss": 0.6831,
"mean_token_accuracy": 0.7881435108832517,
"step": 420
},
{
"epoch": 1.248612652608213,
"grad_norm": 0.47099459171295166,
"learning_rate": 1.4513710499117648e-05,
"loss": 0.6307,
"mean_token_accuracy": 0.8053076982273811,
"step": 421
},
{
"epoch": 1.251572327044025,
"grad_norm": 0.4534473121166229,
"learning_rate": 1.4482843588476976e-05,
"loss": 0.6953,
"mean_token_accuracy": 0.7836745290375378,
"step": 422
},
{
"epoch": 1.2545320014798373,
"grad_norm": 0.4827975630760193,
"learning_rate": 1.445192313207067e-05,
"loss": 0.6769,
"mean_token_accuracy": 0.7917014445996506,
"step": 423
},
{
"epoch": 1.2574916759156491,
"grad_norm": 0.48446017503738403,
"learning_rate": 1.4420949499231172e-05,
"loss": 0.6811,
"mean_token_accuracy": 0.7885621949952477,
"step": 424
},
{
"epoch": 1.2604513503514614,
"grad_norm": 0.46176275610923767,
"learning_rate": 1.4389923059926064e-05,
"loss": 0.6715,
"mean_token_accuracy": 0.7921377530314322,
"step": 425
},
{
"epoch": 1.2634110247872734,
"grad_norm": 0.4933745265007019,
"learning_rate": 1.4358844184753713e-05,
"loss": 0.6516,
"mean_token_accuracy": 0.7976899559939264,
"step": 426
},
{
"epoch": 1.2663706992230854,
"grad_norm": 0.4907665252685547,
"learning_rate": 1.432771324493879e-05,
"loss": 0.675,
"mean_token_accuracy": 0.7905862204832549,
"step": 427
},
{
"epoch": 1.2693303736588974,
"grad_norm": 0.4861429035663605,
"learning_rate": 1.4296530612327864e-05,
"loss": 0.7044,
"mean_token_accuracy": 0.782618434308195,
"step": 428
},
{
"epoch": 1.2722900480947095,
"grad_norm": 0.44409534335136414,
"learning_rate": 1.4265296659384956e-05,
"loss": 0.702,
"mean_token_accuracy": 0.7835227926569839,
"step": 429
},
{
"epoch": 1.2752497225305217,
"grad_norm": 0.47325289249420166,
"learning_rate": 1.4234011759187084e-05,
"loss": 0.6907,
"mean_token_accuracy": 0.7883719669584818,
"step": 430
},
{
"epoch": 1.2782093969663337,
"grad_norm": 0.4296591281890869,
"learning_rate": 1.4202676285419811e-05,
"loss": 0.6445,
"mean_token_accuracy": 0.799964374790151,
"step": 431
},
{
"epoch": 1.2811690714021458,
"grad_norm": 0.4680195152759552,
"learning_rate": 1.4171290612372781e-05,
"loss": 0.6913,
"mean_token_accuracy": 0.7865936068853461,
"step": 432
},
{
"epoch": 1.2841287458379578,
"grad_norm": 0.47732165455818176,
"learning_rate": 1.4139855114935253e-05,
"loss": 0.665,
"mean_token_accuracy": 0.795472867454343,
"step": 433
},
{
"epoch": 1.2870884202737698,
"grad_norm": 0.44656407833099365,
"learning_rate": 1.410837016859161e-05,
"loss": 0.6747,
"mean_token_accuracy": 0.790485626527416,
"step": 434
},
{
"epoch": 1.290048094709582,
"grad_norm": 0.4626164734363556,
"learning_rate": 1.4076836149416889e-05,
"loss": 0.6591,
"mean_token_accuracy": 0.7963842598244837,
"step": 435
},
{
"epoch": 1.293007769145394,
"grad_norm": 0.4850873053073883,
"learning_rate": 1.4045253434072278e-05,
"loss": 0.7126,
"mean_token_accuracy": 0.7804075548829805,
"step": 436
},
{
"epoch": 1.295967443581206,
"grad_norm": 0.4946662187576294,
"learning_rate": 1.4013622399800628e-05,
"loss": 0.7237,
"mean_token_accuracy": 0.777694595209445,
"step": 437
},
{
"epoch": 1.298927118017018,
"grad_norm": 0.515221893787384,
"learning_rate": 1.3981943424421932e-05,
"loss": 0.6982,
"mean_token_accuracy": 0.784025918890703,
"step": 438
},
{
"epoch": 1.3018867924528301,
"grad_norm": 0.4743560552597046,
"learning_rate": 1.3950216886328818e-05,
"loss": 0.698,
"mean_token_accuracy": 0.7843463257420568,
"step": 439
},
{
"epoch": 1.3048464668886424,
"grad_norm": 0.47368329763412476,
"learning_rate": 1.3918443164482048e-05,
"loss": 0.6961,
"mean_token_accuracy": 0.7865385891914267,
"step": 440
},
{
"epoch": 1.3078061413244544,
"grad_norm": 0.4459000825881958,
"learning_rate": 1.3886622638405953e-05,
"loss": 0.6955,
"mean_token_accuracy": 0.7852747333942596,
"step": 441
},
{
"epoch": 1.3107658157602664,
"grad_norm": 0.47365012764930725,
"learning_rate": 1.3854755688183941e-05,
"loss": 0.7227,
"mean_token_accuracy": 0.7778711159999969,
"step": 442
},
{
"epoch": 1.3137254901960784,
"grad_norm": 0.46061503887176514,
"learning_rate": 1.3822842694453923e-05,
"loss": 0.6885,
"mean_token_accuracy": 0.7876893449725652,
"step": 443
},
{
"epoch": 1.3166851646318904,
"grad_norm": 0.4780057370662689,
"learning_rate": 1.3790884038403796e-05,
"loss": 0.6911,
"mean_token_accuracy": 0.7863533950002012,
"step": 444
},
{
"epoch": 1.3196448390677027,
"grad_norm": 0.48519885540008545,
"learning_rate": 1.375888010176686e-05,
"loss": 0.6666,
"mean_token_accuracy": 0.7935298420501086,
"step": 445
},
{
"epoch": 1.3226045135035145,
"grad_norm": 0.4679955840110779,
"learning_rate": 1.3726831266817278e-05,
"loss": 0.6885,
"mean_token_accuracy": 0.7879594429456447,
"step": 446
},
{
"epoch": 1.3255641879393267,
"grad_norm": 0.4626809060573578,
"learning_rate": 1.3694737916365517e-05,
"loss": 0.7021,
"mean_token_accuracy": 0.7828708121314737,
"step": 447
},
{
"epoch": 1.3285238623751388,
"grad_norm": 0.45779362320899963,
"learning_rate": 1.3662600433753746e-05,
"loss": 0.6896,
"mean_token_accuracy": 0.7876785995413643,
"step": 448
},
{
"epoch": 1.3314835368109508,
"grad_norm": 0.4595906436443329,
"learning_rate": 1.3630419202851287e-05,
"loss": 0.6979,
"mean_token_accuracy": 0.7838014568334657,
"step": 449
},
{
"epoch": 1.3344432112467628,
"grad_norm": 0.4679829776287079,
"learning_rate": 1.3598194608050011e-05,
"loss": 0.7047,
"mean_token_accuracy": 0.7832954223966397,
"step": 450
},
{
"epoch": 1.3374028856825748,
"grad_norm": 0.49509483575820923,
"learning_rate": 1.3565927034259757e-05,
"loss": 0.6956,
"mean_token_accuracy": 0.7861987291079401,
"step": 451
},
{
"epoch": 1.340362560118387,
"grad_norm": 0.47606754302978516,
"learning_rate": 1.3533616866903736e-05,
"loss": 0.6774,
"mean_token_accuracy": 0.7900551101111528,
"step": 452
},
{
"epoch": 1.343322234554199,
"grad_norm": 0.44316449761390686,
"learning_rate": 1.3501264491913909e-05,
"loss": 0.7,
"mean_token_accuracy": 0.7830548189627489,
"step": 453
},
{
"epoch": 1.346281908990011,
"grad_norm": 0.499174028635025,
"learning_rate": 1.3468870295726399e-05,
"loss": 0.7203,
"mean_token_accuracy": 0.7776105610712533,
"step": 454
},
{
"epoch": 1.3492415834258231,
"grad_norm": 0.43882501125335693,
"learning_rate": 1.3436434665276865e-05,
"loss": 0.6745,
"mean_token_accuracy": 0.7913862306577221,
"step": 455
},
{
"epoch": 1.3522012578616351,
"grad_norm": 0.49250712990760803,
"learning_rate": 1.3403957987995884e-05,
"loss": 0.68,
"mean_token_accuracy": 0.7894371521316413,
"step": 456
},
{
"epoch": 1.3551609322974474,
"grad_norm": 0.46765249967575073,
"learning_rate": 1.3371440651804313e-05,
"loss": 0.7066,
"mean_token_accuracy": 0.7817244510128959,
"step": 457
},
{
"epoch": 1.3581206067332594,
"grad_norm": 0.46519362926483154,
"learning_rate": 1.3338883045108674e-05,
"loss": 0.6852,
"mean_token_accuracy": 0.7875893561938507,
"step": 458
},
{
"epoch": 1.3610802811690714,
"grad_norm": 0.5211879014968872,
"learning_rate": 1.3306285556796494e-05,
"loss": 0.6873,
"mean_token_accuracy": 0.7886326578047633,
"step": 459
},
{
"epoch": 1.3640399556048834,
"grad_norm": 0.4436584413051605,
"learning_rate": 1.327364857623168e-05,
"loss": 0.7006,
"mean_token_accuracy": 0.7844141672519914,
"step": 460
},
{
"epoch": 1.3669996300406955,
"grad_norm": 0.49897250533103943,
"learning_rate": 1.3240972493249846e-05,
"loss": 0.6907,
"mean_token_accuracy": 0.7872768784393989,
"step": 461
},
{
"epoch": 1.3699593044765077,
"grad_norm": 0.44192755222320557,
"learning_rate": 1.3208257698153677e-05,
"loss": 0.7179,
"mean_token_accuracy": 0.7772223223597873,
"step": 462
},
{
"epoch": 1.3729189789123195,
"grad_norm": 0.48224934935569763,
"learning_rate": 1.3175504581708261e-05,
"loss": 0.6884,
"mean_token_accuracy": 0.7876441851387866,
"step": 463
},
{
"epoch": 1.3758786533481318,
"grad_norm": 0.44167572259902954,
"learning_rate": 1.3142713535136413e-05,
"loss": 0.6964,
"mean_token_accuracy": 0.7840998538649302,
"step": 464
},
{
"epoch": 1.3788383277839438,
"grad_norm": 0.5188360214233398,
"learning_rate": 1.3109884950114007e-05,
"loss": 0.6979,
"mean_token_accuracy": 0.7830517429111471,
"step": 465
},
{
"epoch": 1.3817980022197558,
"grad_norm": 0.4949224293231964,
"learning_rate": 1.3077019218765306e-05,
"loss": 0.6686,
"mean_token_accuracy": 0.7925575804293147,
"step": 466
},
{
"epoch": 1.3847576766555678,
"grad_norm": 0.4614505171775818,
"learning_rate": 1.3044116733658261e-05,
"loss": 0.6745,
"mean_token_accuracy": 0.7904813977673216,
"step": 467
},
{
"epoch": 1.3877173510913798,
"grad_norm": 0.47585147619247437,
"learning_rate": 1.3011177887799846e-05,
"loss": 0.6596,
"mean_token_accuracy": 0.7969142283708234,
"step": 468
},
{
"epoch": 1.390677025527192,
"grad_norm": 0.4733677804470062,
"learning_rate": 1.2978203074631335e-05,
"loss": 0.6837,
"mean_token_accuracy": 0.7885936546719822,
"step": 469
},
{
"epoch": 1.393636699963004,
"grad_norm": 0.47128206491470337,
"learning_rate": 1.2945192688023625e-05,
"loss": 0.7228,
"mean_token_accuracy": 0.777582654281462,
"step": 470
},
{
"epoch": 1.3965963743988161,
"grad_norm": 0.5573126077651978,
"learning_rate": 1.2912147122272523e-05,
"loss": 0.692,
"mean_token_accuracy": 0.7851007004118511,
"step": 471
},
{
"epoch": 1.3995560488346281,
"grad_norm": 0.5249556303024292,
"learning_rate": 1.287906677209403e-05,
"loss": 0.666,
"mean_token_accuracy": 0.7935855307222649,
"step": 472
},
{
"epoch": 1.4025157232704402,
"grad_norm": 0.5098072290420532,
"learning_rate": 1.2845952032619651e-05,
"loss": 0.7169,
"mean_token_accuracy": 0.78048614348136,
"step": 473
},
{
"epoch": 1.4054753977062524,
"grad_norm": 0.5147253274917603,
"learning_rate": 1.2812803299391629e-05,
"loss": 0.7285,
"mean_token_accuracy": 0.775834970458234,
"step": 474
},
{
"epoch": 1.4084350721420644,
"grad_norm": 0.529493510723114,
"learning_rate": 1.2779620968358276e-05,
"loss": 0.6582,
"mean_token_accuracy": 0.7956748329946638,
"step": 475
},
{
"epoch": 1.4113947465778764,
"grad_norm": 0.5070955753326416,
"learning_rate": 1.2746405435869198e-05,
"loss": 0.6674,
"mean_token_accuracy": 0.7915634181103908,
"step": 476
},
{
"epoch": 1.4143544210136885,
"grad_norm": 0.5139186978340149,
"learning_rate": 1.271315709867059e-05,
"loss": 0.7037,
"mean_token_accuracy": 0.7825460416635028,
"step": 477
},
{
"epoch": 1.4173140954495005,
"grad_norm": 0.5307909250259399,
"learning_rate": 1.2679876353900482e-05,
"loss": 0.7082,
"mean_token_accuracy": 0.7814352090483259,
"step": 478
},
{
"epoch": 1.4202737698853127,
"grad_norm": 0.4850543737411499,
"learning_rate": 1.2646563599083997e-05,
"loss": 0.724,
"mean_token_accuracy": 0.7763536423746681,
"step": 479
},
{
"epoch": 1.4232334443211248,
"grad_norm": 0.5001718997955322,
"learning_rate": 1.2613219232128608e-05,
"loss": 0.6629,
"mean_token_accuracy": 0.7942459104666942,
"step": 480
},
{
"epoch": 1.4261931187569368,
"grad_norm": 0.5056073069572449,
"learning_rate": 1.2579843651319382e-05,
"loss": 0.7331,
"mean_token_accuracy": 0.7724445151223609,
"step": 481
},
{
"epoch": 1.4291527931927488,
"grad_norm": 0.5267237424850464,
"learning_rate": 1.2546437255314223e-05,
"loss": 0.6659,
"mean_token_accuracy": 0.7943264511441203,
"step": 482
},
{
"epoch": 1.4321124676285608,
"grad_norm": 0.4923066794872284,
"learning_rate": 1.2513000443139112e-05,
"loss": 0.693,
"mean_token_accuracy": 0.7847285183122921,
"step": 483
},
{
"epoch": 1.435072142064373,
"grad_norm": 0.4452427327632904,
"learning_rate": 1.2479533614183334e-05,
"loss": 0.6783,
"mean_token_accuracy": 0.790767397651227,
"step": 484
},
{
"epoch": 1.4380318165001849,
"grad_norm": 0.4807162582874298,
"learning_rate": 1.2446037168194716e-05,
"loss": 0.6951,
"mean_token_accuracy": 0.7842417519133703,
"step": 485
},
{
"epoch": 1.440991490935997,
"grad_norm": 0.4858757257461548,
"learning_rate": 1.2412511505274845e-05,
"loss": 0.6602,
"mean_token_accuracy": 0.7962518182176112,
"step": 486
},
{
"epoch": 1.4439511653718091,
"grad_norm": 0.4663830101490021,
"learning_rate": 1.23789570258743e-05,
"loss": 0.6951,
"mean_token_accuracy": 0.7839527031401198,
"step": 487
},
{
"epoch": 1.4469108398076211,
"grad_norm": 0.4759344160556793,
"learning_rate": 1.2345374130787855e-05,
"loss": 0.6925,
"mean_token_accuracy": 0.7861855601757001,
"step": 488
},
{
"epoch": 1.4498705142434332,
"grad_norm": 0.44426658749580383,
"learning_rate": 1.23117632211497e-05,
"loss": 0.6561,
"mean_token_accuracy": 0.7964251152169285,
"step": 489
},
{
"epoch": 1.4528301886792452,
"grad_norm": 0.4644084870815277,
"learning_rate": 1.2278124698428643e-05,
"loss": 0.6848,
"mean_token_accuracy": 0.7871725512533235,
"step": 490
},
{
"epoch": 1.4557898631150574,
"grad_norm": 0.43534740805625916,
"learning_rate": 1.2244458964423328e-05,
"loss": 0.6952,
"mean_token_accuracy": 0.7838933476240588,
"step": 491
},
{
"epoch": 1.4587495375508694,
"grad_norm": 0.4578785300254822,
"learning_rate": 1.221076642125742e-05,
"loss": 0.6912,
"mean_token_accuracy": 0.7867050710099383,
"step": 492
},
{
"epoch": 1.4617092119866815,
"grad_norm": 0.46426481008529663,
"learning_rate": 1.2177047471374808e-05,
"loss": 0.6679,
"mean_token_accuracy": 0.793821778161506,
"step": 493
},
{
"epoch": 1.4646688864224935,
"grad_norm": 0.4668942391872406,
"learning_rate": 1.214330251753481e-05,
"loss": 0.6788,
"mean_token_accuracy": 0.7911113494359255,
"step": 494
},
{
"epoch": 1.4676285608583055,
"grad_norm": 0.4524623155593872,
"learning_rate": 1.2109531962807333e-05,
"loss": 0.657,
"mean_token_accuracy": 0.7968866396266425,
"step": 495
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.4540092945098877,
"learning_rate": 1.207573621056809e-05,
"loss": 0.6779,
"mean_token_accuracy": 0.79133374474269,
"step": 496
},
{
"epoch": 1.4735479097299298,
"grad_norm": 0.4721427857875824,
"learning_rate": 1.2041915664493763e-05,
"loss": 0.7114,
"mean_token_accuracy": 0.7811596412077128,
"step": 497
},
{
"epoch": 1.4765075841657418,
"grad_norm": 0.45745474100112915,
"learning_rate": 1.2008070728557186e-05,
"loss": 0.6946,
"mean_token_accuracy": 0.7835979713892247,
"step": 498
},
{
"epoch": 1.4794672586015538,
"grad_norm": 0.45184969902038574,
"learning_rate": 1.1974201807022525e-05,
"loss": 0.6594,
"mean_token_accuracy": 0.7954918143409643,
"step": 499
},
{
"epoch": 1.4824269330373658,
"grad_norm": 0.43299737572669983,
"learning_rate": 1.1940309304440434e-05,
"loss": 0.655,
"mean_token_accuracy": 0.7961995893943149,
"step": 500
},
{
"epoch": 1.4824269330373658,
"eval_loss": 0.7452248930931091,
"eval_mean_token_accuracy": 0.7696687843740262,
"eval_runtime": 24.4738,
"eval_samples_per_second": 5.271,
"eval_steps_per_second": 1.348,
"step": 500
},
{
"epoch": 1.485386607473178,
"grad_norm": 0.4329541325569153,
"learning_rate": 1.1906393625643244e-05,
"loss": 0.6908,
"mean_token_accuracy": 0.787461052002391,
"step": 501
},
{
"epoch": 1.4883462819089899,
"grad_norm": 0.44818833470344543,
"learning_rate": 1.1872455175740111e-05,
"loss": 0.7038,
"mean_token_accuracy": 0.7827824467497245,
"step": 502
},
{
"epoch": 1.4913059563448021,
"grad_norm": 0.4627722501754761,
"learning_rate": 1.1838494360112185e-05,
"loss": 0.6831,
"mean_token_accuracy": 0.7892276650561758,
"step": 503
},
{
"epoch": 1.4942656307806141,
"grad_norm": 0.43506646156311035,
"learning_rate": 1.1804511584407763e-05,
"loss": 0.6469,
"mean_token_accuracy": 0.7984073599583249,
"step": 504
},
{
"epoch": 1.4972253052164262,
"grad_norm": 0.4514705538749695,
"learning_rate": 1.1770507254537454e-05,
"loss": 0.6567,
"mean_token_accuracy": 0.797555451493693,
"step": 505
},
{
"epoch": 1.5001849796522384,
"grad_norm": 0.4718611538410187,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.6666,
"mean_token_accuracy": 0.7937825386926253,
"step": 506
},
{
"epoch": 1.5031446540880502,
"grad_norm": 0.4598422646522522,
"learning_rate": 1.1702435557223988e-05,
"loss": 0.7341,
"mean_token_accuracy": 0.7725688345230695,
"step": 507
},
{
"epoch": 1.5061043285238624,
"grad_norm": 0.4759341776371002,
"learning_rate": 1.1668369002869912e-05,
"loss": 0.696,
"mean_token_accuracy": 0.7833280751891905,
"step": 508
},
{
"epoch": 1.5090640029596745,
"grad_norm": 0.4857986867427826,
"learning_rate": 1.1634282520518382e-05,
"loss": 0.6843,
"mean_token_accuracy": 0.7878627921931918,
"step": 509
},
{
"epoch": 1.5120236773954865,
"grad_norm": 0.4445328414440155,
"learning_rate": 1.1600176517318742e-05,
"loss": 0.7016,
"mean_token_accuracy": 0.7835290374274105,
"step": 510
},
{
"epoch": 1.5149833518312985,
"grad_norm": 0.4201406240463257,
"learning_rate": 1.1566051400653486e-05,
"loss": 0.6892,
"mean_token_accuracy": 0.7880382009320334,
"step": 511
},
{
"epoch": 1.5179430262671105,
"grad_norm": 0.4451057016849518,
"learning_rate": 1.153190757813343e-05,
"loss": 0.6661,
"mean_token_accuracy": 0.7936041312626415,
"step": 512
},
{
"epoch": 1.5209027007029228,
"grad_norm": 0.45407670736312866,
"learning_rate": 1.1497745457592817e-05,
"loss": 0.6938,
"mean_token_accuracy": 0.7862274252159144,
"step": 513
},
{
"epoch": 1.5238623751387348,
"grad_norm": 0.48065322637557983,
"learning_rate": 1.1463565447084446e-05,
"loss": 0.6711,
"mean_token_accuracy": 0.7922199519518627,
"step": 514
},
{
"epoch": 1.5268220495745468,
"grad_norm": 0.4554750323295593,
"learning_rate": 1.142936795487482e-05,
"loss": 0.7031,
"mean_token_accuracy": 0.7841927897620309,
"step": 515
},
{
"epoch": 1.5297817240103588,
"grad_norm": 0.47003987431526184,
"learning_rate": 1.1395153389439232e-05,
"loss": 0.6801,
"mean_token_accuracy": 0.7887132537245702,
"step": 516
},
{
"epoch": 1.5327413984461709,
"grad_norm": 0.49194058775901794,
"learning_rate": 1.1360922159456929e-05,
"loss": 0.6516,
"mean_token_accuracy": 0.7972093170337653,
"step": 517
},
{
"epoch": 1.535701072881983,
"grad_norm": 0.4363403618335724,
"learning_rate": 1.1326674673806195e-05,
"loss": 0.6454,
"mean_token_accuracy": 0.7994255155742641,
"step": 518
},
{
"epoch": 1.538660747317795,
"grad_norm": 0.4633619487285614,
"learning_rate": 1.129241134155949e-05,
"loss": 0.7226,
"mean_token_accuracy": 0.7772127285568272,
"step": 519
},
{
"epoch": 1.5416204217536071,
"grad_norm": 0.505766749382019,
"learning_rate": 1.1258132571978555e-05,
"loss": 0.6866,
"mean_token_accuracy": 0.7866910068023953,
"step": 520
},
{
"epoch": 1.5445800961894192,
"grad_norm": 0.4622265696525574,
"learning_rate": 1.1223838774509515e-05,
"loss": 0.6794,
"mean_token_accuracy": 0.7894488197184882,
"step": 521
},
{
"epoch": 1.5475397706252312,
"grad_norm": 0.46530911326408386,
"learning_rate": 1.1189530358778005e-05,
"loss": 0.6714,
"mean_token_accuracy": 0.7917336528774738,
"step": 522
},
{
"epoch": 1.5504994450610434,
"grad_norm": 0.48770585656166077,
"learning_rate": 1.1155207734584264e-05,
"loss": 0.655,
"mean_token_accuracy": 0.7967736177779107,
"step": 523
},
{
"epoch": 1.5534591194968552,
"grad_norm": 0.4736506938934326,
"learning_rate": 1.1120871311898254e-05,
"loss": 0.6626,
"mean_token_accuracy": 0.7948987812297952,
"step": 524
},
{
"epoch": 1.5564187939326675,
"grad_norm": 0.4388614594936371,
"learning_rate": 1.1086521500854746e-05,
"loss": 0.6743,
"mean_token_accuracy": 0.7901189868530583,
"step": 525
},
{
"epoch": 1.5593784683684795,
"grad_norm": 0.42465701699256897,
"learning_rate": 1.1052158711748435e-05,
"loss": 0.6424,
"mean_token_accuracy": 0.8002322656672612,
"step": 526
},
{
"epoch": 1.5623381428042915,
"grad_norm": 0.444394052028656,
"learning_rate": 1.1017783355029027e-05,
"loss": 0.6968,
"mean_token_accuracy": 0.7853953263510778,
"step": 527
},
{
"epoch": 1.5652978172401038,
"grad_norm": 0.4599439799785614,
"learning_rate": 1.0983395841296349e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.783582448885906,
"step": 528
},
{
"epoch": 1.5682574916759155,
"grad_norm": 0.4538317918777466,
"learning_rate": 1.0948996581295437e-05,
"loss": 0.6708,
"mean_token_accuracy": 0.7920199562156756,
"step": 529
},
{
"epoch": 1.5712171661117278,
"grad_norm": 0.5204719305038452,
"learning_rate": 1.0914585985911632e-05,
"loss": 0.7194,
"mean_token_accuracy": 0.7800594247957305,
"step": 530
},
{
"epoch": 1.5741768405475398,
"grad_norm": 0.4342687129974365,
"learning_rate": 1.0880164466165675e-05,
"loss": 0.6803,
"mean_token_accuracy": 0.7888814345475649,
"step": 531
},
{
"epoch": 1.5771365149833518,
"grad_norm": 0.47061675786972046,
"learning_rate": 1.084573243320878e-05,
"loss": 0.6997,
"mean_token_accuracy": 0.7845215145727062,
"step": 532
},
{
"epoch": 1.5800961894191639,
"grad_norm": 0.48753833770751953,
"learning_rate": 1.0811290298317755e-05,
"loss": 0.6963,
"mean_token_accuracy": 0.7853895351084046,
"step": 533
},
{
"epoch": 1.5830558638549759,
"grad_norm": 0.4486468732357025,
"learning_rate": 1.0776838472890065e-05,
"loss": 0.6616,
"mean_token_accuracy": 0.7946923291350155,
"step": 534
},
{
"epoch": 1.5860155382907881,
"grad_norm": 0.46315282583236694,
"learning_rate": 1.0742377368438915e-05,
"loss": 0.6653,
"mean_token_accuracy": 0.7937742045003314,
"step": 535
},
{
"epoch": 1.5889752127266,
"grad_norm": 0.43467020988464355,
"learning_rate": 1.0707907396588362e-05,
"loss": 0.675,
"mean_token_accuracy": 0.7911407237837417,
"step": 536
},
{
"epoch": 1.5919348871624122,
"grad_norm": 0.47853776812553406,
"learning_rate": 1.0673428969068365e-05,
"loss": 0.6694,
"mean_token_accuracy": 0.7934067804791232,
"step": 537
},
{
"epoch": 1.5948945615982242,
"grad_norm": 0.4569770395755768,
"learning_rate": 1.063894249770989e-05,
"loss": 0.7149,
"mean_token_accuracy": 0.7789215590955586,
"step": 538
},
{
"epoch": 1.5978542360340362,
"grad_norm": 0.48249223828315735,
"learning_rate": 1.0604448394439983e-05,
"loss": 0.6881,
"mean_token_accuracy": 0.7885556262821241,
"step": 539
},
{
"epoch": 1.6008139104698484,
"grad_norm": 0.44117307662963867,
"learning_rate": 1.0569947071276847e-05,
"loss": 0.6773,
"mean_token_accuracy": 0.7905948947059994,
"step": 540
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.4791225492954254,
"learning_rate": 1.053543894032493e-05,
"loss": 0.6486,
"mean_token_accuracy": 0.7984527785084713,
"step": 541
},
{
"epoch": 1.6067332593414725,
"grad_norm": 0.4592903256416321,
"learning_rate": 1.0500924413769988e-05,
"loss": 0.7029,
"mean_token_accuracy": 0.7816764343124575,
"step": 542
},
{
"epoch": 1.6096929337772845,
"grad_norm": 0.4603089988231659,
"learning_rate": 1.0466403903874176e-05,
"loss": 0.6692,
"mean_token_accuracy": 0.7920168861161754,
"step": 543
},
{
"epoch": 1.6126526082130965,
"grad_norm": 0.4877552092075348,
"learning_rate": 1.0431877822971118e-05,
"loss": 0.7264,
"mean_token_accuracy": 0.7763762310950634,
"step": 544
},
{
"epoch": 1.6156122826489088,
"grad_norm": 0.4495700001716614,
"learning_rate": 1.0397346583460972e-05,
"loss": 0.6748,
"mean_token_accuracy": 0.790038916470125,
"step": 545
},
{
"epoch": 1.6185719570847206,
"grad_norm": 0.4363431930541992,
"learning_rate": 1.0362810597805526e-05,
"loss": 0.7176,
"mean_token_accuracy": 0.7804455873720191,
"step": 546
},
{
"epoch": 1.6215316315205328,
"grad_norm": 0.4593956470489502,
"learning_rate": 1.0328270278523256e-05,
"loss": 0.692,
"mean_token_accuracy": 0.7868243000254014,
"step": 547
},
{
"epoch": 1.6244913059563448,
"grad_norm": 0.4650803506374359,
"learning_rate": 1.0293726038184393e-05,
"loss": 0.6667,
"mean_token_accuracy": 0.7932379645110449,
"step": 548
},
{
"epoch": 1.6274509803921569,
"grad_norm": 0.4343462288379669,
"learning_rate": 1.0259178289406011e-05,
"loss": 0.6828,
"mean_token_accuracy": 0.7873501273107357,
"step": 549
},
{
"epoch": 1.6304106548279689,
"grad_norm": 0.485445499420166,
"learning_rate": 1.022462744484709e-05,
"loss": 0.6757,
"mean_token_accuracy": 0.790149107536362,
"step": 550
},
{
"epoch": 1.633370329263781,
"grad_norm": 0.4408370852470398,
"learning_rate": 1.019007391720359e-05,
"loss": 0.6423,
"mean_token_accuracy": 0.8007969798780114,
"step": 551
},
{
"epoch": 1.6363300036995931,
"grad_norm": 0.48014140129089355,
"learning_rate": 1.0155518119203511e-05,
"loss": 0.6485,
"mean_token_accuracy": 0.798990145407414,
"step": 552
},
{
"epoch": 1.6392896781354052,
"grad_norm": 0.43950581550598145,
"learning_rate": 1.0120960463601977e-05,
"loss": 0.6884,
"mean_token_accuracy": 0.7868133995463237,
"step": 553
},
{
"epoch": 1.6422493525712172,
"grad_norm": 0.4777732789516449,
"learning_rate": 1.0086401363176306e-05,
"loss": 0.7016,
"mean_token_accuracy": 0.7829182684226537,
"step": 554
},
{
"epoch": 1.6452090270070292,
"grad_norm": 0.4738129675388336,
"learning_rate": 1.0051841230721065e-05,
"loss": 0.7025,
"mean_token_accuracy": 0.7833107058164892,
"step": 555
},
{
"epoch": 1.6481687014428412,
"grad_norm": 0.49576374888420105,
"learning_rate": 1.0017280479043148e-05,
"loss": 0.6832,
"mean_token_accuracy": 0.7878164823186655,
"step": 556
},
{
"epoch": 1.6511283758786535,
"grad_norm": 0.4482108950614929,
"learning_rate": 9.982719520956856e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.7859008840989987,
"step": 557
},
{
"epoch": 1.6540880503144653,
"grad_norm": 0.4530676603317261,
"learning_rate": 9.948158769278939e-06,
"loss": 0.6496,
"mean_token_accuracy": 0.7975575400059007,
"step": 558
},
{
"epoch": 1.6570477247502775,
"grad_norm": 0.4506595730781555,
"learning_rate": 9.913598636823694e-06,
"loss": 0.6711,
"mean_token_accuracy": 0.7920525949216152,
"step": 559
},
{
"epoch": 1.6600073991860895,
"grad_norm": 0.492118775844574,
"learning_rate": 9.879039536398023e-06,
"loss": 0.6663,
"mean_token_accuracy": 0.7926239117866946,
"step": 560
},
{
"epoch": 1.6629670736219015,
"grad_norm": 0.4334714114665985,
"learning_rate": 9.844481880796492e-06,
"loss": 0.6685,
"mean_token_accuracy": 0.7934195520197277,
"step": 561
},
{
"epoch": 1.6659267480577138,
"grad_norm": 0.43926241993904114,
"learning_rate": 9.809926082796415e-06,
"loss": 0.668,
"mean_token_accuracy": 0.7921636930110467,
"step": 562
},
{
"epoch": 1.6688864224935256,
"grad_norm": 0.46381375193595886,
"learning_rate": 9.775372555152912e-06,
"loss": 0.7106,
"mean_token_accuracy": 0.7814721603110977,
"step": 563
},
{
"epoch": 1.6718460969293378,
"grad_norm": 0.4584568738937378,
"learning_rate": 9.740821710593989e-06,
"loss": 0.6723,
"mean_token_accuracy": 0.7927753026753256,
"step": 564
},
{
"epoch": 1.6748057713651499,
"grad_norm": 0.46233710646629333,
"learning_rate": 9.70627396181561e-06,
"loss": 0.6979,
"mean_token_accuracy": 0.7847842845307743,
"step": 565
},
{
"epoch": 1.6777654458009619,
"grad_norm": 0.4692407548427582,
"learning_rate": 9.671729721476747e-06,
"loss": 0.6779,
"mean_token_accuracy": 0.7904914247805244,
"step": 566
},
{
"epoch": 1.6807251202367741,
"grad_norm": 0.45148906111717224,
"learning_rate": 9.637189402194477e-06,
"loss": 0.6636,
"mean_token_accuracy": 0.794561469534099,
"step": 567
},
{
"epoch": 1.683684794672586,
"grad_norm": 0.4668971002101898,
"learning_rate": 9.602653416539031e-06,
"loss": 0.6562,
"mean_token_accuracy": 0.7957992597890263,
"step": 568
},
{
"epoch": 1.6866444691083982,
"grad_norm": 0.4657999575138092,
"learning_rate": 9.568122177028884e-06,
"loss": 0.6793,
"mean_token_accuracy": 0.7895593260251141,
"step": 569
},
{
"epoch": 1.6896041435442102,
"grad_norm": 0.45058828592300415,
"learning_rate": 9.533596096125826e-06,
"loss": 0.6982,
"mean_token_accuracy": 0.7837857085184711,
"step": 570
},
{
"epoch": 1.6925638179800222,
"grad_norm": 0.5159661769866943,
"learning_rate": 9.499075586230014e-06,
"loss": 0.7278,
"mean_token_accuracy": 0.7758815945577252,
"step": 571
},
{
"epoch": 1.6955234924158342,
"grad_norm": 0.4985567629337311,
"learning_rate": 9.464561059675073e-06,
"loss": 0.6815,
"mean_token_accuracy": 0.789947097130735,
"step": 572
},
{
"epoch": 1.6984831668516462,
"grad_norm": 0.4985766112804413,
"learning_rate": 9.430052928723153e-06,
"loss": 0.6689,
"mean_token_accuracy": 0.7914537628745669,
"step": 573
},
{
"epoch": 1.7014428412874585,
"grad_norm": 0.44924196600914,
"learning_rate": 9.395551605560018e-06,
"loss": 0.654,
"mean_token_accuracy": 0.7949039622131476,
"step": 574
},
{
"epoch": 1.7044025157232703,
"grad_norm": 0.4486066997051239,
"learning_rate": 9.361057502290112e-06,
"loss": 0.6689,
"mean_token_accuracy": 0.7932129938757272,
"step": 575
},
{
"epoch": 1.7073621901590825,
"grad_norm": 0.5298429131507874,
"learning_rate": 9.326571030931636e-06,
"loss": 0.6797,
"mean_token_accuracy": 0.7899495064143103,
"step": 576
},
{
"epoch": 1.7103218645948945,
"grad_norm": 0.4834374189376831,
"learning_rate": 9.292092603411642e-06,
"loss": 0.6856,
"mean_token_accuracy": 0.7874172906006217,
"step": 577
},
{
"epoch": 1.7132815390307066,
"grad_norm": 0.4545672833919525,
"learning_rate": 9.257622631561085e-06,
"loss": 0.6793,
"mean_token_accuracy": 0.7896850742245419,
"step": 578
},
{
"epoch": 1.7162412134665188,
"grad_norm": 0.49482157826423645,
"learning_rate": 9.223161527109938e-06,
"loss": 0.7249,
"mean_token_accuracy": 0.7754079872839525,
"step": 579
},
{
"epoch": 1.7192008879023306,
"grad_norm": 0.47407853603363037,
"learning_rate": 9.188709701682246e-06,
"loss": 0.6793,
"mean_token_accuracy": 0.7890331281672109,
"step": 580
},
{
"epoch": 1.7221605623381429,
"grad_norm": 0.496600478887558,
"learning_rate": 9.154267566791224e-06,
"loss": 0.6745,
"mean_token_accuracy": 0.7916224036955456,
"step": 581
},
{
"epoch": 1.7251202367739549,
"grad_norm": 0.447837233543396,
"learning_rate": 9.119835533834332e-06,
"loss": 0.6443,
"mean_token_accuracy": 0.8001154358817507,
"step": 582
},
{
"epoch": 1.728079911209767,
"grad_norm": 0.4290511906147003,
"learning_rate": 9.085414014088368e-06,
"loss": 0.7033,
"mean_token_accuracy": 0.7838360657347012,
"step": 583
},
{
"epoch": 1.7310395856455791,
"grad_norm": 0.4949333369731903,
"learning_rate": 9.051003418704566e-06,
"loss": 0.6797,
"mean_token_accuracy": 0.7891070494649397,
"step": 584
},
{
"epoch": 1.733999260081391,
"grad_norm": 0.47587254643440247,
"learning_rate": 9.016604158703654e-06,
"loss": 0.6047,
"mean_token_accuracy": 0.8115938183485798,
"step": 585
},
{
"epoch": 1.7369589345172032,
"grad_norm": 0.4586060643196106,
"learning_rate": 8.982216644970978e-06,
"loss": 0.7073,
"mean_token_accuracy": 0.7814491686139491,
"step": 586
},
{
"epoch": 1.7399186089530152,
"grad_norm": 0.4535180628299713,
"learning_rate": 8.947841288251568e-06,
"loss": 0.6773,
"mean_token_accuracy": 0.7899806831449463,
"step": 587
},
{
"epoch": 1.7428782833888272,
"grad_norm": 0.4698368012905121,
"learning_rate": 8.913478499145255e-06,
"loss": 0.6992,
"mean_token_accuracy": 0.7847534234645677,
"step": 588
},
{
"epoch": 1.7458379578246392,
"grad_norm": 0.4965501129627228,
"learning_rate": 8.879128688101749e-06,
"loss": 0.73,
"mean_token_accuracy": 0.7749135427241792,
"step": 589
},
{
"epoch": 1.7487976322604513,
"grad_norm": 0.42426785826683044,
"learning_rate": 8.844792265415738e-06,
"loss": 0.6691,
"mean_token_accuracy": 0.7934521695906798,
"step": 590
},
{
"epoch": 1.7517573066962635,
"grad_norm": 0.4164229929447174,
"learning_rate": 8.810469641222001e-06,
"loss": 0.6792,
"mean_token_accuracy": 0.7893430794759394,
"step": 591
},
{
"epoch": 1.7547169811320755,
"grad_norm": 0.4406238794326782,
"learning_rate": 8.776161225490488e-06,
"loss": 0.6774,
"mean_token_accuracy": 0.7888743256018739,
"step": 592
},
{
"epoch": 1.7576766555678875,
"grad_norm": 0.4821741282939911,
"learning_rate": 8.741867428021447e-06,
"loss": 0.7028,
"mean_token_accuracy": 0.782003973548151,
"step": 593
},
{
"epoch": 1.7606363300036996,
"grad_norm": 0.41678085923194885,
"learning_rate": 8.707588658440511e-06,
"loss": 0.6673,
"mean_token_accuracy": 0.792060705641046,
"step": 594
},
{
"epoch": 1.7635960044395116,
"grad_norm": 0.4335281252861023,
"learning_rate": 8.673325326193806e-06,
"loss": 0.6799,
"mean_token_accuracy": 0.7913004243427386,
"step": 595
},
{
"epoch": 1.7665556788753238,
"grad_norm": 0.46854230761528015,
"learning_rate": 8.639077840543078e-06,
"loss": 0.6939,
"mean_token_accuracy": 0.784777034055922,
"step": 596
},
{
"epoch": 1.7695153533111356,
"grad_norm": 0.4286266267299652,
"learning_rate": 8.604846610560771e-06,
"loss": 0.682,
"mean_token_accuracy": 0.7879420465198175,
"step": 597
},
{
"epoch": 1.7724750277469479,
"grad_norm": 0.4346145689487457,
"learning_rate": 8.570632045125185e-06,
"loss": 0.6722,
"mean_token_accuracy": 0.7908459643173444,
"step": 598
},
{
"epoch": 1.77543470218276,
"grad_norm": 0.47212105989456177,
"learning_rate": 8.536434552915555e-06,
"loss": 0.6758,
"mean_token_accuracy": 0.7914862427903648,
"step": 599
},
{
"epoch": 1.778394376618572,
"grad_norm": 0.45980679988861084,
"learning_rate": 8.502254542407186e-06,
"loss": 0.6988,
"mean_token_accuracy": 0.7817833351753944,
"step": 600
},
{
"epoch": 1.778394376618572,
"eval_loss": 0.739486575126648,
"eval_mean_token_accuracy": 0.7714524950010826,
"eval_runtime": 24.4731,
"eval_samples_per_second": 5.271,
"eval_steps_per_second": 1.348,
"step": 600
},
{
"epoch": 1.7813540510543842,
"grad_norm": 0.4924312233924866,
"learning_rate": 8.468092421866575e-06,
"loss": 0.6954,
"mean_token_accuracy": 0.7859722749641744,
"step": 601
},
{
"epoch": 1.784313725490196,
"grad_norm": 0.4518575966358185,
"learning_rate": 8.433948599346516e-06,
"loss": 0.6719,
"mean_token_accuracy": 0.7915203880270405,
"step": 602
},
{
"epoch": 1.7872733999260082,
"grad_norm": 0.41159677505493164,
"learning_rate": 8.399823482681263e-06,
"loss": 0.6654,
"mean_token_accuracy": 0.7925289623050378,
"step": 603
},
{
"epoch": 1.7902330743618202,
"grad_norm": 0.4749601483345032,
"learning_rate": 8.36571747948162e-06,
"loss": 0.651,
"mean_token_accuracy": 0.7971818401347246,
"step": 604
},
{
"epoch": 1.7931927487976322,
"grad_norm": 0.4616299271583557,
"learning_rate": 8.331630997130091e-06,
"loss": 0.6387,
"mean_token_accuracy": 0.801418446439762,
"step": 605
},
{
"epoch": 1.7961524232334445,
"grad_norm": 0.4717465341091156,
"learning_rate": 8.297564442776014e-06,
"loss": 0.7002,
"mean_token_accuracy": 0.7815816907542203,
"step": 606
},
{
"epoch": 1.7991120976692563,
"grad_norm": 0.45160382986068726,
"learning_rate": 8.263518223330698e-06,
"loss": 0.6656,
"mean_token_accuracy": 0.7934779098419342,
"step": 607
},
{
"epoch": 1.8020717721050685,
"grad_norm": 0.5111809372901917,
"learning_rate": 8.229492745462551e-06,
"loss": 0.6734,
"mean_token_accuracy": 0.7910897600390507,
"step": 608
},
{
"epoch": 1.8050314465408805,
"grad_norm": 0.4546574652194977,
"learning_rate": 8.195488415592238e-06,
"loss": 0.6832,
"mean_token_accuracy": 0.7884104161267849,
"step": 609
},
{
"epoch": 1.8079911209766926,
"grad_norm": 0.48915475606918335,
"learning_rate": 8.161505639887818e-06,
"loss": 0.6865,
"mean_token_accuracy": 0.7868375134510748,
"step": 610
},
{
"epoch": 1.8109507954125046,
"grad_norm": 0.45673686265945435,
"learning_rate": 8.12754482425989e-06,
"loss": 0.6531,
"mean_token_accuracy": 0.7978940928567595,
"step": 611
},
{
"epoch": 1.8139104698483166,
"grad_norm": 0.46595895290374756,
"learning_rate": 8.09360637435676e-06,
"loss": 0.6763,
"mean_token_accuracy": 0.7900004127541489,
"step": 612
},
{
"epoch": 1.8168701442841289,
"grad_norm": 0.4639073312282562,
"learning_rate": 8.05969069555957e-06,
"loss": 0.7068,
"mean_token_accuracy": 0.7818326911340046,
"step": 613
},
{
"epoch": 1.8198298187199407,
"grad_norm": 0.48692357540130615,
"learning_rate": 8.025798192977482e-06,
"loss": 0.6724,
"mean_token_accuracy": 0.7902419271935022,
"step": 614
},
{
"epoch": 1.822789493155753,
"grad_norm": 0.4192976653575897,
"learning_rate": 7.991929271442817e-06,
"loss": 0.694,
"mean_token_accuracy": 0.7842421058017395,
"step": 615
},
{
"epoch": 1.825749167591565,
"grad_norm": 0.4323351979255676,
"learning_rate": 7.958084335506239e-06,
"loss": 0.6633,
"mean_token_accuracy": 0.7939339540476142,
"step": 616
},
{
"epoch": 1.828708842027377,
"grad_norm": 0.5116434097290039,
"learning_rate": 7.924263789431913e-06,
"loss": 0.7177,
"mean_token_accuracy": 0.7774093907152634,
"step": 617
},
{
"epoch": 1.8316685164631892,
"grad_norm": 0.47101178765296936,
"learning_rate": 7.89046803719267e-06,
"loss": 0.6311,
"mean_token_accuracy": 0.8026902561156782,
"step": 618
},
{
"epoch": 1.834628190899001,
"grad_norm": 0.4334461987018585,
"learning_rate": 7.856697482465195e-06,
"loss": 0.7056,
"mean_token_accuracy": 0.7813049117276861,
"step": 619
},
{
"epoch": 1.8375878653348132,
"grad_norm": 0.44044068455696106,
"learning_rate": 7.822952528625192e-06,
"loss": 0.6706,
"mean_token_accuracy": 0.7911052218155908,
"step": 620
},
{
"epoch": 1.8405475397706252,
"grad_norm": 0.43130719661712646,
"learning_rate": 7.789233578742583e-06,
"loss": 0.6868,
"mean_token_accuracy": 0.7883987501012448,
"step": 621
},
{
"epoch": 1.8435072142064373,
"grad_norm": 0.464912086725235,
"learning_rate": 7.755541035576677e-06,
"loss": 0.6966,
"mean_token_accuracy": 0.784260520058606,
"step": 622
},
{
"epoch": 1.8464668886422495,
"grad_norm": 0.47342586517333984,
"learning_rate": 7.721875301571359e-06,
"loss": 0.6862,
"mean_token_accuracy": 0.7896494653236235,
"step": 623
},
{
"epoch": 1.8494265630780613,
"grad_norm": 0.4514820873737335,
"learning_rate": 7.688236778850307e-06,
"loss": 0.6702,
"mean_token_accuracy": 0.7906542847766748,
"step": 624
},
{
"epoch": 1.8523862375138735,
"grad_norm": 0.4382912218570709,
"learning_rate": 7.654625869212147e-06,
"loss": 0.6519,
"mean_token_accuracy": 0.7971948655223885,
"step": 625
},
{
"epoch": 1.8553459119496856,
"grad_norm": 0.4642338156700134,
"learning_rate": 7.621042974125701e-06,
"loss": 0.7042,
"mean_token_accuracy": 0.7810776086801536,
"step": 626
},
{
"epoch": 1.8583055863854976,
"grad_norm": 0.43844854831695557,
"learning_rate": 7.587488494725157e-06,
"loss": 0.7134,
"mean_token_accuracy": 0.7782961208172144,
"step": 627
},
{
"epoch": 1.8612652608213096,
"grad_norm": 0.44983789324760437,
"learning_rate": 7.553962831805291e-06,
"loss": 0.6928,
"mean_token_accuracy": 0.7847411304512161,
"step": 628
},
{
"epoch": 1.8642249352571216,
"grad_norm": 0.464546799659729,
"learning_rate": 7.520466385816672e-06,
"loss": 0.6848,
"mean_token_accuracy": 0.7877457152823937,
"step": 629
},
{
"epoch": 1.8671846096929339,
"grad_norm": 0.4500563442707062,
"learning_rate": 7.48699955686089e-06,
"loss": 0.7043,
"mean_token_accuracy": 0.7810525873603867,
"step": 630
},
{
"epoch": 1.870144284128746,
"grad_norm": 0.4776234030723572,
"learning_rate": 7.453562744685779e-06,
"loss": 0.6491,
"mean_token_accuracy": 0.7980624835148542,
"step": 631
},
{
"epoch": 1.873103958564558,
"grad_norm": 0.42935752868652344,
"learning_rate": 7.420156348680621e-06,
"loss": 0.7015,
"mean_token_accuracy": 0.7841032229720888,
"step": 632
},
{
"epoch": 1.87606363300037,
"grad_norm": 0.45095863938331604,
"learning_rate": 7.3867807678713965e-06,
"loss": 0.6695,
"mean_token_accuracy": 0.792214351462561,
"step": 633
},
{
"epoch": 1.879023307436182,
"grad_norm": 0.4426802694797516,
"learning_rate": 7.353436400916006e-06,
"loss": 0.7231,
"mean_token_accuracy": 0.7759461148659138,
"step": 634
},
{
"epoch": 1.8819829818719942,
"grad_norm": 0.4576883316040039,
"learning_rate": 7.32012364609952e-06,
"loss": 0.6891,
"mean_token_accuracy": 0.787467563016268,
"step": 635
},
{
"epoch": 1.884942656307806,
"grad_norm": 0.47537630796432495,
"learning_rate": 7.286842901329413e-06,
"loss": 0.6737,
"mean_token_accuracy": 0.7898305381622779,
"step": 636
},
{
"epoch": 1.8879023307436182,
"grad_norm": 0.47071340680122375,
"learning_rate": 7.253594564130804e-06,
"loss": 0.6314,
"mean_token_accuracy": 0.8024436031530167,
"step": 637
},
{
"epoch": 1.8908620051794303,
"grad_norm": 0.42745083570480347,
"learning_rate": 7.22037903164173e-06,
"loss": 0.6648,
"mean_token_accuracy": 0.7939708774130151,
"step": 638
},
{
"epoch": 1.8938216796152423,
"grad_norm": 0.45386022329330444,
"learning_rate": 7.187196700608373e-06,
"loss": 0.7055,
"mean_token_accuracy": 0.7818898164861657,
"step": 639
},
{
"epoch": 1.8967813540510545,
"grad_norm": 0.5082824230194092,
"learning_rate": 7.154047967380353e-06,
"loss": 0.6797,
"mean_token_accuracy": 0.7885593754013774,
"step": 640
},
{
"epoch": 1.8997410284868663,
"grad_norm": 0.42250484228134155,
"learning_rate": 7.120933227905971e-06,
"loss": 0.6822,
"mean_token_accuracy": 0.7885130074722346,
"step": 641
},
{
"epoch": 1.9027007029226786,
"grad_norm": 0.45145198702812195,
"learning_rate": 7.0878528777274814e-06,
"loss": 0.7101,
"mean_token_accuracy": 0.7797410127092863,
"step": 642
},
{
"epoch": 1.9056603773584906,
"grad_norm": 0.4663936495780945,
"learning_rate": 7.05480731197638e-06,
"loss": 0.6638,
"mean_token_accuracy": 0.7952863824537968,
"step": 643
},
{
"epoch": 1.9086200517943026,
"grad_norm": 0.4832487106323242,
"learning_rate": 7.021796925368667e-06,
"loss": 0.6901,
"mean_token_accuracy": 0.7859927796689913,
"step": 644
},
{
"epoch": 1.9115797262301149,
"grad_norm": 0.4796106219291687,
"learning_rate": 6.988822112200157e-06,
"loss": 0.699,
"mean_token_accuracy": 0.7833292076466405,
"step": 645
},
{
"epoch": 1.9145394006659266,
"grad_norm": 0.4601701498031616,
"learning_rate": 6.955883266341741e-06,
"loss": 0.6911,
"mean_token_accuracy": 0.7855269916498042,
"step": 646
},
{
"epoch": 1.917499075101739,
"grad_norm": 0.4631184935569763,
"learning_rate": 6.9229807812346985e-06,
"loss": 0.6938,
"mean_token_accuracy": 0.7854820719902068,
"step": 647
},
{
"epoch": 1.920458749537551,
"grad_norm": 0.46688076853752136,
"learning_rate": 6.890115049885995e-06,
"loss": 0.6873,
"mean_token_accuracy": 0.7866998790722634,
"step": 648
},
{
"epoch": 1.923418423973363,
"grad_norm": 0.4536078870296478,
"learning_rate": 6.85728646486359e-06,
"loss": 0.6795,
"mean_token_accuracy": 0.7877265813082034,
"step": 649
},
{
"epoch": 1.926378098409175,
"grad_norm": 0.4446280896663666,
"learning_rate": 6.824495418291741e-06,
"loss": 0.6618,
"mean_token_accuracy": 0.793360405089406,
"step": 650
},
{
"epoch": 1.929337772844987,
"grad_norm": 0.4624863564968109,
"learning_rate": 6.791742301846325e-06,
"loss": 0.6943,
"mean_token_accuracy": 0.7851390097369664,
"step": 651
},
{
"epoch": 1.9322974472807992,
"grad_norm": 0.46851369738578796,
"learning_rate": 6.759027506750159e-06,
"loss": 0.6973,
"mean_token_accuracy": 0.7825239711607613,
"step": 652
},
{
"epoch": 1.935257121716611,
"grad_norm": 0.45422789454460144,
"learning_rate": 6.726351423768323e-06,
"loss": 0.7049,
"mean_token_accuracy": 0.7834841114161091,
"step": 653
},
{
"epoch": 1.9382167961524233,
"grad_norm": 0.4513411521911621,
"learning_rate": 6.693714443203507e-06,
"loss": 0.674,
"mean_token_accuracy": 0.7905625791946234,
"step": 654
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.44000759720802307,
"learning_rate": 6.661116954891329e-06,
"loss": 0.6889,
"mean_token_accuracy": 0.7876442391621598,
"step": 655
},
{
"epoch": 1.9441361450240473,
"grad_norm": 0.4787219166755676,
"learning_rate": 6.62855934819569e-06,
"loss": 0.7072,
"mean_token_accuracy": 0.779945015719257,
"step": 656
},
{
"epoch": 1.9470958194598595,
"grad_norm": 0.42401981353759766,
"learning_rate": 6.59604201200412e-06,
"loss": 0.6773,
"mean_token_accuracy": 0.7896623125899848,
"step": 657
},
{
"epoch": 1.9500554938956713,
"grad_norm": 0.5051243305206299,
"learning_rate": 6.563565334723134e-06,
"loss": 0.6973,
"mean_token_accuracy": 0.7849915232879509,
"step": 658
},
{
"epoch": 1.9530151683314836,
"grad_norm": 0.4501940608024597,
"learning_rate": 6.5311297042736046e-06,
"loss": 0.7169,
"mean_token_accuracy": 0.7793906916939676,
"step": 659
},
{
"epoch": 1.9559748427672956,
"grad_norm": 0.4441750645637512,
"learning_rate": 6.498735508086094e-06,
"loss": 0.6299,
"mean_token_accuracy": 0.80293781287729,
"step": 660
},
{
"epoch": 1.9589345172031076,
"grad_norm": 0.4581814706325531,
"learning_rate": 6.466383133096268e-06,
"loss": 0.696,
"mean_token_accuracy": 0.7832016518779903,
"step": 661
},
{
"epoch": 1.9618941916389199,
"grad_norm": 0.4844694137573242,
"learning_rate": 6.4340729657402424e-06,
"loss": 0.6553,
"mean_token_accuracy": 0.79566224863039,
"step": 662
},
{
"epoch": 1.9648538660747317,
"grad_norm": 0.47741377353668213,
"learning_rate": 6.40180539194999e-06,
"loss": 0.7005,
"mean_token_accuracy": 0.7834191013265248,
"step": 663
},
{
"epoch": 1.967813540510544,
"grad_norm": 0.4623546600341797,
"learning_rate": 6.3695807971487175e-06,
"loss": 0.6739,
"mean_token_accuracy": 0.7908099908248746,
"step": 664
},
{
"epoch": 1.970773214946356,
"grad_norm": 0.44196298718452454,
"learning_rate": 6.337399566246257e-06,
"loss": 0.6887,
"mean_token_accuracy": 0.7867410372609397,
"step": 665
},
{
"epoch": 1.973732889382168,
"grad_norm": 0.44744858145713806,
"learning_rate": 6.305262083634488e-06,
"loss": 0.6947,
"mean_token_accuracy": 0.7849319629950706,
"step": 666
},
{
"epoch": 1.97669256381798,
"grad_norm": 0.48888614773750305,
"learning_rate": 6.2731687331827214e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.7852305896900613,
"step": 667
},
{
"epoch": 1.979652238253792,
"grad_norm": 0.548868715763092,
"learning_rate": 6.2411198982331435e-06,
"loss": 0.6583,
"mean_token_accuracy": 0.7950044402077763,
"step": 668
},
{
"epoch": 1.9826119126896042,
"grad_norm": 0.44247865676879883,
"learning_rate": 6.209115961596208e-06,
"loss": 0.6963,
"mean_token_accuracy": 0.7844457961563795,
"step": 669
},
{
"epoch": 1.9855715871254163,
"grad_norm": 0.4636320471763611,
"learning_rate": 6.177157305546077e-06,
"loss": 0.6912,
"mean_token_accuracy": 0.7862634185033074,
"step": 670
},
{
"epoch": 1.9885312615612283,
"grad_norm": 0.4413374066352844,
"learning_rate": 6.145244311816063e-06,
"loss": 0.6814,
"mean_token_accuracy": 0.787695055467721,
"step": 671
},
{
"epoch": 1.9914909359970403,
"grad_norm": 0.45312613248825073,
"learning_rate": 6.113377361594048e-06,
"loss": 0.6754,
"mean_token_accuracy": 0.7900683076107496,
"step": 672
},
{
"epoch": 1.9944506104328523,
"grad_norm": 0.4501809775829315,
"learning_rate": 6.081556835517955e-06,
"loss": 0.6822,
"mean_token_accuracy": 0.788871206473793,
"step": 673
},
{
"epoch": 1.9974102848686646,
"grad_norm": 0.45863819122314453,
"learning_rate": 6.049783113671184e-06,
"loss": 0.6751,
"mean_token_accuracy": 0.7895972815620605,
"step": 674
},
{
"epoch": 2.0029596744358122,
"grad_norm": 0.7237296104431152,
"learning_rate": 6.018056575578075e-06,
"loss": 1.3777,
"mean_token_accuracy": 0.7904976583626417,
"step": 675
},
{
"epoch": 2.005919348871624,
"grad_norm": 0.4619278311729431,
"learning_rate": 5.986377600199371e-06,
"loss": 0.6827,
"mean_token_accuracy": 0.7859173509620443,
"step": 676
},
{
"epoch": 2.0088790233074363,
"grad_norm": 0.4588172435760498,
"learning_rate": 5.9547465659277215e-06,
"loss": 0.6602,
"mean_token_accuracy": 0.7935102380543758,
"step": 677
},
{
"epoch": 2.011838697743248,
"grad_norm": 0.4326033890247345,
"learning_rate": 5.923163850583114e-06,
"loss": 0.6169,
"mean_token_accuracy": 0.8052656884361966,
"step": 678
},
{
"epoch": 2.0147983721790603,
"grad_norm": 0.4270947277545929,
"learning_rate": 5.891629831408392e-06,
"loss": 0.6675,
"mean_token_accuracy": 0.7923289976402469,
"step": 679
},
{
"epoch": 2.0177580466148726,
"grad_norm": 0.42200711369514465,
"learning_rate": 5.8601448850647515e-06,
"loss": 0.7139,
"mean_token_accuracy": 0.7767213966992985,
"step": 680
},
{
"epoch": 2.0207177210506844,
"grad_norm": 0.4401227831840515,
"learning_rate": 5.828709387627219e-06,
"loss": 0.6296,
"mean_token_accuracy": 0.8034322271999133,
"step": 681
},
{
"epoch": 2.0236773954864966,
"grad_norm": 0.4614053964614868,
"learning_rate": 5.797323714580192e-06,
"loss": 0.6402,
"mean_token_accuracy": 0.7988319450662181,
"step": 682
},
{
"epoch": 2.0266370699223084,
"grad_norm": 0.4590739905834198,
"learning_rate": 5.7659882408129204e-06,
"loss": 0.6523,
"mean_token_accuracy": 0.79529094328691,
"step": 683
},
{
"epoch": 2.0295967443581207,
"grad_norm": 0.4543253481388092,
"learning_rate": 5.7347033406150494e-06,
"loss": 0.6733,
"mean_token_accuracy": 0.7890264127897217,
"step": 684
},
{
"epoch": 2.0325564187939325,
"grad_norm": 0.4582739770412445,
"learning_rate": 5.703469387672138e-06,
"loss": 0.6056,
"mean_token_accuracy": 0.8107667655932651,
"step": 685
},
{
"epoch": 2.0355160932297447,
"grad_norm": 0.42348945140838623,
"learning_rate": 5.672286755061212e-06,
"loss": 0.6377,
"mean_token_accuracy": 0.799343160525926,
"step": 686
},
{
"epoch": 2.038475767665557,
"grad_norm": 0.4367158114910126,
"learning_rate": 5.64115581524629e-06,
"loss": 0.6456,
"mean_token_accuracy": 0.7978098111032584,
"step": 687
},
{
"epoch": 2.0414354421013687,
"grad_norm": 0.4166472852230072,
"learning_rate": 5.610076940073939e-06,
"loss": 0.64,
"mean_token_accuracy": 0.7996033627487545,
"step": 688
},
{
"epoch": 2.044395116537181,
"grad_norm": 0.4349493980407715,
"learning_rate": 5.579050500768837e-06,
"loss": 0.6247,
"mean_token_accuracy": 0.8040890019359421,
"step": 689
},
{
"epoch": 2.0473547909729928,
"grad_norm": 0.43666020035743713,
"learning_rate": 5.548076867929331e-06,
"loss": 0.6499,
"mean_token_accuracy": 0.7959618095761632,
"step": 690
},
{
"epoch": 2.050314465408805,
"grad_norm": 0.4168229401111603,
"learning_rate": 5.517156411523026e-06,
"loss": 0.6207,
"mean_token_accuracy": 0.8063047096858116,
"step": 691
},
{
"epoch": 2.0532741398446173,
"grad_norm": 0.4426259398460388,
"learning_rate": 5.486289500882355e-06,
"loss": 0.6437,
"mean_token_accuracy": 0.7976666538912617,
"step": 692
},
{
"epoch": 2.056233814280429,
"grad_norm": 0.47709882259368896,
"learning_rate": 5.455476504700161e-06,
"loss": 0.6354,
"mean_token_accuracy": 0.8001667386857992,
"step": 693
},
{
"epoch": 2.0591934887162413,
"grad_norm": 0.4387308359146118,
"learning_rate": 5.424717791025302e-06,
"loss": 0.6093,
"mean_token_accuracy": 0.8074188099768709,
"step": 694
},
{
"epoch": 2.062153163152053,
"grad_norm": 0.42804378271102905,
"learning_rate": 5.3940137272582534e-06,
"loss": 0.6621,
"mean_token_accuracy": 0.7942881189608123,
"step": 695
},
{
"epoch": 2.0651128375878653,
"grad_norm": 0.4197988510131836,
"learning_rate": 5.3633646801467255e-06,
"loss": 0.6272,
"mean_token_accuracy": 0.8035603820307122,
"step": 696
},
{
"epoch": 2.0680725120236776,
"grad_norm": 0.4130113422870636,
"learning_rate": 5.332771015781275e-06,
"loss": 0.6318,
"mean_token_accuracy": 0.8026469316916442,
"step": 697
},
{
"epoch": 2.0710321864594894,
"grad_norm": 0.4477401077747345,
"learning_rate": 5.302233099590928e-06,
"loss": 0.6202,
"mean_token_accuracy": 0.8051835840765896,
"step": 698
},
{
"epoch": 2.0739918608953016,
"grad_norm": 0.4212632477283478,
"learning_rate": 5.271751296338823e-06,
"loss": 0.6454,
"mean_token_accuracy": 0.7975187090662971,
"step": 699
},
{
"epoch": 2.0769515353311134,
"grad_norm": 0.43481898307800293,
"learning_rate": 5.241325970117851e-06,
"loss": 0.6298,
"mean_token_accuracy": 0.8037347054797938,
"step": 700
},
{
"epoch": 2.0769515353311134,
"eval_loss": 0.7401972413063049,
"eval_mean_token_accuracy": 0.7715796790522826,
"eval_runtime": 24.8345,
"eval_samples_per_second": 5.194,
"eval_steps_per_second": 1.329,
"step": 700
},
{
"epoch": 2.0799112097669257,
"grad_norm": 0.42328760027885437,
"learning_rate": 5.210957484346314e-06,
"loss": 0.5797,
"mean_token_accuracy": 0.8171162575964448,
"step": 701
},
{
"epoch": 2.0828708842027375,
"grad_norm": 0.40636351704597473,
"learning_rate": 5.1806462017635775e-06,
"loss": 0.6444,
"mean_token_accuracy": 0.7976044651105583,
"step": 702
},
{
"epoch": 2.0858305586385497,
"grad_norm": 0.4619290232658386,
"learning_rate": 5.150392484425728e-06,
"loss": 0.6432,
"mean_token_accuracy": 0.7998582873056539,
"step": 703
},
{
"epoch": 2.088790233074362,
"grad_norm": 0.42781201004981995,
"learning_rate": 5.120196693701267e-06,
"loss": 0.6447,
"mean_token_accuracy": 0.7980342866377519,
"step": 704
},
{
"epoch": 2.0917499075101738,
"grad_norm": 0.435585081577301,
"learning_rate": 5.090059190266779e-06,
"loss": 0.6703,
"mean_token_accuracy": 0.7898306031291672,
"step": 705
},
{
"epoch": 2.094709581945986,
"grad_norm": 0.42848485708236694,
"learning_rate": 5.059980334102637e-06,
"loss": 0.6399,
"mean_token_accuracy": 0.8012392387851905,
"step": 706
},
{
"epoch": 2.097669256381798,
"grad_norm": 0.44752803444862366,
"learning_rate": 5.0299604844886985e-06,
"loss": 0.6444,
"mean_token_accuracy": 0.7983052126079367,
"step": 707
},
{
"epoch": 2.10062893081761,
"grad_norm": 0.41624656319618225,
"learning_rate": 5.000000000000003e-06,
"loss": 0.6564,
"mean_token_accuracy": 0.7942197264250628,
"step": 708
},
{
"epoch": 2.1035886052534223,
"grad_norm": 0.4133838415145874,
"learning_rate": 4.970099238502494e-06,
"loss": 0.6516,
"mean_token_accuracy": 0.7961836172559192,
"step": 709
},
{
"epoch": 2.106548279689234,
"grad_norm": 0.4188925325870514,
"learning_rate": 4.940258557148765e-06,
"loss": 0.6703,
"mean_token_accuracy": 0.7904122765338784,
"step": 710
},
{
"epoch": 2.1095079541250463,
"grad_norm": 0.4261308014392853,
"learning_rate": 4.910478312373757e-06,
"loss": 0.6172,
"mean_token_accuracy": 0.8066983237111479,
"step": 711
},
{
"epoch": 2.112467628560858,
"grad_norm": 0.40434494614601135,
"learning_rate": 4.8807588598905364e-06,
"loss": 0.6482,
"mean_token_accuracy": 0.7977588880511752,
"step": 712
},
{
"epoch": 2.1154273029966704,
"grad_norm": 0.4250684380531311,
"learning_rate": 4.8511005546860214e-06,
"loss": 0.6495,
"mean_token_accuracy": 0.7967420913450249,
"step": 713
},
{
"epoch": 2.1183869774324826,
"grad_norm": 0.4167192280292511,
"learning_rate": 4.821503751016746e-06,
"loss": 0.6226,
"mean_token_accuracy": 0.8038675074568771,
"step": 714
},
{
"epoch": 2.1213466518682944,
"grad_norm": 0.4020220637321472,
"learning_rate": 4.791968802404648e-06,
"loss": 0.639,
"mean_token_accuracy": 0.8002841240121322,
"step": 715
},
{
"epoch": 2.1243063263041067,
"grad_norm": 0.41898688673973083,
"learning_rate": 4.762496061632814e-06,
"loss": 0.5961,
"mean_token_accuracy": 0.8106809432630374,
"step": 716
},
{
"epoch": 2.1272660007399185,
"grad_norm": 0.4082755446434021,
"learning_rate": 4.733085880741301e-06,
"loss": 0.6836,
"mean_token_accuracy": 0.7858357226121178,
"step": 717
},
{
"epoch": 2.1302256751757307,
"grad_norm": 0.4276457130908966,
"learning_rate": 4.703738611022899e-06,
"loss": 0.6561,
"mean_token_accuracy": 0.7956159537823245,
"step": 718
},
{
"epoch": 2.133185349611543,
"grad_norm": 0.42158472537994385,
"learning_rate": 4.674454603018949e-06,
"loss": 0.6147,
"mean_token_accuracy": 0.8079100447436781,
"step": 719
},
{
"epoch": 2.1361450240473547,
"grad_norm": 0.4250597357749939,
"learning_rate": 4.645234206515171e-06,
"loss": 0.6386,
"mean_token_accuracy": 0.8010068266815492,
"step": 720
},
{
"epoch": 2.139104698483167,
"grad_norm": 0.4138052463531494,
"learning_rate": 4.616077770537453e-06,
"loss": 0.6231,
"mean_token_accuracy": 0.804220202437573,
"step": 721
},
{
"epoch": 2.1420643729189788,
"grad_norm": 0.4031846523284912,
"learning_rate": 4.586985643347716e-06,
"loss": 0.6353,
"mean_token_accuracy": 0.7999556744979773,
"step": 722
},
{
"epoch": 2.145024047354791,
"grad_norm": 0.4207233190536499,
"learning_rate": 4.557958172439726e-06,
"loss": 0.6519,
"mean_token_accuracy": 0.795605835154003,
"step": 723
},
{
"epoch": 2.1479837217906033,
"grad_norm": 0.4172452390193939,
"learning_rate": 4.5289957045349655e-06,
"loss": 0.6214,
"mean_token_accuracy": 0.8051871043336377,
"step": 724
},
{
"epoch": 2.150943396226415,
"grad_norm": 0.4109727442264557,
"learning_rate": 4.500098585578475e-06,
"loss": 0.62,
"mean_token_accuracy": 0.8044500506016459,
"step": 725
},
{
"epoch": 2.1539030706622273,
"grad_norm": 0.4343760907649994,
"learning_rate": 4.471267160734731e-06,
"loss": 0.6539,
"mean_token_accuracy": 0.7939436976287444,
"step": 726
},
{
"epoch": 2.156862745098039,
"grad_norm": 0.4174571931362152,
"learning_rate": 4.4425017743835155e-06,
"loss": 0.6371,
"mean_token_accuracy": 0.8005225952205913,
"step": 727
},
{
"epoch": 2.1598224195338513,
"grad_norm": 0.38494619727134705,
"learning_rate": 4.413802770115816e-06,
"loss": 0.6524,
"mean_token_accuracy": 0.7961488383409648,
"step": 728
},
{
"epoch": 2.162782093969663,
"grad_norm": 0.41858088970184326,
"learning_rate": 4.385170490729712e-06,
"loss": 0.6421,
"mean_token_accuracy": 0.7982196911670912,
"step": 729
},
{
"epoch": 2.1657417684054754,
"grad_norm": 0.45224249362945557,
"learning_rate": 4.356605278226274e-06,
"loss": 0.6639,
"mean_token_accuracy": 0.7918000336006263,
"step": 730
},
{
"epoch": 2.1687014428412876,
"grad_norm": 0.43538355827331543,
"learning_rate": 4.328107473805487e-06,
"loss": 0.6383,
"mean_token_accuracy": 0.800484981130683,
"step": 731
},
{
"epoch": 2.1716611172770994,
"grad_norm": 0.3976902365684509,
"learning_rate": 4.299677417862174e-06,
"loss": 0.6556,
"mean_token_accuracy": 0.7932561264782982,
"step": 732
},
{
"epoch": 2.1746207917129117,
"grad_norm": 0.44118574261665344,
"learning_rate": 4.2713154499819345e-06,
"loss": 0.6636,
"mean_token_accuracy": 0.7921884608817545,
"step": 733
},
{
"epoch": 2.1775804661487235,
"grad_norm": 0.4160580635070801,
"learning_rate": 4.243021908937083e-06,
"loss": 0.6136,
"mean_token_accuracy": 0.8078645092004564,
"step": 734
},
{
"epoch": 2.1805401405845357,
"grad_norm": 0.4081907272338867,
"learning_rate": 4.214797132682597e-06,
"loss": 0.6017,
"mean_token_accuracy": 0.8104744103732681,
"step": 735
},
{
"epoch": 2.183499815020348,
"grad_norm": 0.4466439187526703,
"learning_rate": 4.186641458352088e-06,
"loss": 0.6713,
"mean_token_accuracy": 0.7900975226524254,
"step": 736
},
{
"epoch": 2.1864594894561598,
"grad_norm": 0.4527799189090729,
"learning_rate": 4.158555222253772e-06,
"loss": 0.6744,
"mean_token_accuracy": 0.7901550404552812,
"step": 737
},
{
"epoch": 2.189419163891972,
"grad_norm": 0.4166731536388397,
"learning_rate": 4.130538759866457e-06,
"loss": 0.6523,
"mean_token_accuracy": 0.795872875107717,
"step": 738
},
{
"epoch": 2.192378838327784,
"grad_norm": 0.4434090852737427,
"learning_rate": 4.102592405835536e-06,
"loss": 0.6366,
"mean_token_accuracy": 0.8006169174890402,
"step": 739
},
{
"epoch": 2.195338512763596,
"grad_norm": 0.4182213842868805,
"learning_rate": 4.074716493968976e-06,
"loss": 0.6193,
"mean_token_accuracy": 0.8064642927723187,
"step": 740
},
{
"epoch": 2.1982981871994083,
"grad_norm": 0.4401805102825165,
"learning_rate": 4.046911357233343e-06,
"loss": 0.5899,
"mean_token_accuracy": 0.8129922266946384,
"step": 741
},
{
"epoch": 2.20125786163522,
"grad_norm": 0.4129815697669983,
"learning_rate": 4.019177327749822e-06,
"loss": 0.6164,
"mean_token_accuracy": 0.8067027474840832,
"step": 742
},
{
"epoch": 2.2042175360710323,
"grad_norm": 0.414181649684906,
"learning_rate": 3.991514736790259e-06,
"loss": 0.6572,
"mean_token_accuracy": 0.7943868846552696,
"step": 743
},
{
"epoch": 2.207177210506844,
"grad_norm": 0.41192206740379333,
"learning_rate": 3.9639239147731865e-06,
"loss": 0.6105,
"mean_token_accuracy": 0.8081474157714055,
"step": 744
},
{
"epoch": 2.2101368849426564,
"grad_norm": 0.4337133765220642,
"learning_rate": 3.936405191259891e-06,
"loss": 0.646,
"mean_token_accuracy": 0.7979063248420304,
"step": 745
},
{
"epoch": 2.213096559378468,
"grad_norm": 0.42786547541618347,
"learning_rate": 3.908958894950465e-06,
"loss": 0.6611,
"mean_token_accuracy": 0.7936699913649292,
"step": 746
},
{
"epoch": 2.2160562338142804,
"grad_norm": 0.45288723707199097,
"learning_rate": 3.881585353679891e-06,
"loss": 0.6648,
"mean_token_accuracy": 0.7914008191748386,
"step": 747
},
{
"epoch": 2.2190159082500927,
"grad_norm": 0.45666372776031494,
"learning_rate": 3.854284894414122e-06,
"loss": 0.6291,
"mean_token_accuracy": 0.8025700241416271,
"step": 748
},
{
"epoch": 2.2219755826859044,
"grad_norm": 0.41519424319267273,
"learning_rate": 3.827057843246181e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.8051112931321951,
"step": 749
},
{
"epoch": 2.2249352571217167,
"grad_norm": 0.42094844579696655,
"learning_rate": 3.799904525392251e-06,
"loss": 0.6083,
"mean_token_accuracy": 0.8084426362380992,
"step": 750
},
{
"epoch": 2.2278949315575285,
"grad_norm": 0.41918104887008667,
"learning_rate": 3.7728252651878018e-06,
"loss": 0.6584,
"mean_token_accuracy": 0.7924028935909405,
"step": 751
},
{
"epoch": 2.2308546059933407,
"grad_norm": 0.43053704500198364,
"learning_rate": 3.745820386083724e-06,
"loss": 0.6675,
"mean_token_accuracy": 0.7899391245102569,
"step": 752
},
{
"epoch": 2.233814280429153,
"grad_norm": 0.433442085981369,
"learning_rate": 3.718890210642442e-06,
"loss": 0.6606,
"mean_token_accuracy": 0.7937032510168863,
"step": 753
},
{
"epoch": 2.2367739548649648,
"grad_norm": 0.4508717358112335,
"learning_rate": 3.6920350605340883e-06,
"loss": 0.6266,
"mean_token_accuracy": 0.8043645426941337,
"step": 754
},
{
"epoch": 2.239733629300777,
"grad_norm": 0.4047711491584778,
"learning_rate": 3.6652552565326382e-06,
"loss": 0.6681,
"mean_token_accuracy": 0.7906295543184187,
"step": 755
},
{
"epoch": 2.242693303736589,
"grad_norm": 0.4291645586490631,
"learning_rate": 3.638551118512089e-06,
"loss": 0.6562,
"mean_token_accuracy": 0.7943759677554681,
"step": 756
},
{
"epoch": 2.245652978172401,
"grad_norm": 0.46543434262275696,
"learning_rate": 3.611922965442648e-06,
"loss": 0.6955,
"mean_token_accuracy": 0.7842147288330679,
"step": 757
},
{
"epoch": 2.2486126526082133,
"grad_norm": 0.44530779123306274,
"learning_rate": 3.5853711153868962e-06,
"loss": 0.6443,
"mean_token_accuracy": 0.7977429200467334,
"step": 758
},
{
"epoch": 2.251572327044025,
"grad_norm": 0.4391216039657593,
"learning_rate": 3.558895885496023e-06,
"loss": 0.6551,
"mean_token_accuracy": 0.7939586840706503,
"step": 759
},
{
"epoch": 2.2545320014798373,
"grad_norm": 0.42412394285202026,
"learning_rate": 3.53249759200601e-06,
"loss": 0.6217,
"mean_token_accuracy": 0.8050196332982708,
"step": 760
},
{
"epoch": 2.257491675915649,
"grad_norm": 0.42387112975120544,
"learning_rate": 3.506176550233863e-06,
"loss": 0.6572,
"mean_token_accuracy": 0.794313531964468,
"step": 761
},
{
"epoch": 2.2604513503514614,
"grad_norm": 0.434893399477005,
"learning_rate": 3.479933074573858e-06,
"loss": 0.6855,
"mean_token_accuracy": 0.7879362757907509,
"step": 762
},
{
"epoch": 2.263411024787273,
"grad_norm": 0.4247857928276062,
"learning_rate": 3.453767478493761e-06,
"loss": 0.644,
"mean_token_accuracy": 0.7982682262279043,
"step": 763
},
{
"epoch": 2.2663706992230854,
"grad_norm": 0.42670580744743347,
"learning_rate": 3.4276800745311135e-06,
"loss": 0.6195,
"mean_token_accuracy": 0.8050541199962113,
"step": 764
},
{
"epoch": 2.2693303736588977,
"grad_norm": 0.3888881504535675,
"learning_rate": 3.401671174289469e-06,
"loss": 0.6515,
"mean_token_accuracy": 0.7958488753426484,
"step": 765
},
{
"epoch": 2.2722900480947095,
"grad_norm": 0.41099730134010315,
"learning_rate": 3.37574108843469e-06,
"loss": 0.6781,
"mean_token_accuracy": 0.7891008767600376,
"step": 766
},
{
"epoch": 2.2752497225305217,
"grad_norm": 0.41824233531951904,
"learning_rate": 3.3498901266912397e-06,
"loss": 0.6085,
"mean_token_accuracy": 0.8082267427244683,
"step": 767
},
{
"epoch": 2.2782093969663335,
"grad_norm": 0.4144093692302704,
"learning_rate": 3.3241185978384636e-06,
"loss": 0.6699,
"mean_token_accuracy": 0.7909267478796423,
"step": 768
},
{
"epoch": 2.2811690714021458,
"grad_norm": 0.42413535714149475,
"learning_rate": 3.2984268097069284e-06,
"loss": 0.6339,
"mean_token_accuracy": 0.801065864295844,
"step": 769
},
{
"epoch": 2.284128745837958,
"grad_norm": 0.39951691031455994,
"learning_rate": 3.2728150691747117e-06,
"loss": 0.6411,
"mean_token_accuracy": 0.7983959606160835,
"step": 770
},
{
"epoch": 2.28708842027377,
"grad_norm": 0.41182059049606323,
"learning_rate": 3.2472836821637744e-06,
"loss": 0.6281,
"mean_token_accuracy": 0.802523700960331,
"step": 771
},
{
"epoch": 2.290048094709582,
"grad_norm": 0.4084027409553528,
"learning_rate": 3.22183295363627e-06,
"loss": 0.6265,
"mean_token_accuracy": 0.802413599215893,
"step": 772
},
{
"epoch": 2.293007769145394,
"grad_norm": 0.4154830574989319,
"learning_rate": 3.196463187590929e-06,
"loss": 0.649,
"mean_token_accuracy": 0.796869447336104,
"step": 773
},
{
"epoch": 2.295967443581206,
"grad_norm": 0.4043501019477844,
"learning_rate": 3.1711746870594083e-06,
"loss": 0.6287,
"mean_token_accuracy": 0.8030152586126692,
"step": 774
},
{
"epoch": 2.2989271180170183,
"grad_norm": 0.4156252443790436,
"learning_rate": 3.145967754102691e-06,
"loss": 0.6372,
"mean_token_accuracy": 0.8003738520892887,
"step": 775
},
{
"epoch": 2.30188679245283,
"grad_norm": 0.4200536012649536,
"learning_rate": 3.1208426898074685e-06,
"loss": 0.671,
"mean_token_accuracy": 0.7902292574180307,
"step": 776
},
{
"epoch": 2.3048464668886424,
"grad_norm": 0.41020068526268005,
"learning_rate": 3.0957997942825337e-06,
"loss": 0.6371,
"mean_token_accuracy": 0.7996246095334629,
"step": 777
},
{
"epoch": 2.307806141324454,
"grad_norm": 0.419129341840744,
"learning_rate": 3.070839366655215e-06,
"loss": 0.6468,
"mean_token_accuracy": 0.7962623324512614,
"step": 778
},
{
"epoch": 2.3107658157602664,
"grad_norm": 0.4240724444389343,
"learning_rate": 3.045961705067787e-06,
"loss": 0.659,
"mean_token_accuracy": 0.7953217981209549,
"step": 779
},
{
"epoch": 2.313725490196078,
"grad_norm": 0.4143866002559662,
"learning_rate": 3.021167106673928e-06,
"loss": 0.6424,
"mean_token_accuracy": 0.7982811964276817,
"step": 780
},
{
"epoch": 2.3166851646318904,
"grad_norm": 0.3981107175350189,
"learning_rate": 2.996455867635155e-06,
"loss": 0.6607,
"mean_token_accuracy": 0.791508945971797,
"step": 781
},
{
"epoch": 2.3196448390677027,
"grad_norm": 0.4162614345550537,
"learning_rate": 2.9718282831172885e-06,
"loss": 0.6504,
"mean_token_accuracy": 0.7963113772717785,
"step": 782
},
{
"epoch": 2.3226045135035145,
"grad_norm": 0.4027155637741089,
"learning_rate": 2.94728464728693e-06,
"loss": 0.6019,
"mean_token_accuracy": 0.8109574738019254,
"step": 783
},
{
"epoch": 2.3255641879393267,
"grad_norm": 0.3899628520011902,
"learning_rate": 2.922825253307947e-06,
"loss": 0.6574,
"mean_token_accuracy": 0.7928772726976023,
"step": 784
},
{
"epoch": 2.328523862375139,
"grad_norm": 0.40858355164527893,
"learning_rate": 2.898450393337977e-06,
"loss": 0.6746,
"mean_token_accuracy": 0.7888906732688429,
"step": 785
},
{
"epoch": 2.3314835368109508,
"grad_norm": 0.41953524947166443,
"learning_rate": 2.8741603585249312e-06,
"loss": 0.6408,
"mean_token_accuracy": 0.7996593220237972,
"step": 786
},
{
"epoch": 2.334443211246763,
"grad_norm": 0.4106321334838867,
"learning_rate": 2.8499554390035144e-06,
"loss": 0.6483,
"mean_token_accuracy": 0.7961187957538525,
"step": 787
},
{
"epoch": 2.337402885682575,
"grad_norm": 0.3997010886669159,
"learning_rate": 2.8258359238917665e-06,
"loss": 0.6245,
"mean_token_accuracy": 0.8050718498453701,
"step": 788
},
{
"epoch": 2.340362560118387,
"grad_norm": 0.409584105014801,
"learning_rate": 2.8018021012875994e-06,
"loss": 0.6017,
"mean_token_accuracy": 0.8123756950624281,
"step": 789
},
{
"epoch": 2.343322234554199,
"grad_norm": 0.40811508893966675,
"learning_rate": 2.7778542582653746e-06,
"loss": 0.6084,
"mean_token_accuracy": 0.8097888468215142,
"step": 790
},
{
"epoch": 2.346281908990011,
"grad_norm": 0.38959836959838867,
"learning_rate": 2.753992680872457e-06,
"loss": 0.6062,
"mean_token_accuracy": 0.8108852376474688,
"step": 791
},
{
"epoch": 2.3492415834258233,
"grad_norm": 0.3957045376300812,
"learning_rate": 2.7302176541257984e-06,
"loss": 0.6328,
"mean_token_accuracy": 0.8015920238869745,
"step": 792
},
{
"epoch": 2.352201257861635,
"grad_norm": 0.40360507369041443,
"learning_rate": 2.7065294620085425e-06,
"loss": 0.648,
"mean_token_accuracy": 0.7971657427741622,
"step": 793
},
{
"epoch": 2.3551609322974474,
"grad_norm": 0.45460647344589233,
"learning_rate": 2.6829283874666236e-06,
"loss": 0.6445,
"mean_token_accuracy": 0.7987224105203672,
"step": 794
},
{
"epoch": 2.358120606733259,
"grad_norm": 0.39461758732795715,
"learning_rate": 2.6594147124053983e-06,
"loss": 0.6515,
"mean_token_accuracy": 0.796149561930855,
"step": 795
},
{
"epoch": 2.3610802811690714,
"grad_norm": 0.41954609751701355,
"learning_rate": 2.635988717686272e-06,
"loss": 0.6246,
"mean_token_accuracy": 0.8032149733919829,
"step": 796
},
{
"epoch": 2.3640399556048832,
"grad_norm": 0.40406131744384766,
"learning_rate": 2.6126506831233343e-06,
"loss": 0.6231,
"mean_token_accuracy": 0.8041168200702946,
"step": 797
},
{
"epoch": 2.3669996300406955,
"grad_norm": 0.3953285217285156,
"learning_rate": 2.5894008874800323e-06,
"loss": 0.6069,
"mean_token_accuracy": 0.8095464392825367,
"step": 798
},
{
"epoch": 2.3699593044765077,
"grad_norm": 0.39232245087623596,
"learning_rate": 2.5662396084658383e-06,
"loss": 0.6887,
"mean_token_accuracy": 0.7847626079340336,
"step": 799
},
{
"epoch": 2.3729189789123195,
"grad_norm": 0.4201255440711975,
"learning_rate": 2.543167122732918e-06,
"loss": 0.6305,
"mean_token_accuracy": 0.8017334424766583,
"step": 800
},
{
"epoch": 2.3729189789123195,
"eval_loss": 0.7381730079650879,
"eval_mean_token_accuracy": 0.7720131224354058,
"eval_runtime": 24.485,
"eval_samples_per_second": 5.269,
"eval_steps_per_second": 1.348,
"step": 800
},
{
"epoch": 2.3758786533481318,
"grad_norm": 0.41799813508987427,
"learning_rate": 2.5201837058728506e-06,
"loss": 0.6426,
"mean_token_accuracy": 0.7981351114043707,
"step": 801
},
{
"epoch": 2.378838327783944,
"grad_norm": 0.4000183939933777,
"learning_rate": 2.4972896324133143e-06,
"loss": 0.6408,
"mean_token_accuracy": 0.8005246267043143,
"step": 802
},
{
"epoch": 2.381798002219756,
"grad_norm": 0.3985981345176697,
"learning_rate": 2.474485175814816e-06,
"loss": 0.6419,
"mean_token_accuracy": 0.8006590768326411,
"step": 803
},
{
"epoch": 2.384757676655568,
"grad_norm": 0.39996403455734253,
"learning_rate": 2.451770608467432e-06,
"loss": 0.6556,
"mean_token_accuracy": 0.7937097877818717,
"step": 804
},
{
"epoch": 2.38771735109138,
"grad_norm": 0.40971750020980835,
"learning_rate": 2.429146201687538e-06,
"loss": 0.6544,
"mean_token_accuracy": 0.7955813996484105,
"step": 805
},
{
"epoch": 2.390677025527192,
"grad_norm": 0.4214819669723511,
"learning_rate": 2.4066122257145898e-06,
"loss": 0.6192,
"mean_token_accuracy": 0.8044043910369116,
"step": 806
},
{
"epoch": 2.393636699963004,
"grad_norm": 0.3935949206352234,
"learning_rate": 2.3841689497078746e-06,
"loss": 0.6616,
"mean_token_accuracy": 0.7930927722183864,
"step": 807
},
{
"epoch": 2.396596374398816,
"grad_norm": 0.4110560715198517,
"learning_rate": 2.361816641743303e-06,
"loss": 0.6589,
"mean_token_accuracy": 0.7933747994521603,
"step": 808
},
{
"epoch": 2.3995560488346284,
"grad_norm": 0.40695828199386597,
"learning_rate": 2.339555568810221e-06,
"loss": 0.6654,
"mean_token_accuracy": 0.79236514420736,
"step": 809
},
{
"epoch": 2.40251572327044,
"grad_norm": 0.4188994765281677,
"learning_rate": 2.317385996808195e-06,
"loss": 0.6401,
"mean_token_accuracy": 0.7978658874862038,
"step": 810
},
{
"epoch": 2.4054753977062524,
"grad_norm": 0.4050770699977875,
"learning_rate": 2.295308190543859e-06,
"loss": 0.6565,
"mean_token_accuracy": 0.793120003753917,
"step": 811
},
{
"epoch": 2.408435072142064,
"grad_norm": 0.4208693206310272,
"learning_rate": 2.2733224137277366e-06,
"loss": 0.6625,
"mean_token_accuracy": 0.7924009490317484,
"step": 812
},
{
"epoch": 2.4113947465778764,
"grad_norm": 0.41889867186546326,
"learning_rate": 2.251428928971102e-06,
"loss": 0.6421,
"mean_token_accuracy": 0.7987856486295601,
"step": 813
},
{
"epoch": 2.4143544210136882,
"grad_norm": 0.3941342532634735,
"learning_rate": 2.229627997782834e-06,
"loss": 0.6522,
"mean_token_accuracy": 0.7969981541204149,
"step": 814
},
{
"epoch": 2.4173140954495005,
"grad_norm": 0.4085904061794281,
"learning_rate": 2.2079198805662917e-06,
"loss": 0.636,
"mean_token_accuracy": 0.8006980355838276,
"step": 815
},
{
"epoch": 2.4202737698853127,
"grad_norm": 0.39825567603111267,
"learning_rate": 2.186304836616221e-06,
"loss": 0.6447,
"mean_token_accuracy": 0.7977600103702366,
"step": 816
},
{
"epoch": 2.4232334443211245,
"grad_norm": 0.40731707215309143,
"learning_rate": 2.1647831241156304e-06,
"loss": 0.6504,
"mean_token_accuracy": 0.7959071538531968,
"step": 817
},
{
"epoch": 2.4261931187569368,
"grad_norm": 0.406483918428421,
"learning_rate": 2.1433550001327376e-06,
"loss": 0.6639,
"mean_token_accuracy": 0.7929632102578547,
"step": 818
},
{
"epoch": 2.429152793192749,
"grad_norm": 0.40205124020576477,
"learning_rate": 2.122020720617869e-06,
"loss": 0.6602,
"mean_token_accuracy": 0.7925995018559459,
"step": 819
},
{
"epoch": 2.432112467628561,
"grad_norm": 0.39821526408195496,
"learning_rate": 2.1007805404004247e-06,
"loss": 0.6125,
"mean_token_accuracy": 0.8074528559405126,
"step": 820
},
{
"epoch": 2.435072142064373,
"grad_norm": 0.41154807806015015,
"learning_rate": 2.0796347131858187e-06,
"loss": 0.5924,
"mean_token_accuracy": 0.8134260585147182,
"step": 821
},
{
"epoch": 2.438031816500185,
"grad_norm": 0.4058592617511749,
"learning_rate": 2.058583491552465e-06,
"loss": 0.6446,
"mean_token_accuracy": 0.7976544788468782,
"step": 822
},
{
"epoch": 2.440991490935997,
"grad_norm": 0.4115375280380249,
"learning_rate": 2.037627126948751e-06,
"loss": 0.6486,
"mean_token_accuracy": 0.7961866171753605,
"step": 823
},
{
"epoch": 2.443951165371809,
"grad_norm": 0.4094432592391968,
"learning_rate": 2.0167658696900317e-06,
"loss": 0.6498,
"mean_token_accuracy": 0.7974890015343987,
"step": 824
},
{
"epoch": 2.446910839807621,
"grad_norm": 0.4136302173137665,
"learning_rate": 1.9959999689556407e-06,
"loss": 0.6508,
"mean_token_accuracy": 0.7969356095942468,
"step": 825
},
{
"epoch": 2.4498705142434334,
"grad_norm": 0.3964935839176178,
"learning_rate": 1.9753296727859195e-06,
"loss": 0.6422,
"mean_token_accuracy": 0.7975552703681513,
"step": 826
},
{
"epoch": 2.452830188679245,
"grad_norm": 0.3923938572406769,
"learning_rate": 1.9547552280792528e-06,
"loss": 0.6644,
"mean_token_accuracy": 0.7923696593805352,
"step": 827
},
{
"epoch": 2.4557898631150574,
"grad_norm": 0.40744659304618835,
"learning_rate": 1.9342768805891176e-06,
"loss": 0.6185,
"mean_token_accuracy": 0.805182835348635,
"step": 828
},
{
"epoch": 2.4587495375508692,
"grad_norm": 0.3996569812297821,
"learning_rate": 1.9138948749211473e-06,
"loss": 0.6885,
"mean_token_accuracy": 0.7846693968735795,
"step": 829
},
{
"epoch": 2.4617092119866815,
"grad_norm": 0.41256505250930786,
"learning_rate": 1.8936094545302098e-06,
"loss": 0.633,
"mean_token_accuracy": 0.80203945172239,
"step": 830
},
{
"epoch": 2.4646688864224937,
"grad_norm": 0.40780341625213623,
"learning_rate": 1.8734208617174986e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.8031928870956203,
"step": 831
},
{
"epoch": 2.4676285608583055,
"grad_norm": 0.40084558725357056,
"learning_rate": 1.8533293376276473e-06,
"loss": 0.6645,
"mean_token_accuracy": 0.7910519113431395,
"step": 832
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.39891085028648376,
"learning_rate": 1.8333351222458407e-06,
"loss": 0.6457,
"mean_token_accuracy": 0.7975340656719943,
"step": 833
},
{
"epoch": 2.4735479097299296,
"grad_norm": 0.39134928584098816,
"learning_rate": 1.813438454394948e-06,
"loss": 0.666,
"mean_token_accuracy": 0.7898439445031347,
"step": 834
},
{
"epoch": 2.476507584165742,
"grad_norm": 0.41572368144989014,
"learning_rate": 1.7936395717326705e-06,
"loss": 0.6414,
"mean_token_accuracy": 0.7995425811392918,
"step": 835
},
{
"epoch": 2.479467258601554,
"grad_norm": 0.40483906865119934,
"learning_rate": 1.773938710748706e-06,
"loss": 0.662,
"mean_token_accuracy": 0.7923949344399477,
"step": 836
},
{
"epoch": 2.482426933037366,
"grad_norm": 0.40634220838546753,
"learning_rate": 1.7543361067619269e-06,
"loss": 0.615,
"mean_token_accuracy": 0.806526275556733,
"step": 837
},
{
"epoch": 2.485386607473178,
"grad_norm": 0.4077673852443695,
"learning_rate": 1.734831993917564e-06,
"loss": 0.6328,
"mean_token_accuracy": 0.802378745198797,
"step": 838
},
{
"epoch": 2.48834628190899,
"grad_norm": 0.39237353205680847,
"learning_rate": 1.715426605184407e-06,
"loss": 0.6155,
"mean_token_accuracy": 0.8060445709769514,
"step": 839
},
{
"epoch": 2.491305956344802,
"grad_norm": 0.4081886112689972,
"learning_rate": 1.6961201723520248e-06,
"loss": 0.6144,
"mean_token_accuracy": 0.8065054898817852,
"step": 840
},
{
"epoch": 2.494265630780614,
"grad_norm": 0.4004494547843933,
"learning_rate": 1.676912926028007e-06,
"loss": 0.6055,
"mean_token_accuracy": 0.809148562640221,
"step": 841
},
{
"epoch": 2.497225305216426,
"grad_norm": 0.4009197950363159,
"learning_rate": 1.6578050956351887e-06,
"loss": 0.6238,
"mean_token_accuracy": 0.8049418801942305,
"step": 842
},
{
"epoch": 2.5001849796522384,
"grad_norm": 0.3991737365722656,
"learning_rate": 1.6387969094089318e-06,
"loss": 0.6176,
"mean_token_accuracy": 0.8066388869046413,
"step": 843
},
{
"epoch": 2.50314465408805,
"grad_norm": 0.4157380163669586,
"learning_rate": 1.619888594394382e-06,
"loss": 0.6469,
"mean_token_accuracy": 0.7967652752528133,
"step": 844
},
{
"epoch": 2.5061043285238624,
"grad_norm": 0.41072478890419006,
"learning_rate": 1.6010803764437633e-06,
"loss": 0.6285,
"mean_token_accuracy": 0.8027356011802552,
"step": 845
},
{
"epoch": 2.5090640029596747,
"grad_norm": 0.38374269008636475,
"learning_rate": 1.5823724802136863e-06,
"loss": 0.6655,
"mean_token_accuracy": 0.7921055036509936,
"step": 846
},
{
"epoch": 2.5120236773954865,
"grad_norm": 0.3918653130531311,
"learning_rate": 1.5637651291624522e-06,
"loss": 0.6541,
"mean_token_accuracy": 0.7957731421640813,
"step": 847
},
{
"epoch": 2.5149833518312983,
"grad_norm": 0.4183335602283478,
"learning_rate": 1.545258545547398e-06,
"loss": 0.6808,
"mean_token_accuracy": 0.7860103025645604,
"step": 848
},
{
"epoch": 2.5179430262671105,
"grad_norm": 0.40748029947280884,
"learning_rate": 1.5268529504222262e-06,
"loss": 0.6476,
"mean_token_accuracy": 0.7958813429391195,
"step": 849
},
{
"epoch": 2.5209027007029228,
"grad_norm": 0.4112967550754547,
"learning_rate": 1.5085485636343755e-06,
"loss": 0.6305,
"mean_token_accuracy": 0.8016536067152452,
"step": 850
},
{
"epoch": 2.5238623751387346,
"grad_norm": 0.40055161714553833,
"learning_rate": 1.4903456038223941e-06,
"loss": 0.6374,
"mean_token_accuracy": 0.799591641647149,
"step": 851
},
{
"epoch": 2.526822049574547,
"grad_norm": 0.39930155873298645,
"learning_rate": 1.4722442884133214e-06,
"loss": 0.5796,
"mean_token_accuracy": 0.8175529008877027,
"step": 852
},
{
"epoch": 2.529781724010359,
"grad_norm": 0.3882523477077484,
"learning_rate": 1.4542448336201021e-06,
"loss": 0.646,
"mean_token_accuracy": 0.7965177087401804,
"step": 853
},
{
"epoch": 2.532741398446171,
"grad_norm": 0.4089968502521515,
"learning_rate": 1.4363474544389876e-06,
"loss": 0.6288,
"mean_token_accuracy": 0.8025391764757291,
"step": 854
},
{
"epoch": 2.535701072881983,
"grad_norm": 0.39754486083984375,
"learning_rate": 1.4185523646469822e-06,
"loss": 0.6461,
"mean_token_accuracy": 0.7974458592055889,
"step": 855
},
{
"epoch": 2.538660747317795,
"grad_norm": 0.429750919342041,
"learning_rate": 1.4008597767992872e-06,
"loss": 0.6118,
"mean_token_accuracy": 0.8093011527301119,
"step": 856
},
{
"epoch": 2.541620421753607,
"grad_norm": 0.38371750712394714,
"learning_rate": 1.3832699022267516e-06,
"loss": 0.6399,
"mean_token_accuracy": 0.7980772590627099,
"step": 857
},
{
"epoch": 2.544580096189419,
"grad_norm": 0.41115689277648926,
"learning_rate": 1.3657829510333653e-06,
"loss": 0.6633,
"mean_token_accuracy": 0.7933955020310409,
"step": 858
},
{
"epoch": 2.547539770625231,
"grad_norm": 0.406768262386322,
"learning_rate": 1.3483991320937307e-06,
"loss": 0.6368,
"mean_token_accuracy": 0.8023250526600325,
"step": 859
},
{
"epoch": 2.5504994450610434,
"grad_norm": 0.4091865122318268,
"learning_rate": 1.3311186530505838e-06,
"loss": 0.6189,
"mean_token_accuracy": 0.8061198976192254,
"step": 860
},
{
"epoch": 2.5534591194968552,
"grad_norm": 0.385766863822937,
"learning_rate": 1.313941720312303e-06,
"loss": 0.6262,
"mean_token_accuracy": 0.8026254886335932,
"step": 861
},
{
"epoch": 2.5564187939326675,
"grad_norm": 0.403012216091156,
"learning_rate": 1.2968685390504465e-06,
"loss": 0.622,
"mean_token_accuracy": 0.8041227440695632,
"step": 862
},
{
"epoch": 2.5593784683684797,
"grad_norm": 0.3971555829048157,
"learning_rate": 1.2798993131973093e-06,
"loss": 0.6745,
"mean_token_accuracy": 0.7896582637305288,
"step": 863
},
{
"epoch": 2.5623381428042915,
"grad_norm": 0.40315189957618713,
"learning_rate": 1.263034245443473e-06,
"loss": 0.6563,
"mean_token_accuracy": 0.7948344293273772,
"step": 864
},
{
"epoch": 2.5652978172401038,
"grad_norm": 0.40858373045921326,
"learning_rate": 1.2462735372353996e-06,
"loss": 0.6228,
"mean_token_accuracy": 0.8045441140339781,
"step": 865
},
{
"epoch": 2.5682574916759155,
"grad_norm": 0.3969631493091583,
"learning_rate": 1.2296173887730122e-06,
"loss": 0.6345,
"mean_token_accuracy": 0.8022942568625994,
"step": 866
},
{
"epoch": 2.571217166111728,
"grad_norm": 0.39615315198898315,
"learning_rate": 1.2130659990073146e-06,
"loss": 0.6356,
"mean_token_accuracy": 0.7998559942550404,
"step": 867
},
{
"epoch": 2.5741768405475396,
"grad_norm": 0.38922396302223206,
"learning_rate": 1.196619565638003e-06,
"loss": 0.6286,
"mean_token_accuracy": 0.8018824489890675,
"step": 868
},
{
"epoch": 2.577136514983352,
"grad_norm": 0.4000704288482666,
"learning_rate": 1.1802782851111206e-06,
"loss": 0.6418,
"mean_token_accuracy": 0.7989303050191064,
"step": 869
},
{
"epoch": 2.580096189419164,
"grad_norm": 0.39476409554481506,
"learning_rate": 1.1640423526166987e-06,
"loss": 0.6445,
"mean_token_accuracy": 0.797418578107648,
"step": 870
},
{
"epoch": 2.583055863854976,
"grad_norm": 0.39660996198654175,
"learning_rate": 1.1479119620864277e-06,
"loss": 0.6575,
"mean_token_accuracy": 0.795806747653712,
"step": 871
},
{
"epoch": 2.586015538290788,
"grad_norm": 0.41734716296195984,
"learning_rate": 1.1318873061913405e-06,
"loss": 0.5882,
"mean_token_accuracy": 0.8143113885996807,
"step": 872
},
{
"epoch": 2.5889752127266,
"grad_norm": 0.3729105293750763,
"learning_rate": 1.1159685763395113e-06,
"loss": 0.64,
"mean_token_accuracy": 0.7987188883545505,
"step": 873
},
{
"epoch": 2.591934887162412,
"grad_norm": 0.3946407437324524,
"learning_rate": 1.1001559626737757e-06,
"loss": 0.6418,
"mean_token_accuracy": 0.798503030470437,
"step": 874
},
{
"epoch": 2.594894561598224,
"grad_norm": 0.4132760763168335,
"learning_rate": 1.0844496540694515e-06,
"loss": 0.6267,
"mean_token_accuracy": 0.8039569693853369,
"step": 875
},
{
"epoch": 2.597854236034036,
"grad_norm": 0.40759339928627014,
"learning_rate": 1.0688498381320855e-06,
"loss": 0.6318,
"mean_token_accuracy": 0.8012822502344166,
"step": 876
},
{
"epoch": 2.6008139104698484,
"grad_norm": 0.41799381375312805,
"learning_rate": 1.0533567011952094e-06,
"loss": 0.6464,
"mean_token_accuracy": 0.7964816550323018,
"step": 877
},
{
"epoch": 2.6037735849056602,
"grad_norm": 0.4257717430591583,
"learning_rate": 1.037970428318118e-06,
"loss": 0.6841,
"mean_token_accuracy": 0.7837483957536826,
"step": 878
},
{
"epoch": 2.6067332593414725,
"grad_norm": 0.4087117314338684,
"learning_rate": 1.022691203283661e-06,
"loss": 0.6507,
"mean_token_accuracy": 0.7950712747355096,
"step": 879
},
{
"epoch": 2.6096929337772847,
"grad_norm": 0.41945111751556396,
"learning_rate": 1.0075192085960451e-06,
"loss": 0.6678,
"mean_token_accuracy": 0.7909589594797406,
"step": 880
},
{
"epoch": 2.6126526082130965,
"grad_norm": 0.398735374212265,
"learning_rate": 9.924546254786493e-07,
"loss": 0.6316,
"mean_token_accuracy": 0.8018926205701773,
"step": 881
},
{
"epoch": 2.6156122826489088,
"grad_norm": 0.406318724155426,
"learning_rate": 9.77497633871868e-07,
"loss": 0.6054,
"mean_token_accuracy": 0.8093279590843514,
"step": 882
},
{
"epoch": 2.6185719570847206,
"grad_norm": 0.3851606845855713,
"learning_rate": 9.62648412430951e-07,
"loss": 0.6791,
"mean_token_accuracy": 0.7881774140441217,
"step": 883
},
{
"epoch": 2.621531631520533,
"grad_norm": 0.4061947762966156,
"learning_rate": 9.479071385238892e-07,
"loss": 0.6212,
"mean_token_accuracy": 0.8042670614990748,
"step": 884
},
{
"epoch": 2.6244913059563446,
"grad_norm": 0.39614221453666687,
"learning_rate": 9.332739882292752e-07,
"loss": 0.6296,
"mean_token_accuracy": 0.8017565837535566,
"step": 885
},
{
"epoch": 2.627450980392157,
"grad_norm": 0.3858533501625061,
"learning_rate": 9.187491363342094e-07,
"loss": 0.5922,
"mean_token_accuracy": 0.8143832301495489,
"step": 886
},
{
"epoch": 2.630410654827969,
"grad_norm": 0.39614781737327576,
"learning_rate": 9.043327563322113e-07,
"loss": 0.6387,
"mean_token_accuracy": 0.799956339899957,
"step": 887
},
{
"epoch": 2.633370329263781,
"grad_norm": 0.38962864875793457,
"learning_rate": 8.900250204211513e-07,
"loss": 0.626,
"mean_token_accuracy": 0.8054223234361488,
"step": 888
},
{
"epoch": 2.636330003699593,
"grad_norm": 0.38743823766708374,
"learning_rate": 8.758260995011825e-07,
"loss": 0.6249,
"mean_token_accuracy": 0.8041963208824743,
"step": 889
},
{
"epoch": 2.6392896781354054,
"grad_norm": 0.38722845911979675,
"learning_rate": 8.617361631727139e-07,
"loss": 0.637,
"mean_token_accuracy": 0.7999073170969193,
"step": 890
},
{
"epoch": 2.642249352571217,
"grad_norm": 0.38422495126724243,
"learning_rate": 8.477553797343729e-07,
"loss": 0.5932,
"mean_token_accuracy": 0.8125740456037845,
"step": 891
},
{
"epoch": 2.645209027007029,
"grad_norm": 0.3883955180644989,
"learning_rate": 8.338839161809997e-07,
"loss": 0.6259,
"mean_token_accuracy": 0.8034302437405634,
"step": 892
},
{
"epoch": 2.648168701442841,
"grad_norm": 0.413769394159317,
"learning_rate": 8.201219382016556e-07,
"loss": 0.6425,
"mean_token_accuracy": 0.7988244713424745,
"step": 893
},
{
"epoch": 2.6511283758786535,
"grad_norm": 0.3942348062992096,
"learning_rate": 8.06469610177636e-07,
"loss": 0.6366,
"mean_token_accuracy": 0.800066869045331,
"step": 894
},
{
"epoch": 2.6540880503144653,
"grad_norm": 0.3790660500526428,
"learning_rate": 7.92927095180518e-07,
"loss": 0.6505,
"mean_token_accuracy": 0.795845314542134,
"step": 895
},
{
"epoch": 2.6570477247502775,
"grad_norm": 0.42260193824768066,
"learning_rate": 7.794945549701993e-07,
"loss": 0.6085,
"mean_token_accuracy": 0.8089679902729355,
"step": 896
},
{
"epoch": 2.6600073991860897,
"grad_norm": 0.37863457202911377,
"learning_rate": 7.661721499929753e-07,
"loss": 0.608,
"mean_token_accuracy": 0.8079819508856279,
"step": 897
},
{
"epoch": 2.6629670736219015,
"grad_norm": 0.4104274809360504,
"learning_rate": 7.529600393796232e-07,
"loss": 0.6343,
"mean_token_accuracy": 0.8013414635641989,
"step": 898
},
{
"epoch": 2.665926748057714,
"grad_norm": 0.4015280604362488,
"learning_rate": 7.398583809434944e-07,
"loss": 0.6194,
"mean_token_accuracy": 0.8067789013401996,
"step": 899
},
{
"epoch": 2.6688864224935256,
"grad_norm": 0.3843616247177124,
"learning_rate": 7.268673311786378e-07,
"loss": 0.655,
"mean_token_accuracy": 0.7944493186314524,
"step": 900
},
{
"epoch": 2.6688864224935256,
"eval_loss": 0.737091064453125,
"eval_mean_token_accuracy": 0.7722201670436681,
"eval_runtime": 24.4823,
"eval_samples_per_second": 5.269,
"eval_steps_per_second": 1.348,
"step": 900
},
{
"epoch": 2.671846096929338,
"grad_norm": 0.40167438983917236,
"learning_rate": 7.1398704525792e-07,
"loss": 0.6665,
"mean_token_accuracy": 0.7904682922183952,
"step": 901
},
{
"epoch": 2.6748057713651496,
"grad_norm": 0.4117159843444824,
"learning_rate": 7.012176770311863e-07,
"loss": 0.6622,
"mean_token_accuracy": 0.7920689961190451,
"step": 902
},
{
"epoch": 2.677765445800962,
"grad_norm": 0.39613744616508484,
"learning_rate": 6.885593790234057e-07,
"loss": 0.6376,
"mean_token_accuracy": 0.799410845334018,
"step": 903
},
{
"epoch": 2.680725120236774,
"grad_norm": 0.38793283700942993,
"learning_rate": 6.760123024328624e-07,
"loss": 0.6141,
"mean_token_accuracy": 0.8077387547151241,
"step": 904
},
{
"epoch": 2.683684794672586,
"grad_norm": 0.38844698667526245,
"learning_rate": 6.635765971293484e-07,
"loss": 0.6559,
"mean_token_accuracy": 0.794430430660069,
"step": 905
},
{
"epoch": 2.686644469108398,
"grad_norm": 0.3850746154785156,
"learning_rate": 6.512524116523633e-07,
"loss": 0.627,
"mean_token_accuracy": 0.8037230062591546,
"step": 906
},
{
"epoch": 2.6896041435442104,
"grad_norm": 0.3915550708770752,
"learning_rate": 6.390398932093555e-07,
"loss": 0.6077,
"mean_token_accuracy": 0.8080517778457975,
"step": 907
},
{
"epoch": 2.692563817980022,
"grad_norm": 0.37720099091529846,
"learning_rate": 6.269391876739494e-07,
"loss": 0.6301,
"mean_token_accuracy": 0.8039389719388176,
"step": 908
},
{
"epoch": 2.695523492415834,
"grad_norm": 0.3923218250274658,
"learning_rate": 6.149504395842087e-07,
"loss": 0.6148,
"mean_token_accuracy": 0.8082143968389491,
"step": 909
},
{
"epoch": 2.6984831668516462,
"grad_norm": 0.39484548568725586,
"learning_rate": 6.030737921409169e-07,
"loss": 0.6583,
"mean_token_accuracy": 0.7938795460478842,
"step": 910
},
{
"epoch": 2.7014428412874585,
"grad_norm": 0.40009021759033203,
"learning_rate": 5.913093872058528e-07,
"loss": 0.6608,
"mean_token_accuracy": 0.793614022788515,
"step": 911
},
{
"epoch": 2.7044025157232703,
"grad_norm": 0.40624064207077026,
"learning_rate": 5.796573653001091e-07,
"loss": 0.6335,
"mean_token_accuracy": 0.8018102965988579,
"step": 912
},
{
"epoch": 2.7073621901590825,
"grad_norm": 0.4008027911186218,
"learning_rate": 5.681178656024055e-07,
"loss": 0.6626,
"mean_token_accuracy": 0.7932069577957652,
"step": 913
},
{
"epoch": 2.7103218645948948,
"grad_norm": 0.40246814489364624,
"learning_rate": 5.56691025947429e-07,
"loss": 0.6378,
"mean_token_accuracy": 0.800000261371183,
"step": 914
},
{
"epoch": 2.7132815390307066,
"grad_norm": 0.37238821387290955,
"learning_rate": 5.453769828241872e-07,
"loss": 0.6268,
"mean_token_accuracy": 0.8024412909252127,
"step": 915
},
{
"epoch": 2.716241213466519,
"grad_norm": 0.39563846588134766,
"learning_rate": 5.341758713743828e-07,
"loss": 0.6596,
"mean_token_accuracy": 0.7931748591712275,
"step": 916
},
{
"epoch": 2.7192008879023306,
"grad_norm": 0.3933393061161041,
"learning_rate": 5.230878253907911e-07,
"loss": 0.6416,
"mean_token_accuracy": 0.7995262716287037,
"step": 917
},
{
"epoch": 2.722160562338143,
"grad_norm": 0.3950590193271637,
"learning_rate": 5.121129773156663e-07,
"loss": 0.6771,
"mean_token_accuracy": 0.7878128871617898,
"step": 918
},
{
"epoch": 2.7251202367739547,
"grad_norm": 0.41165900230407715,
"learning_rate": 5.012514582391592e-07,
"loss": 0.6194,
"mean_token_accuracy": 0.805260790188586,
"step": 919
},
{
"epoch": 2.728079911209767,
"grad_norm": 0.3828143775463104,
"learning_rate": 4.905033978977492e-07,
"loss": 0.6285,
"mean_token_accuracy": 0.8036274550004541,
"step": 920
},
{
"epoch": 2.731039585645579,
"grad_norm": 0.3781799077987671,
"learning_rate": 4.798689246727006e-07,
"loss": 0.6143,
"mean_token_accuracy": 0.8072609168484571,
"step": 921
},
{
"epoch": 2.733999260081391,
"grad_norm": 0.3903900682926178,
"learning_rate": 4.693481655885257e-07,
"loss": 0.6698,
"mean_token_accuracy": 0.7922049787058092,
"step": 922
},
{
"epoch": 2.736958934517203,
"grad_norm": 0.3956415355205536,
"learning_rate": 4.58941246311464e-07,
"loss": 0.6301,
"mean_token_accuracy": 0.8028085591716645,
"step": 923
},
{
"epoch": 2.7399186089530154,
"grad_norm": 0.3861734867095947,
"learning_rate": 4.4864829114798394e-07,
"loss": 0.6371,
"mean_token_accuracy": 0.8004312278302195,
"step": 924
},
{
"epoch": 2.742878283388827,
"grad_norm": 0.3868809640407562,
"learning_rate": 4.384694230432984e-07,
"loss": 0.5952,
"mean_token_accuracy": 0.8138450723816196,
"step": 925
},
{
"epoch": 2.745837957824639,
"grad_norm": 0.3856772780418396,
"learning_rate": 4.2840476357989825e-07,
"loss": 0.611,
"mean_token_accuracy": 0.80796215409744,
"step": 926
},
{
"epoch": 2.7487976322604513,
"grad_norm": 0.404486745595932,
"learning_rate": 4.184544329761009e-07,
"loss": 0.6209,
"mean_token_accuracy": 0.8057150436844314,
"step": 927
},
{
"epoch": 2.7517573066962635,
"grad_norm": 0.3897272050380707,
"learning_rate": 4.0861855008460403e-07,
"loss": 0.6327,
"mean_token_accuracy": 0.8016584740172387,
"step": 928
},
{
"epoch": 2.7547169811320753,
"grad_norm": 0.3906909227371216,
"learning_rate": 3.988972323910778e-07,
"loss": 0.6181,
"mean_token_accuracy": 0.805539043349179,
"step": 929
},
{
"epoch": 2.7576766555678875,
"grad_norm": 0.38629284501075745,
"learning_rate": 3.8929059601275463e-07,
"loss": 0.6256,
"mean_token_accuracy": 0.8029286474181538,
"step": 930
},
{
"epoch": 2.7606363300037,
"grad_norm": 0.4061240255832672,
"learning_rate": 3.797987556970495e-07,
"loss": 0.6719,
"mean_token_accuracy": 0.7906059984731508,
"step": 931
},
{
"epoch": 2.7635960044395116,
"grad_norm": 0.40067771077156067,
"learning_rate": 3.7042182482018074e-07,
"loss": 0.6271,
"mean_token_accuracy": 0.8041936678142166,
"step": 932
},
{
"epoch": 2.766555678875324,
"grad_norm": 0.3809727132320404,
"learning_rate": 3.611599153858214e-07,
"loss": 0.6769,
"mean_token_accuracy": 0.7875104091416671,
"step": 933
},
{
"epoch": 2.7695153533111356,
"grad_norm": 0.40350061655044556,
"learning_rate": 3.520131380237546e-07,
"loss": 0.6647,
"mean_token_accuracy": 0.7917032324367623,
"step": 934
},
{
"epoch": 2.772475027746948,
"grad_norm": 0.4117463529109955,
"learning_rate": 3.429816019885657e-07,
"loss": 0.6811,
"mean_token_accuracy": 0.787343757534307,
"step": 935
},
{
"epoch": 2.7754347021827597,
"grad_norm": 0.3994939923286438,
"learning_rate": 3.3406541515832e-07,
"loss": 0.6786,
"mean_token_accuracy": 0.7861266133562229,
"step": 936
},
{
"epoch": 2.778394376618572,
"grad_norm": 0.39691928029060364,
"learning_rate": 3.252646840332918e-07,
"loss": 0.6468,
"mean_token_accuracy": 0.7971869583236945,
"step": 937
},
{
"epoch": 2.781354051054384,
"grad_norm": 0.37808868288993835,
"learning_rate": 3.16579513734675e-07,
"loss": 0.6259,
"mean_token_accuracy": 0.8036837288252531,
"step": 938
},
{
"epoch": 2.784313725490196,
"grad_norm": 0.38705241680145264,
"learning_rate": 3.080100080033388e-07,
"loss": 0.622,
"mean_token_accuracy": 0.8054349345477914,
"step": 939
},
{
"epoch": 2.787273399926008,
"grad_norm": 0.37049245834350586,
"learning_rate": 2.995562691985898e-07,
"loss": 0.6281,
"mean_token_accuracy": 0.802922693455199,
"step": 940
},
{
"epoch": 2.7902330743618204,
"grad_norm": 0.4022907316684723,
"learning_rate": 2.9121839829693857e-07,
"loss": 0.6193,
"mean_token_accuracy": 0.8052185953687516,
"step": 941
},
{
"epoch": 2.7931927487976322,
"grad_norm": 0.40110448002815247,
"learning_rate": 2.829964948909048e-07,
"loss": 0.6233,
"mean_token_accuracy": 0.8038183781558145,
"step": 942
},
{
"epoch": 2.7961524232334445,
"grad_norm": 0.3919583857059479,
"learning_rate": 2.748906571878207e-07,
"loss": 0.6603,
"mean_token_accuracy": 0.7946063609111435,
"step": 943
},
{
"epoch": 2.7991120976692563,
"grad_norm": 0.39748555421829224,
"learning_rate": 2.6690098200866097e-07,
"loss": 0.6416,
"mean_token_accuracy": 0.7996132256535484,
"step": 944
},
{
"epoch": 2.8020717721050685,
"grad_norm": 0.40067169070243835,
"learning_rate": 2.5902756478688674e-07,
"loss": 0.6431,
"mean_token_accuracy": 0.7986862031085916,
"step": 945
},
{
"epoch": 2.8050314465408803,
"grad_norm": 0.3947811722755432,
"learning_rate": 2.5127049956730207e-07,
"loss": 0.6424,
"mean_token_accuracy": 0.797873089467536,
"step": 946
},
{
"epoch": 2.8079911209766926,
"grad_norm": 0.38122984766960144,
"learning_rate": 2.436298790049363e-07,
"loss": 0.6656,
"mean_token_accuracy": 0.7921808444907809,
"step": 947
},
{
"epoch": 2.810950795412505,
"grad_norm": 0.3970412611961365,
"learning_rate": 2.3610579436392999e-07,
"loss": 0.6454,
"mean_token_accuracy": 0.798217491279841,
"step": 948
},
{
"epoch": 2.8139104698483166,
"grad_norm": 0.39274781942367554,
"learning_rate": 2.2869833551645293e-07,
"loss": 0.6462,
"mean_token_accuracy": 0.7971693406963306,
"step": 949
},
{
"epoch": 2.816870144284129,
"grad_norm": 0.38875052332878113,
"learning_rate": 2.2140759094162468e-07,
"loss": 0.6447,
"mean_token_accuracy": 0.7977855648395308,
"step": 950
},
{
"epoch": 2.8198298187199407,
"grad_norm": 0.39083102345466614,
"learning_rate": 2.1423364772445886e-07,
"loss": 0.6233,
"mean_token_accuracy": 0.8038262482281366,
"step": 951
},
{
"epoch": 2.822789493155753,
"grad_norm": 0.3867531716823578,
"learning_rate": 2.071765915548274e-07,
"loss": 0.6872,
"mean_token_accuracy": 0.7856023656322098,
"step": 952
},
{
"epoch": 2.8257491675915647,
"grad_norm": 0.39589664340019226,
"learning_rate": 2.002365067264289e-07,
"loss": 0.6737,
"mean_token_accuracy": 0.7887226540574725,
"step": 953
},
{
"epoch": 2.828708842027377,
"grad_norm": 0.41389018297195435,
"learning_rate": 1.9341347613579086e-07,
"loss": 0.6184,
"mean_token_accuracy": 0.8065408919751612,
"step": 954
},
{
"epoch": 2.831668516463189,
"grad_norm": 0.4138829708099365,
"learning_rate": 1.867075812812691e-07,
"loss": 0.6391,
"mean_token_accuracy": 0.8007256177269018,
"step": 955
},
{
"epoch": 2.834628190899001,
"grad_norm": 0.384776771068573,
"learning_rate": 1.8011890226208527e-07,
"loss": 0.613,
"mean_token_accuracy": 0.8072664965020259,
"step": 956
},
{
"epoch": 2.837587865334813,
"grad_norm": 0.37912535667419434,
"learning_rate": 1.7364751777736334e-07,
"loss": 0.6509,
"mean_token_accuracy": 0.7937373916975208,
"step": 957
},
{
"epoch": 2.8405475397706255,
"grad_norm": 0.39359596371650696,
"learning_rate": 1.6729350512519006e-07,
"loss": 0.6386,
"mean_token_accuracy": 0.8000337051550754,
"step": 958
},
{
"epoch": 2.8435072142064373,
"grad_norm": 0.3822968602180481,
"learning_rate": 1.6105694020169594e-07,
"loss": 0.6322,
"mean_token_accuracy": 0.8005505544311058,
"step": 959
},
{
"epoch": 2.8464668886422495,
"grad_norm": 0.376174658536911,
"learning_rate": 1.5493789750014032e-07,
"loss": 0.6178,
"mean_token_accuracy": 0.8039858290743149,
"step": 960
},
{
"epoch": 2.8494265630780613,
"grad_norm": 0.388172447681427,
"learning_rate": 1.489364501100332e-07,
"loss": 0.6551,
"mean_token_accuracy": 0.7960565577797374,
"step": 961
},
{
"epoch": 2.8523862375138735,
"grad_norm": 0.3901033103466034,
"learning_rate": 1.430526697162482e-07,
"loss": 0.645,
"mean_token_accuracy": 0.7975772604806072,
"step": 962
},
{
"epoch": 2.8553459119496853,
"grad_norm": 0.3848772943019867,
"learning_rate": 1.3728662659818205e-07,
"loss": 0.6037,
"mean_token_accuracy": 0.8107080863727026,
"step": 963
},
{
"epoch": 2.8583055863854976,
"grad_norm": 0.38093602657318115,
"learning_rate": 1.3163838962890196e-07,
"loss": 0.6602,
"mean_token_accuracy": 0.7919608208568516,
"step": 964
},
{
"epoch": 2.86126526082131,
"grad_norm": 0.3964565396308899,
"learning_rate": 1.2610802627432972e-07,
"loss": 0.6427,
"mean_token_accuracy": 0.798779575468243,
"step": 965
},
{
"epoch": 2.8642249352571216,
"grad_norm": 0.41397061944007874,
"learning_rate": 1.206956025924333e-07,
"loss": 0.6266,
"mean_token_accuracy": 0.8023839610262327,
"step": 966
},
{
"epoch": 2.867184609692934,
"grad_norm": 0.3790512681007385,
"learning_rate": 1.1540118323243866e-07,
"loss": 0.5703,
"mean_token_accuracy": 0.8205281597109083,
"step": 967
},
{
"epoch": 2.870144284128746,
"grad_norm": 0.3819893002510071,
"learning_rate": 1.1022483143405705e-07,
"loss": 0.6072,
"mean_token_accuracy": 0.8105382446606855,
"step": 968
},
{
"epoch": 2.873103958564558,
"grad_norm": 0.38210329413414,
"learning_rate": 1.0516660902673448e-07,
"loss": 0.6473,
"mean_token_accuracy": 0.7979098356765058,
"step": 969
},
{
"epoch": 2.8760636330003697,
"grad_norm": 0.3830581307411194,
"learning_rate": 1.0022657642890232e-07,
"loss": 0.6233,
"mean_token_accuracy": 0.8037948333368617,
"step": 970
},
{
"epoch": 2.879023307436182,
"grad_norm": 0.39410918951034546,
"learning_rate": 9.540479264726676e-08,
"loss": 0.6517,
"mean_token_accuracy": 0.7945131404435056,
"step": 971
},
{
"epoch": 2.881982981871994,
"grad_norm": 0.38177594542503357,
"learning_rate": 9.070131527609604e-08,
"loss": 0.6083,
"mean_token_accuracy": 0.8094474921741853,
"step": 972
},
{
"epoch": 2.884942656307806,
"grad_norm": 0.3808548152446747,
"learning_rate": 8.61162004965388e-08,
"loss": 0.6347,
"mean_token_accuracy": 0.8012875105708535,
"step": 973
},
{
"epoch": 2.8879023307436182,
"grad_norm": 0.4010704755783081,
"learning_rate": 8.16495030759501e-08,
"loss": 0.67,
"mean_token_accuracy": 0.7904797064838223,
"step": 974
},
{
"epoch": 2.8908620051794305,
"grad_norm": 0.3918650448322296,
"learning_rate": 7.730127636723539e-08,
"loss": 0.6005,
"mean_token_accuracy": 0.8118496389421752,
"step": 975
},
{
"epoch": 2.8938216796152423,
"grad_norm": 0.3898662030696869,
"learning_rate": 7.307157230821426e-08,
"loss": 0.6453,
"mean_token_accuracy": 0.7980052666038159,
"step": 976
},
{
"epoch": 2.8967813540510545,
"grad_norm": 0.39199164509773254,
"learning_rate": 6.896044142100433e-08,
"loss": 0.6576,
"mean_token_accuracy": 0.7941206706838407,
"step": 977
},
{
"epoch": 2.8997410284868663,
"grad_norm": 0.40657898783683777,
"learning_rate": 6.496793281141056e-08,
"loss": 0.6771,
"mean_token_accuracy": 0.7881219963995537,
"step": 978
},
{
"epoch": 2.9027007029226786,
"grad_norm": 0.3844878673553467,
"learning_rate": 6.109409416834689e-08,
"loss": 0.6412,
"mean_token_accuracy": 0.7994358237487954,
"step": 979
},
{
"epoch": 2.9056603773584904,
"grad_norm": 0.396533340215683,
"learning_rate": 5.7338971763256646e-08,
"loss": 0.6225,
"mean_token_accuracy": 0.8051790813619156,
"step": 980
},
{
"epoch": 2.9086200517943026,
"grad_norm": 0.3754301369190216,
"learning_rate": 5.37026104495697e-08,
"loss": 0.6305,
"mean_token_accuracy": 0.8030408479316886,
"step": 981
},
{
"epoch": 2.911579726230115,
"grad_norm": 0.40677276253700256,
"learning_rate": 5.0185053662161756e-08,
"loss": 0.6322,
"mean_token_accuracy": 0.8007158859109983,
"step": 982
},
{
"epoch": 2.9145394006659266,
"grad_norm": 0.3934902250766754,
"learning_rate": 4.678634341683252e-08,
"loss": 0.6222,
"mean_token_accuracy": 0.804753444318889,
"step": 983
},
{
"epoch": 2.917499075101739,
"grad_norm": 0.3706609904766083,
"learning_rate": 4.350652030981395e-08,
"loss": 0.6447,
"mean_token_accuracy": 0.7980685126294768,
"step": 984
},
{
"epoch": 2.920458749537551,
"grad_norm": 0.39183953404426575,
"learning_rate": 4.0345623517273894e-08,
"loss": 0.6267,
"mean_token_accuracy": 0.8036114839333938,
"step": 985
},
{
"epoch": 2.923418423973363,
"grad_norm": 0.3982419967651367,
"learning_rate": 3.7303690794854296e-08,
"loss": 0.7065,
"mean_token_accuracy": 0.7786746050545399,
"step": 986
},
{
"epoch": 2.9263780984091747,
"grad_norm": 0.4075382649898529,
"learning_rate": 3.438075847721933e-08,
"loss": 0.585,
"mean_token_accuracy": 0.8150346535603673,
"step": 987
},
{
"epoch": 2.929337772844987,
"grad_norm": 0.3877173662185669,
"learning_rate": 3.157686147762129e-08,
"loss": 0.6477,
"mean_token_accuracy": 0.7976473361920864,
"step": 988
},
{
"epoch": 2.932297447280799,
"grad_norm": 0.38589945435523987,
"learning_rate": 2.8892033287484245e-08,
"loss": 0.664,
"mean_token_accuracy": 0.79294201748894,
"step": 989
},
{
"epoch": 2.935257121716611,
"grad_norm": 0.38838937878608704,
"learning_rate": 2.6326305976001054e-08,
"loss": 0.6019,
"mean_token_accuracy": 0.8104530195130828,
"step": 990
},
{
"epoch": 2.9382167961524233,
"grad_norm": 0.3857711851596832,
"learning_rate": 2.3879710189753657e-08,
"loss": 0.6397,
"mean_token_accuracy": 0.798951730040894,
"step": 991
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.38144826889038086,
"learning_rate": 2.1552275152346702e-08,
"loss": 0.637,
"mean_token_accuracy": 0.8008487203446606,
"step": 992
},
{
"epoch": 2.9441361450240473,
"grad_norm": 0.39223143458366394,
"learning_rate": 1.9344028664056715e-08,
"loss": 0.6031,
"mean_token_accuracy": 0.8099199822731445,
"step": 993
},
{
"epoch": 2.9470958194598595,
"grad_norm": 0.40251073241233826,
"learning_rate": 1.7254997101500137e-08,
"loss": 0.62,
"mean_token_accuracy": 0.8063151660778272,
"step": 994
},
{
"epoch": 2.9500554938956713,
"grad_norm": 0.39291098713874817,
"learning_rate": 1.528520541731915e-08,
"loss": 0.6079,
"mean_token_accuracy": 0.8091930978495294,
"step": 995
},
{
"epoch": 2.9530151683314836,
"grad_norm": 0.3827592432498932,
"learning_rate": 1.3434677139885222e-08,
"loss": 0.6533,
"mean_token_accuracy": 0.7946187338585824,
"step": 996
},
{
"epoch": 2.9559748427672954,
"grad_norm": 0.37337788939476013,
"learning_rate": 1.170343437301491e-08,
"loss": 0.646,
"mean_token_accuracy": 0.7960541011916046,
"step": 997
},
{
"epoch": 2.9589345172031076,
"grad_norm": 0.39986652135849,
"learning_rate": 1.0091497795706728e-08,
"loss": 0.6393,
"mean_token_accuracy": 0.7991646202099173,
"step": 998
},
{
"epoch": 2.96189419163892,
"grad_norm": 0.3993469774723053,
"learning_rate": 8.59888666189579e-09,
"loss": 0.6753,
"mean_token_accuracy": 0.7898335911094181,
"step": 999
},
{
"epoch": 2.9648538660747317,
"grad_norm": 0.40232738852500916,
"learning_rate": 7.225618800222878e-09,
"loss": 0.6607,
"mean_token_accuracy": 0.7922276751171351,
"step": 1000
},
{
"epoch": 2.9648538660747317,
"eval_loss": 0.7369399070739746,
"eval_mean_token_accuracy": 0.7721513551540902,
"eval_runtime": 24.4981,
"eval_samples_per_second": 5.266,
"eval_steps_per_second": 1.347,
"step": 1000
},
{
"epoch": 2.967813540510544,
"grad_norm": 0.3906041085720062,
"learning_rate": 5.971710613821291e-09,
"loss": 0.6733,
"mean_token_accuracy": 0.7893573225919971,
"step": 1001
},
{
"epoch": 2.970773214946356,
"grad_norm": 0.39397749304771423,
"learning_rate": 4.837177080119215e-09,
"loss": 0.6217,
"mean_token_accuracy": 0.8034013413614648,
"step": 1002
},
{
"epoch": 2.973732889382168,
"grad_norm": 0.3905346691608429,
"learning_rate": 3.8220317506654226e-09,
"loss": 0.6531,
"mean_token_accuracy": 0.795008107180572,
"step": 1003
},
{
"epoch": 2.9766925638179798,
"grad_norm": 0.3973424732685089,
"learning_rate": 2.9262867509605164e-09,
"loss": 0.6395,
"mean_token_accuracy": 0.7991790842606037,
"step": 1004
},
{
"epoch": 2.979652238253792,
"grad_norm": 0.3992668092250824,
"learning_rate": 2.149952780321485e-09,
"loss": 0.6643,
"mean_token_accuracy": 0.7913003021486229,
"step": 1005
},
{
"epoch": 2.9826119126896042,
"grad_norm": 0.4035053253173828,
"learning_rate": 1.4930391117451427e-09,
"loss": 0.6354,
"mean_token_accuracy": 0.7998262221333795,
"step": 1006
},
{
"epoch": 2.985571587125416,
"grad_norm": 0.4096769690513611,
"learning_rate": 9.555535917993297e-10,
"loss": 0.6961,
"mean_token_accuracy": 0.7825063025724978,
"step": 1007
},
{
"epoch": 2.9885312615612283,
"grad_norm": 0.41353654861450195,
"learning_rate": 5.375026405352035e-10,
"loss": 0.6249,
"mean_token_accuracy": 0.8038925258126584,
"step": 1008
},
{
"epoch": 2.9914909359970405,
"grad_norm": 0.3798801898956299,
"learning_rate": 2.388912514017516e-10,
"loss": 0.6626,
"mean_token_accuracy": 0.793458732875717,
"step": 1009
},
{
"epoch": 2.9944506104328523,
"grad_norm": 0.39510512351989746,
"learning_rate": 5.972299119250124e-11,
"loss": 0.5891,
"mean_token_accuracy": 0.8136845518054924,
"step": 1010
},
{
"epoch": 2.9974102848686646,
"grad_norm": 0.40476804971694946,
"learning_rate": 0.0,
"loss": 0.6628,
"mean_token_accuracy": 0.7925300203396853,
"step": 1011
},
{
"epoch": 2.9974102848686646,
"step": 1011,
"total_flos": 230593791000576.0,
"train_loss": 0.7078155129410982,
"train_runtime": 41256.6124,
"train_samples_per_second": 1.572,
"train_steps_per_second": 0.025
}
],
"logging_steps": 1,
"max_steps": 1011,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 230593791000576.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}