|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9974102848686646, |
|
"eval_steps": 100, |
|
"global_step": 1011, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0029596744358120607, |
|
"grad_norm": 2.681946039199829, |
|
"learning_rate": 1.9607843137254904e-07, |
|
"loss": 1.0892, |
|
"mean_token_accuracy": 0.7134666713588034, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0059193488716241215, |
|
"grad_norm": 2.483736515045166, |
|
"learning_rate": 3.921568627450981e-07, |
|
"loss": 1.0859, |
|
"mean_token_accuracy": 0.7130540900903558, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008879023307436182, |
|
"grad_norm": 2.5574872493743896, |
|
"learning_rate": 5.882352941176471e-07, |
|
"loss": 1.1083, |
|
"mean_token_accuracy": 0.7057264272951731, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.011838697743248243, |
|
"grad_norm": 2.592397689819336, |
|
"learning_rate": 7.843137254901962e-07, |
|
"loss": 1.1251, |
|
"mean_token_accuracy": 0.70204062618997, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.014798372179060304, |
|
"grad_norm": 2.5958452224731445, |
|
"learning_rate": 9.80392156862745e-07, |
|
"loss": 1.0616, |
|
"mean_token_accuracy": 0.7201201840956424, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.017758046614872364, |
|
"grad_norm": 2.527214765548706, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 1.1498, |
|
"mean_token_accuracy": 0.6991107921462223, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.020717721050684423, |
|
"grad_norm": 2.453611135482788, |
|
"learning_rate": 1.3725490196078434e-06, |
|
"loss": 1.0692, |
|
"mean_token_accuracy": 0.7185075890374791, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.023677395486496486, |
|
"grad_norm": 2.2676663398742676, |
|
"learning_rate": 1.5686274509803923e-06, |
|
"loss": 1.1027, |
|
"mean_token_accuracy": 0.7096105664418749, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.026637069922308545, |
|
"grad_norm": 2.4191880226135254, |
|
"learning_rate": 1.7647058823529414e-06, |
|
"loss": 1.1374, |
|
"mean_token_accuracy": 0.7004450719322626, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.029596744358120607, |
|
"grad_norm": 2.2810451984405518, |
|
"learning_rate": 1.96078431372549e-06, |
|
"loss": 1.0701, |
|
"mean_token_accuracy": 0.7192182703502579, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.032556418793932666, |
|
"grad_norm": 2.047187566757202, |
|
"learning_rate": 2.1568627450980393e-06, |
|
"loss": 1.0692, |
|
"mean_token_accuracy": 0.7168684606703121, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03551609322974473, |
|
"grad_norm": 1.9986836910247803, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 1.0591, |
|
"mean_token_accuracy": 0.7179799919846566, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03847576766555679, |
|
"grad_norm": 1.9848605394363403, |
|
"learning_rate": 2.549019607843137e-06, |
|
"loss": 1.0592, |
|
"mean_token_accuracy": 0.7186164399688223, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04143544210136885, |
|
"grad_norm": 1.7683581113815308, |
|
"learning_rate": 2.7450980392156867e-06, |
|
"loss": 1.0286, |
|
"mean_token_accuracy": 0.7263637707391479, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04439511653718091, |
|
"grad_norm": 1.4327510595321655, |
|
"learning_rate": 2.9411764705882355e-06, |
|
"loss": 1.0502, |
|
"mean_token_accuracy": 0.718260961897349, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04735479097299297, |
|
"grad_norm": 1.4091436862945557, |
|
"learning_rate": 3.1372549019607846e-06, |
|
"loss": 1.0816, |
|
"mean_token_accuracy": 0.7076378775080614, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.050314465408805034, |
|
"grad_norm": 1.3194211721420288, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.0302, |
|
"mean_token_accuracy": 0.7220054724166985, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05327413984461709, |
|
"grad_norm": 1.2913936376571655, |
|
"learning_rate": 3.529411764705883e-06, |
|
"loss": 1.0676, |
|
"mean_token_accuracy": 0.7113759820986945, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05623381428042915, |
|
"grad_norm": 1.236266016960144, |
|
"learning_rate": 3.7254901960784316e-06, |
|
"loss": 1.0571, |
|
"mean_token_accuracy": 0.7136546795764988, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.059193488716241215, |
|
"grad_norm": 1.1931370496749878, |
|
"learning_rate": 3.92156862745098e-06, |
|
"loss": 0.9917, |
|
"mean_token_accuracy": 0.7271706923102303, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06215316315205328, |
|
"grad_norm": 1.3087286949157715, |
|
"learning_rate": 4.11764705882353e-06, |
|
"loss": 1.0021, |
|
"mean_token_accuracy": 0.7239365954438801, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06511283758786533, |
|
"grad_norm": 1.2562185525894165, |
|
"learning_rate": 4.313725490196079e-06, |
|
"loss": 0.983, |
|
"mean_token_accuracy": 0.7273888578305255, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0680725120236774, |
|
"grad_norm": 1.1378827095031738, |
|
"learning_rate": 4.509803921568628e-06, |
|
"loss": 0.9578, |
|
"mean_token_accuracy": 0.7362632636857523, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.07103218645948946, |
|
"grad_norm": 1.0568324327468872, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 0.9564, |
|
"mean_token_accuracy": 0.7346627849009933, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07399186089530152, |
|
"grad_norm": 0.9209612011909485, |
|
"learning_rate": 4.901960784313726e-06, |
|
"loss": 0.9808, |
|
"mean_token_accuracy": 0.7272476674555969, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07695153533111358, |
|
"grad_norm": 0.8665790557861328, |
|
"learning_rate": 5.098039215686274e-06, |
|
"loss": 1.0003, |
|
"mean_token_accuracy": 0.7212588502719087, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07991120976692564, |
|
"grad_norm": 0.8994502425193787, |
|
"learning_rate": 5.294117647058824e-06, |
|
"loss": 0.9476, |
|
"mean_token_accuracy": 0.7335574894521832, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0828708842027377, |
|
"grad_norm": 1.0448633432388306, |
|
"learning_rate": 5.4901960784313735e-06, |
|
"loss": 0.9464, |
|
"mean_token_accuracy": 0.7334208114703, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08583055863854976, |
|
"grad_norm": 0.9871032238006592, |
|
"learning_rate": 5.686274509803922e-06, |
|
"loss": 0.9505, |
|
"mean_token_accuracy": 0.732524444705358, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08879023307436182, |
|
"grad_norm": 0.9244782328605652, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.9369, |
|
"mean_token_accuracy": 0.7377869549204231, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09174990751017388, |
|
"grad_norm": 0.8495871424674988, |
|
"learning_rate": 6.07843137254902e-06, |
|
"loss": 0.9632, |
|
"mean_token_accuracy": 0.7259763433334542, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.09470958194598594, |
|
"grad_norm": 0.730097770690918, |
|
"learning_rate": 6.274509803921569e-06, |
|
"loss": 0.8828, |
|
"mean_token_accuracy": 0.7483362451357691, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.097669256381798, |
|
"grad_norm": 0.7470875382423401, |
|
"learning_rate": 6.470588235294119e-06, |
|
"loss": 0.9185, |
|
"mean_token_accuracy": 0.7392471457849514, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.10062893081761007, |
|
"grad_norm": 0.730536162853241, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.8883, |
|
"mean_token_accuracy": 0.7495266641186222, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.10358860525342212, |
|
"grad_norm": 0.6699507832527161, |
|
"learning_rate": 6.862745098039216e-06, |
|
"loss": 0.857, |
|
"mean_token_accuracy": 0.7532634065825189, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.10654827968923418, |
|
"grad_norm": 0.6172248721122742, |
|
"learning_rate": 7.058823529411766e-06, |
|
"loss": 0.8762, |
|
"mean_token_accuracy": 0.749338820444233, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10950795412504624, |
|
"grad_norm": 0.6268398761749268, |
|
"learning_rate": 7.2549019607843145e-06, |
|
"loss": 0.8679, |
|
"mean_token_accuracy": 0.7519748968716043, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1124676285608583, |
|
"grad_norm": 0.610349178314209, |
|
"learning_rate": 7.450980392156863e-06, |
|
"loss": 0.8855, |
|
"mean_token_accuracy": 0.7472919453079274, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.11542730299667037, |
|
"grad_norm": 0.604537308216095, |
|
"learning_rate": 7.647058823529411e-06, |
|
"loss": 0.8499, |
|
"mean_token_accuracy": 0.7552782022394232, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.11838697743248243, |
|
"grad_norm": 0.609111487865448, |
|
"learning_rate": 7.84313725490196e-06, |
|
"loss": 0.8822, |
|
"mean_token_accuracy": 0.746562312628656, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12134665186829449, |
|
"grad_norm": 0.5899158716201782, |
|
"learning_rate": 8.03921568627451e-06, |
|
"loss": 0.8811, |
|
"mean_token_accuracy": 0.7473791695126712, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.12430632630410655, |
|
"grad_norm": 0.6210097670555115, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 0.8833, |
|
"mean_token_accuracy": 0.7444836846534346, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.12726600073991862, |
|
"grad_norm": 0.600689709186554, |
|
"learning_rate": 8.43137254901961e-06, |
|
"loss": 0.8318, |
|
"mean_token_accuracy": 0.7609372507118015, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.13022567517573067, |
|
"grad_norm": 0.5491411685943604, |
|
"learning_rate": 8.627450980392157e-06, |
|
"loss": 0.8631, |
|
"mean_token_accuracy": 0.750162132080428, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.13318534961154274, |
|
"grad_norm": 0.5706349611282349, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 0.8782, |
|
"mean_token_accuracy": 0.7451601161887986, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1361450240473548, |
|
"grad_norm": 0.5555650591850281, |
|
"learning_rate": 9.019607843137256e-06, |
|
"loss": 0.823, |
|
"mean_token_accuracy": 0.7618301473100519, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.13910469848316684, |
|
"grad_norm": 0.5772121548652649, |
|
"learning_rate": 9.215686274509804e-06, |
|
"loss": 0.828, |
|
"mean_token_accuracy": 0.7588256411868824, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.14206437291897892, |
|
"grad_norm": 0.611781895160675, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 0.8425, |
|
"mean_token_accuracy": 0.7546703623296309, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.14502404735479096, |
|
"grad_norm": 0.5700849294662476, |
|
"learning_rate": 9.607843137254903e-06, |
|
"loss": 0.8695, |
|
"mean_token_accuracy": 0.7466177841712535, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.14798372179060304, |
|
"grad_norm": 0.5548747777938843, |
|
"learning_rate": 9.803921568627451e-06, |
|
"loss": 0.8548, |
|
"mean_token_accuracy": 0.7508958491076401, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1509433962264151, |
|
"grad_norm": 0.5233455300331116, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8778, |
|
"mean_token_accuracy": 0.7444452874755125, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.15390307066222716, |
|
"grad_norm": 0.567051112651825, |
|
"learning_rate": 1.0196078431372549e-05, |
|
"loss": 0.8213, |
|
"mean_token_accuracy": 0.7609767092967284, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1568627450980392, |
|
"grad_norm": 0.5394188165664673, |
|
"learning_rate": 1.03921568627451e-05, |
|
"loss": 0.8661, |
|
"mean_token_accuracy": 0.7484568076496121, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1598224195338513, |
|
"grad_norm": 0.5241853594779968, |
|
"learning_rate": 1.0588235294117648e-05, |
|
"loss": 0.8621, |
|
"mean_token_accuracy": 0.7480956260768654, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.16278209396966334, |
|
"grad_norm": 0.48302915692329407, |
|
"learning_rate": 1.0784313725490196e-05, |
|
"loss": 0.8101, |
|
"mean_token_accuracy": 0.7638810794013436, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1657417684054754, |
|
"grad_norm": 0.5048951506614685, |
|
"learning_rate": 1.0980392156862747e-05, |
|
"loss": 0.8164, |
|
"mean_token_accuracy": 0.7611000331453143, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.16870144284128746, |
|
"grad_norm": 0.5220761299133301, |
|
"learning_rate": 1.1176470588235295e-05, |
|
"loss": 0.8382, |
|
"mean_token_accuracy": 0.7542881093651161, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1716611172770995, |
|
"grad_norm": 0.5163182020187378, |
|
"learning_rate": 1.1372549019607844e-05, |
|
"loss": 0.845, |
|
"mean_token_accuracy": 0.7544678776426703, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1746207917129116, |
|
"grad_norm": 0.5414546132087708, |
|
"learning_rate": 1.1568627450980394e-05, |
|
"loss": 0.8115, |
|
"mean_token_accuracy": 0.763602548207208, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.17758046614872364, |
|
"grad_norm": 0.49731120467185974, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.8498, |
|
"mean_token_accuracy": 0.7513782211298353, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1805401405845357, |
|
"grad_norm": 0.48450183868408203, |
|
"learning_rate": 1.1960784313725491e-05, |
|
"loss": 0.8112, |
|
"mean_token_accuracy": 0.760378165872515, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.18349981502034776, |
|
"grad_norm": 0.5090157985687256, |
|
"learning_rate": 1.215686274509804e-05, |
|
"loss": 0.8352, |
|
"mean_token_accuracy": 0.7544511398898393, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1864594894561598, |
|
"grad_norm": 0.5094890594482422, |
|
"learning_rate": 1.235294117647059e-05, |
|
"loss": 0.8169, |
|
"mean_token_accuracy": 0.7596972963469578, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1894191638919719, |
|
"grad_norm": 0.5052422881126404, |
|
"learning_rate": 1.2549019607843138e-05, |
|
"loss": 0.8397, |
|
"mean_token_accuracy": 0.7528146247402845, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.19237883832778394, |
|
"grad_norm": 0.48801887035369873, |
|
"learning_rate": 1.2745098039215686e-05, |
|
"loss": 0.7911, |
|
"mean_token_accuracy": 0.7666436131483815, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.195338512763596, |
|
"grad_norm": 0.49707359075546265, |
|
"learning_rate": 1.2941176470588238e-05, |
|
"loss": 0.8311, |
|
"mean_token_accuracy": 0.7534919777308312, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.19829818719940806, |
|
"grad_norm": 0.47678443789482117, |
|
"learning_rate": 1.3137254901960785e-05, |
|
"loss": 0.7908, |
|
"mean_token_accuracy": 0.7675227128959651, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.20125786163522014, |
|
"grad_norm": 0.5108245611190796, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.8136, |
|
"mean_token_accuracy": 0.7605165307209668, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.20421753607103219, |
|
"grad_norm": 0.5529371500015259, |
|
"learning_rate": 1.3529411764705885e-05, |
|
"loss": 0.8289, |
|
"mean_token_accuracy": 0.7556330264225892, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.20717721050684423, |
|
"grad_norm": 0.48820486664772034, |
|
"learning_rate": 1.3725490196078432e-05, |
|
"loss": 0.8322, |
|
"mean_token_accuracy": 0.7555734646050257, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2101368849426563, |
|
"grad_norm": 0.4998631775379181, |
|
"learning_rate": 1.392156862745098e-05, |
|
"loss": 0.7757, |
|
"mean_token_accuracy": 0.7707598691626208, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.21309655937846836, |
|
"grad_norm": 0.5397401452064514, |
|
"learning_rate": 1.4117647058823532e-05, |
|
"loss": 0.8136, |
|
"mean_token_accuracy": 0.7605449945205573, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.21605623381428044, |
|
"grad_norm": 0.5162031054496765, |
|
"learning_rate": 1.431372549019608e-05, |
|
"loss": 0.7805, |
|
"mean_token_accuracy": 0.7688441270861772, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.21901590825009248, |
|
"grad_norm": 0.4769732654094696, |
|
"learning_rate": 1.4509803921568629e-05, |
|
"loss": 0.8062, |
|
"mean_token_accuracy": 0.7610474880611428, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"grad_norm": 0.48078039288520813, |
|
"learning_rate": 1.4705882352941179e-05, |
|
"loss": 0.8152, |
|
"mean_token_accuracy": 0.7588509310402451, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2249352571217166, |
|
"grad_norm": 0.48076578974723816, |
|
"learning_rate": 1.4901960784313726e-05, |
|
"loss": 0.7886, |
|
"mean_token_accuracy": 0.7669702001266795, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.22789493155752868, |
|
"grad_norm": 0.524426281452179, |
|
"learning_rate": 1.5098039215686276e-05, |
|
"loss": 0.7958, |
|
"mean_token_accuracy": 0.7644518143592102, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.23085460599334073, |
|
"grad_norm": 0.48478269577026367, |
|
"learning_rate": 1.5294117647058822e-05, |
|
"loss": 0.822, |
|
"mean_token_accuracy": 0.7575506383899827, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.23381428042915278, |
|
"grad_norm": 0.49773070216178894, |
|
"learning_rate": 1.5490196078431373e-05, |
|
"loss": 0.8007, |
|
"mean_token_accuracy": 0.7629923994057785, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.23677395486496486, |
|
"grad_norm": 0.5387545228004456, |
|
"learning_rate": 1.568627450980392e-05, |
|
"loss": 0.8225, |
|
"mean_token_accuracy": 0.7566505741674857, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2397336293007769, |
|
"grad_norm": 0.4855351448059082, |
|
"learning_rate": 1.5882352941176473e-05, |
|
"loss": 0.775, |
|
"mean_token_accuracy": 0.769850506922079, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.24269330373658898, |
|
"grad_norm": 0.47540611028671265, |
|
"learning_rate": 1.607843137254902e-05, |
|
"loss": 0.7937, |
|
"mean_token_accuracy": 0.7641365526868825, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.24565297817240103, |
|
"grad_norm": 0.48479974269866943, |
|
"learning_rate": 1.627450980392157e-05, |
|
"loss": 0.8315, |
|
"mean_token_accuracy": 0.7560415146119046, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.2486126526082131, |
|
"grad_norm": 0.5490248203277588, |
|
"learning_rate": 1.647058823529412e-05, |
|
"loss": 0.8276, |
|
"mean_token_accuracy": 0.7542041203825852, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.25157232704402516, |
|
"grad_norm": 0.4909403920173645, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.8113, |
|
"mean_token_accuracy": 0.7590975144970054, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.25453200147983723, |
|
"grad_norm": 0.47584831714630127, |
|
"learning_rate": 1.686274509803922e-05, |
|
"loss": 0.764, |
|
"mean_token_accuracy": 0.7724685847938628, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.25749167591564925, |
|
"grad_norm": 0.49695855379104614, |
|
"learning_rate": 1.7058823529411767e-05, |
|
"loss": 0.7542, |
|
"mean_token_accuracy": 0.7775013855023025, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.26045135035146133, |
|
"grad_norm": 0.5099871754646301, |
|
"learning_rate": 1.7254901960784314e-05, |
|
"loss": 0.7644, |
|
"mean_token_accuracy": 0.7725364928369027, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2634110247872734, |
|
"grad_norm": 0.5371332764625549, |
|
"learning_rate": 1.7450980392156866e-05, |
|
"loss": 0.8248, |
|
"mean_token_accuracy": 0.7555675225937974, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2663706992230855, |
|
"grad_norm": 0.5191521048545837, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 0.8008, |
|
"mean_token_accuracy": 0.7618285114849587, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2693303736588975, |
|
"grad_norm": 0.5234159231185913, |
|
"learning_rate": 1.7843137254901965e-05, |
|
"loss": 0.8007, |
|
"mean_token_accuracy": 0.7619624657540706, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2722900480947096, |
|
"grad_norm": 0.5274977087974548, |
|
"learning_rate": 1.8039215686274513e-05, |
|
"loss": 0.8176, |
|
"mean_token_accuracy": 0.7581840756170707, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.27524972253052166, |
|
"grad_norm": 0.5195613503456116, |
|
"learning_rate": 1.823529411764706e-05, |
|
"loss": 0.7421, |
|
"mean_token_accuracy": 0.7779025499948702, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2782093969663337, |
|
"grad_norm": 0.5123000741004944, |
|
"learning_rate": 1.843137254901961e-05, |
|
"loss": 0.7924, |
|
"mean_token_accuracy": 0.7655979691874065, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.28116907140214575, |
|
"grad_norm": 0.5142971277236938, |
|
"learning_rate": 1.862745098039216e-05, |
|
"loss": 0.7904, |
|
"mean_token_accuracy": 0.7648081417962661, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.28412874583795783, |
|
"grad_norm": 0.5216192007064819, |
|
"learning_rate": 1.8823529411764708e-05, |
|
"loss": 0.7764, |
|
"mean_token_accuracy": 0.7663588698907876, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2870884202737699, |
|
"grad_norm": 0.533979058265686, |
|
"learning_rate": 1.9019607843137255e-05, |
|
"loss": 0.8085, |
|
"mean_token_accuracy": 0.7584315215207101, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2900480947095819, |
|
"grad_norm": 0.4970541000366211, |
|
"learning_rate": 1.9215686274509807e-05, |
|
"loss": 0.7709, |
|
"mean_token_accuracy": 0.7712429032432324, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.293007769145394, |
|
"grad_norm": 0.5441746115684509, |
|
"learning_rate": 1.9411764705882355e-05, |
|
"loss": 0.7992, |
|
"mean_token_accuracy": 0.7626096179397713, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2959674435812061, |
|
"grad_norm": 0.5223695635795593, |
|
"learning_rate": 1.9607843137254903e-05, |
|
"loss": 0.8004, |
|
"mean_token_accuracy": 0.7618210772497175, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2959674435812061, |
|
"eval_loss": 0.8126489520072937, |
|
"eval_mean_token_accuracy": 0.7551172949521177, |
|
"eval_runtime": 24.8878, |
|
"eval_samples_per_second": 5.183, |
|
"eval_steps_per_second": 1.326, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2989271180170181, |
|
"grad_norm": 0.5140753984451294, |
|
"learning_rate": 1.9803921568627454e-05, |
|
"loss": 0.8128, |
|
"mean_token_accuracy": 0.7589419990451155, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.3018867924528302, |
|
"grad_norm": 0.5474939942359924, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7814, |
|
"mean_token_accuracy": 0.7675481741705397, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.30484646688864225, |
|
"grad_norm": 0.5351850390434265, |
|
"learning_rate": 1.9999940277008807e-05, |
|
"loss": 0.8039, |
|
"mean_token_accuracy": 0.7606729614320974, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.30780614132445433, |
|
"grad_norm": 0.5160948038101196, |
|
"learning_rate": 1.99997611087486e-05, |
|
"loss": 0.7853, |
|
"mean_token_accuracy": 0.7661865778379009, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.31076581576026635, |
|
"grad_norm": 0.5185216665267944, |
|
"learning_rate": 1.9999462497359468e-05, |
|
"loss": 0.7549, |
|
"mean_token_accuracy": 0.7736576692294679, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3137254901960784, |
|
"grad_norm": 0.4885355830192566, |
|
"learning_rate": 1.9999044446408203e-05, |
|
"loss": 0.7727, |
|
"mean_token_accuracy": 0.769001304246102, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3166851646318905, |
|
"grad_norm": 0.615883469581604, |
|
"learning_rate": 1.9998506960888258e-05, |
|
"loss": 0.7991, |
|
"mean_token_accuracy": 0.7610468017188765, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.3196448390677026, |
|
"grad_norm": 0.520724892616272, |
|
"learning_rate": 1.999785004721968e-05, |
|
"loss": 0.7932, |
|
"mean_token_accuracy": 0.7632453023136502, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.3226045135035146, |
|
"grad_norm": 0.5822110772132874, |
|
"learning_rate": 1.999707371324904e-05, |
|
"loss": 0.809, |
|
"mean_token_accuracy": 0.7592238599169098, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3255641879393267, |
|
"grad_norm": 0.5411946177482605, |
|
"learning_rate": 1.9996177968249336e-05, |
|
"loss": 0.738, |
|
"mean_token_accuracy": 0.7779971005622943, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.32852386237513875, |
|
"grad_norm": 0.5421875715255737, |
|
"learning_rate": 1.999516282291988e-05, |
|
"loss": 0.8056, |
|
"mean_token_accuracy": 0.7613603734181074, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3314835368109508, |
|
"grad_norm": 0.5699617266654968, |
|
"learning_rate": 1.999402828938618e-05, |
|
"loss": 0.7994, |
|
"mean_token_accuracy": 0.7613545035512745, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.33444321124676285, |
|
"grad_norm": 0.5206153988838196, |
|
"learning_rate": 1.999277438119978e-05, |
|
"loss": 0.7778, |
|
"mean_token_accuracy": 0.7683061531083251, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.3374028856825749, |
|
"grad_norm": 0.5244638323783875, |
|
"learning_rate": 1.9991401113338103e-05, |
|
"loss": 0.8023, |
|
"mean_token_accuracy": 0.7609702591724479, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.340362560118387, |
|
"grad_norm": 0.5344120860099792, |
|
"learning_rate": 1.9989908502204295e-05, |
|
"loss": 0.7793, |
|
"mean_token_accuracy": 0.7666984560859803, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.343322234554199, |
|
"grad_norm": 0.505351185798645, |
|
"learning_rate": 1.9988296565626988e-05, |
|
"loss": 0.7577, |
|
"mean_token_accuracy": 0.7727362055103163, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3462819089900111, |
|
"grad_norm": 0.5267241597175598, |
|
"learning_rate": 1.9986565322860117e-05, |
|
"loss": 0.8223, |
|
"mean_token_accuracy": 0.7553329490401921, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3492415834258232, |
|
"grad_norm": 0.5347175002098083, |
|
"learning_rate": 1.9984714794582682e-05, |
|
"loss": 0.8163, |
|
"mean_token_accuracy": 0.7553841783952017, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3522012578616352, |
|
"grad_norm": 0.5740127563476562, |
|
"learning_rate": 1.99827450028985e-05, |
|
"loss": 0.7804, |
|
"mean_token_accuracy": 0.7664757137429672, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3551609322974473, |
|
"grad_norm": 0.5313867330551147, |
|
"learning_rate": 1.9980655971335944e-05, |
|
"loss": 0.81, |
|
"mean_token_accuracy": 0.7596098693206174, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.35812060673325935, |
|
"grad_norm": 0.5177193284034729, |
|
"learning_rate": 1.9978447724847655e-05, |
|
"loss": 0.7956, |
|
"mean_token_accuracy": 0.7617352886178098, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3610802811690714, |
|
"grad_norm": 0.564724326133728, |
|
"learning_rate": 1.9976120289810247e-05, |
|
"loss": 0.8109, |
|
"mean_token_accuracy": 0.7577093596124115, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.36403995560488345, |
|
"grad_norm": 0.539661169052124, |
|
"learning_rate": 1.9973673694024002e-05, |
|
"loss": 0.7858, |
|
"mean_token_accuracy": 0.7645922340526763, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3669996300406955, |
|
"grad_norm": 0.5084680318832397, |
|
"learning_rate": 1.9971107966712518e-05, |
|
"loss": 0.7463, |
|
"mean_token_accuracy": 0.7753920713027525, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3699593044765076, |
|
"grad_norm": 0.4952844977378845, |
|
"learning_rate": 1.9968423138522382e-05, |
|
"loss": 0.7739, |
|
"mean_token_accuracy": 0.7676340081581494, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3729189789123196, |
|
"grad_norm": 0.5472536087036133, |
|
"learning_rate": 1.996561924152278e-05, |
|
"loss": 0.8, |
|
"mean_token_accuracy": 0.7616544988854603, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3758786533481317, |
|
"grad_norm": 0.5309717059135437, |
|
"learning_rate": 1.9962696309205146e-05, |
|
"loss": 0.7776, |
|
"mean_token_accuracy": 0.7678901777514975, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3788383277839438, |
|
"grad_norm": 0.5029951930046082, |
|
"learning_rate": 1.995965437648273e-05, |
|
"loss": 0.7761, |
|
"mean_token_accuracy": 0.766595985687639, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.38179800221975585, |
|
"grad_norm": 0.5340363383293152, |
|
"learning_rate": 1.995649347969019e-05, |
|
"loss": 0.7457, |
|
"mean_token_accuracy": 0.7745559370999009, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.38475767665556787, |
|
"grad_norm": 0.5484894514083862, |
|
"learning_rate": 1.995321365658317e-05, |
|
"loss": 0.7997, |
|
"mean_token_accuracy": 0.7594812381150867, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.38771735109137995, |
|
"grad_norm": 0.6396868228912354, |
|
"learning_rate": 1.994981494633784e-05, |
|
"loss": 0.7976, |
|
"mean_token_accuracy": 0.7599872025655441, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.390677025527192, |
|
"grad_norm": 0.5394526124000549, |
|
"learning_rate": 1.9946297389550433e-05, |
|
"loss": 0.7993, |
|
"mean_token_accuracy": 0.7608908088026568, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.39363669996300404, |
|
"grad_norm": 0.6235033869743347, |
|
"learning_rate": 1.9942661028236746e-05, |
|
"loss": 0.787, |
|
"mean_token_accuracy": 0.7650479719064859, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3965963743988161, |
|
"grad_norm": 0.5509399175643921, |
|
"learning_rate": 1.9938905905831657e-05, |
|
"loss": 0.7841, |
|
"mean_token_accuracy": 0.7647842322769413, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3995560488346282, |
|
"grad_norm": 0.589085578918457, |
|
"learning_rate": 1.993503206718859e-05, |
|
"loss": 0.7701, |
|
"mean_token_accuracy": 0.7691342710083168, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4025157232704403, |
|
"grad_norm": 0.5094689726829529, |
|
"learning_rate": 1.9931039558578997e-05, |
|
"loss": 0.755, |
|
"mean_token_accuracy": 0.773621444740238, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.4054753977062523, |
|
"grad_norm": 0.5288008451461792, |
|
"learning_rate": 1.9926928427691788e-05, |
|
"loss": 0.733, |
|
"mean_token_accuracy": 0.7798217961404127, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.40843507214206437, |
|
"grad_norm": 0.5860950350761414, |
|
"learning_rate": 1.992269872363277e-05, |
|
"loss": 0.7793, |
|
"mean_token_accuracy": 0.7671219171893889, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.41139474657787645, |
|
"grad_norm": 0.5211442708969116, |
|
"learning_rate": 1.991835049692405e-05, |
|
"loss": 0.7589, |
|
"mean_token_accuracy": 0.7712709984233845, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.41435442101368847, |
|
"grad_norm": 0.6341312527656555, |
|
"learning_rate": 1.991388379950346e-05, |
|
"loss": 0.7555, |
|
"mean_token_accuracy": 0.7726431562687772, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.41731409544950054, |
|
"grad_norm": 0.5119423866271973, |
|
"learning_rate": 1.9909298684723905e-05, |
|
"loss": 0.7696, |
|
"mean_token_accuracy": 0.7683422766172284, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4202737698853126, |
|
"grad_norm": 0.5573475956916809, |
|
"learning_rate": 1.9904595207352736e-05, |
|
"loss": 0.7586, |
|
"mean_token_accuracy": 0.7709694689177727, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.4232334443211247, |
|
"grad_norm": 0.5152528882026672, |
|
"learning_rate": 1.9899773423571102e-05, |
|
"loss": 0.742, |
|
"mean_token_accuracy": 0.776040556686583, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.4261931187569367, |
|
"grad_norm": 0.5058140754699707, |
|
"learning_rate": 1.9894833390973266e-05, |
|
"loss": 0.8094, |
|
"mean_token_accuracy": 0.7577251436684603, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.4291527931927488, |
|
"grad_norm": 0.5282382965087891, |
|
"learning_rate": 1.9889775168565942e-05, |
|
"loss": 0.7748, |
|
"mean_token_accuracy": 0.7683045482642854, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.43211246762856087, |
|
"grad_norm": 0.6103954315185547, |
|
"learning_rate": 1.9884598816767563e-05, |
|
"loss": 0.805, |
|
"mean_token_accuracy": 0.7593984139774315, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.43507214206437295, |
|
"grad_norm": 0.530112087726593, |
|
"learning_rate": 1.987930439740757e-05, |
|
"loss": 0.7537, |
|
"mean_token_accuracy": 0.7733501196092509, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.43803181650018497, |
|
"grad_norm": 0.5501434206962585, |
|
"learning_rate": 1.9873891973725673e-05, |
|
"loss": 0.752, |
|
"mean_token_accuracy": 0.7755143180889366, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.44099149093599704, |
|
"grad_norm": 0.496888667345047, |
|
"learning_rate": 1.98683616103711e-05, |
|
"loss": 0.7624, |
|
"mean_token_accuracy": 0.7707987778410632, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"grad_norm": 0.5206103324890137, |
|
"learning_rate": 1.986271337340182e-05, |
|
"loss": 0.7754, |
|
"mean_token_accuracy": 0.7663099883208253, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.44691083980762114, |
|
"grad_norm": 0.5429675579071045, |
|
"learning_rate": 1.9856947330283752e-05, |
|
"loss": 0.7418, |
|
"mean_token_accuracy": 0.7745724176097732, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4498705142434332, |
|
"grad_norm": 0.515471875667572, |
|
"learning_rate": 1.985106354988997e-05, |
|
"loss": 0.7517, |
|
"mean_token_accuracy": 0.7713102457643006, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4528301886792453, |
|
"grad_norm": 0.5580022931098938, |
|
"learning_rate": 1.984506210249986e-05, |
|
"loss": 0.7372, |
|
"mean_token_accuracy": 0.7783837879306136, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.45578986311505737, |
|
"grad_norm": 0.5351727604866028, |
|
"learning_rate": 1.9838943059798305e-05, |
|
"loss": 0.7632, |
|
"mean_token_accuracy": 0.7712246098769842, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4587495375508694, |
|
"grad_norm": 0.5970275402069092, |
|
"learning_rate": 1.9832706494874812e-05, |
|
"loss": 0.7852, |
|
"mean_token_accuracy": 0.7650099910061801, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.46170921198668147, |
|
"grad_norm": 0.535476803779602, |
|
"learning_rate": 1.982635248222264e-05, |
|
"loss": 0.8135, |
|
"mean_token_accuracy": 0.7548096205593479, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.46466888642249354, |
|
"grad_norm": 0.5446284413337708, |
|
"learning_rate": 1.9819881097737917e-05, |
|
"loss": 0.7753, |
|
"mean_token_accuracy": 0.766597256567756, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.46762856085830556, |
|
"grad_norm": 0.5779156684875488, |
|
"learning_rate": 1.9813292418718734e-05, |
|
"loss": 0.8178, |
|
"mean_token_accuracy": 0.7556820545263497, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.5383659601211548, |
|
"learning_rate": 1.9806586523864212e-05, |
|
"loss": 0.7787, |
|
"mean_token_accuracy": 0.7652851550298655, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4735479097299297, |
|
"grad_norm": 0.5274466872215271, |
|
"learning_rate": 1.9799763493273572e-05, |
|
"loss": 0.7451, |
|
"mean_token_accuracy": 0.7758701051335468, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4765075841657418, |
|
"grad_norm": 0.5253377556800842, |
|
"learning_rate": 1.9792823408445173e-05, |
|
"loss": 0.7794, |
|
"mean_token_accuracy": 0.7660881601135704, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4794672586015538, |
|
"grad_norm": 0.6184384822845459, |
|
"learning_rate": 1.978576635227554e-05, |
|
"loss": 0.7705, |
|
"mean_token_accuracy": 0.7684919526843087, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4824269330373659, |
|
"grad_norm": 0.5399531126022339, |
|
"learning_rate": 1.9778592409058376e-05, |
|
"loss": 0.7496, |
|
"mean_token_accuracy": 0.7751951559026848, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.48538660747317797, |
|
"grad_norm": 0.5651612281799316, |
|
"learning_rate": 1.9771301664483548e-05, |
|
"loss": 0.7637, |
|
"mean_token_accuracy": 0.770090502168717, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.48834628190899, |
|
"grad_norm": 0.6314195394515991, |
|
"learning_rate": 1.976389420563607e-05, |
|
"loss": 0.7634, |
|
"mean_token_accuracy": 0.7709643457026975, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.49130595634480206, |
|
"grad_norm": 0.5370025634765625, |
|
"learning_rate": 1.975637012099507e-05, |
|
"loss": 0.7467, |
|
"mean_token_accuracy": 0.7752258048770664, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.49426563078061414, |
|
"grad_norm": 0.5424651503562927, |
|
"learning_rate": 1.97487295004327e-05, |
|
"loss": 0.7933, |
|
"mean_token_accuracy": 0.760696495825342, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4972253052164262, |
|
"grad_norm": 0.5711933970451355, |
|
"learning_rate": 1.9740972435213114e-05, |
|
"loss": 0.7928, |
|
"mean_token_accuracy": 0.761649293131421, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.5001849796522383, |
|
"grad_norm": 0.5219062566757202, |
|
"learning_rate": 1.9733099017991342e-05, |
|
"loss": 0.7861, |
|
"mean_token_accuracy": 0.7628095190256412, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"grad_norm": 0.4978106617927551, |
|
"learning_rate": 1.972510934281218e-05, |
|
"loss": 0.7631, |
|
"mean_token_accuracy": 0.7710752711114524, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5061043285238623, |
|
"grad_norm": 0.6013402938842773, |
|
"learning_rate": 1.9717003505109097e-05, |
|
"loss": 0.7991, |
|
"mean_token_accuracy": 0.7586557673484216, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.5090640029596745, |
|
"grad_norm": 0.5215644836425781, |
|
"learning_rate": 1.9708781601703066e-05, |
|
"loss": 0.763, |
|
"mean_token_accuracy": 0.7695876606622123, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5120236773954865, |
|
"grad_norm": 0.49007105827331543, |
|
"learning_rate": 1.9700443730801412e-05, |
|
"loss": 0.7644, |
|
"mean_token_accuracy": 0.7701783635410456, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5149833518312985, |
|
"grad_norm": 0.5938363075256348, |
|
"learning_rate": 1.9691989991996663e-05, |
|
"loss": 0.7643, |
|
"mean_token_accuracy": 0.7680917186626302, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5179430262671106, |
|
"grad_norm": 0.5483390092849731, |
|
"learning_rate": 1.9683420486265328e-05, |
|
"loss": 0.7651, |
|
"mean_token_accuracy": 0.7709870011542461, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5209027007029227, |
|
"grad_norm": 0.5027016997337341, |
|
"learning_rate": 1.967473531596671e-05, |
|
"loss": 0.7513, |
|
"mean_token_accuracy": 0.7730452420829894, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5238623751387348, |
|
"grad_norm": 0.5310905575752258, |
|
"learning_rate": 1.966593458484168e-05, |
|
"loss": 0.7715, |
|
"mean_token_accuracy": 0.7680981483212205, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5268220495745468, |
|
"grad_norm": 0.5523523688316345, |
|
"learning_rate": 1.9657018398011435e-05, |
|
"loss": 0.7674, |
|
"mean_token_accuracy": 0.7684800548855188, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.5297817240103588, |
|
"grad_norm": 0.5446920394897461, |
|
"learning_rate": 1.9647986861976246e-05, |
|
"loss": 0.773, |
|
"mean_token_accuracy": 0.7688905853900413, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.532741398446171, |
|
"grad_norm": 0.5408650636672974, |
|
"learning_rate": 1.9638840084614182e-05, |
|
"loss": 0.7204, |
|
"mean_token_accuracy": 0.7827706625253021, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.535701072881983, |
|
"grad_norm": 0.5880627632141113, |
|
"learning_rate": 1.9629578175179823e-05, |
|
"loss": 0.7587, |
|
"mean_token_accuracy": 0.7718611041296293, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.538660747317795, |
|
"grad_norm": 0.5494539141654968, |
|
"learning_rate": 1.9620201244302952e-05, |
|
"loss": 0.7487, |
|
"mean_token_accuracy": 0.7745951212558507, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.5416204217536071, |
|
"grad_norm": 0.5416110754013062, |
|
"learning_rate": 1.9610709403987248e-05, |
|
"loss": 0.7583, |
|
"mean_token_accuracy": 0.7723263702843611, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.5445800961894192, |
|
"grad_norm": 0.5187686681747437, |
|
"learning_rate": 1.9601102767608924e-05, |
|
"loss": 0.7727, |
|
"mean_token_accuracy": 0.7669931715546834, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5475397706252312, |
|
"grad_norm": 0.6072437763214111, |
|
"learning_rate": 1.95913814499154e-05, |
|
"loss": 0.7758, |
|
"mean_token_accuracy": 0.7658226539132729, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5504994450610433, |
|
"grad_norm": 0.5267654061317444, |
|
"learning_rate": 1.95815455670239e-05, |
|
"loss": 0.7799, |
|
"mean_token_accuracy": 0.7644383151164792, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5534591194968553, |
|
"grad_norm": 0.5116267800331116, |
|
"learning_rate": 1.9571595236420103e-05, |
|
"loss": 0.765, |
|
"mean_token_accuracy": 0.7686784858855072, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5564187939326674, |
|
"grad_norm": 0.5083511471748352, |
|
"learning_rate": 1.9561530576956703e-05, |
|
"loss": 0.7293, |
|
"mean_token_accuracy": 0.7782823905710549, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5593784683684795, |
|
"grad_norm": 0.5557141900062561, |
|
"learning_rate": 1.955135170885202e-05, |
|
"loss": 0.7426, |
|
"mean_token_accuracy": 0.7763826979033814, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5623381428042915, |
|
"grad_norm": 0.5787784457206726, |
|
"learning_rate": 1.9541058753688538e-05, |
|
"loss": 0.7484, |
|
"mean_token_accuracy": 0.7738303017670985, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5652978172401036, |
|
"grad_norm": 0.5557724237442017, |
|
"learning_rate": 1.9530651834411477e-05, |
|
"loss": 0.7603, |
|
"mean_token_accuracy": 0.7699144780244102, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5682574916759157, |
|
"grad_norm": 0.5540621876716614, |
|
"learning_rate": 1.95201310753273e-05, |
|
"loss": 0.7224, |
|
"mean_token_accuracy": 0.7793132883624135, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5712171661117277, |
|
"grad_norm": 0.5053984522819519, |
|
"learning_rate": 1.9509496602102253e-05, |
|
"loss": 0.7258, |
|
"mean_token_accuracy": 0.7800444754491836, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5741768405475398, |
|
"grad_norm": 0.49898284673690796, |
|
"learning_rate": 1.9498748541760845e-05, |
|
"loss": 0.7396, |
|
"mean_token_accuracy": 0.7753466916631608, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5771365149833518, |
|
"grad_norm": 0.5799064040184021, |
|
"learning_rate": 1.9487887022684336e-05, |
|
"loss": 0.7602, |
|
"mean_token_accuracy": 0.7701053674537776, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5800961894191639, |
|
"grad_norm": 0.5606354475021362, |
|
"learning_rate": 1.947691217460921e-05, |
|
"loss": 0.7544, |
|
"mean_token_accuracy": 0.77100937072039, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.583055863854976, |
|
"grad_norm": 0.4998267590999603, |
|
"learning_rate": 1.946582412862562e-05, |
|
"loss": 0.766, |
|
"mean_token_accuracy": 0.7682667265118656, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.586015538290788, |
|
"grad_norm": 0.5629295110702515, |
|
"learning_rate": 1.9454623017175814e-05, |
|
"loss": 0.7424, |
|
"mean_token_accuracy": 0.7752050364586516, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5889752127266, |
|
"grad_norm": 0.4932561218738556, |
|
"learning_rate": 1.9443308974052574e-05, |
|
"loss": 0.7489, |
|
"mean_token_accuracy": 0.7741070965947788, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5919348871624122, |
|
"grad_norm": 0.6265371441841125, |
|
"learning_rate": 1.9431882134397596e-05, |
|
"loss": 0.7658, |
|
"mean_token_accuracy": 0.7681866412889478, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5919348871624122, |
|
"eval_loss": 0.778282105922699, |
|
"eval_mean_token_accuracy": 0.7620499776343601, |
|
"eval_runtime": 24.5192, |
|
"eval_samples_per_second": 5.261, |
|
"eval_steps_per_second": 1.346, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5948945615982242, |
|
"grad_norm": 0.5446656346321106, |
|
"learning_rate": 1.9420342634699893e-05, |
|
"loss": 0.722, |
|
"mean_token_accuracy": 0.7810348950987986, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5978542360340362, |
|
"grad_norm": 0.5253841876983643, |
|
"learning_rate": 1.9408690612794146e-05, |
|
"loss": 0.7758, |
|
"mean_token_accuracy": 0.7659725997741449, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.6008139104698483, |
|
"grad_norm": 0.5887268781661987, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.7107, |
|
"mean_token_accuracy": 0.7828422379162261, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6037735849056604, |
|
"grad_norm": 0.5546231269836426, |
|
"learning_rate": 1.9385049560415794e-05, |
|
"loss": 0.7812, |
|
"mean_token_accuracy": 0.7646388223607241, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6067332593414725, |
|
"grad_norm": 0.5595012307167053, |
|
"learning_rate": 1.9373060812326053e-05, |
|
"loss": 0.7368, |
|
"mean_token_accuracy": 0.7771756648124704, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6096929337772845, |
|
"grad_norm": 0.6051347255706787, |
|
"learning_rate": 1.9360960106790645e-05, |
|
"loss": 0.7637, |
|
"mean_token_accuracy": 0.7687831877533422, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.6126526082130965, |
|
"grad_norm": 0.5045530200004578, |
|
"learning_rate": 1.9348747588347637e-05, |
|
"loss": 0.7633, |
|
"mean_token_accuracy": 0.7716161599834406, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.6156122826489087, |
|
"grad_norm": 0.5844081044197083, |
|
"learning_rate": 1.9336423402870655e-05, |
|
"loss": 0.7634, |
|
"mean_token_accuracy": 0.7698122225847835, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.6185719570847207, |
|
"grad_norm": 0.516323983669281, |
|
"learning_rate": 1.932398769756714e-05, |
|
"loss": 0.7347, |
|
"mean_token_accuracy": 0.7758576109605254, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.6215316315205327, |
|
"grad_norm": 0.6504623293876648, |
|
"learning_rate": 1.9311440620976597e-05, |
|
"loss": 0.7375, |
|
"mean_token_accuracy": 0.7756102635673668, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6244913059563448, |
|
"grad_norm": 0.6118385195732117, |
|
"learning_rate": 1.9298782322968817e-05, |
|
"loss": 0.7734, |
|
"mean_token_accuracy": 0.7640280400757476, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.6274509803921569, |
|
"grad_norm": 0.5381941795349121, |
|
"learning_rate": 1.9286012954742078e-05, |
|
"loss": 0.7426, |
|
"mean_token_accuracy": 0.7750216295859045, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.6304106548279689, |
|
"grad_norm": 0.6116046905517578, |
|
"learning_rate": 1.9273132668821363e-05, |
|
"loss": 0.7894, |
|
"mean_token_accuracy": 0.7624240258634218, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.633370329263781, |
|
"grad_norm": 0.5995723009109497, |
|
"learning_rate": 1.9260141619056507e-05, |
|
"loss": 0.8063, |
|
"mean_token_accuracy": 0.7580679708807321, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.636330003699593, |
|
"grad_norm": 0.6060746312141418, |
|
"learning_rate": 1.924703996062038e-05, |
|
"loss": 0.7825, |
|
"mean_token_accuracy": 0.7644491908483929, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6392896781354052, |
|
"grad_norm": 0.4967659115791321, |
|
"learning_rate": 1.9233827850007028e-05, |
|
"loss": 0.7419, |
|
"mean_token_accuracy": 0.7752207223816133, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.6422493525712172, |
|
"grad_norm": 0.5452144145965576, |
|
"learning_rate": 1.9220505445029803e-05, |
|
"loss": 0.7419, |
|
"mean_token_accuracy": 0.7768822483190798, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.6452090270070292, |
|
"grad_norm": 0.5308946967124939, |
|
"learning_rate": 1.9207072904819484e-05, |
|
"loss": 0.7867, |
|
"mean_token_accuracy": 0.7616907876516587, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.6481687014428413, |
|
"grad_norm": 0.5080918669700623, |
|
"learning_rate": 1.9193530389822364e-05, |
|
"loss": 0.7551, |
|
"mean_token_accuracy": 0.7722103161394469, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.6511283758786534, |
|
"grad_norm": 0.5541013479232788, |
|
"learning_rate": 1.9179878061798347e-05, |
|
"loss": 0.7416, |
|
"mean_token_accuracy": 0.7758964006687687, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6540880503144654, |
|
"grad_norm": 0.5555444955825806, |
|
"learning_rate": 1.9166116083819002e-05, |
|
"loss": 0.7735, |
|
"mean_token_accuracy": 0.7667690994886073, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6570477247502775, |
|
"grad_norm": 0.5138890743255615, |
|
"learning_rate": 1.915224462026563e-05, |
|
"loss": 0.7689, |
|
"mean_token_accuracy": 0.7680507811975301, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6600073991860895, |
|
"grad_norm": 0.5619951486587524, |
|
"learning_rate": 1.913826383682729e-05, |
|
"loss": 0.7776, |
|
"mean_token_accuracy": 0.7642446287241815, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6629670736219015, |
|
"grad_norm": 0.49697887897491455, |
|
"learning_rate": 1.912417390049882e-05, |
|
"loss": 0.7564, |
|
"mean_token_accuracy": 0.7708950011889235, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"grad_norm": 0.5893805027008057, |
|
"learning_rate": 1.9109974979578852e-05, |
|
"loss": 0.7347, |
|
"mean_token_accuracy": 0.7758372095558704, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6688864224935257, |
|
"grad_norm": 0.5565352439880371, |
|
"learning_rate": 1.909566724366779e-05, |
|
"loss": 0.7619, |
|
"mean_token_accuracy": 0.76937331341953, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6718460969293377, |
|
"grad_norm": 0.581122875213623, |
|
"learning_rate": 1.9081250863665794e-05, |
|
"loss": 0.7459, |
|
"mean_token_accuracy": 0.7744230618996671, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6748057713651499, |
|
"grad_norm": 0.6203576326370239, |
|
"learning_rate": 1.9066726011770725e-05, |
|
"loss": 0.7403, |
|
"mean_token_accuracy": 0.7757174096012653, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.6777654458009619, |
|
"grad_norm": 0.5231543779373169, |
|
"learning_rate": 1.905209286147611e-05, |
|
"loss": 0.7291, |
|
"mean_token_accuracy": 0.7789308093459126, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.680725120236774, |
|
"grad_norm": 0.5227301120758057, |
|
"learning_rate": 1.903735158756905e-05, |
|
"loss": 0.7267, |
|
"mean_token_accuracy": 0.780063648206095, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.683684794672586, |
|
"grad_norm": 0.5774472951889038, |
|
"learning_rate": 1.9022502366128136e-05, |
|
"loss": 0.7626, |
|
"mean_token_accuracy": 0.7701068030426402, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.686644469108398, |
|
"grad_norm": 0.5350067615509033, |
|
"learning_rate": 1.9007545374521354e-05, |
|
"loss": 0.7727, |
|
"mean_token_accuracy": 0.767009637419523, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6896041435442102, |
|
"grad_norm": 0.543245792388916, |
|
"learning_rate": 1.8992480791403957e-05, |
|
"loss": 0.7258, |
|
"mean_token_accuracy": 0.7811048484724694, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6925638179800222, |
|
"grad_norm": 0.6067213416099548, |
|
"learning_rate": 1.897730879671634e-05, |
|
"loss": 0.7454, |
|
"mean_token_accuracy": 0.7739789538178186, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6955234924158342, |
|
"grad_norm": 0.5219905972480774, |
|
"learning_rate": 1.8962029571681887e-05, |
|
"loss": 0.7094, |
|
"mean_token_accuracy": 0.7855872005269757, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6984831668516464, |
|
"grad_norm": 0.5807480216026306, |
|
"learning_rate": 1.8946643298804794e-05, |
|
"loss": 0.7701, |
|
"mean_token_accuracy": 0.7658029579586856, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.7014428412874584, |
|
"grad_norm": 0.4960806965827942, |
|
"learning_rate": 1.8931150161867917e-05, |
|
"loss": 0.7285, |
|
"mean_token_accuracy": 0.7792831489593245, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.7044025157232704, |
|
"grad_norm": 0.5792670249938965, |
|
"learning_rate": 1.891555034593055e-05, |
|
"loss": 0.7467, |
|
"mean_token_accuracy": 0.7733710687900762, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.7073621901590825, |
|
"grad_norm": 0.5364589691162109, |
|
"learning_rate": 1.8899844037326227e-05, |
|
"loss": 0.7195, |
|
"mean_token_accuracy": 0.7821820109931461, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.7103218645948945, |
|
"grad_norm": 0.5596705079078674, |
|
"learning_rate": 1.8884031423660492e-05, |
|
"loss": 0.7047, |
|
"mean_token_accuracy": 0.785649852431446, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7132815390307066, |
|
"grad_norm": 0.5741063356399536, |
|
"learning_rate": 1.8868112693808664e-05, |
|
"loss": 0.7663, |
|
"mean_token_accuracy": 0.7678326165991625, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.7162412134665187, |
|
"grad_norm": 0.516858696937561, |
|
"learning_rate": 1.8852088037913577e-05, |
|
"loss": 0.7471, |
|
"mean_token_accuracy": 0.7746923216355659, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.7192008879023307, |
|
"grad_norm": 0.5048111081123352, |
|
"learning_rate": 1.8835957647383304e-05, |
|
"loss": 0.7023, |
|
"mean_token_accuracy": 0.7870937976717415, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.7221605623381429, |
|
"grad_norm": 0.5660455226898193, |
|
"learning_rate": 1.8819721714888878e-05, |
|
"loss": 0.7795, |
|
"mean_token_accuracy": 0.7642331478723334, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.7251202367739549, |
|
"grad_norm": 0.5211176872253418, |
|
"learning_rate": 1.8803380434362e-05, |
|
"loss": 0.7342, |
|
"mean_token_accuracy": 0.7781391886138683, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7280799112097669, |
|
"grad_norm": 0.5142192244529724, |
|
"learning_rate": 1.878693400099269e-05, |
|
"loss": 0.7301, |
|
"mean_token_accuracy": 0.7786941626128209, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.731039585645579, |
|
"grad_norm": 0.5370232462882996, |
|
"learning_rate": 1.877038261122699e-05, |
|
"loss": 0.7593, |
|
"mean_token_accuracy": 0.771669201717037, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.733999260081391, |
|
"grad_norm": 0.49543988704681396, |
|
"learning_rate": 1.87537264627646e-05, |
|
"loss": 0.7216, |
|
"mean_token_accuracy": 0.7810789864692633, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.7369589345172031, |
|
"grad_norm": 0.56675785779953, |
|
"learning_rate": 1.8736965754556527e-05, |
|
"loss": 0.7627, |
|
"mean_token_accuracy": 0.7688760359914193, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.7399186089530152, |
|
"grad_norm": 0.524047315120697, |
|
"learning_rate": 1.8720100686802693e-05, |
|
"loss": 0.7551, |
|
"mean_token_accuracy": 0.7700947445179971, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7428782833888272, |
|
"grad_norm": 0.5166477560997009, |
|
"learning_rate": 1.8703131460949555e-05, |
|
"loss": 0.7785, |
|
"mean_token_accuracy": 0.7636579872205778, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.7458379578246392, |
|
"grad_norm": 0.5201772451400757, |
|
"learning_rate": 1.86860582796877e-05, |
|
"loss": 0.736, |
|
"mean_token_accuracy": 0.7761137360141643, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.7487976322604514, |
|
"grad_norm": 0.6423028707504272, |
|
"learning_rate": 1.866888134694942e-05, |
|
"loss": 0.7454, |
|
"mean_token_accuracy": 0.7750494962065552, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.7517573066962634, |
|
"grad_norm": 0.5888985395431519, |
|
"learning_rate": 1.865160086790627e-05, |
|
"loss": 0.7238, |
|
"mean_token_accuracy": 0.7800915239288521, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.5778961181640625, |
|
"learning_rate": 1.8634217048966638e-05, |
|
"loss": 0.7658, |
|
"mean_token_accuracy": 0.7687186514339136, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7576766555678875, |
|
"grad_norm": 0.5808703303337097, |
|
"learning_rate": 1.861673009777325e-05, |
|
"loss": 0.7449, |
|
"mean_token_accuracy": 0.7729568426414184, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.7606363300036996, |
|
"grad_norm": 0.5731485486030579, |
|
"learning_rate": 1.8599140223200716e-05, |
|
"loss": 0.748, |
|
"mean_token_accuracy": 0.7729810722706314, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.7635960044395117, |
|
"grad_norm": 0.5766414403915405, |
|
"learning_rate": 1.858144763535302e-05, |
|
"loss": 0.7782, |
|
"mean_token_accuracy": 0.764375293579256, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.7665556788753237, |
|
"grad_norm": 0.5422239899635315, |
|
"learning_rate": 1.8563652545561014e-05, |
|
"loss": 0.7329, |
|
"mean_token_accuracy": 0.7776419690528588, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.7695153533111357, |
|
"grad_norm": 0.5828793048858643, |
|
"learning_rate": 1.8545755166379898e-05, |
|
"loss": 0.7186, |
|
"mean_token_accuracy": 0.7822970814680493, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7724750277469479, |
|
"grad_norm": 0.5449491739273071, |
|
"learning_rate": 1.852775571158668e-05, |
|
"loss": 0.7711, |
|
"mean_token_accuracy": 0.7660281761867683, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7754347021827599, |
|
"grad_norm": 0.5476288795471191, |
|
"learning_rate": 1.850965439617761e-05, |
|
"loss": 0.7404, |
|
"mean_token_accuracy": 0.7736120045020073, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.7783943766185719, |
|
"grad_norm": 0.6878018975257874, |
|
"learning_rate": 1.8491451436365628e-05, |
|
"loss": 0.7758, |
|
"mean_token_accuracy": 0.7640672296658151, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.781354051054384, |
|
"grad_norm": 0.5300653576850891, |
|
"learning_rate": 1.8473147049577777e-05, |
|
"loss": 0.7666, |
|
"mean_token_accuracy": 0.7686153435173708, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 0.6327837705612183, |
|
"learning_rate": 1.8454741454452604e-05, |
|
"loss": 0.7521, |
|
"mean_token_accuracy": 0.7717832959346983, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7872733999260081, |
|
"grad_norm": 0.5409294366836548, |
|
"learning_rate": 1.843623487083755e-05, |
|
"loss": 0.7404, |
|
"mean_token_accuracy": 0.7766533132408322, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.7902330743618202, |
|
"grad_norm": 0.5834295749664307, |
|
"learning_rate": 1.8417627519786317e-05, |
|
"loss": 0.7592, |
|
"mean_token_accuracy": 0.7693419786318872, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7931927487976322, |
|
"grad_norm": 0.5921277403831482, |
|
"learning_rate": 1.839891962355624e-05, |
|
"loss": 0.7162, |
|
"mean_token_accuracy": 0.7820607013724311, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7961524232334444, |
|
"grad_norm": 0.5238744020462036, |
|
"learning_rate": 1.838011140560562e-05, |
|
"loss": 0.7565, |
|
"mean_token_accuracy": 0.770343025952529, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7991120976692564, |
|
"grad_norm": 0.5569880604743958, |
|
"learning_rate": 1.836120309059107e-05, |
|
"loss": 0.7488, |
|
"mean_token_accuracy": 0.7728957184836894, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8020717721050684, |
|
"grad_norm": 0.5647782683372498, |
|
"learning_rate": 1.8342194904364815e-05, |
|
"loss": 0.7135, |
|
"mean_token_accuracy": 0.7830927354241212, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.8050314465408805, |
|
"grad_norm": 0.5411779284477234, |
|
"learning_rate": 1.8323087073971996e-05, |
|
"loss": 0.7366, |
|
"mean_token_accuracy": 0.775458202599469, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.8079911209766926, |
|
"grad_norm": 0.6045868992805481, |
|
"learning_rate": 1.8303879827647977e-05, |
|
"loss": 0.7544, |
|
"mean_token_accuracy": 0.7712791582365803, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.8109507954125046, |
|
"grad_norm": 0.5784792304039001, |
|
"learning_rate": 1.8284573394815596e-05, |
|
"loss": 0.7448, |
|
"mean_token_accuracy": 0.7737621785267094, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.8139104698483167, |
|
"grad_norm": 0.5260710120201111, |
|
"learning_rate": 1.826516800608244e-05, |
|
"loss": 0.7627, |
|
"mean_token_accuracy": 0.7694265078345902, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8168701442841287, |
|
"grad_norm": 0.5844061374664307, |
|
"learning_rate": 1.8245663893238075e-05, |
|
"loss": 0.7653, |
|
"mean_token_accuracy": 0.7686764548943624, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.8198298187199408, |
|
"grad_norm": 0.5687382221221924, |
|
"learning_rate": 1.8226061289251297e-05, |
|
"loss": 0.7631, |
|
"mean_token_accuracy": 0.7688321516962094, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.8227894931557529, |
|
"grad_norm": 0.5046533942222595, |
|
"learning_rate": 1.8206360428267332e-05, |
|
"loss": 0.6843, |
|
"mean_token_accuracy": 0.7910775752871206, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.8257491675915649, |
|
"grad_norm": 0.6087561249732971, |
|
"learning_rate": 1.8186561545605055e-05, |
|
"loss": 0.7596, |
|
"mean_token_accuracy": 0.7701909103269404, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.8287088420273769, |
|
"grad_norm": 0.5349226593971252, |
|
"learning_rate": 1.816666487775416e-05, |
|
"loss": 0.7453, |
|
"mean_token_accuracy": 0.7745023453893125, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8316685164631891, |
|
"grad_norm": 0.549005389213562, |
|
"learning_rate": 1.8146670662372353e-05, |
|
"loss": 0.7424, |
|
"mean_token_accuracy": 0.7753068407668716, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.8346281908990011, |
|
"grad_norm": 0.5528567433357239, |
|
"learning_rate": 1.8126579138282502e-05, |
|
"loss": 0.7515, |
|
"mean_token_accuracy": 0.7716154993402936, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.8375878653348132, |
|
"grad_norm": 0.47966665029525757, |
|
"learning_rate": 1.8106390545469797e-05, |
|
"loss": 0.7601, |
|
"mean_token_accuracy": 0.7702052221245829, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.8405475397706252, |
|
"grad_norm": 0.5724716186523438, |
|
"learning_rate": 1.8086105125078858e-05, |
|
"loss": 0.7332, |
|
"mean_token_accuracy": 0.7777038447981673, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.8435072142064373, |
|
"grad_norm": 0.5578106641769409, |
|
"learning_rate": 1.8065723119410885e-05, |
|
"loss": 0.7302, |
|
"mean_token_accuracy": 0.7772090946791604, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8464668886422494, |
|
"grad_norm": 0.5442110896110535, |
|
"learning_rate": 1.804524477192075e-05, |
|
"loss": 0.7334, |
|
"mean_token_accuracy": 0.7762476620441784, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.8494265630780614, |
|
"grad_norm": 0.584141731262207, |
|
"learning_rate": 1.8024670327214084e-05, |
|
"loss": 0.7258, |
|
"mean_token_accuracy": 0.7806851593884065, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.8523862375138734, |
|
"grad_norm": 0.598616361618042, |
|
"learning_rate": 1.8004000031044363e-05, |
|
"loss": 0.7793, |
|
"mean_token_accuracy": 0.7645610353814324, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.8553459119496856, |
|
"grad_norm": 0.5531610250473022, |
|
"learning_rate": 1.798323413030997e-05, |
|
"loss": 0.7302, |
|
"mean_token_accuracy": 0.7774894874371842, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.8583055863854976, |
|
"grad_norm": 0.637056291103363, |
|
"learning_rate": 1.796237287305125e-05, |
|
"loss": 0.7319, |
|
"mean_token_accuracy": 0.776980981509457, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8612652608213096, |
|
"grad_norm": 0.526637613773346, |
|
"learning_rate": 1.7941416508447537e-05, |
|
"loss": 0.737, |
|
"mean_token_accuracy": 0.7755365142177981, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.8642249352571217, |
|
"grad_norm": 0.6117897033691406, |
|
"learning_rate": 1.792036528681418e-05, |
|
"loss": 0.7453, |
|
"mean_token_accuracy": 0.7738772998083994, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.8671846096929338, |
|
"grad_norm": 0.57455974817276, |
|
"learning_rate": 1.789921945959958e-05, |
|
"loss": 0.7293, |
|
"mean_token_accuracy": 0.7769571821797022, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.8701442841287459, |
|
"grad_norm": 0.5134701728820801, |
|
"learning_rate": 1.7877979279382135e-05, |
|
"loss": 0.7198, |
|
"mean_token_accuracy": 0.7810807816611623, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.8731039585645579, |
|
"grad_norm": 0.6354233026504517, |
|
"learning_rate": 1.7856644999867264e-05, |
|
"loss": 0.7491, |
|
"mean_token_accuracy": 0.7724234282097991, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8760636330003699, |
|
"grad_norm": 0.4881884753704071, |
|
"learning_rate": 1.783521687588437e-05, |
|
"loss": 0.6976, |
|
"mean_token_accuracy": 0.7884361038620284, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.8790233074361821, |
|
"grad_norm": 0.6362212300300598, |
|
"learning_rate": 1.781369516338378e-05, |
|
"loss": 0.7398, |
|
"mean_token_accuracy": 0.7755743850683346, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.8819829818719941, |
|
"grad_norm": 0.5661829710006714, |
|
"learning_rate": 1.779208011943371e-05, |
|
"loss": 0.734, |
|
"mean_token_accuracy": 0.7765507531646713, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.8849426563078061, |
|
"grad_norm": 0.5010657906532288, |
|
"learning_rate": 1.777037200221717e-05, |
|
"loss": 0.7388, |
|
"mean_token_accuracy": 0.7751515429566093, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"grad_norm": 0.6076653003692627, |
|
"learning_rate": 1.77485710710289e-05, |
|
"loss": 0.729, |
|
"mean_token_accuracy": 0.7784857584094641, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"eval_loss": 0.7613943219184875, |
|
"eval_mean_token_accuracy": 0.7661339070277478, |
|
"eval_runtime": 24.531, |
|
"eval_samples_per_second": 5.259, |
|
"eval_steps_per_second": 1.345, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8908620051794303, |
|
"grad_norm": 0.5315244197845459, |
|
"learning_rate": 1.7726677586272263e-05, |
|
"loss": 0.7247, |
|
"mean_token_accuracy": 0.7800706307954832, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.8938216796152423, |
|
"grad_norm": 0.572488009929657, |
|
"learning_rate": 1.7704691809456142e-05, |
|
"loss": 0.7619, |
|
"mean_token_accuracy": 0.7684274429192071, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8967813540510544, |
|
"grad_norm": 0.530282735824585, |
|
"learning_rate": 1.7682614003191807e-05, |
|
"loss": 0.7192, |
|
"mean_token_accuracy": 0.7826426067771499, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8997410284868664, |
|
"grad_norm": 0.4633922278881073, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.7361, |
|
"mean_token_accuracy": 0.7761481074845271, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.9027007029226785, |
|
"grad_norm": 0.5290641784667969, |
|
"learning_rate": 1.76381833582567e-05, |
|
"loss": 0.7347, |
|
"mean_token_accuracy": 0.7752220969695607, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.9056603773584906, |
|
"grad_norm": 0.5756820440292358, |
|
"learning_rate": 1.761583105029213e-05, |
|
"loss": 0.7091, |
|
"mean_token_accuracy": 0.7832374672534335, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.9086200517943026, |
|
"grad_norm": 0.4851895570755005, |
|
"learning_rate": 1.7593387774285412e-05, |
|
"loss": 0.7259, |
|
"mean_token_accuracy": 0.7790017695040672, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.9115797262301147, |
|
"grad_norm": 0.5287590026855469, |
|
"learning_rate": 1.7570853798312462e-05, |
|
"loss": 0.7234, |
|
"mean_token_accuracy": 0.7806430154123836, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.9145394006659268, |
|
"grad_norm": 0.5195660591125488, |
|
"learning_rate": 1.7548229391532572e-05, |
|
"loss": 0.6565, |
|
"mean_token_accuracy": 0.7984747483843323, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.9174990751017388, |
|
"grad_norm": 0.4991515576839447, |
|
"learning_rate": 1.7525514824185187e-05, |
|
"loss": 0.7231, |
|
"mean_token_accuracy": 0.7803891298617083, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9204587495375509, |
|
"grad_norm": 0.4935111701488495, |
|
"learning_rate": 1.750271036758669e-05, |
|
"loss": 0.7564, |
|
"mean_token_accuracy": 0.7712247656704234, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.9234184239733629, |
|
"grad_norm": 0.5220803618431091, |
|
"learning_rate": 1.747981629412715e-05, |
|
"loss": 0.7381, |
|
"mean_token_accuracy": 0.7754488466026199, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.926378098409175, |
|
"grad_norm": 0.4899723529815674, |
|
"learning_rate": 1.7456832877267083e-05, |
|
"loss": 0.7147, |
|
"mean_token_accuracy": 0.7830229071000929, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.9293377728449871, |
|
"grad_norm": 0.48553645610809326, |
|
"learning_rate": 1.7433760391534166e-05, |
|
"loss": 0.7249, |
|
"mean_token_accuracy": 0.7801764351541252, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.9322974472807991, |
|
"grad_norm": 0.5421589016914368, |
|
"learning_rate": 1.741059911251997e-05, |
|
"loss": 0.7398, |
|
"mean_token_accuracy": 0.7753942151228886, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9352571217166111, |
|
"grad_norm": 0.5142074823379517, |
|
"learning_rate": 1.7387349316876668e-05, |
|
"loss": 0.7213, |
|
"mean_token_accuracy": 0.7805064687638097, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.9382167961524233, |
|
"grad_norm": 0.4945102632045746, |
|
"learning_rate": 1.7364011282313732e-05, |
|
"loss": 0.713, |
|
"mean_token_accuracy": 0.7815959672421611, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.509762167930603, |
|
"learning_rate": 1.7340585287594605e-05, |
|
"loss": 0.7278, |
|
"mean_token_accuracy": 0.778527115017442, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.9441361450240473, |
|
"grad_norm": 0.5061408877372742, |
|
"learning_rate": 1.731707161253338e-05, |
|
"loss": 0.7646, |
|
"mean_token_accuracy": 0.7684516320654873, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.9470958194598594, |
|
"grad_norm": 0.4812653958797455, |
|
"learning_rate": 1.7293470537991463e-05, |
|
"loss": 0.7286, |
|
"mean_token_accuracy": 0.7783584589981216, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9500554938956715, |
|
"grad_norm": 0.5362148284912109, |
|
"learning_rate": 1.7269782345874204e-05, |
|
"loss": 0.7029, |
|
"mean_token_accuracy": 0.785544384259824, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.9530151683314836, |
|
"grad_norm": 0.5306621193885803, |
|
"learning_rate": 1.7246007319127547e-05, |
|
"loss": 0.747, |
|
"mean_token_accuracy": 0.774057502189317, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.9559748427672956, |
|
"grad_norm": 0.567263126373291, |
|
"learning_rate": 1.7222145741734625e-05, |
|
"loss": 0.7198, |
|
"mean_token_accuracy": 0.7807379482187227, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.9589345172031076, |
|
"grad_norm": 0.5175469517707825, |
|
"learning_rate": 1.7198197898712402e-05, |
|
"loss": 0.7275, |
|
"mean_token_accuracy": 0.7786112184337877, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.9618941916389198, |
|
"grad_norm": 0.5404612421989441, |
|
"learning_rate": 1.717416407610824e-05, |
|
"loss": 0.689, |
|
"mean_token_accuracy": 0.7877453794929681, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9648538660747318, |
|
"grad_norm": 0.5193690061569214, |
|
"learning_rate": 1.7150044560996488e-05, |
|
"loss": 0.747, |
|
"mean_token_accuracy": 0.7742212613379238, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.9678135405105438, |
|
"grad_norm": 0.4946900010108948, |
|
"learning_rate": 1.7125839641475074e-05, |
|
"loss": 0.7471, |
|
"mean_token_accuracy": 0.7747309622069193, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.9707732149463559, |
|
"grad_norm": 0.48158422112464905, |
|
"learning_rate": 1.7101549606662025e-05, |
|
"loss": 0.7588, |
|
"mean_token_accuracy": 0.7672773960785951, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.973732889382168, |
|
"grad_norm": 0.49433794617652893, |
|
"learning_rate": 1.7077174746692054e-05, |
|
"loss": 0.7086, |
|
"mean_token_accuracy": 0.7835172366515396, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.97669256381798, |
|
"grad_norm": 0.529739499092102, |
|
"learning_rate": 1.7052715352713076e-05, |
|
"loss": 0.692, |
|
"mean_token_accuracy": 0.7882518659447058, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9796522382537921, |
|
"grad_norm": 0.49609243869781494, |
|
"learning_rate": 1.7028171716882714e-05, |
|
"loss": 0.727, |
|
"mean_token_accuracy": 0.7790673878869031, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.9826119126896041, |
|
"grad_norm": 0.5060005784034729, |
|
"learning_rate": 1.7003544132364847e-05, |
|
"loss": 0.7492, |
|
"mean_token_accuracy": 0.7722196174397824, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.9855715871254163, |
|
"grad_norm": 0.5200058817863464, |
|
"learning_rate": 1.6978832893326074e-05, |
|
"loss": 0.7274, |
|
"mean_token_accuracy": 0.7771648765922762, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.9885312615612283, |
|
"grad_norm": 0.5111742615699768, |
|
"learning_rate": 1.6954038294932215e-05, |
|
"loss": 0.727, |
|
"mean_token_accuracy": 0.7788486720026189, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9914909359970403, |
|
"grad_norm": 0.49541163444519043, |
|
"learning_rate": 1.692916063334479e-05, |
|
"loss": 0.716, |
|
"mean_token_accuracy": 0.7805707677819913, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9944506104328524, |
|
"grad_norm": 0.5204536318778992, |
|
"learning_rate": 1.690420020571747e-05, |
|
"loss": 0.7857, |
|
"mean_token_accuracy": 0.7611835238050416, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9974102848686645, |
|
"grad_norm": 0.49425816535949707, |
|
"learning_rate": 1.6879157310192537e-05, |
|
"loss": 0.7237, |
|
"mean_token_accuracy": 0.7797621176940523, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.002959674435812, |
|
"grad_norm": 0.9215492010116577, |
|
"learning_rate": 1.685403224589731e-05, |
|
"loss": 1.431, |
|
"mean_token_accuracy": 0.781872374274613, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.005919348871624, |
|
"grad_norm": 0.4850497841835022, |
|
"learning_rate": 1.6828825312940594e-05, |
|
"loss": 0.7123, |
|
"mean_token_accuracy": 0.7815581594577298, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.0088790233074363, |
|
"grad_norm": 0.5388746857643127, |
|
"learning_rate": 1.6803536812409077e-05, |
|
"loss": 0.6533, |
|
"mean_token_accuracy": 0.7976729613611061, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0118386977432483, |
|
"grad_norm": 0.5414032340049744, |
|
"learning_rate": 1.6778167046363735e-05, |
|
"loss": 0.663, |
|
"mean_token_accuracy": 0.7950990029075803, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.0147983721790603, |
|
"grad_norm": 0.5482701063156128, |
|
"learning_rate": 1.675271631783623e-05, |
|
"loss": 0.6924, |
|
"mean_token_accuracy": 0.7870997024486296, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.0177580466148723, |
|
"grad_norm": 0.5530447363853455, |
|
"learning_rate": 1.672718493082529e-05, |
|
"loss": 0.6957, |
|
"mean_token_accuracy": 0.7862520808317638, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.0207177210506844, |
|
"grad_norm": 0.5601862072944641, |
|
"learning_rate": 1.6701573190293076e-05, |
|
"loss": 0.7079, |
|
"mean_token_accuracy": 0.7811090177290159, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.0236773954864964, |
|
"grad_norm": 0.5983414649963379, |
|
"learning_rate": 1.667588140216154e-05, |
|
"loss": 0.7177, |
|
"mean_token_accuracy": 0.7782319335787533, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.0266370699223086, |
|
"grad_norm": 0.5023918747901917, |
|
"learning_rate": 1.6650109873308763e-05, |
|
"loss": 0.6742, |
|
"mean_token_accuracy": 0.7925658601690396, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.0295967443581207, |
|
"grad_norm": 0.5499829053878784, |
|
"learning_rate": 1.6624258911565312e-05, |
|
"loss": 0.6964, |
|
"mean_token_accuracy": 0.7845868210400818, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.0325564187939327, |
|
"grad_norm": 0.6044626235961914, |
|
"learning_rate": 1.6598328825710536e-05, |
|
"loss": 0.7433, |
|
"mean_token_accuracy": 0.7716598489636504, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.0355160932297447, |
|
"grad_norm": 0.5895024538040161, |
|
"learning_rate": 1.6572319925468892e-05, |
|
"loss": 0.6851, |
|
"mean_token_accuracy": 0.7886055642998372, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.0384757676655567, |
|
"grad_norm": 0.4884833097457886, |
|
"learning_rate": 1.654623252150624e-05, |
|
"loss": 0.6874, |
|
"mean_token_accuracy": 0.7882489689414884, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0414354421013687, |
|
"grad_norm": 0.48958876729011536, |
|
"learning_rate": 1.6520066925426146e-05, |
|
"loss": 0.6761, |
|
"mean_token_accuracy": 0.789869173725892, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.044395116537181, |
|
"grad_norm": 0.5143749713897705, |
|
"learning_rate": 1.6493823449766137e-05, |
|
"loss": 0.7002, |
|
"mean_token_accuracy": 0.7832564985016889, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.047354790972993, |
|
"grad_norm": 0.5188062191009521, |
|
"learning_rate": 1.6467502407993995e-05, |
|
"loss": 0.6785, |
|
"mean_token_accuracy": 0.7895198082299716, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.050314465408805, |
|
"grad_norm": 0.5853990316390991, |
|
"learning_rate": 1.644110411450398e-05, |
|
"loss": 0.7027, |
|
"mean_token_accuracy": 0.7840915967094005, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.053274139844617, |
|
"grad_norm": 0.48951801657676697, |
|
"learning_rate": 1.6414628884613106e-05, |
|
"loss": 0.6905, |
|
"mean_token_accuracy": 0.7872202318165091, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.056233814280429, |
|
"grad_norm": 0.5374004244804382, |
|
"learning_rate": 1.6388077034557355e-05, |
|
"loss": 0.7107, |
|
"mean_token_accuracy": 0.7806436850766835, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.0591934887162413, |
|
"grad_norm": 0.49236002564430237, |
|
"learning_rate": 1.6361448881487913e-05, |
|
"loss": 0.6762, |
|
"mean_token_accuracy": 0.7917445809376139, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.0621531631520533, |
|
"grad_norm": 0.4819602966308594, |
|
"learning_rate": 1.6334744743467366e-05, |
|
"loss": 0.6876, |
|
"mean_token_accuracy": 0.7879321033092377, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.0651128375878653, |
|
"grad_norm": 0.47309836745262146, |
|
"learning_rate": 1.6307964939465914e-05, |
|
"loss": 0.684, |
|
"mean_token_accuracy": 0.7893314943134146, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.0680725120236774, |
|
"grad_norm": 0.5006982088088989, |
|
"learning_rate": 1.628110978935756e-05, |
|
"loss": 0.6899, |
|
"mean_token_accuracy": 0.7870876825021131, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0710321864594894, |
|
"grad_norm": 0.5221154093742371, |
|
"learning_rate": 1.625417961391628e-05, |
|
"loss": 0.6475, |
|
"mean_token_accuracy": 0.7990545634414727, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.0739918608953016, |
|
"grad_norm": 0.4775597155094147, |
|
"learning_rate": 1.62271747348122e-05, |
|
"loss": 0.6934, |
|
"mean_token_accuracy": 0.787116997295676, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.0769515353311137, |
|
"grad_norm": 0.5393570065498352, |
|
"learning_rate": 1.6200095474607753e-05, |
|
"loss": 0.6892, |
|
"mean_token_accuracy": 0.7863585652394626, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.0799112097669257, |
|
"grad_norm": 0.4533829689025879, |
|
"learning_rate": 1.6172942156753822e-05, |
|
"loss": 0.6737, |
|
"mean_token_accuracy": 0.791843095021805, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.0828708842027377, |
|
"grad_norm": 0.462872177362442, |
|
"learning_rate": 1.614571510558588e-05, |
|
"loss": 0.6741, |
|
"mean_token_accuracy": 0.7927564512367392, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.0858305586385497, |
|
"grad_norm": 0.5344141125679016, |
|
"learning_rate": 1.6118414646320115e-05, |
|
"loss": 0.678, |
|
"mean_token_accuracy": 0.7914964738663861, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.0887902330743617, |
|
"grad_norm": 0.5266002416610718, |
|
"learning_rate": 1.6091041105049542e-05, |
|
"loss": 0.6946, |
|
"mean_token_accuracy": 0.7852726685975778, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.091749907510174, |
|
"grad_norm": 0.4648328125476837, |
|
"learning_rate": 1.6063594808740112e-05, |
|
"loss": 0.6415, |
|
"mean_token_accuracy": 0.8008673556038499, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.094709581945986, |
|
"grad_norm": 0.5501207709312439, |
|
"learning_rate": 1.6036076085226813e-05, |
|
"loss": 0.7327, |
|
"mean_token_accuracy": 0.7737077885315848, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.097669256381798, |
|
"grad_norm": 0.49827733635902405, |
|
"learning_rate": 1.6008485263209742e-05, |
|
"loss": 0.6509, |
|
"mean_token_accuracy": 0.7995274953751699, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.10062893081761, |
|
"grad_norm": 0.4650176465511322, |
|
"learning_rate": 1.598082267225018e-05, |
|
"loss": 0.7112, |
|
"mean_token_accuracy": 0.7804922990268738, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.103588605253422, |
|
"grad_norm": 0.5303501486778259, |
|
"learning_rate": 1.595308864276666e-05, |
|
"loss": 0.7211, |
|
"mean_token_accuracy": 0.7776063180667486, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.106548279689234, |
|
"grad_norm": 0.5931088924407959, |
|
"learning_rate": 1.592528350603103e-05, |
|
"loss": 0.6912, |
|
"mean_token_accuracy": 0.7860275624390939, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.1095079541250463, |
|
"grad_norm": 0.464376300573349, |
|
"learning_rate": 1.5897407594164468e-05, |
|
"loss": 0.6996, |
|
"mean_token_accuracy": 0.7857896692996122, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.1124676285608583, |
|
"grad_norm": 0.5060982704162598, |
|
"learning_rate": 1.586946124013354e-05, |
|
"loss": 0.6827, |
|
"mean_token_accuracy": 0.7901175041980462, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.1154273029966704, |
|
"grad_norm": 0.5316497683525085, |
|
"learning_rate": 1.5841444777746232e-05, |
|
"loss": 0.6454, |
|
"mean_token_accuracy": 0.7995927306906477, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.1183869774324824, |
|
"grad_norm": 0.5280824303627014, |
|
"learning_rate": 1.5813358541647915e-05, |
|
"loss": 0.6821, |
|
"mean_token_accuracy": 0.7899257721771863, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.1213466518682944, |
|
"grad_norm": 0.4961848258972168, |
|
"learning_rate": 1.578520286731741e-05, |
|
"loss": 0.7106, |
|
"mean_token_accuracy": 0.7801769327002734, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.1243063263041067, |
|
"grad_norm": 0.543953001499176, |
|
"learning_rate": 1.575697809106292e-05, |
|
"loss": 0.6922, |
|
"mean_token_accuracy": 0.785628822049384, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.1272660007399187, |
|
"grad_norm": 0.5489509105682373, |
|
"learning_rate": 1.5728684550018066e-05, |
|
"loss": 0.6936, |
|
"mean_token_accuracy": 0.7861259742540445, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1302256751757307, |
|
"grad_norm": 0.48247000575065613, |
|
"learning_rate": 1.570032258213783e-05, |
|
"loss": 0.702, |
|
"mean_token_accuracy": 0.781727569386528, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.1331853496115427, |
|
"grad_norm": 0.5495713949203491, |
|
"learning_rate": 1.5671892526194515e-05, |
|
"loss": 0.6792, |
|
"mean_token_accuracy": 0.7919662989910665, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.1361450240473547, |
|
"grad_norm": 0.4841765761375427, |
|
"learning_rate": 1.564339472177373e-05, |
|
"loss": 0.6693, |
|
"mean_token_accuracy": 0.7934251880120227, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.1391046984831668, |
|
"grad_norm": 0.5036046504974365, |
|
"learning_rate": 1.561482950927029e-05, |
|
"loss": 0.7035, |
|
"mean_token_accuracy": 0.7822988951176773, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.142064372918979, |
|
"grad_norm": 0.550046443939209, |
|
"learning_rate": 1.5586197229884185e-05, |
|
"loss": 0.6558, |
|
"mean_token_accuracy": 0.797441361172838, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.145024047354791, |
|
"grad_norm": 0.5752468705177307, |
|
"learning_rate": 1.5557498225616488e-05, |
|
"loss": 0.7081, |
|
"mean_token_accuracy": 0.7824781572463329, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.147983721790603, |
|
"grad_norm": 0.4782570004463196, |
|
"learning_rate": 1.5528732839265272e-05, |
|
"loss": 0.7, |
|
"mean_token_accuracy": 0.7834877131177364, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.150943396226415, |
|
"grad_norm": 0.5209779739379883, |
|
"learning_rate": 1.549990141442153e-05, |
|
"loss": 0.6823, |
|
"mean_token_accuracy": 0.7903034725828352, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.153903070662227, |
|
"grad_norm": 0.510071337223053, |
|
"learning_rate": 1.5471004295465034e-05, |
|
"loss": 0.7337, |
|
"mean_token_accuracy": 0.7748414033827098, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.156862745098039, |
|
"grad_norm": 0.5067256689071655, |
|
"learning_rate": 1.5442041827560274e-05, |
|
"loss": 0.6945, |
|
"mean_token_accuracy": 0.7857010244801683, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.1598224195338513, |
|
"grad_norm": 0.5134366154670715, |
|
"learning_rate": 1.5413014356652287e-05, |
|
"loss": 0.6761, |
|
"mean_token_accuracy": 0.7901567665550651, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.1627820939696634, |
|
"grad_norm": 0.49565669894218445, |
|
"learning_rate": 1.538392222946255e-05, |
|
"loss": 0.6992, |
|
"mean_token_accuracy": 0.7850131511442856, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.1657417684054754, |
|
"grad_norm": 0.4513917565345764, |
|
"learning_rate": 1.5354765793484834e-05, |
|
"loss": 0.6779, |
|
"mean_token_accuracy": 0.7922368459696144, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.1687014428412874, |
|
"grad_norm": 0.5351982116699219, |
|
"learning_rate": 1.5325545396981053e-05, |
|
"loss": 0.6937, |
|
"mean_token_accuracy": 0.7857501806841758, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.1716611172770994, |
|
"grad_norm": 0.47825103998184204, |
|
"learning_rate": 1.5296261388977107e-05, |
|
"loss": 0.629, |
|
"mean_token_accuracy": 0.8047603633681424, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.1746207917129117, |
|
"grad_norm": 0.48426443338394165, |
|
"learning_rate": 1.52669141192587e-05, |
|
"loss": 0.7218, |
|
"mean_token_accuracy": 0.7786340167760629, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.1775804661487237, |
|
"grad_norm": 0.510691225528717, |
|
"learning_rate": 1.5237503938367186e-05, |
|
"loss": 0.6961, |
|
"mean_token_accuracy": 0.7848220716497867, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.1805401405845357, |
|
"grad_norm": 0.4977818727493286, |
|
"learning_rate": 1.5208031197595357e-05, |
|
"loss": 0.6181, |
|
"mean_token_accuracy": 0.808352793166422, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.1834998150203477, |
|
"grad_norm": 0.45590656995773315, |
|
"learning_rate": 1.5178496248983254e-05, |
|
"loss": 0.6445, |
|
"mean_token_accuracy": 0.7992991854336597, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.1864594894561598, |
|
"grad_norm": 0.5166680812835693, |
|
"learning_rate": 1.5148899445313983e-05, |
|
"loss": 0.6391, |
|
"mean_token_accuracy": 0.8008235442677688, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1864594894561598, |
|
"eval_loss": 0.753233015537262, |
|
"eval_mean_token_accuracy": 0.7678493271850204, |
|
"eval_runtime": 24.4762, |
|
"eval_samples_per_second": 5.27, |
|
"eval_steps_per_second": 1.348, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.189419163891972, |
|
"grad_norm": 0.4777900278568268, |
|
"learning_rate": 1.5119241140109466e-05, |
|
"loss": 0.6447, |
|
"mean_token_accuracy": 0.8008284367677996, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.192378838327784, |
|
"grad_norm": 0.4674142301082611, |
|
"learning_rate": 1.5089521687626243e-05, |
|
"loss": 0.6426, |
|
"mean_token_accuracy": 0.8002595216069462, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.195338512763596, |
|
"grad_norm": 0.5119103789329529, |
|
"learning_rate": 1.505974144285124e-05, |
|
"loss": 0.7143, |
|
"mean_token_accuracy": 0.7807192708136647, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.198298187199408, |
|
"grad_norm": 0.5238728523254395, |
|
"learning_rate": 1.5029900761497507e-05, |
|
"loss": 0.7459, |
|
"mean_token_accuracy": 0.7719988622051683, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.20125786163522, |
|
"grad_norm": 0.5216233134269714, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.6977, |
|
"mean_token_accuracy": 0.7839726890839798, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.204217536071032, |
|
"grad_norm": 0.509964108467102, |
|
"learning_rate": 1.4970039515511303e-05, |
|
"loss": 0.6809, |
|
"mean_token_accuracy": 0.7893634011753464, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.2071772105068441, |
|
"grad_norm": 0.5653720498085022, |
|
"learning_rate": 1.4940019665897363e-05, |
|
"loss": 0.6897, |
|
"mean_token_accuracy": 0.7868935096910736, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.2101368849426564, |
|
"grad_norm": 0.4962683618068695, |
|
"learning_rate": 1.4909940809733223e-05, |
|
"loss": 0.7354, |
|
"mean_token_accuracy": 0.7726758488051101, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.2130965593784684, |
|
"grad_norm": 0.5176084637641907, |
|
"learning_rate": 1.4879803306298736e-05, |
|
"loss": 0.6964, |
|
"mean_token_accuracy": 0.7838358295177021, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.2160562338142804, |
|
"grad_norm": 0.513697624206543, |
|
"learning_rate": 1.4849607515574276e-05, |
|
"loss": 0.6492, |
|
"mean_token_accuracy": 0.799568203590832, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2190159082500924, |
|
"grad_norm": 0.4567902684211731, |
|
"learning_rate": 1.4819353798236427e-05, |
|
"loss": 0.6991, |
|
"mean_token_accuracy": 0.7838256081866393, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.2219755826859044, |
|
"grad_norm": 0.5139224529266357, |
|
"learning_rate": 1.4789042515653687e-05, |
|
"loss": 0.6946, |
|
"mean_token_accuracy": 0.7852177162018236, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.2249352571217167, |
|
"grad_norm": 0.5555658936500549, |
|
"learning_rate": 1.4758674029882152e-05, |
|
"loss": 0.6539, |
|
"mean_token_accuracy": 0.7970551349204403, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.2278949315575287, |
|
"grad_norm": 0.4890614449977875, |
|
"learning_rate": 1.4728248703661183e-05, |
|
"loss": 0.695, |
|
"mean_token_accuracy": 0.7845206023697728, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.2308546059933407, |
|
"grad_norm": 0.47974392771720886, |
|
"learning_rate": 1.4697766900409076e-05, |
|
"loss": 0.669, |
|
"mean_token_accuracy": 0.7929167835356624, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.2338142804291528, |
|
"grad_norm": 0.5015913248062134, |
|
"learning_rate": 1.466722898421873e-05, |
|
"loss": 0.7009, |
|
"mean_token_accuracy": 0.7827139356082893, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.2367739548649648, |
|
"grad_norm": 0.49240073561668396, |
|
"learning_rate": 1.4636635319853274e-05, |
|
"loss": 0.6685, |
|
"mean_token_accuracy": 0.792534979177688, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.239733629300777, |
|
"grad_norm": 0.48550987243652344, |
|
"learning_rate": 1.4605986272741748e-05, |
|
"loss": 0.6908, |
|
"mean_token_accuracy": 0.7868828026774352, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.242693303736589, |
|
"grad_norm": 0.47983378171920776, |
|
"learning_rate": 1.4575282208974704e-05, |
|
"loss": 0.6831, |
|
"mean_token_accuracy": 0.7891199345178915, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.245652978172401, |
|
"grad_norm": 0.49261724948883057, |
|
"learning_rate": 1.4544523495299843e-05, |
|
"loss": 0.6831, |
|
"mean_token_accuracy": 0.7881435108832517, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.248612652608213, |
|
"grad_norm": 0.47099459171295166, |
|
"learning_rate": 1.4513710499117648e-05, |
|
"loss": 0.6307, |
|
"mean_token_accuracy": 0.8053076982273811, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.251572327044025, |
|
"grad_norm": 0.4534473121166229, |
|
"learning_rate": 1.4482843588476976e-05, |
|
"loss": 0.6953, |
|
"mean_token_accuracy": 0.7836745290375378, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.2545320014798373, |
|
"grad_norm": 0.4827975630760193, |
|
"learning_rate": 1.445192313207067e-05, |
|
"loss": 0.6769, |
|
"mean_token_accuracy": 0.7917014445996506, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.2574916759156491, |
|
"grad_norm": 0.48446017503738403, |
|
"learning_rate": 1.4420949499231172e-05, |
|
"loss": 0.6811, |
|
"mean_token_accuracy": 0.7885621949952477, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.2604513503514614, |
|
"grad_norm": 0.46176275610923767, |
|
"learning_rate": 1.4389923059926064e-05, |
|
"loss": 0.6715, |
|
"mean_token_accuracy": 0.7921377530314322, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.2634110247872734, |
|
"grad_norm": 0.4933745265007019, |
|
"learning_rate": 1.4358844184753713e-05, |
|
"loss": 0.6516, |
|
"mean_token_accuracy": 0.7976899559939264, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.2663706992230854, |
|
"grad_norm": 0.4907665252685547, |
|
"learning_rate": 1.432771324493879e-05, |
|
"loss": 0.675, |
|
"mean_token_accuracy": 0.7905862204832549, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.2693303736588974, |
|
"grad_norm": 0.4861429035663605, |
|
"learning_rate": 1.4296530612327864e-05, |
|
"loss": 0.7044, |
|
"mean_token_accuracy": 0.782618434308195, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.2722900480947095, |
|
"grad_norm": 0.44409534335136414, |
|
"learning_rate": 1.4265296659384956e-05, |
|
"loss": 0.702, |
|
"mean_token_accuracy": 0.7835227926569839, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.2752497225305217, |
|
"grad_norm": 0.47325289249420166, |
|
"learning_rate": 1.4234011759187084e-05, |
|
"loss": 0.6907, |
|
"mean_token_accuracy": 0.7883719669584818, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.2782093969663337, |
|
"grad_norm": 0.4296591281890869, |
|
"learning_rate": 1.4202676285419811e-05, |
|
"loss": 0.6445, |
|
"mean_token_accuracy": 0.799964374790151, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.2811690714021458, |
|
"grad_norm": 0.4680195152759552, |
|
"learning_rate": 1.4171290612372781e-05, |
|
"loss": 0.6913, |
|
"mean_token_accuracy": 0.7865936068853461, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.2841287458379578, |
|
"grad_norm": 0.47732165455818176, |
|
"learning_rate": 1.4139855114935253e-05, |
|
"loss": 0.665, |
|
"mean_token_accuracy": 0.795472867454343, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.2870884202737698, |
|
"grad_norm": 0.44656407833099365, |
|
"learning_rate": 1.410837016859161e-05, |
|
"loss": 0.6747, |
|
"mean_token_accuracy": 0.790485626527416, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.290048094709582, |
|
"grad_norm": 0.4626164734363556, |
|
"learning_rate": 1.4076836149416889e-05, |
|
"loss": 0.6591, |
|
"mean_token_accuracy": 0.7963842598244837, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.293007769145394, |
|
"grad_norm": 0.4850873053073883, |
|
"learning_rate": 1.4045253434072278e-05, |
|
"loss": 0.7126, |
|
"mean_token_accuracy": 0.7804075548829805, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.295967443581206, |
|
"grad_norm": 0.4946662187576294, |
|
"learning_rate": 1.4013622399800628e-05, |
|
"loss": 0.7237, |
|
"mean_token_accuracy": 0.777694595209445, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.298927118017018, |
|
"grad_norm": 0.515221893787384, |
|
"learning_rate": 1.3981943424421932e-05, |
|
"loss": 0.6982, |
|
"mean_token_accuracy": 0.784025918890703, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.3018867924528301, |
|
"grad_norm": 0.4743560552597046, |
|
"learning_rate": 1.3950216886328818e-05, |
|
"loss": 0.698, |
|
"mean_token_accuracy": 0.7843463257420568, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.3048464668886424, |
|
"grad_norm": 0.47368329763412476, |
|
"learning_rate": 1.3918443164482048e-05, |
|
"loss": 0.6961, |
|
"mean_token_accuracy": 0.7865385891914267, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3078061413244544, |
|
"grad_norm": 0.4459000825881958, |
|
"learning_rate": 1.3886622638405953e-05, |
|
"loss": 0.6955, |
|
"mean_token_accuracy": 0.7852747333942596, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.3107658157602664, |
|
"grad_norm": 0.47365012764930725, |
|
"learning_rate": 1.3854755688183941e-05, |
|
"loss": 0.7227, |
|
"mean_token_accuracy": 0.7778711159999969, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.3137254901960784, |
|
"grad_norm": 0.46061503887176514, |
|
"learning_rate": 1.3822842694453923e-05, |
|
"loss": 0.6885, |
|
"mean_token_accuracy": 0.7876893449725652, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.3166851646318904, |
|
"grad_norm": 0.4780057370662689, |
|
"learning_rate": 1.3790884038403796e-05, |
|
"loss": 0.6911, |
|
"mean_token_accuracy": 0.7863533950002012, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.3196448390677027, |
|
"grad_norm": 0.48519885540008545, |
|
"learning_rate": 1.375888010176686e-05, |
|
"loss": 0.6666, |
|
"mean_token_accuracy": 0.7935298420501086, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.3226045135035145, |
|
"grad_norm": 0.4679955840110779, |
|
"learning_rate": 1.3726831266817278e-05, |
|
"loss": 0.6885, |
|
"mean_token_accuracy": 0.7879594429456447, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.3255641879393267, |
|
"grad_norm": 0.4626809060573578, |
|
"learning_rate": 1.3694737916365517e-05, |
|
"loss": 0.7021, |
|
"mean_token_accuracy": 0.7828708121314737, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.3285238623751388, |
|
"grad_norm": 0.45779362320899963, |
|
"learning_rate": 1.3662600433753746e-05, |
|
"loss": 0.6896, |
|
"mean_token_accuracy": 0.7876785995413643, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.3314835368109508, |
|
"grad_norm": 0.4595906436443329, |
|
"learning_rate": 1.3630419202851287e-05, |
|
"loss": 0.6979, |
|
"mean_token_accuracy": 0.7838014568334657, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.3344432112467628, |
|
"grad_norm": 0.4679829776287079, |
|
"learning_rate": 1.3598194608050011e-05, |
|
"loss": 0.7047, |
|
"mean_token_accuracy": 0.7832954223966397, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3374028856825748, |
|
"grad_norm": 0.49509483575820923, |
|
"learning_rate": 1.3565927034259757e-05, |
|
"loss": 0.6956, |
|
"mean_token_accuracy": 0.7861987291079401, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.340362560118387, |
|
"grad_norm": 0.47606754302978516, |
|
"learning_rate": 1.3533616866903736e-05, |
|
"loss": 0.6774, |
|
"mean_token_accuracy": 0.7900551101111528, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.343322234554199, |
|
"grad_norm": 0.44316449761390686, |
|
"learning_rate": 1.3501264491913909e-05, |
|
"loss": 0.7, |
|
"mean_token_accuracy": 0.7830548189627489, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.346281908990011, |
|
"grad_norm": 0.499174028635025, |
|
"learning_rate": 1.3468870295726399e-05, |
|
"loss": 0.7203, |
|
"mean_token_accuracy": 0.7776105610712533, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.3492415834258231, |
|
"grad_norm": 0.43882501125335693, |
|
"learning_rate": 1.3436434665276865e-05, |
|
"loss": 0.6745, |
|
"mean_token_accuracy": 0.7913862306577221, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.3522012578616351, |
|
"grad_norm": 0.49250712990760803, |
|
"learning_rate": 1.3403957987995884e-05, |
|
"loss": 0.68, |
|
"mean_token_accuracy": 0.7894371521316413, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.3551609322974474, |
|
"grad_norm": 0.46765249967575073, |
|
"learning_rate": 1.3371440651804313e-05, |
|
"loss": 0.7066, |
|
"mean_token_accuracy": 0.7817244510128959, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.3581206067332594, |
|
"grad_norm": 0.46519362926483154, |
|
"learning_rate": 1.3338883045108674e-05, |
|
"loss": 0.6852, |
|
"mean_token_accuracy": 0.7875893561938507, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.3610802811690714, |
|
"grad_norm": 0.5211879014968872, |
|
"learning_rate": 1.3306285556796494e-05, |
|
"loss": 0.6873, |
|
"mean_token_accuracy": 0.7886326578047633, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.3640399556048834, |
|
"grad_norm": 0.4436584413051605, |
|
"learning_rate": 1.327364857623168e-05, |
|
"loss": 0.7006, |
|
"mean_token_accuracy": 0.7844141672519914, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3669996300406955, |
|
"grad_norm": 0.49897250533103943, |
|
"learning_rate": 1.3240972493249846e-05, |
|
"loss": 0.6907, |
|
"mean_token_accuracy": 0.7872768784393989, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.3699593044765077, |
|
"grad_norm": 0.44192755222320557, |
|
"learning_rate": 1.3208257698153677e-05, |
|
"loss": 0.7179, |
|
"mean_token_accuracy": 0.7772223223597873, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.3729189789123195, |
|
"grad_norm": 0.48224934935569763, |
|
"learning_rate": 1.3175504581708261e-05, |
|
"loss": 0.6884, |
|
"mean_token_accuracy": 0.7876441851387866, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.3758786533481318, |
|
"grad_norm": 0.44167572259902954, |
|
"learning_rate": 1.3142713535136413e-05, |
|
"loss": 0.6964, |
|
"mean_token_accuracy": 0.7840998538649302, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.3788383277839438, |
|
"grad_norm": 0.5188360214233398, |
|
"learning_rate": 1.3109884950114007e-05, |
|
"loss": 0.6979, |
|
"mean_token_accuracy": 0.7830517429111471, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.3817980022197558, |
|
"grad_norm": 0.4949224293231964, |
|
"learning_rate": 1.3077019218765306e-05, |
|
"loss": 0.6686, |
|
"mean_token_accuracy": 0.7925575804293147, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.3847576766555678, |
|
"grad_norm": 0.4614505171775818, |
|
"learning_rate": 1.3044116733658261e-05, |
|
"loss": 0.6745, |
|
"mean_token_accuracy": 0.7904813977673216, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.3877173510913798, |
|
"grad_norm": 0.47585147619247437, |
|
"learning_rate": 1.3011177887799846e-05, |
|
"loss": 0.6596, |
|
"mean_token_accuracy": 0.7969142283708234, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.390677025527192, |
|
"grad_norm": 0.4733677804470062, |
|
"learning_rate": 1.2978203074631335e-05, |
|
"loss": 0.6837, |
|
"mean_token_accuracy": 0.7885936546719822, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.393636699963004, |
|
"grad_norm": 0.47128206491470337, |
|
"learning_rate": 1.2945192688023625e-05, |
|
"loss": 0.7228, |
|
"mean_token_accuracy": 0.777582654281462, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.3965963743988161, |
|
"grad_norm": 0.5573126077651978, |
|
"learning_rate": 1.2912147122272523e-05, |
|
"loss": 0.692, |
|
"mean_token_accuracy": 0.7851007004118511, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.3995560488346281, |
|
"grad_norm": 0.5249556303024292, |
|
"learning_rate": 1.287906677209403e-05, |
|
"loss": 0.666, |
|
"mean_token_accuracy": 0.7935855307222649, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.4025157232704402, |
|
"grad_norm": 0.5098072290420532, |
|
"learning_rate": 1.2845952032619651e-05, |
|
"loss": 0.7169, |
|
"mean_token_accuracy": 0.78048614348136, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.4054753977062524, |
|
"grad_norm": 0.5147253274917603, |
|
"learning_rate": 1.2812803299391629e-05, |
|
"loss": 0.7285, |
|
"mean_token_accuracy": 0.775834970458234, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.4084350721420644, |
|
"grad_norm": 0.529493510723114, |
|
"learning_rate": 1.2779620968358276e-05, |
|
"loss": 0.6582, |
|
"mean_token_accuracy": 0.7956748329946638, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.4113947465778764, |
|
"grad_norm": 0.5070955753326416, |
|
"learning_rate": 1.2746405435869198e-05, |
|
"loss": 0.6674, |
|
"mean_token_accuracy": 0.7915634181103908, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.4143544210136885, |
|
"grad_norm": 0.5139186978340149, |
|
"learning_rate": 1.271315709867059e-05, |
|
"loss": 0.7037, |
|
"mean_token_accuracy": 0.7825460416635028, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.4173140954495005, |
|
"grad_norm": 0.5307909250259399, |
|
"learning_rate": 1.2679876353900482e-05, |
|
"loss": 0.7082, |
|
"mean_token_accuracy": 0.7814352090483259, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.4202737698853127, |
|
"grad_norm": 0.4850543737411499, |
|
"learning_rate": 1.2646563599083997e-05, |
|
"loss": 0.724, |
|
"mean_token_accuracy": 0.7763536423746681, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.4232334443211248, |
|
"grad_norm": 0.5001718997955322, |
|
"learning_rate": 1.2613219232128608e-05, |
|
"loss": 0.6629, |
|
"mean_token_accuracy": 0.7942459104666942, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.4261931187569368, |
|
"grad_norm": 0.5056073069572449, |
|
"learning_rate": 1.2579843651319382e-05, |
|
"loss": 0.7331, |
|
"mean_token_accuracy": 0.7724445151223609, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.4291527931927488, |
|
"grad_norm": 0.5267237424850464, |
|
"learning_rate": 1.2546437255314223e-05, |
|
"loss": 0.6659, |
|
"mean_token_accuracy": 0.7943264511441203, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.4321124676285608, |
|
"grad_norm": 0.4923066794872284, |
|
"learning_rate": 1.2513000443139112e-05, |
|
"loss": 0.693, |
|
"mean_token_accuracy": 0.7847285183122921, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.435072142064373, |
|
"grad_norm": 0.4452427327632904, |
|
"learning_rate": 1.2479533614183334e-05, |
|
"loss": 0.6783, |
|
"mean_token_accuracy": 0.790767397651227, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.4380318165001849, |
|
"grad_norm": 0.4807162582874298, |
|
"learning_rate": 1.2446037168194716e-05, |
|
"loss": 0.6951, |
|
"mean_token_accuracy": 0.7842417519133703, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.440991490935997, |
|
"grad_norm": 0.4858757257461548, |
|
"learning_rate": 1.2412511505274845e-05, |
|
"loss": 0.6602, |
|
"mean_token_accuracy": 0.7962518182176112, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.4439511653718091, |
|
"grad_norm": 0.4663830101490021, |
|
"learning_rate": 1.23789570258743e-05, |
|
"loss": 0.6951, |
|
"mean_token_accuracy": 0.7839527031401198, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.4469108398076211, |
|
"grad_norm": 0.4759344160556793, |
|
"learning_rate": 1.2345374130787855e-05, |
|
"loss": 0.6925, |
|
"mean_token_accuracy": 0.7861855601757001, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.4498705142434332, |
|
"grad_norm": 0.44426658749580383, |
|
"learning_rate": 1.23117632211497e-05, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.7964251152169285, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.4528301886792452, |
|
"grad_norm": 0.4644084870815277, |
|
"learning_rate": 1.2278124698428643e-05, |
|
"loss": 0.6848, |
|
"mean_token_accuracy": 0.7871725512533235, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.4557898631150574, |
|
"grad_norm": 0.43534740805625916, |
|
"learning_rate": 1.2244458964423328e-05, |
|
"loss": 0.6952, |
|
"mean_token_accuracy": 0.7838933476240588, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.4587495375508694, |
|
"grad_norm": 0.4578785300254822, |
|
"learning_rate": 1.221076642125742e-05, |
|
"loss": 0.6912, |
|
"mean_token_accuracy": 0.7867050710099383, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.4617092119866815, |
|
"grad_norm": 0.46426481008529663, |
|
"learning_rate": 1.2177047471374808e-05, |
|
"loss": 0.6679, |
|
"mean_token_accuracy": 0.793821778161506, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.4646688864224935, |
|
"grad_norm": 0.4668942391872406, |
|
"learning_rate": 1.214330251753481e-05, |
|
"loss": 0.6788, |
|
"mean_token_accuracy": 0.7911113494359255, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.4676285608583055, |
|
"grad_norm": 0.4524623155593872, |
|
"learning_rate": 1.2109531962807333e-05, |
|
"loss": 0.657, |
|
"mean_token_accuracy": 0.7968866396266425, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.4540092945098877, |
|
"learning_rate": 1.207573621056809e-05, |
|
"loss": 0.6779, |
|
"mean_token_accuracy": 0.79133374474269, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.4735479097299298, |
|
"grad_norm": 0.4721427857875824, |
|
"learning_rate": 1.2041915664493763e-05, |
|
"loss": 0.7114, |
|
"mean_token_accuracy": 0.7811596412077128, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.4765075841657418, |
|
"grad_norm": 0.45745474100112915, |
|
"learning_rate": 1.2008070728557186e-05, |
|
"loss": 0.6946, |
|
"mean_token_accuracy": 0.7835979713892247, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.4794672586015538, |
|
"grad_norm": 0.45184969902038574, |
|
"learning_rate": 1.1974201807022525e-05, |
|
"loss": 0.6594, |
|
"mean_token_accuracy": 0.7954918143409643, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.4824269330373658, |
|
"grad_norm": 0.43299737572669983, |
|
"learning_rate": 1.1940309304440434e-05, |
|
"loss": 0.655, |
|
"mean_token_accuracy": 0.7961995893943149, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4824269330373658, |
|
"eval_loss": 0.7452248930931091, |
|
"eval_mean_token_accuracy": 0.7696687843740262, |
|
"eval_runtime": 24.4738, |
|
"eval_samples_per_second": 5.271, |
|
"eval_steps_per_second": 1.348, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.485386607473178, |
|
"grad_norm": 0.4329541325569153, |
|
"learning_rate": 1.1906393625643244e-05, |
|
"loss": 0.6908, |
|
"mean_token_accuracy": 0.787461052002391, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.4883462819089899, |
|
"grad_norm": 0.44818833470344543, |
|
"learning_rate": 1.1872455175740111e-05, |
|
"loss": 0.7038, |
|
"mean_token_accuracy": 0.7827824467497245, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.4913059563448021, |
|
"grad_norm": 0.4627722501754761, |
|
"learning_rate": 1.1838494360112185e-05, |
|
"loss": 0.6831, |
|
"mean_token_accuracy": 0.7892276650561758, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.4942656307806141, |
|
"grad_norm": 0.43506646156311035, |
|
"learning_rate": 1.1804511584407763e-05, |
|
"loss": 0.6469, |
|
"mean_token_accuracy": 0.7984073599583249, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.4972253052164262, |
|
"grad_norm": 0.4514705538749695, |
|
"learning_rate": 1.1770507254537454e-05, |
|
"loss": 0.6567, |
|
"mean_token_accuracy": 0.797555451493693, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.5001849796522384, |
|
"grad_norm": 0.4718611538410187, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.6666, |
|
"mean_token_accuracy": 0.7937825386926253, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.5031446540880502, |
|
"grad_norm": 0.4598422646522522, |
|
"learning_rate": 1.1702435557223988e-05, |
|
"loss": 0.7341, |
|
"mean_token_accuracy": 0.7725688345230695, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.5061043285238624, |
|
"grad_norm": 0.4759341776371002, |
|
"learning_rate": 1.1668369002869912e-05, |
|
"loss": 0.696, |
|
"mean_token_accuracy": 0.7833280751891905, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.5090640029596745, |
|
"grad_norm": 0.4857986867427826, |
|
"learning_rate": 1.1634282520518382e-05, |
|
"loss": 0.6843, |
|
"mean_token_accuracy": 0.7878627921931918, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.5120236773954865, |
|
"grad_norm": 0.4445328414440155, |
|
"learning_rate": 1.1600176517318742e-05, |
|
"loss": 0.7016, |
|
"mean_token_accuracy": 0.7835290374274105, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5149833518312985, |
|
"grad_norm": 0.4201406240463257, |
|
"learning_rate": 1.1566051400653486e-05, |
|
"loss": 0.6892, |
|
"mean_token_accuracy": 0.7880382009320334, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.5179430262671105, |
|
"grad_norm": 0.4451057016849518, |
|
"learning_rate": 1.153190757813343e-05, |
|
"loss": 0.6661, |
|
"mean_token_accuracy": 0.7936041312626415, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.5209027007029228, |
|
"grad_norm": 0.45407670736312866, |
|
"learning_rate": 1.1497745457592817e-05, |
|
"loss": 0.6938, |
|
"mean_token_accuracy": 0.7862274252159144, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.5238623751387348, |
|
"grad_norm": 0.48065322637557983, |
|
"learning_rate": 1.1463565447084446e-05, |
|
"loss": 0.6711, |
|
"mean_token_accuracy": 0.7922199519518627, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.5268220495745468, |
|
"grad_norm": 0.4554750323295593, |
|
"learning_rate": 1.142936795487482e-05, |
|
"loss": 0.7031, |
|
"mean_token_accuracy": 0.7841927897620309, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.5297817240103588, |
|
"grad_norm": 0.47003987431526184, |
|
"learning_rate": 1.1395153389439232e-05, |
|
"loss": 0.6801, |
|
"mean_token_accuracy": 0.7887132537245702, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.5327413984461709, |
|
"grad_norm": 0.49194058775901794, |
|
"learning_rate": 1.1360922159456929e-05, |
|
"loss": 0.6516, |
|
"mean_token_accuracy": 0.7972093170337653, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.535701072881983, |
|
"grad_norm": 0.4363403618335724, |
|
"learning_rate": 1.1326674673806195e-05, |
|
"loss": 0.6454, |
|
"mean_token_accuracy": 0.7994255155742641, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.538660747317795, |
|
"grad_norm": 0.4633619487285614, |
|
"learning_rate": 1.129241134155949e-05, |
|
"loss": 0.7226, |
|
"mean_token_accuracy": 0.7772127285568272, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.5416204217536071, |
|
"grad_norm": 0.505766749382019, |
|
"learning_rate": 1.1258132571978555e-05, |
|
"loss": 0.6866, |
|
"mean_token_accuracy": 0.7866910068023953, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5445800961894192, |
|
"grad_norm": 0.4622265696525574, |
|
"learning_rate": 1.1223838774509515e-05, |
|
"loss": 0.6794, |
|
"mean_token_accuracy": 0.7894488197184882, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.5475397706252312, |
|
"grad_norm": 0.46530911326408386, |
|
"learning_rate": 1.1189530358778005e-05, |
|
"loss": 0.6714, |
|
"mean_token_accuracy": 0.7917336528774738, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.5504994450610434, |
|
"grad_norm": 0.48770585656166077, |
|
"learning_rate": 1.1155207734584264e-05, |
|
"loss": 0.655, |
|
"mean_token_accuracy": 0.7967736177779107, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.5534591194968552, |
|
"grad_norm": 0.4736506938934326, |
|
"learning_rate": 1.1120871311898254e-05, |
|
"loss": 0.6626, |
|
"mean_token_accuracy": 0.7948987812297952, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.5564187939326675, |
|
"grad_norm": 0.4388614594936371, |
|
"learning_rate": 1.1086521500854746e-05, |
|
"loss": 0.6743, |
|
"mean_token_accuracy": 0.7901189868530583, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.5593784683684795, |
|
"grad_norm": 0.42465701699256897, |
|
"learning_rate": 1.1052158711748435e-05, |
|
"loss": 0.6424, |
|
"mean_token_accuracy": 0.8002322656672612, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.5623381428042915, |
|
"grad_norm": 0.444394052028656, |
|
"learning_rate": 1.1017783355029027e-05, |
|
"loss": 0.6968, |
|
"mean_token_accuracy": 0.7853953263510778, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.5652978172401038, |
|
"grad_norm": 0.4599439799785614, |
|
"learning_rate": 1.0983395841296349e-05, |
|
"loss": 0.7023, |
|
"mean_token_accuracy": 0.783582448885906, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.5682574916759155, |
|
"grad_norm": 0.4538317918777466, |
|
"learning_rate": 1.0948996581295437e-05, |
|
"loss": 0.6708, |
|
"mean_token_accuracy": 0.7920199562156756, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.5712171661117278, |
|
"grad_norm": 0.5204719305038452, |
|
"learning_rate": 1.0914585985911632e-05, |
|
"loss": 0.7194, |
|
"mean_token_accuracy": 0.7800594247957305, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.5741768405475398, |
|
"grad_norm": 0.4342687129974365, |
|
"learning_rate": 1.0880164466165675e-05, |
|
"loss": 0.6803, |
|
"mean_token_accuracy": 0.7888814345475649, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.5771365149833518, |
|
"grad_norm": 0.47061675786972046, |
|
"learning_rate": 1.084573243320878e-05, |
|
"loss": 0.6997, |
|
"mean_token_accuracy": 0.7845215145727062, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.5800961894191639, |
|
"grad_norm": 0.48753833770751953, |
|
"learning_rate": 1.0811290298317755e-05, |
|
"loss": 0.6963, |
|
"mean_token_accuracy": 0.7853895351084046, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.5830558638549759, |
|
"grad_norm": 0.4486468732357025, |
|
"learning_rate": 1.0776838472890065e-05, |
|
"loss": 0.6616, |
|
"mean_token_accuracy": 0.7946923291350155, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.5860155382907881, |
|
"grad_norm": 0.46315282583236694, |
|
"learning_rate": 1.0742377368438915e-05, |
|
"loss": 0.6653, |
|
"mean_token_accuracy": 0.7937742045003314, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.5889752127266, |
|
"grad_norm": 0.43467020988464355, |
|
"learning_rate": 1.0707907396588362e-05, |
|
"loss": 0.675, |
|
"mean_token_accuracy": 0.7911407237837417, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.5919348871624122, |
|
"grad_norm": 0.47853776812553406, |
|
"learning_rate": 1.0673428969068365e-05, |
|
"loss": 0.6694, |
|
"mean_token_accuracy": 0.7934067804791232, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.5948945615982242, |
|
"grad_norm": 0.4569770395755768, |
|
"learning_rate": 1.063894249770989e-05, |
|
"loss": 0.7149, |
|
"mean_token_accuracy": 0.7789215590955586, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.5978542360340362, |
|
"grad_norm": 0.48249223828315735, |
|
"learning_rate": 1.0604448394439983e-05, |
|
"loss": 0.6881, |
|
"mean_token_accuracy": 0.7885556262821241, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.6008139104698484, |
|
"grad_norm": 0.44117307662963867, |
|
"learning_rate": 1.0569947071276847e-05, |
|
"loss": 0.6773, |
|
"mean_token_accuracy": 0.7905948947059994, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6037735849056602, |
|
"grad_norm": 0.4791225492954254, |
|
"learning_rate": 1.053543894032493e-05, |
|
"loss": 0.6486, |
|
"mean_token_accuracy": 0.7984527785084713, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.6067332593414725, |
|
"grad_norm": 0.4592903256416321, |
|
"learning_rate": 1.0500924413769988e-05, |
|
"loss": 0.7029, |
|
"mean_token_accuracy": 0.7816764343124575, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.6096929337772845, |
|
"grad_norm": 0.4603089988231659, |
|
"learning_rate": 1.0466403903874176e-05, |
|
"loss": 0.6692, |
|
"mean_token_accuracy": 0.7920168861161754, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.6126526082130965, |
|
"grad_norm": 0.4877552092075348, |
|
"learning_rate": 1.0431877822971118e-05, |
|
"loss": 0.7264, |
|
"mean_token_accuracy": 0.7763762310950634, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.6156122826489088, |
|
"grad_norm": 0.4495700001716614, |
|
"learning_rate": 1.0397346583460972e-05, |
|
"loss": 0.6748, |
|
"mean_token_accuracy": 0.790038916470125, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.6185719570847206, |
|
"grad_norm": 0.4363431930541992, |
|
"learning_rate": 1.0362810597805526e-05, |
|
"loss": 0.7176, |
|
"mean_token_accuracy": 0.7804455873720191, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.6215316315205328, |
|
"grad_norm": 0.4593956470489502, |
|
"learning_rate": 1.0328270278523256e-05, |
|
"loss": 0.692, |
|
"mean_token_accuracy": 0.7868243000254014, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.6244913059563448, |
|
"grad_norm": 0.4650803506374359, |
|
"learning_rate": 1.0293726038184393e-05, |
|
"loss": 0.6667, |
|
"mean_token_accuracy": 0.7932379645110449, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.6274509803921569, |
|
"grad_norm": 0.4343462288379669, |
|
"learning_rate": 1.0259178289406011e-05, |
|
"loss": 0.6828, |
|
"mean_token_accuracy": 0.7873501273107357, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.6304106548279689, |
|
"grad_norm": 0.485445499420166, |
|
"learning_rate": 1.022462744484709e-05, |
|
"loss": 0.6757, |
|
"mean_token_accuracy": 0.790149107536362, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.633370329263781, |
|
"grad_norm": 0.4408370852470398, |
|
"learning_rate": 1.019007391720359e-05, |
|
"loss": 0.6423, |
|
"mean_token_accuracy": 0.8007969798780114, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.6363300036995931, |
|
"grad_norm": 0.48014140129089355, |
|
"learning_rate": 1.0155518119203511e-05, |
|
"loss": 0.6485, |
|
"mean_token_accuracy": 0.798990145407414, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.6392896781354052, |
|
"grad_norm": 0.43950581550598145, |
|
"learning_rate": 1.0120960463601977e-05, |
|
"loss": 0.6884, |
|
"mean_token_accuracy": 0.7868133995463237, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.6422493525712172, |
|
"grad_norm": 0.4777732789516449, |
|
"learning_rate": 1.0086401363176306e-05, |
|
"loss": 0.7016, |
|
"mean_token_accuracy": 0.7829182684226537, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.6452090270070292, |
|
"grad_norm": 0.4738129675388336, |
|
"learning_rate": 1.0051841230721065e-05, |
|
"loss": 0.7025, |
|
"mean_token_accuracy": 0.7833107058164892, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.6481687014428412, |
|
"grad_norm": 0.49576374888420105, |
|
"learning_rate": 1.0017280479043148e-05, |
|
"loss": 0.6832, |
|
"mean_token_accuracy": 0.7878164823186655, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.6511283758786535, |
|
"grad_norm": 0.4482108950614929, |
|
"learning_rate": 9.982719520956856e-06, |
|
"loss": 0.6935, |
|
"mean_token_accuracy": 0.7859008840989987, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.6540880503144653, |
|
"grad_norm": 0.4530676603317261, |
|
"learning_rate": 9.948158769278939e-06, |
|
"loss": 0.6496, |
|
"mean_token_accuracy": 0.7975575400059007, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.6570477247502775, |
|
"grad_norm": 0.4506595730781555, |
|
"learning_rate": 9.913598636823694e-06, |
|
"loss": 0.6711, |
|
"mean_token_accuracy": 0.7920525949216152, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.6600073991860895, |
|
"grad_norm": 0.492118775844574, |
|
"learning_rate": 9.879039536398023e-06, |
|
"loss": 0.6663, |
|
"mean_token_accuracy": 0.7926239117866946, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.6629670736219015, |
|
"grad_norm": 0.4334714114665985, |
|
"learning_rate": 9.844481880796492e-06, |
|
"loss": 0.6685, |
|
"mean_token_accuracy": 0.7934195520197277, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.6659267480577138, |
|
"grad_norm": 0.43926241993904114, |
|
"learning_rate": 9.809926082796415e-06, |
|
"loss": 0.668, |
|
"mean_token_accuracy": 0.7921636930110467, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.6688864224935256, |
|
"grad_norm": 0.46381375193595886, |
|
"learning_rate": 9.775372555152912e-06, |
|
"loss": 0.7106, |
|
"mean_token_accuracy": 0.7814721603110977, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.6718460969293378, |
|
"grad_norm": 0.4584568738937378, |
|
"learning_rate": 9.740821710593989e-06, |
|
"loss": 0.6723, |
|
"mean_token_accuracy": 0.7927753026753256, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.6748057713651499, |
|
"grad_norm": 0.46233710646629333, |
|
"learning_rate": 9.70627396181561e-06, |
|
"loss": 0.6979, |
|
"mean_token_accuracy": 0.7847842845307743, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.6777654458009619, |
|
"grad_norm": 0.4692407548427582, |
|
"learning_rate": 9.671729721476747e-06, |
|
"loss": 0.6779, |
|
"mean_token_accuracy": 0.7904914247805244, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.6807251202367741, |
|
"grad_norm": 0.45148906111717224, |
|
"learning_rate": 9.637189402194477e-06, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.794561469534099, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.683684794672586, |
|
"grad_norm": 0.4668971002101898, |
|
"learning_rate": 9.602653416539031e-06, |
|
"loss": 0.6562, |
|
"mean_token_accuracy": 0.7957992597890263, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.6866444691083982, |
|
"grad_norm": 0.4657999575138092, |
|
"learning_rate": 9.568122177028884e-06, |
|
"loss": 0.6793, |
|
"mean_token_accuracy": 0.7895593260251141, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.6896041435442102, |
|
"grad_norm": 0.45058828592300415, |
|
"learning_rate": 9.533596096125826e-06, |
|
"loss": 0.6982, |
|
"mean_token_accuracy": 0.7837857085184711, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.6925638179800222, |
|
"grad_norm": 0.5159661769866943, |
|
"learning_rate": 9.499075586230014e-06, |
|
"loss": 0.7278, |
|
"mean_token_accuracy": 0.7758815945577252, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.6955234924158342, |
|
"grad_norm": 0.4985567629337311, |
|
"learning_rate": 9.464561059675073e-06, |
|
"loss": 0.6815, |
|
"mean_token_accuracy": 0.789947097130735, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.6984831668516462, |
|
"grad_norm": 0.4985766112804413, |
|
"learning_rate": 9.430052928723153e-06, |
|
"loss": 0.6689, |
|
"mean_token_accuracy": 0.7914537628745669, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.7014428412874585, |
|
"grad_norm": 0.44924196600914, |
|
"learning_rate": 9.395551605560018e-06, |
|
"loss": 0.654, |
|
"mean_token_accuracy": 0.7949039622131476, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.7044025157232703, |
|
"grad_norm": 0.4486066997051239, |
|
"learning_rate": 9.361057502290112e-06, |
|
"loss": 0.6689, |
|
"mean_token_accuracy": 0.7932129938757272, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.7073621901590825, |
|
"grad_norm": 0.5298429131507874, |
|
"learning_rate": 9.326571030931636e-06, |
|
"loss": 0.6797, |
|
"mean_token_accuracy": 0.7899495064143103, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.7103218645948945, |
|
"grad_norm": 0.4834374189376831, |
|
"learning_rate": 9.292092603411642e-06, |
|
"loss": 0.6856, |
|
"mean_token_accuracy": 0.7874172906006217, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.7132815390307066, |
|
"grad_norm": 0.4545672833919525, |
|
"learning_rate": 9.257622631561085e-06, |
|
"loss": 0.6793, |
|
"mean_token_accuracy": 0.7896850742245419, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.7162412134665188, |
|
"grad_norm": 0.49482157826423645, |
|
"learning_rate": 9.223161527109938e-06, |
|
"loss": 0.7249, |
|
"mean_token_accuracy": 0.7754079872839525, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.7192008879023306, |
|
"grad_norm": 0.47407853603363037, |
|
"learning_rate": 9.188709701682246e-06, |
|
"loss": 0.6793, |
|
"mean_token_accuracy": 0.7890331281672109, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7221605623381429, |
|
"grad_norm": 0.496600478887558, |
|
"learning_rate": 9.154267566791224e-06, |
|
"loss": 0.6745, |
|
"mean_token_accuracy": 0.7916224036955456, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.7251202367739549, |
|
"grad_norm": 0.447837233543396, |
|
"learning_rate": 9.119835533834332e-06, |
|
"loss": 0.6443, |
|
"mean_token_accuracy": 0.8001154358817507, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.728079911209767, |
|
"grad_norm": 0.4290511906147003, |
|
"learning_rate": 9.085414014088368e-06, |
|
"loss": 0.7033, |
|
"mean_token_accuracy": 0.7838360657347012, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.7310395856455791, |
|
"grad_norm": 0.4949333369731903, |
|
"learning_rate": 9.051003418704566e-06, |
|
"loss": 0.6797, |
|
"mean_token_accuracy": 0.7891070494649397, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.733999260081391, |
|
"grad_norm": 0.47587254643440247, |
|
"learning_rate": 9.016604158703654e-06, |
|
"loss": 0.6047, |
|
"mean_token_accuracy": 0.8115938183485798, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.7369589345172032, |
|
"grad_norm": 0.4586060643196106, |
|
"learning_rate": 8.982216644970978e-06, |
|
"loss": 0.7073, |
|
"mean_token_accuracy": 0.7814491686139491, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.7399186089530152, |
|
"grad_norm": 0.4535180628299713, |
|
"learning_rate": 8.947841288251568e-06, |
|
"loss": 0.6773, |
|
"mean_token_accuracy": 0.7899806831449463, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.7428782833888272, |
|
"grad_norm": 0.4698368012905121, |
|
"learning_rate": 8.913478499145255e-06, |
|
"loss": 0.6992, |
|
"mean_token_accuracy": 0.7847534234645677, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.7458379578246392, |
|
"grad_norm": 0.4965501129627228, |
|
"learning_rate": 8.879128688101749e-06, |
|
"loss": 0.73, |
|
"mean_token_accuracy": 0.7749135427241792, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.7487976322604513, |
|
"grad_norm": 0.42426785826683044, |
|
"learning_rate": 8.844792265415738e-06, |
|
"loss": 0.6691, |
|
"mean_token_accuracy": 0.7934521695906798, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.7517573066962635, |
|
"grad_norm": 0.4164229929447174, |
|
"learning_rate": 8.810469641222001e-06, |
|
"loss": 0.6792, |
|
"mean_token_accuracy": 0.7893430794759394, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.7547169811320755, |
|
"grad_norm": 0.4406238794326782, |
|
"learning_rate": 8.776161225490488e-06, |
|
"loss": 0.6774, |
|
"mean_token_accuracy": 0.7888743256018739, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.7576766555678875, |
|
"grad_norm": 0.4821741282939911, |
|
"learning_rate": 8.741867428021447e-06, |
|
"loss": 0.7028, |
|
"mean_token_accuracy": 0.782003973548151, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.7606363300036996, |
|
"grad_norm": 0.41678085923194885, |
|
"learning_rate": 8.707588658440511e-06, |
|
"loss": 0.6673, |
|
"mean_token_accuracy": 0.792060705641046, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.7635960044395116, |
|
"grad_norm": 0.4335281252861023, |
|
"learning_rate": 8.673325326193806e-06, |
|
"loss": 0.6799, |
|
"mean_token_accuracy": 0.7913004243427386, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.7665556788753238, |
|
"grad_norm": 0.46854230761528015, |
|
"learning_rate": 8.639077840543078e-06, |
|
"loss": 0.6939, |
|
"mean_token_accuracy": 0.784777034055922, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.7695153533111356, |
|
"grad_norm": 0.4286266267299652, |
|
"learning_rate": 8.604846610560771e-06, |
|
"loss": 0.682, |
|
"mean_token_accuracy": 0.7879420465198175, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.7724750277469479, |
|
"grad_norm": 0.4346145689487457, |
|
"learning_rate": 8.570632045125185e-06, |
|
"loss": 0.6722, |
|
"mean_token_accuracy": 0.7908459643173444, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.77543470218276, |
|
"grad_norm": 0.47212105989456177, |
|
"learning_rate": 8.536434552915555e-06, |
|
"loss": 0.6758, |
|
"mean_token_accuracy": 0.7914862427903648, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.778394376618572, |
|
"grad_norm": 0.45980679988861084, |
|
"learning_rate": 8.502254542407186e-06, |
|
"loss": 0.6988, |
|
"mean_token_accuracy": 0.7817833351753944, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.778394376618572, |
|
"eval_loss": 0.739486575126648, |
|
"eval_mean_token_accuracy": 0.7714524950010826, |
|
"eval_runtime": 24.4731, |
|
"eval_samples_per_second": 5.271, |
|
"eval_steps_per_second": 1.348, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7813540510543842, |
|
"grad_norm": 0.4924312233924866, |
|
"learning_rate": 8.468092421866575e-06, |
|
"loss": 0.6954, |
|
"mean_token_accuracy": 0.7859722749641744, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.784313725490196, |
|
"grad_norm": 0.4518575966358185, |
|
"learning_rate": 8.433948599346516e-06, |
|
"loss": 0.6719, |
|
"mean_token_accuracy": 0.7915203880270405, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.7872733999260082, |
|
"grad_norm": 0.41159677505493164, |
|
"learning_rate": 8.399823482681263e-06, |
|
"loss": 0.6654, |
|
"mean_token_accuracy": 0.7925289623050378, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.7902330743618202, |
|
"grad_norm": 0.4749601483345032, |
|
"learning_rate": 8.36571747948162e-06, |
|
"loss": 0.651, |
|
"mean_token_accuracy": 0.7971818401347246, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.7931927487976322, |
|
"grad_norm": 0.4616299271583557, |
|
"learning_rate": 8.331630997130091e-06, |
|
"loss": 0.6387, |
|
"mean_token_accuracy": 0.801418446439762, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.7961524232334445, |
|
"grad_norm": 0.4717465341091156, |
|
"learning_rate": 8.297564442776014e-06, |
|
"loss": 0.7002, |
|
"mean_token_accuracy": 0.7815816907542203, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.7991120976692563, |
|
"grad_norm": 0.45160382986068726, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.6656, |
|
"mean_token_accuracy": 0.7934779098419342, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.8020717721050685, |
|
"grad_norm": 0.5111809372901917, |
|
"learning_rate": 8.229492745462551e-06, |
|
"loss": 0.6734, |
|
"mean_token_accuracy": 0.7910897600390507, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.8050314465408805, |
|
"grad_norm": 0.4546574652194977, |
|
"learning_rate": 8.195488415592238e-06, |
|
"loss": 0.6832, |
|
"mean_token_accuracy": 0.7884104161267849, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.8079911209766926, |
|
"grad_norm": 0.48915475606918335, |
|
"learning_rate": 8.161505639887818e-06, |
|
"loss": 0.6865, |
|
"mean_token_accuracy": 0.7868375134510748, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.8109507954125046, |
|
"grad_norm": 0.45673686265945435, |
|
"learning_rate": 8.12754482425989e-06, |
|
"loss": 0.6531, |
|
"mean_token_accuracy": 0.7978940928567595, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.8139104698483166, |
|
"grad_norm": 0.46595895290374756, |
|
"learning_rate": 8.09360637435676e-06, |
|
"loss": 0.6763, |
|
"mean_token_accuracy": 0.7900004127541489, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.8168701442841289, |
|
"grad_norm": 0.4639073312282562, |
|
"learning_rate": 8.05969069555957e-06, |
|
"loss": 0.7068, |
|
"mean_token_accuracy": 0.7818326911340046, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.8198298187199407, |
|
"grad_norm": 0.48692357540130615, |
|
"learning_rate": 8.025798192977482e-06, |
|
"loss": 0.6724, |
|
"mean_token_accuracy": 0.7902419271935022, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.822789493155753, |
|
"grad_norm": 0.4192976653575897, |
|
"learning_rate": 7.991929271442817e-06, |
|
"loss": 0.694, |
|
"mean_token_accuracy": 0.7842421058017395, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.825749167591565, |
|
"grad_norm": 0.4323351979255676, |
|
"learning_rate": 7.958084335506239e-06, |
|
"loss": 0.6633, |
|
"mean_token_accuracy": 0.7939339540476142, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.828708842027377, |
|
"grad_norm": 0.5116434097290039, |
|
"learning_rate": 7.924263789431913e-06, |
|
"loss": 0.7177, |
|
"mean_token_accuracy": 0.7774093907152634, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.8316685164631892, |
|
"grad_norm": 0.47101178765296936, |
|
"learning_rate": 7.89046803719267e-06, |
|
"loss": 0.6311, |
|
"mean_token_accuracy": 0.8026902561156782, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.834628190899001, |
|
"grad_norm": 0.4334461987018585, |
|
"learning_rate": 7.856697482465195e-06, |
|
"loss": 0.7056, |
|
"mean_token_accuracy": 0.7813049117276861, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.8375878653348132, |
|
"grad_norm": 0.44044068455696106, |
|
"learning_rate": 7.822952528625192e-06, |
|
"loss": 0.6706, |
|
"mean_token_accuracy": 0.7911052218155908, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.8405475397706252, |
|
"grad_norm": 0.43130719661712646, |
|
"learning_rate": 7.789233578742583e-06, |
|
"loss": 0.6868, |
|
"mean_token_accuracy": 0.7883987501012448, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.8435072142064373, |
|
"grad_norm": 0.464912086725235, |
|
"learning_rate": 7.755541035576677e-06, |
|
"loss": 0.6966, |
|
"mean_token_accuracy": 0.784260520058606, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.8464668886422495, |
|
"grad_norm": 0.47342586517333984, |
|
"learning_rate": 7.721875301571359e-06, |
|
"loss": 0.6862, |
|
"mean_token_accuracy": 0.7896494653236235, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.8494265630780613, |
|
"grad_norm": 0.4514820873737335, |
|
"learning_rate": 7.688236778850307e-06, |
|
"loss": 0.6702, |
|
"mean_token_accuracy": 0.7906542847766748, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.8523862375138735, |
|
"grad_norm": 0.4382912218570709, |
|
"learning_rate": 7.654625869212147e-06, |
|
"loss": 0.6519, |
|
"mean_token_accuracy": 0.7971948655223885, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.8553459119496856, |
|
"grad_norm": 0.4642338156700134, |
|
"learning_rate": 7.621042974125701e-06, |
|
"loss": 0.7042, |
|
"mean_token_accuracy": 0.7810776086801536, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.8583055863854976, |
|
"grad_norm": 0.43844854831695557, |
|
"learning_rate": 7.587488494725157e-06, |
|
"loss": 0.7134, |
|
"mean_token_accuracy": 0.7782961208172144, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.8612652608213096, |
|
"grad_norm": 0.44983789324760437, |
|
"learning_rate": 7.553962831805291e-06, |
|
"loss": 0.6928, |
|
"mean_token_accuracy": 0.7847411304512161, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.8642249352571216, |
|
"grad_norm": 0.464546799659729, |
|
"learning_rate": 7.520466385816672e-06, |
|
"loss": 0.6848, |
|
"mean_token_accuracy": 0.7877457152823937, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.8671846096929339, |
|
"grad_norm": 0.4500563442707062, |
|
"learning_rate": 7.48699955686089e-06, |
|
"loss": 0.7043, |
|
"mean_token_accuracy": 0.7810525873603867, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.870144284128746, |
|
"grad_norm": 0.4776234030723572, |
|
"learning_rate": 7.453562744685779e-06, |
|
"loss": 0.6491, |
|
"mean_token_accuracy": 0.7980624835148542, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.873103958564558, |
|
"grad_norm": 0.42935752868652344, |
|
"learning_rate": 7.420156348680621e-06, |
|
"loss": 0.7015, |
|
"mean_token_accuracy": 0.7841032229720888, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.87606363300037, |
|
"grad_norm": 0.45095863938331604, |
|
"learning_rate": 7.3867807678713965e-06, |
|
"loss": 0.6695, |
|
"mean_token_accuracy": 0.792214351462561, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.879023307436182, |
|
"grad_norm": 0.4426802694797516, |
|
"learning_rate": 7.353436400916006e-06, |
|
"loss": 0.7231, |
|
"mean_token_accuracy": 0.7759461148659138, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.8819829818719942, |
|
"grad_norm": 0.4576883316040039, |
|
"learning_rate": 7.32012364609952e-06, |
|
"loss": 0.6891, |
|
"mean_token_accuracy": 0.787467563016268, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.884942656307806, |
|
"grad_norm": 0.47537630796432495, |
|
"learning_rate": 7.286842901329413e-06, |
|
"loss": 0.6737, |
|
"mean_token_accuracy": 0.7898305381622779, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.8879023307436182, |
|
"grad_norm": 0.47071340680122375, |
|
"learning_rate": 7.253594564130804e-06, |
|
"loss": 0.6314, |
|
"mean_token_accuracy": 0.8024436031530167, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.8908620051794303, |
|
"grad_norm": 0.42745083570480347, |
|
"learning_rate": 7.22037903164173e-06, |
|
"loss": 0.6648, |
|
"mean_token_accuracy": 0.7939708774130151, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.8938216796152423, |
|
"grad_norm": 0.45386022329330444, |
|
"learning_rate": 7.187196700608373e-06, |
|
"loss": 0.7055, |
|
"mean_token_accuracy": 0.7818898164861657, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.8967813540510545, |
|
"grad_norm": 0.5082824230194092, |
|
"learning_rate": 7.154047967380353e-06, |
|
"loss": 0.6797, |
|
"mean_token_accuracy": 0.7885593754013774, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.8997410284868663, |
|
"grad_norm": 0.42250484228134155, |
|
"learning_rate": 7.120933227905971e-06, |
|
"loss": 0.6822, |
|
"mean_token_accuracy": 0.7885130074722346, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.9027007029226786, |
|
"grad_norm": 0.45145198702812195, |
|
"learning_rate": 7.0878528777274814e-06, |
|
"loss": 0.7101, |
|
"mean_token_accuracy": 0.7797410127092863, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.9056603773584906, |
|
"grad_norm": 0.4663936495780945, |
|
"learning_rate": 7.05480731197638e-06, |
|
"loss": 0.6638, |
|
"mean_token_accuracy": 0.7952863824537968, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.9086200517943026, |
|
"grad_norm": 0.4832487106323242, |
|
"learning_rate": 7.021796925368667e-06, |
|
"loss": 0.6901, |
|
"mean_token_accuracy": 0.7859927796689913, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.9115797262301149, |
|
"grad_norm": 0.4796106219291687, |
|
"learning_rate": 6.988822112200157e-06, |
|
"loss": 0.699, |
|
"mean_token_accuracy": 0.7833292076466405, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.9145394006659266, |
|
"grad_norm": 0.4601701498031616, |
|
"learning_rate": 6.955883266341741e-06, |
|
"loss": 0.6911, |
|
"mean_token_accuracy": 0.7855269916498042, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.917499075101739, |
|
"grad_norm": 0.4631184935569763, |
|
"learning_rate": 6.9229807812346985e-06, |
|
"loss": 0.6938, |
|
"mean_token_accuracy": 0.7854820719902068, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.920458749537551, |
|
"grad_norm": 0.46688076853752136, |
|
"learning_rate": 6.890115049885995e-06, |
|
"loss": 0.6873, |
|
"mean_token_accuracy": 0.7866998790722634, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.923418423973363, |
|
"grad_norm": 0.4536078870296478, |
|
"learning_rate": 6.85728646486359e-06, |
|
"loss": 0.6795, |
|
"mean_token_accuracy": 0.7877265813082034, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.926378098409175, |
|
"grad_norm": 0.4446280896663666, |
|
"learning_rate": 6.824495418291741e-06, |
|
"loss": 0.6618, |
|
"mean_token_accuracy": 0.793360405089406, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.929337772844987, |
|
"grad_norm": 0.4624863564968109, |
|
"learning_rate": 6.791742301846325e-06, |
|
"loss": 0.6943, |
|
"mean_token_accuracy": 0.7851390097369664, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.9322974472807992, |
|
"grad_norm": 0.46851369738578796, |
|
"learning_rate": 6.759027506750159e-06, |
|
"loss": 0.6973, |
|
"mean_token_accuracy": 0.7825239711607613, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.935257121716611, |
|
"grad_norm": 0.45422789454460144, |
|
"learning_rate": 6.726351423768323e-06, |
|
"loss": 0.7049, |
|
"mean_token_accuracy": 0.7834841114161091, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.9382167961524233, |
|
"grad_norm": 0.4513411521911621, |
|
"learning_rate": 6.693714443203507e-06, |
|
"loss": 0.674, |
|
"mean_token_accuracy": 0.7905625791946234, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.9411764705882353, |
|
"grad_norm": 0.44000759720802307, |
|
"learning_rate": 6.661116954891329e-06, |
|
"loss": 0.6889, |
|
"mean_token_accuracy": 0.7876442391621598, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.9441361450240473, |
|
"grad_norm": 0.4787219166755676, |
|
"learning_rate": 6.62855934819569e-06, |
|
"loss": 0.7072, |
|
"mean_token_accuracy": 0.779945015719257, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.9470958194598595, |
|
"grad_norm": 0.42401981353759766, |
|
"learning_rate": 6.59604201200412e-06, |
|
"loss": 0.6773, |
|
"mean_token_accuracy": 0.7896623125899848, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.9500554938956713, |
|
"grad_norm": 0.5051243305206299, |
|
"learning_rate": 6.563565334723134e-06, |
|
"loss": 0.6973, |
|
"mean_token_accuracy": 0.7849915232879509, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.9530151683314836, |
|
"grad_norm": 0.4501940608024597, |
|
"learning_rate": 6.5311297042736046e-06, |
|
"loss": 0.7169, |
|
"mean_token_accuracy": 0.7793906916939676, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.9559748427672956, |
|
"grad_norm": 0.4441750645637512, |
|
"learning_rate": 6.498735508086094e-06, |
|
"loss": 0.6299, |
|
"mean_token_accuracy": 0.80293781287729, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.9589345172031076, |
|
"grad_norm": 0.4581814706325531, |
|
"learning_rate": 6.466383133096268e-06, |
|
"loss": 0.696, |
|
"mean_token_accuracy": 0.7832016518779903, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.9618941916389199, |
|
"grad_norm": 0.4844694137573242, |
|
"learning_rate": 6.4340729657402424e-06, |
|
"loss": 0.6553, |
|
"mean_token_accuracy": 0.79566224863039, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.9648538660747317, |
|
"grad_norm": 0.47741377353668213, |
|
"learning_rate": 6.40180539194999e-06, |
|
"loss": 0.7005, |
|
"mean_token_accuracy": 0.7834191013265248, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.967813540510544, |
|
"grad_norm": 0.4623546600341797, |
|
"learning_rate": 6.3695807971487175e-06, |
|
"loss": 0.6739, |
|
"mean_token_accuracy": 0.7908099908248746, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.970773214946356, |
|
"grad_norm": 0.44196298718452454, |
|
"learning_rate": 6.337399566246257e-06, |
|
"loss": 0.6887, |
|
"mean_token_accuracy": 0.7867410372609397, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.973732889382168, |
|
"grad_norm": 0.44744858145713806, |
|
"learning_rate": 6.305262083634488e-06, |
|
"loss": 0.6947, |
|
"mean_token_accuracy": 0.7849319629950706, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.97669256381798, |
|
"grad_norm": 0.48888614773750305, |
|
"learning_rate": 6.2731687331827214e-06, |
|
"loss": 0.6935, |
|
"mean_token_accuracy": 0.7852305896900613, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.979652238253792, |
|
"grad_norm": 0.548868715763092, |
|
"learning_rate": 6.2411198982331435e-06, |
|
"loss": 0.6583, |
|
"mean_token_accuracy": 0.7950044402077763, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.9826119126896042, |
|
"grad_norm": 0.44247865676879883, |
|
"learning_rate": 6.209115961596208e-06, |
|
"loss": 0.6963, |
|
"mean_token_accuracy": 0.7844457961563795, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.9855715871254163, |
|
"grad_norm": 0.4636320471763611, |
|
"learning_rate": 6.177157305546077e-06, |
|
"loss": 0.6912, |
|
"mean_token_accuracy": 0.7862634185033074, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.9885312615612283, |
|
"grad_norm": 0.4413374066352844, |
|
"learning_rate": 6.145244311816063e-06, |
|
"loss": 0.6814, |
|
"mean_token_accuracy": 0.787695055467721, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.9914909359970403, |
|
"grad_norm": 0.45312613248825073, |
|
"learning_rate": 6.113377361594048e-06, |
|
"loss": 0.6754, |
|
"mean_token_accuracy": 0.7900683076107496, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.9944506104328523, |
|
"grad_norm": 0.4501809775829315, |
|
"learning_rate": 6.081556835517955e-06, |
|
"loss": 0.6822, |
|
"mean_token_accuracy": 0.788871206473793, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.9974102848686646, |
|
"grad_norm": 0.45863819122314453, |
|
"learning_rate": 6.049783113671184e-06, |
|
"loss": 0.6751, |
|
"mean_token_accuracy": 0.7895972815620605, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 2.0029596744358122, |
|
"grad_norm": 0.7237296104431152, |
|
"learning_rate": 6.018056575578075e-06, |
|
"loss": 1.3777, |
|
"mean_token_accuracy": 0.7904976583626417, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.005919348871624, |
|
"grad_norm": 0.4619278311729431, |
|
"learning_rate": 5.986377600199371e-06, |
|
"loss": 0.6827, |
|
"mean_token_accuracy": 0.7859173509620443, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 2.0088790233074363, |
|
"grad_norm": 0.4588172435760498, |
|
"learning_rate": 5.9547465659277215e-06, |
|
"loss": 0.6602, |
|
"mean_token_accuracy": 0.7935102380543758, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 2.011838697743248, |
|
"grad_norm": 0.4326033890247345, |
|
"learning_rate": 5.923163850583114e-06, |
|
"loss": 0.6169, |
|
"mean_token_accuracy": 0.8052656884361966, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 2.0147983721790603, |
|
"grad_norm": 0.4270947277545929, |
|
"learning_rate": 5.891629831408392e-06, |
|
"loss": 0.6675, |
|
"mean_token_accuracy": 0.7923289976402469, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 2.0177580466148726, |
|
"grad_norm": 0.42200711369514465, |
|
"learning_rate": 5.8601448850647515e-06, |
|
"loss": 0.7139, |
|
"mean_token_accuracy": 0.7767213966992985, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.0207177210506844, |
|
"grad_norm": 0.4401227831840515, |
|
"learning_rate": 5.828709387627219e-06, |
|
"loss": 0.6296, |
|
"mean_token_accuracy": 0.8034322271999133, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 2.0236773954864966, |
|
"grad_norm": 0.4614053964614868, |
|
"learning_rate": 5.797323714580192e-06, |
|
"loss": 0.6402, |
|
"mean_token_accuracy": 0.7988319450662181, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 2.0266370699223084, |
|
"grad_norm": 0.4590739905834198, |
|
"learning_rate": 5.7659882408129204e-06, |
|
"loss": 0.6523, |
|
"mean_token_accuracy": 0.79529094328691, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 2.0295967443581207, |
|
"grad_norm": 0.4543253481388092, |
|
"learning_rate": 5.7347033406150494e-06, |
|
"loss": 0.6733, |
|
"mean_token_accuracy": 0.7890264127897217, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 2.0325564187939325, |
|
"grad_norm": 0.4582739770412445, |
|
"learning_rate": 5.703469387672138e-06, |
|
"loss": 0.6056, |
|
"mean_token_accuracy": 0.8107667655932651, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.0355160932297447, |
|
"grad_norm": 0.42348945140838623, |
|
"learning_rate": 5.672286755061212e-06, |
|
"loss": 0.6377, |
|
"mean_token_accuracy": 0.799343160525926, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 2.038475767665557, |
|
"grad_norm": 0.4367158114910126, |
|
"learning_rate": 5.64115581524629e-06, |
|
"loss": 0.6456, |
|
"mean_token_accuracy": 0.7978098111032584, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 2.0414354421013687, |
|
"grad_norm": 0.4166472852230072, |
|
"learning_rate": 5.610076940073939e-06, |
|
"loss": 0.64, |
|
"mean_token_accuracy": 0.7996033627487545, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 2.044395116537181, |
|
"grad_norm": 0.4349493980407715, |
|
"learning_rate": 5.579050500768837e-06, |
|
"loss": 0.6247, |
|
"mean_token_accuracy": 0.8040890019359421, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 2.0473547909729928, |
|
"grad_norm": 0.43666020035743713, |
|
"learning_rate": 5.548076867929331e-06, |
|
"loss": 0.6499, |
|
"mean_token_accuracy": 0.7959618095761632, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.050314465408805, |
|
"grad_norm": 0.4168229401111603, |
|
"learning_rate": 5.517156411523026e-06, |
|
"loss": 0.6207, |
|
"mean_token_accuracy": 0.8063047096858116, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 2.0532741398446173, |
|
"grad_norm": 0.4426259398460388, |
|
"learning_rate": 5.486289500882355e-06, |
|
"loss": 0.6437, |
|
"mean_token_accuracy": 0.7976666538912617, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 2.056233814280429, |
|
"grad_norm": 0.47709882259368896, |
|
"learning_rate": 5.455476504700161e-06, |
|
"loss": 0.6354, |
|
"mean_token_accuracy": 0.8001667386857992, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 2.0591934887162413, |
|
"grad_norm": 0.4387308359146118, |
|
"learning_rate": 5.424717791025302e-06, |
|
"loss": 0.6093, |
|
"mean_token_accuracy": 0.8074188099768709, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 2.062153163152053, |
|
"grad_norm": 0.42804378271102905, |
|
"learning_rate": 5.3940137272582534e-06, |
|
"loss": 0.6621, |
|
"mean_token_accuracy": 0.7942881189608123, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.0651128375878653, |
|
"grad_norm": 0.4197988510131836, |
|
"learning_rate": 5.3633646801467255e-06, |
|
"loss": 0.6272, |
|
"mean_token_accuracy": 0.8035603820307122, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 2.0680725120236776, |
|
"grad_norm": 0.4130113422870636, |
|
"learning_rate": 5.332771015781275e-06, |
|
"loss": 0.6318, |
|
"mean_token_accuracy": 0.8026469316916442, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 2.0710321864594894, |
|
"grad_norm": 0.4477401077747345, |
|
"learning_rate": 5.302233099590928e-06, |
|
"loss": 0.6202, |
|
"mean_token_accuracy": 0.8051835840765896, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 2.0739918608953016, |
|
"grad_norm": 0.4212632477283478, |
|
"learning_rate": 5.271751296338823e-06, |
|
"loss": 0.6454, |
|
"mean_token_accuracy": 0.7975187090662971, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 2.0769515353311134, |
|
"grad_norm": 0.43481898307800293, |
|
"learning_rate": 5.241325970117851e-06, |
|
"loss": 0.6298, |
|
"mean_token_accuracy": 0.8037347054797938, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.0769515353311134, |
|
"eval_loss": 0.7401972413063049, |
|
"eval_mean_token_accuracy": 0.7715796790522826, |
|
"eval_runtime": 24.8345, |
|
"eval_samples_per_second": 5.194, |
|
"eval_steps_per_second": 1.329, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.0799112097669257, |
|
"grad_norm": 0.42328760027885437, |
|
"learning_rate": 5.210957484346314e-06, |
|
"loss": 0.5797, |
|
"mean_token_accuracy": 0.8171162575964448, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 2.0828708842027375, |
|
"grad_norm": 0.40636351704597473, |
|
"learning_rate": 5.1806462017635775e-06, |
|
"loss": 0.6444, |
|
"mean_token_accuracy": 0.7976044651105583, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 2.0858305586385497, |
|
"grad_norm": 0.4619290232658386, |
|
"learning_rate": 5.150392484425728e-06, |
|
"loss": 0.6432, |
|
"mean_token_accuracy": 0.7998582873056539, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 2.088790233074362, |
|
"grad_norm": 0.42781201004981995, |
|
"learning_rate": 5.120196693701267e-06, |
|
"loss": 0.6447, |
|
"mean_token_accuracy": 0.7980342866377519, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 2.0917499075101738, |
|
"grad_norm": 0.435585081577301, |
|
"learning_rate": 5.090059190266779e-06, |
|
"loss": 0.6703, |
|
"mean_token_accuracy": 0.7898306031291672, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.094709581945986, |
|
"grad_norm": 0.42848485708236694, |
|
"learning_rate": 5.059980334102637e-06, |
|
"loss": 0.6399, |
|
"mean_token_accuracy": 0.8012392387851905, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 2.097669256381798, |
|
"grad_norm": 0.44752803444862366, |
|
"learning_rate": 5.0299604844886985e-06, |
|
"loss": 0.6444, |
|
"mean_token_accuracy": 0.7983052126079367, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 2.10062893081761, |
|
"grad_norm": 0.41624656319618225, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.6564, |
|
"mean_token_accuracy": 0.7942197264250628, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 2.1035886052534223, |
|
"grad_norm": 0.4133838415145874, |
|
"learning_rate": 4.970099238502494e-06, |
|
"loss": 0.6516, |
|
"mean_token_accuracy": 0.7961836172559192, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 2.106548279689234, |
|
"grad_norm": 0.4188925325870514, |
|
"learning_rate": 4.940258557148765e-06, |
|
"loss": 0.6703, |
|
"mean_token_accuracy": 0.7904122765338784, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.1095079541250463, |
|
"grad_norm": 0.4261308014392853, |
|
"learning_rate": 4.910478312373757e-06, |
|
"loss": 0.6172, |
|
"mean_token_accuracy": 0.8066983237111479, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 2.112467628560858, |
|
"grad_norm": 0.40434494614601135, |
|
"learning_rate": 4.8807588598905364e-06, |
|
"loss": 0.6482, |
|
"mean_token_accuracy": 0.7977588880511752, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 2.1154273029966704, |
|
"grad_norm": 0.4250684380531311, |
|
"learning_rate": 4.8511005546860214e-06, |
|
"loss": 0.6495, |
|
"mean_token_accuracy": 0.7967420913450249, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 2.1183869774324826, |
|
"grad_norm": 0.4167192280292511, |
|
"learning_rate": 4.821503751016746e-06, |
|
"loss": 0.6226, |
|
"mean_token_accuracy": 0.8038675074568771, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 2.1213466518682944, |
|
"grad_norm": 0.4020220637321472, |
|
"learning_rate": 4.791968802404648e-06, |
|
"loss": 0.639, |
|
"mean_token_accuracy": 0.8002841240121322, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.1243063263041067, |
|
"grad_norm": 0.41898688673973083, |
|
"learning_rate": 4.762496061632814e-06, |
|
"loss": 0.5961, |
|
"mean_token_accuracy": 0.8106809432630374, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 2.1272660007399185, |
|
"grad_norm": 0.4082755446434021, |
|
"learning_rate": 4.733085880741301e-06, |
|
"loss": 0.6836, |
|
"mean_token_accuracy": 0.7858357226121178, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 2.1302256751757307, |
|
"grad_norm": 0.4276457130908966, |
|
"learning_rate": 4.703738611022899e-06, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.7956159537823245, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 2.133185349611543, |
|
"grad_norm": 0.42158472537994385, |
|
"learning_rate": 4.674454603018949e-06, |
|
"loss": 0.6147, |
|
"mean_token_accuracy": 0.8079100447436781, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 2.1361450240473547, |
|
"grad_norm": 0.4250597357749939, |
|
"learning_rate": 4.645234206515171e-06, |
|
"loss": 0.6386, |
|
"mean_token_accuracy": 0.8010068266815492, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.139104698483167, |
|
"grad_norm": 0.4138052463531494, |
|
"learning_rate": 4.616077770537453e-06, |
|
"loss": 0.6231, |
|
"mean_token_accuracy": 0.804220202437573, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 2.1420643729189788, |
|
"grad_norm": 0.4031846523284912, |
|
"learning_rate": 4.586985643347716e-06, |
|
"loss": 0.6353, |
|
"mean_token_accuracy": 0.7999556744979773, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 2.145024047354791, |
|
"grad_norm": 0.4207233190536499, |
|
"learning_rate": 4.557958172439726e-06, |
|
"loss": 0.6519, |
|
"mean_token_accuracy": 0.795605835154003, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 2.1479837217906033, |
|
"grad_norm": 0.4172452390193939, |
|
"learning_rate": 4.5289957045349655e-06, |
|
"loss": 0.6214, |
|
"mean_token_accuracy": 0.8051871043336377, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 2.150943396226415, |
|
"grad_norm": 0.4109727442264557, |
|
"learning_rate": 4.500098585578475e-06, |
|
"loss": 0.62, |
|
"mean_token_accuracy": 0.8044500506016459, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.1539030706622273, |
|
"grad_norm": 0.4343760907649994, |
|
"learning_rate": 4.471267160734731e-06, |
|
"loss": 0.6539, |
|
"mean_token_accuracy": 0.7939436976287444, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 2.156862745098039, |
|
"grad_norm": 0.4174571931362152, |
|
"learning_rate": 4.4425017743835155e-06, |
|
"loss": 0.6371, |
|
"mean_token_accuracy": 0.8005225952205913, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 2.1598224195338513, |
|
"grad_norm": 0.38494619727134705, |
|
"learning_rate": 4.413802770115816e-06, |
|
"loss": 0.6524, |
|
"mean_token_accuracy": 0.7961488383409648, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 2.162782093969663, |
|
"grad_norm": 0.41858088970184326, |
|
"learning_rate": 4.385170490729712e-06, |
|
"loss": 0.6421, |
|
"mean_token_accuracy": 0.7982196911670912, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 2.1657417684054754, |
|
"grad_norm": 0.45224249362945557, |
|
"learning_rate": 4.356605278226274e-06, |
|
"loss": 0.6639, |
|
"mean_token_accuracy": 0.7918000336006263, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.1687014428412876, |
|
"grad_norm": 0.43538355827331543, |
|
"learning_rate": 4.328107473805487e-06, |
|
"loss": 0.6383, |
|
"mean_token_accuracy": 0.800484981130683, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 2.1716611172770994, |
|
"grad_norm": 0.3976902365684509, |
|
"learning_rate": 4.299677417862174e-06, |
|
"loss": 0.6556, |
|
"mean_token_accuracy": 0.7932561264782982, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 2.1746207917129117, |
|
"grad_norm": 0.44118574261665344, |
|
"learning_rate": 4.2713154499819345e-06, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.7921884608817545, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 2.1775804661487235, |
|
"grad_norm": 0.4160580635070801, |
|
"learning_rate": 4.243021908937083e-06, |
|
"loss": 0.6136, |
|
"mean_token_accuracy": 0.8078645092004564, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 2.1805401405845357, |
|
"grad_norm": 0.4081907272338867, |
|
"learning_rate": 4.214797132682597e-06, |
|
"loss": 0.6017, |
|
"mean_token_accuracy": 0.8104744103732681, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.183499815020348, |
|
"grad_norm": 0.4466439187526703, |
|
"learning_rate": 4.186641458352088e-06, |
|
"loss": 0.6713, |
|
"mean_token_accuracy": 0.7900975226524254, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 2.1864594894561598, |
|
"grad_norm": 0.4527799189090729, |
|
"learning_rate": 4.158555222253772e-06, |
|
"loss": 0.6744, |
|
"mean_token_accuracy": 0.7901550404552812, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 2.189419163891972, |
|
"grad_norm": 0.4166731536388397, |
|
"learning_rate": 4.130538759866457e-06, |
|
"loss": 0.6523, |
|
"mean_token_accuracy": 0.795872875107717, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.192378838327784, |
|
"grad_norm": 0.4434090852737427, |
|
"learning_rate": 4.102592405835536e-06, |
|
"loss": 0.6366, |
|
"mean_token_accuracy": 0.8006169174890402, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 2.195338512763596, |
|
"grad_norm": 0.4182213842868805, |
|
"learning_rate": 4.074716493968976e-06, |
|
"loss": 0.6193, |
|
"mean_token_accuracy": 0.8064642927723187, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.1982981871994083, |
|
"grad_norm": 0.4401805102825165, |
|
"learning_rate": 4.046911357233343e-06, |
|
"loss": 0.5899, |
|
"mean_token_accuracy": 0.8129922266946384, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 2.20125786163522, |
|
"grad_norm": 0.4129815697669983, |
|
"learning_rate": 4.019177327749822e-06, |
|
"loss": 0.6164, |
|
"mean_token_accuracy": 0.8067027474840832, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 2.2042175360710323, |
|
"grad_norm": 0.414181649684906, |
|
"learning_rate": 3.991514736790259e-06, |
|
"loss": 0.6572, |
|
"mean_token_accuracy": 0.7943868846552696, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 2.207177210506844, |
|
"grad_norm": 0.41192206740379333, |
|
"learning_rate": 3.9639239147731865e-06, |
|
"loss": 0.6105, |
|
"mean_token_accuracy": 0.8081474157714055, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 2.2101368849426564, |
|
"grad_norm": 0.4337133765220642, |
|
"learning_rate": 3.936405191259891e-06, |
|
"loss": 0.646, |
|
"mean_token_accuracy": 0.7979063248420304, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.213096559378468, |
|
"grad_norm": 0.42786547541618347, |
|
"learning_rate": 3.908958894950465e-06, |
|
"loss": 0.6611, |
|
"mean_token_accuracy": 0.7936699913649292, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 2.2160562338142804, |
|
"grad_norm": 0.45288723707199097, |
|
"learning_rate": 3.881585353679891e-06, |
|
"loss": 0.6648, |
|
"mean_token_accuracy": 0.7914008191748386, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 2.2190159082500927, |
|
"grad_norm": 0.45666372776031494, |
|
"learning_rate": 3.854284894414122e-06, |
|
"loss": 0.6291, |
|
"mean_token_accuracy": 0.8025700241416271, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 2.2219755826859044, |
|
"grad_norm": 0.41519424319267273, |
|
"learning_rate": 3.827057843246181e-06, |
|
"loss": 0.6233, |
|
"mean_token_accuracy": 0.8051112931321951, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 2.2249352571217167, |
|
"grad_norm": 0.42094844579696655, |
|
"learning_rate": 3.799904525392251e-06, |
|
"loss": 0.6083, |
|
"mean_token_accuracy": 0.8084426362380992, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.2278949315575285, |
|
"grad_norm": 0.41918104887008667, |
|
"learning_rate": 3.7728252651878018e-06, |
|
"loss": 0.6584, |
|
"mean_token_accuracy": 0.7924028935909405, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 2.2308546059933407, |
|
"grad_norm": 0.43053704500198364, |
|
"learning_rate": 3.745820386083724e-06, |
|
"loss": 0.6675, |
|
"mean_token_accuracy": 0.7899391245102569, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 2.233814280429153, |
|
"grad_norm": 0.433442085981369, |
|
"learning_rate": 3.718890210642442e-06, |
|
"loss": 0.6606, |
|
"mean_token_accuracy": 0.7937032510168863, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 2.2367739548649648, |
|
"grad_norm": 0.4508717358112335, |
|
"learning_rate": 3.6920350605340883e-06, |
|
"loss": 0.6266, |
|
"mean_token_accuracy": 0.8043645426941337, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 2.239733629300777, |
|
"grad_norm": 0.4047711491584778, |
|
"learning_rate": 3.6652552565326382e-06, |
|
"loss": 0.6681, |
|
"mean_token_accuracy": 0.7906295543184187, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.242693303736589, |
|
"grad_norm": 0.4291645586490631, |
|
"learning_rate": 3.638551118512089e-06, |
|
"loss": 0.6562, |
|
"mean_token_accuracy": 0.7943759677554681, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 2.245652978172401, |
|
"grad_norm": 0.46543434262275696, |
|
"learning_rate": 3.611922965442648e-06, |
|
"loss": 0.6955, |
|
"mean_token_accuracy": 0.7842147288330679, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 2.2486126526082133, |
|
"grad_norm": 0.44530779123306274, |
|
"learning_rate": 3.5853711153868962e-06, |
|
"loss": 0.6443, |
|
"mean_token_accuracy": 0.7977429200467334, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 2.251572327044025, |
|
"grad_norm": 0.4391216039657593, |
|
"learning_rate": 3.558895885496023e-06, |
|
"loss": 0.6551, |
|
"mean_token_accuracy": 0.7939586840706503, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 2.2545320014798373, |
|
"grad_norm": 0.42412394285202026, |
|
"learning_rate": 3.53249759200601e-06, |
|
"loss": 0.6217, |
|
"mean_token_accuracy": 0.8050196332982708, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.257491675915649, |
|
"grad_norm": 0.42387112975120544, |
|
"learning_rate": 3.506176550233863e-06, |
|
"loss": 0.6572, |
|
"mean_token_accuracy": 0.794313531964468, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 2.2604513503514614, |
|
"grad_norm": 0.434893399477005, |
|
"learning_rate": 3.479933074573858e-06, |
|
"loss": 0.6855, |
|
"mean_token_accuracy": 0.7879362757907509, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 2.263411024787273, |
|
"grad_norm": 0.4247857928276062, |
|
"learning_rate": 3.453767478493761e-06, |
|
"loss": 0.644, |
|
"mean_token_accuracy": 0.7982682262279043, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 2.2663706992230854, |
|
"grad_norm": 0.42670580744743347, |
|
"learning_rate": 3.4276800745311135e-06, |
|
"loss": 0.6195, |
|
"mean_token_accuracy": 0.8050541199962113, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 2.2693303736588977, |
|
"grad_norm": 0.3888881504535675, |
|
"learning_rate": 3.401671174289469e-06, |
|
"loss": 0.6515, |
|
"mean_token_accuracy": 0.7958488753426484, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.2722900480947095, |
|
"grad_norm": 0.41099730134010315, |
|
"learning_rate": 3.37574108843469e-06, |
|
"loss": 0.6781, |
|
"mean_token_accuracy": 0.7891008767600376, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 2.2752497225305217, |
|
"grad_norm": 0.41824233531951904, |
|
"learning_rate": 3.3498901266912397e-06, |
|
"loss": 0.6085, |
|
"mean_token_accuracy": 0.8082267427244683, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 2.2782093969663335, |
|
"grad_norm": 0.4144093692302704, |
|
"learning_rate": 3.3241185978384636e-06, |
|
"loss": 0.6699, |
|
"mean_token_accuracy": 0.7909267478796423, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 2.2811690714021458, |
|
"grad_norm": 0.42413535714149475, |
|
"learning_rate": 3.2984268097069284e-06, |
|
"loss": 0.6339, |
|
"mean_token_accuracy": 0.801065864295844, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 2.284128745837958, |
|
"grad_norm": 0.39951691031455994, |
|
"learning_rate": 3.2728150691747117e-06, |
|
"loss": 0.6411, |
|
"mean_token_accuracy": 0.7983959606160835, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.28708842027377, |
|
"grad_norm": 0.41182059049606323, |
|
"learning_rate": 3.2472836821637744e-06, |
|
"loss": 0.6281, |
|
"mean_token_accuracy": 0.802523700960331, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 2.290048094709582, |
|
"grad_norm": 0.4084027409553528, |
|
"learning_rate": 3.22183295363627e-06, |
|
"loss": 0.6265, |
|
"mean_token_accuracy": 0.802413599215893, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 2.293007769145394, |
|
"grad_norm": 0.4154830574989319, |
|
"learning_rate": 3.196463187590929e-06, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.796869447336104, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 2.295967443581206, |
|
"grad_norm": 0.4043501019477844, |
|
"learning_rate": 3.1711746870594083e-06, |
|
"loss": 0.6287, |
|
"mean_token_accuracy": 0.8030152586126692, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 2.2989271180170183, |
|
"grad_norm": 0.4156252443790436, |
|
"learning_rate": 3.145967754102691e-06, |
|
"loss": 0.6372, |
|
"mean_token_accuracy": 0.8003738520892887, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.30188679245283, |
|
"grad_norm": 0.4200536012649536, |
|
"learning_rate": 3.1208426898074685e-06, |
|
"loss": 0.671, |
|
"mean_token_accuracy": 0.7902292574180307, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 2.3048464668886424, |
|
"grad_norm": 0.41020068526268005, |
|
"learning_rate": 3.0957997942825337e-06, |
|
"loss": 0.6371, |
|
"mean_token_accuracy": 0.7996246095334629, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 2.307806141324454, |
|
"grad_norm": 0.419129341840744, |
|
"learning_rate": 3.070839366655215e-06, |
|
"loss": 0.6468, |
|
"mean_token_accuracy": 0.7962623324512614, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 2.3107658157602664, |
|
"grad_norm": 0.4240724444389343, |
|
"learning_rate": 3.045961705067787e-06, |
|
"loss": 0.659, |
|
"mean_token_accuracy": 0.7953217981209549, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 2.313725490196078, |
|
"grad_norm": 0.4143866002559662, |
|
"learning_rate": 3.021167106673928e-06, |
|
"loss": 0.6424, |
|
"mean_token_accuracy": 0.7982811964276817, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.3166851646318904, |
|
"grad_norm": 0.3981107175350189, |
|
"learning_rate": 2.996455867635155e-06, |
|
"loss": 0.6607, |
|
"mean_token_accuracy": 0.791508945971797, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 2.3196448390677027, |
|
"grad_norm": 0.4162614345550537, |
|
"learning_rate": 2.9718282831172885e-06, |
|
"loss": 0.6504, |
|
"mean_token_accuracy": 0.7963113772717785, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 2.3226045135035145, |
|
"grad_norm": 0.4027155637741089, |
|
"learning_rate": 2.94728464728693e-06, |
|
"loss": 0.6019, |
|
"mean_token_accuracy": 0.8109574738019254, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 2.3255641879393267, |
|
"grad_norm": 0.3899628520011902, |
|
"learning_rate": 2.922825253307947e-06, |
|
"loss": 0.6574, |
|
"mean_token_accuracy": 0.7928772726976023, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 2.328523862375139, |
|
"grad_norm": 0.40858355164527893, |
|
"learning_rate": 2.898450393337977e-06, |
|
"loss": 0.6746, |
|
"mean_token_accuracy": 0.7888906732688429, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.3314835368109508, |
|
"grad_norm": 0.41953524947166443, |
|
"learning_rate": 2.8741603585249312e-06, |
|
"loss": 0.6408, |
|
"mean_token_accuracy": 0.7996593220237972, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.334443211246763, |
|
"grad_norm": 0.4106321334838867, |
|
"learning_rate": 2.8499554390035144e-06, |
|
"loss": 0.6483, |
|
"mean_token_accuracy": 0.7961187957538525, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 2.337402885682575, |
|
"grad_norm": 0.3997010886669159, |
|
"learning_rate": 2.8258359238917665e-06, |
|
"loss": 0.6245, |
|
"mean_token_accuracy": 0.8050718498453701, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 2.340362560118387, |
|
"grad_norm": 0.409584105014801, |
|
"learning_rate": 2.8018021012875994e-06, |
|
"loss": 0.6017, |
|
"mean_token_accuracy": 0.8123756950624281, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 2.343322234554199, |
|
"grad_norm": 0.40811508893966675, |
|
"learning_rate": 2.7778542582653746e-06, |
|
"loss": 0.6084, |
|
"mean_token_accuracy": 0.8097888468215142, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.346281908990011, |
|
"grad_norm": 0.38959836959838867, |
|
"learning_rate": 2.753992680872457e-06, |
|
"loss": 0.6062, |
|
"mean_token_accuracy": 0.8108852376474688, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 2.3492415834258233, |
|
"grad_norm": 0.3957045376300812, |
|
"learning_rate": 2.7302176541257984e-06, |
|
"loss": 0.6328, |
|
"mean_token_accuracy": 0.8015920238869745, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 2.352201257861635, |
|
"grad_norm": 0.40360507369041443, |
|
"learning_rate": 2.7065294620085425e-06, |
|
"loss": 0.648, |
|
"mean_token_accuracy": 0.7971657427741622, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 2.3551609322974474, |
|
"grad_norm": 0.45460647344589233, |
|
"learning_rate": 2.6829283874666236e-06, |
|
"loss": 0.6445, |
|
"mean_token_accuracy": 0.7987224105203672, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 2.358120606733259, |
|
"grad_norm": 0.39461758732795715, |
|
"learning_rate": 2.6594147124053983e-06, |
|
"loss": 0.6515, |
|
"mean_token_accuracy": 0.796149561930855, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.3610802811690714, |
|
"grad_norm": 0.41954609751701355, |
|
"learning_rate": 2.635988717686272e-06, |
|
"loss": 0.6246, |
|
"mean_token_accuracy": 0.8032149733919829, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 2.3640399556048832, |
|
"grad_norm": 0.40406131744384766, |
|
"learning_rate": 2.6126506831233343e-06, |
|
"loss": 0.6231, |
|
"mean_token_accuracy": 0.8041168200702946, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 2.3669996300406955, |
|
"grad_norm": 0.3953285217285156, |
|
"learning_rate": 2.5894008874800323e-06, |
|
"loss": 0.6069, |
|
"mean_token_accuracy": 0.8095464392825367, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 2.3699593044765077, |
|
"grad_norm": 0.39232245087623596, |
|
"learning_rate": 2.5662396084658383e-06, |
|
"loss": 0.6887, |
|
"mean_token_accuracy": 0.7847626079340336, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 2.3729189789123195, |
|
"grad_norm": 0.4201255440711975, |
|
"learning_rate": 2.543167122732918e-06, |
|
"loss": 0.6305, |
|
"mean_token_accuracy": 0.8017334424766583, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.3729189789123195, |
|
"eval_loss": 0.7381730079650879, |
|
"eval_mean_token_accuracy": 0.7720131224354058, |
|
"eval_runtime": 24.485, |
|
"eval_samples_per_second": 5.269, |
|
"eval_steps_per_second": 1.348, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.3758786533481318, |
|
"grad_norm": 0.41799813508987427, |
|
"learning_rate": 2.5201837058728506e-06, |
|
"loss": 0.6426, |
|
"mean_token_accuracy": 0.7981351114043707, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 2.378838327783944, |
|
"grad_norm": 0.4000183939933777, |
|
"learning_rate": 2.4972896324133143e-06, |
|
"loss": 0.6408, |
|
"mean_token_accuracy": 0.8005246267043143, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 2.381798002219756, |
|
"grad_norm": 0.3985981345176697, |
|
"learning_rate": 2.474485175814816e-06, |
|
"loss": 0.6419, |
|
"mean_token_accuracy": 0.8006590768326411, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 2.384757676655568, |
|
"grad_norm": 0.39996403455734253, |
|
"learning_rate": 2.451770608467432e-06, |
|
"loss": 0.6556, |
|
"mean_token_accuracy": 0.7937097877818717, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 2.38771735109138, |
|
"grad_norm": 0.40971750020980835, |
|
"learning_rate": 2.429146201687538e-06, |
|
"loss": 0.6544, |
|
"mean_token_accuracy": 0.7955813996484105, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.390677025527192, |
|
"grad_norm": 0.4214819669723511, |
|
"learning_rate": 2.4066122257145898e-06, |
|
"loss": 0.6192, |
|
"mean_token_accuracy": 0.8044043910369116, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 2.393636699963004, |
|
"grad_norm": 0.3935949206352234, |
|
"learning_rate": 2.3841689497078746e-06, |
|
"loss": 0.6616, |
|
"mean_token_accuracy": 0.7930927722183864, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 2.396596374398816, |
|
"grad_norm": 0.4110560715198517, |
|
"learning_rate": 2.361816641743303e-06, |
|
"loss": 0.6589, |
|
"mean_token_accuracy": 0.7933747994521603, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 2.3995560488346284, |
|
"grad_norm": 0.40695828199386597, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.6654, |
|
"mean_token_accuracy": 0.79236514420736, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 2.40251572327044, |
|
"grad_norm": 0.4188994765281677, |
|
"learning_rate": 2.317385996808195e-06, |
|
"loss": 0.6401, |
|
"mean_token_accuracy": 0.7978658874862038, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.4054753977062524, |
|
"grad_norm": 0.4050770699977875, |
|
"learning_rate": 2.295308190543859e-06, |
|
"loss": 0.6565, |
|
"mean_token_accuracy": 0.793120003753917, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 2.408435072142064, |
|
"grad_norm": 0.4208693206310272, |
|
"learning_rate": 2.2733224137277366e-06, |
|
"loss": 0.6625, |
|
"mean_token_accuracy": 0.7924009490317484, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 2.4113947465778764, |
|
"grad_norm": 0.41889867186546326, |
|
"learning_rate": 2.251428928971102e-06, |
|
"loss": 0.6421, |
|
"mean_token_accuracy": 0.7987856486295601, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 2.4143544210136882, |
|
"grad_norm": 0.3941342532634735, |
|
"learning_rate": 2.229627997782834e-06, |
|
"loss": 0.6522, |
|
"mean_token_accuracy": 0.7969981541204149, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 2.4173140954495005, |
|
"grad_norm": 0.4085904061794281, |
|
"learning_rate": 2.2079198805662917e-06, |
|
"loss": 0.636, |
|
"mean_token_accuracy": 0.8006980355838276, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.4202737698853127, |
|
"grad_norm": 0.39825567603111267, |
|
"learning_rate": 2.186304836616221e-06, |
|
"loss": 0.6447, |
|
"mean_token_accuracy": 0.7977600103702366, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 2.4232334443211245, |
|
"grad_norm": 0.40731707215309143, |
|
"learning_rate": 2.1647831241156304e-06, |
|
"loss": 0.6504, |
|
"mean_token_accuracy": 0.7959071538531968, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 2.4261931187569368, |
|
"grad_norm": 0.406483918428421, |
|
"learning_rate": 2.1433550001327376e-06, |
|
"loss": 0.6639, |
|
"mean_token_accuracy": 0.7929632102578547, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 2.429152793192749, |
|
"grad_norm": 0.40205124020576477, |
|
"learning_rate": 2.122020720617869e-06, |
|
"loss": 0.6602, |
|
"mean_token_accuracy": 0.7925995018559459, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 2.432112467628561, |
|
"grad_norm": 0.39821526408195496, |
|
"learning_rate": 2.1007805404004247e-06, |
|
"loss": 0.6125, |
|
"mean_token_accuracy": 0.8074528559405126, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.435072142064373, |
|
"grad_norm": 0.41154807806015015, |
|
"learning_rate": 2.0796347131858187e-06, |
|
"loss": 0.5924, |
|
"mean_token_accuracy": 0.8134260585147182, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 2.438031816500185, |
|
"grad_norm": 0.4058592617511749, |
|
"learning_rate": 2.058583491552465e-06, |
|
"loss": 0.6446, |
|
"mean_token_accuracy": 0.7976544788468782, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 2.440991490935997, |
|
"grad_norm": 0.4115375280380249, |
|
"learning_rate": 2.037627126948751e-06, |
|
"loss": 0.6486, |
|
"mean_token_accuracy": 0.7961866171753605, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 2.443951165371809, |
|
"grad_norm": 0.4094432592391968, |
|
"learning_rate": 2.0167658696900317e-06, |
|
"loss": 0.6498, |
|
"mean_token_accuracy": 0.7974890015343987, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 2.446910839807621, |
|
"grad_norm": 0.4136302173137665, |
|
"learning_rate": 1.9959999689556407e-06, |
|
"loss": 0.6508, |
|
"mean_token_accuracy": 0.7969356095942468, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.4498705142434334, |
|
"grad_norm": 0.3964935839176178, |
|
"learning_rate": 1.9753296727859195e-06, |
|
"loss": 0.6422, |
|
"mean_token_accuracy": 0.7975552703681513, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 2.452830188679245, |
|
"grad_norm": 0.3923938572406769, |
|
"learning_rate": 1.9547552280792528e-06, |
|
"loss": 0.6644, |
|
"mean_token_accuracy": 0.7923696593805352, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 2.4557898631150574, |
|
"grad_norm": 0.40744659304618835, |
|
"learning_rate": 1.9342768805891176e-06, |
|
"loss": 0.6185, |
|
"mean_token_accuracy": 0.805182835348635, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 2.4587495375508692, |
|
"grad_norm": 0.3996569812297821, |
|
"learning_rate": 1.9138948749211473e-06, |
|
"loss": 0.6885, |
|
"mean_token_accuracy": 0.7846693968735795, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 2.4617092119866815, |
|
"grad_norm": 0.41256505250930786, |
|
"learning_rate": 1.8936094545302098e-06, |
|
"loss": 0.633, |
|
"mean_token_accuracy": 0.80203945172239, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.4646688864224937, |
|
"grad_norm": 0.40780341625213623, |
|
"learning_rate": 1.8734208617174986e-06, |
|
"loss": 0.6233, |
|
"mean_token_accuracy": 0.8031928870956203, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 2.4676285608583055, |
|
"grad_norm": 0.40084558725357056, |
|
"learning_rate": 1.8533293376276473e-06, |
|
"loss": 0.6645, |
|
"mean_token_accuracy": 0.7910519113431395, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 0.39891085028648376, |
|
"learning_rate": 1.8333351222458407e-06, |
|
"loss": 0.6457, |
|
"mean_token_accuracy": 0.7975340656719943, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 2.4735479097299296, |
|
"grad_norm": 0.39134928584098816, |
|
"learning_rate": 1.813438454394948e-06, |
|
"loss": 0.666, |
|
"mean_token_accuracy": 0.7898439445031347, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 2.476507584165742, |
|
"grad_norm": 0.41572368144989014, |
|
"learning_rate": 1.7936395717326705e-06, |
|
"loss": 0.6414, |
|
"mean_token_accuracy": 0.7995425811392918, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.479467258601554, |
|
"grad_norm": 0.40483906865119934, |
|
"learning_rate": 1.773938710748706e-06, |
|
"loss": 0.662, |
|
"mean_token_accuracy": 0.7923949344399477, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 2.482426933037366, |
|
"grad_norm": 0.40634220838546753, |
|
"learning_rate": 1.7543361067619269e-06, |
|
"loss": 0.615, |
|
"mean_token_accuracy": 0.806526275556733, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 2.485386607473178, |
|
"grad_norm": 0.4077673852443695, |
|
"learning_rate": 1.734831993917564e-06, |
|
"loss": 0.6328, |
|
"mean_token_accuracy": 0.802378745198797, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 2.48834628190899, |
|
"grad_norm": 0.39237353205680847, |
|
"learning_rate": 1.715426605184407e-06, |
|
"loss": 0.6155, |
|
"mean_token_accuracy": 0.8060445709769514, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 2.491305956344802, |
|
"grad_norm": 0.4081886112689972, |
|
"learning_rate": 1.6961201723520248e-06, |
|
"loss": 0.6144, |
|
"mean_token_accuracy": 0.8065054898817852, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.494265630780614, |
|
"grad_norm": 0.4004494547843933, |
|
"learning_rate": 1.676912926028007e-06, |
|
"loss": 0.6055, |
|
"mean_token_accuracy": 0.809148562640221, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 2.497225305216426, |
|
"grad_norm": 0.4009197950363159, |
|
"learning_rate": 1.6578050956351887e-06, |
|
"loss": 0.6238, |
|
"mean_token_accuracy": 0.8049418801942305, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 2.5001849796522384, |
|
"grad_norm": 0.3991737365722656, |
|
"learning_rate": 1.6387969094089318e-06, |
|
"loss": 0.6176, |
|
"mean_token_accuracy": 0.8066388869046413, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 2.50314465408805, |
|
"grad_norm": 0.4157380163669586, |
|
"learning_rate": 1.619888594394382e-06, |
|
"loss": 0.6469, |
|
"mean_token_accuracy": 0.7967652752528133, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 2.5061043285238624, |
|
"grad_norm": 0.41072478890419006, |
|
"learning_rate": 1.6010803764437633e-06, |
|
"loss": 0.6285, |
|
"mean_token_accuracy": 0.8027356011802552, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.5090640029596747, |
|
"grad_norm": 0.38374269008636475, |
|
"learning_rate": 1.5823724802136863e-06, |
|
"loss": 0.6655, |
|
"mean_token_accuracy": 0.7921055036509936, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 2.5120236773954865, |
|
"grad_norm": 0.3918653130531311, |
|
"learning_rate": 1.5637651291624522e-06, |
|
"loss": 0.6541, |
|
"mean_token_accuracy": 0.7957731421640813, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 2.5149833518312983, |
|
"grad_norm": 0.4183335602283478, |
|
"learning_rate": 1.545258545547398e-06, |
|
"loss": 0.6808, |
|
"mean_token_accuracy": 0.7860103025645604, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 2.5179430262671105, |
|
"grad_norm": 0.40748029947280884, |
|
"learning_rate": 1.5268529504222262e-06, |
|
"loss": 0.6476, |
|
"mean_token_accuracy": 0.7958813429391195, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 2.5209027007029228, |
|
"grad_norm": 0.4112967550754547, |
|
"learning_rate": 1.5085485636343755e-06, |
|
"loss": 0.6305, |
|
"mean_token_accuracy": 0.8016536067152452, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.5238623751387346, |
|
"grad_norm": 0.40055161714553833, |
|
"learning_rate": 1.4903456038223941e-06, |
|
"loss": 0.6374, |
|
"mean_token_accuracy": 0.799591641647149, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 2.526822049574547, |
|
"grad_norm": 0.39930155873298645, |
|
"learning_rate": 1.4722442884133214e-06, |
|
"loss": 0.5796, |
|
"mean_token_accuracy": 0.8175529008877027, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 2.529781724010359, |
|
"grad_norm": 0.3882523477077484, |
|
"learning_rate": 1.4542448336201021e-06, |
|
"loss": 0.646, |
|
"mean_token_accuracy": 0.7965177087401804, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 2.532741398446171, |
|
"grad_norm": 0.4089968502521515, |
|
"learning_rate": 1.4363474544389876e-06, |
|
"loss": 0.6288, |
|
"mean_token_accuracy": 0.8025391764757291, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 2.535701072881983, |
|
"grad_norm": 0.39754486083984375, |
|
"learning_rate": 1.4185523646469822e-06, |
|
"loss": 0.6461, |
|
"mean_token_accuracy": 0.7974458592055889, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.538660747317795, |
|
"grad_norm": 0.429750919342041, |
|
"learning_rate": 1.4008597767992872e-06, |
|
"loss": 0.6118, |
|
"mean_token_accuracy": 0.8093011527301119, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 2.541620421753607, |
|
"grad_norm": 0.38371750712394714, |
|
"learning_rate": 1.3832699022267516e-06, |
|
"loss": 0.6399, |
|
"mean_token_accuracy": 0.7980772590627099, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 2.544580096189419, |
|
"grad_norm": 0.41115689277648926, |
|
"learning_rate": 1.3657829510333653e-06, |
|
"loss": 0.6633, |
|
"mean_token_accuracy": 0.7933955020310409, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 2.547539770625231, |
|
"grad_norm": 0.406768262386322, |
|
"learning_rate": 1.3483991320937307e-06, |
|
"loss": 0.6368, |
|
"mean_token_accuracy": 0.8023250526600325, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 2.5504994450610434, |
|
"grad_norm": 0.4091865122318268, |
|
"learning_rate": 1.3311186530505838e-06, |
|
"loss": 0.6189, |
|
"mean_token_accuracy": 0.8061198976192254, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.5534591194968552, |
|
"grad_norm": 0.385766863822937, |
|
"learning_rate": 1.313941720312303e-06, |
|
"loss": 0.6262, |
|
"mean_token_accuracy": 0.8026254886335932, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 2.5564187939326675, |
|
"grad_norm": 0.403012216091156, |
|
"learning_rate": 1.2968685390504465e-06, |
|
"loss": 0.622, |
|
"mean_token_accuracy": 0.8041227440695632, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 2.5593784683684797, |
|
"grad_norm": 0.3971555829048157, |
|
"learning_rate": 1.2798993131973093e-06, |
|
"loss": 0.6745, |
|
"mean_token_accuracy": 0.7896582637305288, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 2.5623381428042915, |
|
"grad_norm": 0.40315189957618713, |
|
"learning_rate": 1.263034245443473e-06, |
|
"loss": 0.6563, |
|
"mean_token_accuracy": 0.7948344293273772, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 2.5652978172401038, |
|
"grad_norm": 0.40858373045921326, |
|
"learning_rate": 1.2462735372353996e-06, |
|
"loss": 0.6228, |
|
"mean_token_accuracy": 0.8045441140339781, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.5682574916759155, |
|
"grad_norm": 0.3969631493091583, |
|
"learning_rate": 1.2296173887730122e-06, |
|
"loss": 0.6345, |
|
"mean_token_accuracy": 0.8022942568625994, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 2.571217166111728, |
|
"grad_norm": 0.39615315198898315, |
|
"learning_rate": 1.2130659990073146e-06, |
|
"loss": 0.6356, |
|
"mean_token_accuracy": 0.7998559942550404, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 2.5741768405475396, |
|
"grad_norm": 0.38922396302223206, |
|
"learning_rate": 1.196619565638003e-06, |
|
"loss": 0.6286, |
|
"mean_token_accuracy": 0.8018824489890675, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 2.577136514983352, |
|
"grad_norm": 0.4000704288482666, |
|
"learning_rate": 1.1802782851111206e-06, |
|
"loss": 0.6418, |
|
"mean_token_accuracy": 0.7989303050191064, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 2.580096189419164, |
|
"grad_norm": 0.39476409554481506, |
|
"learning_rate": 1.1640423526166987e-06, |
|
"loss": 0.6445, |
|
"mean_token_accuracy": 0.797418578107648, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.583055863854976, |
|
"grad_norm": 0.39660996198654175, |
|
"learning_rate": 1.1479119620864277e-06, |
|
"loss": 0.6575, |
|
"mean_token_accuracy": 0.795806747653712, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 2.586015538290788, |
|
"grad_norm": 0.41734716296195984, |
|
"learning_rate": 1.1318873061913405e-06, |
|
"loss": 0.5882, |
|
"mean_token_accuracy": 0.8143113885996807, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 2.5889752127266, |
|
"grad_norm": 0.3729105293750763, |
|
"learning_rate": 1.1159685763395113e-06, |
|
"loss": 0.64, |
|
"mean_token_accuracy": 0.7987188883545505, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 2.591934887162412, |
|
"grad_norm": 0.3946407437324524, |
|
"learning_rate": 1.1001559626737757e-06, |
|
"loss": 0.6418, |
|
"mean_token_accuracy": 0.798503030470437, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 2.594894561598224, |
|
"grad_norm": 0.4132760763168335, |
|
"learning_rate": 1.0844496540694515e-06, |
|
"loss": 0.6267, |
|
"mean_token_accuracy": 0.8039569693853369, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.597854236034036, |
|
"grad_norm": 0.40759339928627014, |
|
"learning_rate": 1.0688498381320855e-06, |
|
"loss": 0.6318, |
|
"mean_token_accuracy": 0.8012822502344166, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 2.6008139104698484, |
|
"grad_norm": 0.41799381375312805, |
|
"learning_rate": 1.0533567011952094e-06, |
|
"loss": 0.6464, |
|
"mean_token_accuracy": 0.7964816550323018, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 2.6037735849056602, |
|
"grad_norm": 0.4257717430591583, |
|
"learning_rate": 1.037970428318118e-06, |
|
"loss": 0.6841, |
|
"mean_token_accuracy": 0.7837483957536826, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 2.6067332593414725, |
|
"grad_norm": 0.4087117314338684, |
|
"learning_rate": 1.022691203283661e-06, |
|
"loss": 0.6507, |
|
"mean_token_accuracy": 0.7950712747355096, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 2.6096929337772847, |
|
"grad_norm": 0.41945111751556396, |
|
"learning_rate": 1.0075192085960451e-06, |
|
"loss": 0.6678, |
|
"mean_token_accuracy": 0.7909589594797406, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.6126526082130965, |
|
"grad_norm": 0.398735374212265, |
|
"learning_rate": 9.924546254786493e-07, |
|
"loss": 0.6316, |
|
"mean_token_accuracy": 0.8018926205701773, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 2.6156122826489088, |
|
"grad_norm": 0.406318724155426, |
|
"learning_rate": 9.77497633871868e-07, |
|
"loss": 0.6054, |
|
"mean_token_accuracy": 0.8093279590843514, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 2.6185719570847206, |
|
"grad_norm": 0.3851606845855713, |
|
"learning_rate": 9.62648412430951e-07, |
|
"loss": 0.6791, |
|
"mean_token_accuracy": 0.7881774140441217, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 2.621531631520533, |
|
"grad_norm": 0.4061947762966156, |
|
"learning_rate": 9.479071385238892e-07, |
|
"loss": 0.6212, |
|
"mean_token_accuracy": 0.8042670614990748, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 2.6244913059563446, |
|
"grad_norm": 0.39614221453666687, |
|
"learning_rate": 9.332739882292752e-07, |
|
"loss": 0.6296, |
|
"mean_token_accuracy": 0.8017565837535566, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.627450980392157, |
|
"grad_norm": 0.3858533501625061, |
|
"learning_rate": 9.187491363342094e-07, |
|
"loss": 0.5922, |
|
"mean_token_accuracy": 0.8143832301495489, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 2.630410654827969, |
|
"grad_norm": 0.39614781737327576, |
|
"learning_rate": 9.043327563322113e-07, |
|
"loss": 0.6387, |
|
"mean_token_accuracy": 0.799956339899957, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 2.633370329263781, |
|
"grad_norm": 0.38962864875793457, |
|
"learning_rate": 8.900250204211513e-07, |
|
"loss": 0.626, |
|
"mean_token_accuracy": 0.8054223234361488, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 2.636330003699593, |
|
"grad_norm": 0.38743823766708374, |
|
"learning_rate": 8.758260995011825e-07, |
|
"loss": 0.6249, |
|
"mean_token_accuracy": 0.8041963208824743, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 2.6392896781354054, |
|
"grad_norm": 0.38722845911979675, |
|
"learning_rate": 8.617361631727139e-07, |
|
"loss": 0.637, |
|
"mean_token_accuracy": 0.7999073170969193, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.642249352571217, |
|
"grad_norm": 0.38422495126724243, |
|
"learning_rate": 8.477553797343729e-07, |
|
"loss": 0.5932, |
|
"mean_token_accuracy": 0.8125740456037845, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 2.645209027007029, |
|
"grad_norm": 0.3883955180644989, |
|
"learning_rate": 8.338839161809997e-07, |
|
"loss": 0.6259, |
|
"mean_token_accuracy": 0.8034302437405634, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 2.648168701442841, |
|
"grad_norm": 0.413769394159317, |
|
"learning_rate": 8.201219382016556e-07, |
|
"loss": 0.6425, |
|
"mean_token_accuracy": 0.7988244713424745, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 2.6511283758786535, |
|
"grad_norm": 0.3942348062992096, |
|
"learning_rate": 8.06469610177636e-07, |
|
"loss": 0.6366, |
|
"mean_token_accuracy": 0.800066869045331, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 2.6540880503144653, |
|
"grad_norm": 0.3790660500526428, |
|
"learning_rate": 7.92927095180518e-07, |
|
"loss": 0.6505, |
|
"mean_token_accuracy": 0.795845314542134, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 2.6570477247502775, |
|
"grad_norm": 0.42260193824768066, |
|
"learning_rate": 7.794945549701993e-07, |
|
"loss": 0.6085, |
|
"mean_token_accuracy": 0.8089679902729355, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 2.6600073991860897, |
|
"grad_norm": 0.37863457202911377, |
|
"learning_rate": 7.661721499929753e-07, |
|
"loss": 0.608, |
|
"mean_token_accuracy": 0.8079819508856279, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 2.6629670736219015, |
|
"grad_norm": 0.4104274809360504, |
|
"learning_rate": 7.529600393796232e-07, |
|
"loss": 0.6343, |
|
"mean_token_accuracy": 0.8013414635641989, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 2.665926748057714, |
|
"grad_norm": 0.4015280604362488, |
|
"learning_rate": 7.398583809434944e-07, |
|
"loss": 0.6194, |
|
"mean_token_accuracy": 0.8067789013401996, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 2.6688864224935256, |
|
"grad_norm": 0.3843616247177124, |
|
"learning_rate": 7.268673311786378e-07, |
|
"loss": 0.655, |
|
"mean_token_accuracy": 0.7944493186314524, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.6688864224935256, |
|
"eval_loss": 0.737091064453125, |
|
"eval_mean_token_accuracy": 0.7722201670436681, |
|
"eval_runtime": 24.4823, |
|
"eval_samples_per_second": 5.269, |
|
"eval_steps_per_second": 1.348, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.671846096929338, |
|
"grad_norm": 0.40167438983917236, |
|
"learning_rate": 7.1398704525792e-07, |
|
"loss": 0.6665, |
|
"mean_token_accuracy": 0.7904682922183952, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 2.6748057713651496, |
|
"grad_norm": 0.4117159843444824, |
|
"learning_rate": 7.012176770311863e-07, |
|
"loss": 0.6622, |
|
"mean_token_accuracy": 0.7920689961190451, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 2.677765445800962, |
|
"grad_norm": 0.39613744616508484, |
|
"learning_rate": 6.885593790234057e-07, |
|
"loss": 0.6376, |
|
"mean_token_accuracy": 0.799410845334018, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 2.680725120236774, |
|
"grad_norm": 0.38793283700942993, |
|
"learning_rate": 6.760123024328624e-07, |
|
"loss": 0.6141, |
|
"mean_token_accuracy": 0.8077387547151241, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 2.683684794672586, |
|
"grad_norm": 0.38844698667526245, |
|
"learning_rate": 6.635765971293484e-07, |
|
"loss": 0.6559, |
|
"mean_token_accuracy": 0.794430430660069, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 2.686644469108398, |
|
"grad_norm": 0.3850746154785156, |
|
"learning_rate": 6.512524116523633e-07, |
|
"loss": 0.627, |
|
"mean_token_accuracy": 0.8037230062591546, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 2.6896041435442104, |
|
"grad_norm": 0.3915550708770752, |
|
"learning_rate": 6.390398932093555e-07, |
|
"loss": 0.6077, |
|
"mean_token_accuracy": 0.8080517778457975, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 2.692563817980022, |
|
"grad_norm": 0.37720099091529846, |
|
"learning_rate": 6.269391876739494e-07, |
|
"loss": 0.6301, |
|
"mean_token_accuracy": 0.8039389719388176, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 2.695523492415834, |
|
"grad_norm": 0.3923218250274658, |
|
"learning_rate": 6.149504395842087e-07, |
|
"loss": 0.6148, |
|
"mean_token_accuracy": 0.8082143968389491, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 2.6984831668516462, |
|
"grad_norm": 0.39484548568725586, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.6583, |
|
"mean_token_accuracy": 0.7938795460478842, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.7014428412874585, |
|
"grad_norm": 0.40009021759033203, |
|
"learning_rate": 5.913093872058528e-07, |
|
"loss": 0.6608, |
|
"mean_token_accuracy": 0.793614022788515, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 2.7044025157232703, |
|
"grad_norm": 0.40624064207077026, |
|
"learning_rate": 5.796573653001091e-07, |
|
"loss": 0.6335, |
|
"mean_token_accuracy": 0.8018102965988579, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 2.7073621901590825, |
|
"grad_norm": 0.4008027911186218, |
|
"learning_rate": 5.681178656024055e-07, |
|
"loss": 0.6626, |
|
"mean_token_accuracy": 0.7932069577957652, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 2.7103218645948948, |
|
"grad_norm": 0.40246814489364624, |
|
"learning_rate": 5.56691025947429e-07, |
|
"loss": 0.6378, |
|
"mean_token_accuracy": 0.800000261371183, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 2.7132815390307066, |
|
"grad_norm": 0.37238821387290955, |
|
"learning_rate": 5.453769828241872e-07, |
|
"loss": 0.6268, |
|
"mean_token_accuracy": 0.8024412909252127, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 2.716241213466519, |
|
"grad_norm": 0.39563846588134766, |
|
"learning_rate": 5.341758713743828e-07, |
|
"loss": 0.6596, |
|
"mean_token_accuracy": 0.7931748591712275, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 2.7192008879023306, |
|
"grad_norm": 0.3933393061161041, |
|
"learning_rate": 5.230878253907911e-07, |
|
"loss": 0.6416, |
|
"mean_token_accuracy": 0.7995262716287037, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 2.722160562338143, |
|
"grad_norm": 0.3950590193271637, |
|
"learning_rate": 5.121129773156663e-07, |
|
"loss": 0.6771, |
|
"mean_token_accuracy": 0.7878128871617898, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 2.7251202367739547, |
|
"grad_norm": 0.41165900230407715, |
|
"learning_rate": 5.012514582391592e-07, |
|
"loss": 0.6194, |
|
"mean_token_accuracy": 0.805260790188586, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 2.728079911209767, |
|
"grad_norm": 0.3828143775463104, |
|
"learning_rate": 4.905033978977492e-07, |
|
"loss": 0.6285, |
|
"mean_token_accuracy": 0.8036274550004541, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.731039585645579, |
|
"grad_norm": 0.3781799077987671, |
|
"learning_rate": 4.798689246727006e-07, |
|
"loss": 0.6143, |
|
"mean_token_accuracy": 0.8072609168484571, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 2.733999260081391, |
|
"grad_norm": 0.3903900682926178, |
|
"learning_rate": 4.693481655885257e-07, |
|
"loss": 0.6698, |
|
"mean_token_accuracy": 0.7922049787058092, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 2.736958934517203, |
|
"grad_norm": 0.3956415355205536, |
|
"learning_rate": 4.58941246311464e-07, |
|
"loss": 0.6301, |
|
"mean_token_accuracy": 0.8028085591716645, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 2.7399186089530154, |
|
"grad_norm": 0.3861734867095947, |
|
"learning_rate": 4.4864829114798394e-07, |
|
"loss": 0.6371, |
|
"mean_token_accuracy": 0.8004312278302195, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 2.742878283388827, |
|
"grad_norm": 0.3868809640407562, |
|
"learning_rate": 4.384694230432984e-07, |
|
"loss": 0.5952, |
|
"mean_token_accuracy": 0.8138450723816196, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.745837957824639, |
|
"grad_norm": 0.3856772780418396, |
|
"learning_rate": 4.2840476357989825e-07, |
|
"loss": 0.611, |
|
"mean_token_accuracy": 0.80796215409744, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 2.7487976322604513, |
|
"grad_norm": 0.404486745595932, |
|
"learning_rate": 4.184544329761009e-07, |
|
"loss": 0.6209, |
|
"mean_token_accuracy": 0.8057150436844314, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 2.7517573066962635, |
|
"grad_norm": 0.3897272050380707, |
|
"learning_rate": 4.0861855008460403e-07, |
|
"loss": 0.6327, |
|
"mean_token_accuracy": 0.8016584740172387, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 2.7547169811320753, |
|
"grad_norm": 0.3906909227371216, |
|
"learning_rate": 3.988972323910778e-07, |
|
"loss": 0.6181, |
|
"mean_token_accuracy": 0.805539043349179, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 2.7576766555678875, |
|
"grad_norm": 0.38629284501075745, |
|
"learning_rate": 3.8929059601275463e-07, |
|
"loss": 0.6256, |
|
"mean_token_accuracy": 0.8029286474181538, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.7606363300037, |
|
"grad_norm": 0.4061240255832672, |
|
"learning_rate": 3.797987556970495e-07, |
|
"loss": 0.6719, |
|
"mean_token_accuracy": 0.7906059984731508, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 2.7635960044395116, |
|
"grad_norm": 0.40067771077156067, |
|
"learning_rate": 3.7042182482018074e-07, |
|
"loss": 0.6271, |
|
"mean_token_accuracy": 0.8041936678142166, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 2.766555678875324, |
|
"grad_norm": 0.3809727132320404, |
|
"learning_rate": 3.611599153858214e-07, |
|
"loss": 0.6769, |
|
"mean_token_accuracy": 0.7875104091416671, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 2.7695153533111356, |
|
"grad_norm": 0.40350061655044556, |
|
"learning_rate": 3.520131380237546e-07, |
|
"loss": 0.6647, |
|
"mean_token_accuracy": 0.7917032324367623, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 2.772475027746948, |
|
"grad_norm": 0.4117463529109955, |
|
"learning_rate": 3.429816019885657e-07, |
|
"loss": 0.6811, |
|
"mean_token_accuracy": 0.787343757534307, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 2.7754347021827597, |
|
"grad_norm": 0.3994939923286438, |
|
"learning_rate": 3.3406541515832e-07, |
|
"loss": 0.6786, |
|
"mean_token_accuracy": 0.7861266133562229, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 2.778394376618572, |
|
"grad_norm": 0.39691928029060364, |
|
"learning_rate": 3.252646840332918e-07, |
|
"loss": 0.6468, |
|
"mean_token_accuracy": 0.7971869583236945, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 2.781354051054384, |
|
"grad_norm": 0.37808868288993835, |
|
"learning_rate": 3.16579513734675e-07, |
|
"loss": 0.6259, |
|
"mean_token_accuracy": 0.8036837288252531, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 2.784313725490196, |
|
"grad_norm": 0.38705241680145264, |
|
"learning_rate": 3.080100080033388e-07, |
|
"loss": 0.622, |
|
"mean_token_accuracy": 0.8054349345477914, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 2.787273399926008, |
|
"grad_norm": 0.37049245834350586, |
|
"learning_rate": 2.995562691985898e-07, |
|
"loss": 0.6281, |
|
"mean_token_accuracy": 0.802922693455199, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.7902330743618204, |
|
"grad_norm": 0.4022907316684723, |
|
"learning_rate": 2.9121839829693857e-07, |
|
"loss": 0.6193, |
|
"mean_token_accuracy": 0.8052185953687516, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 2.7931927487976322, |
|
"grad_norm": 0.40110448002815247, |
|
"learning_rate": 2.829964948909048e-07, |
|
"loss": 0.6233, |
|
"mean_token_accuracy": 0.8038183781558145, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 2.7961524232334445, |
|
"grad_norm": 0.3919583857059479, |
|
"learning_rate": 2.748906571878207e-07, |
|
"loss": 0.6603, |
|
"mean_token_accuracy": 0.7946063609111435, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 2.7991120976692563, |
|
"grad_norm": 0.39748555421829224, |
|
"learning_rate": 2.6690098200866097e-07, |
|
"loss": 0.6416, |
|
"mean_token_accuracy": 0.7996132256535484, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 2.8020717721050685, |
|
"grad_norm": 0.40067169070243835, |
|
"learning_rate": 2.5902756478688674e-07, |
|
"loss": 0.6431, |
|
"mean_token_accuracy": 0.7986862031085916, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 2.8050314465408803, |
|
"grad_norm": 0.3947811722755432, |
|
"learning_rate": 2.5127049956730207e-07, |
|
"loss": 0.6424, |
|
"mean_token_accuracy": 0.797873089467536, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 2.8079911209766926, |
|
"grad_norm": 0.38122984766960144, |
|
"learning_rate": 2.436298790049363e-07, |
|
"loss": 0.6656, |
|
"mean_token_accuracy": 0.7921808444907809, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 2.810950795412505, |
|
"grad_norm": 0.3970412611961365, |
|
"learning_rate": 2.3610579436392999e-07, |
|
"loss": 0.6454, |
|
"mean_token_accuracy": 0.798217491279841, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 2.8139104698483166, |
|
"grad_norm": 0.39274781942367554, |
|
"learning_rate": 2.2869833551645293e-07, |
|
"loss": 0.6462, |
|
"mean_token_accuracy": 0.7971693406963306, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 2.816870144284129, |
|
"grad_norm": 0.38875052332878113, |
|
"learning_rate": 2.2140759094162468e-07, |
|
"loss": 0.6447, |
|
"mean_token_accuracy": 0.7977855648395308, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.8198298187199407, |
|
"grad_norm": 0.39083102345466614, |
|
"learning_rate": 2.1423364772445886e-07, |
|
"loss": 0.6233, |
|
"mean_token_accuracy": 0.8038262482281366, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 2.822789493155753, |
|
"grad_norm": 0.3867531716823578, |
|
"learning_rate": 2.071765915548274e-07, |
|
"loss": 0.6872, |
|
"mean_token_accuracy": 0.7856023656322098, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 2.8257491675915647, |
|
"grad_norm": 0.39589664340019226, |
|
"learning_rate": 2.002365067264289e-07, |
|
"loss": 0.6737, |
|
"mean_token_accuracy": 0.7887226540574725, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 2.828708842027377, |
|
"grad_norm": 0.41389018297195435, |
|
"learning_rate": 1.9341347613579086e-07, |
|
"loss": 0.6184, |
|
"mean_token_accuracy": 0.8065408919751612, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 2.831668516463189, |
|
"grad_norm": 0.4138829708099365, |
|
"learning_rate": 1.867075812812691e-07, |
|
"loss": 0.6391, |
|
"mean_token_accuracy": 0.8007256177269018, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 2.834628190899001, |
|
"grad_norm": 0.384776771068573, |
|
"learning_rate": 1.8011890226208527e-07, |
|
"loss": 0.613, |
|
"mean_token_accuracy": 0.8072664965020259, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 2.837587865334813, |
|
"grad_norm": 0.37912535667419434, |
|
"learning_rate": 1.7364751777736334e-07, |
|
"loss": 0.6509, |
|
"mean_token_accuracy": 0.7937373916975208, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 2.8405475397706255, |
|
"grad_norm": 0.39359596371650696, |
|
"learning_rate": 1.6729350512519006e-07, |
|
"loss": 0.6386, |
|
"mean_token_accuracy": 0.8000337051550754, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 2.8435072142064373, |
|
"grad_norm": 0.3822968602180481, |
|
"learning_rate": 1.6105694020169594e-07, |
|
"loss": 0.6322, |
|
"mean_token_accuracy": 0.8005505544311058, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 2.8464668886422495, |
|
"grad_norm": 0.376174658536911, |
|
"learning_rate": 1.5493789750014032e-07, |
|
"loss": 0.6178, |
|
"mean_token_accuracy": 0.8039858290743149, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.8494265630780613, |
|
"grad_norm": 0.388172447681427, |
|
"learning_rate": 1.489364501100332e-07, |
|
"loss": 0.6551, |
|
"mean_token_accuracy": 0.7960565577797374, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 2.8523862375138735, |
|
"grad_norm": 0.3901033103466034, |
|
"learning_rate": 1.430526697162482e-07, |
|
"loss": 0.645, |
|
"mean_token_accuracy": 0.7975772604806072, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 2.8553459119496853, |
|
"grad_norm": 0.3848772943019867, |
|
"learning_rate": 1.3728662659818205e-07, |
|
"loss": 0.6037, |
|
"mean_token_accuracy": 0.8107080863727026, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 2.8583055863854976, |
|
"grad_norm": 0.38093602657318115, |
|
"learning_rate": 1.3163838962890196e-07, |
|
"loss": 0.6602, |
|
"mean_token_accuracy": 0.7919608208568516, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 2.86126526082131, |
|
"grad_norm": 0.3964565396308899, |
|
"learning_rate": 1.2610802627432972e-07, |
|
"loss": 0.6427, |
|
"mean_token_accuracy": 0.798779575468243, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 2.8642249352571216, |
|
"grad_norm": 0.41397061944007874, |
|
"learning_rate": 1.206956025924333e-07, |
|
"loss": 0.6266, |
|
"mean_token_accuracy": 0.8023839610262327, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 2.867184609692934, |
|
"grad_norm": 0.3790512681007385, |
|
"learning_rate": 1.1540118323243866e-07, |
|
"loss": 0.5703, |
|
"mean_token_accuracy": 0.8205281597109083, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 2.870144284128746, |
|
"grad_norm": 0.3819893002510071, |
|
"learning_rate": 1.1022483143405705e-07, |
|
"loss": 0.6072, |
|
"mean_token_accuracy": 0.8105382446606855, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 2.873103958564558, |
|
"grad_norm": 0.38210329413414, |
|
"learning_rate": 1.0516660902673448e-07, |
|
"loss": 0.6473, |
|
"mean_token_accuracy": 0.7979098356765058, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 2.8760636330003697, |
|
"grad_norm": 0.3830581307411194, |
|
"learning_rate": 1.0022657642890232e-07, |
|
"loss": 0.6233, |
|
"mean_token_accuracy": 0.8037948333368617, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.879023307436182, |
|
"grad_norm": 0.39410918951034546, |
|
"learning_rate": 9.540479264726676e-08, |
|
"loss": 0.6517, |
|
"mean_token_accuracy": 0.7945131404435056, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 2.881982981871994, |
|
"grad_norm": 0.38177594542503357, |
|
"learning_rate": 9.070131527609604e-08, |
|
"loss": 0.6083, |
|
"mean_token_accuracy": 0.8094474921741853, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 2.884942656307806, |
|
"grad_norm": 0.3808548152446747, |
|
"learning_rate": 8.61162004965388e-08, |
|
"loss": 0.6347, |
|
"mean_token_accuracy": 0.8012875105708535, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 2.8879023307436182, |
|
"grad_norm": 0.4010704755783081, |
|
"learning_rate": 8.16495030759501e-08, |
|
"loss": 0.67, |
|
"mean_token_accuracy": 0.7904797064838223, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 2.8908620051794305, |
|
"grad_norm": 0.3918650448322296, |
|
"learning_rate": 7.730127636723539e-08, |
|
"loss": 0.6005, |
|
"mean_token_accuracy": 0.8118496389421752, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.8938216796152423, |
|
"grad_norm": 0.3898662030696869, |
|
"learning_rate": 7.307157230821426e-08, |
|
"loss": 0.6453, |
|
"mean_token_accuracy": 0.7980052666038159, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 2.8967813540510545, |
|
"grad_norm": 0.39199164509773254, |
|
"learning_rate": 6.896044142100433e-08, |
|
"loss": 0.6576, |
|
"mean_token_accuracy": 0.7941206706838407, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 2.8997410284868663, |
|
"grad_norm": 0.40657898783683777, |
|
"learning_rate": 6.496793281141056e-08, |
|
"loss": 0.6771, |
|
"mean_token_accuracy": 0.7881219963995537, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 2.9027007029226786, |
|
"grad_norm": 0.3844878673553467, |
|
"learning_rate": 6.109409416834689e-08, |
|
"loss": 0.6412, |
|
"mean_token_accuracy": 0.7994358237487954, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 2.9056603773584904, |
|
"grad_norm": 0.396533340215683, |
|
"learning_rate": 5.7338971763256646e-08, |
|
"loss": 0.6225, |
|
"mean_token_accuracy": 0.8051790813619156, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.9086200517943026, |
|
"grad_norm": 0.3754301369190216, |
|
"learning_rate": 5.37026104495697e-08, |
|
"loss": 0.6305, |
|
"mean_token_accuracy": 0.8030408479316886, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 2.911579726230115, |
|
"grad_norm": 0.40677276253700256, |
|
"learning_rate": 5.0185053662161756e-08, |
|
"loss": 0.6322, |
|
"mean_token_accuracy": 0.8007158859109983, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 2.9145394006659266, |
|
"grad_norm": 0.3934902250766754, |
|
"learning_rate": 4.678634341683252e-08, |
|
"loss": 0.6222, |
|
"mean_token_accuracy": 0.804753444318889, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 2.917499075101739, |
|
"grad_norm": 0.3706609904766083, |
|
"learning_rate": 4.350652030981395e-08, |
|
"loss": 0.6447, |
|
"mean_token_accuracy": 0.7980685126294768, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 2.920458749537551, |
|
"grad_norm": 0.39183953404426575, |
|
"learning_rate": 4.0345623517273894e-08, |
|
"loss": 0.6267, |
|
"mean_token_accuracy": 0.8036114839333938, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 2.923418423973363, |
|
"grad_norm": 0.3982419967651367, |
|
"learning_rate": 3.7303690794854296e-08, |
|
"loss": 0.7065, |
|
"mean_token_accuracy": 0.7786746050545399, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 2.9263780984091747, |
|
"grad_norm": 0.4075382649898529, |
|
"learning_rate": 3.438075847721933e-08, |
|
"loss": 0.585, |
|
"mean_token_accuracy": 0.8150346535603673, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 2.929337772844987, |
|
"grad_norm": 0.3877173662185669, |
|
"learning_rate": 3.157686147762129e-08, |
|
"loss": 0.6477, |
|
"mean_token_accuracy": 0.7976473361920864, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 2.932297447280799, |
|
"grad_norm": 0.38589945435523987, |
|
"learning_rate": 2.8892033287484245e-08, |
|
"loss": 0.664, |
|
"mean_token_accuracy": 0.79294201748894, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 2.935257121716611, |
|
"grad_norm": 0.38838937878608704, |
|
"learning_rate": 2.6326305976001054e-08, |
|
"loss": 0.6019, |
|
"mean_token_accuracy": 0.8104530195130828, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.9382167961524233, |
|
"grad_norm": 0.3857711851596832, |
|
"learning_rate": 2.3879710189753657e-08, |
|
"loss": 0.6397, |
|
"mean_token_accuracy": 0.798951730040894, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.38144826889038086, |
|
"learning_rate": 2.1552275152346702e-08, |
|
"loss": 0.637, |
|
"mean_token_accuracy": 0.8008487203446606, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 2.9441361450240473, |
|
"grad_norm": 0.39223143458366394, |
|
"learning_rate": 1.9344028664056715e-08, |
|
"loss": 0.6031, |
|
"mean_token_accuracy": 0.8099199822731445, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 2.9470958194598595, |
|
"grad_norm": 0.40251073241233826, |
|
"learning_rate": 1.7254997101500137e-08, |
|
"loss": 0.62, |
|
"mean_token_accuracy": 0.8063151660778272, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 2.9500554938956713, |
|
"grad_norm": 0.39291098713874817, |
|
"learning_rate": 1.528520541731915e-08, |
|
"loss": 0.6079, |
|
"mean_token_accuracy": 0.8091930978495294, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 2.9530151683314836, |
|
"grad_norm": 0.3827592432498932, |
|
"learning_rate": 1.3434677139885222e-08, |
|
"loss": 0.6533, |
|
"mean_token_accuracy": 0.7946187338585824, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 2.9559748427672954, |
|
"grad_norm": 0.37337788939476013, |
|
"learning_rate": 1.170343437301491e-08, |
|
"loss": 0.646, |
|
"mean_token_accuracy": 0.7960541011916046, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 2.9589345172031076, |
|
"grad_norm": 0.39986652135849, |
|
"learning_rate": 1.0091497795706728e-08, |
|
"loss": 0.6393, |
|
"mean_token_accuracy": 0.7991646202099173, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 2.96189419163892, |
|
"grad_norm": 0.3993469774723053, |
|
"learning_rate": 8.59888666189579e-09, |
|
"loss": 0.6753, |
|
"mean_token_accuracy": 0.7898335911094181, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 2.9648538660747317, |
|
"grad_norm": 0.40232738852500916, |
|
"learning_rate": 7.225618800222878e-09, |
|
"loss": 0.6607, |
|
"mean_token_accuracy": 0.7922276751171351, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.9648538660747317, |
|
"eval_loss": 0.7369399070739746, |
|
"eval_mean_token_accuracy": 0.7721513551540902, |
|
"eval_runtime": 24.4981, |
|
"eval_samples_per_second": 5.266, |
|
"eval_steps_per_second": 1.347, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.967813540510544, |
|
"grad_norm": 0.3906041085720062, |
|
"learning_rate": 5.971710613821291e-09, |
|
"loss": 0.6733, |
|
"mean_token_accuracy": 0.7893573225919971, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 2.970773214946356, |
|
"grad_norm": 0.39397749304771423, |
|
"learning_rate": 4.837177080119215e-09, |
|
"loss": 0.6217, |
|
"mean_token_accuracy": 0.8034013413614648, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 2.973732889382168, |
|
"grad_norm": 0.3905346691608429, |
|
"learning_rate": 3.8220317506654226e-09, |
|
"loss": 0.6531, |
|
"mean_token_accuracy": 0.795008107180572, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 2.9766925638179798, |
|
"grad_norm": 0.3973424732685089, |
|
"learning_rate": 2.9262867509605164e-09, |
|
"loss": 0.6395, |
|
"mean_token_accuracy": 0.7991790842606037, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 2.979652238253792, |
|
"grad_norm": 0.3992668092250824, |
|
"learning_rate": 2.149952780321485e-09, |
|
"loss": 0.6643, |
|
"mean_token_accuracy": 0.7913003021486229, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 2.9826119126896042, |
|
"grad_norm": 0.4035053253173828, |
|
"learning_rate": 1.4930391117451427e-09, |
|
"loss": 0.6354, |
|
"mean_token_accuracy": 0.7998262221333795, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 2.985571587125416, |
|
"grad_norm": 0.4096769690513611, |
|
"learning_rate": 9.555535917993297e-10, |
|
"loss": 0.6961, |
|
"mean_token_accuracy": 0.7825063025724978, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 2.9885312615612283, |
|
"grad_norm": 0.41353654861450195, |
|
"learning_rate": 5.375026405352035e-10, |
|
"loss": 0.6249, |
|
"mean_token_accuracy": 0.8038925258126584, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 2.9914909359970405, |
|
"grad_norm": 0.3798801898956299, |
|
"learning_rate": 2.388912514017516e-10, |
|
"loss": 0.6626, |
|
"mean_token_accuracy": 0.793458732875717, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 2.9944506104328523, |
|
"grad_norm": 0.39510512351989746, |
|
"learning_rate": 5.972299119250124e-11, |
|
"loss": 0.5891, |
|
"mean_token_accuracy": 0.8136845518054924, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.9974102848686646, |
|
"grad_norm": 0.40476804971694946, |
|
"learning_rate": 0.0, |
|
"loss": 0.6628, |
|
"mean_token_accuracy": 0.7925300203396853, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 2.9974102848686646, |
|
"step": 1011, |
|
"total_flos": 230593791000576.0, |
|
"train_loss": 0.7078155129410982, |
|
"train_runtime": 41256.6124, |
|
"train_samples_per_second": 1.572, |
|
"train_steps_per_second": 0.025 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1011, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 230593791000576.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|