diff --git "a/checkpoint-910/trainer_state.json" "b/checkpoint-910/trainer_state.json" deleted file mode 100644--- "a/checkpoint-910/trainer_state.json" +++ /dev/null @@ -1,6435 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 228, - "global_step": 910, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001098901098901099, - "grad_norm": 0.42645424604415894, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.3083, - "step": 1 - }, - { - "epoch": 0.001098901098901099, - "eval_loss": 1.0894134044647217, - "eval_runtime": 283.6216, - "eval_samples_per_second": 9.562, - "eval_steps_per_second": 0.398, - "step": 1 - }, - { - "epoch": 0.002197802197802198, - "grad_norm": 0.4369363784790039, - "learning_rate": 4.000000000000001e-06, - "loss": 1.2765, - "step": 2 - }, - { - "epoch": 0.0032967032967032967, - "grad_norm": 0.43186184763908386, - "learning_rate": 6e-06, - "loss": 1.2816, - "step": 3 - }, - { - "epoch": 0.004395604395604396, - "grad_norm": 0.3838448226451874, - "learning_rate": 8.000000000000001e-06, - "loss": 1.1846, - "step": 4 - }, - { - "epoch": 0.005494505494505495, - "grad_norm": 0.4089745283126831, - "learning_rate": 1e-05, - "loss": 1.2675, - "step": 5 - }, - { - "epoch": 0.006593406593406593, - "grad_norm": 0.4257515072822571, - "learning_rate": 1.2e-05, - "loss": 1.3188, - "step": 6 - }, - { - "epoch": 0.007692307692307693, - "grad_norm": 0.4373401701450348, - "learning_rate": 1.4e-05, - "loss": 1.3062, - "step": 7 - }, - { - "epoch": 0.008791208791208791, - "grad_norm": 0.4150056838989258, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.29, - "step": 8 - }, - { - "epoch": 0.00989010989010989, - "grad_norm": 0.39960989356040955, - "learning_rate": 1.8e-05, - "loss": 1.249, - "step": 9 - }, - { - "epoch": 0.01098901098901099, - "grad_norm": 0.38233983516693115, - "learning_rate": 2e-05, - "loss": 1.2778, - "step": 10 - }, - { - "epoch": 0.012087912087912088, - "grad_norm": 0.36902564764022827, - "learning_rate": 1.9999939076577906e-05, - "loss": 1.2957, - "step": 11 - }, - { - "epoch": 0.013186813186813187, - "grad_norm": 0.3389724791049957, - "learning_rate": 1.9999756307053947e-05, - "loss": 1.2902, - "step": 12 - }, - { - "epoch": 0.014285714285714285, - "grad_norm": 0.3164038360118866, - "learning_rate": 1.9999451693655125e-05, - "loss": 1.2325, - "step": 13 - }, - { - "epoch": 0.015384615384615385, - "grad_norm": 0.2989498972892761, - "learning_rate": 1.9999025240093045e-05, - "loss": 1.2175, - "step": 14 - }, - { - "epoch": 0.016483516483516484, - "grad_norm": 0.3197138011455536, - "learning_rate": 1.9998476951563914e-05, - "loss": 1.2958, - "step": 15 - }, - { - "epoch": 0.017582417582417582, - "grad_norm": 0.3572219908237457, - "learning_rate": 1.9997806834748455e-05, - "loss": 1.2597, - "step": 16 - }, - { - "epoch": 0.01868131868131868, - "grad_norm": 0.33109545707702637, - "learning_rate": 1.9997014897811834e-05, - "loss": 1.2182, - "step": 17 - }, - { - "epoch": 0.01978021978021978, - "grad_norm": 0.334486722946167, - "learning_rate": 1.9996101150403543e-05, - "loss": 1.2381, - "step": 18 - }, - { - "epoch": 0.020879120879120878, - "grad_norm": 0.33181053400039673, - "learning_rate": 1.9995065603657317e-05, - "loss": 1.2364, - "step": 19 - }, - { - "epoch": 0.02197802197802198, - "grad_norm": 0.33900511264801025, - "learning_rate": 1.999390827019096e-05, - "loss": 1.2612, - "step": 20 - }, - { - "epoch": 0.023076923076923078, - "grad_norm": 0.3426145613193512, - "learning_rate": 1.999262916410621e-05, - "loss": 1.2357, - "step": 21 - }, - { - "epoch": 0.024175824175824177, - "grad_norm": 0.35063669085502625, - "learning_rate": 1.9991228300988586e-05, - "loss": 1.2117, - "step": 22 - }, - { - "epoch": 0.025274725274725275, - "grad_norm": 0.32434922456741333, - "learning_rate": 1.998970569790715e-05, - "loss": 1.2285, - "step": 23 - }, - { - "epoch": 0.026373626373626374, - "grad_norm": 0.2922945022583008, - "learning_rate": 1.9988061373414342e-05, - "loss": 1.1452, - "step": 24 - }, - { - "epoch": 0.027472527472527472, - "grad_norm": 0.28110432624816895, - "learning_rate": 1.9986295347545738e-05, - "loss": 1.2279, - "step": 25 - }, - { - "epoch": 0.02857142857142857, - "grad_norm": 0.29521507024765015, - "learning_rate": 1.9984407641819812e-05, - "loss": 1.2388, - "step": 26 - }, - { - "epoch": 0.02967032967032967, - "grad_norm": 0.274477481842041, - "learning_rate": 1.9982398279237657e-05, - "loss": 1.2378, - "step": 27 - }, - { - "epoch": 0.03076923076923077, - "grad_norm": 0.2722557783126831, - "learning_rate": 1.9980267284282718e-05, - "loss": 1.1851, - "step": 28 - }, - { - "epoch": 0.031868131868131866, - "grad_norm": 0.2933584153652191, - "learning_rate": 1.9978014682920503e-05, - "loss": 1.152, - "step": 29 - }, - { - "epoch": 0.03296703296703297, - "grad_norm": 0.32750824093818665, - "learning_rate": 1.9975640502598243e-05, - "loss": 1.1552, - "step": 30 - }, - { - "epoch": 0.03406593406593406, - "grad_norm": 0.30807626247406006, - "learning_rate": 1.997314477224458e-05, - "loss": 1.2089, - "step": 31 - }, - { - "epoch": 0.035164835164835165, - "grad_norm": 0.31829774379730225, - "learning_rate": 1.9970527522269204e-05, - "loss": 1.2341, - "step": 32 - }, - { - "epoch": 0.03626373626373627, - "grad_norm": 0.3305433690547943, - "learning_rate": 1.9967788784562474e-05, - "loss": 1.1446, - "step": 33 - }, - { - "epoch": 0.03736263736263736, - "grad_norm": 0.28519847989082336, - "learning_rate": 1.9964928592495046e-05, - "loss": 1.1992, - "step": 34 - }, - { - "epoch": 0.038461538461538464, - "grad_norm": 0.2888084948062897, - "learning_rate": 1.9961946980917457e-05, - "loss": 1.1718, - "step": 35 - }, - { - "epoch": 0.03956043956043956, - "grad_norm": 0.2943640351295471, - "learning_rate": 1.9958843986159705e-05, - "loss": 1.1716, - "step": 36 - }, - { - "epoch": 0.04065934065934066, - "grad_norm": 0.27622032165527344, - "learning_rate": 1.99556196460308e-05, - "loss": 1.2055, - "step": 37 - }, - { - "epoch": 0.041758241758241756, - "grad_norm": 0.26794302463531494, - "learning_rate": 1.9952273999818312e-05, - "loss": 1.1765, - "step": 38 - }, - { - "epoch": 0.04285714285714286, - "grad_norm": 0.2791333496570587, - "learning_rate": 1.9948807088287884e-05, - "loss": 1.1144, - "step": 39 - }, - { - "epoch": 0.04395604395604396, - "grad_norm": 0.28924229741096497, - "learning_rate": 1.9945218953682736e-05, - "loss": 1.1227, - "step": 40 - }, - { - "epoch": 0.045054945054945054, - "grad_norm": 0.28580647706985474, - "learning_rate": 1.9941509639723155e-05, - "loss": 1.1402, - "step": 41 - }, - { - "epoch": 0.046153846153846156, - "grad_norm": 0.3056754469871521, - "learning_rate": 1.9937679191605964e-05, - "loss": 1.1547, - "step": 42 - }, - { - "epoch": 0.04725274725274725, - "grad_norm": 0.2820611894130707, - "learning_rate": 1.9933727656003964e-05, - "loss": 1.1317, - "step": 43 - }, - { - "epoch": 0.04835164835164835, - "grad_norm": 0.30150943994522095, - "learning_rate": 1.992965508106537e-05, - "loss": 1.1348, - "step": 44 - }, - { - "epoch": 0.04945054945054945, - "grad_norm": 0.30614808201789856, - "learning_rate": 1.9925461516413224e-05, - "loss": 1.137, - "step": 45 - }, - { - "epoch": 0.05054945054945055, - "grad_norm": 0.3059665560722351, - "learning_rate": 1.9921147013144782e-05, - "loss": 1.0941, - "step": 46 - }, - { - "epoch": 0.051648351648351645, - "grad_norm": 0.3012695014476776, - "learning_rate": 1.9916711623830904e-05, - "loss": 1.0622, - "step": 47 - }, - { - "epoch": 0.05274725274725275, - "grad_norm": 0.32081174850463867, - "learning_rate": 1.991215540251542e-05, - "loss": 1.1224, - "step": 48 - }, - { - "epoch": 0.05384615384615385, - "grad_norm": 0.3136264383792877, - "learning_rate": 1.9907478404714438e-05, - "loss": 1.1101, - "step": 49 - }, - { - "epoch": 0.054945054945054944, - "grad_norm": 0.30289289355278015, - "learning_rate": 1.9902680687415704e-05, - "loss": 1.1115, - "step": 50 - }, - { - "epoch": 0.056043956043956046, - "grad_norm": 0.35216423869132996, - "learning_rate": 1.989776230907789e-05, - "loss": 1.0585, - "step": 51 - }, - { - "epoch": 0.05714285714285714, - "grad_norm": 0.3184622526168823, - "learning_rate": 1.9892723329629885e-05, - "loss": 1.0816, - "step": 52 - }, - { - "epoch": 0.05824175824175824, - "grad_norm": 0.3341265618801117, - "learning_rate": 1.988756381047006e-05, - "loss": 1.0791, - "step": 53 - }, - { - "epoch": 0.05934065934065934, - "grad_norm": 0.36139941215515137, - "learning_rate": 1.988228381446553e-05, - "loss": 1.1033, - "step": 54 - }, - { - "epoch": 0.06043956043956044, - "grad_norm": 0.31869763135910034, - "learning_rate": 1.9876883405951378e-05, - "loss": 1.0959, - "step": 55 - }, - { - "epoch": 0.06153846153846154, - "grad_norm": 0.32240596413612366, - "learning_rate": 1.987136265072988e-05, - "loss": 1.1023, - "step": 56 - }, - { - "epoch": 0.06263736263736264, - "grad_norm": 0.3256106674671173, - "learning_rate": 1.9865721616069695e-05, - "loss": 1.0756, - "step": 57 - }, - { - "epoch": 0.06373626373626373, - "grad_norm": 0.3005063235759735, - "learning_rate": 1.985996037070505e-05, - "loss": 0.9989, - "step": 58 - }, - { - "epoch": 0.06483516483516484, - "grad_norm": 0.33904412388801575, - "learning_rate": 1.9854078984834904e-05, - "loss": 1.0819, - "step": 59 - }, - { - "epoch": 0.06593406593406594, - "grad_norm": 0.3378660976886749, - "learning_rate": 1.9848077530122083e-05, - "loss": 1.0888, - "step": 60 - }, - { - "epoch": 0.06703296703296703, - "grad_norm": 0.3539678156375885, - "learning_rate": 1.984195607969242e-05, - "loss": 1.0618, - "step": 61 - }, - { - "epoch": 0.06813186813186813, - "grad_norm": 0.3725723624229431, - "learning_rate": 1.983571470813386e-05, - "loss": 1.0945, - "step": 62 - }, - { - "epoch": 0.06923076923076923, - "grad_norm": 0.3979873061180115, - "learning_rate": 1.9829353491495545e-05, - "loss": 1.0364, - "step": 63 - }, - { - "epoch": 0.07032967032967033, - "grad_norm": 0.36439770460128784, - "learning_rate": 1.982287250728689e-05, - "loss": 1.0642, - "step": 64 - }, - { - "epoch": 0.07142857142857142, - "grad_norm": 0.3580510914325714, - "learning_rate": 1.9816271834476642e-05, - "loss": 1.0792, - "step": 65 - }, - { - "epoch": 0.07252747252747253, - "grad_norm": 0.3706108629703522, - "learning_rate": 1.9809551553491918e-05, - "loss": 1.078, - "step": 66 - }, - { - "epoch": 0.07362637362637363, - "grad_norm": 0.364013135433197, - "learning_rate": 1.9802711746217222e-05, - "loss": 1.0565, - "step": 67 - }, - { - "epoch": 0.07472527472527472, - "grad_norm": 0.39967453479766846, - "learning_rate": 1.979575249599344e-05, - "loss": 1.0068, - "step": 68 - }, - { - "epoch": 0.07582417582417582, - "grad_norm": 0.3680606484413147, - "learning_rate": 1.9788673887616852e-05, - "loss": 1.0747, - "step": 69 - }, - { - "epoch": 0.07692307692307693, - "grad_norm": 0.3775380849838257, - "learning_rate": 1.9781476007338058e-05, - "loss": 1.1084, - "step": 70 - }, - { - "epoch": 0.07802197802197802, - "grad_norm": 0.41164255142211914, - "learning_rate": 1.9774158942860962e-05, - "loss": 1.0769, - "step": 71 - }, - { - "epoch": 0.07912087912087912, - "grad_norm": 0.36969923973083496, - "learning_rate": 1.9766722783341682e-05, - "loss": 1.0502, - "step": 72 - }, - { - "epoch": 0.08021978021978023, - "grad_norm": 0.37051713466644287, - "learning_rate": 1.9759167619387474e-05, - "loss": 1.0806, - "step": 73 - }, - { - "epoch": 0.08131868131868132, - "grad_norm": 0.40167927742004395, - "learning_rate": 1.9751493543055634e-05, - "loss": 1.0535, - "step": 74 - }, - { - "epoch": 0.08241758241758242, - "grad_norm": 0.394216924905777, - "learning_rate": 1.9743700647852356e-05, - "loss": 1.0497, - "step": 75 - }, - { - "epoch": 0.08351648351648351, - "grad_norm": 0.3862336277961731, - "learning_rate": 1.9735789028731603e-05, - "loss": 1.0626, - "step": 76 - }, - { - "epoch": 0.08461538461538462, - "grad_norm": 0.4052029550075531, - "learning_rate": 1.972775878209397e-05, - "loss": 1.0367, - "step": 77 - }, - { - "epoch": 0.08571428571428572, - "grad_norm": 0.44047772884368896, - "learning_rate": 1.9719610005785466e-05, - "loss": 1.0211, - "step": 78 - }, - { - "epoch": 0.08681318681318681, - "grad_norm": 0.4512207806110382, - "learning_rate": 1.971134279909636e-05, - "loss": 1.0168, - "step": 79 - }, - { - "epoch": 0.08791208791208792, - "grad_norm": 0.4005817174911499, - "learning_rate": 1.9702957262759964e-05, - "loss": 1.0443, - "step": 80 - }, - { - "epoch": 0.08901098901098901, - "grad_norm": 0.39364778995513916, - "learning_rate": 1.9694453498951392e-05, - "loss": 1.0841, - "step": 81 - }, - { - "epoch": 0.09010989010989011, - "grad_norm": 0.40496987104415894, - "learning_rate": 1.9685831611286312e-05, - "loss": 1.0578, - "step": 82 - }, - { - "epoch": 0.0912087912087912, - "grad_norm": 0.4622587561607361, - "learning_rate": 1.9677091704819714e-05, - "loss": 1.0132, - "step": 83 - }, - { - "epoch": 0.09230769230769231, - "grad_norm": 0.43763408064842224, - "learning_rate": 1.9668233886044597e-05, - "loss": 1.0306, - "step": 84 - }, - { - "epoch": 0.09340659340659341, - "grad_norm": 0.43677887320518494, - "learning_rate": 1.9659258262890683e-05, - "loss": 1.0524, - "step": 85 - }, - { - "epoch": 0.0945054945054945, - "grad_norm": 0.4443049728870392, - "learning_rate": 1.9650164944723116e-05, - "loss": 1.0536, - "step": 86 - }, - { - "epoch": 0.0956043956043956, - "grad_norm": 0.43217942118644714, - "learning_rate": 1.96409540423411e-05, - "loss": 0.9867, - "step": 87 - }, - { - "epoch": 0.0967032967032967, - "grad_norm": 0.44499069452285767, - "learning_rate": 1.9631625667976584e-05, - "loss": 1.0142, - "step": 88 - }, - { - "epoch": 0.0978021978021978, - "grad_norm": 0.43815678358078003, - "learning_rate": 1.9622179935292855e-05, - "loss": 1.0305, - "step": 89 - }, - { - "epoch": 0.0989010989010989, - "grad_norm": 0.4774799942970276, - "learning_rate": 1.961261695938319e-05, - "loss": 0.9952, - "step": 90 - }, - { - "epoch": 0.1, - "grad_norm": 0.4259910583496094, - "learning_rate": 1.9602936856769432e-05, - "loss": 1.0104, - "step": 91 - }, - { - "epoch": 0.1010989010989011, - "grad_norm": 0.429085910320282, - "learning_rate": 1.9593139745400575e-05, - "loss": 1.0197, - "step": 92 - }, - { - "epoch": 0.1021978021978022, - "grad_norm": 0.4863206744194031, - "learning_rate": 1.9583225744651334e-05, - "loss": 1.0462, - "step": 93 - }, - { - "epoch": 0.10329670329670329, - "grad_norm": 0.45531728863716125, - "learning_rate": 1.9573194975320672e-05, - "loss": 0.988, - "step": 94 - }, - { - "epoch": 0.1043956043956044, - "grad_norm": 0.46742838621139526, - "learning_rate": 1.9563047559630356e-05, - "loss": 0.9999, - "step": 95 - }, - { - "epoch": 0.1054945054945055, - "grad_norm": 0.46399447321891785, - "learning_rate": 1.9552783621223437e-05, - "loss": 1.0237, - "step": 96 - }, - { - "epoch": 0.10659340659340659, - "grad_norm": 0.4376146197319031, - "learning_rate": 1.954240328516277e-05, - "loss": 1.0493, - "step": 97 - }, - { - "epoch": 0.1076923076923077, - "grad_norm": 0.43588346242904663, - "learning_rate": 1.9531906677929472e-05, - "loss": 1.0471, - "step": 98 - }, - { - "epoch": 0.1087912087912088, - "grad_norm": 0.4873124361038208, - "learning_rate": 1.9521293927421388e-05, - "loss": 1.0196, - "step": 99 - }, - { - "epoch": 0.10989010989010989, - "grad_norm": 0.5051928162574768, - "learning_rate": 1.9510565162951538e-05, - "loss": 1.045, - "step": 100 - }, - { - "epoch": 0.11098901098901098, - "grad_norm": 0.49000078439712524, - "learning_rate": 1.9499720515246524e-05, - "loss": 1.0616, - "step": 101 - }, - { - "epoch": 0.11208791208791209, - "grad_norm": 0.411355584859848, - "learning_rate": 1.9488760116444966e-05, - "loss": 0.89, - "step": 102 - }, - { - "epoch": 0.11318681318681319, - "grad_norm": 0.43907496333122253, - "learning_rate": 1.947768410009586e-05, - "loss": 0.8889, - "step": 103 - }, - { - "epoch": 0.11428571428571428, - "grad_norm": 0.4558069109916687, - "learning_rate": 1.9466492601156964e-05, - "loss": 0.9908, - "step": 104 - }, - { - "epoch": 0.11538461538461539, - "grad_norm": 0.4685650169849396, - "learning_rate": 1.945518575599317e-05, - "loss": 1.0298, - "step": 105 - }, - { - "epoch": 0.11648351648351649, - "grad_norm": 0.4913391172885895, - "learning_rate": 1.944376370237481e-05, - "loss": 0.9965, - "step": 106 - }, - { - "epoch": 0.11758241758241758, - "grad_norm": 0.4104078710079193, - "learning_rate": 1.943222657947601e-05, - "loss": 0.891, - "step": 107 - }, - { - "epoch": 0.11868131868131868, - "grad_norm": 0.47581571340560913, - "learning_rate": 1.942057452787297e-05, - "loss": 0.9985, - "step": 108 - }, - { - "epoch": 0.11978021978021978, - "grad_norm": 0.5377973318099976, - "learning_rate": 1.9408807689542257e-05, - "loss": 1.0052, - "step": 109 - }, - { - "epoch": 0.12087912087912088, - "grad_norm": 0.4786320626735687, - "learning_rate": 1.9396926207859085e-05, - "loss": 0.9916, - "step": 110 - }, - { - "epoch": 0.12197802197802197, - "grad_norm": 0.5150383114814758, - "learning_rate": 1.938493022759556e-05, - "loss": 0.991, - "step": 111 - }, - { - "epoch": 0.12307692307692308, - "grad_norm": 0.472798615694046, - "learning_rate": 1.937281989491892e-05, - "loss": 1.0272, - "step": 112 - }, - { - "epoch": 0.12417582417582418, - "grad_norm": 0.4752338230609894, - "learning_rate": 1.9360595357389735e-05, - "loss": 1.0496, - "step": 113 - }, - { - "epoch": 0.12527472527472527, - "grad_norm": 0.5019408464431763, - "learning_rate": 1.9348256763960146e-05, - "loss": 1.0388, - "step": 114 - }, - { - "epoch": 0.12637362637362637, - "grad_norm": 0.49702244997024536, - "learning_rate": 1.9335804264972018e-05, - "loss": 1.0048, - "step": 115 - }, - { - "epoch": 0.12747252747252746, - "grad_norm": 0.45504292845726013, - "learning_rate": 1.9323238012155125e-05, - "loss": 1.0158, - "step": 116 - }, - { - "epoch": 0.12857142857142856, - "grad_norm": 0.5002123117446899, - "learning_rate": 1.9310558158625286e-05, - "loss": 0.9698, - "step": 117 - }, - { - "epoch": 0.12967032967032968, - "grad_norm": 0.5354875922203064, - "learning_rate": 1.9297764858882516e-05, - "loss": 0.9936, - "step": 118 - }, - { - "epoch": 0.13076923076923078, - "grad_norm": 0.5310660004615784, - "learning_rate": 1.9284858268809135e-05, - "loss": 1.0139, - "step": 119 - }, - { - "epoch": 0.13186813186813187, - "grad_norm": 0.5326129198074341, - "learning_rate": 1.9271838545667876e-05, - "loss": 0.9886, - "step": 120 - }, - { - "epoch": 0.13296703296703297, - "grad_norm": 0.5502979159355164, - "learning_rate": 1.925870584809995e-05, - "loss": 0.9583, - "step": 121 - }, - { - "epoch": 0.13406593406593406, - "grad_norm": 0.5252236127853394, - "learning_rate": 1.9245460336123136e-05, - "loss": 0.9548, - "step": 122 - }, - { - "epoch": 0.13516483516483516, - "grad_norm": 0.4942091107368469, - "learning_rate": 1.923210217112981e-05, - "loss": 1.0056, - "step": 123 - }, - { - "epoch": 0.13626373626373625, - "grad_norm": 0.633439838886261, - "learning_rate": 1.9218631515885007e-05, - "loss": 1.0051, - "step": 124 - }, - { - "epoch": 0.13736263736263737, - "grad_norm": 0.5528006553649902, - "learning_rate": 1.9205048534524405e-05, - "loss": 0.9659, - "step": 125 - }, - { - "epoch": 0.13846153846153847, - "grad_norm": 0.5047746896743774, - "learning_rate": 1.9191353392552346e-05, - "loss": 0.9843, - "step": 126 - }, - { - "epoch": 0.13956043956043956, - "grad_norm": 0.4919949471950531, - "learning_rate": 1.9177546256839814e-05, - "loss": 0.9927, - "step": 127 - }, - { - "epoch": 0.14065934065934066, - "grad_norm": 0.5230108499526978, - "learning_rate": 1.9163627295622397e-05, - "loss": 1.0347, - "step": 128 - }, - { - "epoch": 0.14175824175824175, - "grad_norm": 0.5235939025878906, - "learning_rate": 1.914959667849825e-05, - "loss": 0.9627, - "step": 129 - }, - { - "epoch": 0.14285714285714285, - "grad_norm": 0.49867233633995056, - "learning_rate": 1.913545457642601e-05, - "loss": 1.0347, - "step": 130 - }, - { - "epoch": 0.14395604395604394, - "grad_norm": 0.5188619494438171, - "learning_rate": 1.9121201161722732e-05, - "loss": 0.9868, - "step": 131 - }, - { - "epoch": 0.14505494505494507, - "grad_norm": 0.5248557329177856, - "learning_rate": 1.910683660806177e-05, - "loss": 0.9471, - "step": 132 - }, - { - "epoch": 0.14615384615384616, - "grad_norm": 0.48255398869514465, - "learning_rate": 1.9092361090470688e-05, - "loss": 0.8496, - "step": 133 - }, - { - "epoch": 0.14725274725274726, - "grad_norm": 0.5680371522903442, - "learning_rate": 1.907777478532909e-05, - "loss": 0.9893, - "step": 134 - }, - { - "epoch": 0.14835164835164835, - "grad_norm": 0.5774680376052856, - "learning_rate": 1.9063077870366504e-05, - "loss": 0.9506, - "step": 135 - }, - { - "epoch": 0.14945054945054945, - "grad_norm": 0.5179923176765442, - "learning_rate": 1.9048270524660197e-05, - "loss": 0.9617, - "step": 136 - }, - { - "epoch": 0.15054945054945054, - "grad_norm": 0.5153389573097229, - "learning_rate": 1.903335292863301e-05, - "loss": 0.9935, - "step": 137 - }, - { - "epoch": 0.15164835164835164, - "grad_norm": 0.5755510926246643, - "learning_rate": 1.901832526405114e-05, - "loss": 0.9414, - "step": 138 - }, - { - "epoch": 0.15274725274725276, - "grad_norm": 0.5388807058334351, - "learning_rate": 1.9003187714021936e-05, - "loss": 1.0488, - "step": 139 - }, - { - "epoch": 0.15384615384615385, - "grad_norm": 0.550377368927002, - "learning_rate": 1.8987940462991673e-05, - "loss": 1.0443, - "step": 140 - }, - { - "epoch": 0.15494505494505495, - "grad_norm": 0.5450628995895386, - "learning_rate": 1.8972583696743284e-05, - "loss": 0.9876, - "step": 141 - }, - { - "epoch": 0.15604395604395604, - "grad_norm": 0.4906370937824249, - "learning_rate": 1.895711760239413e-05, - "loss": 0.9768, - "step": 142 - }, - { - "epoch": 0.15714285714285714, - "grad_norm": 0.5642873048782349, - "learning_rate": 1.8941542368393683e-05, - "loss": 1.0104, - "step": 143 - }, - { - "epoch": 0.15824175824175823, - "grad_norm": 0.5400336980819702, - "learning_rate": 1.892585818452126e-05, - "loss": 0.9499, - "step": 144 - }, - { - "epoch": 0.15934065934065933, - "grad_norm": 0.5868093371391296, - "learning_rate": 1.891006524188368e-05, - "loss": 0.9893, - "step": 145 - }, - { - "epoch": 0.16043956043956045, - "grad_norm": 0.584716796875, - "learning_rate": 1.889416373291298e-05, - "loss": 0.9283, - "step": 146 - }, - { - "epoch": 0.16153846153846155, - "grad_norm": 0.559728741645813, - "learning_rate": 1.8878153851364013e-05, - "loss": 0.9623, - "step": 147 - }, - { - "epoch": 0.16263736263736264, - "grad_norm": 0.5793801546096802, - "learning_rate": 1.8862035792312148e-05, - "loss": 0.9891, - "step": 148 - }, - { - "epoch": 0.16373626373626374, - "grad_norm": 0.6070756316184998, - "learning_rate": 1.884580975215084e-05, - "loss": 1.0026, - "step": 149 - }, - { - "epoch": 0.16483516483516483, - "grad_norm": 0.6226385831832886, - "learning_rate": 1.8829475928589272e-05, - "loss": 0.9488, - "step": 150 - }, - { - "epoch": 0.16593406593406593, - "grad_norm": 0.5946024060249329, - "learning_rate": 1.8813034520649923e-05, - "loss": 0.9564, - "step": 151 - }, - { - "epoch": 0.16703296703296702, - "grad_norm": 0.5849313735961914, - "learning_rate": 1.879648572866617e-05, - "loss": 0.9135, - "step": 152 - }, - { - "epoch": 0.16813186813186815, - "grad_norm": 0.6235004663467407, - "learning_rate": 1.8779829754279806e-05, - "loss": 0.9538, - "step": 153 - }, - { - "epoch": 0.16923076923076924, - "grad_norm": 0.6550187468528748, - "learning_rate": 1.8763066800438638e-05, - "loss": 0.9769, - "step": 154 - }, - { - "epoch": 0.17032967032967034, - "grad_norm": 0.5428783893585205, - "learning_rate": 1.874619707139396e-05, - "loss": 1.0058, - "step": 155 - }, - { - "epoch": 0.17142857142857143, - "grad_norm": 0.570265531539917, - "learning_rate": 1.8729220772698096e-05, - "loss": 0.9978, - "step": 156 - }, - { - "epoch": 0.17252747252747253, - "grad_norm": 0.6183134913444519, - "learning_rate": 1.8712138111201898e-05, - "loss": 0.9659, - "step": 157 - }, - { - "epoch": 0.17362637362637362, - "grad_norm": 0.5421972870826721, - "learning_rate": 1.869494929505219e-05, - "loss": 0.9601, - "step": 158 - }, - { - "epoch": 0.17472527472527472, - "grad_norm": 0.5749966502189636, - "learning_rate": 1.8677654533689287e-05, - "loss": 0.9223, - "step": 159 - }, - { - "epoch": 0.17582417582417584, - "grad_norm": 0.5819816589355469, - "learning_rate": 1.866025403784439e-05, - "loss": 0.9803, - "step": 160 - }, - { - "epoch": 0.17692307692307693, - "grad_norm": 0.6133263111114502, - "learning_rate": 1.864274801953705e-05, - "loss": 0.9234, - "step": 161 - }, - { - "epoch": 0.17802197802197803, - "grad_norm": 0.6091593503952026, - "learning_rate": 1.8625136692072577e-05, - "loss": 0.8942, - "step": 162 - }, - { - "epoch": 0.17912087912087912, - "grad_norm": 0.6196579337120056, - "learning_rate": 1.860742027003944e-05, - "loss": 0.9938, - "step": 163 - }, - { - "epoch": 0.18021978021978022, - "grad_norm": 0.5519505143165588, - "learning_rate": 1.8589598969306646e-05, - "loss": 1.0015, - "step": 164 - }, - { - "epoch": 0.1813186813186813, - "grad_norm": 0.5841838121414185, - "learning_rate": 1.8571673007021124e-05, - "loss": 0.9244, - "step": 165 - }, - { - "epoch": 0.1824175824175824, - "grad_norm": 0.6472905874252319, - "learning_rate": 1.855364260160507e-05, - "loss": 0.9498, - "step": 166 - }, - { - "epoch": 0.1835164835164835, - "grad_norm": 0.578123927116394, - "learning_rate": 1.8535507972753275e-05, - "loss": 0.9652, - "step": 167 - }, - { - "epoch": 0.18461538461538463, - "grad_norm": 0.6499450206756592, - "learning_rate": 1.851726934143048e-05, - "loss": 0.9678, - "step": 168 - }, - { - "epoch": 0.18571428571428572, - "grad_norm": 0.6290732026100159, - "learning_rate": 1.849892692986864e-05, - "loss": 0.9605, - "step": 169 - }, - { - "epoch": 0.18681318681318682, - "grad_norm": 0.6311396956443787, - "learning_rate": 1.848048096156426e-05, - "loss": 0.9531, - "step": 170 - }, - { - "epoch": 0.1879120879120879, - "grad_norm": 0.5765597224235535, - "learning_rate": 1.8461931661275642e-05, - "loss": 0.9597, - "step": 171 - }, - { - "epoch": 0.189010989010989, - "grad_norm": 0.5811415910720825, - "learning_rate": 1.8443279255020153e-05, - "loss": 1.0034, - "step": 172 - }, - { - "epoch": 0.1901098901098901, - "grad_norm": 0.6414772272109985, - "learning_rate": 1.842452397007148e-05, - "loss": 0.9824, - "step": 173 - }, - { - "epoch": 0.1912087912087912, - "grad_norm": 0.6162213087081909, - "learning_rate": 1.8405666034956842e-05, - "loss": 0.9294, - "step": 174 - }, - { - "epoch": 0.19230769230769232, - "grad_norm": 0.5893775224685669, - "learning_rate": 1.8386705679454243e-05, - "loss": 0.9288, - "step": 175 - }, - { - "epoch": 0.1934065934065934, - "grad_norm": 0.6348901987075806, - "learning_rate": 1.836764313458962e-05, - "loss": 0.9014, - "step": 176 - }, - { - "epoch": 0.1945054945054945, - "grad_norm": 0.6220593452453613, - "learning_rate": 1.8348478632634067e-05, - "loss": 0.9913, - "step": 177 - }, - { - "epoch": 0.1956043956043956, - "grad_norm": 0.6383747458457947, - "learning_rate": 1.8329212407100996e-05, - "loss": 0.9926, - "step": 178 - }, - { - "epoch": 0.1967032967032967, - "grad_norm": 0.5910571217536926, - "learning_rate": 1.8309844692743283e-05, - "loss": 0.9332, - "step": 179 - }, - { - "epoch": 0.1978021978021978, - "grad_norm": 0.6351461410522461, - "learning_rate": 1.8290375725550417e-05, - "loss": 0.9827, - "step": 180 - }, - { - "epoch": 0.1989010989010989, - "grad_norm": 0.6192371249198914, - "learning_rate": 1.827080574274562e-05, - "loss": 0.8932, - "step": 181 - }, - { - "epoch": 0.2, - "grad_norm": 0.626589834690094, - "learning_rate": 1.8251134982782952e-05, - "loss": 0.9388, - "step": 182 - }, - { - "epoch": 0.2010989010989011, - "grad_norm": 0.5806135535240173, - "learning_rate": 1.8231363685344422e-05, - "loss": 0.9267, - "step": 183 - }, - { - "epoch": 0.2021978021978022, - "grad_norm": 0.5838484168052673, - "learning_rate": 1.821149209133704e-05, - "loss": 0.9271, - "step": 184 - }, - { - "epoch": 0.2032967032967033, - "grad_norm": 0.5986520051956177, - "learning_rate": 1.819152044288992e-05, - "loss": 0.9396, - "step": 185 - }, - { - "epoch": 0.2043956043956044, - "grad_norm": 0.6489253044128418, - "learning_rate": 1.8171448983351284e-05, - "loss": 0.9285, - "step": 186 - }, - { - "epoch": 0.20549450549450549, - "grad_norm": 0.6240598559379578, - "learning_rate": 1.815127795728554e-05, - "loss": 0.9845, - "step": 187 - }, - { - "epoch": 0.20659340659340658, - "grad_norm": 0.5978933572769165, - "learning_rate": 1.8131007610470278e-05, - "loss": 0.9704, - "step": 188 - }, - { - "epoch": 0.2076923076923077, - "grad_norm": 0.6370437741279602, - "learning_rate": 1.8110638189893267e-05, - "loss": 0.9134, - "step": 189 - }, - { - "epoch": 0.2087912087912088, - "grad_norm": 0.5889105796813965, - "learning_rate": 1.8090169943749477e-05, - "loss": 0.949, - "step": 190 - }, - { - "epoch": 0.2098901098901099, - "grad_norm": 0.5496615767478943, - "learning_rate": 1.806960312143802e-05, - "loss": 0.9591, - "step": 191 - }, - { - "epoch": 0.210989010989011, - "grad_norm": 0.5912355184555054, - "learning_rate": 1.804893797355914e-05, - "loss": 0.9557, - "step": 192 - }, - { - "epoch": 0.21208791208791208, - "grad_norm": 0.6098259687423706, - "learning_rate": 1.8028174751911147e-05, - "loss": 0.9407, - "step": 193 - }, - { - "epoch": 0.21318681318681318, - "grad_norm": 0.6522508263587952, - "learning_rate": 1.8007313709487334e-05, - "loss": 0.9252, - "step": 194 - }, - { - "epoch": 0.21428571428571427, - "grad_norm": 0.6187021732330322, - "learning_rate": 1.798635510047293e-05, - "loss": 0.9303, - "step": 195 - }, - { - "epoch": 0.2153846153846154, - "grad_norm": 0.6677089333534241, - "learning_rate": 1.7965299180241963e-05, - "loss": 0.9353, - "step": 196 - }, - { - "epoch": 0.2164835164835165, - "grad_norm": 0.6682733297348022, - "learning_rate": 1.7944146205354182e-05, - "loss": 0.9363, - "step": 197 - }, - { - "epoch": 0.2175824175824176, - "grad_norm": 0.618269145488739, - "learning_rate": 1.792289643355191e-05, - "loss": 0.9135, - "step": 198 - }, - { - "epoch": 0.21868131868131868, - "grad_norm": 0.6224507093429565, - "learning_rate": 1.7901550123756906e-05, - "loss": 0.9368, - "step": 199 - }, - { - "epoch": 0.21978021978021978, - "grad_norm": 0.6227851510047913, - "learning_rate": 1.788010753606722e-05, - "loss": 0.957, - "step": 200 - }, - { - "epoch": 0.22087912087912087, - "grad_norm": 0.5879402160644531, - "learning_rate": 1.785856893175402e-05, - "loss": 0.9418, - "step": 201 - }, - { - "epoch": 0.22197802197802197, - "grad_norm": 0.6332261562347412, - "learning_rate": 1.78369345732584e-05, - "loss": 0.9833, - "step": 202 - }, - { - "epoch": 0.2230769230769231, - "grad_norm": 0.5709935426712036, - "learning_rate": 1.781520472418819e-05, - "loss": 0.7654, - "step": 203 - }, - { - "epoch": 0.22417582417582418, - "grad_norm": 0.6371233463287354, - "learning_rate": 1.7793379649314743e-05, - "loss": 0.9479, - "step": 204 - }, - { - "epoch": 0.22527472527472528, - "grad_norm": 0.643076479434967, - "learning_rate": 1.777145961456971e-05, - "loss": 0.9826, - "step": 205 - }, - { - "epoch": 0.22637362637362637, - "grad_norm": 0.5782551169395447, - "learning_rate": 1.7749444887041797e-05, - "loss": 0.9684, - "step": 206 - }, - { - "epoch": 0.22747252747252747, - "grad_norm": 0.6335501670837402, - "learning_rate": 1.7727335734973512e-05, - "loss": 0.9305, - "step": 207 - }, - { - "epoch": 0.22857142857142856, - "grad_norm": 0.6813045144081116, - "learning_rate": 1.7705132427757895e-05, - "loss": 0.9179, - "step": 208 - }, - { - "epoch": 0.22967032967032966, - "grad_norm": 0.6886667013168335, - "learning_rate": 1.7682835235935236e-05, - "loss": 0.931, - "step": 209 - }, - { - "epoch": 0.23076923076923078, - "grad_norm": 0.6036134958267212, - "learning_rate": 1.766044443118978e-05, - "loss": 0.9882, - "step": 210 - }, - { - "epoch": 0.23186813186813188, - "grad_norm": 0.6375074982643127, - "learning_rate": 1.7637960286346423e-05, - "loss": 0.9699, - "step": 211 - }, - { - "epoch": 0.23296703296703297, - "grad_norm": 0.724577784538269, - "learning_rate": 1.761538307536737e-05, - "loss": 0.8797, - "step": 212 - }, - { - "epoch": 0.23406593406593407, - "grad_norm": 0.6929182410240173, - "learning_rate": 1.759271307334881e-05, - "loss": 0.9756, - "step": 213 - }, - { - "epoch": 0.23516483516483516, - "grad_norm": 0.6330498456954956, - "learning_rate": 1.7569950556517566e-05, - "loss": 0.9168, - "step": 214 - }, - { - "epoch": 0.23626373626373626, - "grad_norm": 0.6258252263069153, - "learning_rate": 1.7547095802227723e-05, - "loss": 0.9437, - "step": 215 - }, - { - "epoch": 0.23736263736263735, - "grad_norm": 0.6916672587394714, - "learning_rate": 1.7524149088957244e-05, - "loss": 0.964, - "step": 216 - }, - { - "epoch": 0.23846153846153847, - "grad_norm": 0.6291719675064087, - "learning_rate": 1.7501110696304598e-05, - "loss": 0.9867, - "step": 217 - }, - { - "epoch": 0.23956043956043957, - "grad_norm": 0.6742487549781799, - "learning_rate": 1.747798090498532e-05, - "loss": 0.9345, - "step": 218 - }, - { - "epoch": 0.24065934065934066, - "grad_norm": 0.623616635799408, - "learning_rate": 1.7454759996828622e-05, - "loss": 0.9551, - "step": 219 - }, - { - "epoch": 0.24175824175824176, - "grad_norm": 0.6171446442604065, - "learning_rate": 1.7431448254773943e-05, - "loss": 0.9118, - "step": 220 - }, - { - "epoch": 0.24285714285714285, - "grad_norm": 0.6596176624298096, - "learning_rate": 1.74080459628675e-05, - "loss": 0.9475, - "step": 221 - }, - { - "epoch": 0.24395604395604395, - "grad_norm": 0.660784900188446, - "learning_rate": 1.7384553406258842e-05, - "loss": 0.9606, - "step": 222 - }, - { - "epoch": 0.24505494505494504, - "grad_norm": 0.6923850774765015, - "learning_rate": 1.7360970871197347e-05, - "loss": 0.9203, - "step": 223 - }, - { - "epoch": 0.24615384615384617, - "grad_norm": 0.8632559180259705, - "learning_rate": 1.7337298645028764e-05, - "loss": 0.8893, - "step": 224 - }, - { - "epoch": 0.24725274725274726, - "grad_norm": 0.6309676170349121, - "learning_rate": 1.7313537016191706e-05, - "loss": 1.0045, - "step": 225 - }, - { - "epoch": 0.24835164835164836, - "grad_norm": 0.7040322422981262, - "learning_rate": 1.7289686274214116e-05, - "loss": 0.951, - "step": 226 - }, - { - "epoch": 0.24945054945054945, - "grad_norm": 0.7316541075706482, - "learning_rate": 1.7265746709709762e-05, - "loss": 0.9005, - "step": 227 - }, - { - "epoch": 0.25054945054945055, - "grad_norm": 0.6595112085342407, - "learning_rate": 1.7241718614374678e-05, - "loss": 0.9442, - "step": 228 - }, - { - "epoch": 0.25054945054945055, - "eval_loss": 0.8244916200637817, - "eval_runtime": 284.1006, - "eval_samples_per_second": 9.546, - "eval_steps_per_second": 0.398, - "step": 228 - }, - { - "epoch": 0.25164835164835164, - "grad_norm": 0.8036085963249207, - "learning_rate": 1.7217602280983622e-05, - "loss": 0.8775, - "step": 229 - }, - { - "epoch": 0.25274725274725274, - "grad_norm": 0.6445724964141846, - "learning_rate": 1.7193398003386514e-05, - "loss": 0.9707, - "step": 230 - }, - { - "epoch": 0.25384615384615383, - "grad_norm": 0.6673330664634705, - "learning_rate": 1.716910607650483e-05, - "loss": 0.9608, - "step": 231 - }, - { - "epoch": 0.2549450549450549, - "grad_norm": 0.6371216773986816, - "learning_rate": 1.7144726796328034e-05, - "loss": 0.9322, - "step": 232 - }, - { - "epoch": 0.256043956043956, - "grad_norm": 0.6947388648986816, - "learning_rate": 1.712026045990997e-05, - "loss": 0.9575, - "step": 233 - }, - { - "epoch": 0.2571428571428571, - "grad_norm": 0.6566380858421326, - "learning_rate": 1.709570736536521e-05, - "loss": 0.9084, - "step": 234 - }, - { - "epoch": 0.25824175824175827, - "grad_norm": 0.6687067151069641, - "learning_rate": 1.7071067811865477e-05, - "loss": 0.9236, - "step": 235 - }, - { - "epoch": 0.25934065934065936, - "grad_norm": 0.5979390144348145, - "learning_rate": 1.7046342099635948e-05, - "loss": 0.9771, - "step": 236 - }, - { - "epoch": 0.26043956043956046, - "grad_norm": 0.6617268323898315, - "learning_rate": 1.7021530529951627e-05, - "loss": 0.9832, - "step": 237 - }, - { - "epoch": 0.26153846153846155, - "grad_norm": 0.6460007429122925, - "learning_rate": 1.6996633405133656e-05, - "loss": 0.9707, - "step": 238 - }, - { - "epoch": 0.26263736263736265, - "grad_norm": 0.6502106785774231, - "learning_rate": 1.697165102854565e-05, - "loss": 0.9475, - "step": 239 - }, - { - "epoch": 0.26373626373626374, - "grad_norm": 0.6964218020439148, - "learning_rate": 1.6946583704589973e-05, - "loss": 0.9176, - "step": 240 - }, - { - "epoch": 0.26483516483516484, - "grad_norm": 0.6720311045646667, - "learning_rate": 1.692143173870407e-05, - "loss": 0.9396, - "step": 241 - }, - { - "epoch": 0.26593406593406593, - "grad_norm": 0.6410806775093079, - "learning_rate": 1.68961954373567e-05, - "loss": 0.9698, - "step": 242 - }, - { - "epoch": 0.26703296703296703, - "grad_norm": 0.7727224826812744, - "learning_rate": 1.6870875108044233e-05, - "loss": 0.9076, - "step": 243 - }, - { - "epoch": 0.2681318681318681, - "grad_norm": 0.763213038444519, - "learning_rate": 1.684547105928689e-05, - "loss": 0.9597, - "step": 244 - }, - { - "epoch": 0.2692307692307692, - "grad_norm": 0.6705512404441833, - "learning_rate": 1.6819983600624986e-05, - "loss": 0.9636, - "step": 245 - }, - { - "epoch": 0.2703296703296703, - "grad_norm": 0.6501322984695435, - "learning_rate": 1.6794413042615168e-05, - "loss": 0.9095, - "step": 246 - }, - { - "epoch": 0.2714285714285714, - "grad_norm": 0.654255211353302, - "learning_rate": 1.6768759696826608e-05, - "loss": 0.9606, - "step": 247 - }, - { - "epoch": 0.2725274725274725, - "grad_norm": 0.6866080164909363, - "learning_rate": 1.6743023875837233e-05, - "loss": 0.9178, - "step": 248 - }, - { - "epoch": 0.27362637362637365, - "grad_norm": 0.7246939539909363, - "learning_rate": 1.6717205893229904e-05, - "loss": 0.9467, - "step": 249 - }, - { - "epoch": 0.27472527472527475, - "grad_norm": 0.7138158082962036, - "learning_rate": 1.6691306063588583e-05, - "loss": 0.9452, - "step": 250 - }, - { - "epoch": 0.27582417582417584, - "grad_norm": 0.6748147010803223, - "learning_rate": 1.6665324702494524e-05, - "loss": 0.9478, - "step": 251 - }, - { - "epoch": 0.27692307692307694, - "grad_norm": 0.6949153542518616, - "learning_rate": 1.6639262126522417e-05, - "loss": 0.9742, - "step": 252 - }, - { - "epoch": 0.27802197802197803, - "grad_norm": 0.7192912101745605, - "learning_rate": 1.661311865323652e-05, - "loss": 0.9609, - "step": 253 - }, - { - "epoch": 0.27912087912087913, - "grad_norm": 0.7274371981620789, - "learning_rate": 1.6586894601186804e-05, - "loss": 0.9446, - "step": 254 - }, - { - "epoch": 0.2802197802197802, - "grad_norm": 0.733801007270813, - "learning_rate": 1.6560590289905074e-05, - "loss": 0.9196, - "step": 255 - }, - { - "epoch": 0.2813186813186813, - "grad_norm": 0.6870750784873962, - "learning_rate": 1.6534206039901057e-05, - "loss": 0.9249, - "step": 256 - }, - { - "epoch": 0.2824175824175824, - "grad_norm": 0.7604805827140808, - "learning_rate": 1.650774217265851e-05, - "loss": 0.9195, - "step": 257 - }, - { - "epoch": 0.2835164835164835, - "grad_norm": 0.6549686789512634, - "learning_rate": 1.6481199010631312e-05, - "loss": 0.9371, - "step": 258 - }, - { - "epoch": 0.2846153846153846, - "grad_norm": 0.6888175010681152, - "learning_rate": 1.645457687723951e-05, - "loss": 0.9237, - "step": 259 - }, - { - "epoch": 0.2857142857142857, - "grad_norm": 0.687646210193634, - "learning_rate": 1.6427876096865394e-05, - "loss": 0.9565, - "step": 260 - }, - { - "epoch": 0.2868131868131868, - "grad_norm": 0.6868332028388977, - "learning_rate": 1.6401096994849558e-05, - "loss": 0.9543, - "step": 261 - }, - { - "epoch": 0.2879120879120879, - "grad_norm": 0.6656764149665833, - "learning_rate": 1.63742398974869e-05, - "loss": 0.9562, - "step": 262 - }, - { - "epoch": 0.289010989010989, - "grad_norm": 0.6949250102043152, - "learning_rate": 1.6347305132022677e-05, - "loss": 0.9271, - "step": 263 - }, - { - "epoch": 0.29010989010989013, - "grad_norm": 0.7068068385124207, - "learning_rate": 1.632029302664851e-05, - "loss": 0.9356, - "step": 264 - }, - { - "epoch": 0.29120879120879123, - "grad_norm": 0.68993079662323, - "learning_rate": 1.6293203910498375e-05, - "loss": 0.9659, - "step": 265 - }, - { - "epoch": 0.2923076923076923, - "grad_norm": 0.7640364170074463, - "learning_rate": 1.6266038113644605e-05, - "loss": 0.9052, - "step": 266 - }, - { - "epoch": 0.2934065934065934, - "grad_norm": 0.7439724802970886, - "learning_rate": 1.6238795967093865e-05, - "loss": 0.9346, - "step": 267 - }, - { - "epoch": 0.2945054945054945, - "grad_norm": 0.6450992226600647, - "learning_rate": 1.6211477802783105e-05, - "loss": 0.9342, - "step": 268 - }, - { - "epoch": 0.2956043956043956, - "grad_norm": 0.7285473942756653, - "learning_rate": 1.6184083953575543e-05, - "loss": 0.8978, - "step": 269 - }, - { - "epoch": 0.2967032967032967, - "grad_norm": 0.7374494075775146, - "learning_rate": 1.6156614753256583e-05, - "loss": 0.9231, - "step": 270 - }, - { - "epoch": 0.2978021978021978, - "grad_norm": 0.7315150499343872, - "learning_rate": 1.6129070536529767e-05, - "loss": 0.8836, - "step": 271 - }, - { - "epoch": 0.2989010989010989, - "grad_norm": 0.6788934469223022, - "learning_rate": 1.610145163901268e-05, - "loss": 0.9363, - "step": 272 - }, - { - "epoch": 0.3, - "grad_norm": 0.6707409620285034, - "learning_rate": 1.607375839723287e-05, - "loss": 0.9258, - "step": 273 - }, - { - "epoch": 0.3010989010989011, - "grad_norm": 0.6961421370506287, - "learning_rate": 1.6045991148623752e-05, - "loss": 0.921, - "step": 274 - }, - { - "epoch": 0.3021978021978022, - "grad_norm": 0.7395579218864441, - "learning_rate": 1.6018150231520486e-05, - "loss": 0.9493, - "step": 275 - }, - { - "epoch": 0.3032967032967033, - "grad_norm": 0.7345437407493591, - "learning_rate": 1.599023598515586e-05, - "loss": 0.9246, - "step": 276 - }, - { - "epoch": 0.30439560439560437, - "grad_norm": 0.7264265418052673, - "learning_rate": 1.5962248749656158e-05, - "loss": 0.9276, - "step": 277 - }, - { - "epoch": 0.3054945054945055, - "grad_norm": 0.6799497604370117, - "learning_rate": 1.5934188866037017e-05, - "loss": 0.9005, - "step": 278 - }, - { - "epoch": 0.3065934065934066, - "grad_norm": 0.7213189005851746, - "learning_rate": 1.5906056676199256e-05, - "loss": 0.9007, - "step": 279 - }, - { - "epoch": 0.3076923076923077, - "grad_norm": 0.7792261838912964, - "learning_rate": 1.5877852522924733e-05, - "loss": 0.917, - "step": 280 - }, - { - "epoch": 0.3087912087912088, - "grad_norm": 0.7763038277626038, - "learning_rate": 1.584957674987216e-05, - "loss": 0.967, - "step": 281 - }, - { - "epoch": 0.3098901098901099, - "grad_norm": 0.7015681266784668, - "learning_rate": 1.5821229701572897e-05, - "loss": 0.9432, - "step": 282 - }, - { - "epoch": 0.310989010989011, - "grad_norm": 0.7468630075454712, - "learning_rate": 1.5792811723426787e-05, - "loss": 0.9066, - "step": 283 - }, - { - "epoch": 0.3120879120879121, - "grad_norm": 0.7905395030975342, - "learning_rate": 1.5764323161697933e-05, - "loss": 0.9597, - "step": 284 - }, - { - "epoch": 0.3131868131868132, - "grad_norm": 0.7143304347991943, - "learning_rate": 1.573576436351046e-05, - "loss": 0.9397, - "step": 285 - }, - { - "epoch": 0.3142857142857143, - "grad_norm": 0.7128000855445862, - "learning_rate": 1.570713567684432e-05, - "loss": 0.9388, - "step": 286 - }, - { - "epoch": 0.3153846153846154, - "grad_norm": 0.7205612659454346, - "learning_rate": 1.5678437450531014e-05, - "loss": 0.8793, - "step": 287 - }, - { - "epoch": 0.31648351648351647, - "grad_norm": 0.7753700017929077, - "learning_rate": 1.564967003424938e-05, - "loss": 0.9195, - "step": 288 - }, - { - "epoch": 0.31758241758241756, - "grad_norm": 0.7840977311134338, - "learning_rate": 1.5620833778521306e-05, - "loss": 0.9792, - "step": 289 - }, - { - "epoch": 0.31868131868131866, - "grad_norm": 0.7442571520805359, - "learning_rate": 1.5591929034707468e-05, - "loss": 0.8762, - "step": 290 - }, - { - "epoch": 0.31978021978021975, - "grad_norm": 0.7129194736480713, - "learning_rate": 1.556295615500305e-05, - "loss": 0.9454, - "step": 291 - }, - { - "epoch": 0.3208791208791209, - "grad_norm": 0.7198705077171326, - "learning_rate": 1.553391549243344e-05, - "loss": 0.9382, - "step": 292 - }, - { - "epoch": 0.321978021978022, - "grad_norm": 0.6637123823165894, - "learning_rate": 1.5504807400849957e-05, - "loss": 0.9314, - "step": 293 - }, - { - "epoch": 0.3230769230769231, - "grad_norm": 0.7053823471069336, - "learning_rate": 1.5475632234925505e-05, - "loss": 0.9054, - "step": 294 - }, - { - "epoch": 0.3241758241758242, - "grad_norm": 0.7007255554199219, - "learning_rate": 1.5446390350150272e-05, - "loss": 0.9586, - "step": 295 - }, - { - "epoch": 0.3252747252747253, - "grad_norm": 0.7148289084434509, - "learning_rate": 1.54170821028274e-05, - "loss": 0.9343, - "step": 296 - }, - { - "epoch": 0.3263736263736264, - "grad_norm": 0.7231755256652832, - "learning_rate": 1.5387707850068633e-05, - "loss": 0.8966, - "step": 297 - }, - { - "epoch": 0.3274725274725275, - "grad_norm": 0.7579424977302551, - "learning_rate": 1.5358267949789968e-05, - "loss": 0.8788, - "step": 298 - }, - { - "epoch": 0.32857142857142857, - "grad_norm": 0.7326198220252991, - "learning_rate": 1.53287627607073e-05, - "loss": 0.8949, - "step": 299 - }, - { - "epoch": 0.32967032967032966, - "grad_norm": 0.7709233164787292, - "learning_rate": 1.529919264233205e-05, - "loss": 0.9057, - "step": 300 - }, - { - "epoch": 0.33076923076923076, - "grad_norm": 0.8190059065818787, - "learning_rate": 1.5269557954966777e-05, - "loss": 0.9348, - "step": 301 - }, - { - "epoch": 0.33186813186813185, - "grad_norm": 0.7548313140869141, - "learning_rate": 1.5239859059700794e-05, - "loss": 0.9258, - "step": 302 - }, - { - "epoch": 0.33296703296703295, - "grad_norm": 0.7792167067527771, - "learning_rate": 1.5210096318405768e-05, - "loss": 0.9237, - "step": 303 - }, - { - "epoch": 0.33406593406593404, - "grad_norm": 0.817673921585083, - "learning_rate": 1.5180270093731305e-05, - "loss": 0.9031, - "step": 304 - }, - { - "epoch": 0.33516483516483514, - "grad_norm": 0.7080990672111511, - "learning_rate": 1.5150380749100545e-05, - "loss": 0.9201, - "step": 305 - }, - { - "epoch": 0.3362637362637363, - "grad_norm": 0.7408740520477295, - "learning_rate": 1.5120428648705716e-05, - "loss": 0.9284, - "step": 306 - }, - { - "epoch": 0.3373626373626374, - "grad_norm": 0.738618791103363, - "learning_rate": 1.5090414157503715e-05, - "loss": 0.9276, - "step": 307 - }, - { - "epoch": 0.3384615384615385, - "grad_norm": 0.7790191173553467, - "learning_rate": 1.5060337641211637e-05, - "loss": 0.8432, - "step": 308 - }, - { - "epoch": 0.3395604395604396, - "grad_norm": 0.7341995239257812, - "learning_rate": 1.5030199466302354e-05, - "loss": 0.9527, - "step": 309 - }, - { - "epoch": 0.34065934065934067, - "grad_norm": 0.7559542059898376, - "learning_rate": 1.5000000000000002e-05, - "loss": 0.9207, - "step": 310 - }, - { - "epoch": 0.34175824175824177, - "grad_norm": 0.8332465291023254, - "learning_rate": 1.4969739610275556e-05, - "loss": 0.8846, - "step": 311 - }, - { - "epoch": 0.34285714285714286, - "grad_norm": 0.7986682057380676, - "learning_rate": 1.493941866584231e-05, - "loss": 0.9331, - "step": 312 - }, - { - "epoch": 0.34395604395604396, - "grad_norm": 0.742353618144989, - "learning_rate": 1.490903753615141e-05, - "loss": 0.9785, - "step": 313 - }, - { - "epoch": 0.34505494505494505, - "grad_norm": 0.7452000379562378, - "learning_rate": 1.4878596591387329e-05, - "loss": 0.9215, - "step": 314 - }, - { - "epoch": 0.34615384615384615, - "grad_norm": 0.7506792545318604, - "learning_rate": 1.4848096202463373e-05, - "loss": 0.9063, - "step": 315 - }, - { - "epoch": 0.34725274725274724, - "grad_norm": 0.8034257888793945, - "learning_rate": 1.4817536741017153e-05, - "loss": 0.9613, - "step": 316 - }, - { - "epoch": 0.34835164835164834, - "grad_norm": 0.7605523467063904, - "learning_rate": 1.478691857940607e-05, - "loss": 0.9269, - "step": 317 - }, - { - "epoch": 0.34945054945054943, - "grad_norm": 0.7829498648643494, - "learning_rate": 1.4756242090702756e-05, - "loss": 0.9404, - "step": 318 - }, - { - "epoch": 0.3505494505494505, - "grad_norm": 0.7662166953086853, - "learning_rate": 1.4725507648690542e-05, - "loss": 0.9472, - "step": 319 - }, - { - "epoch": 0.3516483516483517, - "grad_norm": 0.7379273772239685, - "learning_rate": 1.469471562785891e-05, - "loss": 0.9237, - "step": 320 - }, - { - "epoch": 0.35274725274725277, - "grad_norm": 0.796578049659729, - "learning_rate": 1.4663866403398915e-05, - "loss": 0.8688, - "step": 321 - }, - { - "epoch": 0.35384615384615387, - "grad_norm": 0.7834927439689636, - "learning_rate": 1.463296035119862e-05, - "loss": 0.9444, - "step": 322 - }, - { - "epoch": 0.35494505494505496, - "grad_norm": 0.783683717250824, - "learning_rate": 1.4601997847838518e-05, - "loss": 0.939, - "step": 323 - }, - { - "epoch": 0.35604395604395606, - "grad_norm": 0.74640291929245, - "learning_rate": 1.4570979270586944e-05, - "loss": 0.9244, - "step": 324 - }, - { - "epoch": 0.35714285714285715, - "grad_norm": 0.7685065865516663, - "learning_rate": 1.4539904997395468e-05, - "loss": 0.8833, - "step": 325 - }, - { - "epoch": 0.35824175824175825, - "grad_norm": 0.7682788372039795, - "learning_rate": 1.4508775406894308e-05, - "loss": 0.8969, - "step": 326 - }, - { - "epoch": 0.35934065934065934, - "grad_norm": 0.8653631806373596, - "learning_rate": 1.4477590878387697e-05, - "loss": 0.8895, - "step": 327 - }, - { - "epoch": 0.36043956043956044, - "grad_norm": 0.7875328063964844, - "learning_rate": 1.4446351791849276e-05, - "loss": 0.9475, - "step": 328 - }, - { - "epoch": 0.36153846153846153, - "grad_norm": 0.7357600331306458, - "learning_rate": 1.4415058527917454e-05, - "loss": 0.8029, - "step": 329 - }, - { - "epoch": 0.3626373626373626, - "grad_norm": 0.7759565114974976, - "learning_rate": 1.4383711467890776e-05, - "loss": 0.9407, - "step": 330 - }, - { - "epoch": 0.3637362637362637, - "grad_norm": 0.9226092100143433, - "learning_rate": 1.4352310993723277e-05, - "loss": 0.8742, - "step": 331 - }, - { - "epoch": 0.3648351648351648, - "grad_norm": 0.8586281538009644, - "learning_rate": 1.4320857488019826e-05, - "loss": 0.8724, - "step": 332 - }, - { - "epoch": 0.3659340659340659, - "grad_norm": 0.7703315615653992, - "learning_rate": 1.4289351334031461e-05, - "loss": 0.8854, - "step": 333 - }, - { - "epoch": 0.367032967032967, - "grad_norm": 0.8326209187507629, - "learning_rate": 1.4257792915650728e-05, - "loss": 0.9225, - "step": 334 - }, - { - "epoch": 0.36813186813186816, - "grad_norm": 0.7407411336898804, - "learning_rate": 1.4226182617406996e-05, - "loss": 0.9459, - "step": 335 - }, - { - "epoch": 0.36923076923076925, - "grad_norm": 0.8011029362678528, - "learning_rate": 1.4194520824461773e-05, - "loss": 0.9099, - "step": 336 - }, - { - "epoch": 0.37032967032967035, - "grad_norm": 0.7630101442337036, - "learning_rate": 1.4162807922604014e-05, - "loss": 0.9229, - "step": 337 - }, - { - "epoch": 0.37142857142857144, - "grad_norm": 0.7321675419807434, - "learning_rate": 1.413104429824542e-05, - "loss": 0.948, - "step": 338 - }, - { - "epoch": 0.37252747252747254, - "grad_norm": 0.8401708006858826, - "learning_rate": 1.4099230338415728e-05, - "loss": 0.9053, - "step": 339 - }, - { - "epoch": 0.37362637362637363, - "grad_norm": 0.7967417240142822, - "learning_rate": 1.4067366430758004e-05, - "loss": 0.8841, - "step": 340 - }, - { - "epoch": 0.3747252747252747, - "grad_norm": 0.8076946139335632, - "learning_rate": 1.4035452963523903e-05, - "loss": 0.9023, - "step": 341 - }, - { - "epoch": 0.3758241758241758, - "grad_norm": 0.8282385468482971, - "learning_rate": 1.4003490325568953e-05, - "loss": 0.8308, - "step": 342 - }, - { - "epoch": 0.3769230769230769, - "grad_norm": 0.7593387961387634, - "learning_rate": 1.3971478906347806e-05, - "loss": 0.9097, - "step": 343 - }, - { - "epoch": 0.378021978021978, - "grad_norm": 0.7764955163002014, - "learning_rate": 1.3939419095909513e-05, - "loss": 0.8749, - "step": 344 - }, - { - "epoch": 0.3791208791208791, - "grad_norm": 0.7175388336181641, - "learning_rate": 1.3907311284892737e-05, - "loss": 0.9238, - "step": 345 - }, - { - "epoch": 0.3802197802197802, - "grad_norm": 0.7766281962394714, - "learning_rate": 1.3875155864521031e-05, - "loss": 0.8817, - "step": 346 - }, - { - "epoch": 0.3813186813186813, - "grad_norm": 0.7682809233665466, - "learning_rate": 1.3842953226598036e-05, - "loss": 0.9199, - "step": 347 - }, - { - "epoch": 0.3824175824175824, - "grad_norm": 0.8041484951972961, - "learning_rate": 1.3810703763502744e-05, - "loss": 0.9423, - "step": 348 - }, - { - "epoch": 0.38351648351648354, - "grad_norm": 0.7942437529563904, - "learning_rate": 1.3778407868184674e-05, - "loss": 0.9104, - "step": 349 - }, - { - "epoch": 0.38461538461538464, - "grad_norm": 0.6990230083465576, - "learning_rate": 1.3746065934159123e-05, - "loss": 0.9148, - "step": 350 - }, - { - "epoch": 0.38571428571428573, - "grad_norm": 0.7476593852043152, - "learning_rate": 1.371367835550235e-05, - "loss": 0.9318, - "step": 351 - }, - { - "epoch": 0.3868131868131868, - "grad_norm": 0.7421396374702454, - "learning_rate": 1.3681245526846782e-05, - "loss": 0.9123, - "step": 352 - }, - { - "epoch": 0.3879120879120879, - "grad_norm": 0.7453359365463257, - "learning_rate": 1.3648767843376196e-05, - "loss": 0.9132, - "step": 353 - }, - { - "epoch": 0.389010989010989, - "grad_norm": 0.8082804679870605, - "learning_rate": 1.3616245700820922e-05, - "loss": 0.8787, - "step": 354 - }, - { - "epoch": 0.3901098901098901, - "grad_norm": 0.8102105259895325, - "learning_rate": 1.3583679495453e-05, - "loss": 0.9048, - "step": 355 - }, - { - "epoch": 0.3912087912087912, - "grad_norm": 0.7853689193725586, - "learning_rate": 1.3551069624081372e-05, - "loss": 0.8987, - "step": 356 - }, - { - "epoch": 0.3923076923076923, - "grad_norm": 0.7894556522369385, - "learning_rate": 1.3518416484047018e-05, - "loss": 0.9519, - "step": 357 - }, - { - "epoch": 0.3934065934065934, - "grad_norm": 0.7613143920898438, - "learning_rate": 1.3485720473218153e-05, - "loss": 0.9126, - "step": 358 - }, - { - "epoch": 0.3945054945054945, - "grad_norm": 0.7199496626853943, - "learning_rate": 1.3452981989985347e-05, - "loss": 0.9035, - "step": 359 - }, - { - "epoch": 0.3956043956043956, - "grad_norm": 0.8076068758964539, - "learning_rate": 1.342020143325669e-05, - "loss": 0.9222, - "step": 360 - }, - { - "epoch": 0.3967032967032967, - "grad_norm": 0.8114787936210632, - "learning_rate": 1.3387379202452917e-05, - "loss": 0.9201, - "step": 361 - }, - { - "epoch": 0.3978021978021978, - "grad_norm": 0.859348475933075, - "learning_rate": 1.3354515697502552e-05, - "loss": 0.9541, - "step": 362 - }, - { - "epoch": 0.3989010989010989, - "grad_norm": 0.7929165959358215, - "learning_rate": 1.3321611318837033e-05, - "loss": 0.9302, - "step": 363 - }, - { - "epoch": 0.4, - "grad_norm": 0.8095117211341858, - "learning_rate": 1.3288666467385834e-05, - "loss": 0.93, - "step": 364 - }, - { - "epoch": 0.4010989010989011, - "grad_norm": 0.8312798142433167, - "learning_rate": 1.3255681544571568e-05, - "loss": 0.9182, - "step": 365 - }, - { - "epoch": 0.4021978021978022, - "grad_norm": 0.7289481163024902, - "learning_rate": 1.3222656952305113e-05, - "loss": 0.9165, - "step": 366 - }, - { - "epoch": 0.4032967032967033, - "grad_norm": 0.8334656953811646, - "learning_rate": 1.3189593092980701e-05, - "loss": 0.9024, - "step": 367 - }, - { - "epoch": 0.4043956043956044, - "grad_norm": 0.8704919219017029, - "learning_rate": 1.3156490369471026e-05, - "loss": 0.9127, - "step": 368 - }, - { - "epoch": 0.4054945054945055, - "grad_norm": 0.8928689360618591, - "learning_rate": 1.3123349185122328e-05, - "loss": 0.905, - "step": 369 - }, - { - "epoch": 0.4065934065934066, - "grad_norm": 0.7368168234825134, - "learning_rate": 1.3090169943749475e-05, - "loss": 0.9124, - "step": 370 - }, - { - "epoch": 0.4076923076923077, - "grad_norm": 0.7631838917732239, - "learning_rate": 1.3056953049631059e-05, - "loss": 0.9479, - "step": 371 - }, - { - "epoch": 0.4087912087912088, - "grad_norm": 0.7834437489509583, - "learning_rate": 1.3023698907504447e-05, - "loss": 0.9308, - "step": 372 - }, - { - "epoch": 0.4098901098901099, - "grad_norm": 0.8620699644088745, - "learning_rate": 1.2990407922560869e-05, - "loss": 0.9334, - "step": 373 - }, - { - "epoch": 0.41098901098901097, - "grad_norm": 0.7941575646400452, - "learning_rate": 1.2957080500440469e-05, - "loss": 0.9035, - "step": 374 - }, - { - "epoch": 0.41208791208791207, - "grad_norm": 0.8387643098831177, - "learning_rate": 1.2923717047227368e-05, - "loss": 0.876, - "step": 375 - }, - { - "epoch": 0.41318681318681316, - "grad_norm": 0.8617086410522461, - "learning_rate": 1.2890317969444716e-05, - "loss": 0.9371, - "step": 376 - }, - { - "epoch": 0.4142857142857143, - "grad_norm": 0.8338737487792969, - "learning_rate": 1.2856883674049736e-05, - "loss": 0.8857, - "step": 377 - }, - { - "epoch": 0.4153846153846154, - "grad_norm": 0.8637845516204834, - "learning_rate": 1.2823414568428767e-05, - "loss": 0.8765, - "step": 378 - }, - { - "epoch": 0.4164835164835165, - "grad_norm": 0.8519617319107056, - "learning_rate": 1.2789911060392295e-05, - "loss": 0.9189, - "step": 379 - }, - { - "epoch": 0.4175824175824176, - "grad_norm": 0.8661258816719055, - "learning_rate": 1.2756373558169992e-05, - "loss": 0.8903, - "step": 380 - }, - { - "epoch": 0.4186813186813187, - "grad_norm": 0.8369588851928711, - "learning_rate": 1.2722802470405744e-05, - "loss": 0.9026, - "step": 381 - }, - { - "epoch": 0.4197802197802198, - "grad_norm": 0.80511873960495, - "learning_rate": 1.2689198206152657e-05, - "loss": 0.9001, - "step": 382 - }, - { - "epoch": 0.4208791208791209, - "grad_norm": 0.8231064677238464, - "learning_rate": 1.265556117486809e-05, - "loss": 0.8825, - "step": 383 - }, - { - "epoch": 0.421978021978022, - "grad_norm": 0.8600528240203857, - "learning_rate": 1.2621891786408648e-05, - "loss": 0.9347, - "step": 384 - }, - { - "epoch": 0.4230769230769231, - "grad_norm": 0.9167119860649109, - "learning_rate": 1.2588190451025209e-05, - "loss": 0.9151, - "step": 385 - }, - { - "epoch": 0.42417582417582417, - "grad_norm": 0.947830080986023, - "learning_rate": 1.2554457579357906e-05, - "loss": 0.9053, - "step": 386 - }, - { - "epoch": 0.42527472527472526, - "grad_norm": 0.8016114830970764, - "learning_rate": 1.252069358243114e-05, - "loss": 0.929, - "step": 387 - }, - { - "epoch": 0.42637362637362636, - "grad_norm": 0.8494726419448853, - "learning_rate": 1.2486898871648552e-05, - "loss": 0.9054, - "step": 388 - }, - { - "epoch": 0.42747252747252745, - "grad_norm": 0.9367812871932983, - "learning_rate": 1.2453073858788027e-05, - "loss": 0.8638, - "step": 389 - }, - { - "epoch": 0.42857142857142855, - "grad_norm": 0.8473528027534485, - "learning_rate": 1.2419218955996677e-05, - "loss": 0.8899, - "step": 390 - }, - { - "epoch": 0.42967032967032964, - "grad_norm": 0.8546606302261353, - "learning_rate": 1.238533457578581e-05, - "loss": 0.9472, - "step": 391 - }, - { - "epoch": 0.4307692307692308, - "grad_norm": 0.815858006477356, - "learning_rate": 1.23514211310259e-05, - "loss": 0.898, - "step": 392 - }, - { - "epoch": 0.4318681318681319, - "grad_norm": 0.8338545560836792, - "learning_rate": 1.2317479034941572e-05, - "loss": 0.8841, - "step": 393 - }, - { - "epoch": 0.432967032967033, - "grad_norm": 0.9826653599739075, - "learning_rate": 1.2283508701106559e-05, - "loss": 0.8147, - "step": 394 - }, - { - "epoch": 0.4340659340659341, - "grad_norm": 0.7983335256576538, - "learning_rate": 1.2249510543438652e-05, - "loss": 0.9141, - "step": 395 - }, - { - "epoch": 0.4351648351648352, - "grad_norm": 0.8190010786056519, - "learning_rate": 1.2215484976194675e-05, - "loss": 0.8771, - "step": 396 - }, - { - "epoch": 0.43626373626373627, - "grad_norm": 0.8096528053283691, - "learning_rate": 1.2181432413965428e-05, - "loss": 0.884, - "step": 397 - }, - { - "epoch": 0.43736263736263736, - "grad_norm": 0.8622708916664124, - "learning_rate": 1.2147353271670634e-05, - "loss": 0.8831, - "step": 398 - }, - { - "epoch": 0.43846153846153846, - "grad_norm": 0.9301890730857849, - "learning_rate": 1.211324796455389e-05, - "loss": 0.9255, - "step": 399 - }, - { - "epoch": 0.43956043956043955, - "grad_norm": 0.8613848686218262, - "learning_rate": 1.2079116908177592e-05, - "loss": 0.9175, - "step": 400 - }, - { - "epoch": 0.44065934065934065, - "grad_norm": 0.9630892276763916, - "learning_rate": 1.2044960518417902e-05, - "loss": 0.9185, - "step": 401 - }, - { - "epoch": 0.44175824175824174, - "grad_norm": 0.8790509104728699, - "learning_rate": 1.2010779211459649e-05, - "loss": 0.8554, - "step": 402 - }, - { - "epoch": 0.44285714285714284, - "grad_norm": 0.843239426612854, - "learning_rate": 1.1976573403791263e-05, - "loss": 0.9029, - "step": 403 - }, - { - "epoch": 0.44395604395604393, - "grad_norm": 0.8969971537590027, - "learning_rate": 1.194234351219972e-05, - "loss": 0.9309, - "step": 404 - }, - { - "epoch": 0.44505494505494503, - "grad_norm": 0.9060132503509521, - "learning_rate": 1.190808995376545e-05, - "loss": 0.9385, - "step": 405 - }, - { - "epoch": 0.4461538461538462, - "grad_norm": 0.8224505186080933, - "learning_rate": 1.187381314585725e-05, - "loss": 0.9221, - "step": 406 - }, - { - "epoch": 0.4472527472527473, - "grad_norm": 0.834482729434967, - "learning_rate": 1.1839513506127202e-05, - "loss": 0.8758, - "step": 407 - }, - { - "epoch": 0.44835164835164837, - "grad_norm": 0.8234885931015015, - "learning_rate": 1.1805191452505602e-05, - "loss": 0.9122, - "step": 408 - }, - { - "epoch": 0.44945054945054946, - "grad_norm": 0.8815814852714539, - "learning_rate": 1.1770847403195836e-05, - "loss": 0.8646, - "step": 409 - }, - { - "epoch": 0.45054945054945056, - "grad_norm": 0.8104194402694702, - "learning_rate": 1.1736481776669307e-05, - "loss": 0.9326, - "step": 410 - }, - { - "epoch": 0.45164835164835165, - "grad_norm": 0.8835035562515259, - "learning_rate": 1.1702094991660326e-05, - "loss": 0.8773, - "step": 411 - }, - { - "epoch": 0.45274725274725275, - "grad_norm": 0.8700149059295654, - "learning_rate": 1.1667687467161025e-05, - "loss": 0.9061, - "step": 412 - }, - { - "epoch": 0.45384615384615384, - "grad_norm": 0.9154369831085205, - "learning_rate": 1.1633259622416224e-05, - "loss": 0.9423, - "step": 413 - }, - { - "epoch": 0.45494505494505494, - "grad_norm": 0.8719054460525513, - "learning_rate": 1.159881187691835e-05, - "loss": 0.8841, - "step": 414 - }, - { - "epoch": 0.45604395604395603, - "grad_norm": 0.87920081615448, - "learning_rate": 1.156434465040231e-05, - "loss": 0.8736, - "step": 415 - }, - { - "epoch": 0.45714285714285713, - "grad_norm": 0.8094822764396667, - "learning_rate": 1.1529858362840383e-05, - "loss": 0.9175, - "step": 416 - }, - { - "epoch": 0.4582417582417582, - "grad_norm": 0.780607283115387, - "learning_rate": 1.1495353434437098e-05, - "loss": 0.8126, - "step": 417 - }, - { - "epoch": 0.4593406593406593, - "grad_norm": 0.8137334585189819, - "learning_rate": 1.1460830285624119e-05, - "loss": 0.9058, - "step": 418 - }, - { - "epoch": 0.4604395604395604, - "grad_norm": 0.8687000870704651, - "learning_rate": 1.1426289337055119e-05, - "loss": 0.8686, - "step": 419 - }, - { - "epoch": 0.46153846153846156, - "grad_norm": 0.8456850051879883, - "learning_rate": 1.1391731009600655e-05, - "loss": 0.8527, - "step": 420 - }, - { - "epoch": 0.46263736263736266, - "grad_norm": 0.9492373466491699, - "learning_rate": 1.1357155724343046e-05, - "loss": 0.8709, - "step": 421 - }, - { - "epoch": 0.46373626373626375, - "grad_norm": 0.9114683270454407, - "learning_rate": 1.1322563902571227e-05, - "loss": 0.8875, - "step": 422 - }, - { - "epoch": 0.46483516483516485, - "grad_norm": 0.777378499507904, - "learning_rate": 1.128795596577563e-05, - "loss": 0.8847, - "step": 423 - }, - { - "epoch": 0.46593406593406594, - "grad_norm": 0.8800742626190186, - "learning_rate": 1.1253332335643043e-05, - "loss": 0.8674, - "step": 424 - }, - { - "epoch": 0.46703296703296704, - "grad_norm": 0.7953140735626221, - "learning_rate": 1.1218693434051475e-05, - "loss": 0.9221, - "step": 425 - }, - { - "epoch": 0.46813186813186813, - "grad_norm": 0.9174728989601135, - "learning_rate": 1.1184039683065014e-05, - "loss": 0.8949, - "step": 426 - }, - { - "epoch": 0.46923076923076923, - "grad_norm": 0.9254575967788696, - "learning_rate": 1.1149371504928667e-05, - "loss": 0.9118, - "step": 427 - }, - { - "epoch": 0.4703296703296703, - "grad_norm": 0.8703052401542664, - "learning_rate": 1.1114689322063255e-05, - "loss": 0.9026, - "step": 428 - }, - { - "epoch": 0.4714285714285714, - "grad_norm": 0.7875840663909912, - "learning_rate": 1.1079993557060228e-05, - "loss": 0.9182, - "step": 429 - }, - { - "epoch": 0.4725274725274725, - "grad_norm": 0.8329475522041321, - "learning_rate": 1.1045284632676535e-05, - "loss": 0.8907, - "step": 430 - }, - { - "epoch": 0.4736263736263736, - "grad_norm": 0.7891417741775513, - "learning_rate": 1.1010562971829464e-05, - "loss": 0.9159, - "step": 431 - }, - { - "epoch": 0.4747252747252747, - "grad_norm": 0.8938619494438171, - "learning_rate": 1.0975828997591496e-05, - "loss": 0.8849, - "step": 432 - }, - { - "epoch": 0.4758241758241758, - "grad_norm": 0.8578941226005554, - "learning_rate": 1.0941083133185146e-05, - "loss": 0.9163, - "step": 433 - }, - { - "epoch": 0.47692307692307695, - "grad_norm": 0.8057888150215149, - "learning_rate": 1.0906325801977804e-05, - "loss": 0.8947, - "step": 434 - }, - { - "epoch": 0.47802197802197804, - "grad_norm": 0.8770987391471863, - "learning_rate": 1.0871557427476585e-05, - "loss": 0.858, - "step": 435 - }, - { - "epoch": 0.47912087912087914, - "grad_norm": 0.8221072554588318, - "learning_rate": 1.083677843332316e-05, - "loss": 0.8844, - "step": 436 - }, - { - "epoch": 0.48021978021978023, - "grad_norm": 0.8124603629112244, - "learning_rate": 1.0801989243288588e-05, - "loss": 0.9312, - "step": 437 - }, - { - "epoch": 0.48131868131868133, - "grad_norm": 0.7840957641601562, - "learning_rate": 1.0767190281268187e-05, - "loss": 0.9466, - "step": 438 - }, - { - "epoch": 0.4824175824175824, - "grad_norm": 0.8962573409080505, - "learning_rate": 1.0732381971276318e-05, - "loss": 0.8569, - "step": 439 - }, - { - "epoch": 0.4835164835164835, - "grad_norm": 0.9741160869598389, - "learning_rate": 1.0697564737441254e-05, - "loss": 0.8852, - "step": 440 - }, - { - "epoch": 0.4846153846153846, - "grad_norm": 0.8237054944038391, - "learning_rate": 1.0662739004000005e-05, - "loss": 0.9067, - "step": 441 - }, - { - "epoch": 0.4857142857142857, - "grad_norm": 0.9245902895927429, - "learning_rate": 1.0627905195293135e-05, - "loss": 0.8869, - "step": 442 - }, - { - "epoch": 0.4868131868131868, - "grad_norm": 0.941974937915802, - "learning_rate": 1.0593063735759619e-05, - "loss": 0.9111, - "step": 443 - }, - { - "epoch": 0.4879120879120879, - "grad_norm": 0.8515195250511169, - "learning_rate": 1.055821504993164e-05, - "loss": 0.8922, - "step": 444 - }, - { - "epoch": 0.489010989010989, - "grad_norm": 0.8483346104621887, - "learning_rate": 1.0523359562429441e-05, - "loss": 0.8779, - "step": 445 - }, - { - "epoch": 0.4901098901098901, - "grad_norm": 0.8782788515090942, - "learning_rate": 1.0488497697956134e-05, - "loss": 0.8884, - "step": 446 - }, - { - "epoch": 0.4912087912087912, - "grad_norm": 0.9266250133514404, - "learning_rate": 1.0453629881292537e-05, - "loss": 0.8865, - "step": 447 - }, - { - "epoch": 0.49230769230769234, - "grad_norm": 0.8418594598770142, - "learning_rate": 1.0418756537291996e-05, - "loss": 0.905, - "step": 448 - }, - { - "epoch": 0.49340659340659343, - "grad_norm": 0.9292969107627869, - "learning_rate": 1.03838780908752e-05, - "loss": 0.9311, - "step": 449 - }, - { - "epoch": 0.4945054945054945, - "grad_norm": 0.895294725894928, - "learning_rate": 1.0348994967025012e-05, - "loss": 0.9038, - "step": 450 - }, - { - "epoch": 0.4956043956043956, - "grad_norm": 0.9297352433204651, - "learning_rate": 1.0314107590781284e-05, - "loss": 0.8805, - "step": 451 - }, - { - "epoch": 0.4967032967032967, - "grad_norm": 0.9604554176330566, - "learning_rate": 1.0279216387235691e-05, - "loss": 0.9017, - "step": 452 - }, - { - "epoch": 0.4978021978021978, - "grad_norm": 0.9686382412910461, - "learning_rate": 1.0244321781526533e-05, - "loss": 0.9096, - "step": 453 - }, - { - "epoch": 0.4989010989010989, - "grad_norm": 0.8140036463737488, - "learning_rate": 1.0209424198833571e-05, - "loss": 0.7962, - "step": 454 - }, - { - "epoch": 0.5, - "grad_norm": 0.842017650604248, - "learning_rate": 1.0174524064372837e-05, - "loss": 0.8854, - "step": 455 - }, - { - "epoch": 0.5010989010989011, - "grad_norm": 0.8883151412010193, - "learning_rate": 1.0139621803391454e-05, - "loss": 0.9134, - "step": 456 - }, - { - "epoch": 0.5010989010989011, - "eval_loss": 0.8049879670143127, - "eval_runtime": 286.2429, - "eval_samples_per_second": 9.474, - "eval_steps_per_second": 0.395, - "step": 456 - }, - { - "epoch": 0.5021978021978022, - "grad_norm": 0.8745456337928772, - "learning_rate": 1.010471784116246e-05, - "loss": 0.9142, - "step": 457 - }, - { - "epoch": 0.5032967032967033, - "grad_norm": 0.8373499512672424, - "learning_rate": 1.0069812602979617e-05, - "loss": 0.9088, - "step": 458 - }, - { - "epoch": 0.5043956043956044, - "grad_norm": 0.8675478100776672, - "learning_rate": 1.0034906514152239e-05, - "loss": 0.8773, - "step": 459 - }, - { - "epoch": 0.5054945054945055, - "grad_norm": 0.8534087538719177, - "learning_rate": 1e-05, - "loss": 0.8792, - "step": 460 - }, - { - "epoch": 0.5065934065934066, - "grad_norm": 0.8625239729881287, - "learning_rate": 9.965093485847766e-06, - "loss": 0.8823, - "step": 461 - }, - { - "epoch": 0.5076923076923077, - "grad_norm": 0.8697182536125183, - "learning_rate": 9.930187397020385e-06, - "loss": 0.8816, - "step": 462 - }, - { - "epoch": 0.5087912087912088, - "grad_norm": 0.8853737711906433, - "learning_rate": 9.895282158837545e-06, - "loss": 0.8886, - "step": 463 - }, - { - "epoch": 0.5098901098901099, - "grad_norm": 0.8275629878044128, - "learning_rate": 9.860378196608549e-06, - "loss": 0.9398, - "step": 464 - }, - { - "epoch": 0.510989010989011, - "grad_norm": 0.8475217819213867, - "learning_rate": 9.825475935627165e-06, - "loss": 0.907, - "step": 465 - }, - { - "epoch": 0.512087912087912, - "grad_norm": 0.8975513577461243, - "learning_rate": 9.790575801166432e-06, - "loss": 0.8613, - "step": 466 - }, - { - "epoch": 0.5131868131868131, - "grad_norm": 0.9341835379600525, - "learning_rate": 9.75567821847347e-06, - "loss": 0.8613, - "step": 467 - }, - { - "epoch": 0.5142857142857142, - "grad_norm": 0.8976718783378601, - "learning_rate": 9.720783612764314e-06, - "loss": 0.8892, - "step": 468 - }, - { - "epoch": 0.5153846153846153, - "grad_norm": 0.927270770072937, - "learning_rate": 9.685892409218718e-06, - "loss": 0.9063, - "step": 469 - }, - { - "epoch": 0.5164835164835165, - "grad_norm": 0.8267651200294495, - "learning_rate": 9.651005032974994e-06, - "loss": 0.9018, - "step": 470 - }, - { - "epoch": 0.5175824175824176, - "grad_norm": 0.9432240128517151, - "learning_rate": 9.616121909124801e-06, - "loss": 0.8688, - "step": 471 - }, - { - "epoch": 0.5186813186813187, - "grad_norm": 0.7942216992378235, - "learning_rate": 9.581243462708007e-06, - "loss": 0.8882, - "step": 472 - }, - { - "epoch": 0.5197802197802198, - "grad_norm": 0.8511030077934265, - "learning_rate": 9.546370118707463e-06, - "loss": 0.9003, - "step": 473 - }, - { - "epoch": 0.5208791208791209, - "grad_norm": 0.84229975938797, - "learning_rate": 9.511502302043867e-06, - "loss": 0.8799, - "step": 474 - }, - { - "epoch": 0.521978021978022, - "grad_norm": 0.8626801371574402, - "learning_rate": 9.476640437570562e-06, - "loss": 0.9023, - "step": 475 - }, - { - "epoch": 0.5230769230769231, - "grad_norm": 0.8502923846244812, - "learning_rate": 9.441784950068362e-06, - "loss": 0.9321, - "step": 476 - }, - { - "epoch": 0.5241758241758242, - "grad_norm": 0.9864429831504822, - "learning_rate": 9.406936264240386e-06, - "loss": 0.8694, - "step": 477 - }, - { - "epoch": 0.5252747252747253, - "grad_norm": 0.8329097032546997, - "learning_rate": 9.372094804706867e-06, - "loss": 0.9242, - "step": 478 - }, - { - "epoch": 0.5263736263736264, - "grad_norm": 0.9097688794136047, - "learning_rate": 9.337260996000002e-06, - "loss": 0.8705, - "step": 479 - }, - { - "epoch": 0.5274725274725275, - "grad_norm": 0.8455448150634766, - "learning_rate": 9.302435262558748e-06, - "loss": 0.9122, - "step": 480 - }, - { - "epoch": 0.5285714285714286, - "grad_norm": 0.8785291314125061, - "learning_rate": 9.267618028723687e-06, - "loss": 0.9566, - "step": 481 - }, - { - "epoch": 0.5296703296703297, - "grad_norm": 0.9193261861801147, - "learning_rate": 9.232809718731815e-06, - "loss": 0.8815, - "step": 482 - }, - { - "epoch": 0.5307692307692308, - "grad_norm": 0.9496477246284485, - "learning_rate": 9.198010756711413e-06, - "loss": 0.8698, - "step": 483 - }, - { - "epoch": 0.5318681318681319, - "grad_norm": 0.9123839139938354, - "learning_rate": 9.163221566676847e-06, - "loss": 0.8724, - "step": 484 - }, - { - "epoch": 0.532967032967033, - "grad_norm": 0.9025096297264099, - "learning_rate": 9.128442572523418e-06, - "loss": 0.8703, - "step": 485 - }, - { - "epoch": 0.5340659340659341, - "grad_norm": 0.879030704498291, - "learning_rate": 9.093674198022201e-06, - "loss": 0.8539, - "step": 486 - }, - { - "epoch": 0.5351648351648352, - "grad_norm": 0.8959964513778687, - "learning_rate": 9.058916866814857e-06, - "loss": 0.908, - "step": 487 - }, - { - "epoch": 0.5362637362637362, - "grad_norm": 0.9018028974533081, - "learning_rate": 9.024171002408507e-06, - "loss": 0.8563, - "step": 488 - }, - { - "epoch": 0.5373626373626373, - "grad_norm": 0.9013898968696594, - "learning_rate": 8.989437028170537e-06, - "loss": 0.892, - "step": 489 - }, - { - "epoch": 0.5384615384615384, - "grad_norm": 1.0456573963165283, - "learning_rate": 8.954715367323468e-06, - "loss": 0.8697, - "step": 490 - }, - { - "epoch": 0.5395604395604395, - "grad_norm": 0.8590214252471924, - "learning_rate": 8.920006442939772e-06, - "loss": 0.8996, - "step": 491 - }, - { - "epoch": 0.5406593406593406, - "grad_norm": 0.821367621421814, - "learning_rate": 8.885310677936746e-06, - "loss": 0.7652, - "step": 492 - }, - { - "epoch": 0.5417582417582417, - "grad_norm": 0.7900271415710449, - "learning_rate": 8.850628495071336e-06, - "loss": 0.9241, - "step": 493 - }, - { - "epoch": 0.5428571428571428, - "grad_norm": 0.891683042049408, - "learning_rate": 8.815960316934991e-06, - "loss": 0.864, - "step": 494 - }, - { - "epoch": 0.5439560439560439, - "grad_norm": 0.8206665515899658, - "learning_rate": 8.781306565948528e-06, - "loss": 0.7992, - "step": 495 - }, - { - "epoch": 0.545054945054945, - "grad_norm": 0.9308216571807861, - "learning_rate": 8.746667664356957e-06, - "loss": 0.8557, - "step": 496 - }, - { - "epoch": 0.5461538461538461, - "grad_norm": 0.9433832168579102, - "learning_rate": 8.712044034224374e-06, - "loss": 0.8698, - "step": 497 - }, - { - "epoch": 0.5472527472527473, - "grad_norm": 0.848560631275177, - "learning_rate": 8.677436097428775e-06, - "loss": 0.9119, - "step": 498 - }, - { - "epoch": 0.5483516483516484, - "grad_norm": 0.9120827913284302, - "learning_rate": 8.642844275656957e-06, - "loss": 0.8977, - "step": 499 - }, - { - "epoch": 0.5494505494505495, - "grad_norm": 0.9794803857803345, - "learning_rate": 8.60826899039935e-06, - "loss": 0.8619, - "step": 500 - }, - { - "epoch": 0.5505494505494506, - "grad_norm": 0.9240473508834839, - "learning_rate": 8.573710662944884e-06, - "loss": 0.8755, - "step": 501 - }, - { - "epoch": 0.5516483516483517, - "grad_norm": 0.932121217250824, - "learning_rate": 8.539169714375885e-06, - "loss": 0.8924, - "step": 502 - }, - { - "epoch": 0.5527472527472528, - "grad_norm": 0.8963844180107117, - "learning_rate": 8.504646565562907e-06, - "loss": 0.8825, - "step": 503 - }, - { - "epoch": 0.5538461538461539, - "grad_norm": 0.8569233417510986, - "learning_rate": 8.47014163715962e-06, - "loss": 0.9201, - "step": 504 - }, - { - "epoch": 0.554945054945055, - "grad_norm": 1.0095889568328857, - "learning_rate": 8.43565534959769e-06, - "loss": 0.8511, - "step": 505 - }, - { - "epoch": 0.5560439560439561, - "grad_norm": 0.9729787111282349, - "learning_rate": 8.401188123081653e-06, - "loss": 0.8411, - "step": 506 - }, - { - "epoch": 0.5571428571428572, - "grad_norm": 0.9129795432090759, - "learning_rate": 8.366740377583781e-06, - "loss": 0.8953, - "step": 507 - }, - { - "epoch": 0.5582417582417583, - "grad_norm": 0.878101646900177, - "learning_rate": 8.332312532838978e-06, - "loss": 0.8967, - "step": 508 - }, - { - "epoch": 0.5593406593406594, - "grad_norm": 0.9370154142379761, - "learning_rate": 8.297905008339677e-06, - "loss": 0.9144, - "step": 509 - }, - { - "epoch": 0.5604395604395604, - "grad_norm": 0.9259123802185059, - "learning_rate": 8.263518223330698e-06, - "loss": 0.8631, - "step": 510 - }, - { - "epoch": 0.5615384615384615, - "grad_norm": 0.9402754902839661, - "learning_rate": 8.22915259680417e-06, - "loss": 0.9084, - "step": 511 - }, - { - "epoch": 0.5626373626373626, - "grad_norm": 0.9622709155082703, - "learning_rate": 8.194808547494401e-06, - "loss": 0.8812, - "step": 512 - }, - { - "epoch": 0.5637362637362637, - "grad_norm": 0.9227533340454102, - "learning_rate": 8.1604864938728e-06, - "loss": 0.9053, - "step": 513 - }, - { - "epoch": 0.5648351648351648, - "grad_norm": 0.9174178242683411, - "learning_rate": 8.126186854142752e-06, - "loss": 0.8842, - "step": 514 - }, - { - "epoch": 0.5659340659340659, - "grad_norm": 1.0504857301712036, - "learning_rate": 8.091910046234552e-06, - "loss": 0.8557, - "step": 515 - }, - { - "epoch": 0.567032967032967, - "grad_norm": 0.9095496535301208, - "learning_rate": 8.057656487800283e-06, - "loss": 0.9047, - "step": 516 - }, - { - "epoch": 0.5681318681318681, - "grad_norm": 1.0382673740386963, - "learning_rate": 8.023426596208739e-06, - "loss": 0.8369, - "step": 517 - }, - { - "epoch": 0.5692307692307692, - "grad_norm": 0.9548729658126831, - "learning_rate": 7.989220788540356e-06, - "loss": 0.8764, - "step": 518 - }, - { - "epoch": 0.5703296703296703, - "grad_norm": 0.9228119850158691, - "learning_rate": 7.955039481582098e-06, - "loss": 0.9382, - "step": 519 - }, - { - "epoch": 0.5714285714285714, - "grad_norm": 0.8801160454750061, - "learning_rate": 7.92088309182241e-06, - "loss": 0.8776, - "step": 520 - }, - { - "epoch": 0.5725274725274725, - "grad_norm": 0.971278190612793, - "learning_rate": 7.886752035446116e-06, - "loss": 0.9147, - "step": 521 - }, - { - "epoch": 0.5736263736263736, - "grad_norm": 0.98002690076828, - "learning_rate": 7.852646728329368e-06, - "loss": 0.8739, - "step": 522 - }, - { - "epoch": 0.5747252747252747, - "grad_norm": 0.9560618996620178, - "learning_rate": 7.818567586034578e-06, - "loss": 0.9067, - "step": 523 - }, - { - "epoch": 0.5758241758241758, - "grad_norm": 0.8873199224472046, - "learning_rate": 7.784515023805328e-06, - "loss": 0.871, - "step": 524 - }, - { - "epoch": 0.5769230769230769, - "grad_norm": 0.9503259658813477, - "learning_rate": 7.750489456561351e-06, - "loss": 0.8664, - "step": 525 - }, - { - "epoch": 0.578021978021978, - "grad_norm": 0.976687490940094, - "learning_rate": 7.716491298893443e-06, - "loss": 0.8628, - "step": 526 - }, - { - "epoch": 0.5791208791208792, - "grad_norm": 1.0008978843688965, - "learning_rate": 7.68252096505843e-06, - "loss": 0.8889, - "step": 527 - }, - { - "epoch": 0.5802197802197803, - "grad_norm": 0.8194331526756287, - "learning_rate": 7.6485788689741e-06, - "loss": 0.9155, - "step": 528 - }, - { - "epoch": 0.5813186813186814, - "grad_norm": 0.9620078206062317, - "learning_rate": 7.6146654242141935e-06, - "loss": 0.8469, - "step": 529 - }, - { - "epoch": 0.5824175824175825, - "grad_norm": 0.9368724226951599, - "learning_rate": 7.580781044003324e-06, - "loss": 0.9273, - "step": 530 - }, - { - "epoch": 0.5835164835164836, - "grad_norm": 0.8956198692321777, - "learning_rate": 7.546926141211975e-06, - "loss": 0.9003, - "step": 531 - }, - { - "epoch": 0.5846153846153846, - "grad_norm": 0.8856068253517151, - "learning_rate": 7.513101128351454e-06, - "loss": 0.8789, - "step": 532 - }, - { - "epoch": 0.5857142857142857, - "grad_norm": 0.9317082762718201, - "learning_rate": 7.4793064175688635e-06, - "loss": 0.9155, - "step": 533 - }, - { - "epoch": 0.5868131868131868, - "grad_norm": 0.9579260945320129, - "learning_rate": 7.445542420642097e-06, - "loss": 0.8924, - "step": 534 - }, - { - "epoch": 0.5879120879120879, - "grad_norm": 0.9473505616188049, - "learning_rate": 7.411809548974792e-06, - "loss": 0.8969, - "step": 535 - }, - { - "epoch": 0.589010989010989, - "grad_norm": 0.9115369319915771, - "learning_rate": 7.378108213591355e-06, - "loss": 0.8752, - "step": 536 - }, - { - "epoch": 0.5901098901098901, - "grad_norm": 0.9633163809776306, - "learning_rate": 7.344438825131912e-06, - "loss": 0.8834, - "step": 537 - }, - { - "epoch": 0.5912087912087912, - "grad_norm": 0.9608784914016724, - "learning_rate": 7.310801793847344e-06, - "loss": 0.8751, - "step": 538 - }, - { - "epoch": 0.5923076923076923, - "grad_norm": 0.9510106444358826, - "learning_rate": 7.277197529594257e-06, - "loss": 0.9009, - "step": 539 - }, - { - "epoch": 0.5934065934065934, - "grad_norm": 1.041865587234497, - "learning_rate": 7.243626441830009e-06, - "loss": 0.8275, - "step": 540 - }, - { - "epoch": 0.5945054945054945, - "grad_norm": 0.8658502697944641, - "learning_rate": 7.210088939607709e-06, - "loss": 0.8487, - "step": 541 - }, - { - "epoch": 0.5956043956043956, - "grad_norm": 0.8698709607124329, - "learning_rate": 7.176585431571235e-06, - "loss": 0.9285, - "step": 542 - }, - { - "epoch": 0.5967032967032967, - "grad_norm": 0.8681691288948059, - "learning_rate": 7.143116325950266e-06, - "loss": 0.8689, - "step": 543 - }, - { - "epoch": 0.5978021978021978, - "grad_norm": 0.9033341407775879, - "learning_rate": 7.109682030555283e-06, - "loss": 0.8818, - "step": 544 - }, - { - "epoch": 0.5989010989010989, - "grad_norm": 1.019751787185669, - "learning_rate": 7.076282952772634e-06, - "loss": 0.8537, - "step": 545 - }, - { - "epoch": 0.6, - "grad_norm": 0.9411675930023193, - "learning_rate": 7.042919499559538e-06, - "loss": 0.8673, - "step": 546 - }, - { - "epoch": 0.6010989010989011, - "grad_norm": 0.8855169415473938, - "learning_rate": 7.009592077439135e-06, - "loss": 0.8867, - "step": 547 - }, - { - "epoch": 0.6021978021978022, - "grad_norm": 0.9604920148849487, - "learning_rate": 6.976301092495556e-06, - "loss": 0.8764, - "step": 548 - }, - { - "epoch": 0.6032967032967033, - "grad_norm": 0.9340246915817261, - "learning_rate": 6.943046950368944e-06, - "loss": 0.8546, - "step": 549 - }, - { - "epoch": 0.6043956043956044, - "grad_norm": 0.964244544506073, - "learning_rate": 6.909830056250527e-06, - "loss": 0.8914, - "step": 550 - }, - { - "epoch": 0.6054945054945055, - "grad_norm": 0.9519253969192505, - "learning_rate": 6.876650814877675e-06, - "loss": 0.8716, - "step": 551 - }, - { - "epoch": 0.6065934065934065, - "grad_norm": 0.9927486181259155, - "learning_rate": 6.843509630528977e-06, - "loss": 0.8735, - "step": 552 - }, - { - "epoch": 0.6076923076923076, - "grad_norm": 0.9689722657203674, - "learning_rate": 6.8104069070193e-06, - "loss": 0.8861, - "step": 553 - }, - { - "epoch": 0.6087912087912087, - "grad_norm": 0.9738321304321289, - "learning_rate": 6.777343047694891e-06, - "loss": 0.8596, - "step": 554 - }, - { - "epoch": 0.6098901098901099, - "grad_norm": 0.890021800994873, - "learning_rate": 6.744318455428436e-06, - "loss": 0.8901, - "step": 555 - }, - { - "epoch": 0.610989010989011, - "grad_norm": 0.9167658090591431, - "learning_rate": 6.711333532614168e-06, - "loss": 0.8826, - "step": 556 - }, - { - "epoch": 0.6120879120879121, - "grad_norm": 0.8974273204803467, - "learning_rate": 6.67838868116297e-06, - "loss": 0.8783, - "step": 557 - }, - { - "epoch": 0.6131868131868132, - "grad_norm": 0.861466646194458, - "learning_rate": 6.645484302497452e-06, - "loss": 0.897, - "step": 558 - }, - { - "epoch": 0.6142857142857143, - "grad_norm": 1.0128669738769531, - "learning_rate": 6.612620797547087e-06, - "loss": 0.8611, - "step": 559 - }, - { - "epoch": 0.6153846153846154, - "grad_norm": 0.9314884543418884, - "learning_rate": 6.579798566743314e-06, - "loss": 0.8905, - "step": 560 - }, - { - "epoch": 0.6164835164835165, - "grad_norm": 0.9477908611297607, - "learning_rate": 6.547018010014654e-06, - "loss": 0.895, - "step": 561 - }, - { - "epoch": 0.6175824175824176, - "grad_norm": 0.9088277816772461, - "learning_rate": 6.5142795267818505e-06, - "loss": 0.8851, - "step": 562 - }, - { - "epoch": 0.6186813186813187, - "grad_norm": 1.0250587463378906, - "learning_rate": 6.481583515952983e-06, - "loss": 0.8568, - "step": 563 - }, - { - "epoch": 0.6197802197802198, - "grad_norm": 0.954470694065094, - "learning_rate": 6.448930375918632e-06, - "loss": 0.8509, - "step": 564 - }, - { - "epoch": 0.6208791208791209, - "grad_norm": 0.9298940300941467, - "learning_rate": 6.4163205045469975e-06, - "loss": 0.9044, - "step": 565 - }, - { - "epoch": 0.621978021978022, - "grad_norm": 1.079363226890564, - "learning_rate": 6.383754299179079e-06, - "loss": 0.8339, - "step": 566 - }, - { - "epoch": 0.6230769230769231, - "grad_norm": 0.970298707485199, - "learning_rate": 6.351232156623803e-06, - "loss": 0.9076, - "step": 567 - }, - { - "epoch": 0.6241758241758242, - "grad_norm": 0.9551522135734558, - "learning_rate": 6.318754473153221e-06, - "loss": 0.8636, - "step": 568 - }, - { - "epoch": 0.6252747252747253, - "grad_norm": 1.0232148170471191, - "learning_rate": 6.286321644497655e-06, - "loss": 0.8949, - "step": 569 - }, - { - "epoch": 0.6263736263736264, - "grad_norm": 0.9624801278114319, - "learning_rate": 6.25393406584088e-06, - "loss": 0.895, - "step": 570 - }, - { - "epoch": 0.6274725274725275, - "grad_norm": 0.9046818614006042, - "learning_rate": 6.22159213181533e-06, - "loss": 0.8698, - "step": 571 - }, - { - "epoch": 0.6285714285714286, - "grad_norm": 0.9417130351066589, - "learning_rate": 6.18929623649726e-06, - "loss": 0.8908, - "step": 572 - }, - { - "epoch": 0.6296703296703297, - "grad_norm": 0.8617178797721863, - "learning_rate": 6.157046773401964e-06, - "loss": 0.7665, - "step": 573 - }, - { - "epoch": 0.6307692307692307, - "grad_norm": 0.9079705476760864, - "learning_rate": 6.124844135478971e-06, - "loss": 0.8728, - "step": 574 - }, - { - "epoch": 0.6318681318681318, - "grad_norm": 1.0097254514694214, - "learning_rate": 6.092688715107265e-06, - "loss": 0.8404, - "step": 575 - }, - { - "epoch": 0.6329670329670329, - "grad_norm": 0.9038534164428711, - "learning_rate": 6.06058090409049e-06, - "loss": 0.9342, - "step": 576 - }, - { - "epoch": 0.634065934065934, - "grad_norm": 1.0077219009399414, - "learning_rate": 6.028521093652195e-06, - "loss": 0.8573, - "step": 577 - }, - { - "epoch": 0.6351648351648351, - "grad_norm": 0.9598901867866516, - "learning_rate": 5.996509674431053e-06, - "loss": 0.8906, - "step": 578 - }, - { - "epoch": 0.6362637362637362, - "grad_norm": 0.9945114254951477, - "learning_rate": 5.9645470364761e-06, - "loss": 0.8674, - "step": 579 - }, - { - "epoch": 0.6373626373626373, - "grad_norm": 0.8753340840339661, - "learning_rate": 5.932633569242e-06, - "loss": 0.919, - "step": 580 - }, - { - "epoch": 0.6384615384615384, - "grad_norm": 1.050989031791687, - "learning_rate": 5.900769661584273e-06, - "loss": 0.8651, - "step": 581 - }, - { - "epoch": 0.6395604395604395, - "grad_norm": 0.952552080154419, - "learning_rate": 5.868955701754584e-06, - "loss": 0.8286, - "step": 582 - }, - { - "epoch": 0.6406593406593407, - "grad_norm": 1.0849634408950806, - "learning_rate": 5.83719207739599e-06, - "loss": 0.813, - "step": 583 - }, - { - "epoch": 0.6417582417582418, - "grad_norm": 0.9899274706840515, - "learning_rate": 5.8054791755382286e-06, - "loss": 0.8451, - "step": 584 - }, - { - "epoch": 0.6428571428571429, - "grad_norm": 0.9099906086921692, - "learning_rate": 5.773817382593008e-06, - "loss": 0.9335, - "step": 585 - }, - { - "epoch": 0.643956043956044, - "grad_norm": 1.0032905340194702, - "learning_rate": 5.742207084349274e-06, - "loss": 0.8479, - "step": 586 - }, - { - "epoch": 0.6450549450549451, - "grad_norm": 0.9248991012573242, - "learning_rate": 5.710648665968543e-06, - "loss": 0.8708, - "step": 587 - }, - { - "epoch": 0.6461538461538462, - "grad_norm": 1.064224362373352, - "learning_rate": 5.679142511980176e-06, - "loss": 0.8368, - "step": 588 - }, - { - "epoch": 0.6472527472527473, - "grad_norm": 1.0247055292129517, - "learning_rate": 5.647689006276727e-06, - "loss": 0.9147, - "step": 589 - }, - { - "epoch": 0.6483516483516484, - "grad_norm": 0.9444321990013123, - "learning_rate": 5.616288532109225e-06, - "loss": 0.8653, - "step": 590 - }, - { - "epoch": 0.6494505494505495, - "grad_norm": 0.9603102803230286, - "learning_rate": 5.584941472082549e-06, - "loss": 0.9033, - "step": 591 - }, - { - "epoch": 0.6505494505494506, - "grad_norm": 0.9474777579307556, - "learning_rate": 5.553648208150728e-06, - "loss": 0.8898, - "step": 592 - }, - { - "epoch": 0.6516483516483517, - "grad_norm": 0.930324375629425, - "learning_rate": 5.522409121612304e-06, - "loss": 0.8897, - "step": 593 - }, - { - "epoch": 0.6527472527472528, - "grad_norm": 0.9771080613136292, - "learning_rate": 5.491224593105695e-06, - "loss": 0.8754, - "step": 594 - }, - { - "epoch": 0.6538461538461539, - "grad_norm": 0.9223531484603882, - "learning_rate": 5.460095002604533e-06, - "loss": 0.8771, - "step": 595 - }, - { - "epoch": 0.654945054945055, - "grad_norm": 0.9218010306358337, - "learning_rate": 5.429020729413062e-06, - "loss": 0.8789, - "step": 596 - }, - { - "epoch": 0.656043956043956, - "grad_norm": 0.9781400561332703, - "learning_rate": 5.398002152161484e-06, - "loss": 0.8493, - "step": 597 - }, - { - "epoch": 0.6571428571428571, - "grad_norm": 0.9774869680404663, - "learning_rate": 5.367039648801386e-06, - "loss": 0.852, - "step": 598 - }, - { - "epoch": 0.6582417582417582, - "grad_norm": 0.9851926565170288, - "learning_rate": 5.336133596601089e-06, - "loss": 0.8823, - "step": 599 - }, - { - "epoch": 0.6593406593406593, - "grad_norm": 1.0178786516189575, - "learning_rate": 5.305284372141095e-06, - "loss": 0.861, - "step": 600 - }, - { - "epoch": 0.6604395604395604, - "grad_norm": 1.0185765027999878, - "learning_rate": 5.274492351309462e-06, - "loss": 0.8421, - "step": 601 - }, - { - "epoch": 0.6615384615384615, - "grad_norm": 1.1164214611053467, - "learning_rate": 5.243757909297247e-06, - "loss": 0.8657, - "step": 602 - }, - { - "epoch": 0.6626373626373626, - "grad_norm": 1.026281714439392, - "learning_rate": 5.213081420593933e-06, - "loss": 0.8649, - "step": 603 - }, - { - "epoch": 0.6637362637362637, - "grad_norm": 0.972385585308075, - "learning_rate": 5.1824632589828465e-06, - "loss": 0.8194, - "step": 604 - }, - { - "epoch": 0.6648351648351648, - "grad_norm": 0.9680708646774292, - "learning_rate": 5.151903797536631e-06, - "loss": 0.8893, - "step": 605 - }, - { - "epoch": 0.6659340659340659, - "grad_norm": 0.980360209941864, - "learning_rate": 5.121403408612672e-06, - "loss": 0.8627, - "step": 606 - }, - { - "epoch": 0.667032967032967, - "grad_norm": 0.9436994194984436, - "learning_rate": 5.090962463848592e-06, - "loss": 0.8686, - "step": 607 - }, - { - "epoch": 0.6681318681318681, - "grad_norm": 0.9691317081451416, - "learning_rate": 5.060581334157693e-06, - "loss": 0.9105, - "step": 608 - }, - { - "epoch": 0.6692307692307692, - "grad_norm": 1.0196092128753662, - "learning_rate": 5.030260389724447e-06, - "loss": 0.8732, - "step": 609 - }, - { - "epoch": 0.6703296703296703, - "grad_norm": 1.0082753896713257, - "learning_rate": 5.000000000000003e-06, - "loss": 0.8902, - "step": 610 - }, - { - "epoch": 0.6714285714285714, - "grad_norm": 0.9514579176902771, - "learning_rate": 4.96980053369765e-06, - "loss": 0.8556, - "step": 611 - }, - { - "epoch": 0.6725274725274726, - "grad_norm": 1.0114636421203613, - "learning_rate": 4.939662358788364e-06, - "loss": 0.8573, - "step": 612 - }, - { - "epoch": 0.6736263736263737, - "grad_norm": 1.05171799659729, - "learning_rate": 4.909585842496287e-06, - "loss": 0.903, - "step": 613 - }, - { - "epoch": 0.6747252747252748, - "grad_norm": 1.0408194065093994, - "learning_rate": 4.879571351294287e-06, - "loss": 0.8617, - "step": 614 - }, - { - "epoch": 0.6758241758241759, - "grad_norm": 0.98634272813797, - "learning_rate": 4.849619250899458e-06, - "loss": 0.893, - "step": 615 - }, - { - "epoch": 0.676923076923077, - "grad_norm": 0.9825206398963928, - "learning_rate": 4.8197299062687e-06, - "loss": 0.8829, - "step": 616 - }, - { - "epoch": 0.6780219780219781, - "grad_norm": 0.9365617036819458, - "learning_rate": 4.78990368159424e-06, - "loss": 0.8615, - "step": 617 - }, - { - "epoch": 0.6791208791208792, - "grad_norm": 1.0023555755615234, - "learning_rate": 4.76014094029921e-06, - "loss": 0.9059, - "step": 618 - }, - { - "epoch": 0.6802197802197802, - "grad_norm": 0.9915463924407959, - "learning_rate": 4.7304420450332244e-06, - "loss": 0.8609, - "step": 619 - }, - { - "epoch": 0.6813186813186813, - "grad_norm": 1.011752963066101, - "learning_rate": 4.700807357667953e-06, - "loss": 0.8768, - "step": 620 - }, - { - "epoch": 0.6824175824175824, - "grad_norm": 0.9427807331085205, - "learning_rate": 4.671237239292699e-06, - "loss": 0.9054, - "step": 621 - }, - { - "epoch": 0.6835164835164835, - "grad_norm": 0.9990485906600952, - "learning_rate": 4.641732050210032e-06, - "loss": 0.8978, - "step": 622 - }, - { - "epoch": 0.6846153846153846, - "grad_norm": 0.9522621631622314, - "learning_rate": 4.612292149931369e-06, - "loss": 0.8656, - "step": 623 - }, - { - "epoch": 0.6857142857142857, - "grad_norm": 1.041825532913208, - "learning_rate": 4.582917897172603e-06, - "loss": 0.8766, - "step": 624 - }, - { - "epoch": 0.6868131868131868, - "grad_norm": 0.9778273105621338, - "learning_rate": 4.5536096498497295e-06, - "loss": 0.872, - "step": 625 - }, - { - "epoch": 0.6879120879120879, - "grad_norm": 1.0157729387283325, - "learning_rate": 4.524367765074499e-06, - "loss": 0.8588, - "step": 626 - }, - { - "epoch": 0.689010989010989, - "grad_norm": 0.9173256754875183, - "learning_rate": 4.495192599150045e-06, - "loss": 0.9011, - "step": 627 - }, - { - "epoch": 0.6901098901098901, - "grad_norm": 0.8952473998069763, - "learning_rate": 4.46608450756656e-06, - "loss": 0.8615, - "step": 628 - }, - { - "epoch": 0.6912087912087912, - "grad_norm": 0.9065569043159485, - "learning_rate": 4.437043844996952e-06, - "loss": 0.9027, - "step": 629 - }, - { - "epoch": 0.6923076923076923, - "grad_norm": 1.0536279678344727, - "learning_rate": 4.408070965292534e-06, - "loss": 0.8604, - "step": 630 - }, - { - "epoch": 0.6934065934065934, - "grad_norm": 1.008117914199829, - "learning_rate": 4.379166221478697e-06, - "loss": 0.8694, - "step": 631 - }, - { - "epoch": 0.6945054945054945, - "grad_norm": 1.0685791969299316, - "learning_rate": 4.350329965750622e-06, - "loss": 0.9125, - "step": 632 - }, - { - "epoch": 0.6956043956043956, - "grad_norm": 0.9675098061561584, - "learning_rate": 4.321562549468991e-06, - "loss": 0.8803, - "step": 633 - }, - { - "epoch": 0.6967032967032967, - "grad_norm": 0.9684959053993225, - "learning_rate": 4.292864323155684e-06, - "loss": 0.8943, - "step": 634 - }, - { - "epoch": 0.6978021978021978, - "grad_norm": 1.0042601823806763, - "learning_rate": 4.264235636489542e-06, - "loss": 0.8504, - "step": 635 - }, - { - "epoch": 0.6989010989010989, - "grad_norm": 0.9642529487609863, - "learning_rate": 4.235676838302069e-06, - "loss": 0.8698, - "step": 636 - }, - { - "epoch": 0.7, - "grad_norm": 0.9767054915428162, - "learning_rate": 4.207188276573214e-06, - "loss": 0.8531, - "step": 637 - }, - { - "epoch": 0.701098901098901, - "grad_norm": 0.9658926129341125, - "learning_rate": 4.178770298427107e-06, - "loss": 0.9051, - "step": 638 - }, - { - "epoch": 0.7021978021978021, - "grad_norm": 0.9388576745986938, - "learning_rate": 4.150423250127846e-06, - "loss": 0.8821, - "step": 639 - }, - { - "epoch": 0.7032967032967034, - "grad_norm": 0.9819766283035278, - "learning_rate": 4.12214747707527e-06, - "loss": 0.8431, - "step": 640 - }, - { - "epoch": 0.7043956043956044, - "grad_norm": 0.9655351042747498, - "learning_rate": 4.093943323800746e-06, - "loss": 0.8877, - "step": 641 - }, - { - "epoch": 0.7054945054945055, - "grad_norm": 0.9678211808204651, - "learning_rate": 4.065811133962987e-06, - "loss": 0.8932, - "step": 642 - }, - { - "epoch": 0.7065934065934066, - "grad_norm": 1.008948564529419, - "learning_rate": 4.037751250343841e-06, - "loss": 0.871, - "step": 643 - }, - { - "epoch": 0.7076923076923077, - "grad_norm": 1.0651590824127197, - "learning_rate": 4.009764014844143e-06, - "loss": 0.8612, - "step": 644 - }, - { - "epoch": 0.7087912087912088, - "grad_norm": 0.9793494939804077, - "learning_rate": 3.981849768479516e-06, - "loss": 0.8787, - "step": 645 - }, - { - "epoch": 0.7098901098901099, - "grad_norm": 0.9483197927474976, - "learning_rate": 3.954008851376252e-06, - "loss": 0.876, - "step": 646 - }, - { - "epoch": 0.710989010989011, - "grad_norm": 1.0572367906570435, - "learning_rate": 3.9262416027671354e-06, - "loss": 0.8792, - "step": 647 - }, - { - "epoch": 0.7120879120879121, - "grad_norm": 0.9965589642524719, - "learning_rate": 3.898548360987325e-06, - "loss": 0.8905, - "step": 648 - }, - { - "epoch": 0.7131868131868132, - "grad_norm": 0.9877957701683044, - "learning_rate": 3.8709294634702374e-06, - "loss": 0.8424, - "step": 649 - }, - { - "epoch": 0.7142857142857143, - "grad_norm": 1.0614023208618164, - "learning_rate": 3.8433852467434175e-06, - "loss": 0.8486, - "step": 650 - }, - { - "epoch": 0.7153846153846154, - "grad_norm": 1.0227680206298828, - "learning_rate": 3.81591604642446e-06, - "loss": 0.889, - "step": 651 - }, - { - "epoch": 0.7164835164835165, - "grad_norm": 0.9940324425697327, - "learning_rate": 3.7885221972168974e-06, - "loss": 0.8582, - "step": 652 - }, - { - "epoch": 0.7175824175824176, - "grad_norm": 1.0908942222595215, - "learning_rate": 3.7612040329061405e-06, - "loss": 0.81, - "step": 653 - }, - { - "epoch": 0.7186813186813187, - "grad_norm": 0.975747287273407, - "learning_rate": 3.7339618863553983e-06, - "loss": 0.8753, - "step": 654 - }, - { - "epoch": 0.7197802197802198, - "grad_norm": 1.0195329189300537, - "learning_rate": 3.7067960895016277e-06, - "loss": 0.8451, - "step": 655 - }, - { - "epoch": 0.7208791208791209, - "grad_norm": 0.9560036659240723, - "learning_rate": 3.679706973351491e-06, - "loss": 0.9202, - "step": 656 - }, - { - "epoch": 0.721978021978022, - "grad_norm": 0.9672074913978577, - "learning_rate": 3.6526948679773256e-06, - "loss": 0.8759, - "step": 657 - }, - { - "epoch": 0.7230769230769231, - "grad_norm": 0.9840734004974365, - "learning_rate": 3.625760102513103e-06, - "loss": 0.892, - "step": 658 - }, - { - "epoch": 0.7241758241758242, - "grad_norm": 0.8806354999542236, - "learning_rate": 3.598903005150444e-06, - "loss": 0.877, - "step": 659 - }, - { - "epoch": 0.7252747252747253, - "grad_norm": 1.0590522289276123, - "learning_rate": 3.5721239031346067e-06, - "loss": 0.8344, - "step": 660 - }, - { - "epoch": 0.7263736263736263, - "grad_norm": 1.1104999780654907, - "learning_rate": 3.545423122760493e-06, - "loss": 0.8215, - "step": 661 - }, - { - "epoch": 0.7274725274725274, - "grad_norm": 0.9766964316368103, - "learning_rate": 3.5188009893686916e-06, - "loss": 0.8421, - "step": 662 - }, - { - "epoch": 0.7285714285714285, - "grad_norm": 1.0318822860717773, - "learning_rate": 3.492257827341492e-06, - "loss": 0.843, - "step": 663 - }, - { - "epoch": 0.7296703296703296, - "grad_norm": 1.070156216621399, - "learning_rate": 3.4657939600989453e-06, - "loss": 0.8625, - "step": 664 - }, - { - "epoch": 0.7307692307692307, - "grad_norm": 1.006454586982727, - "learning_rate": 3.4394097100949286e-06, - "loss": 0.8736, - "step": 665 - }, - { - "epoch": 0.7318681318681318, - "grad_norm": 1.0376707315444946, - "learning_rate": 3.4131053988131947e-06, - "loss": 0.8872, - "step": 666 - }, - { - "epoch": 0.7329670329670329, - "grad_norm": 1.0665842294692993, - "learning_rate": 3.3868813467634833e-06, - "loss": 0.8545, - "step": 667 - }, - { - "epoch": 0.734065934065934, - "grad_norm": 1.0440903902053833, - "learning_rate": 3.360737873477584e-06, - "loss": 0.9001, - "step": 668 - }, - { - "epoch": 0.7351648351648352, - "grad_norm": 0.9699259996414185, - "learning_rate": 3.3346752975054763e-06, - "loss": 0.8881, - "step": 669 - }, - { - "epoch": 0.7362637362637363, - "grad_norm": 1.0507755279541016, - "learning_rate": 3.308693936411421e-06, - "loss": 0.8568, - "step": 670 - }, - { - "epoch": 0.7373626373626374, - "grad_norm": 1.0369819402694702, - "learning_rate": 3.2827941067700996e-06, - "loss": 0.8631, - "step": 671 - }, - { - "epoch": 0.7384615384615385, - "grad_norm": 0.9790082573890686, - "learning_rate": 3.2569761241627694e-06, - "loss": 0.9159, - "step": 672 - }, - { - "epoch": 0.7395604395604396, - "grad_norm": 0.9796406626701355, - "learning_rate": 3.2312403031733943e-06, - "loss": 0.8625, - "step": 673 - }, - { - "epoch": 0.7406593406593407, - "grad_norm": 0.985804557800293, - "learning_rate": 3.2055869573848374e-06, - "loss": 0.8248, - "step": 674 - }, - { - "epoch": 0.7417582417582418, - "grad_norm": 1.046293020248413, - "learning_rate": 3.1800163993750166e-06, - "loss": 0.8577, - "step": 675 - }, - { - "epoch": 0.7428571428571429, - "grad_norm": 0.9537214040756226, - "learning_rate": 3.1545289407131128e-06, - "loss": 0.7764, - "step": 676 - }, - { - "epoch": 0.743956043956044, - "grad_norm": 1.0488396883010864, - "learning_rate": 3.1291248919557717e-06, - "loss": 0.8368, - "step": 677 - }, - { - "epoch": 0.7450549450549451, - "grad_norm": 1.043094515800476, - "learning_rate": 3.103804562643302e-06, - "loss": 0.9031, - "step": 678 - }, - { - "epoch": 0.7461538461538462, - "grad_norm": 1.0852601528167725, - "learning_rate": 3.0785682612959334e-06, - "loss": 0.9064, - "step": 679 - }, - { - "epoch": 0.7472527472527473, - "grad_norm": 0.919506311416626, - "learning_rate": 3.0534162954100264e-06, - "loss": 0.8797, - "step": 680 - }, - { - "epoch": 0.7483516483516484, - "grad_norm": 1.0292491912841797, - "learning_rate": 3.028348971454356e-06, - "loss": 0.8234, - "step": 681 - }, - { - "epoch": 0.7494505494505495, - "grad_norm": 1.1027458906173706, - "learning_rate": 3.003366594866345e-06, - "loss": 0.8913, - "step": 682 - }, - { - "epoch": 0.7505494505494505, - "grad_norm": 1.0286622047424316, - "learning_rate": 2.978469470048376e-06, - "loss": 0.8593, - "step": 683 - }, - { - "epoch": 0.7516483516483516, - "grad_norm": 0.995910108089447, - "learning_rate": 2.953657900364053e-06, - "loss": 0.8844, - "step": 684 - }, - { - "epoch": 0.7516483516483516, - "eval_loss": 0.799820065498352, - "eval_runtime": 285.758, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 0.395, - "step": 684 - }, - { - "epoch": 0.7527472527472527, - "grad_norm": 1.057847499847412, - "learning_rate": 2.9289321881345257e-06, - "loss": 0.8943, - "step": 685 - }, - { - "epoch": 0.7538461538461538, - "grad_norm": 0.9667083024978638, - "learning_rate": 2.9042926346347932e-06, - "loss": 0.8589, - "step": 686 - }, - { - "epoch": 0.7549450549450549, - "grad_norm": 1.0255805253982544, - "learning_rate": 2.8797395400900362e-06, - "loss": 0.8727, - "step": 687 - }, - { - "epoch": 0.756043956043956, - "grad_norm": 0.9197332262992859, - "learning_rate": 2.855273203671969e-06, - "loss": 0.8946, - "step": 688 - }, - { - "epoch": 0.7571428571428571, - "grad_norm": 1.0482935905456543, - "learning_rate": 2.830893923495173e-06, - "loss": 0.8485, - "step": 689 - }, - { - "epoch": 0.7582417582417582, - "grad_norm": 0.969306230545044, - "learning_rate": 2.8066019966134907e-06, - "loss": 0.9042, - "step": 690 - }, - { - "epoch": 0.7593406593406593, - "grad_norm": 1.0977563858032227, - "learning_rate": 2.7823977190163788e-06, - "loss": 0.8552, - "step": 691 - }, - { - "epoch": 0.7604395604395604, - "grad_norm": 1.1312968730926514, - "learning_rate": 2.7582813856253276e-06, - "loss": 0.8858, - "step": 692 - }, - { - "epoch": 0.7615384615384615, - "grad_norm": 0.9919386506080627, - "learning_rate": 2.7342532902902418e-06, - "loss": 0.8502, - "step": 693 - }, - { - "epoch": 0.7626373626373626, - "grad_norm": 0.9737907648086548, - "learning_rate": 2.7103137257858867e-06, - "loss": 0.8726, - "step": 694 - }, - { - "epoch": 0.7637362637362637, - "grad_norm": 1.002861738204956, - "learning_rate": 2.6864629838082957e-06, - "loss": 0.8845, - "step": 695 - }, - { - "epoch": 0.7648351648351648, - "grad_norm": 1.0830833911895752, - "learning_rate": 2.6627013549712355e-06, - "loss": 0.8497, - "step": 696 - }, - { - "epoch": 0.765934065934066, - "grad_norm": 1.0291420221328735, - "learning_rate": 2.639029128802657e-06, - "loss": 0.8918, - "step": 697 - }, - { - "epoch": 0.7670329670329671, - "grad_norm": 1.0553207397460938, - "learning_rate": 2.615446593741161e-06, - "loss": 0.8468, - "step": 698 - }, - { - "epoch": 0.7681318681318682, - "grad_norm": 1.0321344137191772, - "learning_rate": 2.5919540371325005e-06, - "loss": 0.9042, - "step": 699 - }, - { - "epoch": 0.7692307692307693, - "grad_norm": 0.9573972821235657, - "learning_rate": 2.5685517452260566e-06, - "loss": 0.8801, - "step": 700 - }, - { - "epoch": 0.7703296703296704, - "grad_norm": 1.0892798900604248, - "learning_rate": 2.5452400031713786e-06, - "loss": 0.8371, - "step": 701 - }, - { - "epoch": 0.7714285714285715, - "grad_norm": 0.9496036767959595, - "learning_rate": 2.522019095014683e-06, - "loss": 0.835, - "step": 702 - }, - { - "epoch": 0.7725274725274726, - "grad_norm": 1.1583739519119263, - "learning_rate": 2.4988893036954045e-06, - "loss": 0.8467, - "step": 703 - }, - { - "epoch": 0.7736263736263737, - "grad_norm": 1.0083168745040894, - "learning_rate": 2.4758509110427576e-06, - "loss": 0.8437, - "step": 704 - }, - { - "epoch": 0.7747252747252747, - "grad_norm": 1.0314342975616455, - "learning_rate": 2.45290419777228e-06, - "loss": 0.8827, - "step": 705 - }, - { - "epoch": 0.7758241758241758, - "grad_norm": 0.9839992523193359, - "learning_rate": 2.4300494434824373e-06, - "loss": 0.8498, - "step": 706 - }, - { - "epoch": 0.7769230769230769, - "grad_norm": 1.129815697669983, - "learning_rate": 2.407286926651192e-06, - "loss": 0.8659, - "step": 707 - }, - { - "epoch": 0.778021978021978, - "grad_norm": 1.0962284803390503, - "learning_rate": 2.3846169246326345e-06, - "loss": 0.8641, - "step": 708 - }, - { - "epoch": 0.7791208791208791, - "grad_norm": 1.0342788696289062, - "learning_rate": 2.362039713653581e-06, - "loss": 0.8244, - "step": 709 - }, - { - "epoch": 0.7802197802197802, - "grad_norm": 0.941452145576477, - "learning_rate": 2.339555568810221e-06, - "loss": 0.863, - "step": 710 - }, - { - "epoch": 0.7813186813186813, - "grad_norm": 0.9299508333206177, - "learning_rate": 2.317164764064769e-06, - "loss": 0.88, - "step": 711 - }, - { - "epoch": 0.7824175824175824, - "grad_norm": 1.0635751485824585, - "learning_rate": 2.2948675722421086e-06, - "loss": 0.8452, - "step": 712 - }, - { - "epoch": 0.7835164835164835, - "grad_norm": 1.0833570957183838, - "learning_rate": 2.27266426502649e-06, - "loss": 0.8762, - "step": 713 - }, - { - "epoch": 0.7846153846153846, - "grad_norm": 1.0336107015609741, - "learning_rate": 2.2505551129582047e-06, - "loss": 0.8542, - "step": 714 - }, - { - "epoch": 0.7857142857142857, - "grad_norm": 0.9398995637893677, - "learning_rate": 2.2285403854302912e-06, - "loss": 0.8772, - "step": 715 - }, - { - "epoch": 0.7868131868131868, - "grad_norm": 0.8882145285606384, - "learning_rate": 2.206620350685257e-06, - "loss": 0.8585, - "step": 716 - }, - { - "epoch": 0.7879120879120879, - "grad_norm": 1.0984944105148315, - "learning_rate": 2.1847952758118118e-06, - "loss": 0.8531, - "step": 717 - }, - { - "epoch": 0.789010989010989, - "grad_norm": 0.9191601872444153, - "learning_rate": 2.163065426741603e-06, - "loss": 0.8584, - "step": 718 - }, - { - "epoch": 0.7901098901098901, - "grad_norm": 1.0006372928619385, - "learning_rate": 2.1414310682459805e-06, - "loss": 0.8552, - "step": 719 - }, - { - "epoch": 0.7912087912087912, - "grad_norm": 1.0390114784240723, - "learning_rate": 2.119892463932781e-06, - "loss": 0.8885, - "step": 720 - }, - { - "epoch": 0.7923076923076923, - "grad_norm": 1.067863941192627, - "learning_rate": 2.098449876243096e-06, - "loss": 0.8432, - "step": 721 - }, - { - "epoch": 0.7934065934065934, - "grad_norm": 0.9986834526062012, - "learning_rate": 2.0771035664480944e-06, - "loss": 0.8814, - "step": 722 - }, - { - "epoch": 0.7945054945054945, - "grad_norm": 0.9810887575149536, - "learning_rate": 2.0558537946458177e-06, - "loss": 0.9094, - "step": 723 - }, - { - "epoch": 0.7956043956043956, - "grad_norm": 0.9768193364143372, - "learning_rate": 2.0347008197580376e-06, - "loss": 0.8581, - "step": 724 - }, - { - "epoch": 0.7967032967032966, - "grad_norm": 1.0558338165283203, - "learning_rate": 2.013644899527074e-06, - "loss": 0.8619, - "step": 725 - }, - { - "epoch": 0.7978021978021979, - "grad_norm": 0.9540855288505554, - "learning_rate": 1.9926862905126663e-06, - "loss": 0.8765, - "step": 726 - }, - { - "epoch": 0.798901098901099, - "grad_norm": 0.9828757047653198, - "learning_rate": 1.9718252480888567e-06, - "loss": 0.8505, - "step": 727 - }, - { - "epoch": 0.8, - "grad_norm": 0.9417946934700012, - "learning_rate": 1.95106202644086e-06, - "loss": 0.8744, - "step": 728 - }, - { - "epoch": 0.8010989010989011, - "grad_norm": 0.9659996628761292, - "learning_rate": 1.930396878561983e-06, - "loss": 0.8857, - "step": 729 - }, - { - "epoch": 0.8021978021978022, - "grad_norm": 0.9528297185897827, - "learning_rate": 1.9098300562505266e-06, - "loss": 0.8802, - "step": 730 - }, - { - "epoch": 0.8032967032967033, - "grad_norm": 1.010440707206726, - "learning_rate": 1.8893618101067357e-06, - "loss": 0.9223, - "step": 731 - }, - { - "epoch": 0.8043956043956044, - "grad_norm": 1.1046496629714966, - "learning_rate": 1.8689923895297247e-06, - "loss": 0.8416, - "step": 732 - }, - { - "epoch": 0.8054945054945055, - "grad_norm": 0.980089008808136, - "learning_rate": 1.848722042714457e-06, - "loss": 0.8745, - "step": 733 - }, - { - "epoch": 0.8065934065934066, - "grad_norm": 0.9576093554496765, - "learning_rate": 1.8285510166487154e-06, - "loss": 0.8562, - "step": 734 - }, - { - "epoch": 0.8076923076923077, - "grad_norm": 0.9389727711677551, - "learning_rate": 1.808479557110081e-06, - "loss": 0.8589, - "step": 735 - }, - { - "epoch": 0.8087912087912088, - "grad_norm": 1.0248185396194458, - "learning_rate": 1.7885079086629598e-06, - "loss": 0.8689, - "step": 736 - }, - { - "epoch": 0.8098901098901099, - "grad_norm": 1.0584657192230225, - "learning_rate": 1.7686363146555807e-06, - "loss": 0.8424, - "step": 737 - }, - { - "epoch": 0.810989010989011, - "grad_norm": 0.9815655946731567, - "learning_rate": 1.7488650172170496e-06, - "loss": 0.859, - "step": 738 - }, - { - "epoch": 0.8120879120879121, - "grad_norm": 1.037156581878662, - "learning_rate": 1.7291942572543806e-06, - "loss": 0.832, - "step": 739 - }, - { - "epoch": 0.8131868131868132, - "grad_norm": 1.0387474298477173, - "learning_rate": 1.709624274449584e-06, - "loss": 0.8821, - "step": 740 - }, - { - "epoch": 0.8142857142857143, - "grad_norm": 1.155094861984253, - "learning_rate": 1.6901553072567189e-06, - "loss": 0.8174, - "step": 741 - }, - { - "epoch": 0.8153846153846154, - "grad_norm": 1.0106889009475708, - "learning_rate": 1.6707875928990059e-06, - "loss": 0.8653, - "step": 742 - }, - { - "epoch": 0.8164835164835165, - "grad_norm": 0.9508170485496521, - "learning_rate": 1.651521367365936e-06, - "loss": 0.8992, - "step": 743 - }, - { - "epoch": 0.8175824175824176, - "grad_norm": 0.982920229434967, - "learning_rate": 1.6323568654103838e-06, - "loss": 0.9122, - "step": 744 - }, - { - "epoch": 0.8186813186813187, - "grad_norm": 1.150858759880066, - "learning_rate": 1.6132943205457607e-06, - "loss": 0.8136, - "step": 745 - }, - { - "epoch": 0.8197802197802198, - "grad_norm": 0.9389098882675171, - "learning_rate": 1.5943339650431578e-06, - "loss": 0.857, - "step": 746 - }, - { - "epoch": 0.8208791208791208, - "grad_norm": 1.0250283479690552, - "learning_rate": 1.5754760299285255e-06, - "loss": 0.8628, - "step": 747 - }, - { - "epoch": 0.8219780219780219, - "grad_norm": 1.0515613555908203, - "learning_rate": 1.5567207449798517e-06, - "loss": 0.8693, - "step": 748 - }, - { - "epoch": 0.823076923076923, - "grad_norm": 0.9877017736434937, - "learning_rate": 1.538068338724361e-06, - "loss": 0.9054, - "step": 749 - }, - { - "epoch": 0.8241758241758241, - "grad_norm": 0.9997501969337463, - "learning_rate": 1.5195190384357405e-06, - "loss": 0.8584, - "step": 750 - }, - { - "epoch": 0.8252747252747252, - "grad_norm": 1.0947656631469727, - "learning_rate": 1.5010730701313626e-06, - "loss": 0.8888, - "step": 751 - }, - { - "epoch": 0.8263736263736263, - "grad_norm": 1.0243169069290161, - "learning_rate": 1.4827306585695234e-06, - "loss": 0.8626, - "step": 752 - }, - { - "epoch": 0.8274725274725274, - "grad_norm": 0.9385499954223633, - "learning_rate": 1.4644920272467245e-06, - "loss": 0.8854, - "step": 753 - }, - { - "epoch": 0.8285714285714286, - "grad_norm": 1.0409740209579468, - "learning_rate": 1.446357398394934e-06, - "loss": 0.8187, - "step": 754 - }, - { - "epoch": 0.8296703296703297, - "grad_norm": 1.0209931135177612, - "learning_rate": 1.4283269929788779e-06, - "loss": 0.8589, - "step": 755 - }, - { - "epoch": 0.8307692307692308, - "grad_norm": 1.0085232257843018, - "learning_rate": 1.4104010306933558e-06, - "loss": 0.8666, - "step": 756 - }, - { - "epoch": 0.8318681318681319, - "grad_norm": 1.0032801628112793, - "learning_rate": 1.3925797299605649e-06, - "loss": 0.8978, - "step": 757 - }, - { - "epoch": 0.832967032967033, - "grad_norm": 0.9480746984481812, - "learning_rate": 1.3748633079274254e-06, - "loss": 0.8524, - "step": 758 - }, - { - "epoch": 0.8340659340659341, - "grad_norm": 1.0284579992294312, - "learning_rate": 1.3572519804629537e-06, - "loss": 0.8204, - "step": 759 - }, - { - "epoch": 0.8351648351648352, - "grad_norm": 1.0335582494735718, - "learning_rate": 1.339745962155613e-06, - "loss": 0.8666, - "step": 760 - }, - { - "epoch": 0.8362637362637363, - "grad_norm": 1.0865117311477661, - "learning_rate": 1.322345466310717e-06, - "loss": 0.864, - "step": 761 - }, - { - "epoch": 0.8373626373626374, - "grad_norm": 1.0500315427780151, - "learning_rate": 1.30505070494781e-06, - "loss": 0.8407, - "step": 762 - }, - { - "epoch": 0.8384615384615385, - "grad_norm": 1.1114267110824585, - "learning_rate": 1.2878618887981064e-06, - "loss": 0.8478, - "step": 763 - }, - { - "epoch": 0.8395604395604396, - "grad_norm": 1.096484899520874, - "learning_rate": 1.2707792273019049e-06, - "loss": 0.8273, - "step": 764 - }, - { - "epoch": 0.8406593406593407, - "grad_norm": 1.0912528038024902, - "learning_rate": 1.2538029286060428e-06, - "loss": 0.8497, - "step": 765 - }, - { - "epoch": 0.8417582417582418, - "grad_norm": 0.9767187833786011, - "learning_rate": 1.2369331995613664e-06, - "loss": 0.8629, - "step": 766 - }, - { - "epoch": 0.8428571428571429, - "grad_norm": 1.0741673707962036, - "learning_rate": 1.2201702457201948e-06, - "loss": 0.8734, - "step": 767 - }, - { - "epoch": 0.843956043956044, - "grad_norm": 1.019477128982544, - "learning_rate": 1.2035142713338366e-06, - "loss": 0.8891, - "step": 768 - }, - { - "epoch": 0.845054945054945, - "grad_norm": 1.02205491065979, - "learning_rate": 1.1869654793500784e-06, - "loss": 0.8213, - "step": 769 - }, - { - "epoch": 0.8461538461538461, - "grad_norm": 0.9983634352684021, - "learning_rate": 1.1705240714107301e-06, - "loss": 0.8849, - "step": 770 - }, - { - "epoch": 0.8472527472527472, - "grad_norm": 1.0600976943969727, - "learning_rate": 1.1541902478491607e-06, - "loss": 0.9081, - "step": 771 - }, - { - "epoch": 0.8483516483516483, - "grad_norm": 1.0630656480789185, - "learning_rate": 1.1379642076878528e-06, - "loss": 0.856, - "step": 772 - }, - { - "epoch": 0.8494505494505494, - "grad_norm": 1.012676477432251, - "learning_rate": 1.1218461486359878e-06, - "loss": 0.899, - "step": 773 - }, - { - "epoch": 0.8505494505494505, - "grad_norm": 0.9903215169906616, - "learning_rate": 1.1058362670870248e-06, - "loss": 0.8554, - "step": 774 - }, - { - "epoch": 0.8516483516483516, - "grad_norm": 0.990456759929657, - "learning_rate": 1.0899347581163222e-06, - "loss": 0.8937, - "step": 775 - }, - { - "epoch": 0.8527472527472527, - "grad_norm": 1.0120489597320557, - "learning_rate": 1.0741418154787443e-06, - "loss": 0.8539, - "step": 776 - }, - { - "epoch": 0.8538461538461538, - "grad_norm": 0.9427462816238403, - "learning_rate": 1.058457631606319e-06, - "loss": 0.8433, - "step": 777 - }, - { - "epoch": 0.8549450549450549, - "grad_norm": 0.8684647083282471, - "learning_rate": 1.042882397605871e-06, - "loss": 0.8641, - "step": 778 - }, - { - "epoch": 0.856043956043956, - "grad_norm": 1.0312992334365845, - "learning_rate": 1.0274163032567165e-06, - "loss": 0.8775, - "step": 779 - }, - { - "epoch": 0.8571428571428571, - "grad_norm": 1.079676628112793, - "learning_rate": 1.012059537008332e-06, - "loss": 0.8622, - "step": 780 - }, - { - "epoch": 0.8582417582417582, - "grad_norm": 1.0317022800445557, - "learning_rate": 9.968122859780648e-07, - "loss": 0.8594, - "step": 781 - }, - { - "epoch": 0.8593406593406593, - "grad_norm": 1.1111171245574951, - "learning_rate": 9.816747359488632e-07, - "loss": 0.8544, - "step": 782 - }, - { - "epoch": 0.8604395604395605, - "grad_norm": 0.9669517278671265, - "learning_rate": 9.666470713669918e-07, - "loss": 0.8898, - "step": 783 - }, - { - "epoch": 0.8615384615384616, - "grad_norm": 0.9427120089530945, - "learning_rate": 9.517294753398066e-07, - "loss": 0.8779, - "step": 784 - }, - { - "epoch": 0.8626373626373627, - "grad_norm": 0.9802073836326599, - "learning_rate": 9.369221296335007e-07, - "loss": 0.8869, - "step": 785 - }, - { - "epoch": 0.8637362637362638, - "grad_norm": 0.9776756167411804, - "learning_rate": 9.222252146709143e-07, - "loss": 0.8521, - "step": 786 - }, - { - "epoch": 0.8648351648351649, - "grad_norm": 1.0648225545883179, - "learning_rate": 9.076389095293148e-07, - "loss": 0.8526, - "step": 787 - }, - { - "epoch": 0.865934065934066, - "grad_norm": 1.0023770332336426, - "learning_rate": 8.931633919382299e-07, - "loss": 0.8543, - "step": 788 - }, - { - "epoch": 0.8670329670329671, - "grad_norm": 0.979067325592041, - "learning_rate": 8.787988382772705e-07, - "loss": 0.8707, - "step": 789 - }, - { - "epoch": 0.8681318681318682, - "grad_norm": 0.9979352355003357, - "learning_rate": 8.645454235739903e-07, - "loss": 0.8501, - "step": 790 - }, - { - "epoch": 0.8692307692307693, - "grad_norm": 0.9789692163467407, - "learning_rate": 8.504033215017527e-07, - "loss": 0.8657, - "step": 791 - }, - { - "epoch": 0.8703296703296703, - "grad_norm": 0.9490736722946167, - "learning_rate": 8.363727043776037e-07, - "loss": 0.913, - "step": 792 - }, - { - "epoch": 0.8714285714285714, - "grad_norm": 1.0531755685806274, - "learning_rate": 8.224537431601886e-07, - "loss": 0.8727, - "step": 793 - }, - { - "epoch": 0.8725274725274725, - "grad_norm": 1.066422700881958, - "learning_rate": 8.086466074476562e-07, - "loss": 0.8543, - "step": 794 - }, - { - "epoch": 0.8736263736263736, - "grad_norm": 0.9029947519302368, - "learning_rate": 7.949514654755963e-07, - "loss": 0.8755, - "step": 795 - }, - { - "epoch": 0.8747252747252747, - "grad_norm": 1.0645475387573242, - "learning_rate": 7.81368484114996e-07, - "loss": 0.8478, - "step": 796 - }, - { - "epoch": 0.8758241758241758, - "grad_norm": 1.0116077661514282, - "learning_rate": 7.678978288701911e-07, - "loss": 0.8936, - "step": 797 - }, - { - "epoch": 0.8769230769230769, - "grad_norm": 0.9004772305488586, - "learning_rate": 7.545396638768698e-07, - "loss": 0.8649, - "step": 798 - }, - { - "epoch": 0.878021978021978, - "grad_norm": 0.9908126592636108, - "learning_rate": 7.412941519000527e-07, - "loss": 0.9102, - "step": 799 - }, - { - "epoch": 0.8791208791208791, - "grad_norm": 1.0973479747772217, - "learning_rate": 7.281614543321269e-07, - "loss": 0.8483, - "step": 800 - }, - { - "epoch": 0.8802197802197802, - "grad_norm": 0.9987009763717651, - "learning_rate": 7.151417311908648e-07, - "loss": 0.8663, - "step": 801 - }, - { - "epoch": 0.8813186813186813, - "grad_norm": 0.976642370223999, - "learning_rate": 7.022351411174866e-07, - "loss": 0.8673, - "step": 802 - }, - { - "epoch": 0.8824175824175824, - "grad_norm": 1.0586979389190674, - "learning_rate": 6.894418413747183e-07, - "loss": 0.875, - "step": 803 - }, - { - "epoch": 0.8835164835164835, - "grad_norm": 0.961605966091156, - "learning_rate": 6.767619878448783e-07, - "loss": 0.8702, - "step": 804 - }, - { - "epoch": 0.8846153846153846, - "grad_norm": 0.9752766489982605, - "learning_rate": 6.641957350279838e-07, - "loss": 0.9223, - "step": 805 - }, - { - "epoch": 0.8857142857142857, - "grad_norm": 1.052821397781372, - "learning_rate": 6.517432360398556e-07, - "loss": 0.8753, - "step": 806 - }, - { - "epoch": 0.8868131868131868, - "grad_norm": 1.1543768644332886, - "learning_rate": 6.394046426102673e-07, - "loss": 0.8198, - "step": 807 - }, - { - "epoch": 0.8879120879120879, - "grad_norm": 0.988862156867981, - "learning_rate": 6.271801050810856e-07, - "loss": 0.8691, - "step": 808 - }, - { - "epoch": 0.889010989010989, - "grad_norm": 0.976913571357727, - "learning_rate": 6.150697724044407e-07, - "loss": 0.8835, - "step": 809 - }, - { - "epoch": 0.8901098901098901, - "grad_norm": 1.0245682001113892, - "learning_rate": 6.030737921409169e-07, - "loss": 0.8679, - "step": 810 - }, - { - "epoch": 0.8912087912087913, - "grad_norm": 1.0596377849578857, - "learning_rate": 5.911923104577455e-07, - "loss": 0.8357, - "step": 811 - }, - { - "epoch": 0.8923076923076924, - "grad_norm": 1.0137977600097656, - "learning_rate": 5.794254721270331e-07, - "loss": 0.9107, - "step": 812 - }, - { - "epoch": 0.8934065934065935, - "grad_norm": 0.9983325600624084, - "learning_rate": 5.677734205239904e-07, - "loss": 0.8505, - "step": 813 - }, - { - "epoch": 0.8945054945054945, - "grad_norm": 1.0369681119918823, - "learning_rate": 5.562362976251901e-07, - "loss": 0.8489, - "step": 814 - }, - { - "epoch": 0.8956043956043956, - "grad_norm": 1.0275548696517944, - "learning_rate": 5.448142440068316e-07, - "loss": 0.8216, - "step": 815 - }, - { - "epoch": 0.8967032967032967, - "grad_norm": 1.006768822669983, - "learning_rate": 5.335073988430373e-07, - "loss": 0.8714, - "step": 816 - }, - { - "epoch": 0.8978021978021978, - "grad_norm": 1.013403296470642, - "learning_rate": 5.223158999041444e-07, - "loss": 0.9003, - "step": 817 - }, - { - "epoch": 0.8989010989010989, - "grad_norm": 1.0281546115875244, - "learning_rate": 5.112398835550348e-07, - "loss": 0.8455, - "step": 818 - }, - { - "epoch": 0.9, - "grad_norm": 0.9589701294898987, - "learning_rate": 5.002794847534765e-07, - "loss": 0.8949, - "step": 819 - }, - { - "epoch": 0.9010989010989011, - "grad_norm": 0.8923426270484924, - "learning_rate": 4.894348370484648e-07, - "loss": 0.8954, - "step": 820 - }, - { - "epoch": 0.9021978021978022, - "grad_norm": 0.984615683555603, - "learning_rate": 4.787060725786141e-07, - "loss": 0.886, - "step": 821 - }, - { - "epoch": 0.9032967032967033, - "grad_norm": 1.061354637145996, - "learning_rate": 4.6809332207053083e-07, - "loss": 0.8642, - "step": 822 - }, - { - "epoch": 0.9043956043956044, - "grad_norm": 1.0845102071762085, - "learning_rate": 4.575967148372318e-07, - "loss": 0.8346, - "step": 823 - }, - { - "epoch": 0.9054945054945055, - "grad_norm": 1.0316174030303955, - "learning_rate": 4.4721637877656377e-07, - "loss": 0.8575, - "step": 824 - }, - { - "epoch": 0.9065934065934066, - "grad_norm": 0.9098844528198242, - "learning_rate": 4.3695244036964567e-07, - "loss": 0.9179, - "step": 825 - }, - { - "epoch": 0.9076923076923077, - "grad_norm": 0.9126648902893066, - "learning_rate": 4.268050246793276e-07, - "loss": 0.8847, - "step": 826 - }, - { - "epoch": 0.9087912087912088, - "grad_norm": 1.0376429557800293, - "learning_rate": 4.167742553486676e-07, - "loss": 0.8706, - "step": 827 - }, - { - "epoch": 0.9098901098901099, - "grad_norm": 0.971727192401886, - "learning_rate": 4.068602545994249e-07, - "loss": 0.8744, - "step": 828 - }, - { - "epoch": 0.910989010989011, - "grad_norm": 0.9974921941757202, - "learning_rate": 3.9706314323056936e-07, - "loss": 0.874, - "step": 829 - }, - { - "epoch": 0.9120879120879121, - "grad_norm": 0.9950027465820312, - "learning_rate": 3.8738304061681107e-07, - "loss": 0.889, - "step": 830 - }, - { - "epoch": 0.9131868131868132, - "grad_norm": 1.0793790817260742, - "learning_rate": 3.7782006470714614e-07, - "loss": 0.8591, - "step": 831 - }, - { - "epoch": 0.9142857142857143, - "grad_norm": 1.1178529262542725, - "learning_rate": 3.68374332023419e-07, - "loss": 0.8348, - "step": 832 - }, - { - "epoch": 0.9153846153846154, - "grad_norm": 1.0072646141052246, - "learning_rate": 3.590459576589e-07, - "loss": 0.8874, - "step": 833 - }, - { - "epoch": 0.9164835164835164, - "grad_norm": 0.9767899513244629, - "learning_rate": 3.498350552768859e-07, - "loss": 0.8749, - "step": 834 - }, - { - "epoch": 0.9175824175824175, - "grad_norm": 1.059394121170044, - "learning_rate": 3.4074173710931804e-07, - "loss": 0.8676, - "step": 835 - }, - { - "epoch": 0.9186813186813186, - "grad_norm": 1.1175642013549805, - "learning_rate": 3.3176611395540625e-07, - "loss": 0.8283, - "step": 836 - }, - { - "epoch": 0.9197802197802197, - "grad_norm": 1.0190948247909546, - "learning_rate": 3.2290829518028867e-07, - "loss": 0.8866, - "step": 837 - }, - { - "epoch": 0.9208791208791208, - "grad_norm": 0.9729024767875671, - "learning_rate": 3.1416838871368925e-07, - "loss": 0.864, - "step": 838 - }, - { - "epoch": 0.921978021978022, - "grad_norm": 0.9739720821380615, - "learning_rate": 3.0554650104861137e-07, - "loss": 0.8792, - "step": 839 - }, - { - "epoch": 0.9230769230769231, - "grad_norm": 1.0263519287109375, - "learning_rate": 2.970427372400353e-07, - "loss": 0.8436, - "step": 840 - }, - { - "epoch": 0.9241758241758242, - "grad_norm": 1.0045640468597412, - "learning_rate": 2.8865720090364037e-07, - "loss": 0.8533, - "step": 841 - }, - { - "epoch": 0.9252747252747253, - "grad_norm": 0.9555304050445557, - "learning_rate": 2.8038999421453827e-07, - "loss": 0.9201, - "step": 842 - }, - { - "epoch": 0.9263736263736264, - "grad_norm": 1.1007894277572632, - "learning_rate": 2.7224121790603517e-07, - "loss": 0.833, - "step": 843 - }, - { - "epoch": 0.9274725274725275, - "grad_norm": 0.9987741708755493, - "learning_rate": 2.6421097126839714e-07, - "loss": 0.8424, - "step": 844 - }, - { - "epoch": 0.9285714285714286, - "grad_norm": 0.940259575843811, - "learning_rate": 2.5629935214764866e-07, - "loss": 0.8799, - "step": 845 - }, - { - "epoch": 0.9296703296703297, - "grad_norm": 1.066133737564087, - "learning_rate": 2.4850645694436736e-07, - "loss": 0.88, - "step": 846 - }, - { - "epoch": 0.9307692307692308, - "grad_norm": 1.02364182472229, - "learning_rate": 2.4083238061252565e-07, - "loss": 0.8713, - "step": 847 - }, - { - "epoch": 0.9318681318681319, - "grad_norm": 0.918208658695221, - "learning_rate": 2.332772166583208e-07, - "loss": 0.8665, - "step": 848 - }, - { - "epoch": 0.932967032967033, - "grad_norm": 1.0443564653396606, - "learning_rate": 2.2584105713904126e-07, - "loss": 0.8851, - "step": 849 - }, - { - "epoch": 0.9340659340659341, - "grad_norm": 1.195809006690979, - "learning_rate": 2.1852399266194312e-07, - "loss": 0.8182, - "step": 850 - }, - { - "epoch": 0.9351648351648352, - "grad_norm": 1.0563225746154785, - "learning_rate": 2.1132611238315004e-07, - "loss": 0.8522, - "step": 851 - }, - { - "epoch": 0.9362637362637363, - "grad_norm": 0.9704524874687195, - "learning_rate": 2.0424750400655947e-07, - "loss": 0.8964, - "step": 852 - }, - { - "epoch": 0.9373626373626374, - "grad_norm": 0.9127840995788574, - "learning_rate": 1.9728825378278248e-07, - "loss": 0.8531, - "step": 853 - }, - { - "epoch": 0.9384615384615385, - "grad_norm": 0.9666539430618286, - "learning_rate": 1.9044844650808468e-07, - "loss": 0.8931, - "step": 854 - }, - { - "epoch": 0.9395604395604396, - "grad_norm": 1.0174075365066528, - "learning_rate": 1.8372816552336025e-07, - "loss": 0.8841, - "step": 855 - }, - { - "epoch": 0.9406593406593406, - "grad_norm": 0.9566776752471924, - "learning_rate": 1.7712749271311392e-07, - "loss": 0.8781, - "step": 856 - }, - { - "epoch": 0.9417582417582417, - "grad_norm": 1.030104160308838, - "learning_rate": 1.706465085044584e-07, - "loss": 0.8459, - "step": 857 - }, - { - "epoch": 0.9428571428571428, - "grad_norm": 1.0015788078308105, - "learning_rate": 1.6428529186614195e-07, - "loss": 0.8493, - "step": 858 - }, - { - "epoch": 0.9439560439560439, - "grad_norm": 1.0031136274337769, - "learning_rate": 1.580439203075812e-07, - "loss": 0.8853, - "step": 859 - }, - { - "epoch": 0.945054945054945, - "grad_norm": 1.0552512407302856, - "learning_rate": 1.519224698779198e-07, - "loss": 0.897, - "step": 860 - }, - { - "epoch": 0.9461538461538461, - "grad_norm": 1.000359296798706, - "learning_rate": 1.4592101516509916e-07, - "loss": 0.8637, - "step": 861 - }, - { - "epoch": 0.9472527472527472, - "grad_norm": 0.9663481116294861, - "learning_rate": 1.400396292949513e-07, - "loss": 0.8507, - "step": 862 - }, - { - "epoch": 0.9483516483516483, - "grad_norm": 0.9774083495140076, - "learning_rate": 1.3427838393030634e-07, - "loss": 0.8798, - "step": 863 - }, - { - "epoch": 0.9494505494505494, - "grad_norm": 1.0701053142547607, - "learning_rate": 1.2863734927012094e-07, - "loss": 0.8151, - "step": 864 - }, - { - "epoch": 0.9505494505494505, - "grad_norm": 1.0122300386428833, - "learning_rate": 1.231165940486234e-07, - "loss": 0.8534, - "step": 865 - }, - { - "epoch": 0.9516483516483516, - "grad_norm": 1.0653303861618042, - "learning_rate": 1.1771618553447217e-07, - "loss": 0.8832, - "step": 866 - }, - { - "epoch": 0.9527472527472527, - "grad_norm": 0.9388241171836853, - "learning_rate": 1.1243618952994195e-07, - "loss": 0.9025, - "step": 867 - }, - { - "epoch": 0.9538461538461539, - "grad_norm": 0.9821454286575317, - "learning_rate": 1.0727667037011668e-07, - "loss": 0.8458, - "step": 868 - }, - { - "epoch": 0.954945054945055, - "grad_norm": 1.0746763944625854, - "learning_rate": 1.0223769092211012e-07, - "loss": 0.8426, - "step": 869 - }, - { - "epoch": 0.9560439560439561, - "grad_norm": 1.0331653356552124, - "learning_rate": 9.731931258429638e-08, - "loss": 0.8368, - "step": 870 - }, - { - "epoch": 0.9571428571428572, - "grad_norm": 1.066191554069519, - "learning_rate": 9.252159528556404e-08, - "loss": 0.8399, - "step": 871 - }, - { - "epoch": 0.9582417582417583, - "grad_norm": 1.0262787342071533, - "learning_rate": 8.784459748458318e-08, - "loss": 0.8657, - "step": 872 - }, - { - "epoch": 0.9593406593406594, - "grad_norm": 0.9380448460578918, - "learning_rate": 8.328837616909612e-08, - "loss": 0.8491, - "step": 873 - }, - { - "epoch": 0.9604395604395605, - "grad_norm": 1.0255717039108276, - "learning_rate": 7.885298685522235e-08, - "loss": 0.8625, - "step": 874 - }, - { - "epoch": 0.9615384615384616, - "grad_norm": 0.9140982627868652, - "learning_rate": 7.453848358678018e-08, - "loss": 0.8927, - "step": 875 - }, - { - "epoch": 0.9626373626373627, - "grad_norm": 1.0343888998031616, - "learning_rate": 7.034491893463059e-08, - "loss": 0.8455, - "step": 876 - }, - { - "epoch": 0.9637362637362638, - "grad_norm": 1.0084855556488037, - "learning_rate": 6.627234399603554e-08, - "loss": 0.8375, - "step": 877 - }, - { - "epoch": 0.9648351648351648, - "grad_norm": 0.9889881014823914, - "learning_rate": 6.232080839403631e-08, - "loss": 0.8697, - "step": 878 - }, - { - "epoch": 0.9659340659340659, - "grad_norm": 1.0728535652160645, - "learning_rate": 5.849036027684607e-08, - "loss": 0.8707, - "step": 879 - }, - { - "epoch": 0.967032967032967, - "grad_norm": 0.9325518608093262, - "learning_rate": 5.4781046317267103e-08, - "loss": 0.8799, - "step": 880 - }, - { - "epoch": 0.9681318681318681, - "grad_norm": 0.9515867829322815, - "learning_rate": 5.119291171211793e-08, - "loss": 0.8813, - "step": 881 - }, - { - "epoch": 0.9692307692307692, - "grad_norm": 1.066673755645752, - "learning_rate": 4.772600018168816e-08, - "loss": 0.9001, - "step": 882 - }, - { - "epoch": 0.9703296703296703, - "grad_norm": 0.9614695906639099, - "learning_rate": 4.438035396920004e-08, - "loss": 0.8331, - "step": 883 - }, - { - "epoch": 0.9714285714285714, - "grad_norm": 1.1650054454803467, - "learning_rate": 4.115601384029666e-08, - "loss": 0.8554, - "step": 884 - }, - { - "epoch": 0.9725274725274725, - "grad_norm": 1.0655165910720825, - "learning_rate": 3.805301908254455e-08, - "loss": 0.9067, - "step": 885 - }, - { - "epoch": 0.9736263736263736, - "grad_norm": 0.9343301057815552, - "learning_rate": 3.50714075049563e-08, - "loss": 0.8791, - "step": 886 - }, - { - "epoch": 0.9747252747252747, - "grad_norm": 1.0297307968139648, - "learning_rate": 3.22112154375287e-08, - "loss": 0.8639, - "step": 887 - }, - { - "epoch": 0.9758241758241758, - "grad_norm": 1.1532951593399048, - "learning_rate": 2.947247773079753e-08, - "loss": 0.8787, - "step": 888 - }, - { - "epoch": 0.9769230769230769, - "grad_norm": 0.9690073728561401, - "learning_rate": 2.6855227755419046e-08, - "loss": 0.8726, - "step": 889 - }, - { - "epoch": 0.978021978021978, - "grad_norm": 0.931484043598175, - "learning_rate": 2.4359497401758026e-08, - "loss": 0.8792, - "step": 890 - }, - { - "epoch": 0.9791208791208791, - "grad_norm": 0.9039062857627869, - "learning_rate": 2.1985317079500358e-08, - "loss": 0.9051, - "step": 891 - }, - { - "epoch": 0.9802197802197802, - "grad_norm": 1.0109405517578125, - "learning_rate": 1.973271571728441e-08, - "loss": 0.8878, - "step": 892 - }, - { - "epoch": 0.9813186813186813, - "grad_norm": 0.959872305393219, - "learning_rate": 1.7601720762346895e-08, - "loss": 0.8679, - "step": 893 - }, - { - "epoch": 0.9824175824175824, - "grad_norm": 1.031186819076538, - "learning_rate": 1.5592358180189782e-08, - "loss": 0.8578, - "step": 894 - }, - { - "epoch": 0.9835164835164835, - "grad_norm": 0.9113213419914246, - "learning_rate": 1.370465245426167e-08, - "loss": 0.8873, - "step": 895 - }, - { - "epoch": 0.9846153846153847, - "grad_norm": 0.9987814426422119, - "learning_rate": 1.1938626585660252e-08, - "loss": 0.8721, - "step": 896 - }, - { - "epoch": 0.9857142857142858, - "grad_norm": 1.0924243927001953, - "learning_rate": 1.0294302092853647e-08, - "loss": 0.8109, - "step": 897 - }, - { - "epoch": 0.9868131868131869, - "grad_norm": 1.096369981765747, - "learning_rate": 8.771699011416169e-09, - "loss": 0.8455, - "step": 898 - }, - { - "epoch": 0.987912087912088, - "grad_norm": 1.0039680004119873, - "learning_rate": 7.370835893788508e-09, - "loss": 0.8466, - "step": 899 - }, - { - "epoch": 0.989010989010989, - "grad_norm": 1.0639376640319824, - "learning_rate": 6.091729809042379e-09, - "loss": 0.8474, - "step": 900 - }, - { - "epoch": 0.9901098901098901, - "grad_norm": 0.9821733236312866, - "learning_rate": 4.9343963426840006e-09, - "loss": 0.9115, - "step": 901 - }, - { - "epoch": 0.9912087912087912, - "grad_norm": 0.9844228029251099, - "learning_rate": 3.898849596456477e-09, - "loss": 0.8749, - "step": 902 - }, - { - "epoch": 0.9923076923076923, - "grad_norm": 0.9702888131141663, - "learning_rate": 2.9851021881688314e-09, - "loss": 0.8611, - "step": 903 - }, - { - "epoch": 0.9934065934065934, - "grad_norm": 1.0319045782089233, - "learning_rate": 2.193165251545004e-09, - "loss": 0.8526, - "step": 904 - }, - { - "epoch": 0.9945054945054945, - "grad_norm": 0.9472014904022217, - "learning_rate": 1.5230484360873043e-09, - "loss": 0.8698, - "step": 905 - }, - { - "epoch": 0.9956043956043956, - "grad_norm": 1.154123067855835, - "learning_rate": 9.74759906957612e-10, - "loss": 0.8262, - "step": 906 - }, - { - "epoch": 0.9967032967032967, - "grad_norm": 0.9860395789146423, - "learning_rate": 5.483063448785686e-10, - "loss": 0.8783, - "step": 907 - }, - { - "epoch": 0.9978021978021978, - "grad_norm": 1.099100947380066, - "learning_rate": 2.436929460525317e-10, - "loss": 0.8329, - "step": 908 - }, - { - "epoch": 0.9989010989010989, - "grad_norm": 0.9741246700286865, - "learning_rate": 6.092342209607083e-11, - "loss": 0.8633, - "step": 909 - }, - { - "epoch": 1.0, - "grad_norm": 1.0878819227218628, - "learning_rate": 0.0, - "loss": 0.8508, - "step": 910 - } - ], - "logging_steps": 1, - "max_steps": 910, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 5.428008654345339e+18, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -}