|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 112,
  "global_step": 1344,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002232142857142857,
      "grad_norm": 6.444167137145996,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 0.7893,
      "step": 1
    },
    {
      "epoch": 0.002232142857142857,
      "eval_loss": 0.7994120121002197,
      "eval_runtime": 31.574,
      "eval_samples_per_second": 2.312,
      "eval_steps_per_second": 0.317,
      "step": 1
    },
    {
      "epoch": 0.004464285714285714,
      "grad_norm": 6.631099224090576,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 0.8404,
      "step": 2
    },
    {
      "epoch": 0.006696428571428571,
      "grad_norm": 6.917625427246094,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.8045,
      "step": 3
    },
    {
      "epoch": 0.008928571428571428,
      "grad_norm": 7.057511806488037,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.7771,
      "step": 4
    },
    {
      "epoch": 0.011160714285714286,
      "grad_norm": 6.829500198364258,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.9127,
      "step": 5
    },
    {
      "epoch": 0.013392857142857142,
      "grad_norm": 5.697404384613037,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.8142,
      "step": 6
    },
    {
      "epoch": 0.015625,
      "grad_norm": 5.567355155944824,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 0.8286,
      "step": 7
    },
    {
      "epoch": 0.017857142857142856,
      "grad_norm": 5.8241071701049805,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.7863,
      "step": 8
    },
    {
      "epoch": 0.020089285714285716,
      "grad_norm": 5.002991676330566,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.7995,
      "step": 9
    },
    {
      "epoch": 0.022321428571428572,
      "grad_norm": 4.243339538574219,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.7777,
      "step": 10
    },
    {
      "epoch": 0.024553571428571428,
      "grad_norm": 4.812699794769287,
      "learning_rate": 2.2e-06,
      "loss": 0.8632,
      "step": 11
    },
    {
      "epoch": 0.026785714285714284,
      "grad_norm": 3.2879092693328857,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.7006,
      "step": 12
    },
    {
      "epoch": 0.029017857142857144,
      "grad_norm": 3.256328821182251,
      "learning_rate": 2.6e-06,
      "loss": 0.8796,
      "step": 13
    },
    {
      "epoch": 0.03125,
      "grad_norm": 2.9795191287994385,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.7379,
      "step": 14
    },
    {
      "epoch": 0.033482142857142856,
      "grad_norm": 2.269883394241333,
      "learning_rate": 3e-06,
      "loss": 0.6732,
      "step": 15
    },
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 2.641052484512329,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.7464,
      "step": 16
    },
    {
      "epoch": 0.03794642857142857,
      "grad_norm": 2.5419418811798096,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 0.7677,
      "step": 17
    },
    {
      "epoch": 0.04017857142857143,
      "grad_norm": 1.9274882078170776,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.6629,
      "step": 18
    },
    {
      "epoch": 0.04241071428571429,
      "grad_norm": 1.6530262231826782,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.6453,
      "step": 19
    },
    {
      "epoch": 0.044642857142857144,
      "grad_norm": 1.4658329486846924,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.6656,
      "step": 20
    },
    {
      "epoch": 0.046875,
      "grad_norm": 1.5799874067306519,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 0.7575,
      "step": 21
    },
    {
      "epoch": 0.049107142857142856,
      "grad_norm": 1.6194658279418945,
      "learning_rate": 4.4e-06,
      "loss": 0.7072,
      "step": 22
    },
    {
      "epoch": 0.05133928571428571,
      "grad_norm": 1.4326906204223633,
      "learning_rate": 4.600000000000001e-06,
      "loss": 0.6723,
      "step": 23
    },
    {
      "epoch": 0.05357142857142857,
      "grad_norm": 1.7949881553649902,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.6745,
      "step": 24
    },
    {
      "epoch": 0.05580357142857143,
      "grad_norm": 1.594699501991272,
      "learning_rate": 5e-06,
      "loss": 0.6926,
      "step": 25
    },
    {
      "epoch": 0.05803571428571429,
      "grad_norm": 1.2770566940307617,
      "learning_rate": 5.2e-06,
      "loss": 0.6907,
      "step": 26
    },
    {
      "epoch": 0.060267857142857144,
      "grad_norm": 1.2886347770690918,
      "learning_rate": 5.400000000000001e-06,
      "loss": 0.6691,
      "step": 27
    },
    {
      "epoch": 0.0625,
      "grad_norm": 1.4923341274261475,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.7632,
      "step": 28
    },
    {
      "epoch": 0.06473214285714286,
      "grad_norm": 1.3870608806610107,
      "learning_rate": 5.8e-06,
      "loss": 0.6068,
      "step": 29
    },
    {
      "epoch": 0.06696428571428571,
      "grad_norm": 1.2519007921218872,
      "learning_rate": 6e-06,
      "loss": 0.608,
      "step": 30
    },
    {
      "epoch": 0.06919642857142858,
      "grad_norm": 1.2436811923980713,
      "learning_rate": 6.200000000000001e-06,
      "loss": 0.5889,
      "step": 31
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 1.2719563245773315,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.6538,
      "step": 32
    },
    {
      "epoch": 0.07366071428571429,
      "grad_norm": 1.388859510421753,
      "learning_rate": 6.600000000000001e-06,
      "loss": 0.6368,
      "step": 33
    },
    {
      "epoch": 0.07589285714285714,
      "grad_norm": 1.2473232746124268,
      "learning_rate": 6.800000000000001e-06,
      "loss": 0.6367,
      "step": 34
    },
    {
      "epoch": 0.078125,
      "grad_norm": 1.3017948865890503,
      "learning_rate": 7e-06,
      "loss": 0.6646,
      "step": 35
    },
    {
      "epoch": 0.08035714285714286,
      "grad_norm": 1.1188671588897705,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.6325,
      "step": 36
    },
    {
      "epoch": 0.08258928571428571,
      "grad_norm": 1.117879867553711,
      "learning_rate": 7.4e-06,
      "loss": 0.5879,
      "step": 37
    },
    {
      "epoch": 0.08482142857142858,
      "grad_norm": 1.1564981937408447,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.6176,
      "step": 38
    },
    {
      "epoch": 0.08705357142857142,
      "grad_norm": 1.613521933555603,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.7525,
      "step": 39
    },
    {
      "epoch": 0.08928571428571429,
      "grad_norm": 1.1391220092773438,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.5593,
      "step": 40
    },
    {
      "epoch": 0.09151785714285714,
      "grad_norm": 1.1999366283416748,
      "learning_rate": 8.2e-06,
      "loss": 0.589,
      "step": 41
    },
    {
      "epoch": 0.09375,
      "grad_norm": 1.2285315990447998,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.6326,
      "step": 42
    },
    {
      "epoch": 0.09598214285714286,
      "grad_norm": 1.0952249765396118,
      "learning_rate": 8.6e-06,
      "loss": 0.6319,
      "step": 43
    },
    {
      "epoch": 0.09821428571428571,
      "grad_norm": 1.3287895917892456,
      "learning_rate": 8.8e-06,
      "loss": 0.6771,
      "step": 44
    },
    {
      "epoch": 0.10044642857142858,
      "grad_norm": 1.251396656036377,
      "learning_rate": 9e-06,
      "loss": 0.615,
      "step": 45
    },
    {
      "epoch": 0.10267857142857142,
      "grad_norm": 1.173791766166687,
      "learning_rate": 9.200000000000002e-06,
      "loss": 0.6431,
      "step": 46
    },
    {
      "epoch": 0.10491071428571429,
      "grad_norm": 1.2991195917129517,
      "learning_rate": 9.4e-06,
      "loss": 0.6643,
      "step": 47
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 1.1051254272460938,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.6203,
      "step": 48
    },
    {
      "epoch": 0.109375,
      "grad_norm": 1.310900330543518,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.643,
      "step": 49
    },
    {
      "epoch": 0.11160714285714286,
      "grad_norm": 1.0683242082595825,
      "learning_rate": 1e-05,
      "loss": 0.5383,
      "step": 50
    },
    {
      "epoch": 0.11383928571428571,
      "grad_norm": 1.139321208000183,
      "learning_rate": 1.02e-05,
      "loss": 0.6041,
      "step": 51
    },
    {
      "epoch": 0.11607142857142858,
      "grad_norm": 1.095990777015686,
      "learning_rate": 1.04e-05,
      "loss": 0.5986,
      "step": 52
    },
    {
      "epoch": 0.11830357142857142,
      "grad_norm": 1.0637718439102173,
      "learning_rate": 1.0600000000000002e-05,
      "loss": 0.5558,
      "step": 53
    },
    {
      "epoch": 0.12053571428571429,
      "grad_norm": 1.1787432432174683,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 0.6232,
      "step": 54
    },
    {
      "epoch": 0.12276785714285714,
      "grad_norm": 4.439203262329102,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 0.6276,
      "step": 55
    },
    {
      "epoch": 0.125,
      "grad_norm": 1.1904429197311401,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.5637,
      "step": 56
    },
    {
      "epoch": 0.12723214285714285,
      "grad_norm": 1.2871860265731812,
      "learning_rate": 1.14e-05,
      "loss": 0.6973,
      "step": 57
    },
    {
      "epoch": 0.12946428571428573,
      "grad_norm": 1.317662000656128,
      "learning_rate": 1.16e-05,
      "loss": 0.6334,
      "step": 58
    },
    {
      "epoch": 0.13169642857142858,
      "grad_norm": 1.267655611038208,
      "learning_rate": 1.18e-05,
      "loss": 0.637,
      "step": 59
    },
    {
      "epoch": 0.13392857142857142,
      "grad_norm": 1.1938740015029907,
      "learning_rate": 1.2e-05,
      "loss": 0.6162,
      "step": 60
    },
    {
      "epoch": 0.13616071428571427,
      "grad_norm": 1.1768426895141602,
      "learning_rate": 1.22e-05,
      "loss": 0.5861,
      "step": 61
    },
    {
      "epoch": 0.13839285714285715,
      "grad_norm": 1.2728022336959839,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 0.5684,
      "step": 62
    },
    {
      "epoch": 0.140625,
      "grad_norm": 1.2520177364349365,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 0.6199,
      "step": 63
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 1.2249557971954346,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 0.5968,
      "step": 64
    },
    {
      "epoch": 0.14508928571428573,
      "grad_norm": 1.094007134437561,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.5696,
      "step": 65
    },
    {
      "epoch": 0.14732142857142858,
      "grad_norm": 1.1172953844070435,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 0.6564,
      "step": 66
    },
    {
      "epoch": 0.14955357142857142,
      "grad_norm": 1.0176945924758911,
      "learning_rate": 1.3400000000000002e-05,
      "loss": 0.5915,
      "step": 67
    },
    {
      "epoch": 0.15178571428571427,
      "grad_norm": 1.1159842014312744,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.587,
      "step": 68
    },
    {
      "epoch": 0.15401785714285715,
      "grad_norm": 1.3211426734924316,
      "learning_rate": 1.38e-05,
      "loss": 0.6512,
      "step": 69
    },
    {
      "epoch": 0.15625,
      "grad_norm": 1.2271831035614014,
      "learning_rate": 1.4e-05,
      "loss": 0.6185,
      "step": 70
    },
    {
      "epoch": 0.15848214285714285,
      "grad_norm": 1.0738003253936768,
      "learning_rate": 1.4200000000000001e-05,
      "loss": 0.5992,
      "step": 71
    },
    {
      "epoch": 0.16071428571428573,
      "grad_norm": 1.1245979070663452,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.5878,
      "step": 72
    },
    {
      "epoch": 0.16294642857142858,
      "grad_norm": 1.3597056865692139,
      "learning_rate": 1.46e-05,
      "loss": 0.6344,
      "step": 73
    },
    {
      "epoch": 0.16517857142857142,
      "grad_norm": 1.2197428941726685,
      "learning_rate": 1.48e-05,
      "loss": 0.5806,
      "step": 74
    },
    {
      "epoch": 0.16741071428571427,
      "grad_norm": 1.1941276788711548,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.6668,
      "step": 75
    },
    {
      "epoch": 0.16964285714285715,
      "grad_norm": 1.2710192203521729,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 0.6386,
      "step": 76
    },
    {
      "epoch": 0.171875,
      "grad_norm": 1.282441258430481,
      "learning_rate": 1.54e-05,
      "loss": 0.6265,
      "step": 77
    },
    {
      "epoch": 0.17410714285714285,
      "grad_norm": 1.465880274772644,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.6674,
      "step": 78
    },
    {
      "epoch": 0.17633928571428573,
      "grad_norm": 1.1180906295776367,
      "learning_rate": 1.58e-05,
      "loss": 0.583,
      "step": 79
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 1.1118671894073486,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.6287,
      "step": 80
    },
    {
      "epoch": 0.18080357142857142,
      "grad_norm": 1.1944739818572998,
      "learning_rate": 1.62e-05,
      "loss": 0.6626,
      "step": 81
    },
    {
      "epoch": 0.18303571428571427,
      "grad_norm": 1.2908122539520264,
      "learning_rate": 1.64e-05,
      "loss": 0.6691,
      "step": 82
    },
    {
      "epoch": 0.18526785714285715,
      "grad_norm": 1.3516288995742798,
      "learning_rate": 1.66e-05,
      "loss": 0.6543,
      "step": 83
    },
    {
      "epoch": 0.1875,
      "grad_norm": 1.1028647422790527,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.5868,
      "step": 84
    },
    {
      "epoch": 0.18973214285714285,
      "grad_norm": 1.2997535467147827,
      "learning_rate": 1.7e-05,
      "loss": 0.6145,
      "step": 85
    },
    {
      "epoch": 0.19196428571428573,
      "grad_norm": 1.0468411445617676,
      "learning_rate": 1.72e-05,
      "loss": 0.5493,
      "step": 86
    },
    {
      "epoch": 0.19419642857142858,
      "grad_norm": 1.3448480367660522,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 0.6751,
      "step": 87
    },
    {
      "epoch": 0.19642857142857142,
      "grad_norm": 1.119872808456421,
      "learning_rate": 1.76e-05,
      "loss": 0.5593,
      "step": 88
    },
    {
      "epoch": 0.19866071428571427,
      "grad_norm": 1.3660776615142822,
      "learning_rate": 1.7800000000000002e-05,
      "loss": 0.6085,
      "step": 89
    },
    {
      "epoch": 0.20089285714285715,
      "grad_norm": 1.189186930656433,
      "learning_rate": 1.8e-05,
      "loss": 0.6327,
      "step": 90
    },
    {
      "epoch": 0.203125,
      "grad_norm": 1.5373879671096802,
      "learning_rate": 1.8200000000000002e-05,
      "loss": 0.6988,
      "step": 91
    },
    {
      "epoch": 0.20535714285714285,
      "grad_norm": 1.3453340530395508,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.5824,
      "step": 92
    },
    {
      "epoch": 0.20758928571428573,
      "grad_norm": 1.2179492712020874,
      "learning_rate": 1.86e-05,
      "loss": 0.6408,
      "step": 93
    },
    {
      "epoch": 0.20982142857142858,
      "grad_norm": 1.1074484586715698,
      "learning_rate": 1.88e-05,
      "loss": 0.5831,
      "step": 94
    },
    {
      "epoch": 0.21205357142857142,
      "grad_norm": 1.4239832162857056,
      "learning_rate": 1.9e-05,
      "loss": 0.6534,
      "step": 95
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 1.1983468532562256,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.602,
      "step": 96
    },
    {
      "epoch": 0.21651785714285715,
      "grad_norm": 1.116683006286621,
      "learning_rate": 1.94e-05,
      "loss": 0.5104,
      "step": 97
    },
    {
      "epoch": 0.21875,
      "grad_norm": 1.200826644897461,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.5871,
      "step": 98
    },
    {
      "epoch": 0.22098214285714285,
      "grad_norm": 1.1437289714813232,
      "learning_rate": 1.98e-05,
      "loss": 0.586,
      "step": 99
    },
    {
      "epoch": 0.22321428571428573,
      "grad_norm": 1.2034040689468384,
      "learning_rate": 2e-05,
      "loss": 0.5807,
      "step": 100
    },
    {
      "epoch": 0.22544642857142858,
      "grad_norm": 1.2772138118743896,
      "learning_rate": 1.9999968111891562e-05,
      "loss": 0.6783,
      "step": 101
    },
    {
      "epoch": 0.22767857142857142,
      "grad_norm": 1.0830761194229126,
      "learning_rate": 1.9999872447769624e-05,
      "loss": 0.6097,
      "step": 102
    },
    {
      "epoch": 0.22991071428571427,
      "grad_norm": 1.179870843887329,
      "learning_rate": 1.9999713008244287e-05,
      "loss": 0.6053,
      "step": 103
    },
    {
      "epoch": 0.23214285714285715,
      "grad_norm": 1.3241559267044067,
      "learning_rate": 1.9999489794332404e-05,
      "loss": 0.7262,
      "step": 104
    },
    {
      "epoch": 0.234375,
      "grad_norm": 1.2744848728179932,
      "learning_rate": 1.9999202807457537e-05,
      "loss": 0.6816,
      "step": 105
    },
    {
      "epoch": 0.23660714285714285,
      "grad_norm": 1.124351978302002,
      "learning_rate": 1.9998852049449998e-05,
      "loss": 0.7003,
      "step": 106
    },
    {
      "epoch": 0.23883928571428573,
      "grad_norm": 1.009700894355774,
      "learning_rate": 1.999843752254677e-05,
      "loss": 0.603,
      "step": 107
    },
    {
      "epoch": 0.24107142857142858,
      "grad_norm": 1.1453275680541992,
      "learning_rate": 1.9997959229391567e-05,
      "loss": 0.5768,
      "step": 108
    },
    {
      "epoch": 0.24330357142857142,
      "grad_norm": 1.0525116920471191,
      "learning_rate": 1.9997417173034746e-05,
      "loss": 0.5811,
      "step": 109
    },
    {
      "epoch": 0.24553571428571427,
      "grad_norm": 1.1967023611068726,
      "learning_rate": 1.9996811356933346e-05,
      "loss": 0.5993,
      "step": 110
    },
    {
      "epoch": 0.24776785714285715,
      "grad_norm": 1.190104365348816,
      "learning_rate": 1.999614178495103e-05,
      "loss": 0.6231,
      "step": 111
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.2647576332092285,
      "learning_rate": 1.9995408461358074e-05,
      "loss": 0.6269,
      "step": 112
    },
    {
      "epoch": 0.25,
      "eval_loss": 0.5685862898826599,
      "eval_runtime": 31.2257,
      "eval_samples_per_second": 2.338,
      "eval_steps_per_second": 0.32,
      "step": 112
    },
    {
      "epoch": 0.25223214285714285,
      "grad_norm": 1.121337652206421,
      "learning_rate": 1.9994611390831342e-05,
      "loss": 0.5947,
      "step": 113
    },
    {
      "epoch": 0.2544642857142857,
      "grad_norm": 1.3274226188659668,
      "learning_rate": 1.9993750578454248e-05,
      "loss": 0.7194,
      "step": 114
    },
    {
      "epoch": 0.25669642857142855,
      "grad_norm": 1.2155532836914062,
      "learning_rate": 1.9992826029716722e-05,
      "loss": 0.6605,
      "step": 115
    },
    {
      "epoch": 0.25892857142857145,
      "grad_norm": 1.4203304052352905,
      "learning_rate": 1.999183775051519e-05,
      "loss": 0.5979,
      "step": 116
    },
    {
      "epoch": 0.2611607142857143,
      "grad_norm": 1.1040465831756592,
      "learning_rate": 1.9990785747152527e-05,
      "loss": 0.5968,
      "step": 117
    },
    {
      "epoch": 0.26339285714285715,
      "grad_norm": 1.3391578197479248,
      "learning_rate": 1.9989670026338002e-05,
      "loss": 0.6921,
      "step": 118
    },
    {
      "epoch": 0.265625,
      "grad_norm": 1.3182568550109863,
      "learning_rate": 1.9988490595187273e-05,
      "loss": 0.6563,
      "step": 119
    },
    {
      "epoch": 0.26785714285714285,
      "grad_norm": 1.2267478704452515,
      "learning_rate": 1.9987247461222297e-05,
      "loss": 0.5942,
      "step": 120
    },
    {
      "epoch": 0.2700892857142857,
      "grad_norm": 1.4536434412002563,
      "learning_rate": 1.9985940632371316e-05,
      "loss": 0.6894,
      "step": 121
    },
    {
      "epoch": 0.27232142857142855,
      "grad_norm": 1.394129991531372,
      "learning_rate": 1.9984570116968785e-05,
      "loss": 0.7047,
      "step": 122
    },
    {
      "epoch": 0.27455357142857145,
      "grad_norm": 1.4202784299850464,
      "learning_rate": 1.9983135923755336e-05,
      "loss": 0.7424,
      "step": 123
    },
    {
      "epoch": 0.2767857142857143,
      "grad_norm": 1.3307374715805054,
      "learning_rate": 1.9981638061877714e-05,
      "loss": 0.6857,
      "step": 124
    },
    {
      "epoch": 0.27901785714285715,
      "grad_norm": 1.0905797481536865,
      "learning_rate": 1.998007654088871e-05,
      "loss": 0.5316,
      "step": 125
    },
    {
      "epoch": 0.28125,
      "grad_norm": 1.1642409563064575,
      "learning_rate": 1.9978451370747122e-05,
      "loss": 0.6388,
      "step": 126
    },
    {
      "epoch": 0.28348214285714285,
      "grad_norm": 1.2434855699539185,
      "learning_rate": 1.9976762561817656e-05,
      "loss": 0.5913,
      "step": 127
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.5033081769943237,
      "learning_rate": 1.997501012487091e-05,
      "loss": 0.6407,
      "step": 128
    },
    {
      "epoch": 0.28794642857142855,
      "grad_norm": 1.1626683473587036,
      "learning_rate": 1.997319407108326e-05,
      "loss": 0.6786,
      "step": 129
    },
    {
      "epoch": 0.29017857142857145,
      "grad_norm": 1.342909812927246,
      "learning_rate": 1.9971314412036807e-05,
      "loss": 0.624,
      "step": 130
    },
    {
      "epoch": 0.2924107142857143,
      "grad_norm": 1.140647530555725,
      "learning_rate": 1.9969371159719307e-05,
      "loss": 0.6178,
      "step": 131
    },
    {
      "epoch": 0.29464285714285715,
      "grad_norm": 1.40773606300354,
      "learning_rate": 1.996736432652409e-05,
      "loss": 0.5911,
      "step": 132
    },
    {
      "epoch": 0.296875,
      "grad_norm": 1.2301044464111328,
      "learning_rate": 1.9965293925249976e-05,
      "loss": 0.5775,
      "step": 133
    },
    {
      "epoch": 0.29910714285714285,
      "grad_norm": 1.3424404859542847,
      "learning_rate": 1.9963159969101207e-05,
      "loss": 0.6405,
      "step": 134
    },
    {
      "epoch": 0.3013392857142857,
      "grad_norm": 1.0376389026641846,
      "learning_rate": 1.996096247168734e-05,
      "loss": 0.5248,
      "step": 135
    },
    {
      "epoch": 0.30357142857142855,
      "grad_norm": 1.3082736730575562,
      "learning_rate": 1.9958701447023188e-05,
      "loss": 0.6588,
      "step": 136
    },
    {
      "epoch": 0.30580357142857145,
      "grad_norm": 1.3725180625915527,
      "learning_rate": 1.9956376909528704e-05,
      "loss": 0.6416,
      "step": 137
    },
    {
      "epoch": 0.3080357142857143,
      "grad_norm": 1.3519439697265625,
      "learning_rate": 1.9953988874028917e-05,
      "loss": 0.6421,
      "step": 138
    },
    {
      "epoch": 0.31026785714285715,
      "grad_norm": 1.259487509727478,
      "learning_rate": 1.995153735575381e-05,
      "loss": 0.6523,
      "step": 139
    },
    {
      "epoch": 0.3125,
      "grad_norm": 1.498024821281433,
      "learning_rate": 1.994902237033824e-05,
      "loss": 0.6319,
      "step": 140
    },
    {
      "epoch": 0.31473214285714285,
      "grad_norm": 1.369606375694275,
      "learning_rate": 1.994644393382183e-05,
      "loss": 0.652,
      "step": 141
    },
    {
      "epoch": 0.3169642857142857,
      "grad_norm": 1.268442153930664,
      "learning_rate": 1.9943802062648877e-05,
      "loss": 0.6082,
      "step": 142
    },
    {
      "epoch": 0.31919642857142855,
      "grad_norm": 1.1077290773391724,
      "learning_rate": 1.9941096773668232e-05,
      "loss": 0.577,
      "step": 143
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 1.3842663764953613,
      "learning_rate": 1.9938328084133206e-05,
      "loss": 0.6668,
      "step": 144
    },
    {
      "epoch": 0.3236607142857143,
      "grad_norm": 1.1705602407455444,
      "learning_rate": 1.9935496011701453e-05,
      "loss": 0.5888,
      "step": 145
    },
    {
      "epoch": 0.32589285714285715,
      "grad_norm": 1.029478907585144,
      "learning_rate": 1.9932600574434864e-05,
      "loss": 0.5286,
      "step": 146
    },
    {
      "epoch": 0.328125,
      "grad_norm": 1.2074710130691528,
      "learning_rate": 1.9929641790799438e-05,
      "loss": 0.6419,
      "step": 147
    },
    {
      "epoch": 0.33035714285714285,
      "grad_norm": 1.2905062437057495,
      "learning_rate": 1.9926619679665175e-05,
      "loss": 0.6704,
      "step": 148
    },
    {
      "epoch": 0.3325892857142857,
      "grad_norm": 1.1862680912017822,
      "learning_rate": 1.992353426030596e-05,
      "loss": 0.591,
      "step": 149
    },
    {
      "epoch": 0.33482142857142855,
      "grad_norm": 1.023520588874817,
      "learning_rate": 1.9920385552399434e-05,
      "loss": 0.551,
      "step": 150
    },
    {
      "epoch": 0.33705357142857145,
      "grad_norm": 1.067068338394165,
      "learning_rate": 1.991717357602686e-05,
      "loss": 0.5453,
      "step": 151
    },
    {
      "epoch": 0.3392857142857143,
      "grad_norm": 1.1788963079452515,
      "learning_rate": 1.9913898351673006e-05,
      "loss": 0.561,
      "step": 152
    },
    {
      "epoch": 0.34151785714285715,
      "grad_norm": 1.081198811531067,
      "learning_rate": 1.991055990022602e-05,
      "loss": 0.6095,
      "step": 153
    },
    {
      "epoch": 0.34375,
      "grad_norm": 1.234728455543518,
      "learning_rate": 1.990715824297728e-05,
      "loss": 0.5744,
      "step": 154
    },
    {
      "epoch": 0.34598214285714285,
      "grad_norm": 1.2400490045547485,
      "learning_rate": 1.990369340162127e-05,
      "loss": 0.6289,
      "step": 155
    },
    {
      "epoch": 0.3482142857142857,
      "grad_norm": 1.2360931634902954,
      "learning_rate": 1.9900165398255434e-05,
      "loss": 0.6163,
      "step": 156
    },
    {
      "epoch": 0.35044642857142855,
      "grad_norm": 0.9136845469474792,
      "learning_rate": 1.9896574255380045e-05,
      "loss": 0.545,
      "step": 157
    },
    {
      "epoch": 0.35267857142857145,
      "grad_norm": 1.1531745195388794,
      "learning_rate": 1.9892919995898052e-05,
      "loss": 0.5915,
      "step": 158
    },
    {
      "epoch": 0.3549107142857143,
      "grad_norm": 1.074296474456787,
      "learning_rate": 1.988920264311494e-05,
      "loss": 0.5395,
      "step": 159
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 1.1261378526687622,
      "learning_rate": 1.9885422220738583e-05,
      "loss": 0.548,
      "step": 160
    },
    {
      "epoch": 0.359375,
      "grad_norm": 1.2132823467254639,
      "learning_rate": 1.988157875287908e-05,
      "loss": 0.6395,
      "step": 161
    },
    {
      "epoch": 0.36160714285714285,
      "grad_norm": 1.0568815469741821,
      "learning_rate": 1.9877672264048618e-05,
      "loss": 0.5183,
      "step": 162
    },
    {
      "epoch": 0.3638392857142857,
      "grad_norm": 1.2066727876663208,
      "learning_rate": 1.98737027791613e-05,
      "loss": 0.5492,
      "step": 163
    },
    {
      "epoch": 0.36607142857142855,
      "grad_norm": 1.156691551208496,
      "learning_rate": 1.9869670323533005e-05,
      "loss": 0.6742,
      "step": 164
    },
    {
      "epoch": 0.36830357142857145,
      "grad_norm": 1.2685508728027344,
      "learning_rate": 1.9865574922881204e-05,
      "loss": 0.5676,
      "step": 165
    },
    {
      "epoch": 0.3705357142857143,
      "grad_norm": 1.1878615617752075,
      "learning_rate": 1.986141660332482e-05,
      "loss": 0.6537,
      "step": 166
    },
    {
      "epoch": 0.37276785714285715,
      "grad_norm": 1.0642375946044922,
      "learning_rate": 1.9857195391384038e-05,
      "loss": 0.6212,
      "step": 167
    },
    {
      "epoch": 0.375,
      "grad_norm": 1.1920270919799805,
      "learning_rate": 1.9852911313980146e-05,
      "loss": 0.5452,
      "step": 168
    },
    {
      "epoch": 0.37723214285714285,
      "grad_norm": 0.920224130153656,
      "learning_rate": 1.9848564398435374e-05,
      "loss": 0.5534,
      "step": 169
    },
    {
      "epoch": 0.3794642857142857,
      "grad_norm": 1.190719723701477,
      "learning_rate": 1.9844154672472707e-05,
      "loss": 0.5595,
      "step": 170
    },
    {
      "epoch": 0.38169642857142855,
      "grad_norm": 1.0356184244155884,
      "learning_rate": 1.9839682164215707e-05,
      "loss": 0.6007,
      "step": 171
    },
    {
      "epoch": 0.38392857142857145,
      "grad_norm": 1.1822032928466797,
      "learning_rate": 1.9835146902188336e-05,
      "loss": 0.6195,
      "step": 172
    },
    {
      "epoch": 0.3861607142857143,
      "grad_norm": 1.0653460025787354,
      "learning_rate": 1.983054891531478e-05,
      "loss": 0.6015,
      "step": 173
    },
    {
      "epoch": 0.38839285714285715,
      "grad_norm": 1.2356278896331787,
      "learning_rate": 1.9825888232919268e-05,
      "loss": 0.6477,
      "step": 174
    },
    {
      "epoch": 0.390625,
      "grad_norm": 1.0866338014602661,
      "learning_rate": 1.982116488472586e-05,
      "loss": 0.5896,
      "step": 175
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 1.1473841667175293,
      "learning_rate": 1.9816378900858288e-05,
      "loss": 0.5805,
      "step": 176
    },
    {
      "epoch": 0.3950892857142857,
      "grad_norm": 1.1367318630218506,
      "learning_rate": 1.9811530311839747e-05,
      "loss": 0.6801,
      "step": 177
    },
    {
      "epoch": 0.39732142857142855,
      "grad_norm": 1.2681716680526733,
      "learning_rate": 1.98066191485927e-05,
      "loss": 0.7205,
      "step": 178
    },
    {
      "epoch": 0.39955357142857145,
      "grad_norm": 1.10531747341156,
      "learning_rate": 1.980164544243869e-05,
      "loss": 0.5849,
      "step": 179
    },
    {
      "epoch": 0.4017857142857143,
      "grad_norm": 1.2467155456542969,
      "learning_rate": 1.9796609225098136e-05,
      "loss": 0.6424,
      "step": 180
    },
    {
      "epoch": 0.40401785714285715,
      "grad_norm": 0.9677994847297668,
      "learning_rate": 1.9791510528690125e-05,
      "loss": 0.5607,
      "step": 181
    },
    {
      "epoch": 0.40625,
      "grad_norm": 1.1776186227798462,
      "learning_rate": 1.9786349385732212e-05,
      "loss": 0.574,
      "step": 182
    },
    {
      "epoch": 0.40848214285714285,
      "grad_norm": 1.1970598697662354,
      "learning_rate": 1.9781125829140214e-05,
      "loss": 0.5488,
      "step": 183
    },
    {
      "epoch": 0.4107142857142857,
      "grad_norm": 1.0842020511627197,
      "learning_rate": 1.9775839892228004e-05,
      "loss": 0.5859,
      "step": 184
    },
    {
      "epoch": 0.41294642857142855,
      "grad_norm": 1.1312363147735596,
      "learning_rate": 1.977049160870728e-05,
      "loss": 0.5971,
      "step": 185
    },
    {
      "epoch": 0.41517857142857145,
      "grad_norm": 1.147937297821045,
      "learning_rate": 1.976508101268738e-05,
      "loss": 0.6647,
      "step": 186
    },
    {
      "epoch": 0.4174107142857143,
      "grad_norm": 1.27888822555542,
      "learning_rate": 1.975960813867503e-05,
      "loss": 0.6283,
      "step": 187
    },
    {
      "epoch": 0.41964285714285715,
      "grad_norm": 1.1987580060958862,
      "learning_rate": 1.9754073021574153e-05,
      "loss": 0.5747,
      "step": 188
    },
    {
      "epoch": 0.421875,
      "grad_norm": 1.211571455001831,
      "learning_rate": 1.9748475696685637e-05,
      "loss": 0.6622,
      "step": 189
    },
    {
      "epoch": 0.42410714285714285,
      "grad_norm": 1.1061530113220215,
      "learning_rate": 1.9742816199707096e-05,
      "loss": 0.5731,
      "step": 190
    },
    {
      "epoch": 0.4263392857142857,
      "grad_norm": 1.3477433919906616,
      "learning_rate": 1.9737094566732663e-05,
      "loss": 0.669,
      "step": 191
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 1.1551015377044678,
      "learning_rate": 1.9731310834252747e-05,
      "loss": 0.5935,
      "step": 192
    },
    {
      "epoch": 0.43080357142857145,
      "grad_norm": 1.1217777729034424,
      "learning_rate": 1.972546503915381e-05,
      "loss": 0.5141,
      "step": 193
    },
    {
      "epoch": 0.4330357142857143,
      "grad_norm": 1.1012550592422485,
      "learning_rate": 1.9719557218718116e-05,
      "loss": 0.4681,
      "step": 194
    },
    {
      "epoch": 0.43526785714285715,
      "grad_norm": 1.241761565208435,
      "learning_rate": 1.9713587410623516e-05,
      "loss": 0.5332,
      "step": 195
    },
    {
      "epoch": 0.4375,
      "grad_norm": 1.2476496696472168,
      "learning_rate": 1.970755565294318e-05,
      "loss": 0.6408,
      "step": 196
    },
    {
      "epoch": 0.43973214285714285,
      "grad_norm": 1.0551302433013916,
      "learning_rate": 1.970146198414538e-05,
      "loss": 0.5983,
      "step": 197
    },
    {
      "epoch": 0.4419642857142857,
      "grad_norm": 1.0154962539672852,
      "learning_rate": 1.969530644309323e-05,
      "loss": 0.5094,
      "step": 198
    },
    {
      "epoch": 0.44419642857142855,
      "grad_norm": 1.04148530960083,
      "learning_rate": 1.968908906904444e-05,
      "loss": 0.5191,
      "step": 199
    },
    {
      "epoch": 0.44642857142857145,
      "grad_norm": 1.1981557607650757,
      "learning_rate": 1.9682809901651074e-05,
      "loss": 0.6159,
      "step": 200
    },
    {
      "epoch": 0.4486607142857143,
      "grad_norm": 1.05272376537323,
      "learning_rate": 1.9676468980959284e-05,
      "loss": 0.5368,
      "step": 201
    },
    {
      "epoch": 0.45089285714285715,
      "grad_norm": 1.3401468992233276,
      "learning_rate": 1.9670066347409063e-05,
      "loss": 0.659,
      "step": 202
    },
    {
      "epoch": 0.453125,
      "grad_norm": 1.1549017429351807,
      "learning_rate": 1.9663602041833983e-05,
      "loss": 0.6243,
      "step": 203
    },
    {
      "epoch": 0.45535714285714285,
      "grad_norm": 1.3726199865341187,
      "learning_rate": 1.9657076105460945e-05,
      "loss": 0.6091,
      "step": 204
    },
    {
      "epoch": 0.4575892857142857,
      "grad_norm": 1.1885762214660645,
      "learning_rate": 1.9650488579909898e-05,
      "loss": 0.6273,
      "step": 205
    },
    {
      "epoch": 0.45982142857142855,
      "grad_norm": 1.225096344947815,
      "learning_rate": 1.964383950719359e-05,
      "loss": 0.6524,
      "step": 206
    },
    {
      "epoch": 0.46205357142857145,
      "grad_norm": 0.9724453687667847,
      "learning_rate": 1.9637128929717294e-05,
      "loss": 0.5768,
      "step": 207
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 1.1051849126815796,
      "learning_rate": 1.9630356890278527e-05,
      "loss": 0.571,
      "step": 208
    },
    {
      "epoch": 0.46651785714285715,
      "grad_norm": 1.3608918190002441,
      "learning_rate": 1.96235234320668e-05,
      "loss": 0.5879,
      "step": 209
    },
    {
      "epoch": 0.46875,
      "grad_norm": 1.2607719898223877,
      "learning_rate": 1.9616628598663322e-05,
      "loss": 0.6728,
      "step": 210
    },
    {
      "epoch": 0.47098214285714285,
      "grad_norm": 1.1398773193359375,
      "learning_rate": 1.9609672434040736e-05,
      "loss": 0.5693,
      "step": 211
    },
    {
      "epoch": 0.4732142857142857,
      "grad_norm": 1.0851867198944092,
      "learning_rate": 1.9602654982562822e-05,
      "loss": 0.6436,
      "step": 212
    },
    {
      "epoch": 0.47544642857142855,
      "grad_norm": 1.1385678052902222,
      "learning_rate": 1.9595576288984233e-05,
      "loss": 0.5584,
      "step": 213
    },
    {
      "epoch": 0.47767857142857145,
      "grad_norm": 1.1551555395126343,
      "learning_rate": 1.9588436398450206e-05,
      "loss": 0.6299,
      "step": 214
    },
    {
      "epoch": 0.4799107142857143,
      "grad_norm": 0.976137101650238,
      "learning_rate": 1.958123535649625e-05,
      "loss": 0.5715,
      "step": 215
    },
    {
      "epoch": 0.48214285714285715,
      "grad_norm": 1.0276240110397339,
      "learning_rate": 1.9573973209047893e-05,
      "loss": 0.5675,
      "step": 216
    },
    {
      "epoch": 0.484375,
      "grad_norm": 1.194334626197815,
      "learning_rate": 1.9566650002420363e-05,
      "loss": 0.635,
      "step": 217
    },
    {
      "epoch": 0.48660714285714285,
      "grad_norm": 1.171510100364685,
      "learning_rate": 1.9559265783318304e-05,
      "loss": 0.5989,
      "step": 218
    },
    {
      "epoch": 0.4888392857142857,
      "grad_norm": 1.1479747295379639,
      "learning_rate": 1.9551820598835464e-05,
      "loss": 0.584,
      "step": 219
    },
    {
      "epoch": 0.49107142857142855,
      "grad_norm": 1.0744856595993042,
      "learning_rate": 1.9544314496454423e-05,
      "loss": 0.6199,
      "step": 220
    },
    {
      "epoch": 0.49330357142857145,
      "grad_norm": 1.1071079969406128,
      "learning_rate": 1.9536747524046254e-05,
      "loss": 0.6514,
      "step": 221
    },
    {
      "epoch": 0.4955357142857143,
      "grad_norm": 1.1865029335021973,
      "learning_rate": 1.9529119729870253e-05,
      "loss": 0.5937,
      "step": 222
    },
    {
      "epoch": 0.49776785714285715,
      "grad_norm": 1.1087124347686768,
      "learning_rate": 1.9521431162573596e-05,
      "loss": 0.6303,
      "step": 223
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.0572322607040405,
      "learning_rate": 1.9513681871191063e-05,
      "loss": 0.5568,
      "step": 224
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.5669442415237427,
      "eval_runtime": 31.5503,
      "eval_samples_per_second": 2.314,
      "eval_steps_per_second": 0.317,
      "step": 224
    },
    {
      "epoch": 0.5022321428571429,
      "grad_norm": 1.0085796117782593,
      "learning_rate": 1.95058719051447e-05,
      "loss": 0.5449,
      "step": 225
    },
    {
      "epoch": 0.5044642857142857,
      "grad_norm": 1.0866364240646362,
      "learning_rate": 1.949800131424352e-05,
      "loss": 0.5364,
      "step": 226
    },
    {
      "epoch": 0.5066964285714286,
      "grad_norm": 1.1213027238845825,
      "learning_rate": 1.9490070148683166e-05,
      "loss": 0.6107,
      "step": 227
    },
    {
      "epoch": 0.5089285714285714,
      "grad_norm": 1.0242129564285278,
      "learning_rate": 1.9482078459045617e-05,
      "loss": 0.5556,
      "step": 228
    },
    {
      "epoch": 0.5111607142857143,
      "grad_norm": 1.2038688659667969,
      "learning_rate": 1.947402629629885e-05,
      "loss": 0.5854,
      "step": 229
    },
    {
      "epoch": 0.5133928571428571,
      "grad_norm": 1.258674144744873,
      "learning_rate": 1.9465913711796502e-05,
      "loss": 0.6186,
      "step": 230
    },
    {
      "epoch": 0.515625,
      "grad_norm": 1.2189650535583496,
      "learning_rate": 1.9457740757277577e-05,
      "loss": 0.5852,
      "step": 231
    },
    {
      "epoch": 0.5178571428571429,
      "grad_norm": 1.239396572113037,
      "learning_rate": 1.9449507484866084e-05,
      "loss": 0.6632,
      "step": 232
    },
    {
      "epoch": 0.5200892857142857,
      "grad_norm": 1.089785099029541,
      "learning_rate": 1.944121394707072e-05,
      "loss": 0.6285,
      "step": 233
    },
    {
      "epoch": 0.5223214285714286,
      "grad_norm": 1.3066377639770508,
      "learning_rate": 1.9432860196784533e-05,
      "loss": 0.7184,
      "step": 234
    },
    {
      "epoch": 0.5245535714285714,
      "grad_norm": 1.1240565776824951,
      "learning_rate": 1.9424446287284576e-05,
      "loss": 0.5561,
      "step": 235
    },
    {
      "epoch": 0.5267857142857143,
      "grad_norm": 1.0432591438293457,
      "learning_rate": 1.941597227223159e-05,
      "loss": 0.5715,
      "step": 236
    },
    {
      "epoch": 0.5290178571428571,
      "grad_norm": 1.0457866191864014,
      "learning_rate": 1.940743820566963e-05,
      "loss": 0.5682,
      "step": 237
    },
    {
      "epoch": 0.53125,
      "grad_norm": 1.072249174118042,
      "learning_rate": 1.9398844142025746e-05,
      "loss": 0.5581,
      "step": 238
    },
    {
      "epoch": 0.5334821428571429,
      "grad_norm": 1.380035161972046,
      "learning_rate": 1.9390190136109625e-05,
      "loss": 0.6387,
      "step": 239
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 1.1203244924545288,
      "learning_rate": 1.9381476243113243e-05,
      "loss": 0.6205,
      "step": 240
    },
    {
      "epoch": 0.5379464285714286,
      "grad_norm": 1.0932904481887817,
      "learning_rate": 1.9372702518610512e-05,
      "loss": 0.6444,
      "step": 241
    },
    {
      "epoch": 0.5401785714285714,
      "grad_norm": 1.3310596942901611,
      "learning_rate": 1.9363869018556928e-05,
      "loss": 0.6773,
      "step": 242
    },
    {
      "epoch": 0.5424107142857143,
      "grad_norm": 1.2222163677215576,
      "learning_rate": 1.9354975799289215e-05,
      "loss": 0.6284,
      "step": 243
    },
    {
      "epoch": 0.5446428571428571,
      "grad_norm": 1.2926287651062012,
      "learning_rate": 1.9346022917524958e-05,
      "loss": 0.6252,
      "step": 244
    },
    {
      "epoch": 0.546875,
      "grad_norm": 1.3600492477416992,
      "learning_rate": 1.933701043036225e-05,
      "loss": 0.6198,
      "step": 245
    },
    {
      "epoch": 0.5491071428571429,
      "grad_norm": 1.1185046434402466,
      "learning_rate": 1.9327938395279325e-05,
      "loss": 0.6239,
      "step": 246
    },
    {
      "epoch": 0.5513392857142857,
      "grad_norm": 1.1856756210327148,
      "learning_rate": 1.9318806870134194e-05,
      "loss": 0.5969,
      "step": 247
    },
    {
      "epoch": 0.5535714285714286,
      "grad_norm": 1.0996602773666382,
      "learning_rate": 1.9309615913164262e-05,
      "loss": 0.6103,
      "step": 248
    },
    {
      "epoch": 0.5558035714285714,
      "grad_norm": 1.1660605669021606,
      "learning_rate": 1.9300365582985984e-05,
      "loss": 0.6003,
      "step": 249
    },
    {
      "epoch": 0.5580357142857143,
      "grad_norm": 1.158035397529602,
      "learning_rate": 1.9291055938594464e-05,
      "loss": 0.5799,
      "step": 250
    },
    {
      "epoch": 0.5602678571428571,
      "grad_norm": 1.1602364778518677,
      "learning_rate": 1.9281687039363088e-05,
      "loss": 0.6373,
      "step": 251
    },
    {
      "epoch": 0.5625,
      "grad_norm": 1.181039571762085,
      "learning_rate": 1.9272258945043154e-05,
      "loss": 0.5917,
      "step": 252
    },
    {
      "epoch": 0.5647321428571429,
      "grad_norm": 1.0945910215377808,
      "learning_rate": 1.9262771715763483e-05,
      "loss": 0.644,
      "step": 253
    },
    {
      "epoch": 0.5669642857142857,
      "grad_norm": 1.1994935274124146,
      "learning_rate": 1.9253225412030028e-05,
      "loss": 0.6678,
      "step": 254
    },
    {
      "epoch": 0.5691964285714286,
      "grad_norm": 1.2264413833618164,
      "learning_rate": 1.924362009472551e-05,
      "loss": 0.599,
      "step": 255
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 1.1776793003082275,
      "learning_rate": 1.9233955825109e-05,
      "loss": 0.6346,
      "step": 256
    },
    {
      "epoch": 0.5736607142857143,
      "grad_norm": 1.2656333446502686,
      "learning_rate": 1.9224232664815563e-05,
      "loss": 0.6538,
      "step": 257
    },
    {
      "epoch": 0.5758928571428571,
      "grad_norm": 1.1230204105377197,
      "learning_rate": 1.9214450675855832e-05,
      "loss": 0.6518,
      "step": 258
    },
    {
      "epoch": 0.578125,
      "grad_norm": 1.183430790901184,
      "learning_rate": 1.9204609920615635e-05,
      "loss": 0.6264,
      "step": 259
    },
    {
      "epoch": 0.5803571428571429,
      "grad_norm": 1.3236072063446045,
      "learning_rate": 1.919471046185558e-05,
      "loss": 0.6611,
      "step": 260
    },
    {
      "epoch": 0.5825892857142857,
      "grad_norm": 1.1483640670776367,
      "learning_rate": 1.9184752362710674e-05,
      "loss": 0.558,
      "step": 261
    },
    {
      "epoch": 0.5848214285714286,
      "grad_norm": 1.333167314529419,
      "learning_rate": 1.917473568668991e-05,
      "loss": 0.6025,
      "step": 262
    },
    {
      "epoch": 0.5870535714285714,
      "grad_norm": 1.0283328294754028,
      "learning_rate": 1.9164660497675848e-05,
      "loss": 0.5354,
      "step": 263
    },
    {
      "epoch": 0.5892857142857143,
      "grad_norm": 1.1128134727478027,
      "learning_rate": 1.9154526859924242e-05,
      "loss": 0.6335,
      "step": 264
    },
    {
      "epoch": 0.5915178571428571,
      "grad_norm": 1.057431697845459,
      "learning_rate": 1.9144334838063595e-05,
      "loss": 0.5429,
      "step": 265
    },
    {
      "epoch": 0.59375,
      "grad_norm": 1.1948074102401733,
      "learning_rate": 1.9134084497094766e-05,
      "loss": 0.6311,
      "step": 266
    },
    {
      "epoch": 0.5959821428571429,
      "grad_norm": 1.2003923654556274,
      "learning_rate": 1.9123775902390555e-05,
      "loss": 0.6843,
      "step": 267
    },
    {
      "epoch": 0.5982142857142857,
      "grad_norm": 1.1661674976348877,
      "learning_rate": 1.9113409119695276e-05,
      "loss": 0.524,
      "step": 268
    },
    {
      "epoch": 0.6004464285714286,
      "grad_norm": 1.081857681274414,
      "learning_rate": 1.9102984215124352e-05,
      "loss": 0.5441,
      "step": 269
    },
    {
      "epoch": 0.6026785714285714,
      "grad_norm": 1.0927963256835938,
      "learning_rate": 1.9092501255163874e-05,
      "loss": 0.5054,
      "step": 270
    },
    {
      "epoch": 0.6049107142857143,
      "grad_norm": 1.1680039167404175,
      "learning_rate": 1.9081960306670198e-05,
      "loss": 0.59,
      "step": 271
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 1.2833560705184937,
      "learning_rate": 1.907136143686951e-05,
      "loss": 0.6486,
      "step": 272
    },
    {
      "epoch": 0.609375,
      "grad_norm": 1.117241382598877,
      "learning_rate": 1.9060704713357382e-05,
      "loss": 0.5582,
      "step": 273
    },
    {
      "epoch": 0.6116071428571429,
      "grad_norm": 0.9881597757339478,
      "learning_rate": 1.904999020409837e-05,
      "loss": 0.606,
      "step": 274
    },
    {
      "epoch": 0.6138392857142857,
      "grad_norm": 1.057505488395691,
      "learning_rate": 1.9039217977425567e-05,
      "loss": 0.571,
      "step": 275
    },
    {
      "epoch": 0.6160714285714286,
      "grad_norm": 0.9963536858558655,
      "learning_rate": 1.902838810204015e-05,
      "loss": 0.5612,
      "step": 276
    },
    {
      "epoch": 0.6183035714285714,
      "grad_norm": 1.0682551860809326,
      "learning_rate": 1.901750064701097e-05,
      "loss": 0.5194,
      "step": 277
    },
    {
      "epoch": 0.6205357142857143,
      "grad_norm": 1.136527419090271,
      "learning_rate": 1.90065556817741e-05,
      "loss": 0.5609,
      "step": 278
    },
    {
      "epoch": 0.6227678571428571,
      "grad_norm": 0.9889798164367676,
      "learning_rate": 1.8995553276132385e-05,
      "loss": 0.5247,
      "step": 279
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.151685118675232,
      "learning_rate": 1.8984493500255e-05,
      "loss": 0.6895,
      "step": 280
    },
    {
      "epoch": 0.6272321428571429,
      "grad_norm": 1.2816126346588135,
      "learning_rate": 1.8973376424677022e-05,
      "loss": 0.6387,
      "step": 281
    },
    {
      "epoch": 0.6294642857142857,
      "grad_norm": 1.0590577125549316,
      "learning_rate": 1.8962202120298948e-05,
      "loss": 0.6099,
      "step": 282
    },
    {
      "epoch": 0.6316964285714286,
      "grad_norm": 1.278604507446289,
      "learning_rate": 1.8950970658386262e-05,
      "loss": 0.5988,
      "step": 283
    },
    {
      "epoch": 0.6339285714285714,
      "grad_norm": 1.2634141445159912,
      "learning_rate": 1.8939682110568982e-05,
      "loss": 0.5764,
      "step": 284
    },
    {
      "epoch": 0.6361607142857143,
      "grad_norm": 1.1590498685836792,
      "learning_rate": 1.8928336548841197e-05,
      "loss": 0.544,
      "step": 285
    },
    {
      "epoch": 0.6383928571428571,
      "grad_norm": 1.0856074094772339,
      "learning_rate": 1.8916934045560603e-05,
      "loss": 0.6177,
      "step": 286
    },
    {
      "epoch": 0.640625,
      "grad_norm": 1.1679246425628662,
      "learning_rate": 1.8905474673448055e-05,
      "loss": 0.5372,
      "step": 287
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 1.0703572034835815,
      "learning_rate": 1.8893958505587093e-05,
      "loss": 0.5629,
      "step": 288
    },
    {
      "epoch": 0.6450892857142857,
      "grad_norm": 1.2214595079421997,
      "learning_rate": 1.8882385615423477e-05,
      "loss": 0.593,
      "step": 289
    },
    {
      "epoch": 0.6473214285714286,
      "grad_norm": 1.0748178958892822,
      "learning_rate": 1.8870756076764728e-05,
      "loss": 0.5756,
      "step": 290
    },
    {
      "epoch": 0.6495535714285714,
      "grad_norm": 1.0605632066726685,
      "learning_rate": 1.8859069963779636e-05,
      "loss": 0.5999,
      "step": 291
    },
    {
      "epoch": 0.6517857142857143,
      "grad_norm": 1.0350596904754639,
      "learning_rate": 1.8847327350997814e-05,
      "loss": 0.5449,
      "step": 292
    },
    {
      "epoch": 0.6540178571428571,
      "grad_norm": 0.9954982399940491,
      "learning_rate": 1.88355283133092e-05,
      "loss": 0.5592,
      "step": 293
    },
    {
      "epoch": 0.65625,
      "grad_norm": 1.2038896083831787,
      "learning_rate": 1.8823672925963598e-05,
      "loss": 0.6072,
      "step": 294
    },
    {
      "epoch": 0.6584821428571429,
      "grad_norm": 1.0482138395309448,
      "learning_rate": 1.8811761264570177e-05,
      "loss": 0.616,
      "step": 295
    },
    {
      "epoch": 0.6607142857142857,
      "grad_norm": 1.1453136205673218,
      "learning_rate": 1.879979340509701e-05,
      "loss": 0.6414,
      "step": 296
    },
    {
      "epoch": 0.6629464285714286,
      "grad_norm": 0.9775916934013367,
      "learning_rate": 1.8787769423870583e-05,
      "loss": 0.542,
      "step": 297
    },
    {
      "epoch": 0.6651785714285714,
      "grad_norm": 1.2236546277999878,
      "learning_rate": 1.877568939757529e-05,
      "loss": 0.5851,
      "step": 298
    },
    {
      "epoch": 0.6674107142857143,
      "grad_norm": 1.1174293756484985,
      "learning_rate": 1.8763553403252975e-05,
      "loss": 0.4804,
      "step": 299
    },
    {
      "epoch": 0.6696428571428571,
      "grad_norm": 1.034529685974121,
      "learning_rate": 1.8751361518302413e-05,
      "loss": 0.5805,
      "step": 300
    },
    {
      "epoch": 0.671875,
      "grad_norm": 1.0704272985458374,
      "learning_rate": 1.873911382047884e-05,
      "loss": 0.6559,
      "step": 301
    },
    {
      "epoch": 0.6741071428571429,
      "grad_norm": 1.0234812498092651,
      "learning_rate": 1.8726810387893438e-05,
      "loss": 0.5194,
      "step": 302
    },
    {
      "epoch": 0.6763392857142857,
      "grad_norm": 1.131823182106018,
      "learning_rate": 1.871445129901284e-05,
      "loss": 0.6617,
      "step": 303
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 1.3236502408981323,
      "learning_rate": 1.8702036632658646e-05,
      "loss": 0.5964,
      "step": 304
    },
    {
      "epoch": 0.6808035714285714,
      "grad_norm": 1.0573298931121826,
      "learning_rate": 1.8689566468006898e-05,
      "loss": 0.588,
      "step": 305
    },
    {
      "epoch": 0.6830357142857143,
      "grad_norm": 0.9685704708099365,
      "learning_rate": 1.867704088458759e-05,
      "loss": 0.5622,
      "step": 306
    },
    {
      "epoch": 0.6852678571428571,
      "grad_norm": 1.1851588487625122,
      "learning_rate": 1.866445996228415e-05,
      "loss": 0.5787,
      "step": 307
    },
    {
      "epoch": 0.6875,
      "grad_norm": 1.1072279214859009,
      "learning_rate": 1.8651823781332948e-05,
      "loss": 0.6292,
      "step": 308
    },
    {
      "epoch": 0.6897321428571429,
      "grad_norm": 1.1180968284606934,
      "learning_rate": 1.863913242232276e-05,
      "loss": 0.5659,
      "step": 309
    },
    {
      "epoch": 0.6919642857142857,
      "grad_norm": 1.1263612508773804,
      "learning_rate": 1.8626385966194275e-05,
      "loss": 0.6296,
      "step": 310
    },
    {
      "epoch": 0.6941964285714286,
      "grad_norm": 1.116678237915039,
      "learning_rate": 1.8613584494239568e-05,
      "loss": 0.6357,
      "step": 311
    },
    {
      "epoch": 0.6964285714285714,
      "grad_norm": 1.0281323194503784,
      "learning_rate": 1.8600728088101587e-05,
      "loss": 0.5647,
      "step": 312
    },
    {
      "epoch": 0.6986607142857143,
      "grad_norm": 1.3668344020843506,
      "learning_rate": 1.858781682977362e-05,
      "loss": 0.5325,
      "step": 313
    },
    {
      "epoch": 0.7008928571428571,
      "grad_norm": 1.0618714094161987,
      "learning_rate": 1.857485080159879e-05,
      "loss": 0.661,
      "step": 314
    },
    {
      "epoch": 0.703125,
      "grad_norm": 1.1447020769119263,
      "learning_rate": 1.8561830086269524e-05,
      "loss": 0.6475,
      "step": 315
    },
    {
      "epoch": 0.7053571428571429,
      "grad_norm": 1.0199358463287354,
      "learning_rate": 1.8548754766827016e-05,
      "loss": 0.5274,
      "step": 316
    },
    {
      "epoch": 0.7075892857142857,
      "grad_norm": 0.9829248785972595,
      "learning_rate": 1.8535624926660707e-05,
      "loss": 0.5969,
      "step": 317
    },
    {
      "epoch": 0.7098214285714286,
      "grad_norm": 1.1307730674743652,
      "learning_rate": 1.852244064950775e-05,
      "loss": 0.5884,
      "step": 318
    },
    {
      "epoch": 0.7120535714285714,
      "grad_norm": 1.0643337965011597,
      "learning_rate": 1.8509202019452472e-05,
      "loss": 0.5436,
      "step": 319
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 1.2980728149414062,
      "learning_rate": 1.8495909120925857e-05,
      "loss": 0.6208,
      "step": 320
    },
    {
      "epoch": 0.7165178571428571,
      "grad_norm": 1.0896912813186646,
      "learning_rate": 1.8482562038704975e-05,
      "loss": 0.5372,
      "step": 321
    },
    {
      "epoch": 0.71875,
      "grad_norm": 1.2717396020889282,
      "learning_rate": 1.846916085791247e-05,
      "loss": 0.6847,
      "step": 322
    },
    {
      "epoch": 0.7209821428571429,
      "grad_norm": 1.1865652799606323,
      "learning_rate": 1.8455705664016003e-05,
      "loss": 0.6028,
      "step": 323
    },
    {
      "epoch": 0.7232142857142857,
      "grad_norm": 1.1411960124969482,
      "learning_rate": 1.8442196542827712e-05,
      "loss": 0.6161,
      "step": 324
    },
    {
      "epoch": 0.7254464285714286,
      "grad_norm": 1.3044698238372803,
      "learning_rate": 1.8428633580503658e-05,
      "loss": 0.7426,
      "step": 325
    },
    {
      "epoch": 0.7276785714285714,
      "grad_norm": 1.2270197868347168,
      "learning_rate": 1.8415016863543286e-05,
      "loss": 0.6773,
      "step": 326
    },
    {
      "epoch": 0.7299107142857143,
      "grad_norm": 1.191628098487854,
      "learning_rate": 1.8401346478788865e-05,
      "loss": 0.632,
      "step": 327
    },
    {
      "epoch": 0.7321428571428571,
      "grad_norm": 1.2179105281829834,
      "learning_rate": 1.8387622513424942e-05,
      "loss": 0.6345,
      "step": 328
    },
    {
      "epoch": 0.734375,
      "grad_norm": 1.125915288925171,
      "learning_rate": 1.8373845054977764e-05,
      "loss": 0.5677,
      "step": 329
    },
    {
      "epoch": 0.7366071428571429,
      "grad_norm": 1.111037015914917,
      "learning_rate": 1.836001419131476e-05,
      "loss": 0.5699,
      "step": 330
    },
    {
      "epoch": 0.7388392857142857,
      "grad_norm": 1.1068542003631592,
      "learning_rate": 1.834613001064394e-05,
      "loss": 0.5378,
      "step": 331
    },
    {
      "epoch": 0.7410714285714286,
      "grad_norm": 1.2940490245819092,
      "learning_rate": 1.8332192601513358e-05,
      "loss": 0.6397,
      "step": 332
    },
    {
      "epoch": 0.7433035714285714,
      "grad_norm": 1.1395896673202515,
      "learning_rate": 1.8318202052810538e-05,
      "loss": 0.6114,
      "step": 333
    },
    {
      "epoch": 0.7455357142857143,
      "grad_norm": 1.046424150466919,
      "learning_rate": 1.8304158453761904e-05,
      "loss": 0.5117,
      "step": 334
    },
    {
      "epoch": 0.7477678571428571,
      "grad_norm": 1.1523703336715698,
      "learning_rate": 1.829006189393222e-05,
      "loss": 0.616,
      "step": 335
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.104680061340332,
      "learning_rate": 1.827591246322401e-05,
      "loss": 0.5834,
      "step": 336
    },
    {
      "epoch": 0.75,
      "eval_loss": 0.5638446807861328,
      "eval_runtime": 31.0443,
      "eval_samples_per_second": 2.351,
      "eval_steps_per_second": 0.322,
      "step": 336
    },
    {
      "epoch": 0.7522321428571429,
      "grad_norm": 1.0383703708648682,
      "learning_rate": 1.8261710251876993e-05,
      "loss": 0.5373,
      "step": 337
    },
    {
      "epoch": 0.7544642857142857,
      "grad_norm": 1.1541510820388794,
      "learning_rate": 1.8247455350467496e-05,
      "loss": 0.6174,
      "step": 338
    },
    {
      "epoch": 0.7566964285714286,
      "grad_norm": 1.024482250213623,
      "learning_rate": 1.8233147849907894e-05,
      "loss": 0.5369,
      "step": 339
    },
    {
      "epoch": 0.7589285714285714,
      "grad_norm": 1.229745864868164,
      "learning_rate": 1.8218787841446003e-05,
      "loss": 0.5801,
      "step": 340
    },
    {
      "epoch": 0.7611607142857143,
      "grad_norm": 1.1336222887039185,
      "learning_rate": 1.8204375416664536e-05,
      "loss": 0.5951,
      "step": 341
    },
    {
      "epoch": 0.7633928571428571,
      "grad_norm": 1.0349760055541992,
      "learning_rate": 1.8189910667480476e-05,
      "loss": 0.4982,
      "step": 342
    },
    {
      "epoch": 0.765625,
      "grad_norm": 1.3031398057937622,
      "learning_rate": 1.8175393686144524e-05,
      "loss": 0.629,
      "step": 343
    },
    {
      "epoch": 0.7678571428571429,
      "grad_norm": 1.2154903411865234,
      "learning_rate": 1.8160824565240495e-05,
      "loss": 0.6085,
      "step": 344
    },
    {
      "epoch": 0.7700892857142857,
      "grad_norm": 0.9560657143592834,
      "learning_rate": 1.8146203397684734e-05,
      "loss": 0.5177,
      "step": 345
    },
    {
      "epoch": 0.7723214285714286,
      "grad_norm": 1.1145116090774536,
      "learning_rate": 1.8131530276725514e-05,
      "loss": 0.6308,
      "step": 346
    },
    {
      "epoch": 0.7745535714285714,
      "grad_norm": 1.1008496284484863,
      "learning_rate": 1.811680529594245e-05,
      "loss": 0.6026,
      "step": 347
    },
    {
      "epoch": 0.7767857142857143,
      "grad_norm": 1.0683940649032593,
      "learning_rate": 1.8102028549245894e-05,
      "loss": 0.5556,
      "step": 348
    },
    {
      "epoch": 0.7790178571428571,
      "grad_norm": 1.1773459911346436,
      "learning_rate": 1.808720013087635e-05,
      "loss": 0.5652,
      "step": 349
    },
    {
      "epoch": 0.78125,
      "grad_norm": 1.0454031229019165,
      "learning_rate": 1.8072320135403862e-05,
      "loss": 0.5117,
      "step": 350
    },
    {
      "epoch": 0.7834821428571429,
      "grad_norm": 1.130771279335022,
      "learning_rate": 1.805738865772741e-05,
      "loss": 0.6254,
      "step": 351
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 1.0127828121185303,
      "learning_rate": 1.804240579307431e-05,
      "loss": 0.5923,
      "step": 352
    },
    {
      "epoch": 0.7879464285714286,
      "grad_norm": 1.0340120792388916,
      "learning_rate": 1.8027371636999605e-05,
      "loss": 0.5331,
      "step": 353
    },
    {
      "epoch": 0.7901785714285714,
      "grad_norm": 1.3208816051483154,
      "learning_rate": 1.8012286285385456e-05,
      "loss": 0.7229,
      "step": 354
    },
    {
      "epoch": 0.7924107142857143,
      "grad_norm": 1.3123232126235962,
      "learning_rate": 1.7997149834440527e-05,
      "loss": 0.6147,
      "step": 355
    },
    {
      "epoch": 0.7946428571428571,
      "grad_norm": 1.0704232454299927,
      "learning_rate": 1.7981962380699376e-05,
      "loss": 0.6055,
      "step": 356
    },
    {
      "epoch": 0.796875,
      "grad_norm": 1.1561580896377563,
      "learning_rate": 1.7966724021021837e-05,
      "loss": 0.544,
      "step": 357
    },
    {
      "epoch": 0.7991071428571429,
      "grad_norm": 1.1618038415908813,
      "learning_rate": 1.7951434852592406e-05,
      "loss": 0.5955,
      "step": 358
    },
    {
      "epoch": 0.8013392857142857,
      "grad_norm": 1.0671852827072144,
      "learning_rate": 1.793609497291961e-05,
      "loss": 0.5581,
      "step": 359
    },
    {
      "epoch": 0.8035714285714286,
      "grad_norm": 1.125833511352539,
      "learning_rate": 1.79207044798354e-05,
      "loss": 0.7114,
      "step": 360
    },
    {
      "epoch": 0.8058035714285714,
      "grad_norm": 1.2748684883117676,
      "learning_rate": 1.7905263471494522e-05,
      "loss": 0.7434,
      "step": 361
    },
    {
      "epoch": 0.8080357142857143,
      "grad_norm": 1.0450520515441895,
      "learning_rate": 1.788977204637388e-05,
      "loss": 0.5177,
      "step": 362
    },
    {
      "epoch": 0.8102678571428571,
      "grad_norm": 1.2043256759643555,
      "learning_rate": 1.7874230303271932e-05,
      "loss": 0.7341,
      "step": 363
    },
    {
      "epoch": 0.8125,
      "grad_norm": 1.1245315074920654,
      "learning_rate": 1.7858638341308026e-05,
      "loss": 0.6051,
      "step": 364
    },
    {
      "epoch": 0.8147321428571429,
      "grad_norm": 1.2384027242660522,
      "learning_rate": 1.78429962599218e-05,
      "loss": 0.7031,
      "step": 365
    },
    {
      "epoch": 0.8169642857142857,
      "grad_norm": 1.1532883644104004,
      "learning_rate": 1.7827304158872538e-05,
      "loss": 0.5226,
      "step": 366
    },
    {
      "epoch": 0.8191964285714286,
      "grad_norm": 1.0486387014389038,
      "learning_rate": 1.7811562138238508e-05,
      "loss": 0.5454,
      "step": 367
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 1.1059547662734985,
      "learning_rate": 1.779577029841638e-05,
      "loss": 0.6249,
      "step": 368
    },
    {
      "epoch": 0.8236607142857143,
      "grad_norm": 1.2745469808578491,
      "learning_rate": 1.7779928740120525e-05,
      "loss": 0.6617,
      "step": 369
    },
    {
      "epoch": 0.8258928571428571,
      "grad_norm": 1.0087100267410278,
      "learning_rate": 1.776403756438241e-05,
      "loss": 0.5446,
      "step": 370
    },
    {
      "epoch": 0.828125,
      "grad_norm": 1.2666665315628052,
      "learning_rate": 1.774809687254994e-05,
      "loss": 0.6908,
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.8303571428571429, |
|
"grad_norm": 1.4326931238174438, |
|
"learning_rate": 1.773210676628682e-05, |
|
"loss": 0.6975, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.8325892857142857, |
|
"grad_norm": 1.0072338581085205, |
|
"learning_rate": 1.77160673475719e-05, |
|
"loss": 0.5109, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.8348214285714286, |
|
"grad_norm": 0.9262451529502869, |
|
"learning_rate": 1.769997871869852e-05, |
|
"loss": 0.5139, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.8370535714285714, |
|
"grad_norm": 1.3120925426483154, |
|
"learning_rate": 1.768384098227387e-05, |
|
"loss": 0.6241, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8392857142857143, |
|
"grad_norm": 1.2064335346221924, |
|
"learning_rate": 1.7667654241218332e-05, |
|
"loss": 0.6312, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.8415178571428571, |
|
"grad_norm": 1.1633553504943848, |
|
"learning_rate": 1.765141859876481e-05, |
|
"loss": 0.619, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 1.361649990081787, |
|
"learning_rate": 1.7635134158458095e-05, |
|
"loss": 0.6553, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.8459821428571429, |
|
"grad_norm": 1.1922473907470703, |
|
"learning_rate": 1.7618801024154186e-05, |
|
"loss": 0.5775, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.8482142857142857, |
|
"grad_norm": 1.3128317594528198, |
|
"learning_rate": 1.7602419300019627e-05, |
|
"loss": 0.5734, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8504464285714286, |
|
"grad_norm": 1.246530532836914, |
|
"learning_rate": 1.758598909053087e-05, |
|
"loss": 0.5827, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.8526785714285714, |
|
"grad_norm": 1.207715392112732, |
|
"learning_rate": 1.7569510500473566e-05, |
|
"loss": 0.5851, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.8549107142857143, |
|
"grad_norm": 0.8977461457252502, |
|
"learning_rate": 1.7552983634941928e-05, |
|
"loss": 0.4392, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 1.298916220664978, |
|
"learning_rate": 1.753640859933806e-05, |
|
"loss": 0.5916, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 1.3132508993148804, |
|
"learning_rate": 1.751978549937126e-05, |
|
"loss": 0.6496, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8616071428571429, |
|
"grad_norm": 1.1107579469680786, |
|
"learning_rate": 1.7503114441057374e-05, |
|
"loss": 0.609, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.8638392857142857, |
|
"grad_norm": 1.1084481477737427, |
|
"learning_rate": 1.7486395530718104e-05, |
|
"loss": 0.6507, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.8660714285714286, |
|
"grad_norm": 1.031421184539795, |
|
"learning_rate": 1.746962887498034e-05, |
|
"loss": 0.5508, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.8683035714285714, |
|
"grad_norm": 1.0834870338439941, |
|
"learning_rate": 1.7452814580775467e-05, |
|
"loss": 0.5516, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.8705357142857143, |
|
"grad_norm": 0.9795181155204773, |
|
"learning_rate": 1.743595275533869e-05, |
|
"loss": 0.5161, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8727678571428571, |
|
"grad_norm": 0.9782564043998718, |
|
"learning_rate": 1.7419043506208348e-05, |
|
"loss": 0.6326, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 1.0224997997283936, |
|
"learning_rate": 1.7402086941225246e-05, |
|
"loss": 0.5398, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.8772321428571429, |
|
"grad_norm": 1.1170098781585693, |
|
"learning_rate": 1.7385083168531934e-05, |
|
"loss": 0.5963, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.8794642857142857, |
|
"grad_norm": 1.1109061241149902, |
|
"learning_rate": 1.736803229657204e-05, |
|
"loss": 0.6092, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.8816964285714286, |
|
"grad_norm": 1.0568723678588867, |
|
"learning_rate": 1.7350934434089583e-05, |
|
"loss": 0.5028, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8839285714285714, |
|
"grad_norm": 1.0677611827850342, |
|
"learning_rate": 1.7333789690128252e-05, |
|
"loss": 0.6654, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.8861607142857143, |
|
"grad_norm": 1.2150548696517944, |
|
"learning_rate": 1.7316598174030746e-05, |
|
"loss": 0.6201, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.8883928571428571, |
|
"grad_norm": 1.2608318328857422, |
|
"learning_rate": 1.7299359995438046e-05, |
|
"loss": 0.6584, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.890625, |
|
"grad_norm": 1.141231894493103, |
|
"learning_rate": 1.728207526428873e-05, |
|
"loss": 0.6207, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 1.2013863325119019, |
|
"learning_rate": 1.7264744090818284e-05, |
|
"loss": 0.6354, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8950892857142857, |
|
"grad_norm": 1.1436806917190552, |
|
"learning_rate": 1.7247366585558366e-05, |
|
"loss": 0.6111, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.8973214285714286, |
|
"grad_norm": 1.0974454879760742, |
|
"learning_rate": 1.7229942859336142e-05, |
|
"loss": 0.6545, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.8995535714285714, |
|
"grad_norm": 1.0279815196990967, |
|
"learning_rate": 1.7212473023273532e-05, |
|
"loss": 0.5486, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9017857142857143, |
|
"grad_norm": 1.2103922367095947, |
|
"learning_rate": 1.719495718878655e-05, |
|
"loss": 0.6333, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9040178571428571, |
|
"grad_norm": 1.1030718088150024, |
|
"learning_rate": 1.7177395467584564e-05, |
|
"loss": 0.581, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 1.5910372734069824, |
|
"learning_rate": 1.7159787971669586e-05, |
|
"loss": 0.6665, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9084821428571429, |
|
"grad_norm": 1.4932862520217896, |
|
"learning_rate": 1.7142134813335557e-05, |
|
"loss": 0.6512, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.9107142857142857, |
|
"grad_norm": 1.2478338479995728, |
|
"learning_rate": 1.712443610516765e-05, |
|
"loss": 0.6352, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.9129464285714286, |
|
"grad_norm": 1.2439334392547607, |
|
"learning_rate": 1.7106691960041527e-05, |
|
"loss": 0.6865, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.9151785714285714, |
|
"grad_norm": 1.0125123262405396, |
|
"learning_rate": 1.7088902491122636e-05, |
|
"loss": 0.6067, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9174107142857143, |
|
"grad_norm": 1.1370826959609985, |
|
"learning_rate": 1.7071067811865477e-05, |
|
"loss": 0.6601, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.9196428571428571, |
|
"grad_norm": 1.3260993957519531, |
|
"learning_rate": 1.7053188036012885e-05, |
|
"loss": 0.5318, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.921875, |
|
"grad_norm": 1.2528599500656128, |
|
"learning_rate": 1.7035263277595314e-05, |
|
"loss": 0.5438, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.9241071428571429, |
|
"grad_norm": 1.011576533317566, |
|
"learning_rate": 1.7017293650930083e-05, |
|
"loss": 0.6103, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.9263392857142857, |
|
"grad_norm": 1.0763773918151855, |
|
"learning_rate": 1.6999279270620675e-05, |
|
"loss": 0.6163, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9285714285714286, |
|
"grad_norm": 1.0680584907531738, |
|
"learning_rate": 1.6981220251555996e-05, |
|
"loss": 0.5902, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.9308035714285714, |
|
"grad_norm": 1.2038758993148804, |
|
"learning_rate": 1.6963116708909637e-05, |
|
"loss": 0.629, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.9330357142857143, |
|
"grad_norm": 0.9378647804260254, |
|
"learning_rate": 1.6944968758139144e-05, |
|
"loss": 0.5668, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.9352678571428571, |
|
"grad_norm": 0.9915615916252136, |
|
"learning_rate": 1.6926776514985278e-05, |
|
"loss": 0.5527, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 1.2160735130310059, |
|
"learning_rate": 1.6908540095471288e-05, |
|
"loss": 0.6082, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9397321428571429, |
|
"grad_norm": 1.1135092973709106, |
|
"learning_rate": 1.6890259615902153e-05, |
|
"loss": 0.6318, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.9419642857142857, |
|
"grad_norm": 1.111850619316101, |
|
"learning_rate": 1.6871935192863862e-05, |
|
"loss": 0.558, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.9441964285714286, |
|
"grad_norm": 1.0438261032104492, |
|
"learning_rate": 1.6853566943222647e-05, |
|
"loss": 0.6356, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.9464285714285714, |
|
"grad_norm": 1.1192708015441895, |
|
"learning_rate": 1.6835154984124266e-05, |
|
"loss": 0.6006, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.9486607142857143, |
|
"grad_norm": 0.9830177426338196, |
|
"learning_rate": 1.6816699432993212e-05, |
|
"loss": 0.6372, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9508928571428571, |
|
"grad_norm": 1.1955080032348633, |
|
"learning_rate": 1.6798200407532025e-05, |
|
"loss": 0.6932, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.953125, |
|
"grad_norm": 1.1463717222213745, |
|
"learning_rate": 1.677965802572048e-05, |
|
"loss": 0.6139, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.9553571428571429, |
|
"grad_norm": 1.1220048666000366, |
|
"learning_rate": 1.676107240581488e-05, |
|
"loss": 0.5997, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.9575892857142857, |
|
"grad_norm": 1.2134525775909424, |
|
"learning_rate": 1.674244366634727e-05, |
|
"loss": 0.6085, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.9598214285714286, |
|
"grad_norm": 1.4323443174362183, |
|
"learning_rate": 1.6723771926124704e-05, |
|
"loss": 0.7118, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9620535714285714, |
|
"grad_norm": 1.0498019456863403, |
|
"learning_rate": 1.6705057304228488e-05, |
|
"loss": 0.5317, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.9642857142857143, |
|
"grad_norm": 1.1488829851150513, |
|
"learning_rate": 1.6686299920013388e-05, |
|
"loss": 0.5828, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.9665178571428571, |
|
"grad_norm": 0.9636843800544739, |
|
"learning_rate": 1.666749989310691e-05, |
|
"loss": 0.5752, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 1.0637983083724976, |
|
"learning_rate": 1.6648657343408517e-05, |
|
"loss": 0.5987, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.9709821428571429, |
|
"grad_norm": 1.0497316122055054, |
|
"learning_rate": 1.6629772391088855e-05, |
|
"loss": 0.5571, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.9732142857142857, |
|
"grad_norm": 1.2553322315216064, |
|
"learning_rate": 1.661084515658901e-05, |
|
"loss": 0.6733, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.9754464285714286, |
|
"grad_norm": 0.9725781083106995, |
|
"learning_rate": 1.6591875760619718e-05, |
|
"loss": 0.4813, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.9776785714285714, |
|
"grad_norm": 1.3850640058517456, |
|
"learning_rate": 1.6572864324160617e-05, |
|
"loss": 0.6368, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.9799107142857143, |
|
"grad_norm": 1.094842791557312, |
|
"learning_rate": 1.6553810968459455e-05, |
|
"loss": 0.5475, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.9821428571428571, |
|
"grad_norm": 1.1491671800613403, |
|
"learning_rate": 1.6534715815031325e-05, |
|
"loss": 0.6112, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.984375, |
|
"grad_norm": 1.1847591400146484, |
|
"learning_rate": 1.651557898565789e-05, |
|
"loss": 0.6173, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.9866071428571429, |
|
"grad_norm": 1.0778274536132812, |
|
"learning_rate": 1.649640060238661e-05, |
|
"loss": 0.5835, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.9888392857142857, |
|
"grad_norm": 1.1943531036376953, |
|
"learning_rate": 1.6477180787529957e-05, |
|
"loss": 0.6343, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.9910714285714286, |
|
"grad_norm": 1.1000702381134033, |
|
"learning_rate": 1.645791966366464e-05, |
|
"loss": 0.5547, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.9933035714285714, |
|
"grad_norm": 1.0542763471603394, |
|
"learning_rate": 1.6438617353630823e-05, |
|
"loss": 0.5713, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.9955357142857143, |
|
"grad_norm": 1.0858733654022217, |
|
"learning_rate": 1.6419273980531333e-05, |
|
"loss": 0.6663, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.9977678571428571, |
|
"grad_norm": 1.0207633972167969, |
|
"learning_rate": 1.6399889667730887e-05, |
|
"loss": 0.5343, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.1045469045639038, |
|
"learning_rate": 1.63804645388553e-05, |
|
"loss": 0.544, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.5578042268753052, |
|
"eval_runtime": 28.1156, |
|
"eval_samples_per_second": 2.596, |
|
"eval_steps_per_second": 0.356, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.0022321428571428, |
|
"grad_norm": 1.2090537548065186, |
|
"learning_rate": 1.6360998717790694e-05, |
|
"loss": 0.3826, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.0044642857142858, |
|
"grad_norm": 1.4487948417663574, |
|
"learning_rate": 1.6341492328682703e-05, |
|
"loss": 0.5219, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0066964285714286, |
|
"grad_norm": 1.1036040782928467, |
|
"learning_rate": 1.6321945495935717e-05, |
|
"loss": 0.4923, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.0089285714285714, |
|
"grad_norm": 1.0219038724899292, |
|
"learning_rate": 1.6302358344212025e-05, |
|
"loss": 0.4067, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.0111607142857142, |
|
"grad_norm": 1.129643440246582, |
|
"learning_rate": 1.6282730998431072e-05, |
|
"loss": 0.4884, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.0133928571428572, |
|
"grad_norm": 1.1244534254074097, |
|
"learning_rate": 1.6263063583768652e-05, |
|
"loss": 0.3948, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.015625, |
|
"grad_norm": 1.1327791213989258, |
|
"learning_rate": 1.624335622565609e-05, |
|
"loss": 0.46, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.0178571428571428, |
|
"grad_norm": 1.2120473384857178, |
|
"learning_rate": 1.622360904977946e-05, |
|
"loss": 0.4226, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.0200892857142858, |
|
"grad_norm": 1.6121370792388916, |
|
"learning_rate": 1.6203822182078777e-05, |
|
"loss": 0.5237, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.0223214285714286, |
|
"grad_norm": 1.3280739784240723, |
|
"learning_rate": 1.6183995748747204e-05, |
|
"loss": 0.4842, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.0245535714285714, |
|
"grad_norm": 1.1011444330215454, |
|
"learning_rate": 1.6164129876230226e-05, |
|
"loss": 0.3867, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.0267857142857142, |
|
"grad_norm": 1.1854206323623657, |
|
"learning_rate": 1.6144224691224868e-05, |
|
"loss": 0.4239, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.0290178571428572, |
|
"grad_norm": 1.2774749994277954, |
|
"learning_rate": 1.6124280320678864e-05, |
|
"loss": 0.4665, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 1.2907570600509644, |
|
"learning_rate": 1.6104296891789867e-05, |
|
"loss": 0.5446, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.0334821428571428, |
|
"grad_norm": 1.1413031816482544, |
|
"learning_rate": 1.608427453200463e-05, |
|
"loss": 0.4177, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.0357142857142858, |
|
"grad_norm": 1.1599458456039429, |
|
"learning_rate": 1.606421336901818e-05, |
|
"loss": 0.4736, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.0379464285714286, |
|
"grad_norm": 1.319823145866394, |
|
"learning_rate": 1.6044113530773034e-05, |
|
"loss": 0.5214, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.0401785714285714, |
|
"grad_norm": 1.1600507497787476, |
|
"learning_rate": 1.6023975145458352e-05, |
|
"loss": 0.4226, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.0424107142857142, |
|
"grad_norm": 0.9671021699905396, |
|
"learning_rate": 1.600379834150914e-05, |
|
"loss": 0.3623, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.0446428571428572, |
|
"grad_norm": 0.9996952414512634, |
|
"learning_rate": 1.5983583247605414e-05, |
|
"loss": 0.423, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.046875, |
|
"grad_norm": 1.362450122833252, |
|
"learning_rate": 1.5963329992671402e-05, |
|
"loss": 0.4763, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.0491071428571428, |
|
"grad_norm": 1.3633580207824707, |
|
"learning_rate": 1.5943038705874697e-05, |
|
"loss": 0.4702, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.0513392857142858, |
|
"grad_norm": 1.2059600353240967, |
|
"learning_rate": 1.5922709516625453e-05, |
|
"loss": 0.4417, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.0535714285714286, |
|
"grad_norm": 1.2279843091964722, |
|
"learning_rate": 1.590234255457555e-05, |
|
"loss": 0.4629, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.0558035714285714, |
|
"grad_norm": 1.2962024211883545, |
|
"learning_rate": 1.588193794961776e-05, |
|
"loss": 0.4493, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.0580357142857142, |
|
"grad_norm": 0.9999826550483704, |
|
"learning_rate": 1.5861495831884942e-05, |
|
"loss": 0.3847, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.0602678571428572, |
|
"grad_norm": 1.0514469146728516, |
|
"learning_rate": 1.5841016331749185e-05, |
|
"loss": 0.4358, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 1.110394835472107, |
|
"learning_rate": 1.582049957982099e-05, |
|
"loss": 0.3376, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.0647321428571428, |
|
"grad_norm": 1.2494930028915405, |
|
"learning_rate": 1.5799945706948447e-05, |
|
"loss": 0.4125, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.0669642857142858, |
|
"grad_norm": 1.2270575761795044, |
|
"learning_rate": 1.5779354844216377e-05, |
|
"loss": 0.4493, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.0691964285714286, |
|
"grad_norm": 1.3279017210006714, |
|
"learning_rate": 1.5758727122945514e-05, |
|
"loss": 0.498, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 1.0876953601837158, |
|
"learning_rate": 1.5738062674691657e-05, |
|
"loss": 0.475, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0736607142857142, |
|
"grad_norm": 1.154995322227478, |
|
"learning_rate": 1.5717361631244842e-05, |
|
"loss": 0.4415, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.0758928571428572, |
|
"grad_norm": 1.1559693813323975, |
|
"learning_rate": 1.5696624124628495e-05, |
|
"loss": 0.4855, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.078125, |
|
"grad_norm": 1.4143540859222412, |
|
"learning_rate": 1.5675850287098585e-05, |
|
"loss": 0.5263, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.0803571428571428, |
|
"grad_norm": 1.0588111877441406, |
|
"learning_rate": 1.5655040251142787e-05, |
|
"loss": 0.444, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.0825892857142858, |
|
"grad_norm": 1.0768473148345947, |
|
"learning_rate": 1.5634194149479642e-05, |
|
"loss": 0.4086, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.0848214285714286, |
|
"grad_norm": 1.2432984113693237, |
|
"learning_rate": 1.5613312115057697e-05, |
|
"loss": 0.5302, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.0870535714285714, |
|
"grad_norm": 1.0816019773483276, |
|
"learning_rate": 1.559239428105467e-05, |
|
"loss": 0.4353, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.0892857142857142, |
|
"grad_norm": 1.3731716871261597, |
|
"learning_rate": 1.5571440780876588e-05, |
|
"loss": 0.4319, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.0915178571428572, |
|
"grad_norm": 1.2017320394515991, |
|
"learning_rate": 1.5550451748156957e-05, |
|
"loss": 0.4251, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 1.1153476238250732, |
|
"learning_rate": 1.5529427316755876e-05, |
|
"loss": 0.4685, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0959821428571428, |
|
"grad_norm": 1.0116766691207886, |
|
"learning_rate": 1.5508367620759224e-05, |
|
"loss": 0.3844, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.0982142857142858, |
|
"grad_norm": 1.1723082065582275, |
|
"learning_rate": 1.548727279447777e-05, |
|
"loss": 0.4365, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.1004464285714286, |
|
"grad_norm": 1.2044928073883057, |
|
"learning_rate": 1.546614297244634e-05, |
|
"loss": 0.4114, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.1026785714285714, |
|
"grad_norm": 1.3545663356781006, |
|
"learning_rate": 1.5444978289422937e-05, |
|
"loss": 0.4485, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.1049107142857142, |
|
"grad_norm": 1.1752678155899048, |
|
"learning_rate": 1.542377888038791e-05, |
|
"loss": 0.4622, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.1071428571428572, |
|
"grad_norm": 1.1464520692825317, |
|
"learning_rate": 1.540254488054307e-05, |
|
"loss": 0.4101, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.109375, |
|
"grad_norm": 1.1562168598175049, |
|
"learning_rate": 1.538127642531083e-05, |
|
"loss": 0.4391, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.1116071428571428, |
|
"grad_norm": 1.3013789653778076, |
|
"learning_rate": 1.5359973650333352e-05, |
|
"loss": 0.5043, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.1138392857142858, |
|
"grad_norm": 1.0924135446548462, |
|
"learning_rate": 1.533863669147168e-05, |
|
"loss": 0.4345, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.1160714285714286, |
|
"grad_norm": 1.099307894706726, |
|
"learning_rate": 1.5317265684804865e-05, |
|
"loss": 0.423, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1183035714285714, |
|
"grad_norm": 1.159507155418396, |
|
"learning_rate": 1.5295860766629098e-05, |
|
"loss": 0.4412, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.1205357142857142, |
|
"grad_norm": 1.2187080383300781, |
|
"learning_rate": 1.5274422073456853e-05, |
|
"loss": 0.4726, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.1227678571428572, |
|
"grad_norm": 1.1509562730789185, |
|
"learning_rate": 1.5252949742016005e-05, |
|
"loss": 0.3894, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 1.2710036039352417, |
|
"learning_rate": 1.5231443909248956e-05, |
|
"loss": 0.5062, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.1272321428571428, |
|
"grad_norm": 1.172450304031372, |
|
"learning_rate": 1.5209904712311777e-05, |
|
"loss": 0.409, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.1294642857142858, |
|
"grad_norm": 1.2753404378890991, |
|
"learning_rate": 1.5188332288573313e-05, |
|
"loss": 0.44, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.1316964285714286, |
|
"grad_norm": 1.1560388803482056, |
|
"learning_rate": 1.5166726775614327e-05, |
|
"loss": 0.4809, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.1339285714285714, |
|
"grad_norm": 1.3690383434295654, |
|
"learning_rate": 1.5145088311226599e-05, |
|
"loss": 0.4882, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.1361607142857142, |
|
"grad_norm": 1.2509236335754395, |
|
"learning_rate": 1.5123417033412078e-05, |
|
"loss": 0.3845, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.1383928571428572, |
|
"grad_norm": 1.1513557434082031, |
|
"learning_rate": 1.510171308038197e-05, |
|
"loss": 0.4387, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.140625, |
|
"grad_norm": 1.3206255435943604, |
|
"learning_rate": 1.5079976590555876e-05, |
|
"loss": 0.4861, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 1.1096549034118652, |
|
"learning_rate": 1.5058207702560907e-05, |
|
"loss": 0.4612, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.1450892857142858, |
|
"grad_norm": 1.1876708269119263, |
|
"learning_rate": 1.5036406555230794e-05, |
|
"loss": 0.4633, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.1473214285714286, |
|
"grad_norm": 1.07047438621521, |
|
"learning_rate": 1.501457328760501e-05, |
|
"loss": 0.3813, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.1495535714285714, |
|
"grad_norm": 1.0505889654159546, |
|
"learning_rate": 1.499270803892787e-05, |
|
"loss": 0.3661, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.1517857142857142, |
|
"grad_norm": 1.243187665939331, |
|
"learning_rate": 1.4970810948647664e-05, |
|
"loss": 0.4753, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.1540178571428572, |
|
"grad_norm": 1.1276707649230957, |
|
"learning_rate": 1.4948882156415748e-05, |
|
"loss": 0.4216, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 1.1489506959915161, |
|
"learning_rate": 1.4926921802085662e-05, |
|
"loss": 0.5241, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.1584821428571428, |
|
"grad_norm": 1.2082480192184448, |
|
"learning_rate": 1.4904930025712236e-05, |
|
"loss": 0.4244, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.1607142857142858, |
|
"grad_norm": 1.0853203535079956, |
|
"learning_rate": 1.4882906967550708e-05, |
|
"loss": 0.4449, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.1629464285714286, |
|
"grad_norm": 1.0796903371810913, |
|
"learning_rate": 1.4860852768055804e-05, |
|
"loss": 0.4915, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.1651785714285714, |
|
"grad_norm": 1.31143057346344, |
|
"learning_rate": 1.4838767567880865e-05, |
|
"loss": 0.4262, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.1674107142857142, |
|
"grad_norm": 1.1698493957519531, |
|
"learning_rate": 1.4816651507876946e-05, |
|
"loss": 0.4845, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.1696428571428572, |
|
"grad_norm": 1.1286602020263672, |
|
"learning_rate": 1.479450472909191e-05, |
|
"loss": 0.3967, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.171875, |
|
"grad_norm": 1.054138422012329, |
|
"learning_rate": 1.4772327372769533e-05, |
|
"loss": 0.4502, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.1741071428571428, |
|
"grad_norm": 1.3807618618011475, |
|
"learning_rate": 1.4750119580348601e-05, |
|
"loss": 0.5044, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.1763392857142858, |
|
"grad_norm": 1.2077445983886719, |
|
"learning_rate": 1.4727881493462018e-05, |
|
"loss": 0.3643, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.1785714285714286, |
|
"grad_norm": 1.1224011182785034, |
|
"learning_rate": 1.4705613253935886e-05, |
|
"loss": 0.4503, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.1808035714285714, |
|
"grad_norm": 1.0725282430648804, |
|
"learning_rate": 1.4683315003788614e-05, |
|
"loss": 0.4861, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1830357142857142, |
|
"grad_norm": 1.2990797758102417, |
|
"learning_rate": 1.4660986885230002e-05, |
|
"loss": 0.4194, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1852678571428572, |
|
"grad_norm": 1.075061321258545, |
|
"learning_rate": 1.463862904066035e-05, |
|
"loss": 0.4469, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 1.1602604389190674, |
|
"learning_rate": 1.4616241612669523e-05, |
|
"loss": 0.433, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.1897321428571428, |
|
"grad_norm": 1.1690677404403687, |
|
"learning_rate": 1.4593824744036078e-05, |
|
"loss": 0.4625, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.1919642857142858, |
|
"grad_norm": 1.1553549766540527, |
|
"learning_rate": 1.4571378577726317e-05, |
|
"loss": 0.4143, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.1941964285714286, |
|
"grad_norm": 1.2570900917053223, |
|
"learning_rate": 1.4548903256893392e-05, |
|
"loss": 0.4434, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.1964285714285714, |
|
"grad_norm": 1.2540876865386963, |
|
"learning_rate": 1.4526398924876407e-05, |
|
"loss": 0.461, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.1986607142857142, |
|
"grad_norm": 1.1208131313323975, |
|
"learning_rate": 1.4503865725199468e-05, |
|
"loss": 0.4251, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.2008928571428572, |
|
"grad_norm": 1.0891457796096802, |
|
"learning_rate": 1.4481303801570805e-05, |
|
"loss": 0.4534, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.203125, |
|
"grad_norm": 0.965118944644928, |
|
"learning_rate": 1.4458713297881828e-05, |
|
"loss": 0.4057, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.2053571428571428, |
|
"grad_norm": 1.086959958076477, |
|
"learning_rate": 1.4436094358206224e-05, |
|
"loss": 0.4249, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.2075892857142858, |
|
"grad_norm": 1.0582690238952637, |
|
"learning_rate": 1.4413447126799038e-05, |
|
"loss": 0.3932, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.2098214285714286, |
|
"grad_norm": 0.9543986916542053, |
|
"learning_rate": 1.4390771748095735e-05, |
|
"loss": 0.3234, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.2120535714285714, |
|
"grad_norm": 1.2138005495071411, |
|
"learning_rate": 1.436806836671131e-05, |
|
"loss": 0.4154, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.2142857142857142, |
|
"grad_norm": 1.23991060256958, |
|
"learning_rate": 1.4345337127439333e-05, |
|
"loss": 0.4757, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.2165178571428572, |
|
"grad_norm": 1.1649141311645508, |
|
"learning_rate": 1.4322578175251058e-05, |
|
"loss": 0.4841, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 1.238889217376709, |
|
"learning_rate": 1.4299791655294461e-05, |
|
"loss": 0.4381, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.2209821428571428, |
|
"grad_norm": 1.1498550176620483, |
|
"learning_rate": 1.4276977712893357e-05, |
|
"loss": 0.4608, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.2232142857142858, |
|
"grad_norm": 1.2343175411224365, |
|
"learning_rate": 1.4254136493546432e-05, |
|
"loss": 0.4884, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.2254464285714286, |
|
"grad_norm": 1.2278847694396973, |
|
"learning_rate": 1.4231268142926345e-05, |
|
"loss": 0.489, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.2276785714285714, |
|
"grad_norm": 1.3790256977081299, |
|
"learning_rate": 1.4208372806878782e-05, |
|
"loss": 0.4945, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2299107142857142, |
|
"grad_norm": 1.2269234657287598, |
|
"learning_rate": 1.4185450631421542e-05, |
|
"loss": 0.5471, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.2321428571428572, |
|
"grad_norm": 1.1349695920944214, |
|
"learning_rate": 1.4162501762743579e-05, |
|
"loss": 0.4547, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.234375, |
|
"grad_norm": 1.1662615537643433, |
|
"learning_rate": 1.41395263472041e-05, |
|
"loss": 0.478, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.2366071428571428, |
|
"grad_norm": 1.1003633737564087, |
|
"learning_rate": 1.4116524531331616e-05, |
|
"loss": 0.4237, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.2388392857142858, |
|
"grad_norm": 1.012122631072998, |
|
"learning_rate": 1.4093496461823002e-05, |
|
"loss": 0.4799, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.2410714285714286, |
|
"grad_norm": 1.0672236680984497, |
|
"learning_rate": 1.4070442285542579e-05, |
|
"loss": 0.4342, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.2433035714285714, |
|
"grad_norm": 1.1434969902038574, |
|
"learning_rate": 1.4047362149521152e-05, |
|
"loss": 0.4758, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.2455357142857142, |
|
"grad_norm": 1.1280899047851562, |
|
"learning_rate": 1.402425620095511e-05, |
|
"loss": 0.4325, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.2477678571428572, |
|
"grad_norm": 1.074324369430542, |
|
"learning_rate": 1.400112458720544e-05, |
|
"loss": 0.4504, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0355358123779297, |
|
"learning_rate": 1.3977967455796828e-05, |
|
"loss": 0.464, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.5843456983566284, |
|
"eval_runtime": 27.5507, |
|
"eval_samples_per_second": 2.65, |
|
"eval_steps_per_second": 0.363, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.2522321428571428, |
|
"grad_norm": 1.1094245910644531, |
|
"learning_rate": 1.3954784954416703e-05, |
|
"loss": 0.458, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.2544642857142856, |
|
"grad_norm": 1.1554150581359863, |
|
"learning_rate": 1.393157723091428e-05, |
|
"loss": 0.4661, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.2566964285714286, |
|
"grad_norm": 1.195674180984497, |
|
"learning_rate": 1.3908344433299644e-05, |
|
"loss": 0.5074, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.2589285714285714, |
|
"grad_norm": 1.067400336265564, |
|
"learning_rate": 1.3885086709742788e-05, |
|
"loss": 0.3862, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.2611607142857144, |
|
"grad_norm": 1.1013455390930176, |
|
"learning_rate": 1.3861804208572674e-05, |
|
"loss": 0.4355, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.2633928571428572, |
|
"grad_norm": 1.1833617687225342, |
|
"learning_rate": 1.3838497078276288e-05, |
|
"loss": 0.4716, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.265625, |
|
"grad_norm": 1.09175705909729, |
|
"learning_rate": 1.3815165467497686e-05, |
|
"loss": 0.4745, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.2678571428571428, |
|
"grad_norm": 1.2390869855880737, |
|
"learning_rate": 1.3791809525037057e-05, |
|
"loss": 0.428, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.2700892857142856, |
|
"grad_norm": 1.2348787784576416, |
|
"learning_rate": 1.376842939984977e-05, |
|
"loss": 0.3749, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.2723214285714286, |
|
"grad_norm": 0.9252075552940369, |
|
"learning_rate": 1.3745025241045414e-05, |
|
"loss": 0.4135, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.2745535714285714, |
|
"grad_norm": 1.3814020156860352, |
|
"learning_rate": 1.372159719788686e-05, |
|
"loss": 0.4562, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.2767857142857144, |
|
"grad_norm": 1.3983325958251953, |
|
"learning_rate": 1.3698145419789302e-05, |
|
"loss": 0.4768, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.2790178571428572, |
|
"grad_norm": 1.184475064277649, |
|
"learning_rate": 1.3674670056319315e-05, |
|
"loss": 0.4687, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 1.053714632987976, |
|
"learning_rate": 1.3651171257193883e-05, |
|
"loss": 0.4564, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.2834821428571428, |
|
"grad_norm": 1.1729894876480103, |
|
"learning_rate": 1.3627649172279453e-05, |
|
"loss": 0.4586, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 1.1714091300964355, |
|
"learning_rate": 1.3604103951590993e-05, |
|
"loss": 0.4365, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.2879464285714286, |
|
"grad_norm": 1.2077913284301758, |
|
"learning_rate": 1.3580535745291001e-05, |
|
"loss": 0.4415, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.2901785714285714, |
|
"grad_norm": 1.2015424966812134, |
|
"learning_rate": 1.3556944703688592e-05, |
|
"loss": 0.4644, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.2924107142857144, |
|
"grad_norm": 1.0738661289215088, |
|
"learning_rate": 1.3533330977238496e-05, |
|
"loss": 0.4131, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.2946428571428572, |
|
"grad_norm": 1.1716837882995605, |
|
"learning_rate": 1.3509694716540135e-05, |
|
"loss": 0.4083, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.296875, |
|
"grad_norm": 0.9854421019554138, |
|
"learning_rate": 1.348603607233663e-05, |
|
"loss": 0.3796, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.2991071428571428, |
|
"grad_norm": 1.105299949645996, |
|
"learning_rate": 1.3462355195513868e-05, |
|
"loss": 0.4918, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.3013392857142856, |
|
"grad_norm": 1.2206473350524902, |
|
"learning_rate": 1.343865223709952e-05, |
|
"loss": 0.4594, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.3035714285714286, |
|
"grad_norm": 1.1916580200195312, |
|
"learning_rate": 1.341492734826209e-05, |
|
"loss": 0.482, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.3058035714285714, |
|
"grad_norm": 1.0674858093261719, |
|
"learning_rate": 1.3391180680309945e-05, |
|
"loss": 0.4192, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.3080357142857144, |
|
"grad_norm": 1.403185486793518, |
|
"learning_rate": 1.3367412384690346e-05, |
|
"loss": 0.5409, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.3102678571428572, |
|
"grad_norm": 1.0415091514587402, |
|
"learning_rate": 1.3343622612988492e-05, |
|
"loss": 0.4695, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 1.290241003036499, |
|
"learning_rate": 1.3319811516926541e-05, |
|
"loss": 0.4639, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.3147321428571428, |
|
"grad_norm": 0.9970055222511292, |
|
"learning_rate": 1.329597924836267e-05, |
|
"loss": 0.428, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.3169642857142856, |
|
"grad_norm": 1.2482655048370361, |
|
"learning_rate": 1.3272125959290059e-05, |
|
"loss": 0.4967, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.3191964285714286, |
|
"grad_norm": 1.0975176095962524, |
|
"learning_rate": 1.3248251801835968e-05, |
|
"loss": 0.4343, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.3214285714285714, |
|
"grad_norm": 1.1875313520431519, |
|
"learning_rate": 1.3224356928260735e-05, |
|
"loss": 0.387, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.3236607142857144, |
|
"grad_norm": 1.1995247602462769, |
|
"learning_rate": 1.3200441490956832e-05, |
|
"loss": 0.4853, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.3258928571428572, |
|
"grad_norm": 1.2359397411346436, |
|
"learning_rate": 1.317650564244787e-05, |
|
"loss": 0.4778, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.328125, |
|
"grad_norm": 0.9895375967025757, |
|
"learning_rate": 1.3152549535387624e-05, |
|
"loss": 0.4386, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.3303571428571428, |
|
"grad_norm": 1.1084294319152832, |
|
"learning_rate": 1.3128573322559097e-05, |
|
"loss": 0.4152, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.3325892857142856, |
|
"grad_norm": 1.230063557624817, |
|
"learning_rate": 1.3104577156873496e-05, |
|
"loss": 0.413, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.3348214285714286, |
|
"grad_norm": 1.1854337453842163, |
|
"learning_rate": 1.3080561191369286e-05, |
|
"loss": 0.471, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.3370535714285714, |
|
"grad_norm": 0.9070828557014465, |
|
"learning_rate": 1.3056525579211215e-05, |
|
"loss": 0.3926, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.3392857142857144, |
|
"grad_norm": 1.3953931331634521, |
|
"learning_rate": 1.3032470473689322e-05, |
|
"loss": 0.4771, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3415178571428572, |
|
"grad_norm": 1.1881200075149536, |
|
"learning_rate": 1.3008396028217969e-05, |
|
"loss": 0.4817, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 1.27316153049469, |
|
"learning_rate": 1.298430239633486e-05, |
|
"loss": 0.4898, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.3459821428571428, |
|
"grad_norm": 1.0014166831970215, |
|
"learning_rate": 1.296018973170007e-05, |
|
"loss": 0.4285, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.3482142857142856, |
|
"grad_norm": 1.0519405603408813, |
|
"learning_rate": 1.2936058188095045e-05, |
|
"loss": 0.4123, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.3504464285714286, |
|
"grad_norm": 1.192347526550293, |
|
"learning_rate": 1.2911907919421647e-05, |
|
"loss": 0.5152, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.3526785714285714, |
|
"grad_norm": 1.1087520122528076, |
|
"learning_rate": 1.2887739079701147e-05, |
|
"loss": 0.4679, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.3549107142857144, |
|
"grad_norm": 1.2100986242294312, |
|
"learning_rate": 1.2863551823073266e-05, |
|
"loss": 0.4792, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.3571428571428572, |
|
"grad_norm": 1.1435983180999756, |
|
"learning_rate": 1.2839346303795173e-05, |
|
"loss": 0.4087, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.359375, |
|
"grad_norm": 1.0820369720458984, |
|
"learning_rate": 1.2815122676240518e-05, |
|
"loss": 0.3981, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.3616071428571428, |
|
"grad_norm": 1.0322455167770386, |
|
"learning_rate": 1.2790881094898428e-05, |
|
"loss": 0.489, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.3638392857142856, |
|
"grad_norm": 1.12478768825531, |
|
"learning_rate": 1.2766621714372543e-05, |
|
"loss": 0.3883, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.3660714285714286, |
|
"grad_norm": 1.2271108627319336, |
|
"learning_rate": 1.274234468938001e-05, |
|
"loss": 0.4615, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.3683035714285714, |
|
"grad_norm": 1.1735365390777588, |
|
"learning_rate": 1.271805017475051e-05, |
|
"loss": 0.421, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.3705357142857144, |
|
"grad_norm": 1.11384916305542, |
|
"learning_rate": 1.2693738325425272e-05, |
|
"loss": 0.4568, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.3727678571428572, |
|
"grad_norm": 1.5108226537704468, |
|
"learning_rate": 1.266940929645606e-05, |
|
"loss": 0.5374, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 1.1557199954986572, |
|
"learning_rate": 1.2645063243004236e-05, |
|
"loss": 0.3919, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.3772321428571428, |
|
"grad_norm": 1.1951533555984497, |
|
"learning_rate": 1.2620700320339705e-05, |
|
"loss": 0.4521, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.3794642857142856, |
|
"grad_norm": 1.1673365831375122, |
|
"learning_rate": 1.2596320683839976e-05, |
|
"loss": 0.4613, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.3816964285714286, |
|
"grad_norm": 1.2299069166183472, |
|
"learning_rate": 1.2571924488989145e-05, |
|
"loss": 0.436, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.3839285714285714, |
|
"grad_norm": 1.0827404260635376, |
|
"learning_rate": 1.2547511891376916e-05, |
|
"loss": 0.3655, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.3861607142857144, |
|
"grad_norm": 1.1951014995574951, |
|
"learning_rate": 1.2523083046697598e-05, |
|
"loss": 0.4677, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.3883928571428572, |
|
"grad_norm": 1.2478485107421875, |
|
"learning_rate": 1.2498638110749122e-05, |
|
"loss": 0.4752, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.390625, |
|
"grad_norm": 1.0962241888046265, |
|
"learning_rate": 1.2474177239432042e-05, |
|
"loss": 0.4408, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.3928571428571428, |
|
"grad_norm": 1.0092295408248901, |
|
"learning_rate": 1.2449700588748541e-05, |
|
"loss": 0.4805, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.3950892857142856, |
|
"grad_norm": 1.1849055290222168, |
|
"learning_rate": 1.2425208314801441e-05, |
|
"loss": 0.403, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.3973214285714286, |
|
"grad_norm": 1.1802829504013062, |
|
"learning_rate": 1.2400700573793191e-05, |
|
"loss": 0.4818, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.3995535714285714, |
|
"grad_norm": 1.2204647064208984, |
|
"learning_rate": 1.23761775220249e-05, |
|
"loss": 0.4584, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.4017857142857144, |
|
"grad_norm": 1.3409292697906494, |
|
"learning_rate": 1.2351639315895309e-05, |
|
"loss": 0.5377, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.4040178571428572, |
|
"grad_norm": 1.1798982620239258, |
|
"learning_rate": 1.2327086111899816e-05, |
|
"loss": 0.5223, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 1.2914079427719116, |
|
"learning_rate": 1.2302518066629467e-05, |
|
"loss": 0.4595, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.4084821428571428, |
|
"grad_norm": 1.084916114807129, |
|
"learning_rate": 1.2277935336769961e-05, |
|
"loss": 0.4484, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.4107142857142856, |
|
"grad_norm": 1.1137053966522217, |
|
"learning_rate": 1.2253338079100652e-05, |
|
"loss": 0.4465, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.4129464285714286, |
|
"grad_norm": 1.1979303359985352, |
|
"learning_rate": 1.2228726450493538e-05, |
|
"loss": 0.4932, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.4151785714285714, |
|
"grad_norm": 1.1469295024871826, |
|
"learning_rate": 1.2204100607912277e-05, |
|
"loss": 0.4786, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.4174107142857144, |
|
"grad_norm": 1.144337773323059, |
|
"learning_rate": 1.2179460708411177e-05, |
|
"loss": 0.42, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.4196428571428572, |
|
"grad_norm": 1.2468199729919434, |
|
"learning_rate": 1.2154806909134198e-05, |
|
"loss": 0.4205, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.421875, |
|
"grad_norm": 1.237923264503479, |
|
"learning_rate": 1.213013936731394e-05, |
|
"loss": 0.4724, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.4241071428571428, |
|
"grad_norm": 1.166019082069397, |
|
"learning_rate": 1.210545824027066e-05, |
|
"loss": 0.475, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.4263392857142856, |
|
"grad_norm": 1.1303397417068481, |
|
"learning_rate": 1.2080763685411243e-05, |
|
"loss": 0.4825, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 1.0004240274429321, |
|
"learning_rate": 1.205605586022822e-05, |
|
"loss": 0.4399, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.4308035714285714, |
|
"grad_norm": 1.0718317031860352, |
|
"learning_rate": 1.2031334922298749e-05, |
|
"loss": 0.4585, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.4330357142857144, |
|
"grad_norm": 1.0824089050292969, |
|
"learning_rate": 1.2006601029283629e-05, |
|
"loss": 0.4365, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.4352678571428572, |
|
"grad_norm": 1.1855394840240479, |
|
"learning_rate": 1.1981854338926262e-05, |
|
"loss": 0.4693, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 1.056105375289917, |
|
"learning_rate": 1.1957095009051683e-05, |
|
"loss": 0.4306, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.4397321428571428, |
|
"grad_norm": 1.109755516052246, |
|
"learning_rate": 1.193232319756553e-05, |
|
"loss": 0.4184, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.4419642857142856, |
|
"grad_norm": 1.1697285175323486, |
|
"learning_rate": 1.1907539062453044e-05, |
|
"loss": 0.4157, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.4441964285714286, |
|
"grad_norm": 1.3334587812423706, |
|
"learning_rate": 1.1882742761778069e-05, |
|
"loss": 0.4534, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.4464285714285714, |
|
"grad_norm": 1.3505901098251343, |
|
"learning_rate": 1.1857934453682016e-05, |
|
"loss": 0.4529, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.4486607142857144, |
|
"grad_norm": 1.154270887374878, |
|
"learning_rate": 1.1833114296382903e-05, |
|
"loss": 0.4426, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.4508928571428572, |
|
"grad_norm": 1.0679898262023926, |
|
"learning_rate": 1.1808282448174295e-05, |
|
"loss": 0.3802, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.453125, |
|
"grad_norm": 1.2348114252090454, |
|
"learning_rate": 1.1783439067424329e-05, |
|
"loss": 0.467, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.4553571428571428, |
|
"grad_norm": 1.1057263612747192, |
|
"learning_rate": 1.1758584312574693e-05, |
|
"loss": 0.4077, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.4575892857142856, |
|
"grad_norm": 1.0530842542648315, |
|
"learning_rate": 1.17337183421396e-05, |
|
"loss": 0.4221, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.4598214285714286, |
|
"grad_norm": 1.0335931777954102, |
|
"learning_rate": 1.1708841314704811e-05, |
|
"loss": 0.4917, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.4620535714285714, |
|
"grad_norm": 0.9055652618408203, |
|
"learning_rate": 1.1683953388926592e-05, |
|
"loss": 0.3894, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.4642857142857144, |
|
"grad_norm": 1.1665129661560059, |
|
"learning_rate": 1.1659054723530721e-05, |
|
"loss": 0.4008, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.4665178571428572, |
|
"grad_norm": 1.3352913856506348, |
|
"learning_rate": 1.163414547731146e-05, |
|
"loss": 0.4859, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 1.474576473236084, |
|
"learning_rate": 1.1609225809130566e-05, |
|
"loss": 0.4766, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.4709821428571428, |
|
"grad_norm": 0.9900811314582825, |
|
"learning_rate": 1.1584295877916251e-05, |
|
"loss": 0.3852, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.4732142857142856, |
|
"grad_norm": 1.0743541717529297, |
|
"learning_rate": 1.1559355842662188e-05, |
|
"loss": 0.4747, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.4754464285714286, |
|
"grad_norm": 1.0290722846984863, |
|
"learning_rate": 1.1534405862426481e-05, |
|
"loss": 0.4397, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.4776785714285714, |
|
"grad_norm": 1.0267919301986694, |
|
"learning_rate": 1.150944609633067e-05, |
|
"loss": 0.4738, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.4799107142857144, |
|
"grad_norm": 1.073096752166748, |
|
"learning_rate": 1.1484476703558698e-05, |
|
"loss": 0.4751, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.4821428571428572, |
|
"grad_norm": 1.1763546466827393, |
|
"learning_rate": 1.1459497843355907e-05, |
|
"loss": 0.4471, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.484375, |
|
"grad_norm": 1.1504164934158325, |
|
"learning_rate": 1.1434509675028018e-05, |
|
"loss": 0.4272, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.4866071428571428, |
|
"grad_norm": 1.0036594867706299, |
|
"learning_rate": 1.1409512357940114e-05, |
|
"loss": 0.4174, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.4888392857142856, |
|
"grad_norm": 1.3433315753936768, |
|
"learning_rate": 1.138450605151563e-05, |
|
"loss": 0.4634, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.4910714285714286, |
|
"grad_norm": 1.1281894445419312, |
|
"learning_rate": 1.1359490915235323e-05, |
|
"loss": 0.5026, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.4933035714285714, |
|
"grad_norm": 1.0535964965820312, |
|
"learning_rate": 1.1334467108636273e-05, |
|
"loss": 0.4849, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.4955357142857144, |
|
"grad_norm": 1.028182864189148, |
|
"learning_rate": 1.1309434791310848e-05, |
|
"loss": 0.5133, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.4977678571428572, |
|
"grad_norm": 1.0213048458099365, |
|
"learning_rate": 1.1284394122905697e-05, |
|
"loss": 0.4587, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.157265067100525, |
|
"learning_rate": 1.1259345263120738e-05, |
|
"loss": 0.4129, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.5762594938278198, |
|
"eval_runtime": 28.7549, |
|
"eval_samples_per_second": 2.539, |
|
"eval_steps_per_second": 0.348, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.5022321428571428, |
|
"grad_norm": 1.186301589012146, |
|
"learning_rate": 1.1234288371708112e-05, |
|
"loss": 0.4361, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.5044642857142856, |
|
"grad_norm": 1.1052271127700806, |
|
"learning_rate": 1.1209223608471202e-05, |
|
"loss": 0.415, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.5066964285714286, |
|
"grad_norm": 1.0246312618255615, |
|
"learning_rate": 1.1184151133263578e-05, |
|
"loss": 0.4258, |
|
"step": 675 |
|
}, |
|
{ "epoch": 1.5089285714285714, "grad_norm": 1.1626089811325073, "learning_rate": 1.1159071105988012e-05, "loss": 0.4135, "step": 676 },
{ "epoch": 1.5111607142857144, "grad_norm": 1.3956563472747803, "learning_rate": 1.1133983686595416e-05, "loss": 0.4887, "step": 677 },
{ "epoch": 1.5133928571428572, "grad_norm": 1.3288111686706543, "learning_rate": 1.110888903508387e-05, "loss": 0.5532, "step": 678 },
{ "epoch": 1.515625, "grad_norm": 1.1640434265136719, "learning_rate": 1.1083787311497562e-05, "loss": 0.459, "step": 679 },
{ "epoch": 1.5178571428571428, "grad_norm": 1.3315949440002441, "learning_rate": 1.1058678675925796e-05, "loss": 0.4436, "step": 680 },
{ "epoch": 1.5200892857142856, "grad_norm": 1.4845269918441772, "learning_rate": 1.1033563288501944e-05, "loss": 0.4378, "step": 681 },
{ "epoch": 1.5223214285714286, "grad_norm": 1.1825847625732422, "learning_rate": 1.1008441309402448e-05, "loss": 0.4766, "step": 682 },
{ "epoch": 1.5245535714285714, "grad_norm": 1.282860279083252, "learning_rate": 1.0983312898845788e-05, "loss": 0.4995, "step": 683 },
{ "epoch": 1.5267857142857144, "grad_norm": 1.080328345298767, "learning_rate": 1.0958178217091455e-05, "loss": 0.3866, "step": 684 },
{ "epoch": 1.5290178571428572, "grad_norm": 1.218947172164917, "learning_rate": 1.093303742443895e-05, "loss": 0.528, "step": 685 },
{ "epoch": 1.53125, "grad_norm": 1.027030348777771, "learning_rate": 1.0907890681226728e-05, "loss": 0.4396, "step": 686 },
{ "epoch": 1.5334821428571428, "grad_norm": 1.1486948728561401, "learning_rate": 1.0882738147831209e-05, "loss": 0.4119, "step": 687 },
{ "epoch": 1.5357142857142856, "grad_norm": 1.2360179424285889, "learning_rate": 1.0857579984665733e-05, "loss": 0.4318, "step": 688 },
{ "epoch": 1.5379464285714286, "grad_norm": 1.1595594882965088, "learning_rate": 1.0832416352179549e-05, "loss": 0.4664, "step": 689 },
{ "epoch": 1.5401785714285714, "grad_norm": 1.145702838897705, "learning_rate": 1.0807247410856783e-05, "loss": 0.4545, "step": 690 },
{ "epoch": 1.5424107142857144, "grad_norm": 1.0401489734649658, "learning_rate": 1.0782073321215423e-05, "loss": 0.4562, "step": 691 },
{ "epoch": 1.5446428571428572, "grad_norm": 0.9377378821372986, "learning_rate": 1.0756894243806291e-05, "loss": 0.4018, "step": 692 },
{ "epoch": 1.546875, "grad_norm": 1.0041829347610474, "learning_rate": 1.073171033921201e-05, "loss": 0.4464, "step": 693 },
{ "epoch": 1.5491071428571428, "grad_norm": 1.0813108682632446, "learning_rate": 1.0706521768046006e-05, "loss": 0.419, "step": 694 },
{ "epoch": 1.5513392857142856, "grad_norm": 1.024339199066162, "learning_rate": 1.0681328690951447e-05, "loss": 0.4, "step": 695 },
{ "epoch": 1.5535714285714286, "grad_norm": 1.2120790481567383, "learning_rate": 1.0656131268600254e-05, "loss": 0.4255, "step": 696 },
{ "epoch": 1.5558035714285714, "grad_norm": 1.0487234592437744, "learning_rate": 1.0630929661692051e-05, "loss": 0.4281, "step": 697 },
{ "epoch": 1.5580357142857144, "grad_norm": 1.1584324836730957, "learning_rate": 1.0605724030953155e-05, "loss": 0.4147, "step": 698 },
{ "epoch": 1.5602678571428572, "grad_norm": 1.2971632480621338, "learning_rate": 1.0580514537135542e-05, "loss": 0.4992, "step": 699 },
{ "epoch": 1.5625, "grad_norm": 1.186423420906067, "learning_rate": 1.0555301341015832e-05, "loss": 0.509, "step": 700 },
{ "epoch": 1.5647321428571428, "grad_norm": 1.0909963846206665, "learning_rate": 1.0530084603394239e-05, "loss": 0.4044, "step": 701 },
{ "epoch": 1.5669642857142856, "grad_norm": 1.1188690662384033, "learning_rate": 1.0504864485093588e-05, "loss": 0.4433, "step": 702 },
{ "epoch": 1.5691964285714286, "grad_norm": 1.1996616125106812, "learning_rate": 1.0479641146958249e-05, "loss": 0.4001, "step": 703 },
{ "epoch": 1.5714285714285714, "grad_norm": 1.3726085424423218, "learning_rate": 1.0454414749853126e-05, "loss": 0.4005, "step": 704 },
{ "epoch": 1.5736607142857144, "grad_norm": 1.3191404342651367, "learning_rate": 1.0429185454662638e-05, "loss": 0.51, "step": 705 },
{ "epoch": 1.5758928571428572, "grad_norm": 1.1625711917877197, "learning_rate": 1.0403953422289687e-05, "loss": 0.4751, "step": 706 },
{ "epoch": 1.578125, "grad_norm": 0.9676429629325867, "learning_rate": 1.0378718813654633e-05, "loss": 0.4208, "step": 707 },
{ "epoch": 1.5803571428571428, "grad_norm": 1.3455173969268799, "learning_rate": 1.0353481789694258e-05, "loss": 0.5174, "step": 708 },
{ "epoch": 1.5825892857142856, "grad_norm": 1.349685549736023, "learning_rate": 1.0328242511360753e-05, "loss": 0.4612, "step": 709 },
{ "epoch": 1.5848214285714286, "grad_norm": 1.0394678115844727, "learning_rate": 1.030300113962069e-05, "loss": 0.396, "step": 710 },
{ "epoch": 1.5870535714285714, "grad_norm": 0.9809379577636719, "learning_rate": 1.0277757835453989e-05, "loss": 0.4269, "step": 711 },
{ "epoch": 1.5892857142857144, "grad_norm": 1.2518723011016846, "learning_rate": 1.0252512759852891e-05, "loss": 0.4136, "step": 712 },
{ "epoch": 1.5915178571428572, "grad_norm": 1.1232184171676636, "learning_rate": 1.0227266073820939e-05, "loss": 0.491, "step": 713 },
{ "epoch": 1.59375, "grad_norm": 1.1965726613998413, "learning_rate": 1.0202017938371947e-05, "loss": 0.5157, "step": 714 },
{ "epoch": 1.5959821428571428, "grad_norm": 1.116578459739685, "learning_rate": 1.0176768514528967e-05, "loss": 0.4045, "step": 715 },
{ "epoch": 1.5982142857142856, "grad_norm": 1.1951912641525269, "learning_rate": 1.015151796332328e-05, "loss": 0.4598, "step": 716 },
{ "epoch": 1.6004464285714286, "grad_norm": 1.137501835823059, "learning_rate": 1.012626644579334e-05, "loss": 0.521, "step": 717 },
{ "epoch": 1.6026785714285714, "grad_norm": 1.0315260887145996, "learning_rate": 1.010101412298378e-05, "loss": 0.4153, "step": 718 },
{ "epoch": 1.6049107142857144, "grad_norm": 1.1565879583358765, "learning_rate": 1.0075761155944355e-05, "loss": 0.4429, "step": 719 },
{ "epoch": 1.6071428571428572, "grad_norm": 1.1776032447814941, "learning_rate": 1.0050507705728943e-05, "loss": 0.3924, "step": 720 },
{ "epoch": 1.609375, "grad_norm": 1.1653850078582764, "learning_rate": 1.0025253933394487e-05, "loss": 0.4368, "step": 721 },
{ "epoch": 1.6116071428571428, "grad_norm": 1.1864845752716064, "learning_rate": 1e-05, "loss": 0.4214, "step": 722 },
{ "epoch": 1.6138392857142856, "grad_norm": 1.2345666885375977, "learning_rate": 9.974746066605515e-06, "loss": 0.4087, "step": 723 },
{ "epoch": 1.6160714285714286, "grad_norm": 1.0691934823989868, "learning_rate": 9.949492294271062e-06, "loss": 0.4253, "step": 724 },
{ "epoch": 1.6183035714285714, "grad_norm": 1.1589877605438232, "learning_rate": 9.924238844055646e-06, "loss": 0.4587, "step": 725 },
{ "epoch": 1.6205357142857144, "grad_norm": 1.1777771711349487, "learning_rate": 9.898985877016225e-06, "loss": 0.4513, "step": 726 },
{ "epoch": 1.6227678571428572, "grad_norm": 1.0277619361877441, "learning_rate": 9.873733554206663e-06, "loss": 0.4343, "step": 727 },
{ "epoch": 1.625, "grad_norm": 1.0607410669326782, "learning_rate": 9.848482036676725e-06, "loss": 0.454, "step": 728 },
{ "epoch": 1.6272321428571428, "grad_norm": 1.1368486881256104, "learning_rate": 9.823231485471034e-06, "loss": 0.4728, "step": 729 },
{ "epoch": 1.6294642857142856, "grad_norm": 0.9980977773666382, "learning_rate": 9.797982061628056e-06, "loss": 0.4544, "step": 730 },
{ "epoch": 1.6316964285714286, "grad_norm": 1.1738172769546509, "learning_rate": 9.772733926179066e-06, "loss": 0.4884, "step": 731 },
{ "epoch": 1.6339285714285714, "grad_norm": 1.079927682876587, "learning_rate": 9.747487240147112e-06, "loss": 0.4424, "step": 732 },
{ "epoch": 1.6361607142857144, "grad_norm": 1.255071997642517, "learning_rate": 9.722242164546016e-06, "loss": 0.4366, "step": 733 },
{ "epoch": 1.6383928571428572, "grad_norm": 1.1740080118179321, "learning_rate": 9.696998860379313e-06, "loss": 0.4567, "step": 734 },
{ "epoch": 1.640625, "grad_norm": 1.2524385452270508, "learning_rate": 9.67175748863925e-06, "loss": 0.4696, "step": 735 },
{ "epoch": 1.6428571428571428, "grad_norm": 1.1558537483215332, "learning_rate": 9.646518210305747e-06, "loss": 0.4027, "step": 736 },
{ "epoch": 1.6450892857142856, "grad_norm": 1.218324899673462, "learning_rate": 9.621281186345367e-06, "loss": 0.4871, "step": 737 },
{ "epoch": 1.6473214285714286, "grad_norm": 1.1278901100158691, "learning_rate": 9.596046577710314e-06, "loss": 0.3962, "step": 738 },
{ "epoch": 1.6495535714285714, "grad_norm": 1.1411744356155396, "learning_rate": 9.570814545337362e-06, "loss": 0.5, "step": 739 },
{ "epoch": 1.6517857142857144, "grad_norm": 1.4232391119003296, "learning_rate": 9.545585250146879e-06, "loss": 0.5546, "step": 740 },
{ "epoch": 1.6540178571428572, "grad_norm": 1.096972107887268, "learning_rate": 9.520358853041756e-06, "loss": 0.3857, "step": 741 },
{ "epoch": 1.65625, "grad_norm": 1.0176469087600708, "learning_rate": 9.495135514906415e-06, "loss": 0.4268, "step": 742 },
{ "epoch": 1.6584821428571428, "grad_norm": 1.092172384262085, "learning_rate": 9.469915396605763e-06, "loss": 0.4572, "step": 743 },
{ "epoch": 1.6607142857142856, "grad_norm": 1.2569825649261475, "learning_rate": 9.44469865898417e-06, "loss": 0.4815, "step": 744 },
{ "epoch": 1.6629464285714286, "grad_norm": 1.1099218130111694, "learning_rate": 9.41948546286446e-06, "loss": 0.4179, "step": 745 },
{ "epoch": 1.6651785714285714, "grad_norm": 1.0693453550338745, "learning_rate": 9.394275969046845e-06, "loss": 0.3651, "step": 746 },
{ "epoch": 1.6674107142857144, "grad_norm": 1.193220853805542, "learning_rate": 9.369070338307954e-06, "loss": 0.4632, "step": 747 },
{ "epoch": 1.6696428571428572, "grad_norm": 1.2311550378799438, "learning_rate": 9.34386873139975e-06, "loss": 0.4297, "step": 748 },
{ "epoch": 1.671875, "grad_norm": 1.1190276145935059, "learning_rate": 9.31867130904856e-06, "loss": 0.39, "step": 749 },
{ "epoch": 1.6741071428571428, "grad_norm": 1.3360817432403564, "learning_rate": 9.293478231954e-06, "loss": 0.5313, "step": 750 },
{ "epoch": 1.6763392857142856, "grad_norm": 1.0268186330795288, "learning_rate": 9.26828966078799e-06, "loss": 0.347, "step": 751 },
{ "epoch": 1.6785714285714286, "grad_norm": 1.1641294956207275, "learning_rate": 9.243105756193714e-06, "loss": 0.453, "step": 752 },
{ "epoch": 1.6808035714285714, "grad_norm": 0.9681382775306702, "learning_rate": 9.217926678784579e-06, "loss": 0.4076, "step": 753 },
{ "epoch": 1.6830357142857144, "grad_norm": 1.077756404876709, "learning_rate": 9.192752589143219e-06, "loss": 0.4225, "step": 754 },
{ "epoch": 1.6852678571428572, "grad_norm": 1.216125249862671, "learning_rate": 9.167583647820453e-06, "loss": 0.5314, "step": 755 },
{ "epoch": 1.6875, "grad_norm": 1.0570131540298462, "learning_rate": 9.14242001533427e-06, "loss": 0.4223, "step": 756 },
{ "epoch": 1.6897321428571428, "grad_norm": 1.0465316772460938, "learning_rate": 9.117261852168794e-06, "loss": 0.4597, "step": 757 },
{ "epoch": 1.6919642857142856, "grad_norm": 1.07583487033844, "learning_rate": 9.092109318773274e-06, "loss": 0.4947, "step": 758 },
{ "epoch": 1.6941964285714286, "grad_norm": 1.1008681058883667, "learning_rate": 9.066962575561054e-06, "loss": 0.4785, "step": 759 },
{ "epoch": 1.6964285714285714, "grad_norm": 1.061246633529663, "learning_rate": 9.041821782908544e-06, "loss": 0.4698, "step": 760 },
{ "epoch": 1.6986607142857144, "grad_norm": 0.8538286089897156, "learning_rate": 9.016687101154215e-06, "loss": 0.3926, "step": 761 },
{ "epoch": 1.7008928571428572, "grad_norm": 1.1151841878890991, "learning_rate": 8.991558690597553e-06, "loss": 0.4459, "step": 762 },
{ "epoch": 1.703125, "grad_norm": 1.27910578250885, "learning_rate": 8.966436711498058e-06, "loss": 0.4883, "step": 763 },
{ "epoch": 1.7053571428571428, "grad_norm": 1.1799464225769043, "learning_rate": 8.941321324074207e-06, "loss": 0.4439, "step": 764 },
{ "epoch": 1.7075892857142856, "grad_norm": 1.2295399904251099, "learning_rate": 8.916212688502438e-06, "loss": 0.4074, "step": 765 },
{ "epoch": 1.7098214285714286, "grad_norm": 1.0072729587554932, "learning_rate": 8.891110964916135e-06, "loss": 0.3901, "step": 766 },
{ "epoch": 1.7120535714285714, "grad_norm": 1.0866972208023071, "learning_rate": 8.866016313404586e-06, "loss": 0.4063, "step": 767 },
{ "epoch": 1.7142857142857144, "grad_norm": 1.1431010961532593, "learning_rate": 8.840928894011995e-06, "loss": 0.4814, "step": 768 },
{ "epoch": 1.7165178571428572, "grad_norm": 0.9729580879211426, "learning_rate": 8.815848866736424e-06, "loss": 0.366, "step": 769 },
{ "epoch": 1.71875, "grad_norm": 1.3122913837432861, "learning_rate": 8.790776391528803e-06, "loss": 0.4625, "step": 770 },
{ "epoch": 1.7209821428571428, "grad_norm": 1.2349814176559448, "learning_rate": 8.76571162829189e-06, "loss": 0.4846, "step": 771 },
{ "epoch": 1.7232142857142856, "grad_norm": 1.240909218788147, "learning_rate": 8.740654736879265e-06, "loss": 0.5493, "step": 772 },
{ "epoch": 1.7254464285714286, "grad_norm": 1.0323981046676636, "learning_rate": 8.715605877094304e-06, "loss": 0.3947, "step": 773 },
{ "epoch": 1.7276785714285714, "grad_norm": 1.106673002243042, "learning_rate": 8.690565208689157e-06, "loss": 0.434, "step": 774 },
{ "epoch": 1.7299107142857144, "grad_norm": 1.1972298622131348, "learning_rate": 8.665532891363732e-06, "loss": 0.4705, "step": 775 },
{ "epoch": 1.7321428571428572, "grad_norm": 1.1289480924606323, "learning_rate": 8.640509084764682e-06, "loss": 0.4872, "step": 776 },
{ "epoch": 1.734375, "grad_norm": 1.2260942459106445, "learning_rate": 8.615493948484375e-06, "loss": 0.5072, "step": 777 },
{ "epoch": 1.7366071428571428, "grad_norm": 0.9576632976531982, "learning_rate": 8.590487642059888e-06, "loss": 0.392, "step": 778 },
{ "epoch": 1.7388392857142856, "grad_norm": 1.2125643491744995, "learning_rate": 8.565490324971983e-06, "loss": 0.4466, "step": 779 },
{ "epoch": 1.7410714285714286, "grad_norm": 1.294597864151001, "learning_rate": 8.540502156644096e-06, "loss": 0.4632, "step": 780 },
{ "epoch": 1.7433035714285714, "grad_norm": 1.1891837120056152, "learning_rate": 8.515523296441304e-06, "loss": 0.446, "step": 781 },
{ "epoch": 1.7455357142857144, "grad_norm": 0.9572664499282837, "learning_rate": 8.490553903669335e-06, "loss": 0.3964, "step": 782 },
{ "epoch": 1.7477678571428572, "grad_norm": 1.1124510765075684, "learning_rate": 8.465594137573524e-06, "loss": 0.4481, "step": 783 },
{ "epoch": 1.75, "grad_norm": 1.0124688148498535, "learning_rate": 8.440644157337819e-06, "loss": 0.4132, "step": 784 },
{ "epoch": 1.75, "eval_loss": 0.5769185423851013, "eval_runtime": 27.5149, "eval_samples_per_second": 2.653, "eval_steps_per_second": 0.363, "step": 784 },
{ "epoch": 1.7522321428571428, "grad_norm": 1.0379537343978882, "learning_rate": 8.415704122083752e-06, "loss": 0.3967, "step": 785 },
{ "epoch": 1.7544642857142856, "grad_norm": 1.0697929859161377, "learning_rate": 8.390774190869434e-06, "loss": 0.4963, "step": 786 },
{ "epoch": 1.7566964285714286, "grad_norm": 1.1796789169311523, "learning_rate": 8.365854522688543e-06, "loss": 0.5222, "step": 787 },
{ "epoch": 1.7589285714285714, "grad_norm": 1.0521793365478516, "learning_rate": 8.340945276469282e-06, "loss": 0.4473, "step": 788 },
{ "epoch": 1.7611607142857144, "grad_norm": 0.9840192794799805, "learning_rate": 8.316046611073413e-06, "loss": 0.4225, "step": 789 },
{ "epoch": 1.7633928571428572, "grad_norm": 1.3078526258468628, "learning_rate": 8.29115868529519e-06, "loss": 0.4677, "step": 790 },
{ "epoch": 1.765625, "grad_norm": 1.1875336170196533, "learning_rate": 8.266281657860406e-06, "loss": 0.4099, "step": 791 },
{ "epoch": 1.7678571428571428, "grad_norm": 1.1061992645263672, "learning_rate": 8.24141568742531e-06, "loss": 0.4772, "step": 792 },
{ "epoch": 1.7700892857142856, "grad_norm": 1.0990246534347534, "learning_rate": 8.21656093257567e-06, "loss": 0.415, "step": 793 },
{ "epoch": 1.7723214285714286, "grad_norm": 1.235956072807312, "learning_rate": 8.191717551825707e-06, "loss": 0.4911, "step": 794 },
{ "epoch": 1.7745535714285714, "grad_norm": 1.0186740159988403, "learning_rate": 8.166885703617098e-06, "loss": 0.4068, "step": 795 },
{ "epoch": 1.7767857142857144, "grad_norm": 1.046399474143982, "learning_rate": 8.142065546317988e-06, "loss": 0.4641, "step": 796 },
{ "epoch": 1.7790178571428572, "grad_norm": 1.2857236862182617, "learning_rate": 8.117257238221936e-06, "loss": 0.4947, "step": 797 },
{ "epoch": 1.78125, "grad_norm": 1.0499598979949951, "learning_rate": 8.09246093754696e-06, "loss": 0.4596, "step": 798 },
{ "epoch": 1.7834821428571428, "grad_norm": 1.1524500846862793, "learning_rate": 8.067676802434472e-06, "loss": 0.4414, "step": 799 },
{ "epoch": 1.7857142857142856, "grad_norm": 1.1189254522323608, "learning_rate": 8.042904990948319e-06, "loss": 0.4364, "step": 800 },
{ "epoch": 1.7879464285714286, "grad_norm": 1.1881475448608398, "learning_rate": 8.01814566107374e-06, "loss": 0.4466, "step": 801 },
{ "epoch": 1.7901785714285714, "grad_norm": 1.0106115341186523, "learning_rate": 7.993398970716375e-06, "loss": 0.3923, "step": 802 },
{ "epoch": 1.7924107142857144, "grad_norm": 1.1234852075576782, "learning_rate": 7.968665077701253e-06, "loss": 0.4678, "step": 803 },
{ "epoch": 1.7946428571428572, "grad_norm": 1.1420475244522095, "learning_rate": 7.943944139771784e-06, "loss": 0.4642, "step": 804 },
{ "epoch": 1.796875, "grad_norm": 1.1545650959014893, "learning_rate": 7.919236314588759e-06, "loss": 0.4387, "step": 805 },
{ "epoch": 1.7991071428571428, "grad_norm": 1.2602638006210327, "learning_rate": 7.894541759729344e-06, "loss": 0.4275, "step": 806 },
{ "epoch": 1.8013392857142856, "grad_norm": 1.1043881177902222, "learning_rate": 7.869860632686059e-06, "loss": 0.4424, "step": 807 },
{ "epoch": 1.8035714285714286, "grad_norm": 1.2350860834121704, "learning_rate": 7.845193090865807e-06, "loss": 0.4516, "step": 808 },
{ "epoch": 1.8058035714285714, "grad_norm": 1.1565461158752441, "learning_rate": 7.820539291588825e-06, "loss": 0.4463, "step": 809 },
{ "epoch": 1.8080357142857144, "grad_norm": 1.1159425973892212, "learning_rate": 7.795899392087728e-06, "loss": 0.4668, "step": 810 },
{ "epoch": 1.8102678571428572, "grad_norm": 1.323809027671814, "learning_rate": 7.771273549506466e-06, "loss": 0.475, "step": 811 },
{ "epoch": 1.8125, "grad_norm": 1.0323381423950195, "learning_rate": 7.746661920899351e-06, "loss": 0.4279, "step": 812 },
{ "epoch": 1.8147321428571428, "grad_norm": 1.3589287996292114, "learning_rate": 7.72206466323004e-06, "loss": 0.496, "step": 813 },
{ "epoch": 1.8169642857142856, "grad_norm": 1.098157286643982, "learning_rate": 7.697481933370535e-06, "loss": 0.5262, "step": 814 },
{ "epoch": 1.8191964285714286, "grad_norm": 1.0474650859832764, "learning_rate": 7.672913888100187e-06, "loss": 0.4107, "step": 815 },
{ "epoch": 1.8214285714285714, "grad_norm": 1.034593939781189, "learning_rate": 7.648360684104695e-06, "loss": 0.464, "step": 816 },
{ "epoch": 1.8236607142857144, "grad_norm": 1.3255833387374878, "learning_rate": 7.623822477975105e-06, "loss": 0.4558, "step": 817 },
{ "epoch": 1.8258928571428572, "grad_norm": 0.9907006025314331, "learning_rate": 7.599299426206812e-06, "loss": 0.3483, "step": 818 },
{ "epoch": 1.828125, "grad_norm": 1.1865644454956055, "learning_rate": 7.574791685198563e-06, "loss": 0.4634, "step": 819 },
{ "epoch": 1.8303571428571428, "grad_norm": 1.2235808372497559, "learning_rate": 7.550299411251461e-06, "loss": 0.4313, "step": 820 },
{ "epoch": 1.8325892857142856, "grad_norm": 1.1662776470184326, "learning_rate": 7.52582276056796e-06, "loss": 0.4313, "step": 821 },
{ "epoch": 1.8348214285714286, "grad_norm": 1.1542125940322876, "learning_rate": 7.501361889250882e-06, "loss": 0.4432, "step": 822 },
{ "epoch": 1.8370535714285714, "grad_norm": 1.1531603336334229, "learning_rate": 7.4769169533024055e-06, "loss": 0.4444, "step": 823 },
{ "epoch": 1.8392857142857144, "grad_norm": 1.059193730354309, "learning_rate": 7.452488108623089e-06, "loss": 0.4356, "step": 824 },
{ "epoch": 1.8415178571428572, "grad_norm": 0.994286835193634, "learning_rate": 7.428075511010858e-06, "loss": 0.4322, "step": 825 },
{ "epoch": 1.84375, "grad_norm": 1.0639030933380127, "learning_rate": 7.403679316160024e-06, "loss": 0.5315, "step": 826 },
{ "epoch": 1.8459821428571428, "grad_norm": 0.840054452419281, "learning_rate": 7.379299679660299e-06, "loss": 0.3606, "step": 827 },
{ "epoch": 1.8482142857142856, "grad_norm": 1.074033498764038, "learning_rate": 7.354936756995766e-06, "loss": 0.4659, "step": 828 },
{ "epoch": 1.8504464285714286, "grad_norm": 1.2196365594863892, "learning_rate": 7.3305907035439404e-06, "loss": 0.4919, "step": 829 },
{ "epoch": 1.8526785714285714, "grad_norm": 1.1700623035430908, "learning_rate": 7.3062616745747325e-06, "loss": 0.445, "step": 830 },
{ "epoch": 1.8549107142857144, "grad_norm": 1.149138331413269, "learning_rate": 7.281949825249495e-06, "loss": 0.4704, "step": 831 },
{ "epoch": 1.8571428571428572, "grad_norm": 1.0726150274276733, "learning_rate": 7.257655310619996e-06, "loss": 0.4275, "step": 832 },
{ "epoch": 1.859375, "grad_norm": 1.2384564876556396, "learning_rate": 7.233378285627459e-06, "loss": 0.4262, "step": 833 },
{ "epoch": 1.8616071428571428, "grad_norm": 1.1791836023330688, "learning_rate": 7.209118905101575e-06, "loss": 0.5294, "step": 834 },
{ "epoch": 1.8638392857142856, "grad_norm": 1.1020606756210327, "learning_rate": 7.184877323759482e-06, "loss": 0.4688, "step": 835 },
{ "epoch": 1.8660714285714286, "grad_norm": 1.0698236227035522, "learning_rate": 7.16065369620483e-06, "loss": 0.4932, "step": 836 },
{ "epoch": 1.8683035714285714, "grad_norm": 1.3022637367248535, "learning_rate": 7.136448176926736e-06, "loss": 0.4702, "step": 837 },
{ "epoch": 1.8705357142857144, "grad_norm": 1.197245478630066, "learning_rate": 7.112260920298859e-06, "loss": 0.5103, "step": 838 },
{ "epoch": 1.8727678571428572, "grad_norm": 1.157585859298706, "learning_rate": 7.088092080578357e-06, "loss": 0.5016, "step": 839 },
{ "epoch": 1.875, "grad_norm": 1.1110048294067383, "learning_rate": 7.063941811904956e-06, "loss": 0.4405, "step": 840 },
{ "epoch": 1.8772321428571428, "grad_norm": 1.014962077140808, "learning_rate": 7.039810268299934e-06, "loss": 0.3925, "step": 841 },
{ "epoch": 1.8794642857142856, "grad_norm": 1.1455286741256714, "learning_rate": 7.015697603665141e-06, "loss": 0.4581, "step": 842 },
{ "epoch": 1.8816964285714286, "grad_norm": 1.2424312829971313, "learning_rate": 6.991603971782035e-06, "loss": 0.5135, "step": 843 },
{ "epoch": 1.8839285714285714, "grad_norm": 1.0072928667068481, "learning_rate": 6.967529526310681e-06, "loss": 0.3755, "step": 844 },
{ "epoch": 1.8861607142857144, "grad_norm": 1.1363320350646973, "learning_rate": 6.943474420788788e-06, "loss": 0.4542, "step": 845 },
{ "epoch": 1.8883928571428572, "grad_norm": 1.0867626667022705, "learning_rate": 6.919438808630716e-06, "loss": 0.4454, "step": 846 },
{ "epoch": 1.890625, "grad_norm": 1.0299354791641235, "learning_rate": 6.895422843126507e-06, "loss": 0.4268, "step": 847 },
{ "epoch": 1.8928571428571428, "grad_norm": 1.2059564590454102, "learning_rate": 6.871426677440907e-06, "loss": 0.511, "step": 848 },
{ "epoch": 1.8950892857142856, "grad_norm": 1.1440831422805786, "learning_rate": 6.847450464612378e-06, "loss": 0.4773, "step": 849 },
{ "epoch": 1.8973214285714286, "grad_norm": 1.1981797218322754, "learning_rate": 6.8234943575521365e-06, "loss": 0.4979, "step": 850 },
{ "epoch": 1.8995535714285714, "grad_norm": 1.140823483467102, "learning_rate": 6.799558509043169e-06, "loss": 0.4491, "step": 851 },
{ "epoch": 1.9017857142857144, "grad_norm": 1.1695003509521484, "learning_rate": 6.775643071739267e-06, "loss": 0.4302, "step": 852 },
{ "epoch": 1.9040178571428572, "grad_norm": 1.292051911354065, "learning_rate": 6.751748198164036e-06, "loss": 0.5915, "step": 853 },
{ "epoch": 1.90625, "grad_norm": 1.060410499572754, "learning_rate": 6.727874040709943e-06, "loss": 0.4208, "step": 854 },
{ "epoch": 1.9084821428571428, "grad_norm": 1.1094176769256592, "learning_rate": 6.704020751637333e-06, "loss": 0.4261, "step": 855 },
{ "epoch": 1.9107142857142856, "grad_norm": 1.1401662826538086, "learning_rate": 6.680188483073458e-06, "loss": 0.3836, "step": 856 },
{ "epoch": 1.9129464285714286, "grad_norm": 1.1735782623291016, "learning_rate": 6.6563773870115135e-06, "loss": 0.4362, "step": 857 },
{ "epoch": 1.9151785714285714, "grad_norm": 0.9720476269721985, "learning_rate": 6.632587615309658e-06, "loss": 0.4288, "step": 858 },
{ "epoch": 1.9174107142857144, "grad_norm": 1.134281039237976, "learning_rate": 6.608819319690059e-06, "loss": 0.434, "step": 859 },
{ "epoch": 1.9196428571428572, "grad_norm": 0.9402589201927185, "learning_rate": 6.585072651737911e-06, "loss": 0.4452, "step": 860 },
{ "epoch": 1.921875, "grad_norm": 1.0265930891036987, "learning_rate": 6.56134776290048e-06, "loss": 0.4111, "step": 861 },
{ "epoch": 1.9241071428571428, "grad_norm": 1.001212477684021, "learning_rate": 6.537644804486136e-06, "loss": 0.4677, "step": 862 },
{ "epoch": 1.9263392857142856, "grad_norm": 1.193760871887207, "learning_rate": 6.513963927663372e-06, "loss": 0.4496, "step": 863 },
{ "epoch": 1.9285714285714286, "grad_norm": 1.094336986541748, "learning_rate": 6.49030528345987e-06, "loss": 0.403, "step": 864 },
{ "epoch": 1.9308035714285714, "grad_norm": 1.051220417022705, "learning_rate": 6.466669022761506e-06, "loss": 0.4085, "step": 865 },
{ "epoch": 1.9330357142857144, "grad_norm": 1.0897574424743652, "learning_rate": 6.443055296311413e-06, "loss": 0.4779, "step": 866 },
{ "epoch": 1.9352678571428572, "grad_norm": 1.124507188796997, "learning_rate": 6.4194642547090016e-06, "loss": 0.474, "step": 867 },
{ "epoch": 1.9375, "grad_norm": 1.1396455764770508, "learning_rate": 6.3958960484090094e-06, "loss": 0.4122, "step": 868 },
{ "epoch": 1.9397321428571428, "grad_norm": 0.9783452153205872, "learning_rate": 6.37235082772055e-06, "loss": 0.4359, "step": 869 },
{ "epoch": 1.9419642857142856, "grad_norm": 1.2163567543029785, "learning_rate": 6.348828742806122e-06, "loss": 0.4256, "step": 870 },
{ "epoch": 1.9441964285714286, "grad_norm": 1.1555063724517822, "learning_rate": 6.325329943680689e-06, "loss": 0.4604, "step": 871 },
{ "epoch": 1.9464285714285714, "grad_norm": 1.1841658353805542, "learning_rate": 6.3018545802107e-06, "loss": 0.4478, "step": 872 },
{ "epoch": 1.9486607142857144, "grad_norm": 1.2992147207260132, "learning_rate": 6.278402802113146e-06, "loss": 0.4252, "step": 873 },
{ "epoch": 1.9508928571428572, "grad_norm": 1.136289358139038, "learning_rate": 6.25497475895459e-06, "loss": 0.4876, "step": 874 },
{ "epoch": 1.953125, "grad_norm": 1.138102412223816, "learning_rate": 6.2315706001502305e-06, "loss": 0.446, "step": 875 },
{ "epoch": 1.9553571428571428, "grad_norm": 1.0559848546981812, "learning_rate": 6.208190474962945e-06, "loss": 0.4242, "step": 876 },
{ "epoch": 1.9575892857142856, "grad_norm": 1.1156829595565796, "learning_rate": 6.184834532502315e-06, "loss": 0.4874, "step": 877 },
{ "epoch": 1.9598214285714286, "grad_norm": 1.0015919208526611, "learning_rate": 6.161502921723719e-06, "loss": 0.4157, "step": 878 },
{ "epoch": 1.9620535714285714, "grad_norm": 1.1571860313415527, "learning_rate": 6.138195791427329e-06, "loss": 0.4177, "step": 879 },
{ "epoch": 1.9642857142857144, "grad_norm": 1.3629335165023804, "learning_rate": 6.114913290257219e-06, "loss": 0.4605, "step": 880 },
{ "epoch": 1.9665178571428572, "grad_norm": 1.03495454788208, "learning_rate": 6.091655566700359e-06, "loss": 0.443, "step": 881 },
{ "epoch": 1.96875, "grad_norm": 1.1198673248291016, "learning_rate": 6.068422769085722e-06, "loss": 0.3935, "step": 882 },
{ "epoch": 1.9709821428571428, "grad_norm": 1.0860130786895752, "learning_rate": 6.045215045583301e-06, "loss": 0.4203, "step": 883 },
{ "epoch": 1.9732142857142856, "grad_norm": 1.050460696220398, "learning_rate": 6.0220325442031714e-06, "loss": 0.4028, "step": 884 },
{ "epoch": 1.9754464285714286, "grad_norm": 1.0525908470153809, "learning_rate": 5.998875412794562e-06, "loss": 0.3915, "step": 885 },
{ "epoch": 1.9776785714285714, "grad_norm": 1.0617356300354004, "learning_rate": 5.975743799044894e-06, "loss": 0.4651, "step": 886 },
{ "epoch": 1.9799107142857144, "grad_norm": 0.9719341993331909, "learning_rate": 5.952637850478852e-06, "loss": 0.4276, "step": 887 },
{ "epoch": 1.9821428571428572, "grad_norm": 1.2568265199661255, "learning_rate": 5.929557714457425e-06, "loss": 0.5201, "step": 888 },
{ "epoch": 1.984375, "grad_norm": 1.1193770170211792, "learning_rate": 5.906503538176999e-06, "loss": 0.4661, "step": 889 },
{ "epoch": 1.9866071428571428, "grad_norm": 1.1484289169311523, "learning_rate": 5.883475468668387e-06, "loss": 0.4808, "step": 890 },
{ "epoch": 1.9888392857142856, "grad_norm": 0.956321120262146, "learning_rate": 5.860473652795901e-06, "loss": 0.4322, "step": 891 },
{ "epoch": 1.9910714285714286, "grad_norm": 1.0034775733947754, "learning_rate": 5.8374982372564255e-06, "loss": 0.3559, "step": 892 },
{ "epoch": 1.9933035714285714, "grad_norm": 1.0558040142059326, "learning_rate": 5.814549368578464e-06, "loss": 0.4988, "step": 893 },
{ "epoch": 1.9955357142857144, "grad_norm": 1.1722464561462402, "learning_rate": 5.7916271931212185e-06, "loss": 0.4951, "step": 894 },
{ "epoch": 1.9977678571428572, "grad_norm": 1.1077440977096558, "learning_rate": 5.768731857073657e-06, "loss": 0.449, "step": 895 },
{ "epoch": 2.0, "grad_norm": 1.0378532409667969, "learning_rate": 5.745863506453569e-06, "loss": 0.3514, "step": 896 },
{ "epoch": 2.0, "eval_loss": 0.5705999732017517, "eval_runtime": 29.1399, "eval_samples_per_second": 2.505, "eval_steps_per_second": 0.343, "step": 896 },
{ "epoch": 2.002232142857143, "grad_norm": 1.1312880516052246, "learning_rate": 5.7230222871066475e-06, "loss": 0.3434, "step": 897 },
{ "epoch": 2.0044642857142856, "grad_norm": 1.2804909944534302, "learning_rate": 5.700208344705537e-06, "loss": 0.3325, "step": 898 },
{ "epoch": 2.0066964285714284, "grad_norm": 1.2890160083770752, "learning_rate": 5.677421824748946e-06, "loss": 0.3234, "step": 899 },
{ "epoch": 2.0089285714285716, "grad_norm": 1.3255733251571655, "learning_rate": 5.6546628725606675e-06, "loss": 0.3237, "step": 900 },
{ "epoch": 2.0111607142857144, "grad_norm": 1.2110278606414795, "learning_rate": 5.631931633288696e-06, "loss": 0.303, "step": 901 },
{ "epoch": 2.013392857142857, "grad_norm": 0.9276548624038696, "learning_rate": 5.609228251904265e-06, "loss": 0.3032, "step": 902 },
{ "epoch": 2.015625, "grad_norm": 1.1542302370071411, "learning_rate": 5.586552873200963e-06, "loss": 0.3109, "step": 903 },
{ "epoch": 2.017857142857143, "grad_norm": 1.1671963930130005, "learning_rate": 5.563905641793776e-06, "loss": 0.3398, "step": 904 },
{ "epoch": 2.0200892857142856, "grad_norm": 1.0253148078918457, "learning_rate": 5.541286702118174e-06, "loss": 0.3085, "step": 905 },
{ "epoch": 2.0223214285714284, "grad_norm": 1.112776517868042, "learning_rate": 5.518696198429201e-06, "loss": 0.3137, "step": 906 },
{ "epoch": 2.0245535714285716, "grad_norm": 1.0918265581130981, "learning_rate": 5.496134274800533e-06, "loss": 0.3044, "step": 907 },
{ "epoch": 2.0267857142857144, "grad_norm": 1.1833134889602661, "learning_rate": 5.473601075123599e-06, "loss": 0.3135, "step": 908 },
{ "epoch": 2.029017857142857, "grad_norm": 1.2949035167694092, "learning_rate": 5.451096743106611e-06, "loss": 0.3631, "step": 909 },
{ "epoch": 2.03125, "grad_norm": 1.2424516677856445, "learning_rate": 5.428621422273687e-06, "loss": 0.327, "step": 910 },
{ "epoch": 2.033482142857143, "grad_norm": 1.102992057800293, "learning_rate": 5.406175255963923e-06, "loss": 0.2871, "step": 911 },
{ "epoch": 2.0357142857142856, "grad_norm": 1.140796184539795, "learning_rate": 5.383758387330476e-06, "loss": 0.3021, "step": 912 },
{ "epoch": 2.0379464285714284, "grad_norm": 1.1115362644195557, "learning_rate": 5.3613709593396545e-06, "loss": 0.3066, "step": 913 },
{ "epoch": 2.0401785714285716, "grad_norm": 1.0036022663116455, "learning_rate": 5.3390131147699995e-06, "loss": 0.2789, "step": 914 },
{ "epoch": 2.0424107142857144, "grad_norm": 1.0242892503738403, "learning_rate": 5.3166849962113886e-06, "loss": 0.2943, "step": 915 },
{ "epoch": 2.044642857142857, "grad_norm": 1.0714552402496338, "learning_rate": 5.294386746064115e-06, "loss": 0.265, "step": 916 },
{ "epoch": 2.046875, "grad_norm": 1.2583504915237427, "learning_rate": 5.272118506537982e-06, "loss": 0.3399, "step": 917 },
{ "epoch": 2.049107142857143, "grad_norm": 1.0679799318313599, "learning_rate": 5.249880419651403e-06, "loss": 0.3327, "step": 918 },
{ "epoch": 2.0513392857142856, "grad_norm": 1.1574690341949463, "learning_rate": 5.2276726272304724e-06, "loss": 0.3007, "step": 919 },
{ "epoch": 2.0535714285714284, "grad_norm": 0.8751293420791626, "learning_rate": 5.205495270908094e-06, "loss": 0.2863, "step": 920 },
{ "epoch": 2.0558035714285716, "grad_norm": 1.0779941082000732, "learning_rate": 5.183348492123056e-06, "loss": 0.3201, "step": 921 },
{ "epoch": 2.0580357142857144, "grad_norm": 1.1002516746520996, "learning_rate": 5.16123243211914e-06, "loss": 0.3093, "step": 922 },
{ "epoch": 2.060267857142857, "grad_norm": 1.1461528539657593, "learning_rate": 5.1391472319442016e-06, "loss": 0.3172, "step": 923 },
{ "epoch": 2.0625, "grad_norm": 1.1231317520141602, "learning_rate": 5.117093032449297e-06, "loss": 0.3279, "step": 924 },
{ "epoch": 2.064732142857143, "grad_norm": 1.1722527742385864, "learning_rate": 5.0950699742877645e-06, "loss": 0.3203, "step": 925 },
{ "epoch": 2.0669642857142856, "grad_norm": 1.0735809803009033, "learning_rate": 5.073078197914341e-06, "loss": 0.3336, "step": 926 },
{ "epoch": 2.0691964285714284, "grad_norm": 1.074890375137329, "learning_rate": 5.0511178435842565e-06, "loss": 0.3264, "step": 927 },
{ "epoch": 2.0714285714285716, "grad_norm": 0.9348042011260986, "learning_rate": 5.029189051352339e-06, "loss": 0.2924, "step": 928 },
{ "epoch": 2.0736607142857144, "grad_norm": 1.1553387641906738, "learning_rate": 5.007291961072133e-06, "loss": 0.2878, "step": 929 },
{ "epoch": 2.075892857142857, "grad_norm": 1.0851117372512817, "learning_rate": 4.985426712394994e-06, "loss": 0.3376, "step": 930 },
{ "epoch": 2.078125, "grad_norm": 0.9342989921569824, "learning_rate": 4.963593444769207e-06, "loss": 0.2701, "step": 931 },
{ "epoch": 2.080357142857143, "grad_norm": 1.141180157661438, "learning_rate": 4.941792297439098e-06, "loss": 0.2681, "step": 932 },
{ "epoch": 2.0825892857142856, "grad_norm": 1.2393943071365356, "learning_rate": 4.920023409444128e-06, "loss": 0.3901, "step": 933 },
{ "epoch": 2.0848214285714284, "grad_norm": 1.389803409576416, "learning_rate": 4.898286919618034e-06, "loss": 0.3377, "step": 934 },
{ "epoch": 2.0870535714285716, "grad_norm": 1.1894795894622803, "learning_rate": 4.876582966587924e-06, "loss": 0.3204, "step": 935 },
{ "epoch": 2.0892857142857144, "grad_norm": 1.2430485486984253, "learning_rate": 4.8549116887734045e-06, "loss": 0.3155, "step": 936 },
{ "epoch": 2.091517857142857, "grad_norm": 1.2486804723739624, "learning_rate": 4.833273224385678e-06, "loss": 0.3485, "step": 937 },
{ "epoch": 2.09375, "grad_norm": 0.9740699529647827, "learning_rate": 4.811667711426686e-06, "loss": 0.2882, "step": 938 },
{ "epoch": 2.095982142857143, "grad_norm": 1.1677557229995728, "learning_rate": 4.790095287688227e-06, "loss": 0.3942, "step": 939 },
{ "epoch": 2.0982142857142856, "grad_norm": 1.2111822366714478, "learning_rate": 4.7685560907510465e-06, "loss": 0.2916, "step": 940 },
{ "epoch": 2.1004464285714284, "grad_norm": 1.1940852403640747, "learning_rate": 4.747050257984002e-06, "loss": 0.2918, "step": 941 },
{ "epoch": 2.1026785714285716, "grad_norm": 0.9879006743431091, "learning_rate": 4.725577926543151e-06, "loss": 0.2781, "step": 942 },
{ "epoch": 2.1049107142857144, "grad_norm": 1.0214678049087524, "learning_rate": 4.704139233370905e-06, "loss": 0.3141, "step": 943 },
{ "epoch": 2.107142857142857, "grad_norm": 0.9733462333679199, "learning_rate": 4.682734315195138e-06, "loss": 0.3298, "step": 944 },
{ "epoch": 2.109375, "grad_norm": 1.053040623664856, "learning_rate": 4.661363308528319e-06, "loss": 0.2853, "step": 945 },
{ "epoch": 2.111607142857143, "grad_norm": 1.1272430419921875, "learning_rate": 4.640026349666651e-06, "loss": 0.3328, "step": 946 },
{ "epoch": 2.1138392857142856, "grad_norm": 1.0852495431900024, "learning_rate": 4.61872357468917e-06, "loss": 0.2992, "step": 947 },
{ "epoch": 2.1160714285714284, "grad_norm": 1.0410633087158203, "learning_rate": 4.5974551194569336e-06, "loss": 0.3085, "step": 948 },
{ "epoch": 2.1183035714285716, "grad_norm": 1.071105718612671, "learning_rate": 4.576221119612091e-06, "loss": 0.338, "step": 949 },
{ "epoch": 2.1205357142857144, "grad_norm": 0.9693806767463684, "learning_rate": 4.555021710577068e-06, "loss": 0.3483, "step": 950 },
{ "epoch": 2.122767857142857, "grad_norm": 1.1795443296432495, "learning_rate": 4.533857027553663e-06, "loss": 0.3496, "step": 951 },
{ "epoch": 2.125, "grad_norm": 1.0642222166061401, "learning_rate": 4.51272720552223e-06, "loss": 0.2831, "step": 952 },
{ "epoch": 2.127232142857143, "grad_norm": 1.088563084602356, "learning_rate": 4.49163237924078e-06, "loss": 0.3134, "step": 953 },
{ "epoch": 2.1294642857142856, "grad_norm": 1.0107040405273438, "learning_rate": 4.470572683244127e-06, "loss": 0.2798, "step": 954 },
{ "epoch": 2.1316964285714284, "grad_norm": 1.1377826929092407, "learning_rate": 4.449548251843048e-06, "loss": 0.2932, "step": 955 },
{ "epoch": 2.1339285714285716, "grad_norm": 1.1572898626327515, "learning_rate": 4.4285592191234125e-06, "loss": 0.31, "step": 956 },
{ "epoch": 2.1361607142857144, "grad_norm": 1.0224123001098633, "learning_rate": 4.4076057189453325e-06, "loss": 0.2888, "step": 957 },
{ "epoch": 2.138392857142857, "grad_norm": 1.0553392171859741, "learning_rate": 4.386687884942307e-06, "loss": 0.2955, "step": 958 },
{ "epoch": 2.140625, "grad_norm": 1.226377248764038, "learning_rate": 4.365805850520362e-06, "loss": 0.3023, "step": 959 },
{ "epoch": 2.142857142857143, "grad_norm": 1.1049394607543945, "learning_rate": 4.344959748857215e-06, "loss": 0.2934, "step": 960 },
{ "epoch": 2.1450892857142856, "grad_norm": 1.1788314580917358, "learning_rate": 4.324149712901417e-06, "loss": 0.3623, "step": 961 },
{ "epoch": 2.1473214285714284, "grad_norm": 0.9973257184028625, "learning_rate": 4.3033758753715095e-06, "loss": 0.2857, "step": 962 },
{ "epoch": 2.1495535714285716, "grad_norm": 1.0208550691604614, "learning_rate": 4.282638368755161e-06, "loss": 0.2855, "step": 963 },
{ "epoch": 2.1517857142857144, "grad_norm": 1.0481334924697876, "learning_rate": 4.261937325308347e-06, "loss": 0.351, "step": 964 },
{ "epoch": 2.154017857142857, "grad_norm": 1.0952095985412598, "learning_rate": 4.241272877054489e-06, "loss": 0.3035, "step": 965 },
{ "epoch": 2.15625, "grad_norm": 1.1130040884017944, "learning_rate": 4.2206451557836235e-06, "loss": 0.298, "step": 966 },
{ "epoch": 2.158482142857143, "grad_norm": 1.0529165267944336, "learning_rate": 4.200054293051556e-06, "loss": 0.339, "step": 967 },
{ "epoch": 2.1607142857142856, "grad_norm": 0.9979656934738159, "learning_rate": 4.179500420179011e-06, "loss": 0.354, "step": 968 },
{ "epoch": 2.1629464285714284, "grad_norm": 1.0694448947906494, "learning_rate": 4.158983668250819e-06, "loss": 0.3054, "step": 969 },
{ "epoch": 2.1651785714285716, "grad_norm": 0.9863750338554382, "learning_rate": 4.138504168115059e-06, "loss": 0.2886, "step": 970 },
{ "epoch": 2.1674107142857144, "grad_norm": 1.0251253843307495, "learning_rate": 4.11806205038224e-06, "loss": 0.2786, "step": 971 },
{ "epoch": 2.169642857142857, "grad_norm": 1.2727429866790771, "learning_rate": 4.097657445424454e-06, "loss": 0.3197, "step": 972 },
{ "epoch": 2.171875, "grad_norm": 1.0682674646377563, "learning_rate": 4.077290483374549e-06, "loss": 0.2598, "step": 973 },
{ "epoch": 2.174107142857143, "grad_norm": 1.122412085533142, "learning_rate": 4.056961294125305e-06, "loss": 0.2843, "step": 974 },
{ "epoch": 2.1763392857142856, "grad_norm": 1.1398166418075562, "learning_rate": 4.0366700073286005e-06, "loss": 0.321, "step": 975 },
{ "epoch": 2.1785714285714284, "grad_norm": 1.2503103017807007, "learning_rate": 4.016416752394591e-06, "loss": 0.3992, "step": 976 },
{ "epoch": 2.1808035714285716, "grad_norm": 1.1967464685440063, "learning_rate": 3.996201658490866e-06, "loss": 0.3086, "step": 977 },
{ "epoch": 2.1830357142857144, "grad_norm": 1.0485783815383911, "learning_rate": 3.9760248545416465e-06, "loss": 0.2849, "step": 978 },
{ "epoch": 2.185267857142857, "grad_norm": 1.1187446117401123, "learning_rate": 3.955886469226967e-06, "loss": 0.2947, "step": 979 },
{ "epoch": 2.1875, "grad_norm": 1.0842468738555908, "learning_rate": 3.935786630981819e-06, "loss": 0.2789, "step": 980 },
{ "epoch": 2.189732142857143, "grad_norm": 0.9847047924995422, "learning_rate": 3.915725467995375e-06, "loss": 0.2868, "step": 981 },
{ "epoch": 2.1919642857142856, "grad_norm": 1.0602375268936157, "learning_rate": 3.895703108210135e-06, "loss": 0.3282, "step": 982 },
{ "epoch": 2.1941964285714284, "grad_norm": 0.9967721700668335, "learning_rate": 3.875719679321138e-06, "loss": 0.3096, "step": 983 },
{ "epoch": 2.1964285714285716, "grad_norm": 1.177323818206787, "learning_rate": 3.8557753087751345e-06, "loss": 0.3515, "step": 984 },
{ "epoch": 2.1986607142857144, "grad_norm": 0.9941040277481079, "learning_rate": 3.835870123769775e-06, "loss": 0.3254, "step": 985 },
{ "epoch": 2.200892857142857, "grad_norm": 1.2178453207015991, "learning_rate": 3.8160042512528e-06, "loss": 0.3142, "step": 986 },
{ "epoch": 2.203125, "grad_norm": 1.2556642293930054, "learning_rate": 3.796177817921223e-06, "loss": 0.3206, "step": 987 },
{ "epoch": 2.205357142857143, "grad_norm": 1.0012449026107788, "learning_rate": 3.776390950220544e-06, "loss": 0.3497, "step": 988 },
{ "epoch": 2.2075892857142856, "grad_norm": 1.051226258277893, "learning_rate": 3.756643774343913e-06, "loss": 0.381, "step": 989 },
{ "epoch": 2.2098214285714284, "grad_norm": 0.9949240684509277, "learning_rate": 3.7369364162313528e-06, "loss": 0.3661, "step": 990 },
{ "epoch": 2.2120535714285716, "grad_norm": 1.0959502458572388, "learning_rate": 3.7172690015689263e-06, "loss": 0.3618, "step": 991 },
{ "epoch": 2.2142857142857144, "grad_norm": 1.028232455253601, "learning_rate": 3.6976416557879757e-06, "loss": 0.2777, "step": 992 },
{ "epoch": 2.216517857142857, "grad_norm": 1.055541753768921, "learning_rate": 3.678054504064287e-06, "loss": 0.3042, "step": 993 },
{ "epoch": 2.21875, "grad_norm": 1.1653730869293213, "learning_rate": 3.658507671317296e-06, "loss": 0.3297, "step": 994 },
{ "epoch": 2.220982142857143, "grad_norm": 1.1325184106826782, "learning_rate": 3.639001282209311e-06, "loss": 0.3129, "step": 995 },
{ "epoch": 2.2232142857142856, "grad_norm": 1.1521151065826416, "learning_rate": 3.6195354611447033e-06, "loss": 0.314, "step": 996 },
{ "epoch": 2.2254464285714284, "grad_norm": 1.1670697927474976, "learning_rate": 3.600110332269118e-06, "loss": 0.2789, "step": 997 },
{ "epoch": 2.2276785714285716, "grad_norm": 1.0550611019134521, "learning_rate": 3.580726019468671e-06, "loss": 0.3384, "step": 998 },
{ "epoch": 2.2299107142857144, "grad_norm": 1.1587961912155151, "learning_rate": 3.561382646369179e-06, "loss": 0.344, "step": 999 },
{ "epoch": 2.232142857142857, "grad_norm": 1.1734824180603027, "learning_rate": 3.5420803363353604e-06, "loss": 0.3755, "step": 1000 },
{ "epoch": 2.234375, "grad_norm": 1.101392149925232, "learning_rate": 3.5228192124700433e-06, "loss": 0.3274, "step": 1001 },
{ "epoch": 2.236607142857143, "grad_norm": 1.3699195384979248, "learning_rate": 3.503599397613394e-06, "loss": 0.2845, "step": 1002 },
{ "epoch": 2.2388392857142856, "grad_norm": 1.0058083534240723, "learning_rate": 3.4844210143421143e-06, "loss": 0.3133, "step": 1003 },
{ "epoch": 2.2410714285714284, "grad_norm": 1.107692837715149, "learning_rate": 3.465284184968679e-06, "loss": 0.3459, "step": 1004 },
{ "epoch": 2.2433035714285716, "grad_norm": 1.0924514532089233, "learning_rate": 3.4461890315405466e-06, "loss": 0.3195, "step": 1005 },
{ "epoch": 2.2455357142857144, "grad_norm": 1.1005483865737915, "learning_rate": 3.4271356758393827e-06, "loss": 0.288, "step": 1006 },
{ "epoch": 2.247767857142857, "grad_norm": 1.0790880918502808, "learning_rate": 3.4081242393802847e-06, "loss": 0.3078, "step": 1007 },
{ "epoch": 2.25, "grad_norm": 1.0338419675827026, "learning_rate": 3.3891548434109942e-06, "loss": 0.3592, "step": 1008 },
{ "epoch": 2.25, "eval_loss": 0.6482473611831665, "eval_runtime": 32.6288, "eval_samples_per_second": 2.237, "eval_steps_per_second": 0.306, "step": 1008 },
{ "epoch": 2.252232142857143, "grad_norm": 1.1008427143096924, "learning_rate": 3.3702276089111484e-06, "loss": 0.3108, "step": 1009 },
{ "epoch": 2.2544642857142856, "grad_norm": 1.160321831703186, "learning_rate": 3.3513426565914854e-06, "loss": 0.3412, "step": 1010 },
{ "epoch": 2.2566964285714284, "grad_norm": 1.04502534866333, "learning_rate": 3.3325001068930917e-06, "loss": 0.306, "step": 1011 },
{ "epoch": 2.2589285714285716, "grad_norm": 1.108601689338684, "learning_rate": 3.3137000799866148e-06, "loss": 0.2485, "step": 1012 },
{ "epoch": 2.2611607142857144, "grad_norm": 1.0433441400527954, "learning_rate": 3.2949426957715157e-06, "loss": 0.3673, "step": 1013 },
{ "epoch": 2.263392857142857, "grad_norm": 1.0202276706695557, "learning_rate": 3.276228073875296e-06, "loss": 0.3336, "step": 1014 },
{ "epoch": 2.265625, "grad_norm": 1.3703482151031494, "learning_rate": 3.257556333652734e-06, "loss": 0.287, "step": 1015 },
{ "epoch": 2.267857142857143, "grad_norm": 1.1855953931808472, "learning_rate": 3.238927594185127e-06, "loss": 0.334, "step": 1016 },
{ "epoch": 2.2700892857142856, "grad_norm": 1.0647932291030884, "learning_rate": 3.2203419742795237e-06, "loss": 0.3158, "step": 1017 },
{ "epoch": 2.2723214285714284, "grad_norm": 1.2551442384719849, "learning_rate": 3.201799592467978e-06, "loss": 0.34, "step": 1018 },
{ "epoch": 2.2745535714285716, "grad_norm": 1.2371532917022705, "learning_rate": 3.1833005670067874e-06, "loss": 0.3305, "step": 1019 },
{ "epoch": 2.2767857142857144, "grad_norm": 1.0982019901275635, "learning_rate": 3.1648450158757373e-06, "loss": 0.3204, "step": 1020 },
{ "epoch": 2.279017857142857, "grad_norm": 1.1620010137557983, "learning_rate": 3.146433056777355e-06, "loss": 0.3213, "step": 1021 },
{ "epoch": 2.28125, "grad_norm": 1.0344820022583008, "learning_rate": 3.128064807136142e-06, "loss": 0.2553, "step": 1022 },
{ "epoch": 2.283482142857143, "grad_norm": 0.9919111132621765, "learning_rate": 3.10974038409785e-06, "loss": 0.2734, "step": 1023 },
{ "epoch": 2.2857142857142856, "grad_norm": 1.1597861051559448, "learning_rate": 3.0914599045287165e-06, "loss": 0.2914, "step": 1024 },
{ "epoch": 2.2879464285714284, "grad_norm": 1.2358967065811157, "learning_rate": 3.073223485014727e-06, "loss": 0.3175, "step": 1025 },
{ "epoch": 2.2901785714285716, "grad_norm": 1.172170877456665, "learning_rate": 3.0550312418608617e-06, "loss": 0.3634, "step": 1026 },
{ "epoch": 2.2924107142857144, "grad_norm": 1.1412909030914307, "learning_rate": 3.0368832910903625e-06, "loss": 0.3065, "step": 1027 },
{ "epoch": 2.294642857142857, "grad_norm": 1.2911455631256104, "learning_rate": 3.018779748444005e-06, "loss": 0.3507, "step": 1028 },
{ "epoch": 2.296875, "grad_norm": 1.1728956699371338, "learning_rate": 3.000720729379326e-06, "loss": 0.2747, "step": 1029 },
{ "epoch": 2.299107142857143, "grad_norm": 1.1581259965896606, "learning_rate": 2.9827063490699225e-06, "loss": 0.2607, "step": 1030 },
{ "epoch": 2.3013392857142856, "grad_norm": 1.1151872873306274, "learning_rate": 2.9647367224046884e-06, "loss": 0.3373, "step": 1031 },
{ "epoch": 2.3035714285714284, "grad_norm": 1.1207554340362549, "learning_rate": 2.9468119639871163e-06, "loss": 0.2752, "step": 1032 },
{ "epoch": 2.3058035714285716, "grad_norm": 1.0638618469238281, "learning_rate": 2.9289321881345257e-06, "loss": 0.2839, "step": 1033 },
{ "epoch": 2.3080357142857144, "grad_norm": 1.1098335981369019, "learning_rate": 2.911097508877365e-06, "loss": 0.3076, "step": 1034 },
{ "epoch": 2.310267857142857, "grad_norm": 1.0225090980529785, "learning_rate": 2.8933080399584757e-06, "loss": 0.3733, "step": 1035 },
{ "epoch": 2.3125, "grad_norm": 1.366018533706665, "learning_rate": 2.8755638948323494e-06, "loss": 0.3335, "step": 1036 },
{ "epoch": 2.314732142857143, "grad_norm": 1.2792249917984009, "learning_rate": 2.8578651866644447e-06, "loss": 0.3129, "step": 1037 },
{ "epoch": 2.3169642857142856, "grad_norm": 1.0269707441329956, "learning_rate": 2.840212028330418e-06, "loss": 0.2463, "step": 1038 },
{ "epoch": 2.3191964285714284, "grad_norm": 1.0969288349151611, "learning_rate": 2.8226045324154394e-06, "loss": 0.2923, "step": 1039 },
{ "epoch": 2.3214285714285716, "grad_norm": 1.0409966707229614, "learning_rate": 2.8050428112134474e-06, "loss": 0.3553, "step": 1040 },
{ "epoch": 2.3236607142857144, "grad_norm": 1.2081456184387207, "learning_rate": 2.7875269767264667e-06, "loss": 0.3003, "step": 1041 },
{ "epoch": 2.325892857142857, "grad_norm": 1.045913815498352, "learning_rate": 2.7700571406638633e-06, "loss": 0.2766, "step": 1042 },
{ "epoch": 2.328125, "grad_norm": 1.0660289525985718, "learning_rate": 2.7526334144416345e-06, "loss": 0.2689, "step": 1043 },
{ "epoch": 2.330357142857143, "grad_norm": 1.1659197807312012, "learning_rate": 2.735255909181719e-06, "loss": 0.2751, "step": 1044 },
{ "epoch": 2.3325892857142856, "grad_norm": 0.9567285776138306, "learning_rate": 2.7179247357112704e-06, "loss": 0.3007, "step": 1045 },
{ "epoch": 2.3348214285714284, "grad_norm": 1.0861024856567383, "learning_rate": 2.7006400045619597e-06, "loss": 0.2645, "step": 1046 },
{ "epoch": 2.3370535714285716, "grad_norm": 1.0163832902908325, "learning_rate": 2.6834018259692574e-06, "loss": 0.3232, "step": 1047 },
{ "epoch": 2.3392857142857144, "grad_norm": 1.087554931640625, "learning_rate": 2.6662103098717485e-06, "loss": 0.3177, "step": 1048 },
{ "epoch": 2.341517857142857, "grad_norm": 1.1598862409591675, "learning_rate": 2.649065565910419e-06, "loss": 0.3277, "step": 1049 },
{ "epoch": 2.34375, "grad_norm": 1.0833197832107544, "learning_rate": 2.631967703427959e-06, "loss": 0.3163, "step": 1050 },
{ "epoch": 2.345982142857143, "grad_norm": 1.1643481254577637, "learning_rate": 2.6149168314680707e-06, "loss": 0.2935, "step": 1051 },
{ "epoch": 2.3482142857142856, "grad_norm": 1.165391445159912, "learning_rate": 2.597913058774758e-06, "loss": 0.3332, "step": 1052 },
{ "epoch": 2.3504464285714284, "grad_norm": 1.015273928642273, "learning_rate": 2.5809564937916543e-06, "loss": 0.287, "step": 1053 },
{ "epoch": 2.3526785714285716, "grad_norm": 1.1589689254760742, "learning_rate": 2.564047244661316e-06, "loss": 0.3049, "step": 1054 },
{ "epoch": 2.3549107142857144, "grad_norm": 1.1673892736434937, "learning_rate": 2.547185419224537e-06, "loss": 0.2775, "step": 1055 },
{ "epoch": 2.357142857142857, "grad_norm": 1.0458134412765503, "learning_rate": 2.530371125019664e-06, "loss": 0.2951, "step": 1056 },
{ "epoch": 2.359375, "grad_norm": 1.2780659198760986, "learning_rate": 2.513604469281897e-06, "loss": 0.2946, "step": 1057 },
{ "epoch": 2.361607142857143, "grad_norm": 1.2554868459701538, "learning_rate": 2.4968855589426288e-06, "loss": 0.3346, "step": 1058 },
{ "epoch": 2.3638392857142856, "grad_norm": 1.0079058408737183, "learning_rate": 2.4802145006287425e-06, "loss": 0.2499, "step": 1059 },
{ "epoch": 2.3660714285714284, "grad_norm": 1.0529001951217651, "learning_rate": 2.4635914006619454e-06, "loss": 0.3005, "step": 1060 },
{ "epoch": 2.3683035714285716, "grad_norm": 1.2676379680633545, "learning_rate": 2.4470163650580747e-06, "loss": 0.3839, "step": 1061 },
{ "epoch": 2.3705357142857144, "grad_norm": 1.1082603931427002, "learning_rate": 2.430489499526438e-06, "loss": 0.3176, "step": 1062 },
{ "epoch": 2.372767857142857, "grad_norm": 1.0914406776428223, "learning_rate": 2.414010909469133e-06, "loss": 0.2938, "step": 1063 },
{ "epoch": 2.375, "grad_norm": 1.001238465309143, "learning_rate": 2.3975806999803717e-06, "loss": 0.3355, "step": 1064 },
{ "epoch": 2.377232142857143, "grad_norm": 1.0119569301605225, "learning_rate": 2.38119897584582e-06, "loss": 0.2893, "step": 1065 },
{ "epoch": 2.3794642857142856, "grad_norm": 1.0528173446655273, "learning_rate": 2.364865841541908e-06, "loss": 0.3065, "step": 1066 },
{ "epoch": 2.3816964285714284, "grad_norm": 1.0538735389709473, "learning_rate": 2.3485814012351914e-06, "loss": 0.3086, "step": 1067 },
{ "epoch": 2.3839285714285716, "grad_norm": 1.143399715423584, "learning_rate": 2.33234575878167e-06, "loss": 0.3536, "step": 1068 },
{ |
|
"epoch": 2.3861607142857144, |
|
"grad_norm": 1.137209177017212, |
|
"learning_rate": 2.3161590177261294e-06, |
|
"loss": 0.2712, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 2.388392857142857, |
|
"grad_norm": 1.1341702938079834, |
|
"learning_rate": 2.300021281301483e-06, |
|
"loss": 0.305, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.390625, |
|
"grad_norm": 1.000322937965393, |
|
"learning_rate": 2.2839326524281037e-06, |
|
"loss": 0.3311, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 2.392857142857143, |
|
"grad_norm": 0.9652037620544434, |
|
"learning_rate": 2.267893233713182e-06, |
|
"loss": 0.33, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 2.3950892857142856, |
|
"grad_norm": 0.9035744667053223, |
|
"learning_rate": 2.2519031274500625e-06, |
|
"loss": 0.2822, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 2.3973214285714284, |
|
"grad_norm": 1.0602307319641113, |
|
"learning_rate": 2.235962435617596e-06, |
|
"loss": 0.2652, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 2.3995535714285716, |
|
"grad_norm": 0.950262188911438, |
|
"learning_rate": 2.2200712598794804e-06, |
|
"loss": 0.2667, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.4017857142857144, |
|
"grad_norm": 1.068719744682312, |
|
"learning_rate": 2.204229701583621e-06, |
|
"loss": 0.307, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 2.404017857142857, |
|
"grad_norm": 1.082491397857666, |
|
"learning_rate": 2.1884378617614933e-06, |
|
"loss": 0.2429, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 1.1966094970703125, |
|
"learning_rate": 2.172695841127468e-06, |
|
"loss": 0.3158, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 2.408482142857143, |
|
"grad_norm": 1.2111045122146606, |
|
"learning_rate": 2.157003740078203e-06, |
|
"loss": 0.279, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 2.4107142857142856, |
|
"grad_norm": 1.0568370819091797, |
|
"learning_rate": 2.141361658691975e-06, |
|
"loss": 0.2984, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.4129464285714284, |
|
"grad_norm": 1.1779588460922241, |
|
"learning_rate": 2.1257696967280716e-06, |
|
"loss": 0.2841, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 2.4151785714285716, |
|
"grad_norm": 1.0319031476974487, |
|
"learning_rate": 2.1102279536261193e-06, |
|
"loss": 0.2793, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 2.4174107142857144, |
|
"grad_norm": 0.9412409663200378, |
|
"learning_rate": 2.09473652850548e-06, |
|
"loss": 0.2851, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 2.419642857142857, |
|
"grad_norm": 1.105271339416504, |
|
"learning_rate": 2.0792955201646005e-06, |
|
"loss": 0.2802, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 2.421875, |
|
"grad_norm": 1.1076663732528687, |
|
"learning_rate": 2.063905027080392e-06, |
|
"loss": 0.3152, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 2.424107142857143, |
|
"grad_norm": 0.9948890805244446, |
|
"learning_rate": 2.0485651474075987e-06, |
|
"loss": 0.3178, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 2.4263392857142856, |
|
"grad_norm": 1.04751718044281, |
|
"learning_rate": 2.033275978978164e-06, |
|
"loss": 0.3219, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 2.4285714285714284, |
|
"grad_norm": 1.0496200323104858, |
|
"learning_rate": 2.018037619300628e-06, |
|
"loss": 0.2937, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 2.4308035714285716, |
|
"grad_norm": 0.9747620820999146, |
|
"learning_rate": 2.0028501655594736e-06, |
|
"loss": 0.3119, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 2.4330357142857144, |
|
"grad_norm": 1.0099077224731445, |
|
"learning_rate": 1.987713714614543e-06, |
|
"loss": 0.2794, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.435267857142857, |
|
"grad_norm": 0.9872100353240967, |
|
"learning_rate": 1.972628363000396e-06, |
|
"loss": 0.2875, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.9732192158699036, |
|
"learning_rate": 1.9575942069256914e-06, |
|
"loss": 0.2856, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 2.439732142857143, |
|
"grad_norm": 1.5758808851242065, |
|
"learning_rate": 1.942611342272591e-06, |
|
"loss": 0.326, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 2.4419642857142856, |
|
"grad_norm": 1.0248242616653442, |
|
"learning_rate": 1.9276798645961392e-06, |
|
"loss": 0.3052, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 2.4441964285714284, |
|
"grad_norm": 1.0433028936386108, |
|
"learning_rate": 1.9127998691236537e-06, |
|
"loss": 0.2528, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 2.4464285714285716, |
|
"grad_norm": 1.2717303037643433, |
|
"learning_rate": 1.8979714507541103e-06, |
|
"loss": 0.3272, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 2.4486607142857144, |
|
"grad_norm": 1.105989694595337, |
|
"learning_rate": 1.883194704057556e-06, |
|
"loss": 0.3166, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 2.450892857142857, |
|
"grad_norm": 0.9582863450050354, |
|
"learning_rate": 1.8684697232744886e-06, |
|
"loss": 0.3559, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 2.453125, |
|
"grad_norm": 1.3236531019210815, |
|
"learning_rate": 1.8537966023152664e-06, |
|
"loss": 0.2735, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 2.455357142857143, |
|
"grad_norm": 1.1173170804977417, |
|
"learning_rate": 1.839175434759507e-06, |
|
"loss": 0.2657, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.4575892857142856, |
|
"grad_norm": 1.127776026725769, |
|
"learning_rate": 1.8246063138554793e-06, |
|
"loss": 0.3515, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 2.4598214285714284, |
|
"grad_norm": 1.1345889568328857, |
|
"learning_rate": 1.810089332519528e-06, |
|
"loss": 0.3252, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 2.4620535714285716, |
|
"grad_norm": 1.0312929153442383, |
|
"learning_rate": 1.795624583335467e-06, |
|
"loss": 0.2631, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 2.4642857142857144, |
|
"grad_norm": 1.1354609727859497, |
|
"learning_rate": 1.7812121585539964e-06, |
|
"loss": 0.3394, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 2.466517857142857, |
|
"grad_norm": 0.9982307553291321, |
|
"learning_rate": 1.7668521500921098e-06, |
|
"loss": 0.3204, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 1.3083230257034302, |
|
"learning_rate": 1.7525446495325038e-06, |
|
"loss": 0.3237, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 2.470982142857143, |
|
"grad_norm": 1.1162062883377075, |
|
"learning_rate": 1.7382897481230076e-06, |
|
"loss": 0.2478, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 2.4732142857142856, |
|
"grad_norm": 1.1414133310317993, |
|
"learning_rate": 1.7240875367759902e-06, |
|
"loss": 0.3077, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 2.4754464285714284, |
|
"grad_norm": 1.2450937032699585, |
|
"learning_rate": 1.7099381060677833e-06, |
|
"loss": 0.3084, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 2.4776785714285716, |
|
"grad_norm": 0.9135916233062744, |
|
"learning_rate": 1.6958415462380983e-06, |
|
"loss": 0.2893, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.4799107142857144, |
|
"grad_norm": 1.0570931434631348, |
|
"learning_rate": 1.6817979471894641e-06, |
|
"loss": 0.2603, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 2.482142857142857, |
|
"grad_norm": 1.3998193740844727, |
|
"learning_rate": 1.6678073984866438e-06, |
|
"loss": 0.3685, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 2.484375, |
|
"grad_norm": 1.2341136932373047, |
|
"learning_rate": 1.6538699893560618e-06, |
|
"loss": 0.3341, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 2.486607142857143, |
|
"grad_norm": 1.1154083013534546, |
|
"learning_rate": 1.639985808685245e-06, |
|
"loss": 0.3208, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 2.4888392857142856, |
|
"grad_norm": 1.3637317419052124, |
|
"learning_rate": 1.6261549450222392e-06, |
|
"loss": 0.3564, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 2.4910714285714284, |
|
"grad_norm": 0.9546897411346436, |
|
"learning_rate": 1.6123774865750607e-06, |
|
"loss": 0.2526, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 2.4933035714285716, |
|
"grad_norm": 1.1718556880950928, |
|
"learning_rate": 1.5986535212111353e-06, |
|
"loss": 0.3031, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 2.4955357142857144, |
|
"grad_norm": 1.1763498783111572, |
|
"learning_rate": 1.5849831364567137e-06, |
|
"loss": 0.2933, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 2.497767857142857, |
|
"grad_norm": 1.3048385381698608, |
|
"learning_rate": 1.571366419496344e-06, |
|
"loss": 0.3144, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.1232630014419556, |
|
"learning_rate": 1.5578034571722879e-06, |
|
"loss": 0.3085, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.6504756212234497, |
|
"eval_runtime": 34.8268, |
|
"eval_samples_per_second": 2.096, |
|
"eval_steps_per_second": 0.287, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.502232142857143, |
|
"grad_norm": 1.248914122581482, |
|
"learning_rate": 1.5442943359839978e-06, |
|
"loss": 0.3111, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 2.5044642857142856, |
|
"grad_norm": 1.2444794178009033, |
|
"learning_rate": 1.5308391420875312e-06, |
|
"loss": 0.3108, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 2.506696428571429, |
|
"grad_norm": 1.0684040784835815, |
|
"learning_rate": 1.5174379612950273e-06, |
|
"loss": 0.2805, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 2.508928571428571, |
|
"grad_norm": 1.049082636833191, |
|
"learning_rate": 1.5040908790741448e-06, |
|
"loss": 0.3263, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 2.5111607142857144, |
|
"grad_norm": 0.9625985026359558, |
|
"learning_rate": 1.490797980547528e-06, |
|
"loss": 0.2914, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 2.513392857142857, |
|
"grad_norm": 1.173850178718567, |
|
"learning_rate": 1.4775593504922547e-06, |
|
"loss": 0.3015, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 2.515625, |
|
"grad_norm": 1.032834529876709, |
|
"learning_rate": 1.4643750733392958e-06, |
|
"loss": 0.3199, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 2.517857142857143, |
|
"grad_norm": 1.14642333984375, |
|
"learning_rate": 1.4512452331729864e-06, |
|
"loss": 0.3114, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 2.5200892857142856, |
|
"grad_norm": 1.0797921419143677, |
|
"learning_rate": 1.438169913730475e-06, |
|
"loss": 0.3425, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 2.522321428571429, |
|
"grad_norm": 0.9763374328613281, |
|
"learning_rate": 1.4251491984012089e-06, |
|
"loss": 0.3186, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.524553571428571, |
|
"grad_norm": 1.1403617858886719, |
|
"learning_rate": 1.4121831702263833e-06, |
|
"loss": 0.3343, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 2.5267857142857144, |
|
"grad_norm": 1.1691458225250244, |
|
"learning_rate": 1.3992719118984167e-06, |
|
"loss": 0.308, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 2.529017857142857, |
|
"grad_norm": 1.1367650032043457, |
|
"learning_rate": 1.3864155057604323e-06, |
|
"loss": 0.3668, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 1.1581871509552002, |
|
"learning_rate": 1.3736140338057247e-06, |
|
"loss": 0.3477, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 2.533482142857143, |
|
"grad_norm": 1.1503748893737793, |
|
"learning_rate": 1.3608675776772428e-06, |
|
"loss": 0.3597, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 2.5357142857142856, |
|
"grad_norm": 1.056081771850586, |
|
"learning_rate": 1.3481762186670556e-06, |
|
"loss": 0.341, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 2.537946428571429, |
|
"grad_norm": 1.1978353261947632, |
|
"learning_rate": 1.335540037715851e-06, |
|
"loss": 0.3677, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 2.540178571428571, |
|
"grad_norm": 1.1892170906066895, |
|
"learning_rate": 1.3229591154124132e-06, |
|
"loss": 0.2895, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 2.5424107142857144, |
|
"grad_norm": 0.9958797693252563, |
|
"learning_rate": 1.310433531993104e-06, |
|
"loss": 0.3434, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 2.544642857142857, |
|
"grad_norm": 1.1225999593734741, |
|
"learning_rate": 1.2979633673413571e-06, |
|
"loss": 0.2626, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.546875, |
|
"grad_norm": 1.121170997619629, |
|
"learning_rate": 1.2855487009871615e-06, |
|
"loss": 0.2736, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 2.549107142857143, |
|
"grad_norm": 1.124768853187561, |
|
"learning_rate": 1.2731896121065645e-06, |
|
"loss": 0.3502, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 2.5513392857142856, |
|
"grad_norm": 1.1074433326721191, |
|
"learning_rate": 1.2608861795211601e-06, |
|
"loss": 0.3724, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 2.553571428571429, |
|
"grad_norm": 1.0048129558563232, |
|
"learning_rate": 1.248638481697586e-06, |
|
"loss": 0.3128, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 2.555803571428571, |
|
"grad_norm": 1.2129403352737427, |
|
"learning_rate": 1.2364465967470284e-06, |
|
"loss": 0.3048, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 2.5580357142857144, |
|
"grad_norm": 1.4070841073989868, |
|
"learning_rate": 1.224310602424712e-06, |
|
"loss": 0.3082, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 2.560267857142857, |
|
"grad_norm": 1.1774101257324219, |
|
"learning_rate": 1.2122305761294196e-06, |
|
"loss": 0.3184, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 1.1185553073883057, |
|
"learning_rate": 1.2002065949029896e-06, |
|
"loss": 0.259, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 2.564732142857143, |
|
"grad_norm": 1.0791926383972168, |
|
"learning_rate": 1.1882387354298264e-06, |
|
"loss": 0.2782, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 2.5669642857142856, |
|
"grad_norm": 1.1294567584991455, |
|
"learning_rate": 1.1763270740364074e-06, |
|
"loss": 0.2585, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.569196428571429, |
|
"grad_norm": 1.1617059707641602, |
|
"learning_rate": 1.1644716866908035e-06, |
|
"loss": 0.397, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 1.1602836847305298, |
|
"learning_rate": 1.15267264900219e-06, |
|
"loss": 0.2723, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 2.5736607142857144, |
|
"grad_norm": 1.0874189138412476, |
|
"learning_rate": 1.1409300362203667e-06, |
|
"loss": 0.3081, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 2.575892857142857, |
|
"grad_norm": 1.186552882194519, |
|
"learning_rate": 1.1292439232352781e-06, |
|
"loss": 0.3096, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 2.578125, |
|
"grad_norm": 1.004425287246704, |
|
"learning_rate": 1.1176143845765253e-06, |
|
"loss": 0.2703, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 2.580357142857143, |
|
"grad_norm": 1.0555604696273804, |
|
"learning_rate": 1.1060414944129106e-06, |
|
"loss": 0.3055, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 2.5825892857142856, |
|
"grad_norm": 1.0088086128234863, |
|
"learning_rate": 1.0945253265519472e-06, |
|
"loss": 0.3204, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 2.584821428571429, |
|
"grad_norm": 1.1574110984802246, |
|
"learning_rate": 1.0830659544393996e-06, |
|
"loss": 0.3007, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 2.587053571428571, |
|
"grad_norm": 1.011350154876709, |
|
"learning_rate": 1.0716634511588076e-06, |
|
"loss": 0.2853, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 2.5892857142857144, |
|
"grad_norm": 1.0175875425338745, |
|
"learning_rate": 1.0603178894310185e-06, |
|
"loss": 0.326, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.591517857142857, |
|
"grad_norm": 1.1453709602355957, |
|
"learning_rate": 1.0490293416137409e-06, |
|
"loss": 0.2979, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.9553859233856201, |
|
"learning_rate": 1.0377978797010558e-06, |
|
"loss": 0.2825, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 2.595982142857143, |
|
"grad_norm": 1.2206124067306519, |
|
"learning_rate": 1.0266235753229825e-06, |
|
"loss": 0.3796, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 2.5982142857142856, |
|
"grad_norm": 1.2103142738342285, |
|
"learning_rate": 1.0155064997450026e-06, |
|
"loss": 0.2994, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 2.600446428571429, |
|
"grad_norm": 0.9963259696960449, |
|
"learning_rate": 1.004446723867618e-06, |
|
"loss": 0.3351, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 2.602678571428571, |
|
"grad_norm": 1.100026249885559, |
|
"learning_rate": 9.934443182259023e-07, |
|
"loss": 0.3307, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 2.6049107142857144, |
|
"grad_norm": 0.9989385008811951, |
|
"learning_rate": 9.824993529890303e-07, |
|
"loss": 0.3261, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 2.607142857142857, |
|
"grad_norm": 0.9991337656974792, |
|
"learning_rate": 9.716118979598533e-07, |
|
"loss": 0.3303, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 2.609375, |
|
"grad_norm": 1.0319366455078125, |
|
"learning_rate": 9.607820225744346e-07, |
|
"loss": 0.3108, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 2.611607142857143, |
|
"grad_norm": 1.1170539855957031, |
|
"learning_rate": 9.500097959016297e-07, |
|
"loss": 0.3182, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.6138392857142856, |
|
"grad_norm": 1.0289571285247803, |
|
"learning_rate": 9.392952866426198e-07, |
|
"loss": 0.348, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 2.616071428571429, |
|
"grad_norm": 1.1711770296096802, |
|
"learning_rate": 9.286385631304939e-07, |
|
"loss": 0.325, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 2.618303571428571, |
|
"grad_norm": 1.0985348224639893, |
|
"learning_rate": 9.180396933298019e-07, |
|
"loss": 0.3667, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 2.6205357142857144, |
|
"grad_norm": 1.128706455230713, |
|
"learning_rate": 9.074987448361261e-07, |
|
"loss": 0.3939, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 2.622767857142857, |
|
"grad_norm": 1.119606852531433, |
|
"learning_rate": 8.970157848756511e-07, |
|
"loss": 0.2845, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 1.1094435453414917, |
|
"learning_rate": 8.865908803047241e-07, |
|
"loss": 0.3649, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 2.627232142857143, |
|
"grad_norm": 1.1740772724151611, |
|
"learning_rate": 8.762240976094461e-07, |
|
"loss": 0.3225, |
|
"step": 1177 |
|
}, |
|
{ |
|
"epoch": 2.6294642857142856, |
|
"grad_norm": 1.083766222000122, |
|
"learning_rate": 8.659155029052346e-07, |
|
"loss": 0.2813, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 2.631696428571429, |
|
"grad_norm": 1.0689512491226196, |
|
"learning_rate": 8.556651619364065e-07, |
|
"loss": 0.2851, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 2.633928571428571, |
|
"grad_norm": 1.105716586112976, |
|
"learning_rate": 8.454731400757599e-07, |
|
"loss": 0.2946, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.6361607142857144, |
|
"grad_norm": 1.143965244293213, |
|
"learning_rate": 8.353395023241528e-07, |
|
"loss": 0.2949, |
|
"step": 1181 |
|
}, |
|
{ |
|
"epoch": 2.638392857142857, |
|
"grad_norm": 0.8723753690719604, |
|
"learning_rate": 8.252643133100935e-07, |
|
"loss": 0.28, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 2.640625, |
|
"grad_norm": 1.0619475841522217, |
|
"learning_rate": 8.152476372893259e-07, |
|
"loss": 0.3412, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 2.642857142857143, |
|
"grad_norm": 1.1905899047851562, |
|
"learning_rate": 8.052895381444226e-07, |
|
"loss": 0.2769, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 2.6450892857142856, |
|
"grad_norm": 1.1052501201629639, |
|
"learning_rate": 7.953900793843694e-07, |
|
"loss": 0.2905, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 2.647321428571429, |
|
"grad_norm": 1.066614031791687, |
|
"learning_rate": 7.855493241441692e-07, |
|
"loss": 0.2377, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 2.649553571428571, |
|
"grad_norm": 1.2181105613708496, |
|
"learning_rate": 7.757673351844386e-07, |
|
"loss": 0.3052, |
|
"step": 1187 |
|
}, |
|
{ |
|
"epoch": 2.6517857142857144, |
|
"grad_norm": 1.0465307235717773, |
|
"learning_rate": 7.660441748909997e-07, |
|
"loss": 0.3296, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 2.654017857142857, |
|
"grad_norm": 0.9089909195899963, |
|
"learning_rate": 7.563799052744947e-07, |
|
"loss": 0.266, |
|
"step": 1189 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.9568466544151306, |
|
"learning_rate": 7.46774587969975e-07, |
|
"loss": 0.3503, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.658482142857143, |
|
"grad_norm": 1.032265067100525, |
|
"learning_rate": 7.372282842365208e-07, |
|
"loss": 0.2716, |
|
"step": 1191 |
|
}, |
|
{ |
|
"epoch": 2.6607142857142856, |
|
"grad_norm": 1.2572873830795288, |
|
"learning_rate": 7.277410549568476e-07, |
|
"loss": 0.3476, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 2.662946428571429, |
|
"grad_norm": 1.076196312904358, |
|
"learning_rate": 7.183129606369133e-07, |
|
"loss": 0.3222, |
|
"step": 1193 |
|
}, |
|
{ |
|
"epoch": 2.665178571428571, |
|
"grad_norm": 1.0198694467544556, |
|
"learning_rate": 7.089440614055398e-07, |
|
"loss": 0.257, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 2.6674107142857144, |
|
"grad_norm": 1.1174793243408203, |
|
"learning_rate": 6.996344170140168e-07, |
|
"loss": 0.3542, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 2.669642857142857, |
|
"grad_norm": 1.0214918851852417, |
|
"learning_rate": 6.903840868357382e-07, |
|
"loss": 0.3026, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 2.671875, |
|
"grad_norm": 1.2186428308486938, |
|
"learning_rate": 6.811931298658092e-07, |
|
"loss": 0.281, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 2.674107142857143, |
|
"grad_norm": 1.0608640909194946, |
|
"learning_rate": 6.720616047206774e-07, |
|
"loss": 0.3752, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 2.6763392857142856, |
|
"grad_norm": 1.1944961547851562, |
|
"learning_rate": 6.62989569637752e-07, |
|
"loss": 0.2898, |
|
"step": 1199 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 0.9049682021141052, |
|
"learning_rate": 6.539770824750447e-07, |
|
"loss": 0.2757, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.680803571428571, |
|
"grad_norm": 0.9544935822486877, |
|
"learning_rate": 6.450242007107865e-07, |
|
"loss": 0.2786, |
|
"step": 1201 |
|
}, |
|
{ |
|
"epoch": 2.6830357142857144, |
|
"grad_norm": 1.1401265859603882, |
|
"learning_rate": 6.361309814430727e-07, |
|
"loss": 0.2984, |
|
"step": 1202 |
|
}, |
|
{ |
|
"epoch": 2.685267857142857, |
|
"grad_norm": 0.852730929851532, |
|
"learning_rate": 6.272974813894905e-07, |
|
"loss": 0.2265, |
|
"step": 1203 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.9403284788131714, |
|
"learning_rate": 6.185237568867597e-07, |
|
"loss": 0.2965, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 2.689732142857143, |
|
"grad_norm": 0.9182292222976685, |
|
"learning_rate": 6.098098638903771e-07, |
|
"loss": 0.2878, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 2.6919642857142856, |
|
"grad_norm": 0.9199351072311401, |
|
"learning_rate": 6.01155857974256e-07, |
|
"loss": 0.2658, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 2.694196428571429, |
|
"grad_norm": 1.2709399461746216, |
|
"learning_rate": 5.925617943303719e-07, |
|
"loss": 0.3033, |
|
"step": 1207 |
|
}, |
|
{ |
|
"epoch": 2.696428571428571, |
|
"grad_norm": 1.1471501588821411, |
|
"learning_rate": 5.840277277684136e-07, |
|
"loss": 0.2973, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 2.6986607142857144, |
|
"grad_norm": 1.1023441553115845, |
|
"learning_rate": 5.755537127154231e-07, |
|
"loss": 0.3461, |
|
"step": 1209 |
|
}, |
|
{ |
|
"epoch": 2.700892857142857, |
|
"grad_norm": 1.036736249923706, |
|
"learning_rate": 5.671398032154707e-07, |
|
"loss": 0.3071, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.703125, |
|
"grad_norm": 1.0169968605041504, |
|
"learning_rate": 5.58786052929281e-07, |
|
"loss": 0.3106, |
|
"step": 1211 |
|
}, |
|
{ |
|
"epoch": 2.705357142857143, |
|
"grad_norm": 0.9377234578132629, |
|
"learning_rate": 5.504925151339191e-07, |
|
"loss": 0.31, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 2.7075892857142856, |
|
"grad_norm": 1.1917829513549805, |
|
"learning_rate": 5.422592427224239e-07, |
|
"loss": 0.3508, |
|
"step": 1213 |
|
}, |
|
{ |
|
"epoch": 2.709821428571429, |
|
"grad_norm": 1.1036509275436401, |
|
"learning_rate": 5.340862882034992e-07, |
|
"loss": 0.2706, |
|
"step": 1214 |
|
}, |
|
{ |
|
"epoch": 2.712053571428571, |
|
"grad_norm": 1.0939146280288696, |
|
"learning_rate": 5.259737037011547e-07, |
|
"loss": 0.3306, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 2.7142857142857144, |
|
"grad_norm": 1.6331071853637695, |
|
"learning_rate": 5.179215409543848e-07, |
|
"loss": 0.3035, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 2.716517857142857, |
|
"grad_norm": 1.2998929023742676, |
|
"learning_rate": 5.099298513168382e-07, |
|
"loss": 0.3532, |
|
"step": 1217 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 1.013765573501587, |
|
"learning_rate": 5.01998685756484e-07, |
|
"loss": 0.3092, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 2.720982142857143, |
|
"grad_norm": 1.1098829507827759, |
|
"learning_rate": 4.941280948553018e-07, |
|
"loss": 0.3248, |
|
"step": 1219 |
|
}, |
|
{ |
|
"epoch": 2.7232142857142856, |
|
"grad_norm": 1.028980016708374, |
|
"learning_rate": 4.863181288089391e-07, |
|
"loss": 0.3397, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.725446428571429, |
|
"grad_norm": 1.1095798015594482, |
|
"learning_rate": 4.785688374264053e-07, |
|
"loss": 0.3002, |
|
"step": 1221 |
|
}, |
|
{ |
|
"epoch": 2.727678571428571, |
|
"grad_norm": 1.1329293251037598, |
|
"learning_rate": 4.708802701297499e-07, |
|
"loss": 0.3509, |
|
"step": 1222 |
|
}, |
|
{ |
|
"epoch": 2.7299107142857144, |
|
"grad_norm": 1.1559255123138428, |
|
"learning_rate": 4.632524759537449e-07, |
|
"loss": 0.3122, |
|
"step": 1223 |
|
}, |
|
{ |
|
"epoch": 2.732142857142857, |
|
"grad_norm": 1.0540506839752197, |
|
"learning_rate": 4.556855035455787e-07, |
|
"loss": 0.3124, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 2.734375, |
|
"grad_norm": 1.0515660047531128, |
|
"learning_rate": 4.481794011645368e-07, |
|
"loss": 0.3451, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 2.736607142857143, |
|
"grad_norm": 1.035614013671875, |
|
"learning_rate": 4.407342166816997e-07, |
|
"loss": 0.277, |
|
"step": 1226 |
|
}, |
|
{ |
|
"epoch": 2.7388392857142856, |
|
"grad_norm": 1.2367432117462158, |
|
"learning_rate": 4.3334999757963734e-07, |
|
"loss": 0.3876, |
|
"step": 1227 |
|
}, |
|
{ |
|
"epoch": 2.741071428571429, |
|
"grad_norm": 1.1165615320205688, |
|
"learning_rate": 4.2602679095210766e-07, |
|
"loss": 0.3484, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 2.743303571428571, |
|
"grad_norm": 0.900805652141571, |
|
"learning_rate": 4.187646435037529e-07, |
|
"loss": 0.2895, |
|
"step": 1229 |
|
}, |
|
{ |
|
"epoch": 2.7455357142857144, |
|
"grad_norm": 1.1723212003707886, |
|
"learning_rate": 4.1156360154979813e-07, |
|
"loss": 0.3315, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.747767857142857, |
|
"grad_norm": 0.971288800239563, |
|
"learning_rate": 4.044237110157667e-07, |
|
"loss": 0.3146, |
|
"step": 1231 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.0520827770233154, |
|
"learning_rate": 3.9734501743717956e-07, |
|
"loss": 0.3277, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.6504682898521423, |
|
"eval_runtime": 25.5149, |
|
"eval_samples_per_second": 2.861, |
|
"eval_steps_per_second": 0.392, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 2.752232142857143, |
|
"grad_norm": 1.031693458557129, |
|
"learning_rate": 3.9032756595926755e-07, |
|
"loss": 0.3002, |
|
"step": 1233 |
|
}, |
|
{ |
|
"epoch": 2.7544642857142856, |
|
"grad_norm": 1.1060569286346436, |
|
"learning_rate": 3.833714013366796e-07, |
|
"loss": 0.2792, |
|
"step": 1234 |
|
}, |
|
{ |
|
"epoch": 2.756696428571429, |
|
"grad_norm": 1.0176774263381958, |
|
"learning_rate": 3.7647656793320164e-07, |
|
"loss": 0.2915, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 2.758928571428571, |
|
"grad_norm": 1.0510061979293823, |
|
"learning_rate": 3.696431097214748e-07, |
|
"loss": 0.3348, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 2.7611607142857144, |
|
"grad_norm": 1.1525824069976807, |
|
"learning_rate": 3.628710702827076e-07, |
|
"loss": 0.3144, |
|
"step": 1237 |
|
}, |
|
{ |
|
"epoch": 2.763392857142857, |
|
"grad_norm": 1.0995824337005615, |
|
"learning_rate": 3.5616049280640995e-07, |
|
"loss": 0.2969, |
|
"step": 1238 |
|
}, |
|
{ |
|
"epoch": 2.765625, |
|
"grad_norm": 1.255507469177246, |
|
"learning_rate": 3.4951142009010173e-07, |
|
"loss": 0.3467, |
|
"step": 1239 |
|
}, |
|
{ |
|
"epoch": 2.767857142857143, |
|
"grad_norm": 1.0282824039459229, |
|
"learning_rate": 3.429238945390556e-07, |
|
"loss": 0.3324, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.7700892857142856, |
|
"grad_norm": 0.9991137385368347, |
|
"learning_rate": 3.3639795816601705e-07, |
|
"loss": 0.3323, |
|
"step": 1241 |
|
}, |
|
{ |
|
"epoch": 2.772321428571429, |
|
"grad_norm": 0.9604209065437317, |
|
"learning_rate": 3.299336525909391e-07, |
|
"loss": 0.2618, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 2.774553571428571, |
|
"grad_norm": 1.1453895568847656, |
|
"learning_rate": 3.235310190407182e-07, |
|
"loss": 0.2599, |
|
"step": 1243 |
|
}, |
|
{ |
|
"epoch": 2.7767857142857144, |
|
"grad_norm": 0.890120804309845, |
|
"learning_rate": 3.171900983489273e-07, |
|
"loss": 0.2831, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 2.779017857142857, |
|
"grad_norm": 1.039947509765625, |
|
"learning_rate": 3.109109309555602e-07, |
|
"loss": 0.3081, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 1.0824922323226929, |
|
"learning_rate": 3.0469355690677216e-07, |
|
"loss": 0.3286, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 2.783482142857143, |
|
"grad_norm": 1.1116937398910522, |
|
"learning_rate": 2.985380158546236e-07, |
|
"loss": 0.2822, |
|
"step": 1247 |
|
}, |
|
{ |
|
"epoch": 2.7857142857142856, |
|
"grad_norm": 1.1681832075119019, |
|
"learning_rate": 2.9244434705682276e-07, |
|
"loss": 0.2968, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 2.787946428571429, |
|
"grad_norm": 1.229740858078003, |
|
"learning_rate": 2.8641258937648577e-07, |
|
"loss": 0.2954, |
|
"step": 1249 |
|
}, |
|
{ |
|
"epoch": 2.790178571428571, |
|
"grad_norm": 1.1749082803726196, |
|
"learning_rate": 2.8044278128188327e-07, |
|
"loss": 0.3335, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.7924107142857144, |
|
"grad_norm": 1.116082787513733, |
|
"learning_rate": 2.7453496084619116e-07, |
|
"loss": 0.3388, |
|
"step": 1251 |
|
}, |
|
{ |
|
"epoch": 2.794642857142857, |
|
"grad_norm": 1.1051106452941895, |
|
"learning_rate": 2.6868916574725347e-07, |
|
"loss": 0.3184, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 2.796875, |
|
"grad_norm": 0.9878678321838379, |
|
"learning_rate": 2.6290543326733865e-07, |
|
"loss": 0.306, |
|
"step": 1253 |
|
}, |
|
{ |
|
"epoch": 2.799107142857143, |
|
"grad_norm": 1.0825244188308716, |
|
"learning_rate": 2.571838002929061e-07, |
|
"loss": 0.3561, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 2.8013392857142856, |
|
"grad_norm": 1.0878150463104248, |
|
"learning_rate": 2.515243033143644e-07, |
|
"loss": 0.2942, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 2.803571428571429, |
|
"grad_norm": 1.051950454711914, |
|
"learning_rate": 2.459269784258467e-07, |
|
"loss": 0.321, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 2.805803571428571, |
|
"grad_norm": 1.1179208755493164, |
|
"learning_rate": 2.4039186132497226e-07, |
|
"loss": 0.3436, |
|
"step": 1257 |
|
}, |
|
{ |
|
"epoch": 2.8080357142857144, |
|
"grad_norm": 1.0265306234359741, |
|
"learning_rate": 2.349189873126223e-07, |
|
"loss": 0.3321, |
|
"step": 1258 |
|
}, |
|
{ |
|
"epoch": 2.810267857142857, |
|
"grad_norm": 1.211876630783081, |
|
"learning_rate": 2.2950839129272096e-07, |
|
"loss": 0.3464, |
|
"step": 1259 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 1.05633544921875, |
|
"learning_rate": 2.2416010777199904e-07, |
|
"loss": 0.3155, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.814732142857143, |
|
"grad_norm": 1.1148322820663452, |
|
"learning_rate": 2.1887417085978745e-07, |
|
"loss": 0.2922, |
|
"step": 1261 |
|
}, |
|
{ |
|
"epoch": 2.8169642857142856, |
|
"grad_norm": 1.1178148984909058, |
|
"learning_rate": 2.1365061426778967e-07, |
|
"loss": 0.3572, |
|
"step": 1262 |
|
}, |
|
{ |
|
"epoch": 2.819196428571429, |
|
"grad_norm": 1.076788306236267, |
|
"learning_rate": 2.0848947130987617e-07, |
|
"loss": 0.3669, |
|
"step": 1263 |
|
}, |
|
{ |
|
"epoch": 2.821428571428571, |
|
"grad_norm": 1.0222539901733398, |
|
"learning_rate": 2.0339077490186488e-07, |
|
"loss": 0.2447, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 2.8236607142857144, |
|
"grad_norm": 1.1097285747528076, |
|
"learning_rate": 1.9835455756130995e-07, |
|
"loss": 0.3211, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 2.825892857142857, |
|
"grad_norm": 1.0979808568954468, |
|
"learning_rate": 1.93380851407301e-07, |
|
"loss": 0.3193, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 2.828125, |
|
"grad_norm": 1.0203653573989868, |
|
"learning_rate": 1.8846968816025434e-07, |
|
"loss": 0.2787, |
|
"step": 1267 |
|
}, |
|
{ |
|
"epoch": 2.830357142857143, |
|
"grad_norm": 1.2996318340301514, |
|
"learning_rate": 1.83621099141712e-07, |
|
"loss": 0.3012, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 2.8325892857142856, |
|
"grad_norm": 1.303546667098999, |
|
"learning_rate": 1.7883511527414078e-07, |
|
"loss": 0.3176, |
|
"step": 1269 |
|
}, |
|
{ |
|
"epoch": 2.834821428571429, |
|
"grad_norm": 1.072313666343689, |
|
"learning_rate": 1.741117670807335e-07, |
|
"loss": 0.3456, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.837053571428571, |
|
"grad_norm": 1.0661609172821045, |
|
"learning_rate": 1.694510846852193e-07, |
|
"loss": 0.3156, |
|
"step": 1271 |
|
}, |
|
{ |
|
"epoch": 2.8392857142857144, |
|
"grad_norm": 1.1145730018615723, |
|
"learning_rate": 1.648530978116658e-07, |
|
"loss": 0.3167, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 2.841517857142857, |
|
"grad_norm": 1.2349388599395752, |
|
"learning_rate": 1.6031783578429605e-07, |
|
"loss": 0.3167, |
|
"step": 1273 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 1.0584666728973389, |
|
"learning_rate": 1.558453275272942e-07, |
|
"loss": 0.3261, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 2.845982142857143, |
|
"grad_norm": 1.3490861654281616, |
|
"learning_rate": 1.5143560156462567e-07, |
|
"loss": 0.307, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 2.8482142857142856, |
|
"grad_norm": 1.16843581199646, |
|
"learning_rate": 1.4708868601985503e-07, |
|
"loss": 0.3004, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 2.850446428571429, |
|
"grad_norm": 1.0229218006134033, |
|
"learning_rate": 1.4280460861596513e-07, |
|
"loss": 0.2641, |
|
"step": 1277 |
|
}, |
|
{ |
|
"epoch": 2.852678571428571, |
|
"grad_norm": 1.0380784273147583, |
|
"learning_rate": 1.385833966751815e-07, |
|
"loss": 0.2836, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 2.8549107142857144, |
|
"grad_norm": 1.2049779891967773, |
|
"learning_rate": 1.3442507711879494e-07, |
|
"loss": 0.3272, |
|
"step": 1279 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.1403696537017822, |
|
"learning_rate": 1.303296764669959e-07, |
|
"loss": 0.4074, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.859375, |
|
"grad_norm": 1.1046922206878662, |
|
"learning_rate": 1.2629722083870033e-07, |
|
"loss": 0.3132, |
|
"step": 1281 |
|
}, |
|
{ |
|
"epoch": 2.861607142857143, |
|
"grad_norm": 1.0592225790023804, |
|
"learning_rate": 1.2232773595138415e-07, |
|
"loss": 0.2396, |
|
"step": 1282 |
|
}, |
|
{ |
|
"epoch": 2.8638392857142856, |
|
"grad_norm": 1.0567072629928589, |
|
"learning_rate": 1.1842124712092117e-07, |
|
"loss": 0.2843, |
|
"step": 1283 |
|
}, |
|
{ |
|
"epoch": 2.866071428571429, |
|
"grad_norm": 0.9778628349304199, |
|
"learning_rate": 1.1457777926141889e-07, |
|
"loss": 0.2609, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 2.868303571428571, |
|
"grad_norm": 1.0432276725769043, |
|
"learning_rate": 1.1079735688506065e-07, |
|
"loss": 0.3249, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 2.8705357142857144, |
|
"grad_norm": 1.0415635108947754, |
|
"learning_rate": 1.0708000410195041e-07, |
|
"loss": 0.2822, |
|
"step": 1286 |
|
}, |
|
{ |
|
"epoch": 2.872767857142857, |
|
"grad_norm": 0.9630635976791382, |
|
"learning_rate": 1.0342574461995936e-07, |
|
"loss": 0.2646, |
|
"step": 1287 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 1.06930410861969, |
|
"learning_rate": 9.98346017445706e-08, |
|
"loss": 0.2927, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 2.877232142857143, |
|
"grad_norm": 0.973823606967926, |
|
"learning_rate": 9.630659837873368e-08, |
|
"loss": 0.3139, |
|
"step": 1289 |
|
}, |
|
{ |
|
"epoch": 2.8794642857142856, |
|
"grad_norm": 1.0439759492874146, |
|
"learning_rate": 9.284175702272246e-08, |
|
"loss": 0.2869, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.881696428571429, |
|
"grad_norm": 1.300384283065796, |
|
"learning_rate": 8.944009977398083e-08, |
|
"loss": 0.323, |
|
"step": 1291 |
|
}, |
|
{ |
|
"epoch": 2.883928571428571, |
|
"grad_norm": 1.096799612045288, |
|
"learning_rate": 8.610164832699608e-08, |
|
"loss": 0.309, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 2.8861607142857144, |
|
"grad_norm": 1.0528693199157715, |
|
"learning_rate": 8.282642397314356e-08, |
|
"loss": 0.3453, |
|
"step": 1293 |
|
}, |
|
{ |
|
"epoch": 2.888392857142857, |
|
"grad_norm": 1.0400652885437012, |
|
"learning_rate": 7.96144476005689e-08, |
|
"loss": 0.2666, |
|
"step": 1294 |
|
}, |
|
{ |
|
"epoch": 2.890625, |
|
"grad_norm": 0.9061072468757629, |
|
"learning_rate": 7.646573969404159e-08, |
|
"loss": 0.2714, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 2.892857142857143, |
|
"grad_norm": 1.0487802028656006, |
|
"learning_rate": 7.338032033482712e-08, |
|
"loss": 0.2844, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 2.8950892857142856, |
|
"grad_norm": 0.9911705255508423, |
|
"learning_rate": 7.035820920056724e-08, |
|
"loss": 0.2749, |
|
"step": 1297 |
|
}, |
|
{ |
|
"epoch": 2.897321428571429, |
|
"grad_norm": 1.3166087865829468, |
|
"learning_rate": 6.73994255651389e-08, |
|
"loss": 0.3114, |
|
"step": 1298 |
|
}, |
|
{ |
|
"epoch": 2.899553571428571, |
|
"grad_norm": 1.0869697332382202, |
|
"learning_rate": 6.450398829854764e-08, |
|
"loss": 0.2953, |
|
"step": 1299 |
|
}, |
|
{ |
|
"epoch": 2.9017857142857144, |
|
"grad_norm": 0.9904835224151611, |
|
"learning_rate": 6.167191586679556e-08, |
|
"loss": 0.2908, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.904017857142857, |
|
"grad_norm": 1.242396354675293, |
|
"learning_rate": 5.890322633177126e-08, |
|
"loss": 0.312, |
|
"step": 1301 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 1.274328589439392, |
|
"learning_rate": 5.6197937351125664e-08, |
|
"loss": 0.2899, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 2.908482142857143, |
|
"grad_norm": 0.8983531594276428, |
|
"learning_rate": 5.355606617817089e-08, |
|
"loss": 0.2409, |
|
"step": 1303 |
|
}, |
|
{ |
|
"epoch": 2.9107142857142856, |
|
"grad_norm": 1.077206015586853, |
|
"learning_rate": 5.097762966176256e-08, |
|
"loss": 0.2714, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 2.912946428571429, |
|
"grad_norm": 0.9723058342933655, |
|
"learning_rate": 4.846264424619218e-08, |
|
"loss": 0.2529, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 2.915178571428571, |
|
"grad_norm": 0.922687828540802, |
|
"learning_rate": 4.6011125971084924e-08, |
|
"loss": 0.2839, |
|
"step": 1306 |
|
}, |
|
{ |
|
"epoch": 2.9174107142857144, |
|
"grad_norm": 1.1098482608795166, |
|
"learning_rate": 4.3623090471296426e-08, |
|
"loss": 0.2432, |
|
"step": 1307 |
|
}, |
|
{ |
|
"epoch": 2.919642857142857, |
|
"grad_norm": 1.146208643913269, |
|
"learning_rate": 4.129855297681618e-08, |
|
"loss": 0.289, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 2.921875, |
|
"grad_norm": 1.244287371635437, |
|
"learning_rate": 3.903752831266205e-08, |
|
"loss": 0.2719, |
|
"step": 1309 |
|
}, |
|
{ |
|
"epoch": 2.924107142857143, |
|
"grad_norm": 1.0499545335769653, |
|
"learning_rate": 3.684003089879484e-08, |
|
"loss": 0.3146, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.9263392857142856, |
|
"grad_norm": 1.0027427673339844, |
|
"learning_rate": 3.4706074750022744e-08, |
|
"loss": 0.3214, |
|
"step": 1311 |
|
}, |
|
{ |
|
"epoch": 2.928571428571429, |
|
"grad_norm": 1.0911877155303955, |
|
"learning_rate": 3.2635673475910345e-08, |
|
"loss": 0.2908, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 2.930803571428571, |
|
"grad_norm": 1.0470216274261475, |
|
"learning_rate": 3.062884028069313e-08, |
|
"loss": 0.3333, |
|
"step": 1313 |
|
}, |
|
{ |
|
"epoch": 2.9330357142857144, |
|
"grad_norm": 1.1500890254974365, |
|
"learning_rate": 2.8685587963194206e-08, |
|
"loss": 0.3176, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 2.935267857142857, |
|
"grad_norm": 1.0063806772232056, |
|
"learning_rate": 2.6805928916742163e-08, |
|
"loss": 0.2551, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 1.0910873413085938, |
|
"learning_rate": 2.4989875129091124e-08, |
|
"loss": 0.2847, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 2.939732142857143, |
|
"grad_norm": 0.9625343084335327, |
|
"learning_rate": 2.323743818234414e-08, |
|
"loss": 0.2416, |
|
"step": 1317 |
|
}, |
|
{ |
|
"epoch": 2.9419642857142856, |
|
"grad_norm": 0.9664291739463806, |
|
"learning_rate": 2.154862925288326e-08, |
|
"loss": 0.3257, |
|
"step": 1318 |
|
}, |
|
{ |
|
"epoch": 2.944196428571429, |
|
"grad_norm": 1.1130931377410889, |
|
"learning_rate": 1.9923459111290676e-08, |
|
"loss": 0.3248, |
|
"step": 1319 |
|
}, |
|
{ |
|
"epoch": 2.946428571428571, |
|
"grad_norm": 1.1273753643035889, |
|
"learning_rate": 1.8361938122287704e-08, |
|
"loss": 0.3255, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.9486607142857144, |
|
"grad_norm": 1.1616261005401611, |
|
"learning_rate": 1.6864076244663686e-08, |
|
"loss": 0.3834, |
|
"step": 1321 |
|
}, |
|
{ |
|
"epoch": 2.950892857142857, |
|
"grad_norm": 0.9705925583839417, |
|
"learning_rate": 1.5429883031217173e-08, |
|
"loss": 0.3499, |
|
"step": 1322 |
|
}, |
|
{ |
|
"epoch": 2.953125, |
|
"grad_norm": 1.1763215065002441, |
|
"learning_rate": 1.4059367628687094e-08, |
|
"loss": 0.3263, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 2.955357142857143, |
|
"grad_norm": 1.0161902904510498, |
|
"learning_rate": 1.2752538777704993e-08, |
|
"loss": 0.2873, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 2.9575892857142856, |
|
"grad_norm": 1.0196352005004883, |
|
"learning_rate": 1.1509404812728443e-08, |
|
"loss": 0.31, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 2.959821428571429, |
|
"grad_norm": 1.0153658390045166, |
|
"learning_rate": 1.0329973661996617e-08, |
|
"loss": 0.3055, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 2.962053571428571, |
|
"grad_norm": 1.0681740045547485, |
|
"learning_rate": 9.214252847475902e-09, |
|
"loss": 0.2633, |
|
"step": 1327 |
|
}, |
|
{ |
|
"epoch": 2.9642857142857144, |
|
"grad_norm": 1.227658987045288, |
|
"learning_rate": 8.162249484809926e-09, |
|
"loss": 0.3204, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 2.966517857142857, |
|
"grad_norm": 1.0228925943374634, |
|
"learning_rate": 7.173970283279597e-09, |
|
"loss": 0.2675, |
|
"step": 1329 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 1.1837745904922485, |
|
"learning_rate": 6.249421545755363e-09, |
|
"loss": 0.3337, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.970982142857143, |
|
"grad_norm": 1.1520233154296875, |
|
"learning_rate": 5.388609168659465e-09, |
|
"loss": 0.3157, |
|
"step": 1331 |
|
}, |
|
{ |
|
"epoch": 2.9732142857142856, |
|
"grad_norm": 1.3604669570922852, |
|
"learning_rate": 4.591538641927074e-09, |
|
"loss": 0.3322, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 2.975446428571429, |
|
"grad_norm": 1.0328572988510132, |
|
"learning_rate": 3.858215048972991e-09, |
|
"loss": 0.3262, |
|
"step": 1333 |
|
}, |
|
{ |
|
"epoch": 2.977678571428571, |
|
"grad_norm": 1.1347808837890625, |
|
"learning_rate": 3.1886430666561163e-09, |
|
"loss": 0.329, |
|
"step": 1334 |
|
}, |
|
{ |
|
"epoch": 2.9799107142857144, |
|
"grad_norm": 1.0527995824813843, |
|
"learning_rate": 2.5828269652561355e-09, |
|
"loss": 0.2758, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 2.982142857142857, |
|
"grad_norm": 1.057120680809021, |
|
"learning_rate": 2.0407706084368816e-09, |
|
"loss": 0.2689, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 2.984375, |
|
"grad_norm": 0.9978762269020081, |
|
"learning_rate": 1.5624774532285726e-09, |
|
"loss": 0.2841, |
|
"step": 1337 |
|
}, |
|
{ |
|
"epoch": 2.986607142857143, |
|
"grad_norm": 1.1930631399154663, |
|
"learning_rate": 1.1479505500044952e-09, |
|
"loss": 0.2659, |
|
"step": 1338 |
|
}, |
|
{ |
|
"epoch": 2.9888392857142856, |
|
"grad_norm": 0.9426537752151489, |
|
"learning_rate": 7.971925424621329e-10, |
|
"loss": 0.2822, |
|
"step": 1339 |
|
}, |
|
{ |
|
"epoch": 2.991071428571429, |
|
"grad_norm": 1.2255107164382935, |
|
"learning_rate": 5.102056675998501e-10, |
|
"loss": 0.3154, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.993303571428571, |
|
"grad_norm": 1.0295275449752808, |
|
"learning_rate": 2.8699175571467177e-10, |
|
"loss": 0.2938, |
|
"step": 1341 |
|
}, |
|
{ |
|
"epoch": 2.9955357142857144, |
|
"grad_norm": 1.188139796257019, |
|
"learning_rate": 1.2755223037896892e-10, |
|
"loss": 0.3389, |
|
"step": 1342 |
|
}, |
|
{ |
|
"epoch": 2.997767857142857, |
|
"grad_norm": 1.121012568473816, |
|
"learning_rate": 3.1888108437128085e-11, |
|
"loss": 0.3364, |
|
"step": 1343 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.9530598521232605, |
|
"learning_rate": 0.0, |
|
"loss": 0.3106, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.6524144411087036, |
|
"eval_runtime": 25.7885, |
|
"eval_samples_per_second": 2.831, |
|
"eval_steps_per_second": 0.388, |
|
"step": 1344 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1344, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 224, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.076426689825997e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |