diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,43783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5, + "eval_steps": 500, + "global_step": 6250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8e-05, + "grad_norm": 8.342381477355957, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.704, + "step": 1 + }, + { + "epoch": 0.00016, + "grad_norm": 13.919276237487793, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5506, + "step": 2 + }, + { + "epoch": 0.00024, + "grad_norm": 9.644050598144531, + "learning_rate": 3e-06, + "loss": 0.5138, + "step": 3 + }, + { + "epoch": 0.00032, + "grad_norm": 9.632057189941406, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7317, + "step": 4 + }, + { + "epoch": 0.0004, + "grad_norm": 8.935699462890625, + "learning_rate": 5e-06, + "loss": 0.7372, + "step": 5 + }, + { + "epoch": 0.00048, + "grad_norm": 9.070698738098145, + "learning_rate": 6e-06, + "loss": 0.7517, + "step": 6 + }, + { + "epoch": 0.00056, + "grad_norm": 9.226506233215332, + "learning_rate": 7e-06, + "loss": 0.6523, + "step": 7 + }, + { + "epoch": 0.00064, + "grad_norm": 11.089458465576172, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6743, + "step": 8 + }, + { + "epoch": 0.00072, + "grad_norm": 7.038223743438721, + "learning_rate": 9e-06, + "loss": 0.6536, + "step": 9 + }, + { + "epoch": 0.0008, + "grad_norm": 12.974882125854492, + "learning_rate": 1e-05, + "loss": 0.6122, + "step": 10 + }, + { + "epoch": 0.00088, + "grad_norm": 8.054683685302734, + "learning_rate": 9.999999841833366e-06, + "loss": 0.5658, + "step": 11 + }, + { + "epoch": 0.00096, + "grad_norm": 15.936073303222656, + "learning_rate": 9.99999936733347e-06, + "loss": 0.8491, + "step": 12 + }, + { + "epoch": 0.00104, + "grad_norm": 4.686435699462891, + "learning_rate": 9.999998576500346e-06, + "loss": 0.4673, + "step": 13 + }, + { + "epoch": 0.00112, + "grad_norm": 9.04387092590332, + "learning_rate": 9.999997469334042e-06, + "loss": 0.6238, + "step": 14 + }, + { + "epoch": 0.0012, + "grad_norm": 10.498541831970215, + "learning_rate": 9.999996045834626e-06, + "loss": 0.6662, + "step": 15 + }, + { + "epoch": 0.00128, + "grad_norm": 5.166777610778809, + "learning_rate": 9.999994306002192e-06, + "loss": 0.5575, + "step": 16 + }, + { + "epoch": 0.00136, + "grad_norm": 7.151340484619141, + "learning_rate": 9.99999224983685e-06, + "loss": 0.5892, + "step": 17 + }, + { + "epoch": 0.00144, + "grad_norm": 5.04518985748291, + "learning_rate": 9.999989877338725e-06, + "loss": 0.6417, + "step": 18 + }, + { + "epoch": 0.00152, + "grad_norm": 5.931295394897461, + "learning_rate": 9.999987188507972e-06, + "loss": 0.4939, + "step": 19 + }, + { + "epoch": 0.0016, + "grad_norm": 5.467217445373535, + "learning_rate": 9.99998418334476e-06, + "loss": 0.6131, + "step": 20 + }, + { + "epoch": 0.00168, + "grad_norm": 5.837316989898682, + "learning_rate": 9.999980861849277e-06, + "loss": 0.5316, + "step": 21 + }, + { + "epoch": 0.00176, + "grad_norm": 4.237757205963135, + "learning_rate": 9.999977224021735e-06, + "loss": 0.5499, + "step": 22 + }, + { + "epoch": 0.00184, + "grad_norm": 6.357263565063477, + "learning_rate": 9.999973269862366e-06, + "loss": 0.5405, + "step": 23 + }, + { + "epoch": 0.00192, + "grad_norm": 6.0218939781188965, + "learning_rate": 9.999968999371416e-06, + "loss": 0.543, + "step": 24 + }, + { + "epoch": 0.002, + "grad_norm": 5.242684841156006, + "learning_rate": 9.99996441254916e-06, + "loss": 0.7568, + "step": 25 + }, + { + "epoch": 0.00208, + "grad_norm": 5.868856906890869, + "learning_rate": 9.999959509395884e-06, + "loss": 0.5211, + "step": 26 + }, + { + "epoch": 0.00216, + "grad_norm": 5.401463985443115, + "learning_rate": 9.9999542899119e-06, + "loss": 0.4879, + "step": 27 + }, + { + "epoch": 0.00224, + "grad_norm": 4.193491458892822, + "learning_rate": 9.999948754097538e-06, + "loss": 0.409, + "step": 28 + }, + { + "epoch": 0.00232, + "grad_norm": 8.131255149841309, + "learning_rate": 9.999942901953148e-06, + "loss": 0.5406, + "step": 29 + }, + { + "epoch": 0.0024, + "grad_norm": 8.522489547729492, + "learning_rate": 9.9999367334791e-06, + "loss": 0.4171, + "step": 30 + }, + { + "epoch": 0.00248, + "grad_norm": 5.474879264831543, + "learning_rate": 9.999930248675784e-06, + "loss": 0.3518, + "step": 31 + }, + { + "epoch": 0.00256, + "grad_norm": 3.1692543029785156, + "learning_rate": 9.999923447543614e-06, + "loss": 0.3889, + "step": 32 + }, + { + "epoch": 0.00264, + "grad_norm": 5.542096138000488, + "learning_rate": 9.999916330083015e-06, + "loss": 0.4787, + "step": 33 + }, + { + "epoch": 0.00272, + "grad_norm": 4.017666339874268, + "learning_rate": 9.99990889629444e-06, + "loss": 0.4756, + "step": 34 + }, + { + "epoch": 0.0028, + "grad_norm": 4.048304080963135, + "learning_rate": 9.99990114617836e-06, + "loss": 0.4496, + "step": 35 + }, + { + "epoch": 0.00288, + "grad_norm": 9.737591743469238, + "learning_rate": 9.999893079735262e-06, + "loss": 0.5132, + "step": 36 + }, + { + "epoch": 0.00296, + "grad_norm": 5.1411051750183105, + "learning_rate": 9.99988469696566e-06, + "loss": 0.4311, + "step": 37 + }, + { + "epoch": 0.00304, + "grad_norm": 7.761783599853516, + "learning_rate": 9.999875997870081e-06, + "loss": 0.6614, + "step": 38 + }, + { + "epoch": 0.00312, + "grad_norm": 2.5379042625427246, + "learning_rate": 9.99986698244908e-06, + "loss": 0.4899, + "step": 39 + }, + { + "epoch": 0.0032, + "grad_norm": 5.286753177642822, + "learning_rate": 9.999857650703224e-06, + "loss": 0.4297, + "step": 40 + }, + { + "epoch": 0.00328, + "grad_norm": 4.113813400268555, + "learning_rate": 9.999848002633102e-06, + "loss": 0.3484, + "step": 41 + }, + { + "epoch": 0.00336, + "grad_norm": 4.206110954284668, + "learning_rate": 9.999838038239327e-06, + "loss": 0.5235, + "step": 42 + }, + { + "epoch": 0.00344, + "grad_norm": 8.220080375671387, + "learning_rate": 9.999827757522531e-06, + "loss": 0.5051, + "step": 43 + }, + { + "epoch": 0.00352, + "grad_norm": 3.16642427444458, + "learning_rate": 9.99981716048336e-06, + "loss": 0.3753, + "step": 44 + }, + { + "epoch": 0.0036, + "grad_norm": 2.503070592880249, + "learning_rate": 9.999806247122488e-06, + "loss": 0.453, + "step": 45 + }, + { + "epoch": 0.00368, + "grad_norm": 4.834280967712402, + "learning_rate": 9.999795017440603e-06, + "loss": 0.3499, + "step": 46 + }, + { + "epoch": 0.00376, + "grad_norm": 2.557096004486084, + "learning_rate": 9.999783471438419e-06, + "loss": 0.3253, + "step": 47 + }, + { + "epoch": 0.00384, + "grad_norm": 3.6029672622680664, + "learning_rate": 9.999771609116662e-06, + "loss": 0.4115, + "step": 48 + }, + { + "epoch": 0.00392, + "grad_norm": 3.1142241954803467, + "learning_rate": 9.999759430476084e-06, + "loss": 0.5014, + "step": 49 + }, + { + "epoch": 0.004, + "grad_norm": 3.7333784103393555, + "learning_rate": 9.999746935517457e-06, + "loss": 0.3711, + "step": 50 + }, + { + "epoch": 0.00408, + "grad_norm": 3.2080132961273193, + "learning_rate": 9.999734124241571e-06, + "loss": 0.3347, + "step": 51 + }, + { + "epoch": 0.00416, + "grad_norm": 3.805007219314575, + "learning_rate": 9.999720996649235e-06, + "loss": 0.7125, + "step": 52 + }, + { + "epoch": 0.00424, + "grad_norm": 3.14184308052063, + "learning_rate": 9.999707552741283e-06, + "loss": 0.5121, + "step": 53 + }, + { + "epoch": 0.00432, + "grad_norm": 3.540736436843872, + "learning_rate": 9.999693792518562e-06, + "loss": 0.4294, + "step": 54 + }, + { + "epoch": 0.0044, + "grad_norm": 3.0372300148010254, + "learning_rate": 9.999679715981942e-06, + "loss": 0.4628, + "step": 55 + }, + { + "epoch": 0.00448, + "grad_norm": 2.61629056930542, + "learning_rate": 9.999665323132317e-06, + "loss": 0.4936, + "step": 56 + }, + { + "epoch": 0.00456, + "grad_norm": 3.4021859169006348, + "learning_rate": 9.999650613970597e-06, + "loss": 0.539, + "step": 57 + }, + { + "epoch": 0.00464, + "grad_norm": 1.9579095840454102, + "learning_rate": 9.99963558849771e-06, + "loss": 0.3475, + "step": 58 + }, + { + "epoch": 0.00472, + "grad_norm": 3.136955976486206, + "learning_rate": 9.999620246714607e-06, + "loss": 0.389, + "step": 59 + }, + { + "epoch": 0.0048, + "grad_norm": 3.5431883335113525, + "learning_rate": 9.999604588622263e-06, + "loss": 0.4231, + "step": 60 + }, + { + "epoch": 0.00488, + "grad_norm": 4.076716899871826, + "learning_rate": 9.999588614221663e-06, + "loss": 0.4758, + "step": 61 + }, + { + "epoch": 0.00496, + "grad_norm": 2.444188117980957, + "learning_rate": 9.99957232351382e-06, + "loss": 0.4112, + "step": 62 + }, + { + "epoch": 0.00504, + "grad_norm": 8.321596145629883, + "learning_rate": 9.999555716499766e-06, + "loss": 0.4371, + "step": 63 + }, + { + "epoch": 0.00512, + "grad_norm": 2.7783851623535156, + "learning_rate": 9.99953879318055e-06, + "loss": 0.4137, + "step": 64 + }, + { + "epoch": 0.0052, + "grad_norm": 3.005558729171753, + "learning_rate": 9.999521553557243e-06, + "loss": 0.3781, + "step": 65 + }, + { + "epoch": 0.00528, + "grad_norm": 2.853501558303833, + "learning_rate": 9.999503997630934e-06, + "loss": 0.4312, + "step": 66 + }, + { + "epoch": 0.00536, + "grad_norm": 2.694535732269287, + "learning_rate": 9.999486125402738e-06, + "loss": 0.3815, + "step": 67 + }, + { + "epoch": 0.00544, + "grad_norm": 4.3195013999938965, + "learning_rate": 9.999467936873783e-06, + "loss": 0.4915, + "step": 68 + }, + { + "epoch": 0.00552, + "grad_norm": 2.009920358657837, + "learning_rate": 9.999449432045218e-06, + "loss": 0.4457, + "step": 69 + }, + { + "epoch": 0.0056, + "grad_norm": 2.0476291179656982, + "learning_rate": 9.999430610918217e-06, + "loss": 0.4017, + "step": 70 + }, + { + "epoch": 0.00568, + "grad_norm": 3.428873300552368, + "learning_rate": 9.99941147349397e-06, + "loss": 0.4943, + "step": 71 + }, + { + "epoch": 0.00576, + "grad_norm": 4.138948917388916, + "learning_rate": 9.999392019773685e-06, + "loss": 0.4353, + "step": 72 + }, + { + "epoch": 0.00584, + "grad_norm": 3.7678720951080322, + "learning_rate": 9.999372249758596e-06, + "loss": 0.3594, + "step": 73 + }, + { + "epoch": 0.00592, + "grad_norm": 2.1992523670196533, + "learning_rate": 9.999352163449954e-06, + "loss": 0.4057, + "step": 74 + }, + { + "epoch": 0.006, + "grad_norm": 3.9209282398223877, + "learning_rate": 9.999331760849028e-06, + "loss": 0.3694, + "step": 75 + }, + { + "epoch": 0.00608, + "grad_norm": 2.271817684173584, + "learning_rate": 9.999311041957109e-06, + "loss": 0.3515, + "step": 76 + }, + { + "epoch": 0.00616, + "grad_norm": 3.229722023010254, + "learning_rate": 9.999290006775507e-06, + "loss": 0.3017, + "step": 77 + }, + { + "epoch": 0.00624, + "grad_norm": 2.318446397781372, + "learning_rate": 9.999268655305556e-06, + "loss": 0.3885, + "step": 78 + }, + { + "epoch": 0.00632, + "grad_norm": 2.2932703495025635, + "learning_rate": 9.999246987548603e-06, + "loss": 0.3392, + "step": 79 + }, + { + "epoch": 0.0064, + "grad_norm": 2.111154794692993, + "learning_rate": 9.999225003506021e-06, + "loss": 0.3128, + "step": 80 + }, + { + "epoch": 0.00648, + "grad_norm": 4.121140003204346, + "learning_rate": 9.9992027031792e-06, + "loss": 0.4418, + "step": 81 + }, + { + "epoch": 0.00656, + "grad_norm": 1.6190462112426758, + "learning_rate": 9.999180086569553e-06, + "loss": 0.2905, + "step": 82 + }, + { + "epoch": 0.00664, + "grad_norm": 3.0115609169006348, + "learning_rate": 9.999157153678509e-06, + "loss": 0.3379, + "step": 83 + }, + { + "epoch": 0.00672, + "grad_norm": 3.357482671737671, + "learning_rate": 9.999133904507518e-06, + "loss": 0.4827, + "step": 84 + }, + { + "epoch": 0.0068, + "grad_norm": 2.7103617191314697, + "learning_rate": 9.99911033905805e-06, + "loss": 0.3324, + "step": 85 + }, + { + "epoch": 0.00688, + "grad_norm": 2.003753662109375, + "learning_rate": 9.999086457331603e-06, + "loss": 0.4019, + "step": 86 + }, + { + "epoch": 0.00696, + "grad_norm": 2.259552478790283, + "learning_rate": 9.999062259329679e-06, + "loss": 0.4511, + "step": 87 + }, + { + "epoch": 0.00704, + "grad_norm": 2.696129322052002, + "learning_rate": 9.999037745053814e-06, + "loss": 0.3631, + "step": 88 + }, + { + "epoch": 0.00712, + "grad_norm": 2.0996644496917725, + "learning_rate": 9.999012914505559e-06, + "loss": 0.4356, + "step": 89 + }, + { + "epoch": 0.0072, + "grad_norm": 1.694945216178894, + "learning_rate": 9.998987767686482e-06, + "loss": 0.3558, + "step": 90 + }, + { + "epoch": 0.00728, + "grad_norm": 4.748021602630615, + "learning_rate": 9.998962304598175e-06, + "loss": 0.403, + "step": 91 + }, + { + "epoch": 0.00736, + "grad_norm": 2.026329278945923, + "learning_rate": 9.998936525242251e-06, + "loss": 0.3913, + "step": 92 + }, + { + "epoch": 0.00744, + "grad_norm": 1.9248204231262207, + "learning_rate": 9.99891042962034e-06, + "loss": 0.2967, + "step": 93 + }, + { + "epoch": 0.00752, + "grad_norm": 2.36529278755188, + "learning_rate": 9.998884017734091e-06, + "loss": 0.4356, + "step": 94 + }, + { + "epoch": 0.0076, + "grad_norm": 2.3117401599884033, + "learning_rate": 9.998857289585177e-06, + "loss": 0.362, + "step": 95 + }, + { + "epoch": 0.00768, + "grad_norm": 2.4276554584503174, + "learning_rate": 9.998830245175288e-06, + "loss": 0.5628, + "step": 96 + }, + { + "epoch": 0.00776, + "grad_norm": 3.106940984725952, + "learning_rate": 9.998802884506136e-06, + "loss": 0.3412, + "step": 97 + }, + { + "epoch": 0.00784, + "grad_norm": 1.9471877813339233, + "learning_rate": 9.998775207579452e-06, + "loss": 0.506, + "step": 98 + }, + { + "epoch": 0.00792, + "grad_norm": 2.589988946914673, + "learning_rate": 9.998747214396987e-06, + "loss": 0.5048, + "step": 99 + }, + { + "epoch": 0.008, + "grad_norm": 2.1943140029907227, + "learning_rate": 9.998718904960511e-06, + "loss": 0.3406, + "step": 100 + }, + { + "epoch": 0.00808, + "grad_norm": 2.6359894275665283, + "learning_rate": 9.998690279271815e-06, + "loss": 0.4702, + "step": 101 + }, + { + "epoch": 0.00816, + "grad_norm": 2.8097708225250244, + "learning_rate": 9.99866133733271e-06, + "loss": 0.2897, + "step": 102 + }, + { + "epoch": 0.00824, + "grad_norm": 1.7844816446304321, + "learning_rate": 9.99863207914503e-06, + "loss": 0.3935, + "step": 103 + }, + { + "epoch": 0.00832, + "grad_norm": 2.048438549041748, + "learning_rate": 9.998602504710623e-06, + "loss": 0.4605, + "step": 104 + }, + { + "epoch": 0.0084, + "grad_norm": 2.508671522140503, + "learning_rate": 9.99857261403136e-06, + "loss": 0.3009, + "step": 105 + }, + { + "epoch": 0.00848, + "grad_norm": 1.8178949356079102, + "learning_rate": 9.998542407109135e-06, + "loss": 0.4019, + "step": 106 + }, + { + "epoch": 0.00856, + "grad_norm": 4.293577671051025, + "learning_rate": 9.998511883945855e-06, + "loss": 0.3646, + "step": 107 + }, + { + "epoch": 0.00864, + "grad_norm": 2.485862970352173, + "learning_rate": 9.998481044543452e-06, + "loss": 0.4891, + "step": 108 + }, + { + "epoch": 0.00872, + "grad_norm": 2.0601084232330322, + "learning_rate": 9.998449888903881e-06, + "loss": 0.3481, + "step": 109 + }, + { + "epoch": 0.0088, + "grad_norm": 2.0768215656280518, + "learning_rate": 9.99841841702911e-06, + "loss": 0.3108, + "step": 110 + }, + { + "epoch": 0.00888, + "grad_norm": 1.9751044511795044, + "learning_rate": 9.99838662892113e-06, + "loss": 0.3349, + "step": 111 + }, + { + "epoch": 0.00896, + "grad_norm": 2.3447659015655518, + "learning_rate": 9.998354524581953e-06, + "loss": 0.4507, + "step": 112 + }, + { + "epoch": 0.00904, + "grad_norm": 2.0622479915618896, + "learning_rate": 9.998322104013609e-06, + "loss": 0.3494, + "step": 113 + }, + { + "epoch": 0.00912, + "grad_norm": 2.2754018306732178, + "learning_rate": 9.998289367218151e-06, + "loss": 0.3852, + "step": 114 + }, + { + "epoch": 0.0092, + "grad_norm": 2.0593910217285156, + "learning_rate": 9.998256314197648e-06, + "loss": 0.3999, + "step": 115 + }, + { + "epoch": 0.00928, + "grad_norm": 2.1019093990325928, + "learning_rate": 9.998222944954193e-06, + "loss": 0.4743, + "step": 116 + }, + { + "epoch": 0.00936, + "grad_norm": 1.9430328607559204, + "learning_rate": 9.998189259489897e-06, + "loss": 0.3751, + "step": 117 + }, + { + "epoch": 0.00944, + "grad_norm": 1.9289934635162354, + "learning_rate": 9.99815525780689e-06, + "loss": 0.3586, + "step": 118 + }, + { + "epoch": 0.00952, + "grad_norm": 2.437016248703003, + "learning_rate": 9.998120939907323e-06, + "loss": 0.4204, + "step": 119 + }, + { + "epoch": 0.0096, + "grad_norm": 2.8940718173980713, + "learning_rate": 9.998086305793368e-06, + "loss": 0.379, + "step": 120 + }, + { + "epoch": 0.00968, + "grad_norm": 2.258979558944702, + "learning_rate": 9.998051355467215e-06, + "loss": 0.4011, + "step": 121 + }, + { + "epoch": 0.00976, + "grad_norm": 1.7911226749420166, + "learning_rate": 9.99801608893108e-06, + "loss": 0.3489, + "step": 122 + }, + { + "epoch": 0.00984, + "grad_norm": 3.431490659713745, + "learning_rate": 9.997980506187188e-06, + "loss": 0.4482, + "step": 123 + }, + { + "epoch": 0.00992, + "grad_norm": 2.332395315170288, + "learning_rate": 9.997944607237791e-06, + "loss": 0.3858, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 2.693103790283203, + "learning_rate": 9.997908392085164e-06, + "loss": 0.4227, + "step": 125 + }, + { + "epoch": 0.01008, + "grad_norm": 2.643721580505371, + "learning_rate": 9.997871860731596e-06, + "loss": 0.3627, + "step": 126 + }, + { + "epoch": 0.01016, + "grad_norm": 2.5970005989074707, + "learning_rate": 9.997835013179397e-06, + "loss": 0.5068, + "step": 127 + }, + { + "epoch": 0.01024, + "grad_norm": 1.9385316371917725, + "learning_rate": 9.997797849430902e-06, + "loss": 0.2864, + "step": 128 + }, + { + "epoch": 0.01032, + "grad_norm": 1.7153620719909668, + "learning_rate": 9.997760369488458e-06, + "loss": 0.3723, + "step": 129 + }, + { + "epoch": 0.0104, + "grad_norm": 1.6988774538040161, + "learning_rate": 9.997722573354438e-06, + "loss": 0.3445, + "step": 130 + }, + { + "epoch": 0.01048, + "grad_norm": 2.6378743648529053, + "learning_rate": 9.997684461031235e-06, + "loss": 0.5199, + "step": 131 + }, + { + "epoch": 0.01056, + "grad_norm": 2.355621099472046, + "learning_rate": 9.997646032521256e-06, + "loss": 0.4259, + "step": 132 + }, + { + "epoch": 0.01064, + "grad_norm": 2.3169424533843994, + "learning_rate": 9.997607287826937e-06, + "loss": 0.5117, + "step": 133 + }, + { + "epoch": 0.01072, + "grad_norm": 2.7013490200042725, + "learning_rate": 9.997568226950723e-06, + "loss": 0.6017, + "step": 134 + }, + { + "epoch": 0.0108, + "grad_norm": 1.5648651123046875, + "learning_rate": 9.997528849895092e-06, + "loss": 0.401, + "step": 135 + }, + { + "epoch": 0.01088, + "grad_norm": 2.346057415008545, + "learning_rate": 9.997489156662532e-06, + "loss": 0.6035, + "step": 136 + }, + { + "epoch": 0.01096, + "grad_norm": 3.0977964401245117, + "learning_rate": 9.997449147255556e-06, + "loss": 0.5332, + "step": 137 + }, + { + "epoch": 0.01104, + "grad_norm": 2.7511343955993652, + "learning_rate": 9.997408821676693e-06, + "loss": 0.3371, + "step": 138 + }, + { + "epoch": 0.01112, + "grad_norm": 3.267756938934326, + "learning_rate": 9.997368179928495e-06, + "loss": 0.349, + "step": 139 + }, + { + "epoch": 0.0112, + "grad_norm": 2.05903959274292, + "learning_rate": 9.997327222013533e-06, + "loss": 0.3592, + "step": 140 + }, + { + "epoch": 0.01128, + "grad_norm": 1.7902599573135376, + "learning_rate": 9.9972859479344e-06, + "loss": 0.3498, + "step": 141 + }, + { + "epoch": 0.01136, + "grad_norm": 1.8080183267593384, + "learning_rate": 9.997244357693704e-06, + "loss": 0.3041, + "step": 142 + }, + { + "epoch": 0.01144, + "grad_norm": 2.1454615592956543, + "learning_rate": 9.99720245129408e-06, + "loss": 0.2934, + "step": 143 + }, + { + "epoch": 0.01152, + "grad_norm": 2.059626340866089, + "learning_rate": 9.997160228738178e-06, + "loss": 0.3284, + "step": 144 + }, + { + "epoch": 0.0116, + "grad_norm": 2.013929605484009, + "learning_rate": 9.997117690028668e-06, + "loss": 0.3648, + "step": 145 + }, + { + "epoch": 0.01168, + "grad_norm": 2.726135015487671, + "learning_rate": 9.997074835168243e-06, + "loss": 0.5155, + "step": 146 + }, + { + "epoch": 0.01176, + "grad_norm": 1.494078516960144, + "learning_rate": 9.997031664159614e-06, + "loss": 0.342, + "step": 147 + }, + { + "epoch": 0.01184, + "grad_norm": 3.9255640506744385, + "learning_rate": 9.996988177005508e-06, + "loss": 0.3398, + "step": 148 + }, + { + "epoch": 0.01192, + "grad_norm": 1.7292317152023315, + "learning_rate": 9.996944373708683e-06, + "loss": 0.3688, + "step": 149 + }, + { + "epoch": 0.012, + "grad_norm": 1.9214504957199097, + "learning_rate": 9.996900254271909e-06, + "loss": 0.3196, + "step": 150 + }, + { + "epoch": 0.01208, + "grad_norm": 2.378833293914795, + "learning_rate": 9.996855818697973e-06, + "loss": 0.4263, + "step": 151 + }, + { + "epoch": 0.01216, + "grad_norm": 2.5661027431488037, + "learning_rate": 9.996811066989688e-06, + "loss": 0.4251, + "step": 152 + }, + { + "epoch": 0.01224, + "grad_norm": 2.3186323642730713, + "learning_rate": 9.996765999149888e-06, + "loss": 0.5136, + "step": 153 + }, + { + "epoch": 0.01232, + "grad_norm": 4.518758296966553, + "learning_rate": 9.996720615181422e-06, + "loss": 0.4732, + "step": 154 + }, + { + "epoch": 0.0124, + "grad_norm": 1.8078404664993286, + "learning_rate": 9.996674915087161e-06, + "loss": 0.342, + "step": 155 + }, + { + "epoch": 0.01248, + "grad_norm": 2.0094892978668213, + "learning_rate": 9.996628898869998e-06, + "loss": 0.3407, + "step": 156 + }, + { + "epoch": 0.01256, + "grad_norm": 2.363971471786499, + "learning_rate": 9.996582566532844e-06, + "loss": 0.5517, + "step": 157 + }, + { + "epoch": 0.01264, + "grad_norm": 1.7961527109146118, + "learning_rate": 9.99653591807863e-06, + "loss": 0.3353, + "step": 158 + }, + { + "epoch": 0.01272, + "grad_norm": 2.390897750854492, + "learning_rate": 9.996488953510303e-06, + "loss": 0.3781, + "step": 159 + }, + { + "epoch": 0.0128, + "grad_norm": 2.0351173877716064, + "learning_rate": 9.996441672830842e-06, + "loss": 0.4534, + "step": 160 + }, + { + "epoch": 0.01288, + "grad_norm": 1.8322193622589111, + "learning_rate": 9.996394076043235e-06, + "loss": 0.3663, + "step": 161 + }, + { + "epoch": 0.01296, + "grad_norm": 1.927498459815979, + "learning_rate": 9.996346163150489e-06, + "loss": 0.4644, + "step": 162 + }, + { + "epoch": 0.01304, + "grad_norm": 1.8679307699203491, + "learning_rate": 9.996297934155642e-06, + "loss": 0.3235, + "step": 163 + }, + { + "epoch": 0.01312, + "grad_norm": 2.3288774490356445, + "learning_rate": 9.996249389061742e-06, + "loss": 0.3555, + "step": 164 + }, + { + "epoch": 0.0132, + "grad_norm": 1.7463264465332031, + "learning_rate": 9.99620052787186e-06, + "loss": 0.3165, + "step": 165 + }, + { + "epoch": 0.01328, + "grad_norm": 1.7550216913223267, + "learning_rate": 9.996151350589089e-06, + "loss": 0.3353, + "step": 166 + }, + { + "epoch": 0.01336, + "grad_norm": 1.5060083866119385, + "learning_rate": 9.996101857216538e-06, + "loss": 0.3295, + "step": 167 + }, + { + "epoch": 0.01344, + "grad_norm": 1.9037864208221436, + "learning_rate": 9.996052047757342e-06, + "loss": 0.4499, + "step": 168 + }, + { + "epoch": 0.01352, + "grad_norm": 2.601609468460083, + "learning_rate": 9.996001922214646e-06, + "loss": 0.4094, + "step": 169 + }, + { + "epoch": 0.0136, + "grad_norm": 1.6243177652359009, + "learning_rate": 9.995951480591627e-06, + "loss": 0.281, + "step": 170 + }, + { + "epoch": 0.01368, + "grad_norm": 1.9210420846939087, + "learning_rate": 9.995900722891474e-06, + "loss": 0.3456, + "step": 171 + }, + { + "epoch": 0.01376, + "grad_norm": 2.4466614723205566, + "learning_rate": 9.995849649117398e-06, + "loss": 0.3608, + "step": 172 + }, + { + "epoch": 0.01384, + "grad_norm": 1.706709384918213, + "learning_rate": 9.995798259272633e-06, + "loss": 0.368, + "step": 173 + }, + { + "epoch": 0.01392, + "grad_norm": 2.536444664001465, + "learning_rate": 9.995746553360427e-06, + "loss": 0.3482, + "step": 174 + }, + { + "epoch": 0.014, + "grad_norm": 1.7476345300674438, + "learning_rate": 9.995694531384051e-06, + "loss": 0.3771, + "step": 175 + }, + { + "epoch": 0.01408, + "grad_norm": 2.676931142807007, + "learning_rate": 9.9956421933468e-06, + "loss": 0.4398, + "step": 176 + }, + { + "epoch": 0.01416, + "grad_norm": 2.442253828048706, + "learning_rate": 9.99558953925198e-06, + "loss": 0.3727, + "step": 177 + }, + { + "epoch": 0.01424, + "grad_norm": 1.871006965637207, + "learning_rate": 9.995536569102927e-06, + "loss": 0.4419, + "step": 178 + }, + { + "epoch": 0.01432, + "grad_norm": 1.8206201791763306, + "learning_rate": 9.995483282902992e-06, + "loss": 0.3648, + "step": 179 + }, + { + "epoch": 0.0144, + "grad_norm": 2.077590227127075, + "learning_rate": 9.995429680655541e-06, + "loss": 0.3741, + "step": 180 + }, + { + "epoch": 0.01448, + "grad_norm": 2.4338691234588623, + "learning_rate": 9.995375762363972e-06, + "loss": 0.3764, + "step": 181 + }, + { + "epoch": 0.01456, + "grad_norm": 1.9777390956878662, + "learning_rate": 9.995321528031693e-06, + "loss": 0.3789, + "step": 182 + }, + { + "epoch": 0.01464, + "grad_norm": 1.9252766370773315, + "learning_rate": 9.995266977662132e-06, + "loss": 0.3909, + "step": 183 + }, + { + "epoch": 0.01472, + "grad_norm": 1.6491369009017944, + "learning_rate": 9.995212111258745e-06, + "loss": 0.4554, + "step": 184 + }, + { + "epoch": 0.0148, + "grad_norm": 1.691046953201294, + "learning_rate": 9.995156928825003e-06, + "loss": 0.4205, + "step": 185 + }, + { + "epoch": 0.01488, + "grad_norm": 1.9484405517578125, + "learning_rate": 9.995101430364396e-06, + "loss": 0.3699, + "step": 186 + }, + { + "epoch": 0.01496, + "grad_norm": 2.238854169845581, + "learning_rate": 9.995045615880434e-06, + "loss": 0.374, + "step": 187 + }, + { + "epoch": 0.01504, + "grad_norm": 1.2045531272888184, + "learning_rate": 9.99498948537665e-06, + "loss": 0.2368, + "step": 188 + }, + { + "epoch": 0.01512, + "grad_norm": 1.7461583614349365, + "learning_rate": 9.994933038856595e-06, + "loss": 0.3133, + "step": 189 + }, + { + "epoch": 0.0152, + "grad_norm": 1.7523412704467773, + "learning_rate": 9.994876276323839e-06, + "loss": 0.3396, + "step": 190 + }, + { + "epoch": 0.01528, + "grad_norm": 3.5537517070770264, + "learning_rate": 9.994819197781973e-06, + "loss": 0.3683, + "step": 191 + }, + { + "epoch": 0.01536, + "grad_norm": 1.3427032232284546, + "learning_rate": 9.994761803234611e-06, + "loss": 0.291, + "step": 192 + }, + { + "epoch": 0.01544, + "grad_norm": 1.7434067726135254, + "learning_rate": 9.994704092685381e-06, + "loss": 0.3493, + "step": 193 + }, + { + "epoch": 0.01552, + "grad_norm": 1.7947523593902588, + "learning_rate": 9.994646066137937e-06, + "loss": 0.4236, + "step": 194 + }, + { + "epoch": 0.0156, + "grad_norm": 2.011258125305176, + "learning_rate": 9.994587723595946e-06, + "loss": 0.3265, + "step": 195 + }, + { + "epoch": 0.01568, + "grad_norm": 1.2146482467651367, + "learning_rate": 9.994529065063103e-06, + "loss": 0.2635, + "step": 196 + }, + { + "epoch": 0.01576, + "grad_norm": 2.2841827869415283, + "learning_rate": 9.994470090543118e-06, + "loss": 0.4389, + "step": 197 + }, + { + "epoch": 0.01584, + "grad_norm": 1.4877972602844238, + "learning_rate": 9.994410800039721e-06, + "loss": 0.4311, + "step": 198 + }, + { + "epoch": 0.01592, + "grad_norm": 2.1514832973480225, + "learning_rate": 9.994351193556666e-06, + "loss": 0.3551, + "step": 199 + }, + { + "epoch": 0.016, + "grad_norm": 1.679862380027771, + "learning_rate": 9.99429127109772e-06, + "loss": 0.3547, + "step": 200 + }, + { + "epoch": 0.01608, + "grad_norm": 2.345980167388916, + "learning_rate": 9.994231032666677e-06, + "loss": 0.3212, + "step": 201 + }, + { + "epoch": 0.01616, + "grad_norm": 1.6985834836959839, + "learning_rate": 9.994170478267348e-06, + "loss": 0.4209, + "step": 202 + }, + { + "epoch": 0.01624, + "grad_norm": 2.21384334564209, + "learning_rate": 9.994109607903563e-06, + "loss": 0.3629, + "step": 203 + }, + { + "epoch": 0.01632, + "grad_norm": 2.7619104385375977, + "learning_rate": 9.994048421579173e-06, + "loss": 0.5192, + "step": 204 + }, + { + "epoch": 0.0164, + "grad_norm": 2.18821382522583, + "learning_rate": 9.993986919298049e-06, + "loss": 0.388, + "step": 205 + }, + { + "epoch": 0.01648, + "grad_norm": 1.5542010068893433, + "learning_rate": 9.993925101064084e-06, + "loss": 0.3708, + "step": 206 + }, + { + "epoch": 0.01656, + "grad_norm": 1.6064860820770264, + "learning_rate": 9.993862966881188e-06, + "loss": 0.3564, + "step": 207 + }, + { + "epoch": 0.01664, + "grad_norm": 1.946254849433899, + "learning_rate": 9.993800516753289e-06, + "loss": 0.4394, + "step": 208 + }, + { + "epoch": 0.01672, + "grad_norm": 1.9601609706878662, + "learning_rate": 9.993737750684342e-06, + "loss": 0.4078, + "step": 209 + }, + { + "epoch": 0.0168, + "grad_norm": 1.7054260969161987, + "learning_rate": 9.993674668678316e-06, + "loss": 0.3253, + "step": 210 + }, + { + "epoch": 0.01688, + "grad_norm": 1.7017536163330078, + "learning_rate": 9.993611270739205e-06, + "loss": 0.3473, + "step": 211 + }, + { + "epoch": 0.01696, + "grad_norm": 2.0936248302459717, + "learning_rate": 9.993547556871015e-06, + "loss": 0.4329, + "step": 212 + }, + { + "epoch": 0.01704, + "grad_norm": 1.91551673412323, + "learning_rate": 9.993483527077782e-06, + "loss": 0.3773, + "step": 213 + }, + { + "epoch": 0.01712, + "grad_norm": 2.9922895431518555, + "learning_rate": 9.99341918136355e-06, + "loss": 0.3711, + "step": 214 + }, + { + "epoch": 0.0172, + "grad_norm": 1.5471163988113403, + "learning_rate": 9.993354519732399e-06, + "loss": 0.3224, + "step": 215 + }, + { + "epoch": 0.01728, + "grad_norm": 1.5325666666030884, + "learning_rate": 9.993289542188413e-06, + "loss": 0.33, + "step": 216 + }, + { + "epoch": 0.01736, + "grad_norm": 1.9776920080184937, + "learning_rate": 9.993224248735706e-06, + "loss": 0.3761, + "step": 217 + }, + { + "epoch": 0.01744, + "grad_norm": 1.6905009746551514, + "learning_rate": 9.993158639378408e-06, + "loss": 0.3817, + "step": 218 + }, + { + "epoch": 0.01752, + "grad_norm": 1.6124826669692993, + "learning_rate": 9.993092714120671e-06, + "loss": 0.3097, + "step": 219 + }, + { + "epoch": 0.0176, + "grad_norm": 1.5717380046844482, + "learning_rate": 9.993026472966664e-06, + "loss": 0.3696, + "step": 220 + }, + { + "epoch": 0.01768, + "grad_norm": 1.8339051008224487, + "learning_rate": 9.992959915920579e-06, + "loss": 0.3433, + "step": 221 + }, + { + "epoch": 0.01776, + "grad_norm": 1.8827868700027466, + "learning_rate": 9.992893042986627e-06, + "loss": 0.3711, + "step": 222 + }, + { + "epoch": 0.01784, + "grad_norm": 1.704789161682129, + "learning_rate": 9.992825854169038e-06, + "loss": 0.2893, + "step": 223 + }, + { + "epoch": 0.01792, + "grad_norm": 1.803351640701294, + "learning_rate": 9.992758349472062e-06, + "loss": 0.3313, + "step": 224 + }, + { + "epoch": 0.018, + "grad_norm": 1.406293511390686, + "learning_rate": 9.992690528899972e-06, + "loss": 0.3326, + "step": 225 + }, + { + "epoch": 0.01808, + "grad_norm": 3.832010269165039, + "learning_rate": 9.992622392457058e-06, + "loss": 0.4356, + "step": 226 + }, + { + "epoch": 0.01816, + "grad_norm": 1.769728660583496, + "learning_rate": 9.992553940147631e-06, + "loss": 0.3862, + "step": 227 + }, + { + "epoch": 0.01824, + "grad_norm": 2.6798720359802246, + "learning_rate": 9.99248517197602e-06, + "loss": 0.3847, + "step": 228 + }, + { + "epoch": 0.01832, + "grad_norm": 1.8670660257339478, + "learning_rate": 9.992416087946579e-06, + "loss": 0.3875, + "step": 229 + }, + { + "epoch": 0.0184, + "grad_norm": 2.3109071254730225, + "learning_rate": 9.992346688063676e-06, + "loss": 0.4156, + "step": 230 + }, + { + "epoch": 0.01848, + "grad_norm": 1.5571988821029663, + "learning_rate": 9.992276972331702e-06, + "loss": 0.3264, + "step": 231 + }, + { + "epoch": 0.01856, + "grad_norm": 2.017947196960449, + "learning_rate": 9.992206940755068e-06, + "loss": 0.3828, + "step": 232 + }, + { + "epoch": 0.01864, + "grad_norm": 1.9046738147735596, + "learning_rate": 9.992136593338206e-06, + "loss": 0.3952, + "step": 233 + }, + { + "epoch": 0.01872, + "grad_norm": 1.9752707481384277, + "learning_rate": 9.992065930085564e-06, + "loss": 0.3992, + "step": 234 + }, + { + "epoch": 0.0188, + "grad_norm": 2.36694598197937, + "learning_rate": 9.991994951001616e-06, + "loss": 0.4147, + "step": 235 + }, + { + "epoch": 0.01888, + "grad_norm": 2.532259464263916, + "learning_rate": 9.99192365609085e-06, + "loss": 0.4949, + "step": 236 + }, + { + "epoch": 0.01896, + "grad_norm": 2.2116692066192627, + "learning_rate": 9.991852045357776e-06, + "loss": 0.3448, + "step": 237 + }, + { + "epoch": 0.01904, + "grad_norm": 1.5159705877304077, + "learning_rate": 9.991780118806927e-06, + "loss": 0.3287, + "step": 238 + }, + { + "epoch": 0.01912, + "grad_norm": 1.7897611856460571, + "learning_rate": 9.991707876442851e-06, + "loss": 0.331, + "step": 239 + }, + { + "epoch": 0.0192, + "grad_norm": 1.7827035188674927, + "learning_rate": 9.991635318270123e-06, + "loss": 0.3524, + "step": 240 + }, + { + "epoch": 0.01928, + "grad_norm": 2.0052402019500732, + "learning_rate": 9.991562444293328e-06, + "loss": 0.4706, + "step": 241 + }, + { + "epoch": 0.01936, + "grad_norm": 3.4093244075775146, + "learning_rate": 9.991489254517079e-06, + "loss": 0.4795, + "step": 242 + }, + { + "epoch": 0.01944, + "grad_norm": 1.9298174381256104, + "learning_rate": 9.991415748946007e-06, + "loss": 0.3301, + "step": 243 + }, + { + "epoch": 0.01952, + "grad_norm": 1.4808518886566162, + "learning_rate": 9.991341927584763e-06, + "loss": 0.3292, + "step": 244 + }, + { + "epoch": 0.0196, + "grad_norm": 1.9954057931900024, + "learning_rate": 9.991267790438016e-06, + "loss": 0.4463, + "step": 245 + }, + { + "epoch": 0.01968, + "grad_norm": 2.5434648990631104, + "learning_rate": 9.991193337510455e-06, + "loss": 0.4003, + "step": 246 + }, + { + "epoch": 0.01976, + "grad_norm": 1.7687448263168335, + "learning_rate": 9.991118568806794e-06, + "loss": 0.4522, + "step": 247 + }, + { + "epoch": 0.01984, + "grad_norm": 1.513317584991455, + "learning_rate": 9.99104348433176e-06, + "loss": 0.4053, + "step": 248 + }, + { + "epoch": 0.01992, + "grad_norm": 1.8949499130249023, + "learning_rate": 9.990968084090104e-06, + "loss": 0.3524, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 1.4182684421539307, + "learning_rate": 9.9908923680866e-06, + "loss": 0.3213, + "step": 250 + }, + { + "epoch": 0.02008, + "grad_norm": 2.0624146461486816, + "learning_rate": 9.990816336326034e-06, + "loss": 0.4941, + "step": 251 + }, + { + "epoch": 0.02016, + "grad_norm": 1.8073673248291016, + "learning_rate": 9.990739988813219e-06, + "loss": 0.342, + "step": 252 + }, + { + "epoch": 0.02024, + "grad_norm": 1.4276469945907593, + "learning_rate": 9.990663325552981e-06, + "loss": 0.2914, + "step": 253 + }, + { + "epoch": 0.02032, + "grad_norm": 1.8346103429794312, + "learning_rate": 9.990586346550175e-06, + "loss": 0.3763, + "step": 254 + }, + { + "epoch": 0.0204, + "grad_norm": 2.2009072303771973, + "learning_rate": 9.99050905180967e-06, + "loss": 0.5039, + "step": 255 + }, + { + "epoch": 0.02048, + "grad_norm": 1.2374415397644043, + "learning_rate": 9.990431441336354e-06, + "loss": 0.2549, + "step": 256 + }, + { + "epoch": 0.02056, + "grad_norm": 2.125685214996338, + "learning_rate": 9.99035351513514e-06, + "loss": 0.3265, + "step": 257 + }, + { + "epoch": 0.02064, + "grad_norm": 1.3601418733596802, + "learning_rate": 9.990275273210958e-06, + "loss": 0.3251, + "step": 258 + }, + { + "epoch": 0.02072, + "grad_norm": 1.8217127323150635, + "learning_rate": 9.990196715568755e-06, + "loss": 0.4395, + "step": 259 + }, + { + "epoch": 0.0208, + "grad_norm": 2.603867292404175, + "learning_rate": 9.990117842213504e-06, + "loss": 0.3981, + "step": 260 + }, + { + "epoch": 0.02088, + "grad_norm": 1.8914546966552734, + "learning_rate": 9.990038653150194e-06, + "loss": 0.3568, + "step": 261 + }, + { + "epoch": 0.02096, + "grad_norm": 1.4449388980865479, + "learning_rate": 9.989959148383834e-06, + "loss": 0.4019, + "step": 262 + }, + { + "epoch": 0.02104, + "grad_norm": 1.4248216152191162, + "learning_rate": 9.989879327919456e-06, + "loss": 0.3217, + "step": 263 + }, + { + "epoch": 0.02112, + "grad_norm": 2.652259588241577, + "learning_rate": 9.98979919176211e-06, + "loss": 0.4723, + "step": 264 + }, + { + "epoch": 0.0212, + "grad_norm": 1.64741849899292, + "learning_rate": 9.989718739916864e-06, + "loss": 0.3847, + "step": 265 + }, + { + "epoch": 0.02128, + "grad_norm": 1.9805387258529663, + "learning_rate": 9.989637972388809e-06, + "loss": 0.5401, + "step": 266 + }, + { + "epoch": 0.02136, + "grad_norm": 2.0374245643615723, + "learning_rate": 9.989556889183055e-06, + "loss": 0.4374, + "step": 267 + }, + { + "epoch": 0.02144, + "grad_norm": 1.9687358140945435, + "learning_rate": 9.989475490304732e-06, + "loss": 0.3703, + "step": 268 + }, + { + "epoch": 0.02152, + "grad_norm": 1.6070175170898438, + "learning_rate": 9.98939377575899e-06, + "loss": 0.2785, + "step": 269 + }, + { + "epoch": 0.0216, + "grad_norm": 2.3517184257507324, + "learning_rate": 9.989311745550997e-06, + "loss": 0.3411, + "step": 270 + }, + { + "epoch": 0.02168, + "grad_norm": 2.0133581161499023, + "learning_rate": 9.989229399685944e-06, + "loss": 0.408, + "step": 271 + }, + { + "epoch": 0.02176, + "grad_norm": 2.088970422744751, + "learning_rate": 9.989146738169042e-06, + "loss": 0.361, + "step": 272 + }, + { + "epoch": 0.02184, + "grad_norm": 2.6646909713745117, + "learning_rate": 9.98906376100552e-06, + "loss": 0.4573, + "step": 273 + }, + { + "epoch": 0.02192, + "grad_norm": 2.1084513664245605, + "learning_rate": 9.988980468200627e-06, + "loss": 0.4713, + "step": 274 + }, + { + "epoch": 0.022, + "grad_norm": 1.7543795108795166, + "learning_rate": 9.988896859759632e-06, + "loss": 0.3186, + "step": 275 + }, + { + "epoch": 0.02208, + "grad_norm": 1.4450194835662842, + "learning_rate": 9.988812935687826e-06, + "loss": 0.3079, + "step": 276 + }, + { + "epoch": 0.02216, + "grad_norm": 1.684277892112732, + "learning_rate": 9.988728695990518e-06, + "loss": 0.3892, + "step": 277 + }, + { + "epoch": 0.02224, + "grad_norm": 1.5748530626296997, + "learning_rate": 9.988644140673038e-06, + "loss": 0.3742, + "step": 278 + }, + { + "epoch": 0.02232, + "grad_norm": 1.5772830247879028, + "learning_rate": 9.988559269740736e-06, + "loss": 0.3195, + "step": 279 + }, + { + "epoch": 0.0224, + "grad_norm": 1.4628132581710815, + "learning_rate": 9.98847408319898e-06, + "loss": 0.3627, + "step": 280 + }, + { + "epoch": 0.02248, + "grad_norm": 2.2950448989868164, + "learning_rate": 9.98838858105316e-06, + "loss": 0.4156, + "step": 281 + }, + { + "epoch": 0.02256, + "grad_norm": 1.51676344871521, + "learning_rate": 9.988302763308686e-06, + "loss": 0.2407, + "step": 282 + }, + { + "epoch": 0.02264, + "grad_norm": 1.6948258876800537, + "learning_rate": 9.988216629970987e-06, + "loss": 0.3616, + "step": 283 + }, + { + "epoch": 0.02272, + "grad_norm": 1.9347060918807983, + "learning_rate": 9.988130181045512e-06, + "loss": 0.3056, + "step": 284 + }, + { + "epoch": 0.0228, + "grad_norm": 2.000277280807495, + "learning_rate": 9.988043416537731e-06, + "loss": 0.402, + "step": 285 + }, + { + "epoch": 0.02288, + "grad_norm": 1.702262043952942, + "learning_rate": 9.987956336453135e-06, + "loss": 0.358, + "step": 286 + }, + { + "epoch": 0.02296, + "grad_norm": 1.9157183170318604, + "learning_rate": 9.98786894079723e-06, + "loss": 0.414, + "step": 287 + }, + { + "epoch": 0.02304, + "grad_norm": 2.2817223072052, + "learning_rate": 9.987781229575547e-06, + "loss": 0.4701, + "step": 288 + }, + { + "epoch": 0.02312, + "grad_norm": 1.9925824403762817, + "learning_rate": 9.987693202793633e-06, + "loss": 0.3452, + "step": 289 + }, + { + "epoch": 0.0232, + "grad_norm": 1.446832299232483, + "learning_rate": 9.98760486045706e-06, + "loss": 0.3077, + "step": 290 + }, + { + "epoch": 0.02328, + "grad_norm": 1.2968581914901733, + "learning_rate": 9.987516202571417e-06, + "loss": 0.2845, + "step": 291 + }, + { + "epoch": 0.02336, + "grad_norm": 1.7567917108535767, + "learning_rate": 9.98742722914231e-06, + "loss": 0.4025, + "step": 292 + }, + { + "epoch": 0.02344, + "grad_norm": 1.824837327003479, + "learning_rate": 9.987337940175371e-06, + "loss": 0.3339, + "step": 293 + }, + { + "epoch": 0.02352, + "grad_norm": 1.6700366735458374, + "learning_rate": 9.98724833567625e-06, + "loss": 0.3338, + "step": 294 + }, + { + "epoch": 0.0236, + "grad_norm": 1.7511111497879028, + "learning_rate": 9.987158415650612e-06, + "loss": 0.4037, + "step": 295 + }, + { + "epoch": 0.02368, + "grad_norm": 1.5636446475982666, + "learning_rate": 9.987068180104148e-06, + "loss": 0.3424, + "step": 296 + }, + { + "epoch": 0.02376, + "grad_norm": 1.8241537809371948, + "learning_rate": 9.986977629042569e-06, + "loss": 0.5058, + "step": 297 + }, + { + "epoch": 0.02384, + "grad_norm": 2.1625335216522217, + "learning_rate": 9.986886762471601e-06, + "loss": 0.4052, + "step": 298 + }, + { + "epoch": 0.02392, + "grad_norm": 1.7909748554229736, + "learning_rate": 9.986795580396994e-06, + "loss": 0.3581, + "step": 299 + }, + { + "epoch": 0.024, + "grad_norm": 1.2463935613632202, + "learning_rate": 9.986704082824516e-06, + "loss": 0.2531, + "step": 300 + }, + { + "epoch": 0.02408, + "grad_norm": 1.689233422279358, + "learning_rate": 9.986612269759956e-06, + "loss": 0.4526, + "step": 301 + }, + { + "epoch": 0.02416, + "grad_norm": 2.223651170730591, + "learning_rate": 9.986520141209123e-06, + "loss": 0.5223, + "step": 302 + }, + { + "epoch": 0.02424, + "grad_norm": 2.412198066711426, + "learning_rate": 9.986427697177847e-06, + "loss": 0.4936, + "step": 303 + }, + { + "epoch": 0.02432, + "grad_norm": 1.578535795211792, + "learning_rate": 9.986334937671974e-06, + "loss": 0.3544, + "step": 304 + }, + { + "epoch": 0.0244, + "grad_norm": 1.6475830078125, + "learning_rate": 9.986241862697375e-06, + "loss": 0.3465, + "step": 305 + }, + { + "epoch": 0.02448, + "grad_norm": 1.4819910526275635, + "learning_rate": 9.986148472259935e-06, + "loss": 0.2884, + "step": 306 + }, + { + "epoch": 0.02456, + "grad_norm": 1.7936198711395264, + "learning_rate": 9.986054766365566e-06, + "loss": 0.5354, + "step": 307 + }, + { + "epoch": 0.02464, + "grad_norm": 1.8793686628341675, + "learning_rate": 9.985960745020195e-06, + "loss": 0.3592, + "step": 308 + }, + { + "epoch": 0.02472, + "grad_norm": 1.6589360237121582, + "learning_rate": 9.985866408229773e-06, + "loss": 0.3107, + "step": 309 + }, + { + "epoch": 0.0248, + "grad_norm": 1.6323789358139038, + "learning_rate": 9.985771756000264e-06, + "loss": 0.3276, + "step": 310 + }, + { + "epoch": 0.02488, + "grad_norm": 1.6884311437606812, + "learning_rate": 9.98567678833766e-06, + "loss": 0.3694, + "step": 311 + }, + { + "epoch": 0.02496, + "grad_norm": 1.9537749290466309, + "learning_rate": 9.985581505247966e-06, + "loss": 0.4009, + "step": 312 + }, + { + "epoch": 0.02504, + "grad_norm": 1.6910805702209473, + "learning_rate": 9.985485906737212e-06, + "loss": 0.3655, + "step": 313 + }, + { + "epoch": 0.02512, + "grad_norm": 1.6935604810714722, + "learning_rate": 9.985389992811447e-06, + "loss": 0.3912, + "step": 314 + }, + { + "epoch": 0.0252, + "grad_norm": 1.7497141361236572, + "learning_rate": 9.985293763476738e-06, + "loss": 0.3744, + "step": 315 + }, + { + "epoch": 0.02528, + "grad_norm": 1.6273237466812134, + "learning_rate": 9.985197218739173e-06, + "loss": 0.316, + "step": 316 + }, + { + "epoch": 0.02536, + "grad_norm": 1.482218623161316, + "learning_rate": 9.985100358604861e-06, + "loss": 0.3293, + "step": 317 + }, + { + "epoch": 0.02544, + "grad_norm": 1.6040552854537964, + "learning_rate": 9.985003183079929e-06, + "loss": 0.3666, + "step": 318 + }, + { + "epoch": 0.02552, + "grad_norm": 1.7107411623001099, + "learning_rate": 9.984905692170525e-06, + "loss": 0.3839, + "step": 319 + }, + { + "epoch": 0.0256, + "grad_norm": 1.7257567644119263, + "learning_rate": 9.984807885882819e-06, + "loss": 0.3729, + "step": 320 + }, + { + "epoch": 0.02568, + "grad_norm": 1.6172734498977661, + "learning_rate": 9.984709764222997e-06, + "loss": 0.3743, + "step": 321 + }, + { + "epoch": 0.02576, + "grad_norm": 1.929922342300415, + "learning_rate": 9.984611327197267e-06, + "loss": 0.3688, + "step": 322 + }, + { + "epoch": 0.02584, + "grad_norm": 1.99301278591156, + "learning_rate": 9.984512574811857e-06, + "loss": 0.3554, + "step": 323 + }, + { + "epoch": 0.02592, + "grad_norm": 1.3338180780410767, + "learning_rate": 9.984413507073014e-06, + "loss": 0.2592, + "step": 324 + }, + { + "epoch": 0.026, + "grad_norm": 1.341111183166504, + "learning_rate": 9.984314123987006e-06, + "loss": 0.2646, + "step": 325 + }, + { + "epoch": 0.02608, + "grad_norm": 1.6769309043884277, + "learning_rate": 9.984214425560122e-06, + "loss": 0.357, + "step": 326 + }, + { + "epoch": 0.02616, + "grad_norm": 2.0242223739624023, + "learning_rate": 9.984114411798667e-06, + "loss": 0.5107, + "step": 327 + }, + { + "epoch": 0.02624, + "grad_norm": 1.5169055461883545, + "learning_rate": 9.984014082708972e-06, + "loss": 0.3416, + "step": 328 + }, + { + "epoch": 0.02632, + "grad_norm": 1.7003381252288818, + "learning_rate": 9.983913438297381e-06, + "loss": 0.3307, + "step": 329 + }, + { + "epoch": 0.0264, + "grad_norm": 1.7793561220169067, + "learning_rate": 9.983812478570265e-06, + "loss": 0.3307, + "step": 330 + }, + { + "epoch": 0.02648, + "grad_norm": 1.37666916847229, + "learning_rate": 9.983711203534008e-06, + "loss": 0.2952, + "step": 331 + }, + { + "epoch": 0.02656, + "grad_norm": 1.3145544528961182, + "learning_rate": 9.983609613195018e-06, + "loss": 0.2823, + "step": 332 + }, + { + "epoch": 0.02664, + "grad_norm": 1.7312018871307373, + "learning_rate": 9.983507707559724e-06, + "loss": 0.3524, + "step": 333 + }, + { + "epoch": 0.02672, + "grad_norm": 1.6275979280471802, + "learning_rate": 9.983405486634572e-06, + "loss": 0.3636, + "step": 334 + }, + { + "epoch": 0.0268, + "grad_norm": 1.8315293788909912, + "learning_rate": 9.983302950426028e-06, + "loss": 0.3742, + "step": 335 + }, + { + "epoch": 0.02688, + "grad_norm": 2.2285656929016113, + "learning_rate": 9.983200098940582e-06, + "loss": 0.5228, + "step": 336 + }, + { + "epoch": 0.02696, + "grad_norm": 2.5498311519622803, + "learning_rate": 9.98309693218474e-06, + "loss": 0.3208, + "step": 337 + }, + { + "epoch": 0.02704, + "grad_norm": 1.8318697214126587, + "learning_rate": 9.982993450165028e-06, + "loss": 0.3807, + "step": 338 + }, + { + "epoch": 0.02712, + "grad_norm": 1.5511677265167236, + "learning_rate": 9.982889652887992e-06, + "loss": 0.3051, + "step": 339 + }, + { + "epoch": 0.0272, + "grad_norm": 1.9955275058746338, + "learning_rate": 9.982785540360202e-06, + "loss": 0.4089, + "step": 340 + }, + { + "epoch": 0.02728, + "grad_norm": 1.8035287857055664, + "learning_rate": 9.982681112588244e-06, + "loss": 0.4043, + "step": 341 + }, + { + "epoch": 0.02736, + "grad_norm": 1.9366258382797241, + "learning_rate": 9.982576369578724e-06, + "loss": 0.3673, + "step": 342 + }, + { + "epoch": 0.02744, + "grad_norm": 1.7964532375335693, + "learning_rate": 9.982471311338268e-06, + "loss": 0.3345, + "step": 343 + }, + { + "epoch": 0.02752, + "grad_norm": 1.6594544649124146, + "learning_rate": 9.982365937873523e-06, + "loss": 0.4043, + "step": 344 + }, + { + "epoch": 0.0276, + "grad_norm": 2.0171964168548584, + "learning_rate": 9.982260249191159e-06, + "loss": 0.4117, + "step": 345 + }, + { + "epoch": 0.02768, + "grad_norm": 1.921452283859253, + "learning_rate": 9.982154245297856e-06, + "loss": 0.4141, + "step": 346 + }, + { + "epoch": 0.02776, + "grad_norm": 1.8650532960891724, + "learning_rate": 9.982047926200327e-06, + "loss": 0.3625, + "step": 347 + }, + { + "epoch": 0.02784, + "grad_norm": 1.7884360551834106, + "learning_rate": 9.981941291905294e-06, + "loss": 0.3626, + "step": 348 + }, + { + "epoch": 0.02792, + "grad_norm": 1.35147225856781, + "learning_rate": 9.981834342419506e-06, + "loss": 0.2929, + "step": 349 + }, + { + "epoch": 0.028, + "grad_norm": 1.7383079528808594, + "learning_rate": 9.981727077749727e-06, + "loss": 0.378, + "step": 350 + }, + { + "epoch": 0.02808, + "grad_norm": 1.9508424997329712, + "learning_rate": 9.981619497902746e-06, + "loss": 0.3706, + "step": 351 + }, + { + "epoch": 0.02816, + "grad_norm": 2.229902744293213, + "learning_rate": 9.981511602885368e-06, + "loss": 0.4509, + "step": 352 + }, + { + "epoch": 0.02824, + "grad_norm": 1.4837419986724854, + "learning_rate": 9.981403392704419e-06, + "loss": 0.3169, + "step": 353 + }, + { + "epoch": 0.02832, + "grad_norm": 1.9721978902816772, + "learning_rate": 9.981294867366745e-06, + "loss": 0.4006, + "step": 354 + }, + { + "epoch": 0.0284, + "grad_norm": 1.4611529111862183, + "learning_rate": 9.981186026879212e-06, + "loss": 0.2671, + "step": 355 + }, + { + "epoch": 0.02848, + "grad_norm": 1.6954723596572876, + "learning_rate": 9.981076871248705e-06, + "loss": 0.3658, + "step": 356 + }, + { + "epoch": 0.02856, + "grad_norm": 1.6720842123031616, + "learning_rate": 9.980967400482134e-06, + "loss": 0.3438, + "step": 357 + }, + { + "epoch": 0.02864, + "grad_norm": 1.6820980310440063, + "learning_rate": 9.98085761458642e-06, + "loss": 0.3547, + "step": 358 + }, + { + "epoch": 0.02872, + "grad_norm": 1.71517813205719, + "learning_rate": 9.980747513568511e-06, + "loss": 0.3521, + "step": 359 + }, + { + "epoch": 0.0288, + "grad_norm": 1.6508595943450928, + "learning_rate": 9.980637097435372e-06, + "loss": 0.369, + "step": 360 + }, + { + "epoch": 0.02888, + "grad_norm": 1.8108190298080444, + "learning_rate": 9.98052636619399e-06, + "loss": 0.3725, + "step": 361 + }, + { + "epoch": 0.02896, + "grad_norm": 1.6566613912582397, + "learning_rate": 9.98041531985137e-06, + "loss": 0.4799, + "step": 362 + }, + { + "epoch": 0.02904, + "grad_norm": 1.784568428993225, + "learning_rate": 9.980303958414537e-06, + "loss": 0.284, + "step": 363 + }, + { + "epoch": 0.02912, + "grad_norm": 1.7334696054458618, + "learning_rate": 9.980192281890535e-06, + "loss": 0.3531, + "step": 364 + }, + { + "epoch": 0.0292, + "grad_norm": 1.55849289894104, + "learning_rate": 9.980080290286434e-06, + "loss": 0.3523, + "step": 365 + }, + { + "epoch": 0.02928, + "grad_norm": 1.8714348077774048, + "learning_rate": 9.979967983609313e-06, + "loss": 0.315, + "step": 366 + }, + { + "epoch": 0.02936, + "grad_norm": 1.5885354280471802, + "learning_rate": 9.979855361866283e-06, + "loss": 0.3596, + "step": 367 + }, + { + "epoch": 0.02944, + "grad_norm": 1.5087720155715942, + "learning_rate": 9.979742425064467e-06, + "loss": 0.4048, + "step": 368 + }, + { + "epoch": 0.02952, + "grad_norm": 1.7668178081512451, + "learning_rate": 9.97962917321101e-06, + "loss": 0.3881, + "step": 369 + }, + { + "epoch": 0.0296, + "grad_norm": 1.5784231424331665, + "learning_rate": 9.979515606313074e-06, + "loss": 0.3335, + "step": 370 + }, + { + "epoch": 0.02968, + "grad_norm": 2.16629695892334, + "learning_rate": 9.97940172437785e-06, + "loss": 0.5286, + "step": 371 + }, + { + "epoch": 0.02976, + "grad_norm": 2.021318197250366, + "learning_rate": 9.979287527412541e-06, + "loss": 0.3768, + "step": 372 + }, + { + "epoch": 0.02984, + "grad_norm": 2.232651710510254, + "learning_rate": 9.979173015424369e-06, + "loss": 0.517, + "step": 373 + }, + { + "epoch": 0.02992, + "grad_norm": 1.7764800786972046, + "learning_rate": 9.979058188420581e-06, + "loss": 0.3594, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 1.5262402296066284, + "learning_rate": 9.978943046408442e-06, + "loss": 0.3362, + "step": 375 + }, + { + "epoch": 0.03008, + "grad_norm": 1.5017170906066895, + "learning_rate": 9.978827589395238e-06, + "loss": 0.3163, + "step": 376 + }, + { + "epoch": 0.03016, + "grad_norm": 1.7568035125732422, + "learning_rate": 9.978711817388266e-06, + "loss": 0.3264, + "step": 377 + }, + { + "epoch": 0.03024, + "grad_norm": 1.738759160041809, + "learning_rate": 9.978595730394861e-06, + "loss": 0.4094, + "step": 378 + }, + { + "epoch": 0.03032, + "grad_norm": 1.2938789129257202, + "learning_rate": 9.978479328422362e-06, + "loss": 0.2659, + "step": 379 + }, + { + "epoch": 0.0304, + "grad_norm": 1.6452093124389648, + "learning_rate": 9.978362611478132e-06, + "loss": 0.3264, + "step": 380 + }, + { + "epoch": 0.03048, + "grad_norm": 1.610556721687317, + "learning_rate": 9.978245579569558e-06, + "loss": 0.3351, + "step": 381 + }, + { + "epoch": 0.03056, + "grad_norm": 1.4417246580123901, + "learning_rate": 9.978128232704044e-06, + "loss": 0.3223, + "step": 382 + }, + { + "epoch": 0.03064, + "grad_norm": 1.9182772636413574, + "learning_rate": 9.978010570889013e-06, + "loss": 0.3542, + "step": 383 + }, + { + "epoch": 0.03072, + "grad_norm": 2.3410074710845947, + "learning_rate": 9.97789259413191e-06, + "loss": 0.3877, + "step": 384 + }, + { + "epoch": 0.0308, + "grad_norm": 1.9133309125900269, + "learning_rate": 9.977774302440199e-06, + "loss": 0.43, + "step": 385 + }, + { + "epoch": 0.03088, + "grad_norm": 1.8875223398208618, + "learning_rate": 9.977655695821362e-06, + "loss": 0.4885, + "step": 386 + }, + { + "epoch": 0.03096, + "grad_norm": 1.7378370761871338, + "learning_rate": 9.977536774282906e-06, + "loss": 0.4246, + "step": 387 + }, + { + "epoch": 0.03104, + "grad_norm": 1.8936777114868164, + "learning_rate": 9.977417537832352e-06, + "loss": 0.3571, + "step": 388 + }, + { + "epoch": 0.03112, + "grad_norm": 1.7290297746658325, + "learning_rate": 9.977297986477246e-06, + "loss": 0.3553, + "step": 389 + }, + { + "epoch": 0.0312, + "grad_norm": 1.4241451025009155, + "learning_rate": 9.977178120225151e-06, + "loss": 0.2923, + "step": 390 + }, + { + "epoch": 0.03128, + "grad_norm": 1.917623519897461, + "learning_rate": 9.977057939083648e-06, + "loss": 0.3546, + "step": 391 + }, + { + "epoch": 0.03136, + "grad_norm": 1.762268304824829, + "learning_rate": 9.976937443060343e-06, + "loss": 0.386, + "step": 392 + }, + { + "epoch": 0.03144, + "grad_norm": 1.6032752990722656, + "learning_rate": 9.97681663216286e-06, + "loss": 0.3461, + "step": 393 + }, + { + "epoch": 0.03152, + "grad_norm": 1.9459599256515503, + "learning_rate": 9.97669550639884e-06, + "loss": 0.4753, + "step": 394 + }, + { + "epoch": 0.0316, + "grad_norm": 2.0758650302886963, + "learning_rate": 9.97657406577595e-06, + "loss": 0.4122, + "step": 395 + }, + { + "epoch": 0.03168, + "grad_norm": 1.4958903789520264, + "learning_rate": 9.976452310301867e-06, + "loss": 0.3139, + "step": 396 + }, + { + "epoch": 0.03176, + "grad_norm": 1.7065391540527344, + "learning_rate": 9.9763302399843e-06, + "loss": 0.3258, + "step": 397 + }, + { + "epoch": 0.03184, + "grad_norm": 1.7780778408050537, + "learning_rate": 9.976207854830968e-06, + "loss": 0.3458, + "step": 398 + }, + { + "epoch": 0.03192, + "grad_norm": 1.318264365196228, + "learning_rate": 9.976085154849617e-06, + "loss": 0.3148, + "step": 399 + }, + { + "epoch": 0.032, + "grad_norm": 2.011800527572632, + "learning_rate": 9.975962140048007e-06, + "loss": 0.3416, + "step": 400 + }, + { + "epoch": 0.03208, + "grad_norm": 1.5037428140640259, + "learning_rate": 9.975838810433922e-06, + "loss": 0.3242, + "step": 401 + }, + { + "epoch": 0.03216, + "grad_norm": 1.8600457906723022, + "learning_rate": 9.975715166015165e-06, + "loss": 0.3643, + "step": 402 + }, + { + "epoch": 0.03224, + "grad_norm": 1.61443293094635, + "learning_rate": 9.975591206799559e-06, + "loss": 0.357, + "step": 403 + }, + { + "epoch": 0.03232, + "grad_norm": 1.9129692316055298, + "learning_rate": 9.975466932794943e-06, + "loss": 0.4214, + "step": 404 + }, + { + "epoch": 0.0324, + "grad_norm": 1.9851549863815308, + "learning_rate": 9.975342344009186e-06, + "loss": 0.4749, + "step": 405 + }, + { + "epoch": 0.03248, + "grad_norm": 1.711588978767395, + "learning_rate": 9.975217440450164e-06, + "loss": 0.32, + "step": 406 + }, + { + "epoch": 0.03256, + "grad_norm": 1.4885379076004028, + "learning_rate": 9.975092222125783e-06, + "loss": 0.3246, + "step": 407 + }, + { + "epoch": 0.03264, + "grad_norm": 1.6126595735549927, + "learning_rate": 9.974966689043963e-06, + "loss": 0.3292, + "step": 408 + }, + { + "epoch": 0.03272, + "grad_norm": 1.5870492458343506, + "learning_rate": 9.974840841212648e-06, + "loss": 0.3787, + "step": 409 + }, + { + "epoch": 0.0328, + "grad_norm": 1.4002914428710938, + "learning_rate": 9.974714678639797e-06, + "loss": 0.2722, + "step": 410 + }, + { + "epoch": 0.03288, + "grad_norm": 2.1911635398864746, + "learning_rate": 9.974588201333394e-06, + "loss": 0.4562, + "step": 411 + }, + { + "epoch": 0.03296, + "grad_norm": 1.665370225906372, + "learning_rate": 9.974461409301442e-06, + "loss": 0.4463, + "step": 412 + }, + { + "epoch": 0.03304, + "grad_norm": 4.2547736167907715, + "learning_rate": 9.97433430255196e-06, + "loss": 0.4829, + "step": 413 + }, + { + "epoch": 0.03312, + "grad_norm": 2.003357410430908, + "learning_rate": 9.97420688109299e-06, + "loss": 0.2167, + "step": 414 + }, + { + "epoch": 0.0332, + "grad_norm": 5.256060600280762, + "learning_rate": 9.974079144932596e-06, + "loss": 0.3008, + "step": 415 + }, + { + "epoch": 0.03328, + "grad_norm": 2.828409433364868, + "learning_rate": 9.973951094078857e-06, + "loss": 0.3806, + "step": 416 + }, + { + "epoch": 0.03336, + "grad_norm": 3.722662925720215, + "learning_rate": 9.973822728539876e-06, + "loss": 0.4364, + "step": 417 + }, + { + "epoch": 0.03344, + "grad_norm": 2.073556900024414, + "learning_rate": 9.973694048323773e-06, + "loss": 0.3991, + "step": 418 + }, + { + "epoch": 0.03352, + "grad_norm": 1.1159542798995972, + "learning_rate": 9.97356505343869e-06, + "loss": 0.2556, + "step": 419 + }, + { + "epoch": 0.0336, + "grad_norm": 1.9902396202087402, + "learning_rate": 9.973435743892787e-06, + "loss": 0.3929, + "step": 420 + }, + { + "epoch": 0.03368, + "grad_norm": 1.4526017904281616, + "learning_rate": 9.973306119694246e-06, + "loss": 0.3209, + "step": 421 + }, + { + "epoch": 0.03376, + "grad_norm": 1.8279871940612793, + "learning_rate": 9.973176180851267e-06, + "loss": 0.4486, + "step": 422 + }, + { + "epoch": 0.03384, + "grad_norm": 1.1028552055358887, + "learning_rate": 9.973045927372071e-06, + "loss": 0.2656, + "step": 423 + }, + { + "epoch": 0.03392, + "grad_norm": 1.7597603797912598, + "learning_rate": 9.972915359264901e-06, + "loss": 0.3717, + "step": 424 + }, + { + "epoch": 0.034, + "grad_norm": 1.9462575912475586, + "learning_rate": 9.972784476538014e-06, + "loss": 0.3828, + "step": 425 + }, + { + "epoch": 0.03408, + "grad_norm": 2.132319927215576, + "learning_rate": 9.972653279199693e-06, + "loss": 0.4109, + "step": 426 + }, + { + "epoch": 0.03416, + "grad_norm": 1.7720433473587036, + "learning_rate": 9.972521767258236e-06, + "loss": 0.3781, + "step": 427 + }, + { + "epoch": 0.03424, + "grad_norm": 1.556762933731079, + "learning_rate": 9.972389940721967e-06, + "loss": 0.3173, + "step": 428 + }, + { + "epoch": 0.03432, + "grad_norm": 1.8000061511993408, + "learning_rate": 9.972257799599223e-06, + "loss": 0.4679, + "step": 429 + }, + { + "epoch": 0.0344, + "grad_norm": 1.4809861183166504, + "learning_rate": 9.972125343898366e-06, + "loss": 0.3235, + "step": 430 + }, + { + "epoch": 0.03448, + "grad_norm": 1.4493865966796875, + "learning_rate": 9.971992573627775e-06, + "loss": 0.2701, + "step": 431 + }, + { + "epoch": 0.03456, + "grad_norm": 1.4800419807434082, + "learning_rate": 9.971859488795849e-06, + "loss": 0.2702, + "step": 432 + }, + { + "epoch": 0.03464, + "grad_norm": 1.9442113637924194, + "learning_rate": 9.97172608941101e-06, + "loss": 0.372, + "step": 433 + }, + { + "epoch": 0.03472, + "grad_norm": 2.0416910648345947, + "learning_rate": 9.971592375481697e-06, + "loss": 0.4425, + "step": 434 + }, + { + "epoch": 0.0348, + "grad_norm": 1.7712527513504028, + "learning_rate": 9.971458347016369e-06, + "loss": 0.4006, + "step": 435 + }, + { + "epoch": 0.03488, + "grad_norm": 1.808829426765442, + "learning_rate": 9.971324004023506e-06, + "loss": 0.3774, + "step": 436 + }, + { + "epoch": 0.03496, + "grad_norm": 1.4706016778945923, + "learning_rate": 9.971189346511608e-06, + "loss": 0.3434, + "step": 437 + }, + { + "epoch": 0.03504, + "grad_norm": 1.7320775985717773, + "learning_rate": 9.971054374489193e-06, + "loss": 0.3452, + "step": 438 + }, + { + "epoch": 0.03512, + "grad_norm": 1.6511974334716797, + "learning_rate": 9.9709190879648e-06, + "loss": 0.3614, + "step": 439 + }, + { + "epoch": 0.0352, + "grad_norm": 1.4348496198654175, + "learning_rate": 9.970783486946991e-06, + "loss": 0.3445, + "step": 440 + }, + { + "epoch": 0.03528, + "grad_norm": 1.5991486310958862, + "learning_rate": 9.970647571444341e-06, + "loss": 0.4998, + "step": 441 + }, + { + "epoch": 0.03536, + "grad_norm": 1.3900196552276611, + "learning_rate": 9.970511341465453e-06, + "loss": 0.3307, + "step": 442 + }, + { + "epoch": 0.03544, + "grad_norm": 1.249131202697754, + "learning_rate": 9.970374797018942e-06, + "loss": 0.2656, + "step": 443 + }, + { + "epoch": 0.03552, + "grad_norm": 1.7488571405410767, + "learning_rate": 9.97023793811345e-06, + "loss": 0.3783, + "step": 444 + }, + { + "epoch": 0.0356, + "grad_norm": 1.5800598859786987, + "learning_rate": 9.970100764757635e-06, + "loss": 0.2901, + "step": 445 + }, + { + "epoch": 0.03568, + "grad_norm": 1.4809678792953491, + "learning_rate": 9.969963276960173e-06, + "loss": 0.3579, + "step": 446 + }, + { + "epoch": 0.03576, + "grad_norm": 1.9391019344329834, + "learning_rate": 9.969825474729763e-06, + "loss": 0.3893, + "step": 447 + }, + { + "epoch": 0.03584, + "grad_norm": 1.4065037965774536, + "learning_rate": 9.969687358075126e-06, + "loss": 0.3532, + "step": 448 + }, + { + "epoch": 0.03592, + "grad_norm": 1.735000729560852, + "learning_rate": 9.969548927004998e-06, + "loss": 0.3295, + "step": 449 + }, + { + "epoch": 0.036, + "grad_norm": 1.9082682132720947, + "learning_rate": 9.969410181528138e-06, + "loss": 0.3841, + "step": 450 + }, + { + "epoch": 0.03608, + "grad_norm": 1.740655779838562, + "learning_rate": 9.969271121653323e-06, + "loss": 0.3545, + "step": 451 + }, + { + "epoch": 0.03616, + "grad_norm": 2.0228629112243652, + "learning_rate": 9.96913174738935e-06, + "loss": 0.4113, + "step": 452 + }, + { + "epoch": 0.03624, + "grad_norm": 1.7781933546066284, + "learning_rate": 9.96899205874504e-06, + "loss": 0.4382, + "step": 453 + }, + { + "epoch": 0.03632, + "grad_norm": 2.111398696899414, + "learning_rate": 9.968852055729229e-06, + "loss": 0.416, + "step": 454 + }, + { + "epoch": 0.0364, + "grad_norm": 1.8158979415893555, + "learning_rate": 9.968711738350773e-06, + "loss": 0.486, + "step": 455 + }, + { + "epoch": 0.03648, + "grad_norm": 1.4151747226715088, + "learning_rate": 9.968571106618551e-06, + "loss": 0.3059, + "step": 456 + }, + { + "epoch": 0.03656, + "grad_norm": 1.4982035160064697, + "learning_rate": 9.968430160541461e-06, + "loss": 0.282, + "step": 457 + }, + { + "epoch": 0.03664, + "grad_norm": 1.3661582469940186, + "learning_rate": 9.96828890012842e-06, + "loss": 0.3013, + "step": 458 + }, + { + "epoch": 0.03672, + "grad_norm": 1.747228980064392, + "learning_rate": 9.968147325388363e-06, + "loss": 0.3509, + "step": 459 + }, + { + "epoch": 0.0368, + "grad_norm": 1.706491231918335, + "learning_rate": 9.968005436330246e-06, + "loss": 0.4495, + "step": 460 + }, + { + "epoch": 0.03688, + "grad_norm": 1.7066677808761597, + "learning_rate": 9.96786323296305e-06, + "loss": 0.3797, + "step": 461 + }, + { + "epoch": 0.03696, + "grad_norm": 2.126333475112915, + "learning_rate": 9.96772071529577e-06, + "loss": 0.4699, + "step": 462 + }, + { + "epoch": 0.03704, + "grad_norm": 2.0231945514678955, + "learning_rate": 9.967577883337421e-06, + "loss": 0.4655, + "step": 463 + }, + { + "epoch": 0.03712, + "grad_norm": 1.7993510961532593, + "learning_rate": 9.967434737097043e-06, + "loss": 0.3497, + "step": 464 + }, + { + "epoch": 0.0372, + "grad_norm": 2.108503580093384, + "learning_rate": 9.967291276583688e-06, + "loss": 0.3369, + "step": 465 + }, + { + "epoch": 0.03728, + "grad_norm": 1.6675549745559692, + "learning_rate": 9.967147501806436e-06, + "loss": 0.4097, + "step": 466 + }, + { + "epoch": 0.03736, + "grad_norm": 1.325800895690918, + "learning_rate": 9.967003412774381e-06, + "loss": 0.2744, + "step": 467 + }, + { + "epoch": 0.03744, + "grad_norm": 1.9661016464233398, + "learning_rate": 9.966859009496641e-06, + "loss": 0.3642, + "step": 468 + }, + { + "epoch": 0.03752, + "grad_norm": 1.5406103134155273, + "learning_rate": 9.966714291982349e-06, + "loss": 0.2844, + "step": 469 + }, + { + "epoch": 0.0376, + "grad_norm": 1.752842903137207, + "learning_rate": 9.966569260240664e-06, + "loss": 0.5024, + "step": 470 + }, + { + "epoch": 0.03768, + "grad_norm": 1.2587329149246216, + "learning_rate": 9.966423914280758e-06, + "loss": 0.2523, + "step": 471 + }, + { + "epoch": 0.03776, + "grad_norm": 1.629905104637146, + "learning_rate": 9.96627825411183e-06, + "loss": 0.3229, + "step": 472 + }, + { + "epoch": 0.03784, + "grad_norm": 1.8891979455947876, + "learning_rate": 9.966132279743095e-06, + "loss": 0.3933, + "step": 473 + }, + { + "epoch": 0.03792, + "grad_norm": 1.3782880306243896, + "learning_rate": 9.965985991183787e-06, + "loss": 0.3091, + "step": 474 + }, + { + "epoch": 0.038, + "grad_norm": 1.4807788133621216, + "learning_rate": 9.96583938844316e-06, + "loss": 0.2939, + "step": 475 + }, + { + "epoch": 0.03808, + "grad_norm": 2.111734390258789, + "learning_rate": 9.965692471530492e-06, + "loss": 0.3984, + "step": 476 + }, + { + "epoch": 0.03816, + "grad_norm": 1.6450163125991821, + "learning_rate": 9.965545240455077e-06, + "loss": 0.3763, + "step": 477 + }, + { + "epoch": 0.03824, + "grad_norm": 1.8153711557388306, + "learning_rate": 9.965397695226228e-06, + "loss": 0.5331, + "step": 478 + }, + { + "epoch": 0.03832, + "grad_norm": 1.3268543481826782, + "learning_rate": 9.965249835853281e-06, + "loss": 0.3017, + "step": 479 + }, + { + "epoch": 0.0384, + "grad_norm": 1.573397159576416, + "learning_rate": 9.96510166234559e-06, + "loss": 0.3174, + "step": 480 + }, + { + "epoch": 0.03848, + "grad_norm": 1.8132137060165405, + "learning_rate": 9.964953174712533e-06, + "loss": 0.3517, + "step": 481 + }, + { + "epoch": 0.03856, + "grad_norm": 1.5276272296905518, + "learning_rate": 9.9648043729635e-06, + "loss": 0.3427, + "step": 482 + }, + { + "epoch": 0.03864, + "grad_norm": 1.4888263940811157, + "learning_rate": 9.964655257107906e-06, + "loss": 0.3782, + "step": 483 + }, + { + "epoch": 0.03872, + "grad_norm": 1.5089031457901, + "learning_rate": 9.964505827155186e-06, + "loss": 0.3162, + "step": 484 + }, + { + "epoch": 0.0388, + "grad_norm": 1.7417253255844116, + "learning_rate": 9.964356083114795e-06, + "loss": 0.4875, + "step": 485 + }, + { + "epoch": 0.03888, + "grad_norm": 1.554205298423767, + "learning_rate": 9.964206024996203e-06, + "loss": 0.3795, + "step": 486 + }, + { + "epoch": 0.03896, + "grad_norm": 2.758025884628296, + "learning_rate": 9.964055652808908e-06, + "loss": 0.5799, + "step": 487 + }, + { + "epoch": 0.03904, + "grad_norm": 1.5787456035614014, + "learning_rate": 9.96390496656242e-06, + "loss": 0.3235, + "step": 488 + }, + { + "epoch": 0.03912, + "grad_norm": 1.6550320386886597, + "learning_rate": 9.963753966266276e-06, + "loss": 0.3013, + "step": 489 + }, + { + "epoch": 0.0392, + "grad_norm": 1.0665961503982544, + "learning_rate": 9.963602651930027e-06, + "loss": 0.2318, + "step": 490 + }, + { + "epoch": 0.03928, + "grad_norm": 1.4649837017059326, + "learning_rate": 9.963451023563245e-06, + "loss": 0.3583, + "step": 491 + }, + { + "epoch": 0.03936, + "grad_norm": 1.462937831878662, + "learning_rate": 9.963299081175525e-06, + "loss": 0.308, + "step": 492 + }, + { + "epoch": 0.03944, + "grad_norm": 1.6257414817810059, + "learning_rate": 9.96314682477648e-06, + "loss": 0.3596, + "step": 493 + }, + { + "epoch": 0.03952, + "grad_norm": 2.3487870693206787, + "learning_rate": 9.962994254375742e-06, + "loss": 0.4331, + "step": 494 + }, + { + "epoch": 0.0396, + "grad_norm": 2.4731879234313965, + "learning_rate": 9.962841369982962e-06, + "loss": 0.4484, + "step": 495 + }, + { + "epoch": 0.03968, + "grad_norm": 1.8673752546310425, + "learning_rate": 9.962688171607817e-06, + "loss": 0.3284, + "step": 496 + }, + { + "epoch": 0.03976, + "grad_norm": 1.7251307964324951, + "learning_rate": 9.962534659259995e-06, + "loss": 0.2836, + "step": 497 + }, + { + "epoch": 0.03984, + "grad_norm": 2.203010320663452, + "learning_rate": 9.96238083294921e-06, + "loss": 0.3871, + "step": 498 + }, + { + "epoch": 0.03992, + "grad_norm": 1.6219152212142944, + "learning_rate": 9.962226692685195e-06, + "loss": 0.333, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 1.6756365299224854, + "learning_rate": 9.962072238477699e-06, + "loss": 0.2864, + "step": 500 + }, + { + "epoch": 0.04008, + "grad_norm": 1.6424753665924072, + "learning_rate": 9.961917470336496e-06, + "loss": 0.3253, + "step": 501 + }, + { + "epoch": 0.04016, + "grad_norm": 1.5344758033752441, + "learning_rate": 9.961762388271378e-06, + "loss": 0.3293, + "step": 502 + }, + { + "epoch": 0.04024, + "grad_norm": 1.1364933252334595, + "learning_rate": 9.961606992292155e-06, + "loss": 0.2658, + "step": 503 + }, + { + "epoch": 0.04032, + "grad_norm": 1.397683024406433, + "learning_rate": 9.96145128240866e-06, + "loss": 0.2682, + "step": 504 + }, + { + "epoch": 0.0404, + "grad_norm": 1.7218232154846191, + "learning_rate": 9.961295258630743e-06, + "loss": 0.3327, + "step": 505 + }, + { + "epoch": 0.04048, + "grad_norm": 1.204432487487793, + "learning_rate": 9.961138920968276e-06, + "loss": 0.2728, + "step": 506 + }, + { + "epoch": 0.04056, + "grad_norm": 1.273925542831421, + "learning_rate": 9.960982269431149e-06, + "loss": 0.2972, + "step": 507 + }, + { + "epoch": 0.04064, + "grad_norm": 1.682013750076294, + "learning_rate": 9.960825304029274e-06, + "loss": 0.3276, + "step": 508 + }, + { + "epoch": 0.04072, + "grad_norm": 1.4792194366455078, + "learning_rate": 9.96066802477258e-06, + "loss": 0.4057, + "step": 509 + }, + { + "epoch": 0.0408, + "grad_norm": 2.17596173286438, + "learning_rate": 9.96051043167102e-06, + "loss": 0.5181, + "step": 510 + }, + { + "epoch": 0.04088, + "grad_norm": 1.6542383432388306, + "learning_rate": 9.960352524734562e-06, + "loss": 0.3988, + "step": 511 + }, + { + "epoch": 0.04096, + "grad_norm": 1.9632936716079712, + "learning_rate": 9.960194303973196e-06, + "loss": 0.3267, + "step": 512 + }, + { + "epoch": 0.04104, + "grad_norm": 1.4410243034362793, + "learning_rate": 9.960035769396935e-06, + "loss": 0.2683, + "step": 513 + }, + { + "epoch": 0.04112, + "grad_norm": 1.5401276350021362, + "learning_rate": 9.959876921015805e-06, + "loss": 0.2925, + "step": 514 + }, + { + "epoch": 0.0412, + "grad_norm": 1.6564226150512695, + "learning_rate": 9.95971775883986e-06, + "loss": 0.33, + "step": 515 + }, + { + "epoch": 0.04128, + "grad_norm": 1.4229389429092407, + "learning_rate": 9.959558282879167e-06, + "loss": 0.2565, + "step": 516 + }, + { + "epoch": 0.04136, + "grad_norm": 1.524813175201416, + "learning_rate": 9.959398493143816e-06, + "loss": 0.3145, + "step": 517 + }, + { + "epoch": 0.04144, + "grad_norm": 1.274613380432129, + "learning_rate": 9.959238389643918e-06, + "loss": 0.2743, + "step": 518 + }, + { + "epoch": 0.04152, + "grad_norm": 1.9126328229904175, + "learning_rate": 9.9590779723896e-06, + "loss": 0.3763, + "step": 519 + }, + { + "epoch": 0.0416, + "grad_norm": 1.5458030700683594, + "learning_rate": 9.95891724139101e-06, + "loss": 0.313, + "step": 520 + }, + { + "epoch": 0.04168, + "grad_norm": 1.80939781665802, + "learning_rate": 9.958756196658321e-06, + "loss": 0.3907, + "step": 521 + }, + { + "epoch": 0.04176, + "grad_norm": 1.551069974899292, + "learning_rate": 9.958594838201719e-06, + "loss": 0.3031, + "step": 522 + }, + { + "epoch": 0.04184, + "grad_norm": 1.9089337587356567, + "learning_rate": 9.958433166031412e-06, + "loss": 0.3787, + "step": 523 + }, + { + "epoch": 0.04192, + "grad_norm": 1.5611118078231812, + "learning_rate": 9.958271180157631e-06, + "loss": 0.3589, + "step": 524 + }, + { + "epoch": 0.042, + "grad_norm": 1.8825455904006958, + "learning_rate": 9.958108880590623e-06, + "loss": 0.4088, + "step": 525 + }, + { + "epoch": 0.04208, + "grad_norm": 1.9425339698791504, + "learning_rate": 9.957946267340655e-06, + "loss": 0.2773, + "step": 526 + }, + { + "epoch": 0.04216, + "grad_norm": 1.547389030456543, + "learning_rate": 9.957783340418015e-06, + "loss": 0.2913, + "step": 527 + }, + { + "epoch": 0.04224, + "grad_norm": 1.622795820236206, + "learning_rate": 9.957620099833013e-06, + "loss": 0.3456, + "step": 528 + }, + { + "epoch": 0.04232, + "grad_norm": 1.9623744487762451, + "learning_rate": 9.957456545595977e-06, + "loss": 0.3924, + "step": 529 + }, + { + "epoch": 0.0424, + "grad_norm": 1.489332675933838, + "learning_rate": 9.95729267771725e-06, + "loss": 0.3155, + "step": 530 + }, + { + "epoch": 0.04248, + "grad_norm": 1.6495463848114014, + "learning_rate": 9.957128496207205e-06, + "loss": 0.3544, + "step": 531 + }, + { + "epoch": 0.04256, + "grad_norm": 1.5633338689804077, + "learning_rate": 9.956964001076224e-06, + "loss": 0.3408, + "step": 532 + }, + { + "epoch": 0.04264, + "grad_norm": 2.2354142665863037, + "learning_rate": 9.95679919233472e-06, + "loss": 0.4222, + "step": 533 + }, + { + "epoch": 0.04272, + "grad_norm": 1.6407157182693481, + "learning_rate": 9.956634069993114e-06, + "loss": 0.4222, + "step": 534 + }, + { + "epoch": 0.0428, + "grad_norm": 1.805367112159729, + "learning_rate": 9.956468634061857e-06, + "loss": 0.3485, + "step": 535 + }, + { + "epoch": 0.04288, + "grad_norm": 1.9591755867004395, + "learning_rate": 9.956302884551413e-06, + "loss": 0.397, + "step": 536 + }, + { + "epoch": 0.04296, + "grad_norm": 2.0101656913757324, + "learning_rate": 9.956136821472269e-06, + "loss": 0.5719, + "step": 537 + }, + { + "epoch": 0.04304, + "grad_norm": 2.0787875652313232, + "learning_rate": 9.955970444834933e-06, + "loss": 0.4123, + "step": 538 + }, + { + "epoch": 0.04312, + "grad_norm": 1.7366102933883667, + "learning_rate": 9.955803754649929e-06, + "loss": 0.3547, + "step": 539 + }, + { + "epoch": 0.0432, + "grad_norm": 1.8972814083099365, + "learning_rate": 9.955636750927803e-06, + "loss": 0.3922, + "step": 540 + }, + { + "epoch": 0.04328, + "grad_norm": 1.79804265499115, + "learning_rate": 9.95546943367912e-06, + "loss": 0.4213, + "step": 541 + }, + { + "epoch": 0.04336, + "grad_norm": 1.6599233150482178, + "learning_rate": 9.955301802914471e-06, + "loss": 0.3559, + "step": 542 + }, + { + "epoch": 0.04344, + "grad_norm": 1.4300049543380737, + "learning_rate": 9.955133858644455e-06, + "loss": 0.3159, + "step": 543 + }, + { + "epoch": 0.04352, + "grad_norm": 1.2100526094436646, + "learning_rate": 9.9549656008797e-06, + "loss": 0.2633, + "step": 544 + }, + { + "epoch": 0.0436, + "grad_norm": 1.8038049936294556, + "learning_rate": 9.95479702963085e-06, + "loss": 0.4075, + "step": 545 + }, + { + "epoch": 0.04368, + "grad_norm": 2.2004456520080566, + "learning_rate": 9.954628144908573e-06, + "loss": 0.516, + "step": 546 + }, + { + "epoch": 0.04376, + "grad_norm": 1.4208917617797852, + "learning_rate": 9.95445894672355e-06, + "loss": 0.3567, + "step": 547 + }, + { + "epoch": 0.04384, + "grad_norm": 1.8718032836914062, + "learning_rate": 9.954289435086487e-06, + "loss": 0.4532, + "step": 548 + }, + { + "epoch": 0.04392, + "grad_norm": 1.7566274404525757, + "learning_rate": 9.954119610008108e-06, + "loss": 0.3248, + "step": 549 + }, + { + "epoch": 0.044, + "grad_norm": 1.7727574110031128, + "learning_rate": 9.953949471499157e-06, + "loss": 0.3495, + "step": 550 + }, + { + "epoch": 0.04408, + "grad_norm": 1.5641764402389526, + "learning_rate": 9.953779019570402e-06, + "loss": 0.3757, + "step": 551 + }, + { + "epoch": 0.04416, + "grad_norm": 1.4524203538894653, + "learning_rate": 9.953608254232622e-06, + "loss": 0.3353, + "step": 552 + }, + { + "epoch": 0.04424, + "grad_norm": 1.6970480680465698, + "learning_rate": 9.953437175496622e-06, + "loss": 0.3662, + "step": 553 + }, + { + "epoch": 0.04432, + "grad_norm": 2.238062858581543, + "learning_rate": 9.953265783373227e-06, + "loss": 0.4972, + "step": 554 + }, + { + "epoch": 0.0444, + "grad_norm": 1.676147699356079, + "learning_rate": 9.95309407787328e-06, + "loss": 0.3475, + "step": 555 + }, + { + "epoch": 0.04448, + "grad_norm": 1.4633108377456665, + "learning_rate": 9.952922059007643e-06, + "loss": 0.3406, + "step": 556 + }, + { + "epoch": 0.04456, + "grad_norm": 1.5241601467132568, + "learning_rate": 9.952749726787201e-06, + "loss": 0.2829, + "step": 557 + }, + { + "epoch": 0.04464, + "grad_norm": 2.0277199745178223, + "learning_rate": 9.952577081222854e-06, + "loss": 0.3451, + "step": 558 + }, + { + "epoch": 0.04472, + "grad_norm": 1.724655032157898, + "learning_rate": 9.952404122325528e-06, + "loss": 0.4065, + "step": 559 + }, + { + "epoch": 0.0448, + "grad_norm": 1.4190020561218262, + "learning_rate": 9.952230850106164e-06, + "loss": 0.3292, + "step": 560 + }, + { + "epoch": 0.04488, + "grad_norm": 1.4401750564575195, + "learning_rate": 9.952057264575723e-06, + "loss": 0.3097, + "step": 561 + }, + { + "epoch": 0.04496, + "grad_norm": 2.137566328048706, + "learning_rate": 9.95188336574519e-06, + "loss": 0.4065, + "step": 562 + }, + { + "epoch": 0.04504, + "grad_norm": 1.5885168313980103, + "learning_rate": 9.951709153625564e-06, + "loss": 0.2592, + "step": 563 + }, + { + "epoch": 0.04512, + "grad_norm": 1.431435227394104, + "learning_rate": 9.951534628227868e-06, + "loss": 0.4273, + "step": 564 + }, + { + "epoch": 0.0452, + "grad_norm": 1.6501264572143555, + "learning_rate": 9.951359789563145e-06, + "loss": 0.3777, + "step": 565 + }, + { + "epoch": 0.04528, + "grad_norm": 2.1749095916748047, + "learning_rate": 9.951184637642456e-06, + "loss": 0.461, + "step": 566 + }, + { + "epoch": 0.04536, + "grad_norm": 1.7694364786148071, + "learning_rate": 9.95100917247688e-06, + "loss": 0.3626, + "step": 567 + }, + { + "epoch": 0.04544, + "grad_norm": 1.836769938468933, + "learning_rate": 9.950833394077522e-06, + "loss": 0.3411, + "step": 568 + }, + { + "epoch": 0.04552, + "grad_norm": 1.6859595775604248, + "learning_rate": 9.9506573024555e-06, + "loss": 0.3591, + "step": 569 + }, + { + "epoch": 0.0456, + "grad_norm": 2.330047130584717, + "learning_rate": 9.950480897621955e-06, + "loss": 0.6001, + "step": 570 + }, + { + "epoch": 0.04568, + "grad_norm": 1.5222753286361694, + "learning_rate": 9.950304179588047e-06, + "loss": 0.3455, + "step": 571 + }, + { + "epoch": 0.04576, + "grad_norm": 1.7251989841461182, + "learning_rate": 9.950127148364958e-06, + "loss": 0.365, + "step": 572 + }, + { + "epoch": 0.04584, + "grad_norm": 1.6580973863601685, + "learning_rate": 9.949949803963887e-06, + "loss": 0.3802, + "step": 573 + }, + { + "epoch": 0.04592, + "grad_norm": 1.8021025657653809, + "learning_rate": 9.949772146396056e-06, + "loss": 0.4321, + "step": 574 + }, + { + "epoch": 0.046, + "grad_norm": 1.5739644765853882, + "learning_rate": 9.949594175672703e-06, + "loss": 0.3246, + "step": 575 + }, + { + "epoch": 0.04608, + "grad_norm": 2.065218925476074, + "learning_rate": 9.949415891805087e-06, + "loss": 0.4126, + "step": 576 + }, + { + "epoch": 0.04616, + "grad_norm": 1.7216342687606812, + "learning_rate": 9.949237294804489e-06, + "loss": 0.4876, + "step": 577 + }, + { + "epoch": 0.04624, + "grad_norm": 1.3691469430923462, + "learning_rate": 9.949058384682206e-06, + "loss": 0.2763, + "step": 578 + }, + { + "epoch": 0.04632, + "grad_norm": 1.4548450708389282, + "learning_rate": 9.94887916144956e-06, + "loss": 0.3798, + "step": 579 + }, + { + "epoch": 0.0464, + "grad_norm": 1.7984998226165771, + "learning_rate": 9.948699625117888e-06, + "loss": 0.4941, + "step": 580 + }, + { + "epoch": 0.04648, + "grad_norm": 1.7592239379882812, + "learning_rate": 9.948519775698551e-06, + "loss": 0.3491, + "step": 581 + }, + { + "epoch": 0.04656, + "grad_norm": 1.7368760108947754, + "learning_rate": 9.948339613202923e-06, + "loss": 0.401, + "step": 582 + }, + { + "epoch": 0.04664, + "grad_norm": 1.7027363777160645, + "learning_rate": 9.948159137642407e-06, + "loss": 0.38, + "step": 583 + }, + { + "epoch": 0.04672, + "grad_norm": 2.045701503753662, + "learning_rate": 9.947978349028418e-06, + "loss": 0.4475, + "step": 584 + }, + { + "epoch": 0.0468, + "grad_norm": 1.5346065759658813, + "learning_rate": 9.947797247372394e-06, + "loss": 0.3696, + "step": 585 + }, + { + "epoch": 0.04688, + "grad_norm": 1.5219029188156128, + "learning_rate": 9.947615832685795e-06, + "loss": 0.2784, + "step": 586 + }, + { + "epoch": 0.04696, + "grad_norm": 1.4893606901168823, + "learning_rate": 9.947434104980097e-06, + "loss": 0.33, + "step": 587 + }, + { + "epoch": 0.04704, + "grad_norm": 1.7347660064697266, + "learning_rate": 9.947252064266796e-06, + "loss": 0.3985, + "step": 588 + }, + { + "epoch": 0.04712, + "grad_norm": 1.6877951622009277, + "learning_rate": 9.947069710557412e-06, + "loss": 0.3609, + "step": 589 + }, + { + "epoch": 0.0472, + "grad_norm": 1.5663824081420898, + "learning_rate": 9.946887043863478e-06, + "loss": 0.3961, + "step": 590 + }, + { + "epoch": 0.04728, + "grad_norm": 1.6489958763122559, + "learning_rate": 9.946704064196554e-06, + "loss": 0.4324, + "step": 591 + }, + { + "epoch": 0.04736, + "grad_norm": 1.8338820934295654, + "learning_rate": 9.946520771568217e-06, + "loss": 0.3133, + "step": 592 + }, + { + "epoch": 0.04744, + "grad_norm": 1.592786192893982, + "learning_rate": 9.946337165990061e-06, + "loss": 0.3564, + "step": 593 + }, + { + "epoch": 0.04752, + "grad_norm": 1.8404992818832397, + "learning_rate": 9.946153247473702e-06, + "loss": 0.3641, + "step": 594 + }, + { + "epoch": 0.0476, + "grad_norm": 1.990356206893921, + "learning_rate": 9.945969016030779e-06, + "loss": 0.4124, + "step": 595 + }, + { + "epoch": 0.04768, + "grad_norm": 1.7861665487289429, + "learning_rate": 9.945784471672943e-06, + "loss": 0.3725, + "step": 596 + }, + { + "epoch": 0.04776, + "grad_norm": 2.0100088119506836, + "learning_rate": 9.945599614411875e-06, + "loss": 0.4765, + "step": 597 + }, + { + "epoch": 0.04784, + "grad_norm": 1.882002830505371, + "learning_rate": 9.945414444259267e-06, + "loss": 0.3974, + "step": 598 + }, + { + "epoch": 0.04792, + "grad_norm": 1.8743993043899536, + "learning_rate": 9.945228961226832e-06, + "loss": 0.3773, + "step": 599 + }, + { + "epoch": 0.048, + "grad_norm": 1.5349911451339722, + "learning_rate": 9.94504316532631e-06, + "loss": 0.2909, + "step": 600 + }, + { + "epoch": 0.04808, + "grad_norm": 1.8333827257156372, + "learning_rate": 9.944857056569452e-06, + "loss": 0.3577, + "step": 601 + }, + { + "epoch": 0.04816, + "grad_norm": 1.9618743658065796, + "learning_rate": 9.944670634968033e-06, + "loss": 0.4357, + "step": 602 + }, + { + "epoch": 0.04824, + "grad_norm": 1.9149047136306763, + "learning_rate": 9.944483900533848e-06, + "loss": 0.4081, + "step": 603 + }, + { + "epoch": 0.04832, + "grad_norm": 2.0370891094207764, + "learning_rate": 9.944296853278712e-06, + "loss": 0.3549, + "step": 604 + }, + { + "epoch": 0.0484, + "grad_norm": 2.002598524093628, + "learning_rate": 9.944109493214458e-06, + "loss": 0.3807, + "step": 605 + }, + { + "epoch": 0.04848, + "grad_norm": 1.196305274963379, + "learning_rate": 9.943921820352938e-06, + "loss": 0.2734, + "step": 606 + }, + { + "epoch": 0.04856, + "grad_norm": 1.8980247974395752, + "learning_rate": 9.943733834706027e-06, + "loss": 0.45, + "step": 607 + }, + { + "epoch": 0.04864, + "grad_norm": 1.8210073709487915, + "learning_rate": 9.94354553628562e-06, + "loss": 0.4916, + "step": 608 + }, + { + "epoch": 0.04872, + "grad_norm": 1.4517135620117188, + "learning_rate": 9.943356925103625e-06, + "loss": 0.3107, + "step": 609 + }, + { + "epoch": 0.0488, + "grad_norm": 1.8347890377044678, + "learning_rate": 9.94316800117198e-06, + "loss": 0.4305, + "step": 610 + }, + { + "epoch": 0.04888, + "grad_norm": 1.664490818977356, + "learning_rate": 9.942978764502635e-06, + "loss": 0.3694, + "step": 611 + }, + { + "epoch": 0.04896, + "grad_norm": 1.5147761106491089, + "learning_rate": 9.942789215107562e-06, + "loss": 0.3826, + "step": 612 + }, + { + "epoch": 0.04904, + "grad_norm": 1.5997848510742188, + "learning_rate": 9.942599352998756e-06, + "loss": 0.3296, + "step": 613 + }, + { + "epoch": 0.04912, + "grad_norm": 1.7910867929458618, + "learning_rate": 9.942409178188225e-06, + "loss": 0.354, + "step": 614 + }, + { + "epoch": 0.0492, + "grad_norm": 1.7603079080581665, + "learning_rate": 9.942218690688004e-06, + "loss": 0.486, + "step": 615 + }, + { + "epoch": 0.04928, + "grad_norm": 1.4481785297393799, + "learning_rate": 9.942027890510142e-06, + "loss": 0.4026, + "step": 616 + }, + { + "epoch": 0.04936, + "grad_norm": 1.7464784383773804, + "learning_rate": 9.94183677766671e-06, + "loss": 0.3778, + "step": 617 + }, + { + "epoch": 0.04944, + "grad_norm": 1.5305639505386353, + "learning_rate": 9.941645352169804e-06, + "loss": 0.3281, + "step": 618 + }, + { + "epoch": 0.04952, + "grad_norm": 1.8555560111999512, + "learning_rate": 9.941453614031528e-06, + "loss": 0.3575, + "step": 619 + }, + { + "epoch": 0.0496, + "grad_norm": 1.9013909101486206, + "learning_rate": 9.941261563264019e-06, + "loss": 0.3648, + "step": 620 + }, + { + "epoch": 0.04968, + "grad_norm": 1.7493908405303955, + "learning_rate": 9.941069199879424e-06, + "loss": 0.3552, + "step": 621 + }, + { + "epoch": 0.04976, + "grad_norm": 1.6803377866744995, + "learning_rate": 9.940876523889911e-06, + "loss": 0.4728, + "step": 622 + }, + { + "epoch": 0.04984, + "grad_norm": 1.9179245233535767, + "learning_rate": 9.940683535307675e-06, + "loss": 0.357, + "step": 623 + }, + { + "epoch": 0.04992, + "grad_norm": 2.1097536087036133, + "learning_rate": 9.940490234144923e-06, + "loss": 0.4825, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 1.8075186014175415, + "learning_rate": 9.940296620413884e-06, + "loss": 0.4851, + "step": 625 + }, + { + "epoch": 0.05008, + "grad_norm": 1.5507980585098267, + "learning_rate": 9.940102694126809e-06, + "loss": 0.3254, + "step": 626 + }, + { + "epoch": 0.05016, + "grad_norm": 2.0220351219177246, + "learning_rate": 9.939908455295965e-06, + "loss": 0.405, + "step": 627 + }, + { + "epoch": 0.05024, + "grad_norm": 1.5802489519119263, + "learning_rate": 9.939713903933644e-06, + "loss": 0.3536, + "step": 628 + }, + { + "epoch": 0.05032, + "grad_norm": 1.7642475366592407, + "learning_rate": 9.93951904005215e-06, + "loss": 0.3986, + "step": 629 + }, + { + "epoch": 0.0504, + "grad_norm": 1.5196278095245361, + "learning_rate": 9.939323863663817e-06, + "loss": 0.3779, + "step": 630 + }, + { + "epoch": 0.05048, + "grad_norm": 1.995604395866394, + "learning_rate": 9.939128374780987e-06, + "loss": 0.4138, + "step": 631 + }, + { + "epoch": 0.05056, + "grad_norm": 1.300185203552246, + "learning_rate": 9.938932573416033e-06, + "loss": 0.3161, + "step": 632 + }, + { + "epoch": 0.05064, + "grad_norm": 1.9059289693832397, + "learning_rate": 9.938736459581341e-06, + "loss": 0.3545, + "step": 633 + }, + { + "epoch": 0.05072, + "grad_norm": 2.062704563140869, + "learning_rate": 9.938540033289317e-06, + "loss": 0.3853, + "step": 634 + }, + { + "epoch": 0.0508, + "grad_norm": 1.7212554216384888, + "learning_rate": 9.93834329455239e-06, + "loss": 0.3571, + "step": 635 + }, + { + "epoch": 0.05088, + "grad_norm": 1.2724170684814453, + "learning_rate": 9.938146243383006e-06, + "loss": 0.3076, + "step": 636 + }, + { + "epoch": 0.05096, + "grad_norm": 1.4473823308944702, + "learning_rate": 9.937948879793634e-06, + "loss": 0.328, + "step": 637 + }, + { + "epoch": 0.05104, + "grad_norm": 1.304731011390686, + "learning_rate": 9.937751203796758e-06, + "loss": 0.3388, + "step": 638 + }, + { + "epoch": 0.05112, + "grad_norm": 1.8237764835357666, + "learning_rate": 9.937553215404884e-06, + "loss": 0.3978, + "step": 639 + }, + { + "epoch": 0.0512, + "grad_norm": 1.4053940773010254, + "learning_rate": 9.937354914630542e-06, + "loss": 0.2694, + "step": 640 + }, + { + "epoch": 0.05128, + "grad_norm": 1.79597806930542, + "learning_rate": 9.937156301486273e-06, + "loss": 0.4567, + "step": 641 + }, + { + "epoch": 0.05136, + "grad_norm": 1.2079006433486938, + "learning_rate": 9.936957375984644e-06, + "loss": 0.2878, + "step": 642 + }, + { + "epoch": 0.05144, + "grad_norm": 1.547292947769165, + "learning_rate": 9.93675813813824e-06, + "loss": 0.3161, + "step": 643 + }, + { + "epoch": 0.05152, + "grad_norm": 1.8033218383789062, + "learning_rate": 9.936558587959671e-06, + "loss": 0.3995, + "step": 644 + }, + { + "epoch": 0.0516, + "grad_norm": 1.5613384246826172, + "learning_rate": 9.936358725461555e-06, + "loss": 0.2709, + "step": 645 + }, + { + "epoch": 0.05168, + "grad_norm": 1.7098636627197266, + "learning_rate": 9.936158550656539e-06, + "loss": 0.3536, + "step": 646 + }, + { + "epoch": 0.05176, + "grad_norm": 1.7437068223953247, + "learning_rate": 9.93595806355729e-06, + "loss": 0.3261, + "step": 647 + }, + { + "epoch": 0.05184, + "grad_norm": 1.75038480758667, + "learning_rate": 9.93575726417649e-06, + "loss": 0.3398, + "step": 648 + }, + { + "epoch": 0.05192, + "grad_norm": 1.4043598175048828, + "learning_rate": 9.93555615252684e-06, + "loss": 0.3377, + "step": 649 + }, + { + "epoch": 0.052, + "grad_norm": 1.877252221107483, + "learning_rate": 9.935354728621069e-06, + "loss": 0.3839, + "step": 650 + }, + { + "epoch": 0.05208, + "grad_norm": 1.5518877506256104, + "learning_rate": 9.935152992471918e-06, + "loss": 0.4235, + "step": 651 + }, + { + "epoch": 0.05216, + "grad_norm": 1.384873390197754, + "learning_rate": 9.93495094409215e-06, + "loss": 0.3017, + "step": 652 + }, + { + "epoch": 0.05224, + "grad_norm": 2.235605001449585, + "learning_rate": 9.93474858349455e-06, + "loss": 0.3975, + "step": 653 + }, + { + "epoch": 0.05232, + "grad_norm": 2.175873279571533, + "learning_rate": 9.934545910691914e-06, + "loss": 0.4333, + "step": 654 + }, + { + "epoch": 0.0524, + "grad_norm": 2.141580581665039, + "learning_rate": 9.934342925697074e-06, + "loss": 0.4373, + "step": 655 + }, + { + "epoch": 0.05248, + "grad_norm": 1.8927725553512573, + "learning_rate": 9.934139628522865e-06, + "loss": 0.3006, + "step": 656 + }, + { + "epoch": 0.05256, + "grad_norm": 1.7776702642440796, + "learning_rate": 9.933936019182154e-06, + "loss": 0.4212, + "step": 657 + }, + { + "epoch": 0.05264, + "grad_norm": 1.5397083759307861, + "learning_rate": 9.933732097687817e-06, + "loss": 0.3998, + "step": 658 + }, + { + "epoch": 0.05272, + "grad_norm": 1.476570963859558, + "learning_rate": 9.93352786405276e-06, + "loss": 0.2903, + "step": 659 + }, + { + "epoch": 0.0528, + "grad_norm": 2.0092062950134277, + "learning_rate": 9.933323318289902e-06, + "loss": 0.3786, + "step": 660 + }, + { + "epoch": 0.05288, + "grad_norm": 1.8279545307159424, + "learning_rate": 9.933118460412186e-06, + "loss": 0.3953, + "step": 661 + }, + { + "epoch": 0.05296, + "grad_norm": 1.9125584363937378, + "learning_rate": 9.93291329043257e-06, + "loss": 0.3686, + "step": 662 + }, + { + "epoch": 0.05304, + "grad_norm": 1.951283574104309, + "learning_rate": 9.932707808364035e-06, + "loss": 0.4019, + "step": 663 + }, + { + "epoch": 0.05312, + "grad_norm": 1.3012398481369019, + "learning_rate": 9.932502014219583e-06, + "loss": 0.2866, + "step": 664 + }, + { + "epoch": 0.0532, + "grad_norm": 1.913560390472412, + "learning_rate": 9.932295908012233e-06, + "loss": 0.3671, + "step": 665 + }, + { + "epoch": 0.05328, + "grad_norm": 1.9930202960968018, + "learning_rate": 9.932089489755024e-06, + "loss": 0.4171, + "step": 666 + }, + { + "epoch": 0.05336, + "grad_norm": 1.7586143016815186, + "learning_rate": 9.931882759461016e-06, + "loss": 0.4016, + "step": 667 + }, + { + "epoch": 0.05344, + "grad_norm": 1.564494013786316, + "learning_rate": 9.931675717143288e-06, + "loss": 0.3041, + "step": 668 + }, + { + "epoch": 0.05352, + "grad_norm": 1.6268788576126099, + "learning_rate": 9.931468362814937e-06, + "loss": 0.3104, + "step": 669 + }, + { + "epoch": 0.0536, + "grad_norm": 1.5686761140823364, + "learning_rate": 9.931260696489085e-06, + "loss": 0.2989, + "step": 670 + }, + { + "epoch": 0.05368, + "grad_norm": 1.2307571172714233, + "learning_rate": 9.931052718178869e-06, + "loss": 0.2887, + "step": 671 + }, + { + "epoch": 0.05376, + "grad_norm": 1.586047887802124, + "learning_rate": 9.930844427897447e-06, + "loss": 0.3362, + "step": 672 + }, + { + "epoch": 0.05384, + "grad_norm": 1.9724785089492798, + "learning_rate": 9.930635825657996e-06, + "loss": 0.3575, + "step": 673 + }, + { + "epoch": 0.05392, + "grad_norm": 1.7712501287460327, + "learning_rate": 9.930426911473715e-06, + "loss": 0.3853, + "step": 674 + }, + { + "epoch": 0.054, + "grad_norm": 1.4114841222763062, + "learning_rate": 9.93021768535782e-06, + "loss": 0.3587, + "step": 675 + }, + { + "epoch": 0.05408, + "grad_norm": 2.0432698726654053, + "learning_rate": 9.93000814732355e-06, + "loss": 0.4619, + "step": 676 + }, + { + "epoch": 0.05416, + "grad_norm": 1.5945602655410767, + "learning_rate": 9.92979829738416e-06, + "loss": 0.2833, + "step": 677 + }, + { + "epoch": 0.05424, + "grad_norm": 2.2913811206817627, + "learning_rate": 9.929588135552925e-06, + "loss": 0.4524, + "step": 678 + }, + { + "epoch": 0.05432, + "grad_norm": 1.2987146377563477, + "learning_rate": 9.929377661843143e-06, + "loss": 0.2484, + "step": 679 + }, + { + "epoch": 0.0544, + "grad_norm": 1.3887112140655518, + "learning_rate": 9.929166876268132e-06, + "loss": 0.3099, + "step": 680 + }, + { + "epoch": 0.05448, + "grad_norm": 1.4816182851791382, + "learning_rate": 9.928955778841224e-06, + "loss": 0.3065, + "step": 681 + }, + { + "epoch": 0.05456, + "grad_norm": 1.4863247871398926, + "learning_rate": 9.928744369575778e-06, + "loss": 0.3439, + "step": 682 + }, + { + "epoch": 0.05464, + "grad_norm": 2.332185745239258, + "learning_rate": 9.928532648485168e-06, + "loss": 0.5691, + "step": 683 + }, + { + "epoch": 0.05472, + "grad_norm": 1.8246127367019653, + "learning_rate": 9.928320615582784e-06, + "loss": 0.3214, + "step": 684 + }, + { + "epoch": 0.0548, + "grad_norm": 1.643385648727417, + "learning_rate": 9.928108270882049e-06, + "loss": 0.2813, + "step": 685 + }, + { + "epoch": 0.05488, + "grad_norm": 1.6711945533752441, + "learning_rate": 9.927895614396392e-06, + "loss": 0.3232, + "step": 686 + }, + { + "epoch": 0.05496, + "grad_norm": 1.6612136363983154, + "learning_rate": 9.927682646139269e-06, + "loss": 0.3504, + "step": 687 + }, + { + "epoch": 0.05504, + "grad_norm": 1.4387818574905396, + "learning_rate": 9.927469366124152e-06, + "loss": 0.3185, + "step": 688 + }, + { + "epoch": 0.05512, + "grad_norm": 1.5262702703475952, + "learning_rate": 9.927255774364535e-06, + "loss": 0.3075, + "step": 689 + }, + { + "epoch": 0.0552, + "grad_norm": 1.6111522912979126, + "learning_rate": 9.927041870873932e-06, + "loss": 0.3691, + "step": 690 + }, + { + "epoch": 0.05528, + "grad_norm": 1.7556387186050415, + "learning_rate": 9.926827655665878e-06, + "loss": 0.4307, + "step": 691 + }, + { + "epoch": 0.05536, + "grad_norm": 1.449609637260437, + "learning_rate": 9.926613128753922e-06, + "loss": 0.3468, + "step": 692 + }, + { + "epoch": 0.05544, + "grad_norm": 1.6672128438949585, + "learning_rate": 9.926398290151637e-06, + "loss": 0.3852, + "step": 693 + }, + { + "epoch": 0.05552, + "grad_norm": 1.932892084121704, + "learning_rate": 9.926183139872616e-06, + "loss": 0.5628, + "step": 694 + }, + { + "epoch": 0.0556, + "grad_norm": 1.3282470703125, + "learning_rate": 9.925967677930472e-06, + "loss": 0.3317, + "step": 695 + }, + { + "epoch": 0.05568, + "grad_norm": 1.645140528678894, + "learning_rate": 9.925751904338834e-06, + "loss": 0.3647, + "step": 696 + }, + { + "epoch": 0.05576, + "grad_norm": 1.6291844844818115, + "learning_rate": 9.925535819111356e-06, + "loss": 0.2835, + "step": 697 + }, + { + "epoch": 0.05584, + "grad_norm": 1.8840759992599487, + "learning_rate": 9.925319422261708e-06, + "loss": 0.3661, + "step": 698 + }, + { + "epoch": 0.05592, + "grad_norm": 1.7590316534042358, + "learning_rate": 9.925102713803579e-06, + "loss": 0.3905, + "step": 699 + }, + { + "epoch": 0.056, + "grad_norm": 1.4774218797683716, + "learning_rate": 9.924885693750681e-06, + "loss": 0.491, + "step": 700 + }, + { + "epoch": 0.05608, + "grad_norm": 1.8233764171600342, + "learning_rate": 9.924668362116743e-06, + "loss": 0.4349, + "step": 701 + }, + { + "epoch": 0.05616, + "grad_norm": 2.0210931301116943, + "learning_rate": 9.924450718915517e-06, + "loss": 0.3602, + "step": 702 + }, + { + "epoch": 0.05624, + "grad_norm": 1.9228521585464478, + "learning_rate": 9.92423276416077e-06, + "loss": 0.3999, + "step": 703 + }, + { + "epoch": 0.05632, + "grad_norm": 1.6303220987319946, + "learning_rate": 9.924014497866295e-06, + "loss": 0.3443, + "step": 704 + }, + { + "epoch": 0.0564, + "grad_norm": 1.7699851989746094, + "learning_rate": 9.923795920045896e-06, + "loss": 0.3392, + "step": 705 + }, + { + "epoch": 0.05648, + "grad_norm": 1.4135288000106812, + "learning_rate": 9.923577030713406e-06, + "loss": 0.2955, + "step": 706 + }, + { + "epoch": 0.05656, + "grad_norm": 1.8755179643630981, + "learning_rate": 9.923357829882671e-06, + "loss": 0.4674, + "step": 707 + }, + { + "epoch": 0.05664, + "grad_norm": 1.4635642766952515, + "learning_rate": 9.92313831756756e-06, + "loss": 0.2897, + "step": 708 + }, + { + "epoch": 0.05672, + "grad_norm": 1.7681899070739746, + "learning_rate": 9.922918493781958e-06, + "loss": 0.3768, + "step": 709 + }, + { + "epoch": 0.0568, + "grad_norm": 1.4016932249069214, + "learning_rate": 9.92269835853978e-06, + "loss": 0.2945, + "step": 710 + }, + { + "epoch": 0.05688, + "grad_norm": 2.1971428394317627, + "learning_rate": 9.922477911854945e-06, + "loss": 0.4537, + "step": 711 + }, + { + "epoch": 0.05696, + "grad_norm": 1.4524810314178467, + "learning_rate": 9.922257153741402e-06, + "loss": 0.3269, + "step": 712 + }, + { + "epoch": 0.05704, + "grad_norm": 1.7311736345291138, + "learning_rate": 9.92203608421312e-06, + "loss": 0.415, + "step": 713 + }, + { + "epoch": 0.05712, + "grad_norm": 2.3954567909240723, + "learning_rate": 9.921814703284086e-06, + "loss": 0.4937, + "step": 714 + }, + { + "epoch": 0.0572, + "grad_norm": 1.8570671081542969, + "learning_rate": 9.921593010968302e-06, + "loss": 0.3431, + "step": 715 + }, + { + "epoch": 0.05728, + "grad_norm": 1.7509444952011108, + "learning_rate": 9.921371007279796e-06, + "loss": 0.4205, + "step": 716 + }, + { + "epoch": 0.05736, + "grad_norm": 1.6266485452651978, + "learning_rate": 9.921148692232615e-06, + "loss": 0.3455, + "step": 717 + }, + { + "epoch": 0.05744, + "grad_norm": 1.3018132448196411, + "learning_rate": 9.920926065840821e-06, + "loss": 0.2517, + "step": 718 + }, + { + "epoch": 0.05752, + "grad_norm": 1.8658397197723389, + "learning_rate": 9.9207031281185e-06, + "loss": 0.4211, + "step": 719 + }, + { + "epoch": 0.0576, + "grad_norm": 1.8189882040023804, + "learning_rate": 9.920479879079758e-06, + "loss": 0.3201, + "step": 720 + }, + { + "epoch": 0.05768, + "grad_norm": 1.655189871788025, + "learning_rate": 9.920256318738717e-06, + "loss": 0.3993, + "step": 721 + }, + { + "epoch": 0.05776, + "grad_norm": 1.7855379581451416, + "learning_rate": 9.920032447109522e-06, + "loss": 0.3763, + "step": 722 + }, + { + "epoch": 0.05784, + "grad_norm": 1.5921399593353271, + "learning_rate": 9.919808264206339e-06, + "loss": 0.3259, + "step": 723 + }, + { + "epoch": 0.05792, + "grad_norm": 1.2852582931518555, + "learning_rate": 9.919583770043347e-06, + "loss": 0.2554, + "step": 724 + }, + { + "epoch": 0.058, + "grad_norm": 1.9973994493484497, + "learning_rate": 9.91935896463475e-06, + "loss": 0.4433, + "step": 725 + }, + { + "epoch": 0.05808, + "grad_norm": 1.5259404182434082, + "learning_rate": 9.91913384799477e-06, + "loss": 0.391, + "step": 726 + }, + { + "epoch": 0.05816, + "grad_norm": 1.7888422012329102, + "learning_rate": 9.918908420137654e-06, + "loss": 0.3253, + "step": 727 + }, + { + "epoch": 0.05824, + "grad_norm": 1.7182378768920898, + "learning_rate": 9.91868268107766e-06, + "loss": 0.3694, + "step": 728 + }, + { + "epoch": 0.05832, + "grad_norm": 1.8759191036224365, + "learning_rate": 9.918456630829071e-06, + "loss": 0.3199, + "step": 729 + }, + { + "epoch": 0.0584, + "grad_norm": 1.4170511960983276, + "learning_rate": 9.918230269406188e-06, + "loss": 0.3506, + "step": 730 + }, + { + "epoch": 0.05848, + "grad_norm": 1.6913896799087524, + "learning_rate": 9.918003596823333e-06, + "loss": 0.4956, + "step": 731 + }, + { + "epoch": 0.05856, + "grad_norm": 1.5514379739761353, + "learning_rate": 9.917776613094846e-06, + "loss": 0.3401, + "step": 732 + }, + { + "epoch": 0.05864, + "grad_norm": 1.7475535869598389, + "learning_rate": 9.917549318235086e-06, + "loss": 0.3692, + "step": 733 + }, + { + "epoch": 0.05872, + "grad_norm": 1.3284653425216675, + "learning_rate": 9.917321712258436e-06, + "loss": 0.29, + "step": 734 + }, + { + "epoch": 0.0588, + "grad_norm": 1.3704982995986938, + "learning_rate": 9.917093795179294e-06, + "loss": 0.288, + "step": 735 + }, + { + "epoch": 0.05888, + "grad_norm": 1.5144543647766113, + "learning_rate": 9.916865567012082e-06, + "loss": 0.3907, + "step": 736 + }, + { + "epoch": 0.05896, + "grad_norm": 1.4970391988754272, + "learning_rate": 9.916637027771236e-06, + "loss": 0.3982, + "step": 737 + }, + { + "epoch": 0.05904, + "grad_norm": 1.6594504117965698, + "learning_rate": 9.916408177471216e-06, + "loss": 0.3547, + "step": 738 + }, + { + "epoch": 0.05912, + "grad_norm": 1.6561534404754639, + "learning_rate": 9.916179016126502e-06, + "loss": 0.4931, + "step": 739 + }, + { + "epoch": 0.0592, + "grad_norm": 1.0301389694213867, + "learning_rate": 9.915949543751591e-06, + "loss": 0.1981, + "step": 740 + }, + { + "epoch": 0.05928, + "grad_norm": 1.5365773439407349, + "learning_rate": 9.915719760361e-06, + "loss": 0.3218, + "step": 741 + }, + { + "epoch": 0.05936, + "grad_norm": 1.725466251373291, + "learning_rate": 9.915489665969269e-06, + "loss": 0.3497, + "step": 742 + }, + { + "epoch": 0.05944, + "grad_norm": 1.3347848653793335, + "learning_rate": 9.915259260590954e-06, + "loss": 0.29, + "step": 743 + }, + { + "epoch": 0.05952, + "grad_norm": 2.047433853149414, + "learning_rate": 9.915028544240633e-06, + "loss": 0.3546, + "step": 744 + }, + { + "epoch": 0.0596, + "grad_norm": 1.5531580448150635, + "learning_rate": 9.914797516932899e-06, + "loss": 0.2984, + "step": 745 + }, + { + "epoch": 0.05968, + "grad_norm": 1.964288353919983, + "learning_rate": 9.914566178682373e-06, + "loss": 0.3893, + "step": 746 + }, + { + "epoch": 0.05976, + "grad_norm": 1.6539465188980103, + "learning_rate": 9.91433452950369e-06, + "loss": 0.3372, + "step": 747 + }, + { + "epoch": 0.05984, + "grad_norm": 1.79845130443573, + "learning_rate": 9.914102569411503e-06, + "loss": 0.379, + "step": 748 + }, + { + "epoch": 0.05992, + "grad_norm": 1.5639078617095947, + "learning_rate": 9.91387029842049e-06, + "loss": 0.3265, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 1.675551414489746, + "learning_rate": 9.913637716545344e-06, + "loss": 0.3333, + "step": 750 + }, + { + "epoch": 0.06008, + "grad_norm": 1.6994894742965698, + "learning_rate": 9.913404823800782e-06, + "loss": 0.3565, + "step": 751 + }, + { + "epoch": 0.06016, + "grad_norm": 1.9031753540039062, + "learning_rate": 9.913171620201536e-06, + "loss": 0.4082, + "step": 752 + }, + { + "epoch": 0.06024, + "grad_norm": 1.789406657218933, + "learning_rate": 9.912938105762362e-06, + "loss": 0.3834, + "step": 753 + }, + { + "epoch": 0.06032, + "grad_norm": 1.7641888856887817, + "learning_rate": 9.912704280498032e-06, + "loss": 0.3464, + "step": 754 + }, + { + "epoch": 0.0604, + "grad_norm": 1.8987987041473389, + "learning_rate": 9.91247014442334e-06, + "loss": 0.4163, + "step": 755 + }, + { + "epoch": 0.06048, + "grad_norm": 1.8092420101165771, + "learning_rate": 9.912235697553101e-06, + "loss": 0.433, + "step": 756 + }, + { + "epoch": 0.06056, + "grad_norm": 1.20881187915802, + "learning_rate": 9.912000939902144e-06, + "loss": 0.251, + "step": 757 + }, + { + "epoch": 0.06064, + "grad_norm": 1.4415931701660156, + "learning_rate": 9.911765871485325e-06, + "loss": 0.3483, + "step": 758 + }, + { + "epoch": 0.06072, + "grad_norm": 2.056685209274292, + "learning_rate": 9.911530492317511e-06, + "loss": 0.4327, + "step": 759 + }, + { + "epoch": 0.0608, + "grad_norm": 2.1116669178009033, + "learning_rate": 9.9112948024136e-06, + "loss": 0.549, + "step": 760 + }, + { + "epoch": 0.06088, + "grad_norm": 1.765018343925476, + "learning_rate": 9.911058801788499e-06, + "loss": 0.3743, + "step": 761 + }, + { + "epoch": 0.06096, + "grad_norm": 1.5852882862091064, + "learning_rate": 9.910822490457139e-06, + "loss": 0.356, + "step": 762 + }, + { + "epoch": 0.06104, + "grad_norm": 1.6404635906219482, + "learning_rate": 9.910585868434473e-06, + "loss": 0.3543, + "step": 763 + }, + { + "epoch": 0.06112, + "grad_norm": 1.5035934448242188, + "learning_rate": 9.91034893573547e-06, + "loss": 0.3572, + "step": 764 + }, + { + "epoch": 0.0612, + "grad_norm": 1.9684243202209473, + "learning_rate": 9.910111692375118e-06, + "loss": 0.4283, + "step": 765 + }, + { + "epoch": 0.06128, + "grad_norm": 1.9752951860427856, + "learning_rate": 9.90987413836843e-06, + "loss": 0.4133, + "step": 766 + }, + { + "epoch": 0.06136, + "grad_norm": 1.6825329065322876, + "learning_rate": 9.909636273730434e-06, + "loss": 0.4326, + "step": 767 + }, + { + "epoch": 0.06144, + "grad_norm": 1.7126015424728394, + "learning_rate": 9.909398098476177e-06, + "loss": 0.3714, + "step": 768 + }, + { + "epoch": 0.06152, + "grad_norm": 1.39950692653656, + "learning_rate": 9.90915961262073e-06, + "loss": 0.3208, + "step": 769 + }, + { + "epoch": 0.0616, + "grad_norm": 1.8275245428085327, + "learning_rate": 9.908920816179182e-06, + "loss": 0.3871, + "step": 770 + }, + { + "epoch": 0.06168, + "grad_norm": 1.6578397750854492, + "learning_rate": 9.90868170916664e-06, + "loss": 0.335, + "step": 771 + }, + { + "epoch": 0.06176, + "grad_norm": 1.8626042604446411, + "learning_rate": 9.908442291598227e-06, + "loss": 0.3591, + "step": 772 + }, + { + "epoch": 0.06184, + "grad_norm": 1.6682721376419067, + "learning_rate": 9.908202563489095e-06, + "loss": 0.3483, + "step": 773 + }, + { + "epoch": 0.06192, + "grad_norm": 1.5909526348114014, + "learning_rate": 9.907962524854411e-06, + "loss": 0.4166, + "step": 774 + }, + { + "epoch": 0.062, + "grad_norm": 1.7059555053710938, + "learning_rate": 9.90772217570936e-06, + "loss": 0.3201, + "step": 775 + }, + { + "epoch": 0.06208, + "grad_norm": 1.5793256759643555, + "learning_rate": 9.907481516069149e-06, + "loss": 0.3644, + "step": 776 + }, + { + "epoch": 0.06216, + "grad_norm": 1.5775402784347534, + "learning_rate": 9.907240545949001e-06, + "loss": 0.2876, + "step": 777 + }, + { + "epoch": 0.06224, + "grad_norm": 1.9473414421081543, + "learning_rate": 9.906999265364163e-06, + "loss": 0.367, + "step": 778 + }, + { + "epoch": 0.06232, + "grad_norm": 1.9545674324035645, + "learning_rate": 9.906757674329903e-06, + "loss": 0.4359, + "step": 779 + }, + { + "epoch": 0.0624, + "grad_norm": 1.605450987815857, + "learning_rate": 9.906515772861501e-06, + "loss": 0.3206, + "step": 780 + }, + { + "epoch": 0.06248, + "grad_norm": 1.5518044233322144, + "learning_rate": 9.906273560974264e-06, + "loss": 0.3723, + "step": 781 + }, + { + "epoch": 0.06256, + "grad_norm": 1.6524393558502197, + "learning_rate": 9.906031038683515e-06, + "loss": 0.3181, + "step": 782 + }, + { + "epoch": 0.06264, + "grad_norm": 1.3769612312316895, + "learning_rate": 9.9057882060046e-06, + "loss": 0.3077, + "step": 783 + }, + { + "epoch": 0.06272, + "grad_norm": 1.5857384204864502, + "learning_rate": 9.905545062952876e-06, + "loss": 0.3109, + "step": 784 + }, + { + "epoch": 0.0628, + "grad_norm": 1.549974799156189, + "learning_rate": 9.90530160954373e-06, + "loss": 0.3216, + "step": 785 + }, + { + "epoch": 0.06288, + "grad_norm": 1.836879849433899, + "learning_rate": 9.905057845792568e-06, + "loss": 0.3937, + "step": 786 + }, + { + "epoch": 0.06296, + "grad_norm": 1.6906849145889282, + "learning_rate": 9.904813771714806e-06, + "loss": 0.3045, + "step": 787 + }, + { + "epoch": 0.06304, + "grad_norm": 1.4917078018188477, + "learning_rate": 9.904569387325888e-06, + "loss": 0.2654, + "step": 788 + }, + { + "epoch": 0.06312, + "grad_norm": 1.9071767330169678, + "learning_rate": 9.904324692641279e-06, + "loss": 0.4201, + "step": 789 + }, + { + "epoch": 0.0632, + "grad_norm": 1.6818795204162598, + "learning_rate": 9.904079687676453e-06, + "loss": 0.3577, + "step": 790 + }, + { + "epoch": 0.06328, + "grad_norm": 1.5190314054489136, + "learning_rate": 9.903834372446914e-06, + "loss": 0.3156, + "step": 791 + }, + { + "epoch": 0.06336, + "grad_norm": 1.2045843601226807, + "learning_rate": 9.903588746968185e-06, + "loss": 0.2759, + "step": 792 + }, + { + "epoch": 0.06344, + "grad_norm": 1.6211133003234863, + "learning_rate": 9.903342811255802e-06, + "loss": 0.3412, + "step": 793 + }, + { + "epoch": 0.06352, + "grad_norm": 1.6240700483322144, + "learning_rate": 9.903096565325326e-06, + "loss": 0.2906, + "step": 794 + }, + { + "epoch": 0.0636, + "grad_norm": 1.5824121236801147, + "learning_rate": 9.902850009192338e-06, + "loss": 0.3956, + "step": 795 + }, + { + "epoch": 0.06368, + "grad_norm": 1.8506085872650146, + "learning_rate": 9.902603142872433e-06, + "loss": 0.3616, + "step": 796 + }, + { + "epoch": 0.06376, + "grad_norm": 1.4106500148773193, + "learning_rate": 9.90235596638123e-06, + "loss": 0.3068, + "step": 797 + }, + { + "epoch": 0.06384, + "grad_norm": 1.5966297388076782, + "learning_rate": 9.902108479734372e-06, + "loss": 0.4436, + "step": 798 + }, + { + "epoch": 0.06392, + "grad_norm": 1.908096194267273, + "learning_rate": 9.90186068294751e-06, + "loss": 0.3364, + "step": 799 + }, + { + "epoch": 0.064, + "grad_norm": 1.6867671012878418, + "learning_rate": 9.901612576036326e-06, + "loss": 0.377, + "step": 800 + }, + { + "epoch": 0.06408, + "grad_norm": 1.7174458503723145, + "learning_rate": 9.901364159016514e-06, + "loss": 0.4152, + "step": 801 + }, + { + "epoch": 0.06416, + "grad_norm": 1.4476027488708496, + "learning_rate": 9.901115431903792e-06, + "loss": 0.3267, + "step": 802 + }, + { + "epoch": 0.06424, + "grad_norm": 1.7989741563796997, + "learning_rate": 9.900866394713895e-06, + "loss": 0.3113, + "step": 803 + }, + { + "epoch": 0.06432, + "grad_norm": 2.0357306003570557, + "learning_rate": 9.900617047462581e-06, + "loss": 0.3554, + "step": 804 + }, + { + "epoch": 0.0644, + "grad_norm": 1.6617554426193237, + "learning_rate": 9.900367390165623e-06, + "loss": 0.3521, + "step": 805 + }, + { + "epoch": 0.06448, + "grad_norm": 1.6434162855148315, + "learning_rate": 9.900117422838817e-06, + "loss": 0.3676, + "step": 806 + }, + { + "epoch": 0.06456, + "grad_norm": 1.570143699645996, + "learning_rate": 9.899867145497978e-06, + "loss": 0.3857, + "step": 807 + }, + { + "epoch": 0.06464, + "grad_norm": 1.8630397319793701, + "learning_rate": 9.89961655815894e-06, + "loss": 0.4321, + "step": 808 + }, + { + "epoch": 0.06472, + "grad_norm": 1.3922876119613647, + "learning_rate": 9.899365660837555e-06, + "loss": 0.3751, + "step": 809 + }, + { + "epoch": 0.0648, + "grad_norm": 1.575070858001709, + "learning_rate": 9.899114453549699e-06, + "loss": 0.3384, + "step": 810 + }, + { + "epoch": 0.06488, + "grad_norm": 1.494339108467102, + "learning_rate": 9.898862936311264e-06, + "loss": 0.3632, + "step": 811 + }, + { + "epoch": 0.06496, + "grad_norm": 1.5550836324691772, + "learning_rate": 9.898611109138163e-06, + "loss": 0.3129, + "step": 812 + }, + { + "epoch": 0.06504, + "grad_norm": 1.3468754291534424, + "learning_rate": 9.898358972046327e-06, + "loss": 0.3435, + "step": 813 + }, + { + "epoch": 0.06512, + "grad_norm": 1.2275058031082153, + "learning_rate": 9.89810652505171e-06, + "loss": 0.2591, + "step": 814 + }, + { + "epoch": 0.0652, + "grad_norm": 1.689713954925537, + "learning_rate": 9.897853768170282e-06, + "loss": 0.4462, + "step": 815 + }, + { + "epoch": 0.06528, + "grad_norm": 1.4738130569458008, + "learning_rate": 9.897600701418033e-06, + "loss": 0.2962, + "step": 816 + }, + { + "epoch": 0.06536, + "grad_norm": 1.8294364213943481, + "learning_rate": 9.897347324810977e-06, + "loss": 0.3394, + "step": 817 + }, + { + "epoch": 0.06544, + "grad_norm": 2.3864986896514893, + "learning_rate": 9.89709363836514e-06, + "loss": 0.5214, + "step": 818 + }, + { + "epoch": 0.06552, + "grad_norm": 1.4256991147994995, + "learning_rate": 9.896839642096576e-06, + "loss": 0.3178, + "step": 819 + }, + { + "epoch": 0.0656, + "grad_norm": 1.8996535539627075, + "learning_rate": 9.896585336021353e-06, + "loss": 0.53, + "step": 820 + }, + { + "epoch": 0.06568, + "grad_norm": 1.7665208578109741, + "learning_rate": 9.896330720155558e-06, + "loss": 0.4027, + "step": 821 + }, + { + "epoch": 0.06576, + "grad_norm": 1.2968679666519165, + "learning_rate": 9.896075794515304e-06, + "loss": 0.2658, + "step": 822 + }, + { + "epoch": 0.06584, + "grad_norm": 2.179189682006836, + "learning_rate": 9.895820559116715e-06, + "loss": 0.3901, + "step": 823 + }, + { + "epoch": 0.06592, + "grad_norm": 1.4469057321548462, + "learning_rate": 9.89556501397594e-06, + "loss": 0.3504, + "step": 824 + }, + { + "epoch": 0.066, + "grad_norm": 1.4145433902740479, + "learning_rate": 9.89530915910915e-06, + "loss": 0.3422, + "step": 825 + }, + { + "epoch": 0.06608, + "grad_norm": 1.6971393823623657, + "learning_rate": 9.895052994532527e-06, + "loss": 0.3849, + "step": 826 + }, + { + "epoch": 0.06616, + "grad_norm": 1.416899561882019, + "learning_rate": 9.89479652026228e-06, + "loss": 0.2804, + "step": 827 + }, + { + "epoch": 0.06624, + "grad_norm": 1.758364200592041, + "learning_rate": 9.894539736314636e-06, + "loss": 0.3886, + "step": 828 + }, + { + "epoch": 0.06632, + "grad_norm": 1.8403048515319824, + "learning_rate": 9.894282642705839e-06, + "loss": 0.3514, + "step": 829 + }, + { + "epoch": 0.0664, + "grad_norm": 2.0701754093170166, + "learning_rate": 9.894025239452156e-06, + "loss": 0.3829, + "step": 830 + }, + { + "epoch": 0.06648, + "grad_norm": 1.6465054750442505, + "learning_rate": 9.893767526569873e-06, + "loss": 0.4353, + "step": 831 + }, + { + "epoch": 0.06656, + "grad_norm": 1.3121788501739502, + "learning_rate": 9.893509504075291e-06, + "loss": 0.2722, + "step": 832 + }, + { + "epoch": 0.06664, + "grad_norm": 1.453292727470398, + "learning_rate": 9.893251171984737e-06, + "loss": 0.3116, + "step": 833 + }, + { + "epoch": 0.06672, + "grad_norm": 1.511839747428894, + "learning_rate": 9.892992530314556e-06, + "loss": 0.3135, + "step": 834 + }, + { + "epoch": 0.0668, + "grad_norm": 1.5262436866760254, + "learning_rate": 9.892733579081108e-06, + "loss": 0.321, + "step": 835 + }, + { + "epoch": 0.06688, + "grad_norm": 2.0322422981262207, + "learning_rate": 9.892474318300778e-06, + "loss": 0.5337, + "step": 836 + }, + { + "epoch": 0.06696, + "grad_norm": 1.5687463283538818, + "learning_rate": 9.892214747989967e-06, + "loss": 0.2962, + "step": 837 + }, + { + "epoch": 0.06704, + "grad_norm": 2.1740803718566895, + "learning_rate": 9.8919548681651e-06, + "loss": 0.4359, + "step": 838 + }, + { + "epoch": 0.06712, + "grad_norm": 1.6020910739898682, + "learning_rate": 9.891694678842617e-06, + "loss": 0.3122, + "step": 839 + }, + { + "epoch": 0.0672, + "grad_norm": 1.6646548509597778, + "learning_rate": 9.891434180038979e-06, + "loss": 0.4588, + "step": 840 + }, + { + "epoch": 0.06728, + "grad_norm": 1.4084759950637817, + "learning_rate": 9.891173371770666e-06, + "loss": 0.2731, + "step": 841 + }, + { + "epoch": 0.06736, + "grad_norm": 1.433922290802002, + "learning_rate": 9.890912254054182e-06, + "loss": 0.3111, + "step": 842 + }, + { + "epoch": 0.06744, + "grad_norm": 1.3869508504867554, + "learning_rate": 9.890650826906042e-06, + "loss": 0.3775, + "step": 843 + }, + { + "epoch": 0.06752, + "grad_norm": 1.6502546072006226, + "learning_rate": 9.890389090342789e-06, + "loss": 0.3441, + "step": 844 + }, + { + "epoch": 0.0676, + "grad_norm": 1.827817678451538, + "learning_rate": 9.890127044380983e-06, + "loss": 0.4, + "step": 845 + }, + { + "epoch": 0.06768, + "grad_norm": 1.306138038635254, + "learning_rate": 9.889864689037201e-06, + "loss": 0.3183, + "step": 846 + }, + { + "epoch": 0.06776, + "grad_norm": 2.2315001487731934, + "learning_rate": 9.88960202432804e-06, + "loss": 0.4797, + "step": 847 + }, + { + "epoch": 0.06784, + "grad_norm": 1.777730941772461, + "learning_rate": 9.889339050270122e-06, + "loss": 0.4378, + "step": 848 + }, + { + "epoch": 0.06792, + "grad_norm": 1.909468173980713, + "learning_rate": 9.88907576688008e-06, + "loss": 0.4438, + "step": 849 + }, + { + "epoch": 0.068, + "grad_norm": 1.5327777862548828, + "learning_rate": 9.888812174174574e-06, + "loss": 0.3694, + "step": 850 + }, + { + "epoch": 0.06808, + "grad_norm": 2.512308120727539, + "learning_rate": 9.88854827217028e-06, + "loss": 0.5076, + "step": 851 + }, + { + "epoch": 0.06816, + "grad_norm": 1.4551916122436523, + "learning_rate": 9.888284060883892e-06, + "loss": 0.3042, + "step": 852 + }, + { + "epoch": 0.06824, + "grad_norm": 1.8381390571594238, + "learning_rate": 9.88801954033213e-06, + "loss": 0.4247, + "step": 853 + }, + { + "epoch": 0.06832, + "grad_norm": 1.5407111644744873, + "learning_rate": 9.887754710531727e-06, + "loss": 0.3213, + "step": 854 + }, + { + "epoch": 0.0684, + "grad_norm": 1.478391408920288, + "learning_rate": 9.887489571499438e-06, + "loss": 0.3527, + "step": 855 + }, + { + "epoch": 0.06848, + "grad_norm": 1.9624853134155273, + "learning_rate": 9.887224123252037e-06, + "loss": 0.4039, + "step": 856 + }, + { + "epoch": 0.06856, + "grad_norm": 1.6501410007476807, + "learning_rate": 9.886958365806317e-06, + "loss": 0.4169, + "step": 857 + }, + { + "epoch": 0.06864, + "grad_norm": 1.4560359716415405, + "learning_rate": 9.886692299179094e-06, + "loss": 0.2993, + "step": 858 + }, + { + "epoch": 0.06872, + "grad_norm": 1.681654930114746, + "learning_rate": 9.8864259233872e-06, + "loss": 0.405, + "step": 859 + }, + { + "epoch": 0.0688, + "grad_norm": 1.7231281995773315, + "learning_rate": 9.886159238447488e-06, + "loss": 0.3455, + "step": 860 + }, + { + "epoch": 0.06888, + "grad_norm": 1.5479826927185059, + "learning_rate": 9.885892244376831e-06, + "loss": 0.3774, + "step": 861 + }, + { + "epoch": 0.06896, + "grad_norm": 1.5777459144592285, + "learning_rate": 9.88562494119212e-06, + "loss": 0.3301, + "step": 862 + }, + { + "epoch": 0.06904, + "grad_norm": 2.2783584594726562, + "learning_rate": 9.885357328910265e-06, + "loss": 0.4368, + "step": 863 + }, + { + "epoch": 0.06912, + "grad_norm": 1.8769716024398804, + "learning_rate": 9.885089407548198e-06, + "loss": 0.3725, + "step": 864 + }, + { + "epoch": 0.0692, + "grad_norm": 1.3858102560043335, + "learning_rate": 9.884821177122871e-06, + "loss": 0.2691, + "step": 865 + }, + { + "epoch": 0.06928, + "grad_norm": 1.477070689201355, + "learning_rate": 9.884552637651252e-06, + "loss": 0.2952, + "step": 866 + }, + { + "epoch": 0.06936, + "grad_norm": 1.43630051612854, + "learning_rate": 9.884283789150332e-06, + "loss": 0.3235, + "step": 867 + }, + { + "epoch": 0.06944, + "grad_norm": 1.65440833568573, + "learning_rate": 9.88401463163712e-06, + "loss": 0.3179, + "step": 868 + }, + { + "epoch": 0.06952, + "grad_norm": 1.6520179510116577, + "learning_rate": 9.883745165128642e-06, + "loss": 0.3051, + "step": 869 + }, + { + "epoch": 0.0696, + "grad_norm": 1.9777926206588745, + "learning_rate": 9.883475389641952e-06, + "loss": 0.4753, + "step": 870 + }, + { + "epoch": 0.06968, + "grad_norm": 1.5092403888702393, + "learning_rate": 9.883205305194112e-06, + "loss": 0.3216, + "step": 871 + }, + { + "epoch": 0.06976, + "grad_norm": 1.4967854022979736, + "learning_rate": 9.88293491180221e-06, + "loss": 0.3259, + "step": 872 + }, + { + "epoch": 0.06984, + "grad_norm": 1.7150827646255493, + "learning_rate": 9.882664209483356e-06, + "loss": 0.4019, + "step": 873 + }, + { + "epoch": 0.06992, + "grad_norm": 1.9008959531784058, + "learning_rate": 9.882393198254676e-06, + "loss": 0.5442, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 1.919569492340088, + "learning_rate": 9.882121878133314e-06, + "loss": 0.501, + "step": 875 + }, + { + "epoch": 0.07008, + "grad_norm": 1.6029773950576782, + "learning_rate": 9.881850249136438e-06, + "loss": 0.3842, + "step": 876 + }, + { + "epoch": 0.07016, + "grad_norm": 1.4710146188735962, + "learning_rate": 9.881578311281229e-06, + "loss": 0.3785, + "step": 877 + }, + { + "epoch": 0.07024, + "grad_norm": 1.135948896408081, + "learning_rate": 9.881306064584895e-06, + "loss": 0.201, + "step": 878 + }, + { + "epoch": 0.07032, + "grad_norm": 1.733964443206787, + "learning_rate": 9.88103350906466e-06, + "loss": 0.327, + "step": 879 + }, + { + "epoch": 0.0704, + "grad_norm": 1.3882209062576294, + "learning_rate": 9.880760644737765e-06, + "loss": 0.3169, + "step": 880 + }, + { + "epoch": 0.07048, + "grad_norm": 1.7301795482635498, + "learning_rate": 9.880487471621476e-06, + "loss": 0.3786, + "step": 881 + }, + { + "epoch": 0.07056, + "grad_norm": 1.1793718338012695, + "learning_rate": 9.880213989733077e-06, + "loss": 0.2577, + "step": 882 + }, + { + "epoch": 0.07064, + "grad_norm": 1.4494835138320923, + "learning_rate": 9.879940199089864e-06, + "loss": 0.2908, + "step": 883 + }, + { + "epoch": 0.07072, + "grad_norm": 1.7559984922409058, + "learning_rate": 9.879666099709166e-06, + "loss": 0.3739, + "step": 884 + }, + { + "epoch": 0.0708, + "grad_norm": 1.729697823524475, + "learning_rate": 9.87939169160832e-06, + "loss": 0.4134, + "step": 885 + }, + { + "epoch": 0.07088, + "grad_norm": 1.867188572883606, + "learning_rate": 9.879116974804688e-06, + "loss": 0.4629, + "step": 886 + }, + { + "epoch": 0.07096, + "grad_norm": 1.5233434438705444, + "learning_rate": 9.878841949315652e-06, + "loss": 0.3819, + "step": 887 + }, + { + "epoch": 0.07104, + "grad_norm": 1.6903587579727173, + "learning_rate": 9.878566615158609e-06, + "loss": 0.3121, + "step": 888 + }, + { + "epoch": 0.07112, + "grad_norm": 1.3567419052124023, + "learning_rate": 9.878290972350981e-06, + "loss": 0.2761, + "step": 889 + }, + { + "epoch": 0.0712, + "grad_norm": 1.6625133752822876, + "learning_rate": 9.878015020910205e-06, + "loss": 0.3199, + "step": 890 + }, + { + "epoch": 0.07128, + "grad_norm": 1.4875622987747192, + "learning_rate": 9.877738760853741e-06, + "loss": 0.3499, + "step": 891 + }, + { + "epoch": 0.07136, + "grad_norm": 1.3312549591064453, + "learning_rate": 9.877462192199068e-06, + "loss": 0.2261, + "step": 892 + }, + { + "epoch": 0.07144, + "grad_norm": 1.9580225944519043, + "learning_rate": 9.87718531496368e-06, + "loss": 0.3892, + "step": 893 + }, + { + "epoch": 0.07152, + "grad_norm": 1.421797275543213, + "learning_rate": 9.876908129165096e-06, + "loss": 0.3268, + "step": 894 + }, + { + "epoch": 0.0716, + "grad_norm": 2.2238597869873047, + "learning_rate": 9.876630634820853e-06, + "loss": 0.5997, + "step": 895 + }, + { + "epoch": 0.07168, + "grad_norm": 1.9498764276504517, + "learning_rate": 9.87635283194851e-06, + "loss": 0.3721, + "step": 896 + }, + { + "epoch": 0.07176, + "grad_norm": 1.8909319639205933, + "learning_rate": 9.876074720565637e-06, + "loss": 0.4237, + "step": 897 + }, + { + "epoch": 0.07184, + "grad_norm": 1.432112455368042, + "learning_rate": 9.875796300689832e-06, + "loss": 0.2579, + "step": 898 + }, + { + "epoch": 0.07192, + "grad_norm": 1.9798771142959595, + "learning_rate": 9.875517572338711e-06, + "loss": 0.3328, + "step": 899 + }, + { + "epoch": 0.072, + "grad_norm": 1.692348837852478, + "learning_rate": 9.875238535529905e-06, + "loss": 0.3862, + "step": 900 + }, + { + "epoch": 0.07208, + "grad_norm": 1.7320940494537354, + "learning_rate": 9.87495919028107e-06, + "loss": 0.4359, + "step": 901 + }, + { + "epoch": 0.07216, + "grad_norm": 1.4372979402542114, + "learning_rate": 9.87467953660988e-06, + "loss": 0.3045, + "step": 902 + }, + { + "epoch": 0.07224, + "grad_norm": 1.5734885931015015, + "learning_rate": 9.874399574534024e-06, + "loss": 0.3442, + "step": 903 + }, + { + "epoch": 0.07232, + "grad_norm": 1.7863317728042603, + "learning_rate": 9.874119304071217e-06, + "loss": 0.3815, + "step": 904 + }, + { + "epoch": 0.0724, + "grad_norm": 1.580787181854248, + "learning_rate": 9.873838725239192e-06, + "loss": 0.3385, + "step": 905 + }, + { + "epoch": 0.07248, + "grad_norm": 1.7870442867279053, + "learning_rate": 9.873557838055698e-06, + "loss": 0.3777, + "step": 906 + }, + { + "epoch": 0.07256, + "grad_norm": 1.8872705698013306, + "learning_rate": 9.873276642538508e-06, + "loss": 0.3434, + "step": 907 + }, + { + "epoch": 0.07264, + "grad_norm": 1.5185067653656006, + "learning_rate": 9.872995138705407e-06, + "loss": 0.3182, + "step": 908 + }, + { + "epoch": 0.07272, + "grad_norm": 1.175097942352295, + "learning_rate": 9.872713326574212e-06, + "loss": 0.2708, + "step": 909 + }, + { + "epoch": 0.0728, + "grad_norm": 1.4773153066635132, + "learning_rate": 9.87243120616275e-06, + "loss": 0.3242, + "step": 910 + }, + { + "epoch": 0.07288, + "grad_norm": 1.9224504232406616, + "learning_rate": 9.872148777488865e-06, + "loss": 0.5271, + "step": 911 + }, + { + "epoch": 0.07296, + "grad_norm": 1.5207834243774414, + "learning_rate": 9.871866040570432e-06, + "loss": 0.3758, + "step": 912 + }, + { + "epoch": 0.07304, + "grad_norm": 1.730446696281433, + "learning_rate": 9.871582995425335e-06, + "loss": 0.4164, + "step": 913 + }, + { + "epoch": 0.07312, + "grad_norm": 1.8168655633926392, + "learning_rate": 9.871299642071483e-06, + "loss": 0.3849, + "step": 914 + }, + { + "epoch": 0.0732, + "grad_norm": 2.1133110523223877, + "learning_rate": 9.871015980526802e-06, + "loss": 0.4274, + "step": 915 + }, + { + "epoch": 0.07328, + "grad_norm": 1.3025873899459839, + "learning_rate": 9.870732010809236e-06, + "loss": 0.2901, + "step": 916 + }, + { + "epoch": 0.07336, + "grad_norm": 1.259355068206787, + "learning_rate": 9.870447732936755e-06, + "loss": 0.3114, + "step": 917 + }, + { + "epoch": 0.07344, + "grad_norm": 1.4465082883834839, + "learning_rate": 9.870163146927343e-06, + "loss": 0.3957, + "step": 918 + }, + { + "epoch": 0.07352, + "grad_norm": 1.7125508785247803, + "learning_rate": 9.869878252799004e-06, + "loss": 0.3449, + "step": 919 + }, + { + "epoch": 0.0736, + "grad_norm": 2.206003189086914, + "learning_rate": 9.869593050569761e-06, + "loss": 0.4511, + "step": 920 + }, + { + "epoch": 0.07368, + "grad_norm": 1.9376089572906494, + "learning_rate": 9.869307540257663e-06, + "loss": 0.3655, + "step": 921 + }, + { + "epoch": 0.07376, + "grad_norm": 1.8399741649627686, + "learning_rate": 9.869021721880765e-06, + "loss": 0.4562, + "step": 922 + }, + { + "epoch": 0.07384, + "grad_norm": 1.5403954982757568, + "learning_rate": 9.868735595457157e-06, + "loss": 0.4594, + "step": 923 + }, + { + "epoch": 0.07392, + "grad_norm": 1.8176610469818115, + "learning_rate": 9.86844916100494e-06, + "loss": 0.3866, + "step": 924 + }, + { + "epoch": 0.074, + "grad_norm": 1.6104621887207031, + "learning_rate": 9.868162418542233e-06, + "loss": 0.3379, + "step": 925 + }, + { + "epoch": 0.07408, + "grad_norm": 1.7758665084838867, + "learning_rate": 9.867875368087179e-06, + "loss": 0.3828, + "step": 926 + }, + { + "epoch": 0.07416, + "grad_norm": 1.398337960243225, + "learning_rate": 9.867588009657938e-06, + "loss": 0.3024, + "step": 927 + }, + { + "epoch": 0.07424, + "grad_norm": 1.812811017036438, + "learning_rate": 9.86730034327269e-06, + "loss": 0.5139, + "step": 928 + }, + { + "epoch": 0.07432, + "grad_norm": 1.4070488214492798, + "learning_rate": 9.867012368949637e-06, + "loss": 0.325, + "step": 929 + }, + { + "epoch": 0.0744, + "grad_norm": 1.56523597240448, + "learning_rate": 9.866724086706996e-06, + "loss": 0.4111, + "step": 930 + }, + { + "epoch": 0.07448, + "grad_norm": 2.1064064502716064, + "learning_rate": 9.866435496563004e-06, + "loss": 0.4322, + "step": 931 + }, + { + "epoch": 0.07456, + "grad_norm": 1.599244236946106, + "learning_rate": 9.866146598535925e-06, + "loss": 0.3167, + "step": 932 + }, + { + "epoch": 0.07464, + "grad_norm": 1.8283414840698242, + "learning_rate": 9.865857392644029e-06, + "loss": 0.4891, + "step": 933 + }, + { + "epoch": 0.07472, + "grad_norm": 1.8395085334777832, + "learning_rate": 9.86556787890562e-06, + "loss": 0.4302, + "step": 934 + }, + { + "epoch": 0.0748, + "grad_norm": 1.6834166049957275, + "learning_rate": 9.865278057339011e-06, + "loss": 0.3058, + "step": 935 + }, + { + "epoch": 0.07488, + "grad_norm": 1.350382924079895, + "learning_rate": 9.864987927962536e-06, + "loss": 0.3224, + "step": 936 + }, + { + "epoch": 0.07496, + "grad_norm": 2.0382933616638184, + "learning_rate": 9.864697490794556e-06, + "loss": 0.3888, + "step": 937 + }, + { + "epoch": 0.07504, + "grad_norm": 1.7430832386016846, + "learning_rate": 9.864406745853443e-06, + "loss": 0.3893, + "step": 938 + }, + { + "epoch": 0.07512, + "grad_norm": 2.0541799068450928, + "learning_rate": 9.86411569315759e-06, + "loss": 0.4582, + "step": 939 + }, + { + "epoch": 0.0752, + "grad_norm": 1.439474105834961, + "learning_rate": 9.863824332725413e-06, + "loss": 0.3332, + "step": 940 + }, + { + "epoch": 0.07528, + "grad_norm": 1.6011489629745483, + "learning_rate": 9.863532664575346e-06, + "loss": 0.3182, + "step": 941 + }, + { + "epoch": 0.07536, + "grad_norm": 1.7721103429794312, + "learning_rate": 9.863240688725839e-06, + "loss": 0.2911, + "step": 942 + }, + { + "epoch": 0.07544, + "grad_norm": 1.915481448173523, + "learning_rate": 9.862948405195367e-06, + "loss": 0.4128, + "step": 943 + }, + { + "epoch": 0.07552, + "grad_norm": 1.525109887123108, + "learning_rate": 9.862655814002421e-06, + "loss": 0.2976, + "step": 944 + }, + { + "epoch": 0.0756, + "grad_norm": 1.5049928426742554, + "learning_rate": 9.862362915165513e-06, + "loss": 0.3732, + "step": 945 + }, + { + "epoch": 0.07568, + "grad_norm": 1.1610263586044312, + "learning_rate": 9.862069708703172e-06, + "loss": 0.2572, + "step": 946 + }, + { + "epoch": 0.07576, + "grad_norm": 1.5599390268325806, + "learning_rate": 9.861776194633948e-06, + "loss": 0.3857, + "step": 947 + }, + { + "epoch": 0.07584, + "grad_norm": 1.5780138969421387, + "learning_rate": 9.861482372976413e-06, + "loss": 0.3915, + "step": 948 + }, + { + "epoch": 0.07592, + "grad_norm": 1.3370410203933716, + "learning_rate": 9.861188243749154e-06, + "loss": 0.3541, + "step": 949 + }, + { + "epoch": 0.076, + "grad_norm": 1.5612949132919312, + "learning_rate": 9.86089380697078e-06, + "loss": 0.3142, + "step": 950 + }, + { + "epoch": 0.07608, + "grad_norm": 2.2083637714385986, + "learning_rate": 9.860599062659922e-06, + "loss": 0.4862, + "step": 951 + }, + { + "epoch": 0.07616, + "grad_norm": 1.7525696754455566, + "learning_rate": 9.860304010835222e-06, + "loss": 0.3501, + "step": 952 + }, + { + "epoch": 0.07624, + "grad_norm": 1.5072181224822998, + "learning_rate": 9.860008651515352e-06, + "loss": 0.3569, + "step": 953 + }, + { + "epoch": 0.07632, + "grad_norm": 2.169581890106201, + "learning_rate": 9.859712984718994e-06, + "loss": 0.4334, + "step": 954 + }, + { + "epoch": 0.0764, + "grad_norm": 1.740846037864685, + "learning_rate": 9.859417010464857e-06, + "loss": 0.3597, + "step": 955 + }, + { + "epoch": 0.07648, + "grad_norm": 1.516270637512207, + "learning_rate": 9.859120728771667e-06, + "loss": 0.3523, + "step": 956 + }, + { + "epoch": 0.07656, + "grad_norm": 2.101482629776001, + "learning_rate": 9.858824139658166e-06, + "loss": 0.4112, + "step": 957 + }, + { + "epoch": 0.07664, + "grad_norm": 1.6379002332687378, + "learning_rate": 9.85852724314312e-06, + "loss": 0.3163, + "step": 958 + }, + { + "epoch": 0.07672, + "grad_norm": 1.6774054765701294, + "learning_rate": 9.858230039245312e-06, + "loss": 0.2885, + "step": 959 + }, + { + "epoch": 0.0768, + "grad_norm": 1.9318816661834717, + "learning_rate": 9.857932527983544e-06, + "loss": 0.4156, + "step": 960 + }, + { + "epoch": 0.07688, + "grad_norm": 1.6440109014511108, + "learning_rate": 9.85763470937664e-06, + "loss": 0.397, + "step": 961 + }, + { + "epoch": 0.07696, + "grad_norm": 1.682336449623108, + "learning_rate": 9.857336583443441e-06, + "loss": 0.3695, + "step": 962 + }, + { + "epoch": 0.07704, + "grad_norm": 1.555406093597412, + "learning_rate": 9.85703815020281e-06, + "loss": 0.304, + "step": 963 + }, + { + "epoch": 0.07712, + "grad_norm": 1.527746319770813, + "learning_rate": 9.856739409673628e-06, + "loss": 0.3563, + "step": 964 + }, + { + "epoch": 0.0772, + "grad_norm": 2.2929794788360596, + "learning_rate": 9.856440361874791e-06, + "loss": 0.449, + "step": 965 + }, + { + "epoch": 0.07728, + "grad_norm": 1.6270774602890015, + "learning_rate": 9.856141006825225e-06, + "loss": 0.3649, + "step": 966 + }, + { + "epoch": 0.07736, + "grad_norm": 1.5416566133499146, + "learning_rate": 9.855841344543865e-06, + "loss": 0.3216, + "step": 967 + }, + { + "epoch": 0.07744, + "grad_norm": 1.646708607673645, + "learning_rate": 9.855541375049671e-06, + "loss": 0.4936, + "step": 968 + }, + { + "epoch": 0.07752, + "grad_norm": 1.7581204175949097, + "learning_rate": 9.85524109836162e-06, + "loss": 0.3516, + "step": 969 + }, + { + "epoch": 0.0776, + "grad_norm": 1.6109693050384521, + "learning_rate": 9.854940514498712e-06, + "loss": 0.3182, + "step": 970 + }, + { + "epoch": 0.07768, + "grad_norm": 1.38651442527771, + "learning_rate": 9.854639623479962e-06, + "loss": 0.2693, + "step": 971 + }, + { + "epoch": 0.07776, + "grad_norm": 1.5283217430114746, + "learning_rate": 9.854338425324405e-06, + "loss": 0.3448, + "step": 972 + }, + { + "epoch": 0.07784, + "grad_norm": 1.3900861740112305, + "learning_rate": 9.854036920051102e-06, + "loss": 0.2597, + "step": 973 + }, + { + "epoch": 0.07792, + "grad_norm": 1.3767924308776855, + "learning_rate": 9.85373510767912e-06, + "loss": 0.3011, + "step": 974 + }, + { + "epoch": 0.078, + "grad_norm": 1.4808169603347778, + "learning_rate": 9.853432988227563e-06, + "loss": 0.3063, + "step": 975 + }, + { + "epoch": 0.07808, + "grad_norm": 1.500436782836914, + "learning_rate": 9.853130561715538e-06, + "loss": 0.3745, + "step": 976 + }, + { + "epoch": 0.07816, + "grad_norm": 1.8564033508300781, + "learning_rate": 9.852827828162182e-06, + "loss": 0.4051, + "step": 977 + }, + { + "epoch": 0.07824, + "grad_norm": 2.207338809967041, + "learning_rate": 9.852524787586645e-06, + "loss": 0.4495, + "step": 978 + }, + { + "epoch": 0.07832, + "grad_norm": 1.2589378356933594, + "learning_rate": 9.852221440008103e-06, + "loss": 0.3034, + "step": 979 + }, + { + "epoch": 0.0784, + "grad_norm": 1.5934374332427979, + "learning_rate": 9.851917785445745e-06, + "loss": 0.3168, + "step": 980 + }, + { + "epoch": 0.07848, + "grad_norm": 1.6377482414245605, + "learning_rate": 9.851613823918785e-06, + "loss": 0.3054, + "step": 981 + }, + { + "epoch": 0.07856, + "grad_norm": 1.398160696029663, + "learning_rate": 9.85130955544645e-06, + "loss": 0.3111, + "step": 982 + }, + { + "epoch": 0.07864, + "grad_norm": 1.5453648567199707, + "learning_rate": 9.851004980047993e-06, + "loss": 0.3174, + "step": 983 + }, + { + "epoch": 0.07872, + "grad_norm": 1.4585448503494263, + "learning_rate": 9.850700097742683e-06, + "loss": 0.3859, + "step": 984 + }, + { + "epoch": 0.0788, + "grad_norm": 1.838578224182129, + "learning_rate": 9.850394908549808e-06, + "loss": 0.431, + "step": 985 + }, + { + "epoch": 0.07888, + "grad_norm": 1.7750165462493896, + "learning_rate": 9.850089412488676e-06, + "loss": 0.3653, + "step": 986 + }, + { + "epoch": 0.07896, + "grad_norm": 1.7502179145812988, + "learning_rate": 9.849783609578616e-06, + "loss": 0.4266, + "step": 987 + }, + { + "epoch": 0.07904, + "grad_norm": 1.66646409034729, + "learning_rate": 9.849477499838974e-06, + "loss": 0.3501, + "step": 988 + }, + { + "epoch": 0.07912, + "grad_norm": 2.072134256362915, + "learning_rate": 9.849171083289117e-06, + "loss": 0.4357, + "step": 989 + }, + { + "epoch": 0.0792, + "grad_norm": 1.6113394498825073, + "learning_rate": 9.84886435994843e-06, + "loss": 0.3318, + "step": 990 + }, + { + "epoch": 0.07928, + "grad_norm": 1.862763524055481, + "learning_rate": 9.84855732983632e-06, + "loss": 0.34, + "step": 991 + }, + { + "epoch": 0.07936, + "grad_norm": 1.8371633291244507, + "learning_rate": 9.848249992972212e-06, + "loss": 0.3153, + "step": 992 + }, + { + "epoch": 0.07944, + "grad_norm": 1.5195534229278564, + "learning_rate": 9.847942349375549e-06, + "loss": 0.3188, + "step": 993 + }, + { + "epoch": 0.07952, + "grad_norm": 1.642816185951233, + "learning_rate": 9.847634399065794e-06, + "loss": 0.3539, + "step": 994 + }, + { + "epoch": 0.0796, + "grad_norm": 1.4042744636535645, + "learning_rate": 9.84732614206243e-06, + "loss": 0.3128, + "step": 995 + }, + { + "epoch": 0.07968, + "grad_norm": 1.9033617973327637, + "learning_rate": 9.847017578384961e-06, + "loss": 0.395, + "step": 996 + }, + { + "epoch": 0.07976, + "grad_norm": 2.087599039077759, + "learning_rate": 9.846708708052908e-06, + "loss": 0.3815, + "step": 997 + }, + { + "epoch": 0.07984, + "grad_norm": 1.8923081159591675, + "learning_rate": 9.846399531085812e-06, + "loss": 0.3115, + "step": 998 + }, + { + "epoch": 0.07992, + "grad_norm": 1.654727816581726, + "learning_rate": 9.846090047503235e-06, + "loss": 0.2935, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 2.230875015258789, + "learning_rate": 9.845780257324755e-06, + "loss": 0.3925, + "step": 1000 + }, + { + "epoch": 0.08008, + "grad_norm": 1.9971858263015747, + "learning_rate": 9.845470160569973e-06, + "loss": 0.3956, + "step": 1001 + }, + { + "epoch": 0.08016, + "grad_norm": 1.6523634195327759, + "learning_rate": 9.845159757258505e-06, + "loss": 0.443, + "step": 1002 + }, + { + "epoch": 0.08024, + "grad_norm": 2.058262586593628, + "learning_rate": 9.844849047409993e-06, + "loss": 0.3834, + "step": 1003 + }, + { + "epoch": 0.08032, + "grad_norm": 1.6032230854034424, + "learning_rate": 9.844538031044092e-06, + "loss": 0.3474, + "step": 1004 + }, + { + "epoch": 0.0804, + "grad_norm": 1.7125388383865356, + "learning_rate": 9.84422670818048e-06, + "loss": 0.4696, + "step": 1005 + }, + { + "epoch": 0.08048, + "grad_norm": 1.2934495210647583, + "learning_rate": 9.843915078838852e-06, + "loss": 0.3361, + "step": 1006 + }, + { + "epoch": 0.08056, + "grad_norm": 1.6751182079315186, + "learning_rate": 9.843603143038925e-06, + "loss": 0.3279, + "step": 1007 + }, + { + "epoch": 0.08064, + "grad_norm": 1.739035964012146, + "learning_rate": 9.843290900800436e-06, + "loss": 0.3179, + "step": 1008 + }, + { + "epoch": 0.08072, + "grad_norm": 1.8006422519683838, + "learning_rate": 9.842978352143133e-06, + "loss": 0.3985, + "step": 1009 + }, + { + "epoch": 0.0808, + "grad_norm": 1.3257513046264648, + "learning_rate": 9.842665497086798e-06, + "loss": 0.3005, + "step": 1010 + }, + { + "epoch": 0.08088, + "grad_norm": 1.31377112865448, + "learning_rate": 9.84235233565122e-06, + "loss": 0.2922, + "step": 1011 + }, + { + "epoch": 0.08096, + "grad_norm": 1.3837974071502686, + "learning_rate": 9.842038867856211e-06, + "loss": 0.2946, + "step": 1012 + }, + { + "epoch": 0.08104, + "grad_norm": 1.7588199377059937, + "learning_rate": 9.841725093721606e-06, + "loss": 0.4315, + "step": 1013 + }, + { + "epoch": 0.08112, + "grad_norm": 1.4498939514160156, + "learning_rate": 9.841411013267252e-06, + "loss": 0.3537, + "step": 1014 + }, + { + "epoch": 0.0812, + "grad_norm": 1.9055722951889038, + "learning_rate": 9.841096626513024e-06, + "loss": 0.4242, + "step": 1015 + }, + { + "epoch": 0.08128, + "grad_norm": 2.0020415782928467, + "learning_rate": 9.840781933478813e-06, + "loss": 0.4011, + "step": 1016 + }, + { + "epoch": 0.08136, + "grad_norm": 1.7482478618621826, + "learning_rate": 9.840466934184525e-06, + "loss": 0.3524, + "step": 1017 + }, + { + "epoch": 0.08144, + "grad_norm": 1.8049051761627197, + "learning_rate": 9.84015162865009e-06, + "loss": 0.3151, + "step": 1018 + }, + { + "epoch": 0.08152, + "grad_norm": 1.8534055948257446, + "learning_rate": 9.839836016895457e-06, + "loss": 0.3722, + "step": 1019 + }, + { + "epoch": 0.0816, + "grad_norm": 1.7496310472488403, + "learning_rate": 9.839520098940593e-06, + "loss": 0.4035, + "step": 1020 + }, + { + "epoch": 0.08168, + "grad_norm": 2.2628374099731445, + "learning_rate": 9.839203874805486e-06, + "loss": 0.4903, + "step": 1021 + }, + { + "epoch": 0.08176, + "grad_norm": 1.4022674560546875, + "learning_rate": 9.838887344510139e-06, + "loss": 0.314, + "step": 1022 + }, + { + "epoch": 0.08184, + "grad_norm": 1.7159093618392944, + "learning_rate": 9.838570508074584e-06, + "loss": 0.4232, + "step": 1023 + }, + { + "epoch": 0.08192, + "grad_norm": 1.4128509759902954, + "learning_rate": 9.838253365518862e-06, + "loss": 0.3225, + "step": 1024 + }, + { + "epoch": 0.082, + "grad_norm": 1.8229515552520752, + "learning_rate": 9.837935916863038e-06, + "loss": 0.3784, + "step": 1025 + }, + { + "epoch": 0.08208, + "grad_norm": 1.3460335731506348, + "learning_rate": 9.837618162127196e-06, + "loss": 0.2683, + "step": 1026 + }, + { + "epoch": 0.08216, + "grad_norm": 1.5191764831542969, + "learning_rate": 9.83730010133144e-06, + "loss": 0.3418, + "step": 1027 + }, + { + "epoch": 0.08224, + "grad_norm": 1.3885241746902466, + "learning_rate": 9.836981734495895e-06, + "loss": 0.2815, + "step": 1028 + }, + { + "epoch": 0.08232, + "grad_norm": 2.1471521854400635, + "learning_rate": 9.836663061640697e-06, + "loss": 0.3671, + "step": 1029 + }, + { + "epoch": 0.0824, + "grad_norm": 1.2107166051864624, + "learning_rate": 9.83634408278601e-06, + "loss": 0.2604, + "step": 1030 + }, + { + "epoch": 0.08248, + "grad_norm": 1.9330739974975586, + "learning_rate": 9.836024797952017e-06, + "loss": 0.5037, + "step": 1031 + }, + { + "epoch": 0.08256, + "grad_norm": 1.541945219039917, + "learning_rate": 9.835705207158916e-06, + "loss": 0.3424, + "step": 1032 + }, + { + "epoch": 0.08264, + "grad_norm": 2.0090341567993164, + "learning_rate": 9.835385310426928e-06, + "loss": 0.4266, + "step": 1033 + }, + { + "epoch": 0.08272, + "grad_norm": 1.8531626462936401, + "learning_rate": 9.835065107776289e-06, + "loss": 0.4106, + "step": 1034 + }, + { + "epoch": 0.0828, + "grad_norm": 1.9602084159851074, + "learning_rate": 9.83474459922726e-06, + "loss": 0.3546, + "step": 1035 + }, + { + "epoch": 0.08288, + "grad_norm": 1.5967477560043335, + "learning_rate": 9.834423784800115e-06, + "loss": 0.4163, + "step": 1036 + }, + { + "epoch": 0.08296, + "grad_norm": 1.9444454908370972, + "learning_rate": 9.834102664515155e-06, + "loss": 0.396, + "step": 1037 + }, + { + "epoch": 0.08304, + "grad_norm": 1.4539134502410889, + "learning_rate": 9.833781238392695e-06, + "loss": 0.3087, + "step": 1038 + }, + { + "epoch": 0.08312, + "grad_norm": 1.5706698894500732, + "learning_rate": 9.833459506453069e-06, + "loss": 0.3336, + "step": 1039 + }, + { + "epoch": 0.0832, + "grad_norm": 1.93034827709198, + "learning_rate": 9.833137468716634e-06, + "loss": 0.402, + "step": 1040 + }, + { + "epoch": 0.08328, + "grad_norm": 1.7693796157836914, + "learning_rate": 9.832815125203761e-06, + "loss": 0.4466, + "step": 1041 + }, + { + "epoch": 0.08336, + "grad_norm": 1.506583571434021, + "learning_rate": 9.832492475934848e-06, + "loss": 0.3304, + "step": 1042 + }, + { + "epoch": 0.08344, + "grad_norm": 1.3750256299972534, + "learning_rate": 9.832169520930303e-06, + "loss": 0.2565, + "step": 1043 + }, + { + "epoch": 0.08352, + "grad_norm": 1.999240517616272, + "learning_rate": 9.831846260210563e-06, + "loss": 0.486, + "step": 1044 + }, + { + "epoch": 0.0836, + "grad_norm": 1.6762970685958862, + "learning_rate": 9.831522693796077e-06, + "loss": 0.3922, + "step": 1045 + }, + { + "epoch": 0.08368, + "grad_norm": 1.6641404628753662, + "learning_rate": 9.831198821707316e-06, + "loss": 0.3165, + "step": 1046 + }, + { + "epoch": 0.08376, + "grad_norm": 1.7150557041168213, + "learning_rate": 9.83087464396477e-06, + "loss": 0.3909, + "step": 1047 + }, + { + "epoch": 0.08384, + "grad_norm": 1.5680177211761475, + "learning_rate": 9.830550160588951e-06, + "loss": 0.4403, + "step": 1048 + }, + { + "epoch": 0.08392, + "grad_norm": 1.7904253005981445, + "learning_rate": 9.830225371600386e-06, + "loss": 0.3263, + "step": 1049 + }, + { + "epoch": 0.084, + "grad_norm": 1.3248636722564697, + "learning_rate": 9.829900277019624e-06, + "loss": 0.2865, + "step": 1050 + }, + { + "epoch": 0.08408, + "grad_norm": 1.825286865234375, + "learning_rate": 9.829574876867232e-06, + "loss": 0.4188, + "step": 1051 + }, + { + "epoch": 0.08416, + "grad_norm": 1.7181260585784912, + "learning_rate": 9.829249171163798e-06, + "loss": 0.3737, + "step": 1052 + }, + { + "epoch": 0.08424, + "grad_norm": 2.284151077270508, + "learning_rate": 9.828923159929927e-06, + "loss": 0.4879, + "step": 1053 + }, + { + "epoch": 0.08432, + "grad_norm": 1.6576015949249268, + "learning_rate": 9.828596843186244e-06, + "loss": 0.3157, + "step": 1054 + }, + { + "epoch": 0.0844, + "grad_norm": 1.5160009860992432, + "learning_rate": 9.828270220953398e-06, + "loss": 0.2853, + "step": 1055 + }, + { + "epoch": 0.08448, + "grad_norm": 1.5151618719100952, + "learning_rate": 9.827943293252048e-06, + "loss": 0.4335, + "step": 1056 + }, + { + "epoch": 0.08456, + "grad_norm": 1.3789680004119873, + "learning_rate": 9.82761606010288e-06, + "loss": 0.3956, + "step": 1057 + }, + { + "epoch": 0.08464, + "grad_norm": 2.0407497882843018, + "learning_rate": 9.8272885215266e-06, + "loss": 0.4707, + "step": 1058 + }, + { + "epoch": 0.08472, + "grad_norm": 1.4480390548706055, + "learning_rate": 9.826960677543926e-06, + "loss": 0.3389, + "step": 1059 + }, + { + "epoch": 0.0848, + "grad_norm": 1.438524603843689, + "learning_rate": 9.8266325281756e-06, + "loss": 0.2841, + "step": 1060 + }, + { + "epoch": 0.08488, + "grad_norm": 1.9115839004516602, + "learning_rate": 9.826304073442385e-06, + "loss": 0.3926, + "step": 1061 + }, + { + "epoch": 0.08496, + "grad_norm": 2.102163314819336, + "learning_rate": 9.82597531336506e-06, + "loss": 0.4392, + "step": 1062 + }, + { + "epoch": 0.08504, + "grad_norm": 1.6580944061279297, + "learning_rate": 9.825646247964425e-06, + "loss": 0.3485, + "step": 1063 + }, + { + "epoch": 0.08512, + "grad_norm": 1.469714879989624, + "learning_rate": 9.825316877261298e-06, + "loss": 0.3193, + "step": 1064 + }, + { + "epoch": 0.0852, + "grad_norm": 1.4311915636062622, + "learning_rate": 9.824987201276519e-06, + "loss": 0.2758, + "step": 1065 + }, + { + "epoch": 0.08528, + "grad_norm": 1.8289291858673096, + "learning_rate": 9.824657220030942e-06, + "loss": 0.4056, + "step": 1066 + }, + { + "epoch": 0.08536, + "grad_norm": 1.6348884105682373, + "learning_rate": 9.824326933545448e-06, + "loss": 0.2958, + "step": 1067 + }, + { + "epoch": 0.08544, + "grad_norm": 1.3245985507965088, + "learning_rate": 9.823996341840929e-06, + "loss": 0.2905, + "step": 1068 + }, + { + "epoch": 0.08552, + "grad_norm": 1.329889178276062, + "learning_rate": 9.823665444938304e-06, + "loss": 0.3558, + "step": 1069 + }, + { + "epoch": 0.0856, + "grad_norm": 1.8176319599151611, + "learning_rate": 9.823334242858506e-06, + "loss": 0.3617, + "step": 1070 + }, + { + "epoch": 0.08568, + "grad_norm": 2.28464674949646, + "learning_rate": 9.82300273562249e-06, + "loss": 0.4581, + "step": 1071 + }, + { + "epoch": 0.08576, + "grad_norm": 1.3536391258239746, + "learning_rate": 9.822670923251228e-06, + "loss": 0.3201, + "step": 1072 + }, + { + "epoch": 0.08584, + "grad_norm": 1.6658986806869507, + "learning_rate": 9.822338805765714e-06, + "loss": 0.4, + "step": 1073 + }, + { + "epoch": 0.08592, + "grad_norm": 1.4204604625701904, + "learning_rate": 9.82200638318696e-06, + "loss": 0.3206, + "step": 1074 + }, + { + "epoch": 0.086, + "grad_norm": 1.6750638484954834, + "learning_rate": 9.821673655535995e-06, + "loss": 0.3092, + "step": 1075 + }, + { + "epoch": 0.08608, + "grad_norm": 1.8198531866073608, + "learning_rate": 9.821340622833873e-06, + "loss": 0.3966, + "step": 1076 + }, + { + "epoch": 0.08616, + "grad_norm": 1.710455298423767, + "learning_rate": 9.82100728510166e-06, + "loss": 0.2862, + "step": 1077 + }, + { + "epoch": 0.08624, + "grad_norm": 1.3838942050933838, + "learning_rate": 9.820673642360448e-06, + "loss": 0.3097, + "step": 1078 + }, + { + "epoch": 0.08632, + "grad_norm": 1.588562250137329, + "learning_rate": 9.820339694631345e-06, + "loss": 0.4134, + "step": 1079 + }, + { + "epoch": 0.0864, + "grad_norm": 1.470871925354004, + "learning_rate": 9.820005441935479e-06, + "loss": 0.2925, + "step": 1080 + }, + { + "epoch": 0.08648, + "grad_norm": 1.691702127456665, + "learning_rate": 9.819670884293994e-06, + "loss": 0.3222, + "step": 1081 + }, + { + "epoch": 0.08656, + "grad_norm": 1.7869762182235718, + "learning_rate": 9.819336021728062e-06, + "loss": 0.3606, + "step": 1082 + }, + { + "epoch": 0.08664, + "grad_norm": 1.437152624130249, + "learning_rate": 9.819000854258864e-06, + "loss": 0.2756, + "step": 1083 + }, + { + "epoch": 0.08672, + "grad_norm": 1.7799644470214844, + "learning_rate": 9.818665381907605e-06, + "loss": 0.3988, + "step": 1084 + }, + { + "epoch": 0.0868, + "grad_norm": 1.8911408185958862, + "learning_rate": 9.818329604695513e-06, + "loss": 0.4247, + "step": 1085 + }, + { + "epoch": 0.08688, + "grad_norm": 2.2056448459625244, + "learning_rate": 9.817993522643827e-06, + "loss": 0.3675, + "step": 1086 + }, + { + "epoch": 0.08696, + "grad_norm": 1.294008731842041, + "learning_rate": 9.817657135773813e-06, + "loss": 0.3124, + "step": 1087 + }, + { + "epoch": 0.08704, + "grad_norm": 1.4826951026916504, + "learning_rate": 9.817320444106753e-06, + "loss": 0.3155, + "step": 1088 + }, + { + "epoch": 0.08712, + "grad_norm": 1.5381442308425903, + "learning_rate": 9.816983447663946e-06, + "loss": 0.4008, + "step": 1089 + }, + { + "epoch": 0.0872, + "grad_norm": 1.5871357917785645, + "learning_rate": 9.816646146466714e-06, + "loss": 0.333, + "step": 1090 + }, + { + "epoch": 0.08728, + "grad_norm": 1.603145718574524, + "learning_rate": 9.816308540536396e-06, + "loss": 0.3314, + "step": 1091 + }, + { + "epoch": 0.08736, + "grad_norm": 1.7113319635391235, + "learning_rate": 9.815970629894354e-06, + "loss": 0.408, + "step": 1092 + }, + { + "epoch": 0.08744, + "grad_norm": 1.4441734552383423, + "learning_rate": 9.815632414561964e-06, + "loss": 0.2921, + "step": 1093 + }, + { + "epoch": 0.08752, + "grad_norm": 1.7782763242721558, + "learning_rate": 9.815293894560623e-06, + "loss": 0.3528, + "step": 1094 + }, + { + "epoch": 0.0876, + "grad_norm": 1.4475657939910889, + "learning_rate": 9.814955069911752e-06, + "loss": 0.3025, + "step": 1095 + }, + { + "epoch": 0.08768, + "grad_norm": 1.3749080896377563, + "learning_rate": 9.814615940636781e-06, + "loss": 0.277, + "step": 1096 + }, + { + "epoch": 0.08776, + "grad_norm": 1.6869237422943115, + "learning_rate": 9.814276506757172e-06, + "loss": 0.4035, + "step": 1097 + }, + { + "epoch": 0.08784, + "grad_norm": 1.6214927434921265, + "learning_rate": 9.813936768294397e-06, + "loss": 0.3524, + "step": 1098 + }, + { + "epoch": 0.08792, + "grad_norm": 2.009049654006958, + "learning_rate": 9.813596725269948e-06, + "loss": 0.3494, + "step": 1099 + }, + { + "epoch": 0.088, + "grad_norm": 1.901945948600769, + "learning_rate": 9.813256377705341e-06, + "loss": 0.4302, + "step": 1100 + }, + { + "epoch": 0.08808, + "grad_norm": 2.3681435585021973, + "learning_rate": 9.812915725622109e-06, + "loss": 0.5687, + "step": 1101 + }, + { + "epoch": 0.08816, + "grad_norm": 2.2910284996032715, + "learning_rate": 9.812574769041805e-06, + "loss": 0.4986, + "step": 1102 + }, + { + "epoch": 0.08824, + "grad_norm": 1.6809210777282715, + "learning_rate": 9.812233507985995e-06, + "loss": 0.3687, + "step": 1103 + }, + { + "epoch": 0.08832, + "grad_norm": 1.779154658317566, + "learning_rate": 9.811891942476275e-06, + "loss": 0.4001, + "step": 1104 + }, + { + "epoch": 0.0884, + "grad_norm": 1.6831563711166382, + "learning_rate": 9.811550072534251e-06, + "loss": 0.3435, + "step": 1105 + }, + { + "epoch": 0.08848, + "grad_norm": 1.9863594770431519, + "learning_rate": 9.811207898181555e-06, + "loss": 0.3946, + "step": 1106 + }, + { + "epoch": 0.08856, + "grad_norm": 1.761527419090271, + "learning_rate": 9.81086541943983e-06, + "loss": 0.316, + "step": 1107 + }, + { + "epoch": 0.08864, + "grad_norm": 1.4106063842773438, + "learning_rate": 9.810522636330751e-06, + "loss": 0.2762, + "step": 1108 + }, + { + "epoch": 0.08872, + "grad_norm": 1.7829062938690186, + "learning_rate": 9.810179548875999e-06, + "loss": 0.3734, + "step": 1109 + }, + { + "epoch": 0.0888, + "grad_norm": 1.5982229709625244, + "learning_rate": 9.809836157097282e-06, + "loss": 0.3701, + "step": 1110 + }, + { + "epoch": 0.08888, + "grad_norm": 1.4014500379562378, + "learning_rate": 9.809492461016326e-06, + "loss": 0.3202, + "step": 1111 + }, + { + "epoch": 0.08896, + "grad_norm": 1.6743066310882568, + "learning_rate": 9.809148460654874e-06, + "loss": 0.2995, + "step": 1112 + }, + { + "epoch": 0.08904, + "grad_norm": 1.6122390031814575, + "learning_rate": 9.80880415603469e-06, + "loss": 0.3507, + "step": 1113 + }, + { + "epoch": 0.08912, + "grad_norm": 1.8418755531311035, + "learning_rate": 9.808459547177559e-06, + "loss": 0.4162, + "step": 1114 + }, + { + "epoch": 0.0892, + "grad_norm": 1.4559040069580078, + "learning_rate": 9.808114634105278e-06, + "loss": 0.3797, + "step": 1115 + }, + { + "epoch": 0.08928, + "grad_norm": 1.9068504571914673, + "learning_rate": 9.807769416839677e-06, + "loss": 0.4154, + "step": 1116 + }, + { + "epoch": 0.08936, + "grad_norm": 1.5469180345535278, + "learning_rate": 9.807423895402587e-06, + "loss": 0.4295, + "step": 1117 + }, + { + "epoch": 0.08944, + "grad_norm": 1.7214394807815552, + "learning_rate": 9.807078069815877e-06, + "loss": 0.3601, + "step": 1118 + }, + { + "epoch": 0.08952, + "grad_norm": 1.7878152132034302, + "learning_rate": 9.80673194010142e-06, + "loss": 0.3575, + "step": 1119 + }, + { + "epoch": 0.0896, + "grad_norm": 1.857308030128479, + "learning_rate": 9.806385506281117e-06, + "loss": 0.4363, + "step": 1120 + }, + { + "epoch": 0.08968, + "grad_norm": 1.6544005870819092, + "learning_rate": 9.806038768376885e-06, + "loss": 0.3477, + "step": 1121 + }, + { + "epoch": 0.08976, + "grad_norm": 1.5055336952209473, + "learning_rate": 9.80569172641066e-06, + "loss": 0.2681, + "step": 1122 + }, + { + "epoch": 0.08984, + "grad_norm": 1.9073718786239624, + "learning_rate": 9.8053443804044e-06, + "loss": 0.4285, + "step": 1123 + }, + { + "epoch": 0.08992, + "grad_norm": 1.2498080730438232, + "learning_rate": 9.80499673038008e-06, + "loss": 0.2639, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 1.5033406019210815, + "learning_rate": 9.804648776359695e-06, + "loss": 0.3395, + "step": 1125 + }, + { + "epoch": 0.09008, + "grad_norm": 1.9122231006622314, + "learning_rate": 9.80430051836526e-06, + "loss": 0.3571, + "step": 1126 + }, + { + "epoch": 0.09016, + "grad_norm": 2.6971442699432373, + "learning_rate": 9.803951956418803e-06, + "loss": 0.4251, + "step": 1127 + }, + { + "epoch": 0.09024, + "grad_norm": 1.4795504808425903, + "learning_rate": 9.803603090542381e-06, + "loss": 0.3596, + "step": 1128 + }, + { + "epoch": 0.09032, + "grad_norm": 1.6504251956939697, + "learning_rate": 9.803253920758064e-06, + "loss": 0.3429, + "step": 1129 + }, + { + "epoch": 0.0904, + "grad_norm": 2.284956693649292, + "learning_rate": 9.802904447087945e-06, + "loss": 0.6959, + "step": 1130 + }, + { + "epoch": 0.09048, + "grad_norm": 1.777529001235962, + "learning_rate": 9.802554669554131e-06, + "loss": 0.3375, + "step": 1131 + }, + { + "epoch": 0.09056, + "grad_norm": 1.831652045249939, + "learning_rate": 9.802204588178752e-06, + "loss": 0.4083, + "step": 1132 + }, + { + "epoch": 0.09064, + "grad_norm": 2.0457098484039307, + "learning_rate": 9.801854202983957e-06, + "loss": 0.3995, + "step": 1133 + }, + { + "epoch": 0.09072, + "grad_norm": 1.3770190477371216, + "learning_rate": 9.801503513991914e-06, + "loss": 0.2523, + "step": 1134 + }, + { + "epoch": 0.0908, + "grad_norm": 1.600197434425354, + "learning_rate": 9.80115252122481e-06, + "loss": 0.3447, + "step": 1135 + }, + { + "epoch": 0.09088, + "grad_norm": 1.7964802980422974, + "learning_rate": 9.800801224704851e-06, + "loss": 0.4085, + "step": 1136 + }, + { + "epoch": 0.09096, + "grad_norm": 1.558449149131775, + "learning_rate": 9.800449624454262e-06, + "loss": 0.3532, + "step": 1137 + }, + { + "epoch": 0.09104, + "grad_norm": 1.844244360923767, + "learning_rate": 9.800097720495286e-06, + "loss": 0.3174, + "step": 1138 + }, + { + "epoch": 0.09112, + "grad_norm": 1.8442306518554688, + "learning_rate": 9.79974551285019e-06, + "loss": 0.3615, + "step": 1139 + }, + { + "epoch": 0.0912, + "grad_norm": 1.5331751108169556, + "learning_rate": 9.799393001541255e-06, + "loss": 0.3207, + "step": 1140 + }, + { + "epoch": 0.09128, + "grad_norm": 1.578279972076416, + "learning_rate": 9.799040186590782e-06, + "loss": 0.4399, + "step": 1141 + }, + { + "epoch": 0.09136, + "grad_norm": 1.4967174530029297, + "learning_rate": 9.798687068021095e-06, + "loss": 0.3297, + "step": 1142 + }, + { + "epoch": 0.09144, + "grad_norm": 2.0132367610931396, + "learning_rate": 9.798333645854536e-06, + "loss": 0.3207, + "step": 1143 + }, + { + "epoch": 0.09152, + "grad_norm": 1.6801670789718628, + "learning_rate": 9.79797992011346e-06, + "loss": 0.3193, + "step": 1144 + }, + { + "epoch": 0.0916, + "grad_norm": 2.1940245628356934, + "learning_rate": 9.797625890820249e-06, + "loss": 0.4476, + "step": 1145 + }, + { + "epoch": 0.09168, + "grad_norm": 1.625232458114624, + "learning_rate": 9.7972715579973e-06, + "loss": 0.2663, + "step": 1146 + }, + { + "epoch": 0.09176, + "grad_norm": 2.0687153339385986, + "learning_rate": 9.796916921667033e-06, + "loss": 0.3539, + "step": 1147 + }, + { + "epoch": 0.09184, + "grad_norm": 1.9847198724746704, + "learning_rate": 9.796561981851882e-06, + "loss": 0.4134, + "step": 1148 + }, + { + "epoch": 0.09192, + "grad_norm": 1.644768238067627, + "learning_rate": 9.796206738574303e-06, + "loss": 0.2589, + "step": 1149 + }, + { + "epoch": 0.092, + "grad_norm": 1.431854486465454, + "learning_rate": 9.795851191856774e-06, + "loss": 0.3136, + "step": 1150 + }, + { + "epoch": 0.09208, + "grad_norm": 1.3832859992980957, + "learning_rate": 9.795495341721784e-06, + "loss": 0.3832, + "step": 1151 + }, + { + "epoch": 0.09216, + "grad_norm": 1.497043251991272, + "learning_rate": 9.795139188191851e-06, + "loss": 0.367, + "step": 1152 + }, + { + "epoch": 0.09224, + "grad_norm": 1.3032704591751099, + "learning_rate": 9.794782731289507e-06, + "loss": 0.2557, + "step": 1153 + }, + { + "epoch": 0.09232, + "grad_norm": 1.7582857608795166, + "learning_rate": 9.794425971037303e-06, + "loss": 0.3647, + "step": 1154 + }, + { + "epoch": 0.0924, + "grad_norm": 1.4522085189819336, + "learning_rate": 9.794068907457809e-06, + "loss": 0.2847, + "step": 1155 + }, + { + "epoch": 0.09248, + "grad_norm": 1.3870216608047485, + "learning_rate": 9.793711540573616e-06, + "loss": 0.2989, + "step": 1156 + }, + { + "epoch": 0.09256, + "grad_norm": 1.3415980339050293, + "learning_rate": 9.793353870407335e-06, + "loss": 0.2967, + "step": 1157 + }, + { + "epoch": 0.09264, + "grad_norm": 1.8516277074813843, + "learning_rate": 9.792995896981591e-06, + "loss": 0.3464, + "step": 1158 + }, + { + "epoch": 0.09272, + "grad_norm": 1.5445516109466553, + "learning_rate": 9.792637620319037e-06, + "loss": 0.3213, + "step": 1159 + }, + { + "epoch": 0.0928, + "grad_norm": 1.8748204708099365, + "learning_rate": 9.792279040442334e-06, + "loss": 0.4657, + "step": 1160 + }, + { + "epoch": 0.09288, + "grad_norm": 1.5904078483581543, + "learning_rate": 9.791920157374173e-06, + "loss": 0.3865, + "step": 1161 + }, + { + "epoch": 0.09296, + "grad_norm": 1.2793481349945068, + "learning_rate": 9.791560971137257e-06, + "loss": 0.3927, + "step": 1162 + }, + { + "epoch": 0.09304, + "grad_norm": 1.3844211101531982, + "learning_rate": 9.791201481754312e-06, + "loss": 0.2778, + "step": 1163 + }, + { + "epoch": 0.09312, + "grad_norm": 1.733962059020996, + "learning_rate": 9.790841689248078e-06, + "loss": 0.3492, + "step": 1164 + }, + { + "epoch": 0.0932, + "grad_norm": 1.4930529594421387, + "learning_rate": 9.790481593641324e-06, + "loss": 0.2962, + "step": 1165 + }, + { + "epoch": 0.09328, + "grad_norm": 1.9883126020431519, + "learning_rate": 9.790121194956825e-06, + "loss": 0.3775, + "step": 1166 + }, + { + "epoch": 0.09336, + "grad_norm": 1.5635162591934204, + "learning_rate": 9.789760493217388e-06, + "loss": 0.327, + "step": 1167 + }, + { + "epoch": 0.09344, + "grad_norm": 1.5043566226959229, + "learning_rate": 9.78939948844583e-06, + "loss": 0.2745, + "step": 1168 + }, + { + "epoch": 0.09352, + "grad_norm": 1.3067989349365234, + "learning_rate": 9.789038180664994e-06, + "loss": 0.2482, + "step": 1169 + }, + { + "epoch": 0.0936, + "grad_norm": 1.678215503692627, + "learning_rate": 9.788676569897734e-06, + "loss": 0.316, + "step": 1170 + }, + { + "epoch": 0.09368, + "grad_norm": 1.8970040082931519, + "learning_rate": 9.788314656166931e-06, + "loss": 0.4448, + "step": 1171 + }, + { + "epoch": 0.09376, + "grad_norm": 1.6337584257125854, + "learning_rate": 9.787952439495481e-06, + "loss": 0.402, + "step": 1172 + }, + { + "epoch": 0.09384, + "grad_norm": 2.054110050201416, + "learning_rate": 9.787589919906301e-06, + "loss": 0.4844, + "step": 1173 + }, + { + "epoch": 0.09392, + "grad_norm": 1.683643102645874, + "learning_rate": 9.787227097422327e-06, + "loss": 0.4237, + "step": 1174 + }, + { + "epoch": 0.094, + "grad_norm": 1.800292730331421, + "learning_rate": 9.786863972066515e-06, + "loss": 0.4866, + "step": 1175 + }, + { + "epoch": 0.09408, + "grad_norm": 1.3618955612182617, + "learning_rate": 9.786500543861833e-06, + "loss": 0.3326, + "step": 1176 + }, + { + "epoch": 0.09416, + "grad_norm": 1.6615434885025024, + "learning_rate": 9.786136812831276e-06, + "loss": 0.3495, + "step": 1177 + }, + { + "epoch": 0.09424, + "grad_norm": 1.6406981945037842, + "learning_rate": 9.78577277899786e-06, + "loss": 0.3495, + "step": 1178 + }, + { + "epoch": 0.09432, + "grad_norm": 1.4927983283996582, + "learning_rate": 9.785408442384612e-06, + "loss": 0.3832, + "step": 1179 + }, + { + "epoch": 0.0944, + "grad_norm": 1.31818425655365, + "learning_rate": 9.785043803014584e-06, + "loss": 0.3047, + "step": 1180 + }, + { + "epoch": 0.09448, + "grad_norm": 1.7068977355957031, + "learning_rate": 9.784678860910846e-06, + "loss": 0.3715, + "step": 1181 + }, + { + "epoch": 0.09456, + "grad_norm": 2.2386581897735596, + "learning_rate": 9.784313616096486e-06, + "loss": 0.3983, + "step": 1182 + }, + { + "epoch": 0.09464, + "grad_norm": 1.369645595550537, + "learning_rate": 9.783948068594613e-06, + "loss": 0.3607, + "step": 1183 + }, + { + "epoch": 0.09472, + "grad_norm": 1.6752623319625854, + "learning_rate": 9.783582218428352e-06, + "loss": 0.4052, + "step": 1184 + }, + { + "epoch": 0.0948, + "grad_norm": 1.740079641342163, + "learning_rate": 9.783216065620849e-06, + "loss": 0.3649, + "step": 1185 + }, + { + "epoch": 0.09488, + "grad_norm": 1.514141321182251, + "learning_rate": 9.78284961019527e-06, + "loss": 0.3816, + "step": 1186 + }, + { + "epoch": 0.09496, + "grad_norm": 1.8777161836624146, + "learning_rate": 9.782482852174802e-06, + "loss": 0.3526, + "step": 1187 + }, + { + "epoch": 0.09504, + "grad_norm": 2.190656900405884, + "learning_rate": 9.782115791582644e-06, + "loss": 0.4483, + "step": 1188 + }, + { + "epoch": 0.09512, + "grad_norm": 1.8506275415420532, + "learning_rate": 9.781748428442022e-06, + "loss": 0.3753, + "step": 1189 + }, + { + "epoch": 0.0952, + "grad_norm": 1.515964150428772, + "learning_rate": 9.781380762776176e-06, + "loss": 0.4088, + "step": 1190 + }, + { + "epoch": 0.09528, + "grad_norm": 1.1206365823745728, + "learning_rate": 9.781012794608368e-06, + "loss": 0.2067, + "step": 1191 + }, + { + "epoch": 0.09536, + "grad_norm": 1.286902666091919, + "learning_rate": 9.780644523961877e-06, + "loss": 0.2787, + "step": 1192 + }, + { + "epoch": 0.09544, + "grad_norm": 1.765989065170288, + "learning_rate": 9.780275950860005e-06, + "loss": 0.3091, + "step": 1193 + }, + { + "epoch": 0.09552, + "grad_norm": 2.0813465118408203, + "learning_rate": 9.779907075326066e-06, + "loss": 0.3626, + "step": 1194 + }, + { + "epoch": 0.0956, + "grad_norm": 1.8357547521591187, + "learning_rate": 9.779537897383403e-06, + "loss": 0.3385, + "step": 1195 + }, + { + "epoch": 0.09568, + "grad_norm": 2.120861530303955, + "learning_rate": 9.779168417055368e-06, + "loss": 0.462, + "step": 1196 + }, + { + "epoch": 0.09576, + "grad_norm": 1.4538586139678955, + "learning_rate": 9.778798634365336e-06, + "loss": 0.3281, + "step": 1197 + }, + { + "epoch": 0.09584, + "grad_norm": 2.0800697803497314, + "learning_rate": 9.778428549336707e-06, + "loss": 0.4369, + "step": 1198 + }, + { + "epoch": 0.09592, + "grad_norm": 1.2680668830871582, + "learning_rate": 9.778058161992892e-06, + "loss": 0.2548, + "step": 1199 + }, + { + "epoch": 0.096, + "grad_norm": 1.8447930812835693, + "learning_rate": 9.777687472357324e-06, + "loss": 0.5501, + "step": 1200 + }, + { + "epoch": 0.09608, + "grad_norm": 2.0600554943084717, + "learning_rate": 9.777316480453457e-06, + "loss": 0.604, + "step": 1201 + }, + { + "epoch": 0.09616, + "grad_norm": 1.6585683822631836, + "learning_rate": 9.77694518630476e-06, + "loss": 0.3472, + "step": 1202 + }, + { + "epoch": 0.09624, + "grad_norm": 1.3858237266540527, + "learning_rate": 9.776573589934726e-06, + "loss": 0.3476, + "step": 1203 + }, + { + "epoch": 0.09632, + "grad_norm": 1.6310840845108032, + "learning_rate": 9.776201691366863e-06, + "loss": 0.3252, + "step": 1204 + }, + { + "epoch": 0.0964, + "grad_norm": 1.4285833835601807, + "learning_rate": 9.775829490624698e-06, + "loss": 0.2611, + "step": 1205 + }, + { + "epoch": 0.09648, + "grad_norm": 1.5389498472213745, + "learning_rate": 9.775456987731784e-06, + "loss": 0.3895, + "step": 1206 + }, + { + "epoch": 0.09656, + "grad_norm": 1.5493495464324951, + "learning_rate": 9.775084182711683e-06, + "loss": 0.3191, + "step": 1207 + }, + { + "epoch": 0.09664, + "grad_norm": 1.5432543754577637, + "learning_rate": 9.774711075587985e-06, + "loss": 0.4227, + "step": 1208 + }, + { + "epoch": 0.09672, + "grad_norm": 1.4613178968429565, + "learning_rate": 9.774337666384293e-06, + "loss": 0.3943, + "step": 1209 + }, + { + "epoch": 0.0968, + "grad_norm": 2.0416183471679688, + "learning_rate": 9.773963955124232e-06, + "loss": 0.5029, + "step": 1210 + }, + { + "epoch": 0.09688, + "grad_norm": 1.3715733289718628, + "learning_rate": 9.773589941831446e-06, + "loss": 0.3508, + "step": 1211 + }, + { + "epoch": 0.09696, + "grad_norm": 1.3506369590759277, + "learning_rate": 9.773215626529596e-06, + "loss": 0.3365, + "step": 1212 + }, + { + "epoch": 0.09704, + "grad_norm": 1.867976427078247, + "learning_rate": 9.772841009242362e-06, + "loss": 0.3752, + "step": 1213 + }, + { + "epoch": 0.09712, + "grad_norm": 1.6031577587127686, + "learning_rate": 9.772466089993451e-06, + "loss": 0.407, + "step": 1214 + }, + { + "epoch": 0.0972, + "grad_norm": 1.4785380363464355, + "learning_rate": 9.772090868806578e-06, + "loss": 0.2738, + "step": 1215 + }, + { + "epoch": 0.09728, + "grad_norm": 1.7080274820327759, + "learning_rate": 9.771715345705482e-06, + "loss": 0.3905, + "step": 1216 + }, + { + "epoch": 0.09736, + "grad_norm": 1.4409105777740479, + "learning_rate": 9.771339520713924e-06, + "loss": 0.2903, + "step": 1217 + }, + { + "epoch": 0.09744, + "grad_norm": 2.0454001426696777, + "learning_rate": 9.77096339385568e-06, + "loss": 0.4558, + "step": 1218 + }, + { + "epoch": 0.09752, + "grad_norm": 1.4832075834274292, + "learning_rate": 9.770586965154542e-06, + "loss": 0.3588, + "step": 1219 + }, + { + "epoch": 0.0976, + "grad_norm": 1.7369167804718018, + "learning_rate": 9.770210234634333e-06, + "loss": 0.3038, + "step": 1220 + }, + { + "epoch": 0.09768, + "grad_norm": 1.412338137626648, + "learning_rate": 9.769833202318882e-06, + "loss": 0.27, + "step": 1221 + }, + { + "epoch": 0.09776, + "grad_norm": 2.267824649810791, + "learning_rate": 9.769455868232044e-06, + "loss": 0.6121, + "step": 1222 + }, + { + "epoch": 0.09784, + "grad_norm": 1.5539761781692505, + "learning_rate": 9.769078232397693e-06, + "loss": 0.3358, + "step": 1223 + }, + { + "epoch": 0.09792, + "grad_norm": 1.4364866018295288, + "learning_rate": 9.76870029483972e-06, + "loss": 0.3932, + "step": 1224 + }, + { + "epoch": 0.098, + "grad_norm": 1.2992466688156128, + "learning_rate": 9.768322055582034e-06, + "loss": 0.2755, + "step": 1225 + }, + { + "epoch": 0.09808, + "grad_norm": 1.7094968557357788, + "learning_rate": 9.767943514648567e-06, + "loss": 0.3883, + "step": 1226 + }, + { + "epoch": 0.09816, + "grad_norm": 1.9164812564849854, + "learning_rate": 9.767564672063268e-06, + "loss": 0.4088, + "step": 1227 + }, + { + "epoch": 0.09824, + "grad_norm": 2.1447086334228516, + "learning_rate": 9.767185527850103e-06, + "loss": 0.4737, + "step": 1228 + }, + { + "epoch": 0.09832, + "grad_norm": 1.7146050930023193, + "learning_rate": 9.766806082033061e-06, + "loss": 0.3838, + "step": 1229 + }, + { + "epoch": 0.0984, + "grad_norm": 1.4320570230484009, + "learning_rate": 9.766426334636149e-06, + "loss": 0.3106, + "step": 1230 + }, + { + "epoch": 0.09848, + "grad_norm": 1.5674022436141968, + "learning_rate": 9.76604628568339e-06, + "loss": 0.3787, + "step": 1231 + }, + { + "epoch": 0.09856, + "grad_norm": 1.2451162338256836, + "learning_rate": 9.765665935198831e-06, + "loss": 0.3102, + "step": 1232 + }, + { + "epoch": 0.09864, + "grad_norm": 1.3635990619659424, + "learning_rate": 9.765285283206533e-06, + "loss": 0.2954, + "step": 1233 + }, + { + "epoch": 0.09872, + "grad_norm": 2.4409143924713135, + "learning_rate": 9.764904329730583e-06, + "loss": 0.3997, + "step": 1234 + }, + { + "epoch": 0.0988, + "grad_norm": 1.705710530281067, + "learning_rate": 9.764523074795077e-06, + "loss": 0.4332, + "step": 1235 + }, + { + "epoch": 0.09888, + "grad_norm": 1.3760895729064941, + "learning_rate": 9.764141518424138e-06, + "loss": 0.3509, + "step": 1236 + }, + { + "epoch": 0.09896, + "grad_norm": 1.7472193241119385, + "learning_rate": 9.763759660641905e-06, + "loss": 0.3773, + "step": 1237 + }, + { + "epoch": 0.09904, + "grad_norm": 1.6968291997909546, + "learning_rate": 9.76337750147254e-06, + "loss": 0.3579, + "step": 1238 + }, + { + "epoch": 0.09912, + "grad_norm": 1.7409708499908447, + "learning_rate": 9.762995040940217e-06, + "loss": 0.2909, + "step": 1239 + }, + { + "epoch": 0.0992, + "grad_norm": 1.7769598960876465, + "learning_rate": 9.762612279069136e-06, + "loss": 0.3435, + "step": 1240 + }, + { + "epoch": 0.09928, + "grad_norm": 1.4426788091659546, + "learning_rate": 9.762229215883511e-06, + "loss": 0.3389, + "step": 1241 + }, + { + "epoch": 0.09936, + "grad_norm": 1.7437098026275635, + "learning_rate": 9.76184585140758e-06, + "loss": 0.3582, + "step": 1242 + }, + { + "epoch": 0.09944, + "grad_norm": 1.8013086318969727, + "learning_rate": 9.761462185665593e-06, + "loss": 0.3392, + "step": 1243 + }, + { + "epoch": 0.09952, + "grad_norm": 1.4805794954299927, + "learning_rate": 9.761078218681827e-06, + "loss": 0.2881, + "step": 1244 + }, + { + "epoch": 0.0996, + "grad_norm": 1.754407525062561, + "learning_rate": 9.760693950480572e-06, + "loss": 0.3884, + "step": 1245 + }, + { + "epoch": 0.09968, + "grad_norm": 1.5800998210906982, + "learning_rate": 9.760309381086139e-06, + "loss": 0.3402, + "step": 1246 + }, + { + "epoch": 0.09976, + "grad_norm": 1.5772699117660522, + "learning_rate": 9.759924510522861e-06, + "loss": 0.3164, + "step": 1247 + }, + { + "epoch": 0.09984, + "grad_norm": 1.4671974182128906, + "learning_rate": 9.759539338815085e-06, + "loss": 0.3409, + "step": 1248 + }, + { + "epoch": 0.09992, + "grad_norm": 1.8091286420822144, + "learning_rate": 9.75915386598718e-06, + "loss": 0.3817, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 1.8199232816696167, + "learning_rate": 9.758768092063536e-06, + "loss": 0.3736, + "step": 1250 + }, + { + "epoch": 0.10008, + "grad_norm": 1.4588650465011597, + "learning_rate": 9.758382017068558e-06, + "loss": 0.3103, + "step": 1251 + }, + { + "epoch": 0.10016, + "grad_norm": 2.0028064250946045, + "learning_rate": 9.757995641026669e-06, + "loss": 0.4277, + "step": 1252 + }, + { + "epoch": 0.10024, + "grad_norm": 1.8617825508117676, + "learning_rate": 9.757608963962317e-06, + "loss": 0.3659, + "step": 1253 + }, + { + "epoch": 0.10032, + "grad_norm": 1.7988715171813965, + "learning_rate": 9.757221985899965e-06, + "loss": 0.3815, + "step": 1254 + }, + { + "epoch": 0.1004, + "grad_norm": 1.3747351169586182, + "learning_rate": 9.756834706864096e-06, + "loss": 0.3078, + "step": 1255 + }, + { + "epoch": 0.10048, + "grad_norm": 1.8129006624221802, + "learning_rate": 9.756447126879212e-06, + "loss": 0.3561, + "step": 1256 + }, + { + "epoch": 0.10056, + "grad_norm": 1.850455641746521, + "learning_rate": 9.756059245969832e-06, + "loss": 0.4652, + "step": 1257 + }, + { + "epoch": 0.10064, + "grad_norm": 1.7385430335998535, + "learning_rate": 9.755671064160499e-06, + "loss": 0.3239, + "step": 1258 + }, + { + "epoch": 0.10072, + "grad_norm": 1.8935037851333618, + "learning_rate": 9.755282581475769e-06, + "loss": 0.4487, + "step": 1259 + }, + { + "epoch": 0.1008, + "grad_norm": 1.537541389465332, + "learning_rate": 9.754893797940222e-06, + "loss": 0.2818, + "step": 1260 + }, + { + "epoch": 0.10088, + "grad_norm": 1.770788550376892, + "learning_rate": 9.754504713578453e-06, + "loss": 0.3322, + "step": 1261 + }, + { + "epoch": 0.10096, + "grad_norm": 1.3864850997924805, + "learning_rate": 9.75411532841508e-06, + "loss": 0.2596, + "step": 1262 + }, + { + "epoch": 0.10104, + "grad_norm": 1.4967447519302368, + "learning_rate": 9.753725642474739e-06, + "loss": 0.3948, + "step": 1263 + }, + { + "epoch": 0.10112, + "grad_norm": 1.707395315170288, + "learning_rate": 9.75333565578208e-06, + "loss": 0.4174, + "step": 1264 + }, + { + "epoch": 0.1012, + "grad_norm": 1.876741886138916, + "learning_rate": 9.752945368361782e-06, + "loss": 0.4069, + "step": 1265 + }, + { + "epoch": 0.10128, + "grad_norm": 1.4767177104949951, + "learning_rate": 9.75255478023853e-06, + "loss": 0.3826, + "step": 1266 + }, + { + "epoch": 0.10136, + "grad_norm": 2.0608136653900146, + "learning_rate": 9.752163891437042e-06, + "loss": 0.4269, + "step": 1267 + }, + { + "epoch": 0.10144, + "grad_norm": 1.9878028631210327, + "learning_rate": 9.751772701982045e-06, + "loss": 0.4665, + "step": 1268 + }, + { + "epoch": 0.10152, + "grad_norm": 1.6732946634292603, + "learning_rate": 9.751381211898288e-06, + "loss": 0.3332, + "step": 1269 + }, + { + "epoch": 0.1016, + "grad_norm": 1.626977562904358, + "learning_rate": 9.75098942121054e-06, + "loss": 0.4315, + "step": 1270 + }, + { + "epoch": 0.10168, + "grad_norm": 1.6960049867630005, + "learning_rate": 9.750597329943588e-06, + "loss": 0.328, + "step": 1271 + }, + { + "epoch": 0.10176, + "grad_norm": 1.4753273725509644, + "learning_rate": 9.75020493812224e-06, + "loss": 0.3749, + "step": 1272 + }, + { + "epoch": 0.10184, + "grad_norm": 1.60029935836792, + "learning_rate": 9.749812245771318e-06, + "loss": 0.37, + "step": 1273 + }, + { + "epoch": 0.10192, + "grad_norm": 1.379223346710205, + "learning_rate": 9.749419252915668e-06, + "loss": 0.2877, + "step": 1274 + }, + { + "epoch": 0.102, + "grad_norm": 1.588037371635437, + "learning_rate": 9.749025959580156e-06, + "loss": 0.3533, + "step": 1275 + }, + { + "epoch": 0.10208, + "grad_norm": 1.8960047960281372, + "learning_rate": 9.748632365789658e-06, + "loss": 0.5091, + "step": 1276 + }, + { + "epoch": 0.10216, + "grad_norm": 1.6755867004394531, + "learning_rate": 9.748238471569083e-06, + "loss": 0.3203, + "step": 1277 + }, + { + "epoch": 0.10224, + "grad_norm": 1.5891623497009277, + "learning_rate": 9.747844276943345e-06, + "loss": 0.4389, + "step": 1278 + }, + { + "epoch": 0.10232, + "grad_norm": 1.7560958862304688, + "learning_rate": 9.747449781937388e-06, + "loss": 0.281, + "step": 1279 + }, + { + "epoch": 0.1024, + "grad_norm": 1.9748413562774658, + "learning_rate": 9.747054986576165e-06, + "loss": 0.5097, + "step": 1280 + }, + { + "epoch": 0.10248, + "grad_norm": 1.5240323543548584, + "learning_rate": 9.74665989088466e-06, + "loss": 0.3438, + "step": 1281 + }, + { + "epoch": 0.10256, + "grad_norm": 1.2884881496429443, + "learning_rate": 9.746264494887865e-06, + "loss": 0.3211, + "step": 1282 + }, + { + "epoch": 0.10264, + "grad_norm": 2.2344231605529785, + "learning_rate": 9.745868798610796e-06, + "loss": 0.3818, + "step": 1283 + }, + { + "epoch": 0.10272, + "grad_norm": 1.9735519886016846, + "learning_rate": 9.745472802078488e-06, + "loss": 0.35, + "step": 1284 + }, + { + "epoch": 0.1028, + "grad_norm": 1.7784732580184937, + "learning_rate": 9.745076505315994e-06, + "loss": 0.4224, + "step": 1285 + }, + { + "epoch": 0.10288, + "grad_norm": 1.9323874711990356, + "learning_rate": 9.744679908348386e-06, + "loss": 0.4125, + "step": 1286 + }, + { + "epoch": 0.10296, + "grad_norm": 2.1199615001678467, + "learning_rate": 9.74428301120076e-06, + "loss": 0.4652, + "step": 1287 + }, + { + "epoch": 0.10304, + "grad_norm": 1.5500082969665527, + "learning_rate": 9.743885813898217e-06, + "loss": 0.3081, + "step": 1288 + }, + { + "epoch": 0.10312, + "grad_norm": 1.9076387882232666, + "learning_rate": 9.743488316465895e-06, + "loss": 0.4051, + "step": 1289 + }, + { + "epoch": 0.1032, + "grad_norm": 1.48446524143219, + "learning_rate": 9.743090518928937e-06, + "loss": 0.3234, + "step": 1290 + }, + { + "epoch": 0.10328, + "grad_norm": 2.2340919971466064, + "learning_rate": 9.742692421312515e-06, + "loss": 0.4365, + "step": 1291 + }, + { + "epoch": 0.10336, + "grad_norm": 1.6090425252914429, + "learning_rate": 9.74229402364181e-06, + "loss": 0.3622, + "step": 1292 + }, + { + "epoch": 0.10344, + "grad_norm": 1.7896775007247925, + "learning_rate": 9.74189532594203e-06, + "loss": 0.388, + "step": 1293 + }, + { + "epoch": 0.10352, + "grad_norm": 1.6529676914215088, + "learning_rate": 9.7414963282384e-06, + "loss": 0.2979, + "step": 1294 + }, + { + "epoch": 0.1036, + "grad_norm": 1.5958153009414673, + "learning_rate": 9.741097030556162e-06, + "loss": 0.3039, + "step": 1295 + }, + { + "epoch": 0.10368, + "grad_norm": 2.0807502269744873, + "learning_rate": 9.740697432920579e-06, + "loss": 0.4092, + "step": 1296 + }, + { + "epoch": 0.10376, + "grad_norm": 1.3676483631134033, + "learning_rate": 9.740297535356931e-06, + "loss": 0.2693, + "step": 1297 + }, + { + "epoch": 0.10384, + "grad_norm": 2.016907215118408, + "learning_rate": 9.739897337890521e-06, + "loss": 0.3837, + "step": 1298 + }, + { + "epoch": 0.10392, + "grad_norm": 1.2759778499603271, + "learning_rate": 9.739496840546663e-06, + "loss": 0.2593, + "step": 1299 + }, + { + "epoch": 0.104, + "grad_norm": 1.7022594213485718, + "learning_rate": 9.7390960433507e-06, + "loss": 0.3582, + "step": 1300 + }, + { + "epoch": 0.10408, + "grad_norm": 1.783856987953186, + "learning_rate": 9.738694946327988e-06, + "loss": 0.4139, + "step": 1301 + }, + { + "epoch": 0.10416, + "grad_norm": 1.6456480026245117, + "learning_rate": 9.738293549503902e-06, + "loss": 0.3018, + "step": 1302 + }, + { + "epoch": 0.10424, + "grad_norm": 1.666909098625183, + "learning_rate": 9.737891852903838e-06, + "loss": 0.3686, + "step": 1303 + }, + { + "epoch": 0.10432, + "grad_norm": 1.3941386938095093, + "learning_rate": 9.737489856553209e-06, + "loss": 0.4, + "step": 1304 + }, + { + "epoch": 0.1044, + "grad_norm": 1.745068907737732, + "learning_rate": 9.737087560477449e-06, + "loss": 0.3254, + "step": 1305 + }, + { + "epoch": 0.10448, + "grad_norm": 1.4899107217788696, + "learning_rate": 9.736684964702008e-06, + "loss": 0.3617, + "step": 1306 + }, + { + "epoch": 0.10456, + "grad_norm": 1.8580790758132935, + "learning_rate": 9.736282069252358e-06, + "loss": 0.368, + "step": 1307 + }, + { + "epoch": 0.10464, + "grad_norm": 1.2298766374588013, + "learning_rate": 9.735878874153993e-06, + "loss": 0.293, + "step": 1308 + }, + { + "epoch": 0.10472, + "grad_norm": 1.4866394996643066, + "learning_rate": 9.735475379432414e-06, + "loss": 0.3504, + "step": 1309 + }, + { + "epoch": 0.1048, + "grad_norm": 1.508781909942627, + "learning_rate": 9.735071585113153e-06, + "loss": 0.3257, + "step": 1310 + }, + { + "epoch": 0.10488, + "grad_norm": 1.6359803676605225, + "learning_rate": 9.734667491221758e-06, + "loss": 0.3885, + "step": 1311 + }, + { + "epoch": 0.10496, + "grad_norm": 1.50149405002594, + "learning_rate": 9.734263097783792e-06, + "loss": 0.3049, + "step": 1312 + }, + { + "epoch": 0.10504, + "grad_norm": 1.7158201932907104, + "learning_rate": 9.73385840482484e-06, + "loss": 0.4445, + "step": 1313 + }, + { + "epoch": 0.10512, + "grad_norm": 1.7673109769821167, + "learning_rate": 9.733453412370508e-06, + "loss": 0.3097, + "step": 1314 + }, + { + "epoch": 0.1052, + "grad_norm": 1.5726984739303589, + "learning_rate": 9.733048120446416e-06, + "loss": 0.2946, + "step": 1315 + }, + { + "epoch": 0.10528, + "grad_norm": 1.9193859100341797, + "learning_rate": 9.732642529078206e-06, + "loss": 0.4489, + "step": 1316 + }, + { + "epoch": 0.10536, + "grad_norm": 1.6108776330947876, + "learning_rate": 9.73223663829154e-06, + "loss": 0.3922, + "step": 1317 + }, + { + "epoch": 0.10544, + "grad_norm": 1.4639272689819336, + "learning_rate": 9.731830448112096e-06, + "loss": 0.3506, + "step": 1318 + }, + { + "epoch": 0.10552, + "grad_norm": 1.342962622642517, + "learning_rate": 9.731423958565571e-06, + "loss": 0.3522, + "step": 1319 + }, + { + "epoch": 0.1056, + "grad_norm": 1.7002488374710083, + "learning_rate": 9.731017169677683e-06, + "loss": 0.4434, + "step": 1320 + }, + { + "epoch": 0.10568, + "grad_norm": 1.669387698173523, + "learning_rate": 9.73061008147417e-06, + "loss": 0.3224, + "step": 1321 + }, + { + "epoch": 0.10576, + "grad_norm": 1.4158101081848145, + "learning_rate": 9.730202693980786e-06, + "loss": 0.3246, + "step": 1322 + }, + { + "epoch": 0.10584, + "grad_norm": 1.3361846208572388, + "learning_rate": 9.729795007223303e-06, + "loss": 0.269, + "step": 1323 + }, + { + "epoch": 0.10592, + "grad_norm": 1.3655201196670532, + "learning_rate": 9.729387021227518e-06, + "loss": 0.3433, + "step": 1324 + }, + { + "epoch": 0.106, + "grad_norm": 1.260786771774292, + "learning_rate": 9.728978736019238e-06, + "loss": 0.2933, + "step": 1325 + }, + { + "epoch": 0.10608, + "grad_norm": 1.3825486898422241, + "learning_rate": 9.7285701516243e-06, + "loss": 0.3286, + "step": 1326 + }, + { + "epoch": 0.10616, + "grad_norm": 1.707014799118042, + "learning_rate": 9.72816126806855e-06, + "loss": 0.447, + "step": 1327 + }, + { + "epoch": 0.10624, + "grad_norm": 1.8020687103271484, + "learning_rate": 9.727752085377855e-06, + "loss": 0.4374, + "step": 1328 + }, + { + "epoch": 0.10632, + "grad_norm": 1.697378158569336, + "learning_rate": 9.727342603578105e-06, + "loss": 0.3393, + "step": 1329 + }, + { + "epoch": 0.1064, + "grad_norm": 1.4379756450653076, + "learning_rate": 9.726932822695208e-06, + "loss": 0.2879, + "step": 1330 + }, + { + "epoch": 0.10648, + "grad_norm": 2.371371269226074, + "learning_rate": 9.726522742755085e-06, + "loss": 0.4398, + "step": 1331 + }, + { + "epoch": 0.10656, + "grad_norm": 1.6934208869934082, + "learning_rate": 9.726112363783684e-06, + "loss": 0.3344, + "step": 1332 + }, + { + "epoch": 0.10664, + "grad_norm": 1.745850682258606, + "learning_rate": 9.725701685806968e-06, + "loss": 0.4684, + "step": 1333 + }, + { + "epoch": 0.10672, + "grad_norm": 1.6199334859848022, + "learning_rate": 9.725290708850919e-06, + "loss": 0.3236, + "step": 1334 + }, + { + "epoch": 0.1068, + "grad_norm": 1.4844225645065308, + "learning_rate": 9.724879432941536e-06, + "loss": 0.3124, + "step": 1335 + }, + { + "epoch": 0.10688, + "grad_norm": 1.5500150918960571, + "learning_rate": 9.724467858104843e-06, + "loss": 0.3436, + "step": 1336 + }, + { + "epoch": 0.10696, + "grad_norm": 1.6310945749282837, + "learning_rate": 9.724055984366876e-06, + "loss": 0.3663, + "step": 1337 + }, + { + "epoch": 0.10704, + "grad_norm": 1.5722607374191284, + "learning_rate": 9.723643811753693e-06, + "loss": 0.3498, + "step": 1338 + }, + { + "epoch": 0.10712, + "grad_norm": 1.2501575946807861, + "learning_rate": 9.723231340291372e-06, + "loss": 0.2408, + "step": 1339 + }, + { + "epoch": 0.1072, + "grad_norm": 1.5553503036499023, + "learning_rate": 9.722818570006008e-06, + "loss": 0.4481, + "step": 1340 + }, + { + "epoch": 0.10728, + "grad_norm": 2.1203765869140625, + "learning_rate": 9.722405500923715e-06, + "loss": 0.4438, + "step": 1341 + }, + { + "epoch": 0.10736, + "grad_norm": 1.671330213546753, + "learning_rate": 9.721992133070627e-06, + "loss": 0.3637, + "step": 1342 + }, + { + "epoch": 0.10744, + "grad_norm": 1.697939157485962, + "learning_rate": 9.721578466472896e-06, + "loss": 0.3393, + "step": 1343 + }, + { + "epoch": 0.10752, + "grad_norm": 1.2914625406265259, + "learning_rate": 9.721164501156697e-06, + "loss": 0.2479, + "step": 1344 + }, + { + "epoch": 0.1076, + "grad_norm": 1.7447216510772705, + "learning_rate": 9.720750237148214e-06, + "loss": 0.5241, + "step": 1345 + }, + { + "epoch": 0.10768, + "grad_norm": 1.8008346557617188, + "learning_rate": 9.72033567447366e-06, + "loss": 0.4449, + "step": 1346 + }, + { + "epoch": 0.10776, + "grad_norm": 1.2198455333709717, + "learning_rate": 9.719920813159262e-06, + "loss": 0.2726, + "step": 1347 + }, + { + "epoch": 0.10784, + "grad_norm": 1.7773081064224243, + "learning_rate": 9.719505653231268e-06, + "loss": 0.3292, + "step": 1348 + }, + { + "epoch": 0.10792, + "grad_norm": 1.6153675317764282, + "learning_rate": 9.719090194715943e-06, + "loss": 0.404, + "step": 1349 + }, + { + "epoch": 0.108, + "grad_norm": 1.9209851026535034, + "learning_rate": 9.71867443763957e-06, + "loss": 0.4205, + "step": 1350 + }, + { + "epoch": 0.10808, + "grad_norm": 1.8615111112594604, + "learning_rate": 9.718258382028456e-06, + "loss": 0.4368, + "step": 1351 + }, + { + "epoch": 0.10816, + "grad_norm": 1.647263526916504, + "learning_rate": 9.71784202790892e-06, + "loss": 0.5561, + "step": 1352 + }, + { + "epoch": 0.10824, + "grad_norm": 1.804128646850586, + "learning_rate": 9.717425375307305e-06, + "loss": 0.3763, + "step": 1353 + }, + { + "epoch": 0.10832, + "grad_norm": 1.5256234407424927, + "learning_rate": 9.717008424249973e-06, + "loss": 0.3653, + "step": 1354 + }, + { + "epoch": 0.1084, + "grad_norm": 1.6643016338348389, + "learning_rate": 9.716591174763297e-06, + "loss": 0.3654, + "step": 1355 + }, + { + "epoch": 0.10848, + "grad_norm": 1.531507968902588, + "learning_rate": 9.716173626873682e-06, + "loss": 0.2859, + "step": 1356 + }, + { + "epoch": 0.10856, + "grad_norm": 1.4687572717666626, + "learning_rate": 9.71575578060754e-06, + "loss": 0.3194, + "step": 1357 + }, + { + "epoch": 0.10864, + "grad_norm": 1.7269558906555176, + "learning_rate": 9.715337635991312e-06, + "loss": 0.4283, + "step": 1358 + }, + { + "epoch": 0.10872, + "grad_norm": 1.9255378246307373, + "learning_rate": 9.714919193051448e-06, + "loss": 0.3628, + "step": 1359 + }, + { + "epoch": 0.1088, + "grad_norm": 1.5584735870361328, + "learning_rate": 9.714500451814421e-06, + "loss": 0.2875, + "step": 1360 + }, + { + "epoch": 0.10888, + "grad_norm": 1.3883323669433594, + "learning_rate": 9.714081412306728e-06, + "loss": 0.2614, + "step": 1361 + }, + { + "epoch": 0.10896, + "grad_norm": 1.938321590423584, + "learning_rate": 9.713662074554875e-06, + "loss": 0.3478, + "step": 1362 + }, + { + "epoch": 0.10904, + "grad_norm": 1.7196362018585205, + "learning_rate": 9.713242438585397e-06, + "loss": 0.3891, + "step": 1363 + }, + { + "epoch": 0.10912, + "grad_norm": 1.8739652633666992, + "learning_rate": 9.712822504424839e-06, + "loss": 0.4158, + "step": 1364 + }, + { + "epoch": 0.1092, + "grad_norm": 1.669812560081482, + "learning_rate": 9.71240227209977e-06, + "loss": 0.3483, + "step": 1365 + }, + { + "epoch": 0.10928, + "grad_norm": 1.673043131828308, + "learning_rate": 9.711981741636777e-06, + "loss": 0.3858, + "step": 1366 + }, + { + "epoch": 0.10936, + "grad_norm": 1.8101592063903809, + "learning_rate": 9.711560913062465e-06, + "loss": 0.3462, + "step": 1367 + }, + { + "epoch": 0.10944, + "grad_norm": 1.4530932903289795, + "learning_rate": 9.711139786403461e-06, + "loss": 0.2484, + "step": 1368 + }, + { + "epoch": 0.10952, + "grad_norm": 1.6506141424179077, + "learning_rate": 9.710718361686405e-06, + "loss": 0.3754, + "step": 1369 + }, + { + "epoch": 0.1096, + "grad_norm": 1.7748082876205444, + "learning_rate": 9.71029663893796e-06, + "loss": 0.3619, + "step": 1370 + }, + { + "epoch": 0.10968, + "grad_norm": 1.5540876388549805, + "learning_rate": 9.709874618184808e-06, + "loss": 0.3668, + "step": 1371 + }, + { + "epoch": 0.10976, + "grad_norm": 1.5610040426254272, + "learning_rate": 9.709452299453648e-06, + "loss": 0.3208, + "step": 1372 + }, + { + "epoch": 0.10984, + "grad_norm": 1.334820032119751, + "learning_rate": 9.709029682771198e-06, + "loss": 0.2853, + "step": 1373 + }, + { + "epoch": 0.10992, + "grad_norm": 1.774716854095459, + "learning_rate": 9.708606768164199e-06, + "loss": 0.406, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 1.4674115180969238, + "learning_rate": 9.708183555659404e-06, + "loss": 0.3442, + "step": 1375 + }, + { + "epoch": 0.11008, + "grad_norm": 1.6923162937164307, + "learning_rate": 9.707760045283587e-06, + "loss": 0.4355, + "step": 1376 + }, + { + "epoch": 0.11016, + "grad_norm": 1.6160029172897339, + "learning_rate": 9.707336237063546e-06, + "loss": 0.3239, + "step": 1377 + }, + { + "epoch": 0.11024, + "grad_norm": 1.5265469551086426, + "learning_rate": 9.70691213102609e-06, + "loss": 0.3529, + "step": 1378 + }, + { + "epoch": 0.11032, + "grad_norm": 1.5923312902450562, + "learning_rate": 9.706487727198055e-06, + "loss": 0.3371, + "step": 1379 + }, + { + "epoch": 0.1104, + "grad_norm": 1.9004793167114258, + "learning_rate": 9.706063025606288e-06, + "loss": 0.3516, + "step": 1380 + }, + { + "epoch": 0.11048, + "grad_norm": 1.440619945526123, + "learning_rate": 9.70563802627766e-06, + "loss": 0.2806, + "step": 1381 + }, + { + "epoch": 0.11056, + "grad_norm": 1.896283507347107, + "learning_rate": 9.705212729239061e-06, + "loss": 0.3541, + "step": 1382 + }, + { + "epoch": 0.11064, + "grad_norm": 2.1123244762420654, + "learning_rate": 9.704787134517396e-06, + "loss": 0.3817, + "step": 1383 + }, + { + "epoch": 0.11072, + "grad_norm": 2.1019601821899414, + "learning_rate": 9.704361242139589e-06, + "loss": 0.3864, + "step": 1384 + }, + { + "epoch": 0.1108, + "grad_norm": 1.6103894710540771, + "learning_rate": 9.703935052132589e-06, + "loss": 0.3911, + "step": 1385 + }, + { + "epoch": 0.11088, + "grad_norm": 1.6342370510101318, + "learning_rate": 9.703508564523356e-06, + "loss": 0.3514, + "step": 1386 + }, + { + "epoch": 0.11096, + "grad_norm": 1.655190348625183, + "learning_rate": 9.703081779338877e-06, + "loss": 0.3249, + "step": 1387 + }, + { + "epoch": 0.11104, + "grad_norm": 1.3274317979812622, + "learning_rate": 9.702654696606147e-06, + "loss": 0.3422, + "step": 1388 + }, + { + "epoch": 0.11112, + "grad_norm": 1.903498888015747, + "learning_rate": 9.702227316352192e-06, + "loss": 0.4509, + "step": 1389 + }, + { + "epoch": 0.1112, + "grad_norm": 1.5915570259094238, + "learning_rate": 9.701799638604048e-06, + "loss": 0.3718, + "step": 1390 + }, + { + "epoch": 0.11128, + "grad_norm": 1.9381896257400513, + "learning_rate": 9.701371663388771e-06, + "loss": 0.3438, + "step": 1391 + }, + { + "epoch": 0.11136, + "grad_norm": 1.6756565570831299, + "learning_rate": 9.700943390733442e-06, + "loss": 0.3565, + "step": 1392 + }, + { + "epoch": 0.11144, + "grad_norm": 1.537050485610962, + "learning_rate": 9.700514820665153e-06, + "loss": 0.2721, + "step": 1393 + }, + { + "epoch": 0.11152, + "grad_norm": 1.410021424293518, + "learning_rate": 9.70008595321102e-06, + "loss": 0.2962, + "step": 1394 + }, + { + "epoch": 0.1116, + "grad_norm": 1.8756940364837646, + "learning_rate": 9.699656788398175e-06, + "loss": 0.4885, + "step": 1395 + }, + { + "epoch": 0.11168, + "grad_norm": 1.6304752826690674, + "learning_rate": 9.69922732625377e-06, + "loss": 0.382, + "step": 1396 + }, + { + "epoch": 0.11176, + "grad_norm": 1.274293303489685, + "learning_rate": 9.698797566804976e-06, + "loss": 0.2796, + "step": 1397 + }, + { + "epoch": 0.11184, + "grad_norm": 1.7127177715301514, + "learning_rate": 9.69836751007898e-06, + "loss": 0.3734, + "step": 1398 + }, + { + "epoch": 0.11192, + "grad_norm": 1.886913776397705, + "learning_rate": 9.697937156102997e-06, + "loss": 0.3908, + "step": 1399 + }, + { + "epoch": 0.112, + "grad_norm": 1.5846264362335205, + "learning_rate": 9.697506504904246e-06, + "loss": 0.3985, + "step": 1400 + }, + { + "epoch": 0.11208, + "grad_norm": 1.3665478229522705, + "learning_rate": 9.697075556509978e-06, + "loss": 0.2967, + "step": 1401 + }, + { + "epoch": 0.11216, + "grad_norm": 1.4394631385803223, + "learning_rate": 9.696644310947453e-06, + "loss": 0.3561, + "step": 1402 + }, + { + "epoch": 0.11224, + "grad_norm": 1.8952381610870361, + "learning_rate": 9.69621276824396e-06, + "loss": 0.5919, + "step": 1403 + }, + { + "epoch": 0.11232, + "grad_norm": 1.3404344320297241, + "learning_rate": 9.6957809284268e-06, + "loss": 0.3089, + "step": 1404 + }, + { + "epoch": 0.1124, + "grad_norm": 1.6374577283859253, + "learning_rate": 9.69534879152329e-06, + "loss": 0.3373, + "step": 1405 + }, + { + "epoch": 0.11248, + "grad_norm": 1.6670361757278442, + "learning_rate": 9.694916357560774e-06, + "loss": 0.4697, + "step": 1406 + }, + { + "epoch": 0.11256, + "grad_norm": 2.1049678325653076, + "learning_rate": 9.694483626566609e-06, + "loss": 0.3929, + "step": 1407 + }, + { + "epoch": 0.11264, + "grad_norm": 2.02200984954834, + "learning_rate": 9.694050598568173e-06, + "loss": 0.503, + "step": 1408 + }, + { + "epoch": 0.11272, + "grad_norm": 1.2237039804458618, + "learning_rate": 9.693617273592861e-06, + "loss": 0.2909, + "step": 1409 + }, + { + "epoch": 0.1128, + "grad_norm": 1.4623360633850098, + "learning_rate": 9.69318365166809e-06, + "loss": 0.2837, + "step": 1410 + }, + { + "epoch": 0.11288, + "grad_norm": 1.7014158964157104, + "learning_rate": 9.692749732821293e-06, + "loss": 0.2977, + "step": 1411 + }, + { + "epoch": 0.11296, + "grad_norm": 1.7711082696914673, + "learning_rate": 9.692315517079922e-06, + "loss": 0.3118, + "step": 1412 + }, + { + "epoch": 0.11304, + "grad_norm": 1.5991653203964233, + "learning_rate": 9.691881004471449e-06, + "loss": 0.3468, + "step": 1413 + }, + { + "epoch": 0.11312, + "grad_norm": 1.7637615203857422, + "learning_rate": 9.691446195023364e-06, + "loss": 0.3192, + "step": 1414 + }, + { + "epoch": 0.1132, + "grad_norm": 1.9800078868865967, + "learning_rate": 9.691011088763175e-06, + "loss": 0.3971, + "step": 1415 + }, + { + "epoch": 0.11328, + "grad_norm": 1.2780556678771973, + "learning_rate": 9.69057568571841e-06, + "loss": 0.2921, + "step": 1416 + }, + { + "epoch": 0.11336, + "grad_norm": 1.2494962215423584, + "learning_rate": 9.690139985916619e-06, + "loss": 0.283, + "step": 1417 + }, + { + "epoch": 0.11344, + "grad_norm": 1.7476698160171509, + "learning_rate": 9.689703989385362e-06, + "loss": 0.3595, + "step": 1418 + }, + { + "epoch": 0.11352, + "grad_norm": 1.494022011756897, + "learning_rate": 9.689267696152226e-06, + "loss": 0.2728, + "step": 1419 + }, + { + "epoch": 0.1136, + "grad_norm": 1.702022671699524, + "learning_rate": 9.688831106244814e-06, + "loss": 0.3349, + "step": 1420 + }, + { + "epoch": 0.11368, + "grad_norm": 1.6980208158493042, + "learning_rate": 9.688394219690745e-06, + "loss": 0.3074, + "step": 1421 + }, + { + "epoch": 0.11376, + "grad_norm": 1.3088799715042114, + "learning_rate": 9.687957036517662e-06, + "loss": 0.2807, + "step": 1422 + }, + { + "epoch": 0.11384, + "grad_norm": 1.4283772706985474, + "learning_rate": 9.687519556753225e-06, + "loss": 0.3256, + "step": 1423 + }, + { + "epoch": 0.11392, + "grad_norm": 1.6759427785873413, + "learning_rate": 9.687081780425108e-06, + "loss": 0.3734, + "step": 1424 + }, + { + "epoch": 0.114, + "grad_norm": 1.7384825944900513, + "learning_rate": 9.68664370756101e-06, + "loss": 0.396, + "step": 1425 + }, + { + "epoch": 0.11408, + "grad_norm": 1.8031843900680542, + "learning_rate": 9.686205338188645e-06, + "loss": 0.4733, + "step": 1426 + }, + { + "epoch": 0.11416, + "grad_norm": 1.8591822385787964, + "learning_rate": 9.68576667233575e-06, + "loss": 0.4139, + "step": 1427 + }, + { + "epoch": 0.11424, + "grad_norm": 1.8857415914535522, + "learning_rate": 9.685327710030077e-06, + "loss": 0.3974, + "step": 1428 + }, + { + "epoch": 0.11432, + "grad_norm": 1.1718003749847412, + "learning_rate": 9.684888451299396e-06, + "loss": 0.2668, + "step": 1429 + }, + { + "epoch": 0.1144, + "grad_norm": 1.7708724737167358, + "learning_rate": 9.684448896171498e-06, + "loss": 0.3957, + "step": 1430 + }, + { + "epoch": 0.11448, + "grad_norm": 1.4460155963897705, + "learning_rate": 9.684009044674193e-06, + "loss": 0.3137, + "step": 1431 + }, + { + "epoch": 0.11456, + "grad_norm": 1.4484257698059082, + "learning_rate": 9.683568896835309e-06, + "loss": 0.33, + "step": 1432 + }, + { + "epoch": 0.11464, + "grad_norm": 2.073709487915039, + "learning_rate": 9.683128452682692e-06, + "loss": 0.4239, + "step": 1433 + }, + { + "epoch": 0.11472, + "grad_norm": 1.4383269548416138, + "learning_rate": 9.682687712244205e-06, + "loss": 0.269, + "step": 1434 + }, + { + "epoch": 0.1148, + "grad_norm": 1.5897904634475708, + "learning_rate": 9.682246675547737e-06, + "loss": 0.3171, + "step": 1435 + }, + { + "epoch": 0.11488, + "grad_norm": 1.4807348251342773, + "learning_rate": 9.68180534262119e-06, + "loss": 0.2884, + "step": 1436 + }, + { + "epoch": 0.11496, + "grad_norm": 1.8860735893249512, + "learning_rate": 9.681363713492483e-06, + "loss": 0.449, + "step": 1437 + }, + { + "epoch": 0.11504, + "grad_norm": 1.6362406015396118, + "learning_rate": 9.680921788189556e-06, + "loss": 0.3038, + "step": 1438 + }, + { + "epoch": 0.11512, + "grad_norm": 1.7273122072219849, + "learning_rate": 9.680479566740373e-06, + "loss": 0.3497, + "step": 1439 + }, + { + "epoch": 0.1152, + "grad_norm": 1.6315315961837769, + "learning_rate": 9.680037049172907e-06, + "loss": 0.3365, + "step": 1440 + }, + { + "epoch": 0.11528, + "grad_norm": 1.381567358970642, + "learning_rate": 9.679594235515158e-06, + "loss": 0.2703, + "step": 1441 + }, + { + "epoch": 0.11536, + "grad_norm": 2.0206964015960693, + "learning_rate": 9.679151125795136e-06, + "loss": 0.4859, + "step": 1442 + }, + { + "epoch": 0.11544, + "grad_norm": 1.7309236526489258, + "learning_rate": 9.678707720040882e-06, + "loss": 0.4465, + "step": 1443 + }, + { + "epoch": 0.11552, + "grad_norm": 1.8960282802581787, + "learning_rate": 9.678264018280445e-06, + "loss": 0.4757, + "step": 1444 + }, + { + "epoch": 0.1156, + "grad_norm": 1.276365041732788, + "learning_rate": 9.677820020541898e-06, + "loss": 0.2635, + "step": 1445 + }, + { + "epoch": 0.11568, + "grad_norm": 1.4210036993026733, + "learning_rate": 9.677375726853327e-06, + "loss": 0.3031, + "step": 1446 + }, + { + "epoch": 0.11576, + "grad_norm": 1.6796081066131592, + "learning_rate": 9.676931137242846e-06, + "loss": 0.4549, + "step": 1447 + }, + { + "epoch": 0.11584, + "grad_norm": 1.4270731210708618, + "learning_rate": 9.676486251738581e-06, + "loss": 0.3278, + "step": 1448 + }, + { + "epoch": 0.11592, + "grad_norm": 1.5556656122207642, + "learning_rate": 9.67604107036868e-06, + "loss": 0.2951, + "step": 1449 + }, + { + "epoch": 0.116, + "grad_norm": 1.3497978448867798, + "learning_rate": 9.675595593161305e-06, + "loss": 0.357, + "step": 1450 + }, + { + "epoch": 0.11608, + "grad_norm": 1.3413703441619873, + "learning_rate": 9.675149820144643e-06, + "loss": 0.3007, + "step": 1451 + }, + { + "epoch": 0.11616, + "grad_norm": 1.4101148843765259, + "learning_rate": 9.674703751346893e-06, + "loss": 0.3115, + "step": 1452 + }, + { + "epoch": 0.11624, + "grad_norm": 1.480543851852417, + "learning_rate": 9.67425738679628e-06, + "loss": 0.3169, + "step": 1453 + }, + { + "epoch": 0.11632, + "grad_norm": 2.082014799118042, + "learning_rate": 9.67381072652104e-06, + "loss": 0.4067, + "step": 1454 + }, + { + "epoch": 0.1164, + "grad_norm": 1.5257582664489746, + "learning_rate": 9.673363770549435e-06, + "loss": 0.3861, + "step": 1455 + }, + { + "epoch": 0.11648, + "grad_norm": 1.7837907075881958, + "learning_rate": 9.672916518909743e-06, + "loss": 0.4652, + "step": 1456 + }, + { + "epoch": 0.11656, + "grad_norm": 1.7206987142562866, + "learning_rate": 9.672468971630256e-06, + "loss": 0.4114, + "step": 1457 + }, + { + "epoch": 0.11664, + "grad_norm": 1.51250159740448, + "learning_rate": 9.672021128739293e-06, + "loss": 0.4297, + "step": 1458 + }, + { + "epoch": 0.11672, + "grad_norm": 1.856751799583435, + "learning_rate": 9.671572990265186e-06, + "loss": 0.4375, + "step": 1459 + }, + { + "epoch": 0.1168, + "grad_norm": 2.219356060028076, + "learning_rate": 9.671124556236284e-06, + "loss": 0.5495, + "step": 1460 + }, + { + "epoch": 0.11688, + "grad_norm": 1.8340235948562622, + "learning_rate": 9.670675826680963e-06, + "loss": 0.361, + "step": 1461 + }, + { + "epoch": 0.11696, + "grad_norm": 1.3916457891464233, + "learning_rate": 9.670226801627611e-06, + "loss": 0.3085, + "step": 1462 + }, + { + "epoch": 0.11704, + "grad_norm": 1.3472083806991577, + "learning_rate": 9.669777481104637e-06, + "loss": 0.3098, + "step": 1463 + }, + { + "epoch": 0.11712, + "grad_norm": 1.739112138748169, + "learning_rate": 9.669327865140465e-06, + "loss": 0.3132, + "step": 1464 + }, + { + "epoch": 0.1172, + "grad_norm": 1.6176772117614746, + "learning_rate": 9.668877953763544e-06, + "loss": 0.4131, + "step": 1465 + }, + { + "epoch": 0.11728, + "grad_norm": 1.2607818841934204, + "learning_rate": 9.668427747002337e-06, + "loss": 0.3161, + "step": 1466 + }, + { + "epoch": 0.11736, + "grad_norm": 1.6247892379760742, + "learning_rate": 9.667977244885324e-06, + "loss": 0.3631, + "step": 1467 + }, + { + "epoch": 0.11744, + "grad_norm": 1.8417178392410278, + "learning_rate": 9.667526447441012e-06, + "loss": 0.3967, + "step": 1468 + }, + { + "epoch": 0.11752, + "grad_norm": 1.448082447052002, + "learning_rate": 9.667075354697919e-06, + "loss": 0.2738, + "step": 1469 + }, + { + "epoch": 0.1176, + "grad_norm": 1.4578620195388794, + "learning_rate": 9.666623966684585e-06, + "loss": 0.437, + "step": 1470 + }, + { + "epoch": 0.11768, + "grad_norm": 1.3342931270599365, + "learning_rate": 9.666172283429566e-06, + "loss": 0.3525, + "step": 1471 + }, + { + "epoch": 0.11776, + "grad_norm": 1.410962700843811, + "learning_rate": 9.66572030496144e-06, + "loss": 0.3757, + "step": 1472 + }, + { + "epoch": 0.11784, + "grad_norm": 1.3203762769699097, + "learning_rate": 9.665268031308804e-06, + "loss": 0.2784, + "step": 1473 + }, + { + "epoch": 0.11792, + "grad_norm": 1.5164921283721924, + "learning_rate": 9.664815462500268e-06, + "loss": 0.3143, + "step": 1474 + }, + { + "epoch": 0.118, + "grad_norm": 1.555022120475769, + "learning_rate": 9.664362598564466e-06, + "loss": 0.3576, + "step": 1475 + }, + { + "epoch": 0.11808, + "grad_norm": 1.5586203336715698, + "learning_rate": 9.66390943953005e-06, + "loss": 0.2553, + "step": 1476 + }, + { + "epoch": 0.11816, + "grad_norm": 1.8795398473739624, + "learning_rate": 9.66345598542569e-06, + "loss": 0.3442, + "step": 1477 + }, + { + "epoch": 0.11824, + "grad_norm": 1.5754092931747437, + "learning_rate": 9.663002236280072e-06, + "loss": 0.2897, + "step": 1478 + }, + { + "epoch": 0.11832, + "grad_norm": 1.3736915588378906, + "learning_rate": 9.662548192121905e-06, + "loss": 0.3255, + "step": 1479 + }, + { + "epoch": 0.1184, + "grad_norm": 1.4949525594711304, + "learning_rate": 9.662093852979916e-06, + "loss": 0.2846, + "step": 1480 + }, + { + "epoch": 0.11848, + "grad_norm": 1.5100445747375488, + "learning_rate": 9.661639218882849e-06, + "loss": 0.3407, + "step": 1481 + }, + { + "epoch": 0.11856, + "grad_norm": 1.6961963176727295, + "learning_rate": 9.661184289859465e-06, + "loss": 0.3679, + "step": 1482 + }, + { + "epoch": 0.11864, + "grad_norm": 1.5878902673721313, + "learning_rate": 9.660729065938547e-06, + "loss": 0.2542, + "step": 1483 + }, + { + "epoch": 0.11872, + "grad_norm": 1.5185855627059937, + "learning_rate": 9.660273547148897e-06, + "loss": 0.3301, + "step": 1484 + }, + { + "epoch": 0.1188, + "grad_norm": 1.7931679487228394, + "learning_rate": 9.659817733519333e-06, + "loss": 0.3853, + "step": 1485 + }, + { + "epoch": 0.11888, + "grad_norm": 1.5024420022964478, + "learning_rate": 9.659361625078691e-06, + "loss": 0.2664, + "step": 1486 + }, + { + "epoch": 0.11896, + "grad_norm": 1.6533669233322144, + "learning_rate": 9.65890522185583e-06, + "loss": 0.3301, + "step": 1487 + }, + { + "epoch": 0.11904, + "grad_norm": 2.171912908554077, + "learning_rate": 9.658448523879626e-06, + "loss": 0.396, + "step": 1488 + }, + { + "epoch": 0.11912, + "grad_norm": 1.7563323974609375, + "learning_rate": 9.65799153117897e-06, + "loss": 0.4444, + "step": 1489 + }, + { + "epoch": 0.1192, + "grad_norm": 1.1641733646392822, + "learning_rate": 9.657534243782775e-06, + "loss": 0.2883, + "step": 1490 + }, + { + "epoch": 0.11928, + "grad_norm": 1.5933700799942017, + "learning_rate": 9.657076661719972e-06, + "loss": 0.336, + "step": 1491 + }, + { + "epoch": 0.11936, + "grad_norm": 1.411896824836731, + "learning_rate": 9.656618785019513e-06, + "loss": 0.2531, + "step": 1492 + }, + { + "epoch": 0.11944, + "grad_norm": 1.3056985139846802, + "learning_rate": 9.656160613710364e-06, + "loss": 0.2732, + "step": 1493 + }, + { + "epoch": 0.11952, + "grad_norm": 1.6541974544525146, + "learning_rate": 9.655702147821514e-06, + "loss": 0.3592, + "step": 1494 + }, + { + "epoch": 0.1196, + "grad_norm": 1.6240873336791992, + "learning_rate": 9.655243387381965e-06, + "loss": 0.3601, + "step": 1495 + }, + { + "epoch": 0.11968, + "grad_norm": 1.4387151002883911, + "learning_rate": 9.654784332420744e-06, + "loss": 0.274, + "step": 1496 + }, + { + "epoch": 0.11976, + "grad_norm": 1.491258144378662, + "learning_rate": 9.654324982966891e-06, + "loss": 0.3344, + "step": 1497 + }, + { + "epoch": 0.11984, + "grad_norm": 1.6981704235076904, + "learning_rate": 9.653865339049472e-06, + "loss": 0.5099, + "step": 1498 + }, + { + "epoch": 0.11992, + "grad_norm": 1.6840261220932007, + "learning_rate": 9.653405400697567e-06, + "loss": 0.3375, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 1.4480576515197754, + "learning_rate": 9.65294516794027e-06, + "loss": 0.3806, + "step": 1500 + }, + { + "epoch": 0.12008, + "grad_norm": 1.518832802772522, + "learning_rate": 9.6524846408067e-06, + "loss": 0.3553, + "step": 1501 + }, + { + "epoch": 0.12016, + "grad_norm": 1.694017767906189, + "learning_rate": 9.652023819325998e-06, + "loss": 0.3129, + "step": 1502 + }, + { + "epoch": 0.12024, + "grad_norm": 1.3871276378631592, + "learning_rate": 9.651562703527311e-06, + "loss": 0.3113, + "step": 1503 + }, + { + "epoch": 0.12032, + "grad_norm": 1.6968255043029785, + "learning_rate": 9.651101293439817e-06, + "loss": 0.3511, + "step": 1504 + }, + { + "epoch": 0.1204, + "grad_norm": 1.6635756492614746, + "learning_rate": 9.650639589092708e-06, + "loss": 0.354, + "step": 1505 + }, + { + "epoch": 0.12048, + "grad_norm": 1.5392272472381592, + "learning_rate": 9.65017759051519e-06, + "loss": 0.3186, + "step": 1506 + }, + { + "epoch": 0.12056, + "grad_norm": 1.2892400026321411, + "learning_rate": 9.649715297736499e-06, + "loss": 0.2914, + "step": 1507 + }, + { + "epoch": 0.12064, + "grad_norm": 1.6253796815872192, + "learning_rate": 9.649252710785876e-06, + "loss": 0.3478, + "step": 1508 + }, + { + "epoch": 0.12072, + "grad_norm": 1.8582258224487305, + "learning_rate": 9.648789829692594e-06, + "loss": 0.4373, + "step": 1509 + }, + { + "epoch": 0.1208, + "grad_norm": 1.683634877204895, + "learning_rate": 9.648326654485931e-06, + "loss": 0.3919, + "step": 1510 + }, + { + "epoch": 0.12088, + "grad_norm": 2.1171162128448486, + "learning_rate": 9.647863185195197e-06, + "loss": 0.4234, + "step": 1511 + }, + { + "epoch": 0.12096, + "grad_norm": 1.5578632354736328, + "learning_rate": 9.647399421849708e-06, + "loss": 0.3344, + "step": 1512 + }, + { + "epoch": 0.12104, + "grad_norm": 1.7130169868469238, + "learning_rate": 9.64693536447881e-06, + "loss": 0.4795, + "step": 1513 + }, + { + "epoch": 0.12112, + "grad_norm": 1.5698732137680054, + "learning_rate": 9.64647101311186e-06, + "loss": 0.3961, + "step": 1514 + }, + { + "epoch": 0.1212, + "grad_norm": 1.7023333311080933, + "learning_rate": 9.646006367778235e-06, + "loss": 0.3476, + "step": 1515 + }, + { + "epoch": 0.12128, + "grad_norm": 1.8466098308563232, + "learning_rate": 9.645541428507334e-06, + "loss": 0.4728, + "step": 1516 + }, + { + "epoch": 0.12136, + "grad_norm": 1.5508915185928345, + "learning_rate": 9.64507619532857e-06, + "loss": 0.4352, + "step": 1517 + }, + { + "epoch": 0.12144, + "grad_norm": 1.3814231157302856, + "learning_rate": 9.644610668271377e-06, + "loss": 0.3032, + "step": 1518 + }, + { + "epoch": 0.12152, + "grad_norm": 1.73944890499115, + "learning_rate": 9.64414484736521e-06, + "loss": 0.3277, + "step": 1519 + }, + { + "epoch": 0.1216, + "grad_norm": 1.2703596353530884, + "learning_rate": 9.643678732639537e-06, + "loss": 0.2786, + "step": 1520 + }, + { + "epoch": 0.12168, + "grad_norm": 1.4023946523666382, + "learning_rate": 9.643212324123848e-06, + "loss": 0.2626, + "step": 1521 + }, + { + "epoch": 0.12176, + "grad_norm": 1.5729879140853882, + "learning_rate": 9.64274562184765e-06, + "loss": 0.3552, + "step": 1522 + }, + { + "epoch": 0.12184, + "grad_norm": 1.6716023683547974, + "learning_rate": 9.642278625840473e-06, + "loss": 0.3472, + "step": 1523 + }, + { + "epoch": 0.12192, + "grad_norm": 1.9659554958343506, + "learning_rate": 9.64181133613186e-06, + "loss": 0.3494, + "step": 1524 + }, + { + "epoch": 0.122, + "grad_norm": 1.7329281568527222, + "learning_rate": 9.641343752751375e-06, + "loss": 0.3407, + "step": 1525 + }, + { + "epoch": 0.12208, + "grad_norm": 1.9919676780700684, + "learning_rate": 9.640875875728602e-06, + "loss": 0.461, + "step": 1526 + }, + { + "epoch": 0.12216, + "grad_norm": 1.6600706577301025, + "learning_rate": 9.64040770509314e-06, + "loss": 0.3863, + "step": 1527 + }, + { + "epoch": 0.12224, + "grad_norm": 1.8697322607040405, + "learning_rate": 9.639939240874609e-06, + "loss": 0.4213, + "step": 1528 + }, + { + "epoch": 0.12232, + "grad_norm": 1.6128782033920288, + "learning_rate": 9.639470483102647e-06, + "loss": 0.2803, + "step": 1529 + }, + { + "epoch": 0.1224, + "grad_norm": 1.698266625404358, + "learning_rate": 9.639001431806912e-06, + "loss": 0.3881, + "step": 1530 + }, + { + "epoch": 0.12248, + "grad_norm": 1.2810356616973877, + "learning_rate": 9.638532087017079e-06, + "loss": 0.2839, + "step": 1531 + }, + { + "epoch": 0.12256, + "grad_norm": 1.382637619972229, + "learning_rate": 9.638062448762842e-06, + "loss": 0.2794, + "step": 1532 + }, + { + "epoch": 0.12264, + "grad_norm": 1.7451645135879517, + "learning_rate": 9.637592517073911e-06, + "loss": 0.3944, + "step": 1533 + }, + { + "epoch": 0.12272, + "grad_norm": 1.6562174558639526, + "learning_rate": 9.63712229198002e-06, + "loss": 0.3808, + "step": 1534 + }, + { + "epoch": 0.1228, + "grad_norm": 1.47275972366333, + "learning_rate": 9.636651773510917e-06, + "loss": 0.3047, + "step": 1535 + }, + { + "epoch": 0.12288, + "grad_norm": 1.6594504117965698, + "learning_rate": 9.636180961696371e-06, + "loss": 0.3467, + "step": 1536 + }, + { + "epoch": 0.12296, + "grad_norm": 1.4340310096740723, + "learning_rate": 9.635709856566167e-06, + "loss": 0.2981, + "step": 1537 + }, + { + "epoch": 0.12304, + "grad_norm": 1.524653673171997, + "learning_rate": 9.635238458150114e-06, + "loss": 0.2904, + "step": 1538 + }, + { + "epoch": 0.12312, + "grad_norm": 1.5759027004241943, + "learning_rate": 9.634766766478032e-06, + "loss": 0.3737, + "step": 1539 + }, + { + "epoch": 0.1232, + "grad_norm": 1.283862590789795, + "learning_rate": 9.634294781579764e-06, + "loss": 0.2563, + "step": 1540 + }, + { + "epoch": 0.12328, + "grad_norm": 1.5947818756103516, + "learning_rate": 9.633822503485172e-06, + "loss": 0.3229, + "step": 1541 + }, + { + "epoch": 0.12336, + "grad_norm": 1.2302602529525757, + "learning_rate": 9.633349932224135e-06, + "loss": 0.2566, + "step": 1542 + }, + { + "epoch": 0.12344, + "grad_norm": 1.265610694885254, + "learning_rate": 9.632877067826552e-06, + "loss": 0.25, + "step": 1543 + }, + { + "epoch": 0.12352, + "grad_norm": 1.6030651330947876, + "learning_rate": 9.632403910322337e-06, + "loss": 0.3573, + "step": 1544 + }, + { + "epoch": 0.1236, + "grad_norm": 1.4760074615478516, + "learning_rate": 9.631930459741427e-06, + "loss": 0.3391, + "step": 1545 + }, + { + "epoch": 0.12368, + "grad_norm": 1.7136142253875732, + "learning_rate": 9.631456716113777e-06, + "loss": 0.3732, + "step": 1546 + }, + { + "epoch": 0.12376, + "grad_norm": 1.8622156381607056, + "learning_rate": 9.630982679469355e-06, + "loss": 0.3882, + "step": 1547 + }, + { + "epoch": 0.12384, + "grad_norm": 1.7958600521087646, + "learning_rate": 9.630508349838155e-06, + "loss": 0.3191, + "step": 1548 + }, + { + "epoch": 0.12392, + "grad_norm": 1.9841034412384033, + "learning_rate": 9.630033727250186e-06, + "loss": 0.4149, + "step": 1549 + }, + { + "epoch": 0.124, + "grad_norm": 1.6318235397338867, + "learning_rate": 9.629558811735475e-06, + "loss": 0.349, + "step": 1550 + }, + { + "epoch": 0.12408, + "grad_norm": 1.3617398738861084, + "learning_rate": 9.62908360332407e-06, + "loss": 0.3016, + "step": 1551 + }, + { + "epoch": 0.12416, + "grad_norm": 1.6608842611312866, + "learning_rate": 9.628608102046032e-06, + "loss": 0.3923, + "step": 1552 + }, + { + "epoch": 0.12424, + "grad_norm": 1.6326426267623901, + "learning_rate": 9.628132307931446e-06, + "loss": 0.3796, + "step": 1553 + }, + { + "epoch": 0.12432, + "grad_norm": 1.473404049873352, + "learning_rate": 9.627656221010417e-06, + "loss": 0.3165, + "step": 1554 + }, + { + "epoch": 0.1244, + "grad_norm": 1.6658129692077637, + "learning_rate": 9.627179841313063e-06, + "loss": 0.3675, + "step": 1555 + }, + { + "epoch": 0.12448, + "grad_norm": 1.455764889717102, + "learning_rate": 9.626703168869522e-06, + "loss": 0.3633, + "step": 1556 + }, + { + "epoch": 0.12456, + "grad_norm": 1.4597536325454712, + "learning_rate": 9.626226203709954e-06, + "loss": 0.3412, + "step": 1557 + }, + { + "epoch": 0.12464, + "grad_norm": 1.6526191234588623, + "learning_rate": 9.625748945864531e-06, + "loss": 0.4253, + "step": 1558 + }, + { + "epoch": 0.12472, + "grad_norm": 1.3261500597000122, + "learning_rate": 9.625271395363453e-06, + "loss": 0.2575, + "step": 1559 + }, + { + "epoch": 0.1248, + "grad_norm": 1.5173916816711426, + "learning_rate": 9.624793552236927e-06, + "loss": 0.3687, + "step": 1560 + }, + { + "epoch": 0.12488, + "grad_norm": 1.2248857021331787, + "learning_rate": 9.62431541651519e-06, + "loss": 0.2559, + "step": 1561 + }, + { + "epoch": 0.12496, + "grad_norm": 1.925079584121704, + "learning_rate": 9.623836988228487e-06, + "loss": 0.365, + "step": 1562 + }, + { + "epoch": 0.12504, + "grad_norm": 1.8328197002410889, + "learning_rate": 9.623358267407092e-06, + "loss": 0.3888, + "step": 1563 + }, + { + "epoch": 0.12512, + "grad_norm": 1.723613977432251, + "learning_rate": 9.622879254081288e-06, + "loss": 0.4945, + "step": 1564 + }, + { + "epoch": 0.1252, + "grad_norm": 1.787847638130188, + "learning_rate": 9.622399948281382e-06, + "loss": 0.424, + "step": 1565 + }, + { + "epoch": 0.12528, + "grad_norm": 1.1664930582046509, + "learning_rate": 9.621920350037697e-06, + "loss": 0.2628, + "step": 1566 + }, + { + "epoch": 0.12536, + "grad_norm": 1.3959739208221436, + "learning_rate": 9.621440459380577e-06, + "loss": 0.2896, + "step": 1567 + }, + { + "epoch": 0.12544, + "grad_norm": 1.3110196590423584, + "learning_rate": 9.620960276340383e-06, + "loss": 0.2658, + "step": 1568 + }, + { + "epoch": 0.12552, + "grad_norm": 1.2237164974212646, + "learning_rate": 9.620479800947494e-06, + "loss": 0.2624, + "step": 1569 + }, + { + "epoch": 0.1256, + "grad_norm": 1.7535439729690552, + "learning_rate": 9.619999033232308e-06, + "loss": 0.3613, + "step": 1570 + }, + { + "epoch": 0.12568, + "grad_norm": 1.5596789121627808, + "learning_rate": 9.61951797322524e-06, + "loss": 0.3552, + "step": 1571 + }, + { + "epoch": 0.12576, + "grad_norm": 2.2569661140441895, + "learning_rate": 9.61903662095673e-06, + "loss": 0.4619, + "step": 1572 + }, + { + "epoch": 0.12584, + "grad_norm": 1.540496826171875, + "learning_rate": 9.618554976457226e-06, + "loss": 0.2943, + "step": 1573 + }, + { + "epoch": 0.12592, + "grad_norm": 1.417845606803894, + "learning_rate": 9.618073039757204e-06, + "loss": 0.2505, + "step": 1574 + }, + { + "epoch": 0.126, + "grad_norm": 1.3284822702407837, + "learning_rate": 9.617590810887151e-06, + "loss": 0.2476, + "step": 1575 + }, + { + "epoch": 0.12608, + "grad_norm": 1.5151894092559814, + "learning_rate": 9.617108289877578e-06, + "loss": 0.3971, + "step": 1576 + }, + { + "epoch": 0.12616, + "grad_norm": 1.6880301237106323, + "learning_rate": 9.616625476759014e-06, + "loss": 0.3476, + "step": 1577 + }, + { + "epoch": 0.12624, + "grad_norm": 1.7845243215560913, + "learning_rate": 9.616142371562003e-06, + "loss": 0.4271, + "step": 1578 + }, + { + "epoch": 0.12632, + "grad_norm": 1.6483310461044312, + "learning_rate": 9.61565897431711e-06, + "loss": 0.3685, + "step": 1579 + }, + { + "epoch": 0.1264, + "grad_norm": 1.4874074459075928, + "learning_rate": 9.615175285054916e-06, + "loss": 0.3274, + "step": 1580 + }, + { + "epoch": 0.12648, + "grad_norm": 1.4330966472625732, + "learning_rate": 9.614691303806027e-06, + "loss": 0.3611, + "step": 1581 + }, + { + "epoch": 0.12656, + "grad_norm": 1.424462914466858, + "learning_rate": 9.614207030601057e-06, + "loss": 0.2903, + "step": 1582 + }, + { + "epoch": 0.12664, + "grad_norm": 1.2430349588394165, + "learning_rate": 9.61372246547065e-06, + "loss": 0.2505, + "step": 1583 + }, + { + "epoch": 0.12672, + "grad_norm": 1.5988134145736694, + "learning_rate": 9.613237608445458e-06, + "loss": 0.3123, + "step": 1584 + }, + { + "epoch": 0.1268, + "grad_norm": 1.6877517700195312, + "learning_rate": 9.612752459556161e-06, + "loss": 0.4546, + "step": 1585 + }, + { + "epoch": 0.12688, + "grad_norm": 1.350681185722351, + "learning_rate": 9.612267018833448e-06, + "loss": 0.3087, + "step": 1586 + }, + { + "epoch": 0.12696, + "grad_norm": 1.260632038116455, + "learning_rate": 9.611781286308032e-06, + "loss": 0.2645, + "step": 1587 + }, + { + "epoch": 0.12704, + "grad_norm": 1.2421963214874268, + "learning_rate": 9.611295262010649e-06, + "loss": 0.296, + "step": 1588 + }, + { + "epoch": 0.12712, + "grad_norm": 1.5727829933166504, + "learning_rate": 9.610808945972042e-06, + "loss": 0.3084, + "step": 1589 + }, + { + "epoch": 0.1272, + "grad_norm": 1.3248558044433594, + "learning_rate": 9.610322338222982e-06, + "loss": 0.3195, + "step": 1590 + }, + { + "epoch": 0.12728, + "grad_norm": 1.5574604272842407, + "learning_rate": 9.60983543879425e-06, + "loss": 0.3364, + "step": 1591 + }, + { + "epoch": 0.12736, + "grad_norm": 1.849393367767334, + "learning_rate": 9.609348247716658e-06, + "loss": 0.3784, + "step": 1592 + }, + { + "epoch": 0.12744, + "grad_norm": 1.5725646018981934, + "learning_rate": 9.608860765021025e-06, + "loss": 0.3448, + "step": 1593 + }, + { + "epoch": 0.12752, + "grad_norm": 1.3313325643539429, + "learning_rate": 9.608372990738193e-06, + "loss": 0.2394, + "step": 1594 + }, + { + "epoch": 0.1276, + "grad_norm": 1.4957760572433472, + "learning_rate": 9.60788492489902e-06, + "loss": 0.3165, + "step": 1595 + }, + { + "epoch": 0.12768, + "grad_norm": 1.8721736669540405, + "learning_rate": 9.607396567534387e-06, + "loss": 0.3877, + "step": 1596 + }, + { + "epoch": 0.12776, + "grad_norm": 1.5303232669830322, + "learning_rate": 9.606907918675189e-06, + "loss": 0.3717, + "step": 1597 + }, + { + "epoch": 0.12784, + "grad_norm": 1.6356817483901978, + "learning_rate": 9.606418978352342e-06, + "loss": 0.4317, + "step": 1598 + }, + { + "epoch": 0.12792, + "grad_norm": 1.7951912879943848, + "learning_rate": 9.60592974659678e-06, + "loss": 0.3572, + "step": 1599 + }, + { + "epoch": 0.128, + "grad_norm": 1.7281938791275024, + "learning_rate": 9.605440223439452e-06, + "loss": 0.3721, + "step": 1600 + }, + { + "epoch": 0.12808, + "grad_norm": 1.4484968185424805, + "learning_rate": 9.604950408911334e-06, + "loss": 0.2891, + "step": 1601 + }, + { + "epoch": 0.12816, + "grad_norm": 1.6097825765609741, + "learning_rate": 9.604460303043411e-06, + "loss": 0.3225, + "step": 1602 + }, + { + "epoch": 0.12824, + "grad_norm": 1.80912184715271, + "learning_rate": 9.60396990586669e-06, + "loss": 0.4023, + "step": 1603 + }, + { + "epoch": 0.12832, + "grad_norm": 1.5157806873321533, + "learning_rate": 9.6034792174122e-06, + "loss": 0.3546, + "step": 1604 + }, + { + "epoch": 0.1284, + "grad_norm": 1.6665889024734497, + "learning_rate": 9.60298823771098e-06, + "loss": 0.3968, + "step": 1605 + }, + { + "epoch": 0.12848, + "grad_norm": 1.4822949171066284, + "learning_rate": 9.602496966794098e-06, + "loss": 0.3122, + "step": 1606 + }, + { + "epoch": 0.12856, + "grad_norm": 1.6601344347000122, + "learning_rate": 9.602005404692633e-06, + "loss": 0.3554, + "step": 1607 + }, + { + "epoch": 0.12864, + "grad_norm": 1.8020589351654053, + "learning_rate": 9.601513551437685e-06, + "loss": 0.4218, + "step": 1608 + }, + { + "epoch": 0.12872, + "grad_norm": 1.8085490465164185, + "learning_rate": 9.60102140706037e-06, + "loss": 0.5042, + "step": 1609 + }, + { + "epoch": 0.1288, + "grad_norm": 1.8596782684326172, + "learning_rate": 9.600528971591824e-06, + "loss": 0.4824, + "step": 1610 + }, + { + "epoch": 0.12888, + "grad_norm": 1.5251644849777222, + "learning_rate": 9.600036245063206e-06, + "loss": 0.298, + "step": 1611 + }, + { + "epoch": 0.12896, + "grad_norm": 1.4406249523162842, + "learning_rate": 9.599543227505685e-06, + "loss": 0.286, + "step": 1612 + }, + { + "epoch": 0.12904, + "grad_norm": 2.1787021160125732, + "learning_rate": 9.599049918950456e-06, + "loss": 0.4677, + "step": 1613 + }, + { + "epoch": 0.12912, + "grad_norm": 1.8443048000335693, + "learning_rate": 9.598556319428726e-06, + "loss": 0.3842, + "step": 1614 + }, + { + "epoch": 0.1292, + "grad_norm": 2.4744701385498047, + "learning_rate": 9.598062428971725e-06, + "loss": 0.5183, + "step": 1615 + }, + { + "epoch": 0.12928, + "grad_norm": 1.6642011404037476, + "learning_rate": 9.597568247610699e-06, + "loss": 0.3329, + "step": 1616 + }, + { + "epoch": 0.12936, + "grad_norm": 1.8446506261825562, + "learning_rate": 9.597073775376912e-06, + "loss": 0.4378, + "step": 1617 + }, + { + "epoch": 0.12944, + "grad_norm": 2.154025077819824, + "learning_rate": 9.596579012301652e-06, + "loss": 0.4079, + "step": 1618 + }, + { + "epoch": 0.12952, + "grad_norm": 1.8335460424423218, + "learning_rate": 9.596083958416216e-06, + "loss": 0.3403, + "step": 1619 + }, + { + "epoch": 0.1296, + "grad_norm": 1.4928038120269775, + "learning_rate": 9.595588613751927e-06, + "loss": 0.3134, + "step": 1620 + }, + { + "epoch": 0.12968, + "grad_norm": 1.566196322441101, + "learning_rate": 9.595092978340124e-06, + "loss": 0.3698, + "step": 1621 + }, + { + "epoch": 0.12976, + "grad_norm": 1.6201937198638916, + "learning_rate": 9.594597052212163e-06, + "loss": 0.3139, + "step": 1622 + }, + { + "epoch": 0.12984, + "grad_norm": 1.5580354928970337, + "learning_rate": 9.59410083539942e-06, + "loss": 0.3374, + "step": 1623 + }, + { + "epoch": 0.12992, + "grad_norm": 1.472684621810913, + "learning_rate": 9.593604327933288e-06, + "loss": 0.3058, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 1.9004932641983032, + "learning_rate": 9.59310752984518e-06, + "loss": 0.3661, + "step": 1625 + }, + { + "epoch": 0.13008, + "grad_norm": 1.7129523754119873, + "learning_rate": 9.59261044116653e-06, + "loss": 0.4236, + "step": 1626 + }, + { + "epoch": 0.13016, + "grad_norm": 1.9962881803512573, + "learning_rate": 9.592113061928783e-06, + "loss": 0.4657, + "step": 1627 + }, + { + "epoch": 0.13024, + "grad_norm": 1.0669331550598145, + "learning_rate": 9.591615392163408e-06, + "loss": 0.2477, + "step": 1628 + }, + { + "epoch": 0.13032, + "grad_norm": 1.2336879968643188, + "learning_rate": 9.59111743190189e-06, + "loss": 0.2468, + "step": 1629 + }, + { + "epoch": 0.1304, + "grad_norm": 2.120313882827759, + "learning_rate": 9.590619181175736e-06, + "loss": 0.423, + "step": 1630 + }, + { + "epoch": 0.13048, + "grad_norm": 2.0639867782592773, + "learning_rate": 9.590120640016463e-06, + "loss": 0.3888, + "step": 1631 + }, + { + "epoch": 0.13056, + "grad_norm": 1.6942673921585083, + "learning_rate": 9.589621808455617e-06, + "loss": 0.377, + "step": 1632 + }, + { + "epoch": 0.13064, + "grad_norm": 1.8577876091003418, + "learning_rate": 9.589122686524759e-06, + "loss": 0.3929, + "step": 1633 + }, + { + "epoch": 0.13072, + "grad_norm": 1.837852954864502, + "learning_rate": 9.588623274255461e-06, + "loss": 0.3284, + "step": 1634 + }, + { + "epoch": 0.1308, + "grad_norm": 1.546615481376648, + "learning_rate": 9.588123571679323e-06, + "loss": 0.3227, + "step": 1635 + }, + { + "epoch": 0.13088, + "grad_norm": 1.5524723529815674, + "learning_rate": 9.587623578827958e-06, + "loss": 0.3375, + "step": 1636 + }, + { + "epoch": 0.13096, + "grad_norm": 1.36067533493042, + "learning_rate": 9.587123295733e-06, + "loss": 0.394, + "step": 1637 + }, + { + "epoch": 0.13104, + "grad_norm": 1.860670804977417, + "learning_rate": 9.5866227224261e-06, + "loss": 0.563, + "step": 1638 + }, + { + "epoch": 0.13112, + "grad_norm": 1.4262064695358276, + "learning_rate": 9.586121858938926e-06, + "loss": 0.3446, + "step": 1639 + }, + { + "epoch": 0.1312, + "grad_norm": 1.6558672189712524, + "learning_rate": 9.585620705303168e-06, + "loss": 0.3452, + "step": 1640 + }, + { + "epoch": 0.13128, + "grad_norm": 1.3890858888626099, + "learning_rate": 9.585119261550531e-06, + "loss": 0.3047, + "step": 1641 + }, + { + "epoch": 0.13136, + "grad_norm": 1.5068423748016357, + "learning_rate": 9.58461752771274e-06, + "loss": 0.3401, + "step": 1642 + }, + { + "epoch": 0.13144, + "grad_norm": 1.6645478010177612, + "learning_rate": 9.584115503821538e-06, + "loss": 0.2867, + "step": 1643 + }, + { + "epoch": 0.13152, + "grad_norm": 1.6663644313812256, + "learning_rate": 9.583613189908688e-06, + "loss": 0.3418, + "step": 1644 + }, + { + "epoch": 0.1316, + "grad_norm": 1.356171727180481, + "learning_rate": 9.583110586005969e-06, + "loss": 0.2561, + "step": 1645 + }, + { + "epoch": 0.13168, + "grad_norm": 2.0098698139190674, + "learning_rate": 9.582607692145176e-06, + "loss": 0.3869, + "step": 1646 + }, + { + "epoch": 0.13176, + "grad_norm": 1.6945312023162842, + "learning_rate": 9.582104508358128e-06, + "loss": 0.3589, + "step": 1647 + }, + { + "epoch": 0.13184, + "grad_norm": 1.593902826309204, + "learning_rate": 9.58160103467666e-06, + "loss": 0.367, + "step": 1648 + }, + { + "epoch": 0.13192, + "grad_norm": 1.8232020139694214, + "learning_rate": 9.581097271132626e-06, + "loss": 0.4105, + "step": 1649 + }, + { + "epoch": 0.132, + "grad_norm": 1.5520724058151245, + "learning_rate": 9.580593217757893e-06, + "loss": 0.3554, + "step": 1650 + }, + { + "epoch": 0.13208, + "grad_norm": 1.5328576564788818, + "learning_rate": 9.580088874584356e-06, + "loss": 0.3405, + "step": 1651 + }, + { + "epoch": 0.13216, + "grad_norm": 1.6017850637435913, + "learning_rate": 9.579584241643923e-06, + "loss": 0.3577, + "step": 1652 + }, + { + "epoch": 0.13224, + "grad_norm": 1.6488410234451294, + "learning_rate": 9.579079318968514e-06, + "loss": 0.3528, + "step": 1653 + }, + { + "epoch": 0.13232, + "grad_norm": 2.3161511421203613, + "learning_rate": 9.578574106590081e-06, + "loss": 0.5027, + "step": 1654 + }, + { + "epoch": 0.1324, + "grad_norm": 1.536486029624939, + "learning_rate": 9.578068604540582e-06, + "loss": 0.3476, + "step": 1655 + }, + { + "epoch": 0.13248, + "grad_norm": 1.761263132095337, + "learning_rate": 9.577562812852004e-06, + "loss": 0.36, + "step": 1656 + }, + { + "epoch": 0.13256, + "grad_norm": 1.8455753326416016, + "learning_rate": 9.57705673155634e-06, + "loss": 0.4576, + "step": 1657 + }, + { + "epoch": 0.13264, + "grad_norm": 1.7750699520111084, + "learning_rate": 9.576550360685613e-06, + "loss": 0.3432, + "step": 1658 + }, + { + "epoch": 0.13272, + "grad_norm": 1.4518996477127075, + "learning_rate": 9.576043700271857e-06, + "loss": 0.3352, + "step": 1659 + }, + { + "epoch": 0.1328, + "grad_norm": 1.6674631834030151, + "learning_rate": 9.57553675034713e-06, + "loss": 0.3836, + "step": 1660 + }, + { + "epoch": 0.13288, + "grad_norm": 1.7204508781433105, + "learning_rate": 9.575029510943501e-06, + "loss": 0.3045, + "step": 1661 + }, + { + "epoch": 0.13296, + "grad_norm": 1.37985098361969, + "learning_rate": 9.574521982093063e-06, + "loss": 0.2761, + "step": 1662 + }, + { + "epoch": 0.13304, + "grad_norm": 1.4991388320922852, + "learning_rate": 9.574014163827926e-06, + "loss": 0.3735, + "step": 1663 + }, + { + "epoch": 0.13312, + "grad_norm": 1.4861139059066772, + "learning_rate": 9.573506056180215e-06, + "loss": 0.3132, + "step": 1664 + }, + { + "epoch": 0.1332, + "grad_norm": 1.6248459815979004, + "learning_rate": 9.572997659182081e-06, + "loss": 0.3233, + "step": 1665 + }, + { + "epoch": 0.13328, + "grad_norm": 1.3340996503829956, + "learning_rate": 9.572488972865686e-06, + "loss": 0.3419, + "step": 1666 + }, + { + "epoch": 0.13336, + "grad_norm": 1.760284423828125, + "learning_rate": 9.571979997263214e-06, + "loss": 0.3719, + "step": 1667 + }, + { + "epoch": 0.13344, + "grad_norm": 1.4957592487335205, + "learning_rate": 9.571470732406865e-06, + "loss": 0.3708, + "step": 1668 + }, + { + "epoch": 0.13352, + "grad_norm": 1.7483258247375488, + "learning_rate": 9.570961178328859e-06, + "loss": 0.4577, + "step": 1669 + }, + { + "epoch": 0.1336, + "grad_norm": 1.3776280879974365, + "learning_rate": 9.570451335061433e-06, + "loss": 0.288, + "step": 1670 + }, + { + "epoch": 0.13368, + "grad_norm": 1.427151083946228, + "learning_rate": 9.569941202636846e-06, + "loss": 0.3302, + "step": 1671 + }, + { + "epoch": 0.13376, + "grad_norm": 1.7294217348098755, + "learning_rate": 9.569430781087367e-06, + "loss": 0.3842, + "step": 1672 + }, + { + "epoch": 0.13384, + "grad_norm": 1.1407139301300049, + "learning_rate": 9.568920070445295e-06, + "loss": 0.2215, + "step": 1673 + }, + { + "epoch": 0.13392, + "grad_norm": 1.2754679918289185, + "learning_rate": 9.568409070742936e-06, + "loss": 0.2493, + "step": 1674 + }, + { + "epoch": 0.134, + "grad_norm": 1.5319108963012695, + "learning_rate": 9.56789778201262e-06, + "loss": 0.3972, + "step": 1675 + }, + { + "epoch": 0.13408, + "grad_norm": 1.3693424463272095, + "learning_rate": 9.567386204286697e-06, + "loss": 0.261, + "step": 1676 + }, + { + "epoch": 0.13416, + "grad_norm": 1.649755597114563, + "learning_rate": 9.566874337597533e-06, + "loss": 0.3777, + "step": 1677 + }, + { + "epoch": 0.13424, + "grad_norm": 1.7307039499282837, + "learning_rate": 9.566362181977509e-06, + "loss": 0.4339, + "step": 1678 + }, + { + "epoch": 0.13432, + "grad_norm": 1.3721426725387573, + "learning_rate": 9.565849737459027e-06, + "loss": 0.2845, + "step": 1679 + }, + { + "epoch": 0.1344, + "grad_norm": 2.235292434692383, + "learning_rate": 9.565337004074512e-06, + "loss": 0.4784, + "step": 1680 + }, + { + "epoch": 0.13448, + "grad_norm": 1.6313210725784302, + "learning_rate": 9.5648239818564e-06, + "loss": 0.3986, + "step": 1681 + }, + { + "epoch": 0.13456, + "grad_norm": 1.9515478610992432, + "learning_rate": 9.564310670837146e-06, + "loss": 0.4105, + "step": 1682 + }, + { + "epoch": 0.13464, + "grad_norm": 1.635183572769165, + "learning_rate": 9.563797071049232e-06, + "loss": 0.3187, + "step": 1683 + }, + { + "epoch": 0.13472, + "grad_norm": 1.5895339250564575, + "learning_rate": 9.563283182525145e-06, + "loss": 0.3844, + "step": 1684 + }, + { + "epoch": 0.1348, + "grad_norm": 1.718421459197998, + "learning_rate": 9.562769005297401e-06, + "loss": 0.3469, + "step": 1685 + }, + { + "epoch": 0.13488, + "grad_norm": 1.5541642904281616, + "learning_rate": 9.56225453939853e-06, + "loss": 0.3621, + "step": 1686 + }, + { + "epoch": 0.13496, + "grad_norm": 1.4598628282546997, + "learning_rate": 9.561739784861077e-06, + "loss": 0.3712, + "step": 1687 + }, + { + "epoch": 0.13504, + "grad_norm": 1.4597030878067017, + "learning_rate": 9.561224741717614e-06, + "loss": 0.343, + "step": 1688 + }, + { + "epoch": 0.13512, + "grad_norm": 1.3350636959075928, + "learning_rate": 9.560709410000722e-06, + "loss": 0.2609, + "step": 1689 + }, + { + "epoch": 0.1352, + "grad_norm": 1.1671631336212158, + "learning_rate": 9.560193789743006e-06, + "loss": 0.2378, + "step": 1690 + }, + { + "epoch": 0.13528, + "grad_norm": 1.94452965259552, + "learning_rate": 9.55967788097709e-06, + "loss": 0.3756, + "step": 1691 + }, + { + "epoch": 0.13536, + "grad_norm": 2.2720422744750977, + "learning_rate": 9.559161683735607e-06, + "loss": 0.4896, + "step": 1692 + }, + { + "epoch": 0.13544, + "grad_norm": 1.2150167226791382, + "learning_rate": 9.558645198051221e-06, + "loss": 0.2944, + "step": 1693 + }, + { + "epoch": 0.13552, + "grad_norm": 1.321184754371643, + "learning_rate": 9.558128423956608e-06, + "loss": 0.3323, + "step": 1694 + }, + { + "epoch": 0.1356, + "grad_norm": 1.6017346382141113, + "learning_rate": 9.55761136148446e-06, + "loss": 0.3454, + "step": 1695 + }, + { + "epoch": 0.13568, + "grad_norm": 1.4574471712112427, + "learning_rate": 9.55709401066749e-06, + "loss": 0.3351, + "step": 1696 + }, + { + "epoch": 0.13576, + "grad_norm": 1.7760628461837769, + "learning_rate": 9.556576371538431e-06, + "loss": 0.3757, + "step": 1697 + }, + { + "epoch": 0.13584, + "grad_norm": 1.3665014505386353, + "learning_rate": 9.556058444130032e-06, + "loss": 0.2966, + "step": 1698 + }, + { + "epoch": 0.13592, + "grad_norm": 1.6557552814483643, + "learning_rate": 9.555540228475058e-06, + "loss": 0.3721, + "step": 1699 + }, + { + "epoch": 0.136, + "grad_norm": 1.6838151216506958, + "learning_rate": 9.555021724606298e-06, + "loss": 0.4326, + "step": 1700 + }, + { + "epoch": 0.13608, + "grad_norm": 1.3219002485275269, + "learning_rate": 9.554502932556555e-06, + "loss": 0.3573, + "step": 1701 + }, + { + "epoch": 0.13616, + "grad_norm": 1.4627076387405396, + "learning_rate": 9.55398385235865e-06, + "loss": 0.3406, + "step": 1702 + }, + { + "epoch": 0.13624, + "grad_norm": 1.6964737176895142, + "learning_rate": 9.553464484045425e-06, + "loss": 0.3365, + "step": 1703 + }, + { + "epoch": 0.13632, + "grad_norm": 1.4238336086273193, + "learning_rate": 9.552944827649737e-06, + "loss": 0.3474, + "step": 1704 + }, + { + "epoch": 0.1364, + "grad_norm": 1.3716814517974854, + "learning_rate": 9.552424883204465e-06, + "loss": 0.3811, + "step": 1705 + }, + { + "epoch": 0.13648, + "grad_norm": 1.5849741697311401, + "learning_rate": 9.551904650742503e-06, + "loss": 0.3416, + "step": 1706 + }, + { + "epoch": 0.13656, + "grad_norm": 1.7011810541152954, + "learning_rate": 9.551384130296763e-06, + "loss": 0.3839, + "step": 1707 + }, + { + "epoch": 0.13664, + "grad_norm": 1.6651289463043213, + "learning_rate": 9.55086332190018e-06, + "loss": 0.3876, + "step": 1708 + }, + { + "epoch": 0.13672, + "grad_norm": 1.391045331954956, + "learning_rate": 9.5503422255857e-06, + "loss": 0.2749, + "step": 1709 + }, + { + "epoch": 0.1368, + "grad_norm": 1.50571870803833, + "learning_rate": 9.549820841386295e-06, + "loss": 0.3082, + "step": 1710 + }, + { + "epoch": 0.13688, + "grad_norm": 1.1734155416488647, + "learning_rate": 9.549299169334948e-06, + "loss": 0.3251, + "step": 1711 + }, + { + "epoch": 0.13696, + "grad_norm": 1.4512368440628052, + "learning_rate": 9.548777209464664e-06, + "loss": 0.3287, + "step": 1712 + }, + { + "epoch": 0.13704, + "grad_norm": 1.5819345712661743, + "learning_rate": 9.548254961808467e-06, + "loss": 0.3006, + "step": 1713 + }, + { + "epoch": 0.13712, + "grad_norm": 1.6825000047683716, + "learning_rate": 9.547732426399397e-06, + "loss": 0.3654, + "step": 1714 + }, + { + "epoch": 0.1372, + "grad_norm": 2.002753496170044, + "learning_rate": 9.547209603270513e-06, + "loss": 0.4603, + "step": 1715 + }, + { + "epoch": 0.13728, + "grad_norm": 1.3878475427627563, + "learning_rate": 9.546686492454892e-06, + "loss": 0.3205, + "step": 1716 + }, + { + "epoch": 0.13736, + "grad_norm": 1.9172121286392212, + "learning_rate": 9.546163093985631e-06, + "loss": 0.4037, + "step": 1717 + }, + { + "epoch": 0.13744, + "grad_norm": 1.4683332443237305, + "learning_rate": 9.545639407895842e-06, + "loss": 0.3113, + "step": 1718 + }, + { + "epoch": 0.13752, + "grad_norm": 1.4814456701278687, + "learning_rate": 9.545115434218658e-06, + "loss": 0.3195, + "step": 1719 + }, + { + "epoch": 0.1376, + "grad_norm": 1.6814812421798706, + "learning_rate": 9.544591172987227e-06, + "loss": 0.3734, + "step": 1720 + }, + { + "epoch": 0.13768, + "grad_norm": 1.4139302968978882, + "learning_rate": 9.54406662423472e-06, + "loss": 0.3155, + "step": 1721 + }, + { + "epoch": 0.13776, + "grad_norm": 1.6127204895019531, + "learning_rate": 9.543541787994322e-06, + "loss": 0.3476, + "step": 1722 + }, + { + "epoch": 0.13784, + "grad_norm": 1.9391632080078125, + "learning_rate": 9.543016664299237e-06, + "loss": 0.4613, + "step": 1723 + }, + { + "epoch": 0.13792, + "grad_norm": 1.070672869682312, + "learning_rate": 9.542491253182689e-06, + "loss": 0.2503, + "step": 1724 + }, + { + "epoch": 0.138, + "grad_norm": 1.7871367931365967, + "learning_rate": 9.541965554677918e-06, + "loss": 0.4031, + "step": 1725 + }, + { + "epoch": 0.13808, + "grad_norm": 1.4849225282669067, + "learning_rate": 9.541439568818186e-06, + "loss": 0.3372, + "step": 1726 + }, + { + "epoch": 0.13816, + "grad_norm": 1.5674102306365967, + "learning_rate": 9.540913295636766e-06, + "loss": 0.4731, + "step": 1727 + }, + { + "epoch": 0.13824, + "grad_norm": 1.6845797300338745, + "learning_rate": 9.540386735166957e-06, + "loss": 0.4774, + "step": 1728 + }, + { + "epoch": 0.13832, + "grad_norm": 1.2045738697052002, + "learning_rate": 9.539859887442071e-06, + "loss": 0.3388, + "step": 1729 + }, + { + "epoch": 0.1384, + "grad_norm": 1.78014075756073, + "learning_rate": 9.53933275249544e-06, + "loss": 0.3886, + "step": 1730 + }, + { + "epoch": 0.13848, + "grad_norm": 1.5145901441574097, + "learning_rate": 9.538805330360415e-06, + "loss": 0.3352, + "step": 1731 + }, + { + "epoch": 0.13856, + "grad_norm": 1.681176781654358, + "learning_rate": 9.538277621070363e-06, + "loss": 0.3815, + "step": 1732 + }, + { + "epoch": 0.13864, + "grad_norm": 1.7225366830825806, + "learning_rate": 9.537749624658671e-06, + "loss": 0.3022, + "step": 1733 + }, + { + "epoch": 0.13872, + "grad_norm": 1.4362283945083618, + "learning_rate": 9.537221341158745e-06, + "loss": 0.3086, + "step": 1734 + }, + { + "epoch": 0.1388, + "grad_norm": 1.7611305713653564, + "learning_rate": 9.536692770604005e-06, + "loss": 0.3078, + "step": 1735 + }, + { + "epoch": 0.13888, + "grad_norm": 2.2746381759643555, + "learning_rate": 9.536163913027894e-06, + "loss": 0.4661, + "step": 1736 + }, + { + "epoch": 0.13896, + "grad_norm": 1.6022064685821533, + "learning_rate": 9.535634768463869e-06, + "loss": 0.3514, + "step": 1737 + }, + { + "epoch": 0.13904, + "grad_norm": 2.1435439586639404, + "learning_rate": 9.53510533694541e-06, + "loss": 0.4979, + "step": 1738 + }, + { + "epoch": 0.13912, + "grad_norm": 1.5919454097747803, + "learning_rate": 9.53457561850601e-06, + "loss": 0.387, + "step": 1739 + }, + { + "epoch": 0.1392, + "grad_norm": 1.475101351737976, + "learning_rate": 9.534045613179184e-06, + "loss": 0.3359, + "step": 1740 + }, + { + "epoch": 0.13928, + "grad_norm": 1.5171840190887451, + "learning_rate": 9.533515320998462e-06, + "loss": 0.3717, + "step": 1741 + }, + { + "epoch": 0.13936, + "grad_norm": 1.7646937370300293, + "learning_rate": 9.532984741997395e-06, + "loss": 0.3868, + "step": 1742 + }, + { + "epoch": 0.13944, + "grad_norm": 2.229182720184326, + "learning_rate": 9.532453876209551e-06, + "loss": 0.4725, + "step": 1743 + }, + { + "epoch": 0.13952, + "grad_norm": 1.8388185501098633, + "learning_rate": 9.531922723668517e-06, + "loss": 0.3709, + "step": 1744 + }, + { + "epoch": 0.1396, + "grad_norm": 1.9653486013412476, + "learning_rate": 9.531391284407896e-06, + "loss": 0.4021, + "step": 1745 + }, + { + "epoch": 0.13968, + "grad_norm": 1.6100776195526123, + "learning_rate": 9.530859558461309e-06, + "loss": 0.3279, + "step": 1746 + }, + { + "epoch": 0.13976, + "grad_norm": 1.587827205657959, + "learning_rate": 9.530327545862398e-06, + "loss": 0.3501, + "step": 1747 + }, + { + "epoch": 0.13984, + "grad_norm": 1.6959279775619507, + "learning_rate": 9.529795246644821e-06, + "loss": 0.3558, + "step": 1748 + }, + { + "epoch": 0.13992, + "grad_norm": 1.7590053081512451, + "learning_rate": 9.529262660842257e-06, + "loss": 0.3838, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 2.2163288593292236, + "learning_rate": 9.5287297884884e-06, + "loss": 0.4245, + "step": 1750 + }, + { + "epoch": 0.14008, + "grad_norm": 1.510589599609375, + "learning_rate": 9.528196629616963e-06, + "loss": 0.3794, + "step": 1751 + }, + { + "epoch": 0.14016, + "grad_norm": 1.942276954650879, + "learning_rate": 9.527663184261674e-06, + "loss": 0.4612, + "step": 1752 + }, + { + "epoch": 0.14024, + "grad_norm": 1.3821239471435547, + "learning_rate": 9.527129452456288e-06, + "loss": 0.3189, + "step": 1753 + }, + { + "epoch": 0.14032, + "grad_norm": 2.456770181655884, + "learning_rate": 9.526595434234567e-06, + "loss": 0.5582, + "step": 1754 + }, + { + "epoch": 0.1404, + "grad_norm": 1.4780219793319702, + "learning_rate": 9.5260611296303e-06, + "loss": 0.373, + "step": 1755 + }, + { + "epoch": 0.14048, + "grad_norm": 1.8656375408172607, + "learning_rate": 9.52552653867729e-06, + "loss": 0.3918, + "step": 1756 + }, + { + "epoch": 0.14056, + "grad_norm": 1.6014630794525146, + "learning_rate": 9.524991661409356e-06, + "loss": 0.3833, + "step": 1757 + }, + { + "epoch": 0.14064, + "grad_norm": 1.5639004707336426, + "learning_rate": 9.524456497860342e-06, + "loss": 0.4446, + "step": 1758 + }, + { + "epoch": 0.14072, + "grad_norm": 1.6596323251724243, + "learning_rate": 9.523921048064105e-06, + "loss": 0.3756, + "step": 1759 + }, + { + "epoch": 0.1408, + "grad_norm": 1.7615050077438354, + "learning_rate": 9.523385312054519e-06, + "loss": 0.355, + "step": 1760 + }, + { + "epoch": 0.14088, + "grad_norm": 1.968165636062622, + "learning_rate": 9.52284928986548e-06, + "loss": 0.4518, + "step": 1761 + }, + { + "epoch": 0.14096, + "grad_norm": 1.9906011819839478, + "learning_rate": 9.5223129815309e-06, + "loss": 0.4838, + "step": 1762 + }, + { + "epoch": 0.14104, + "grad_norm": 1.559403419494629, + "learning_rate": 9.52177638708471e-06, + "loss": 0.3133, + "step": 1763 + }, + { + "epoch": 0.14112, + "grad_norm": 1.639631748199463, + "learning_rate": 9.521239506560856e-06, + "loss": 0.3229, + "step": 1764 + }, + { + "epoch": 0.1412, + "grad_norm": 1.5165354013442993, + "learning_rate": 9.520702339993308e-06, + "loss": 0.2711, + "step": 1765 + }, + { + "epoch": 0.14128, + "grad_norm": 1.5883980989456177, + "learning_rate": 9.520164887416048e-06, + "loss": 0.3004, + "step": 1766 + }, + { + "epoch": 0.14136, + "grad_norm": 2.106351852416992, + "learning_rate": 9.519627148863083e-06, + "loss": 0.3873, + "step": 1767 + }, + { + "epoch": 0.14144, + "grad_norm": 1.6328215599060059, + "learning_rate": 9.519089124368428e-06, + "loss": 0.3724, + "step": 1768 + }, + { + "epoch": 0.14152, + "grad_norm": 1.9253731966018677, + "learning_rate": 9.518550813966127e-06, + "loss": 0.4179, + "step": 1769 + }, + { + "epoch": 0.1416, + "grad_norm": 1.5602757930755615, + "learning_rate": 9.518012217690233e-06, + "loss": 0.3443, + "step": 1770 + }, + { + "epoch": 0.14168, + "grad_norm": 0.9548719525337219, + "learning_rate": 9.517473335574826e-06, + "loss": 0.2493, + "step": 1771 + }, + { + "epoch": 0.14176, + "grad_norm": 1.837083339691162, + "learning_rate": 9.516934167653995e-06, + "loss": 0.4071, + "step": 1772 + }, + { + "epoch": 0.14184, + "grad_norm": 1.7493815422058105, + "learning_rate": 9.516394713961851e-06, + "loss": 0.3572, + "step": 1773 + }, + { + "epoch": 0.14192, + "grad_norm": 1.7509686946868896, + "learning_rate": 9.51585497453253e-06, + "loss": 0.38, + "step": 1774 + }, + { + "epoch": 0.142, + "grad_norm": 1.8923393487930298, + "learning_rate": 9.515314949400172e-06, + "loss": 0.3667, + "step": 1775 + }, + { + "epoch": 0.14208, + "grad_norm": 1.5569912195205688, + "learning_rate": 9.514774638598945e-06, + "loss": 0.3337, + "step": 1776 + }, + { + "epoch": 0.14216, + "grad_norm": 1.260204792022705, + "learning_rate": 9.514234042163033e-06, + "loss": 0.2805, + "step": 1777 + }, + { + "epoch": 0.14224, + "grad_norm": 1.447977066040039, + "learning_rate": 9.51369316012664e-06, + "loss": 0.3253, + "step": 1778 + }, + { + "epoch": 0.14232, + "grad_norm": 1.598922610282898, + "learning_rate": 9.513151992523982e-06, + "loss": 0.3362, + "step": 1779 + }, + { + "epoch": 0.1424, + "grad_norm": 1.7492644786834717, + "learning_rate": 9.512610539389297e-06, + "loss": 0.3523, + "step": 1780 + }, + { + "epoch": 0.14248, + "grad_norm": 1.6594542264938354, + "learning_rate": 9.512068800756845e-06, + "loss": 0.3077, + "step": 1781 + }, + { + "epoch": 0.14256, + "grad_norm": 1.445451259613037, + "learning_rate": 9.511526776660898e-06, + "loss": 0.2962, + "step": 1782 + }, + { + "epoch": 0.14264, + "grad_norm": 1.554978609085083, + "learning_rate": 9.510984467135744e-06, + "loss": 0.4125, + "step": 1783 + }, + { + "epoch": 0.14272, + "grad_norm": 1.4923337697982788, + "learning_rate": 9.5104418722157e-06, + "loss": 0.2499, + "step": 1784 + }, + { + "epoch": 0.1428, + "grad_norm": 1.4580963850021362, + "learning_rate": 9.509898991935088e-06, + "loss": 0.3109, + "step": 1785 + }, + { + "epoch": 0.14288, + "grad_norm": 1.3246641159057617, + "learning_rate": 9.50935582632826e-06, + "loss": 0.2577, + "step": 1786 + }, + { + "epoch": 0.14296, + "grad_norm": 1.401862382888794, + "learning_rate": 9.508812375429575e-06, + "loss": 0.34, + "step": 1787 + }, + { + "epoch": 0.14304, + "grad_norm": 1.5076367855072021, + "learning_rate": 9.508268639273417e-06, + "loss": 0.4019, + "step": 1788 + }, + { + "epoch": 0.14312, + "grad_norm": 1.6799163818359375, + "learning_rate": 9.507724617894188e-06, + "loss": 0.4145, + "step": 1789 + }, + { + "epoch": 0.1432, + "grad_norm": 1.2659521102905273, + "learning_rate": 9.507180311326306e-06, + "loss": 0.2803, + "step": 1790 + }, + { + "epoch": 0.14328, + "grad_norm": 2.103095054626465, + "learning_rate": 9.506635719604207e-06, + "loss": 0.4016, + "step": 1791 + }, + { + "epoch": 0.14336, + "grad_norm": 1.4720572233200073, + "learning_rate": 9.506090842762344e-06, + "loss": 0.3029, + "step": 1792 + }, + { + "epoch": 0.14344, + "grad_norm": 1.6554877758026123, + "learning_rate": 9.50554568083519e-06, + "loss": 0.3414, + "step": 1793 + }, + { + "epoch": 0.14352, + "grad_norm": 1.4517250061035156, + "learning_rate": 9.505000233857238e-06, + "loss": 0.423, + "step": 1794 + }, + { + "epoch": 0.1436, + "grad_norm": 1.9899773597717285, + "learning_rate": 9.504454501862994e-06, + "loss": 0.3435, + "step": 1795 + }, + { + "epoch": 0.14368, + "grad_norm": 1.4385102987289429, + "learning_rate": 9.503908484886986e-06, + "loss": 0.3223, + "step": 1796 + }, + { + "epoch": 0.14376, + "grad_norm": 1.4701497554779053, + "learning_rate": 9.503362182963757e-06, + "loss": 0.3363, + "step": 1797 + }, + { + "epoch": 0.14384, + "grad_norm": 2.0168707370758057, + "learning_rate": 9.502815596127874e-06, + "loss": 0.5457, + "step": 1798 + }, + { + "epoch": 0.14392, + "grad_norm": 1.6727696657180786, + "learning_rate": 9.502268724413913e-06, + "loss": 0.3096, + "step": 1799 + }, + { + "epoch": 0.144, + "grad_norm": 1.4367340803146362, + "learning_rate": 9.501721567856475e-06, + "loss": 0.2831, + "step": 1800 + }, + { + "epoch": 0.14408, + "grad_norm": 1.5396634340286255, + "learning_rate": 9.501174126490176e-06, + "loss": 0.3791, + "step": 1801 + }, + { + "epoch": 0.14416, + "grad_norm": 1.8134430646896362, + "learning_rate": 9.500626400349651e-06, + "loss": 0.5339, + "step": 1802 + }, + { + "epoch": 0.14424, + "grad_norm": 1.4883620738983154, + "learning_rate": 9.500078389469551e-06, + "loss": 0.3966, + "step": 1803 + }, + { + "epoch": 0.14432, + "grad_norm": 1.666069746017456, + "learning_rate": 9.49953009388455e-06, + "loss": 0.3484, + "step": 1804 + }, + { + "epoch": 0.1444, + "grad_norm": 1.8850048780441284, + "learning_rate": 9.498981513629336e-06, + "loss": 0.3768, + "step": 1805 + }, + { + "epoch": 0.14448, + "grad_norm": 1.885728359222412, + "learning_rate": 9.498432648738616e-06, + "loss": 0.413, + "step": 1806 + }, + { + "epoch": 0.14456, + "grad_norm": 1.6938875913619995, + "learning_rate": 9.497883499247112e-06, + "loss": 0.3417, + "step": 1807 + }, + { + "epoch": 0.14464, + "grad_norm": 1.6095532178878784, + "learning_rate": 9.49733406518957e-06, + "loss": 0.3333, + "step": 1808 + }, + { + "epoch": 0.14472, + "grad_norm": 2.2393977642059326, + "learning_rate": 9.496784346600749e-06, + "loss": 0.4968, + "step": 1809 + }, + { + "epoch": 0.1448, + "grad_norm": 1.5077526569366455, + "learning_rate": 9.496234343515428e-06, + "loss": 0.3411, + "step": 1810 + }, + { + "epoch": 0.14488, + "grad_norm": 1.496593713760376, + "learning_rate": 9.495684055968408e-06, + "loss": 0.468, + "step": 1811 + }, + { + "epoch": 0.14496, + "grad_norm": 1.466931700706482, + "learning_rate": 9.495133483994498e-06, + "loss": 0.3441, + "step": 1812 + }, + { + "epoch": 0.14504, + "grad_norm": 1.3516722917556763, + "learning_rate": 9.494582627628533e-06, + "loss": 0.2619, + "step": 1813 + }, + { + "epoch": 0.14512, + "grad_norm": 2.093080520629883, + "learning_rate": 9.494031486905366e-06, + "loss": 0.4629, + "step": 1814 + }, + { + "epoch": 0.1452, + "grad_norm": 1.777840256690979, + "learning_rate": 9.493480061859861e-06, + "loss": 0.3284, + "step": 1815 + }, + { + "epoch": 0.14528, + "grad_norm": 1.5537855625152588, + "learning_rate": 9.492928352526908e-06, + "loss": 0.3827, + "step": 1816 + }, + { + "epoch": 0.14536, + "grad_norm": 1.2391389608383179, + "learning_rate": 9.492376358941414e-06, + "loss": 0.3373, + "step": 1817 + }, + { + "epoch": 0.14544, + "grad_norm": 1.3341397047042847, + "learning_rate": 9.4918240811383e-06, + "loss": 0.2814, + "step": 1818 + }, + { + "epoch": 0.14552, + "grad_norm": 1.5178688764572144, + "learning_rate": 9.491271519152503e-06, + "loss": 0.3376, + "step": 1819 + }, + { + "epoch": 0.1456, + "grad_norm": 1.4326900243759155, + "learning_rate": 9.490718673018986e-06, + "loss": 0.3139, + "step": 1820 + }, + { + "epoch": 0.14568, + "grad_norm": 1.5329378843307495, + "learning_rate": 9.490165542772724e-06, + "loss": 0.3327, + "step": 1821 + }, + { + "epoch": 0.14576, + "grad_norm": 1.8199474811553955, + "learning_rate": 9.489612128448714e-06, + "loss": 0.4225, + "step": 1822 + }, + { + "epoch": 0.14584, + "grad_norm": 1.8470107316970825, + "learning_rate": 9.489058430081964e-06, + "loss": 0.3648, + "step": 1823 + }, + { + "epoch": 0.14592, + "grad_norm": 1.2022221088409424, + "learning_rate": 9.48850444770751e-06, + "loss": 0.304, + "step": 1824 + }, + { + "epoch": 0.146, + "grad_norm": 1.4264217615127563, + "learning_rate": 9.487950181360397e-06, + "loss": 0.3224, + "step": 1825 + }, + { + "epoch": 0.14608, + "grad_norm": 2.1026673316955566, + "learning_rate": 9.487395631075693e-06, + "loss": 0.4528, + "step": 1826 + }, + { + "epoch": 0.14616, + "grad_norm": 1.5366865396499634, + "learning_rate": 9.486840796888483e-06, + "loss": 0.2714, + "step": 1827 + }, + { + "epoch": 0.14624, + "grad_norm": 1.836358666419983, + "learning_rate": 9.48628567883387e-06, + "loss": 0.3978, + "step": 1828 + }, + { + "epoch": 0.14632, + "grad_norm": 1.6860939264297485, + "learning_rate": 9.48573027694697e-06, + "loss": 0.3714, + "step": 1829 + }, + { + "epoch": 0.1464, + "grad_norm": 1.3524024486541748, + "learning_rate": 9.485174591262925e-06, + "loss": 0.2514, + "step": 1830 + }, + { + "epoch": 0.14648, + "grad_norm": 1.6706621646881104, + "learning_rate": 9.484618621816892e-06, + "loss": 0.3619, + "step": 1831 + }, + { + "epoch": 0.14656, + "grad_norm": 1.2413554191589355, + "learning_rate": 9.484062368644045e-06, + "loss": 0.3066, + "step": 1832 + }, + { + "epoch": 0.14664, + "grad_norm": 1.7254809141159058, + "learning_rate": 9.483505831779577e-06, + "loss": 0.3962, + "step": 1833 + }, + { + "epoch": 0.14672, + "grad_norm": 1.8376824855804443, + "learning_rate": 9.482949011258693e-06, + "loss": 0.5107, + "step": 1834 + }, + { + "epoch": 0.1468, + "grad_norm": 1.2581253051757812, + "learning_rate": 9.482391907116628e-06, + "loss": 0.2733, + "step": 1835 + }, + { + "epoch": 0.14688, + "grad_norm": 1.1894922256469727, + "learning_rate": 9.481834519388624e-06, + "loss": 0.2422, + "step": 1836 + }, + { + "epoch": 0.14696, + "grad_norm": 1.6918854713439941, + "learning_rate": 9.481276848109947e-06, + "loss": 0.3263, + "step": 1837 + }, + { + "epoch": 0.14704, + "grad_norm": 1.6978294849395752, + "learning_rate": 9.480718893315876e-06, + "loss": 0.3933, + "step": 1838 + }, + { + "epoch": 0.14712, + "grad_norm": 1.701881766319275, + "learning_rate": 9.480160655041717e-06, + "loss": 0.3639, + "step": 1839 + }, + { + "epoch": 0.1472, + "grad_norm": 1.7398412227630615, + "learning_rate": 9.479602133322781e-06, + "loss": 0.3151, + "step": 1840 + }, + { + "epoch": 0.14728, + "grad_norm": 1.8988409042358398, + "learning_rate": 9.479043328194409e-06, + "loss": 0.4829, + "step": 1841 + }, + { + "epoch": 0.14736, + "grad_norm": 1.7794743776321411, + "learning_rate": 9.47848423969195e-06, + "loss": 0.3512, + "step": 1842 + }, + { + "epoch": 0.14744, + "grad_norm": 1.6518741846084595, + "learning_rate": 9.477924867850781e-06, + "loss": 0.35, + "step": 1843 + }, + { + "epoch": 0.14752, + "grad_norm": 2.0679967403411865, + "learning_rate": 9.477365212706286e-06, + "loss": 0.3591, + "step": 1844 + }, + { + "epoch": 0.1476, + "grad_norm": 1.650707721710205, + "learning_rate": 9.476805274293877e-06, + "loss": 0.2558, + "step": 1845 + }, + { + "epoch": 0.14768, + "grad_norm": 1.2669817209243774, + "learning_rate": 9.476245052648978e-06, + "loss": 0.2441, + "step": 1846 + }, + { + "epoch": 0.14776, + "grad_norm": 1.504056692123413, + "learning_rate": 9.475684547807032e-06, + "loss": 0.2845, + "step": 1847 + }, + { + "epoch": 0.14784, + "grad_norm": 0.9670877456665039, + "learning_rate": 9.4751237598035e-06, + "loss": 0.1881, + "step": 1848 + }, + { + "epoch": 0.14792, + "grad_norm": 1.458354115486145, + "learning_rate": 9.474562688673861e-06, + "loss": 0.3841, + "step": 1849 + }, + { + "epoch": 0.148, + "grad_norm": 1.9649925231933594, + "learning_rate": 9.474001334453613e-06, + "loss": 0.3908, + "step": 1850 + }, + { + "epoch": 0.14808, + "grad_norm": 1.5312343835830688, + "learning_rate": 9.47343969717827e-06, + "loss": 0.3213, + "step": 1851 + }, + { + "epoch": 0.14816, + "grad_norm": 1.8272303342819214, + "learning_rate": 9.472877776883365e-06, + "loss": 0.3786, + "step": 1852 + }, + { + "epoch": 0.14824, + "grad_norm": 1.590427041053772, + "learning_rate": 9.47231557360445e-06, + "loss": 0.3035, + "step": 1853 + }, + { + "epoch": 0.14832, + "grad_norm": 1.4117902517318726, + "learning_rate": 9.471753087377094e-06, + "loss": 0.4148, + "step": 1854 + }, + { + "epoch": 0.1484, + "grad_norm": 2.0782604217529297, + "learning_rate": 9.471190318236883e-06, + "loss": 0.7431, + "step": 1855 + }, + { + "epoch": 0.14848, + "grad_norm": 1.819188117980957, + "learning_rate": 9.47062726621942e-06, + "loss": 0.3289, + "step": 1856 + }, + { + "epoch": 0.14856, + "grad_norm": 1.4712084531784058, + "learning_rate": 9.470063931360329e-06, + "loss": 0.3727, + "step": 1857 + }, + { + "epoch": 0.14864, + "grad_norm": 1.7734615802764893, + "learning_rate": 9.46950031369525e-06, + "loss": 0.3527, + "step": 1858 + }, + { + "epoch": 0.14872, + "grad_norm": 1.6210514307022095, + "learning_rate": 9.468936413259842e-06, + "loss": 0.3714, + "step": 1859 + }, + { + "epoch": 0.1488, + "grad_norm": 1.4350985288619995, + "learning_rate": 9.468372230089779e-06, + "loss": 0.3185, + "step": 1860 + }, + { + "epoch": 0.14888, + "grad_norm": 1.2719461917877197, + "learning_rate": 9.467807764220757e-06, + "loss": 0.2664, + "step": 1861 + }, + { + "epoch": 0.14896, + "grad_norm": 1.8364416360855103, + "learning_rate": 9.467243015688486e-06, + "loss": 0.5466, + "step": 1862 + }, + { + "epoch": 0.14904, + "grad_norm": 1.5372499227523804, + "learning_rate": 9.466677984528698e-06, + "loss": 0.3587, + "step": 1863 + }, + { + "epoch": 0.14912, + "grad_norm": 1.268711805343628, + "learning_rate": 9.46611267077714e-06, + "loss": 0.2545, + "step": 1864 + }, + { + "epoch": 0.1492, + "grad_norm": 1.8258861303329468, + "learning_rate": 9.465547074469576e-06, + "loss": 0.4055, + "step": 1865 + }, + { + "epoch": 0.14928, + "grad_norm": 1.428105354309082, + "learning_rate": 9.46498119564179e-06, + "loss": 0.3134, + "step": 1866 + }, + { + "epoch": 0.14936, + "grad_norm": 1.4180129766464233, + "learning_rate": 9.464415034329584e-06, + "loss": 0.3236, + "step": 1867 + }, + { + "epoch": 0.14944, + "grad_norm": 1.5813634395599365, + "learning_rate": 9.463848590568776e-06, + "loss": 0.3679, + "step": 1868 + }, + { + "epoch": 0.14952, + "grad_norm": 1.8351691961288452, + "learning_rate": 9.463281864395204e-06, + "loss": 0.3565, + "step": 1869 + }, + { + "epoch": 0.1496, + "grad_norm": 1.5723137855529785, + "learning_rate": 9.462714855844724e-06, + "loss": 0.3112, + "step": 1870 + }, + { + "epoch": 0.14968, + "grad_norm": 1.319880723953247, + "learning_rate": 9.462147564953206e-06, + "loss": 0.2729, + "step": 1871 + }, + { + "epoch": 0.14976, + "grad_norm": 1.9398796558380127, + "learning_rate": 9.461579991756543e-06, + "loss": 0.4355, + "step": 1872 + }, + { + "epoch": 0.14984, + "grad_norm": 1.74547278881073, + "learning_rate": 9.461012136290641e-06, + "loss": 0.4414, + "step": 1873 + }, + { + "epoch": 0.14992, + "grad_norm": 1.329624891281128, + "learning_rate": 9.460443998591429e-06, + "loss": 0.3745, + "step": 1874 + }, + { + "epoch": 0.15, + "grad_norm": 1.7463513612747192, + "learning_rate": 9.45987557869485e-06, + "loss": 0.5341, + "step": 1875 + }, + { + "epoch": 0.15008, + "grad_norm": 2.0810306072235107, + "learning_rate": 9.459306876636865e-06, + "loss": 0.5546, + "step": 1876 + }, + { + "epoch": 0.15016, + "grad_norm": 1.8790583610534668, + "learning_rate": 9.458737892453455e-06, + "loss": 0.5832, + "step": 1877 + }, + { + "epoch": 0.15024, + "grad_norm": 1.392000675201416, + "learning_rate": 9.458168626180619e-06, + "loss": 0.2942, + "step": 1878 + }, + { + "epoch": 0.15032, + "grad_norm": 1.9226977825164795, + "learning_rate": 9.457599077854369e-06, + "loss": 0.3703, + "step": 1879 + }, + { + "epoch": 0.1504, + "grad_norm": 1.9512839317321777, + "learning_rate": 9.457029247510742e-06, + "loss": 0.3894, + "step": 1880 + }, + { + "epoch": 0.15048, + "grad_norm": 1.148292899131775, + "learning_rate": 9.456459135185787e-06, + "loss": 0.2978, + "step": 1881 + }, + { + "epoch": 0.15056, + "grad_norm": 1.5737361907958984, + "learning_rate": 9.455888740915573e-06, + "loss": 0.3149, + "step": 1882 + }, + { + "epoch": 0.15064, + "grad_norm": 1.5501254796981812, + "learning_rate": 9.45531806473619e-06, + "loss": 0.3572, + "step": 1883 + }, + { + "epoch": 0.15072, + "grad_norm": 1.52022385597229, + "learning_rate": 9.45474710668374e-06, + "loss": 0.2995, + "step": 1884 + }, + { + "epoch": 0.1508, + "grad_norm": 2.051806926727295, + "learning_rate": 9.454175866794344e-06, + "loss": 0.4449, + "step": 1885 + }, + { + "epoch": 0.15088, + "grad_norm": 1.4593029022216797, + "learning_rate": 9.453604345104146e-06, + "loss": 0.3873, + "step": 1886 + }, + { + "epoch": 0.15096, + "grad_norm": 1.7019308805465698, + "learning_rate": 9.453032541649304e-06, + "loss": 0.3382, + "step": 1887 + }, + { + "epoch": 0.15104, + "grad_norm": 1.7120949029922485, + "learning_rate": 9.452460456465991e-06, + "loss": 0.3859, + "step": 1888 + }, + { + "epoch": 0.15112, + "grad_norm": 1.4484901428222656, + "learning_rate": 9.451888089590404e-06, + "loss": 0.3352, + "step": 1889 + }, + { + "epoch": 0.1512, + "grad_norm": 1.4985060691833496, + "learning_rate": 9.451315441058754e-06, + "loss": 0.3172, + "step": 1890 + }, + { + "epoch": 0.15128, + "grad_norm": 1.5723661184310913, + "learning_rate": 9.45074251090727e-06, + "loss": 0.394, + "step": 1891 + }, + { + "epoch": 0.15136, + "grad_norm": 1.131264567375183, + "learning_rate": 9.450169299172201e-06, + "loss": 0.2274, + "step": 1892 + }, + { + "epoch": 0.15144, + "grad_norm": 1.2636674642562866, + "learning_rate": 9.449595805889809e-06, + "loss": 0.3056, + "step": 1893 + }, + { + "epoch": 0.15152, + "grad_norm": 1.8149745464324951, + "learning_rate": 9.449022031096378e-06, + "loss": 0.3782, + "step": 1894 + }, + { + "epoch": 0.1516, + "grad_norm": 2.029224157333374, + "learning_rate": 9.448447974828209e-06, + "loss": 0.3975, + "step": 1895 + }, + { + "epoch": 0.15168, + "grad_norm": 1.4619930982589722, + "learning_rate": 9.447873637121624e-06, + "loss": 0.3157, + "step": 1896 + }, + { + "epoch": 0.15176, + "grad_norm": 1.570886254310608, + "learning_rate": 9.447299018012954e-06, + "loss": 0.372, + "step": 1897 + }, + { + "epoch": 0.15184, + "grad_norm": 1.4011813402175903, + "learning_rate": 9.446724117538559e-06, + "loss": 0.3174, + "step": 1898 + }, + { + "epoch": 0.15192, + "grad_norm": 1.6975699663162231, + "learning_rate": 9.446148935734804e-06, + "loss": 0.3275, + "step": 1899 + }, + { + "epoch": 0.152, + "grad_norm": 3.675276041030884, + "learning_rate": 9.445573472638085e-06, + "loss": 0.4014, + "step": 1900 + }, + { + "epoch": 0.15208, + "grad_norm": 1.3942408561706543, + "learning_rate": 9.444997728284808e-06, + "loss": 0.3226, + "step": 1901 + }, + { + "epoch": 0.15216, + "grad_norm": 1.4078660011291504, + "learning_rate": 9.444421702711397e-06, + "loss": 0.2997, + "step": 1902 + }, + { + "epoch": 0.15224, + "grad_norm": 1.1963045597076416, + "learning_rate": 9.443845395954295e-06, + "loss": 0.2498, + "step": 1903 + }, + { + "epoch": 0.15232, + "grad_norm": 1.53505277633667, + "learning_rate": 9.443268808049966e-06, + "loss": 0.3383, + "step": 1904 + }, + { + "epoch": 0.1524, + "grad_norm": 1.4499201774597168, + "learning_rate": 9.442691939034885e-06, + "loss": 0.2753, + "step": 1905 + }, + { + "epoch": 0.15248, + "grad_norm": 1.238735556602478, + "learning_rate": 9.44211478894555e-06, + "loss": 0.2528, + "step": 1906 + }, + { + "epoch": 0.15256, + "grad_norm": 1.7521767616271973, + "learning_rate": 9.441537357818476e-06, + "loss": 0.4218, + "step": 1907 + }, + { + "epoch": 0.15264, + "grad_norm": 1.695391297340393, + "learning_rate": 9.440959645690195e-06, + "loss": 0.3643, + "step": 1908 + }, + { + "epoch": 0.15272, + "grad_norm": 1.8289469480514526, + "learning_rate": 9.440381652597258e-06, + "loss": 0.3741, + "step": 1909 + }, + { + "epoch": 0.1528, + "grad_norm": 1.3207136392593384, + "learning_rate": 9.43980337857623e-06, + "loss": 0.3747, + "step": 1910 + }, + { + "epoch": 0.15288, + "grad_norm": 1.8912609815597534, + "learning_rate": 9.439224823663698e-06, + "loss": 0.4154, + "step": 1911 + }, + { + "epoch": 0.15296, + "grad_norm": 1.8537139892578125, + "learning_rate": 9.438645987896264e-06, + "loss": 0.4575, + "step": 1912 + }, + { + "epoch": 0.15304, + "grad_norm": 1.393486499786377, + "learning_rate": 9.438066871310552e-06, + "loss": 0.2722, + "step": 1913 + }, + { + "epoch": 0.15312, + "grad_norm": 1.4133944511413574, + "learning_rate": 9.437487473943198e-06, + "loss": 0.335, + "step": 1914 + }, + { + "epoch": 0.1532, + "grad_norm": 1.4364306926727295, + "learning_rate": 9.436907795830861e-06, + "loss": 0.302, + "step": 1915 + }, + { + "epoch": 0.15328, + "grad_norm": 1.730425477027893, + "learning_rate": 9.43632783701021e-06, + "loss": 0.3615, + "step": 1916 + }, + { + "epoch": 0.15336, + "grad_norm": 1.5723671913146973, + "learning_rate": 9.435747597517943e-06, + "loss": 0.2994, + "step": 1917 + }, + { + "epoch": 0.15344, + "grad_norm": 1.4602887630462646, + "learning_rate": 9.435167077390768e-06, + "loss": 0.3276, + "step": 1918 + }, + { + "epoch": 0.15352, + "grad_norm": 1.5619248151779175, + "learning_rate": 9.434586276665412e-06, + "loss": 0.4369, + "step": 1919 + }, + { + "epoch": 0.1536, + "grad_norm": 1.581910252571106, + "learning_rate": 9.434005195378622e-06, + "loss": 0.3914, + "step": 1920 + }, + { + "epoch": 0.15368, + "grad_norm": 1.3012194633483887, + "learning_rate": 9.433423833567156e-06, + "loss": 0.2824, + "step": 1921 + }, + { + "epoch": 0.15376, + "grad_norm": 1.5758806467056274, + "learning_rate": 9.432842191267802e-06, + "loss": 0.2919, + "step": 1922 + }, + { + "epoch": 0.15384, + "grad_norm": 2.18058180809021, + "learning_rate": 9.432260268517352e-06, + "loss": 0.3899, + "step": 1923 + }, + { + "epoch": 0.15392, + "grad_norm": 1.6730529069900513, + "learning_rate": 9.431678065352625e-06, + "loss": 0.5427, + "step": 1924 + }, + { + "epoch": 0.154, + "grad_norm": 1.690724492073059, + "learning_rate": 9.431095581810457e-06, + "loss": 0.3699, + "step": 1925 + }, + { + "epoch": 0.15408, + "grad_norm": 1.7529829740524292, + "learning_rate": 9.430512817927698e-06, + "loss": 0.3121, + "step": 1926 + }, + { + "epoch": 0.15416, + "grad_norm": 1.5383788347244263, + "learning_rate": 9.429929773741216e-06, + "loss": 0.389, + "step": 1927 + }, + { + "epoch": 0.15424, + "grad_norm": 2.017871141433716, + "learning_rate": 9.429346449287902e-06, + "loss": 0.531, + "step": 1928 + }, + { + "epoch": 0.15432, + "grad_norm": 1.6736904382705688, + "learning_rate": 9.428762844604658e-06, + "loss": 0.3876, + "step": 1929 + }, + { + "epoch": 0.1544, + "grad_norm": 1.5975745916366577, + "learning_rate": 9.428178959728406e-06, + "loss": 0.408, + "step": 1930 + }, + { + "epoch": 0.15448, + "grad_norm": 1.3620975017547607, + "learning_rate": 9.427594794696089e-06, + "loss": 0.2987, + "step": 1931 + }, + { + "epoch": 0.15456, + "grad_norm": 1.9585412740707397, + "learning_rate": 9.427010349544665e-06, + "loss": 0.4924, + "step": 1932 + }, + { + "epoch": 0.15464, + "grad_norm": 1.616124153137207, + "learning_rate": 9.426425624311107e-06, + "loss": 0.3609, + "step": 1933 + }, + { + "epoch": 0.15472, + "grad_norm": 1.6203333139419556, + "learning_rate": 9.425840619032411e-06, + "loss": 0.343, + "step": 1934 + }, + { + "epoch": 0.1548, + "grad_norm": 1.8410626649856567, + "learning_rate": 9.42525533374559e-06, + "loss": 0.3722, + "step": 1935 + }, + { + "epoch": 0.15488, + "grad_norm": 1.4306490421295166, + "learning_rate": 9.424669768487668e-06, + "loss": 0.3013, + "step": 1936 + }, + { + "epoch": 0.15496, + "grad_norm": 1.5979857444763184, + "learning_rate": 9.424083923295698e-06, + "loss": 0.4501, + "step": 1937 + }, + { + "epoch": 0.15504, + "grad_norm": 1.3215088844299316, + "learning_rate": 9.42349779820674e-06, + "loss": 0.3814, + "step": 1938 + }, + { + "epoch": 0.15512, + "grad_norm": 1.462342619895935, + "learning_rate": 9.422911393257876e-06, + "loss": 0.3256, + "step": 1939 + }, + { + "epoch": 0.1552, + "grad_norm": 1.2837486267089844, + "learning_rate": 9.422324708486208e-06, + "loss": 0.3029, + "step": 1940 + }, + { + "epoch": 0.15528, + "grad_norm": 1.586371660232544, + "learning_rate": 9.421737743928854e-06, + "loss": 0.4278, + "step": 1941 + }, + { + "epoch": 0.15536, + "grad_norm": 1.5170694589614868, + "learning_rate": 9.421150499622947e-06, + "loss": 0.3776, + "step": 1942 + }, + { + "epoch": 0.15544, + "grad_norm": 1.640691876411438, + "learning_rate": 9.42056297560564e-06, + "loss": 0.2805, + "step": 1943 + }, + { + "epoch": 0.15552, + "grad_norm": 1.7329473495483398, + "learning_rate": 9.419975171914108e-06, + "loss": 0.4477, + "step": 1944 + }, + { + "epoch": 0.1556, + "grad_norm": 1.7486817836761475, + "learning_rate": 9.419387088585534e-06, + "loss": 0.3196, + "step": 1945 + }, + { + "epoch": 0.15568, + "grad_norm": 1.9604579210281372, + "learning_rate": 9.418798725657125e-06, + "loss": 0.373, + "step": 1946 + }, + { + "epoch": 0.15576, + "grad_norm": 1.8238966464996338, + "learning_rate": 9.418210083166108e-06, + "loss": 0.4417, + "step": 1947 + }, + { + "epoch": 0.15584, + "grad_norm": 1.3136991262435913, + "learning_rate": 9.417621161149723e-06, + "loss": 0.2475, + "step": 1948 + }, + { + "epoch": 0.15592, + "grad_norm": 2.024016857147217, + "learning_rate": 9.417031959645227e-06, + "loss": 0.3924, + "step": 1949 + }, + { + "epoch": 0.156, + "grad_norm": 1.4279539585113525, + "learning_rate": 9.416442478689898e-06, + "loss": 0.3131, + "step": 1950 + }, + { + "epoch": 0.15608, + "grad_norm": 1.5639196634292603, + "learning_rate": 9.415852718321032e-06, + "loss": 0.3048, + "step": 1951 + }, + { + "epoch": 0.15616, + "grad_norm": 1.6544080972671509, + "learning_rate": 9.41526267857594e-06, + "loss": 0.3371, + "step": 1952 + }, + { + "epoch": 0.15624, + "grad_norm": 1.7706680297851562, + "learning_rate": 9.414672359491952e-06, + "loss": 0.3639, + "step": 1953 + }, + { + "epoch": 0.15632, + "grad_norm": 1.6847538948059082, + "learning_rate": 9.414081761106413e-06, + "loss": 0.5029, + "step": 1954 + }, + { + "epoch": 0.1564, + "grad_norm": 1.432637095451355, + "learning_rate": 9.413490883456694e-06, + "loss": 0.2809, + "step": 1955 + }, + { + "epoch": 0.15648, + "grad_norm": 1.5604770183563232, + "learning_rate": 9.412899726580171e-06, + "loss": 0.2855, + "step": 1956 + }, + { + "epoch": 0.15656, + "grad_norm": 1.5877360105514526, + "learning_rate": 9.41230829051425e-06, + "loss": 0.3401, + "step": 1957 + }, + { + "epoch": 0.15664, + "grad_norm": 2.027043581008911, + "learning_rate": 9.411716575296349e-06, + "loss": 0.4226, + "step": 1958 + }, + { + "epoch": 0.15672, + "grad_norm": 1.7963011264801025, + "learning_rate": 9.411124580963897e-06, + "loss": 0.3433, + "step": 1959 + }, + { + "epoch": 0.1568, + "grad_norm": 1.6322389841079712, + "learning_rate": 9.410532307554356e-06, + "loss": 0.3208, + "step": 1960 + }, + { + "epoch": 0.15688, + "grad_norm": 1.4425139427185059, + "learning_rate": 9.409939755105193e-06, + "loss": 0.3305, + "step": 1961 + }, + { + "epoch": 0.15696, + "grad_norm": 2.181800365447998, + "learning_rate": 9.409346923653897e-06, + "loss": 0.428, + "step": 1962 + }, + { + "epoch": 0.15704, + "grad_norm": 1.3570939302444458, + "learning_rate": 9.408753813237974e-06, + "loss": 0.3384, + "step": 1963 + }, + { + "epoch": 0.15712, + "grad_norm": 1.302597999572754, + "learning_rate": 9.40816042389495e-06, + "loss": 0.2947, + "step": 1964 + }, + { + "epoch": 0.1572, + "grad_norm": 1.9911285638809204, + "learning_rate": 9.407566755662366e-06, + "loss": 0.4879, + "step": 1965 + }, + { + "epoch": 0.15728, + "grad_norm": 1.518795132637024, + "learning_rate": 9.406972808577782e-06, + "loss": 0.4356, + "step": 1966 + }, + { + "epoch": 0.15736, + "grad_norm": 1.8553831577301025, + "learning_rate": 9.406378582678772e-06, + "loss": 0.407, + "step": 1967 + }, + { + "epoch": 0.15744, + "grad_norm": 1.409943699836731, + "learning_rate": 9.405784078002932e-06, + "loss": 0.3465, + "step": 1968 + }, + { + "epoch": 0.15752, + "grad_norm": 1.3879483938217163, + "learning_rate": 9.405189294587879e-06, + "loss": 0.3013, + "step": 1969 + }, + { + "epoch": 0.1576, + "grad_norm": 1.4256548881530762, + "learning_rate": 9.404594232471237e-06, + "loss": 0.2841, + "step": 1970 + }, + { + "epoch": 0.15768, + "grad_norm": 1.50364351272583, + "learning_rate": 9.403998891690655e-06, + "loss": 0.3085, + "step": 1971 + }, + { + "epoch": 0.15776, + "grad_norm": 1.7137510776519775, + "learning_rate": 9.4034032722838e-06, + "loss": 0.3598, + "step": 1972 + }, + { + "epoch": 0.15784, + "grad_norm": 1.9388797283172607, + "learning_rate": 9.402807374288354e-06, + "loss": 0.505, + "step": 1973 + }, + { + "epoch": 0.15792, + "grad_norm": 1.7556933164596558, + "learning_rate": 9.402211197742016e-06, + "loss": 0.3934, + "step": 1974 + }, + { + "epoch": 0.158, + "grad_norm": 1.7362114191055298, + "learning_rate": 9.401614742682508e-06, + "loss": 0.3839, + "step": 1975 + }, + { + "epoch": 0.15808, + "grad_norm": 1.413800597190857, + "learning_rate": 9.40101800914756e-06, + "loss": 0.3378, + "step": 1976 + }, + { + "epoch": 0.15816, + "grad_norm": 1.6335464715957642, + "learning_rate": 9.40042099717493e-06, + "loss": 0.4107, + "step": 1977 + }, + { + "epoch": 0.15824, + "grad_norm": 1.9229556322097778, + "learning_rate": 9.399823706802386e-06, + "loss": 0.5094, + "step": 1978 + }, + { + "epoch": 0.15832, + "grad_norm": 1.2669042348861694, + "learning_rate": 9.399226138067721e-06, + "loss": 0.304, + "step": 1979 + }, + { + "epoch": 0.1584, + "grad_norm": 1.4909025430679321, + "learning_rate": 9.398628291008735e-06, + "loss": 0.3361, + "step": 1980 + }, + { + "epoch": 0.15848, + "grad_norm": 1.4694348573684692, + "learning_rate": 9.398030165663257e-06, + "loss": 0.3228, + "step": 1981 + }, + { + "epoch": 0.15856, + "grad_norm": 1.7894591093063354, + "learning_rate": 9.397431762069124e-06, + "loss": 0.4964, + "step": 1982 + }, + { + "epoch": 0.15864, + "grad_norm": 1.3913213014602661, + "learning_rate": 9.396833080264198e-06, + "loss": 0.3159, + "step": 1983 + }, + { + "epoch": 0.15872, + "grad_norm": 1.4307069778442383, + "learning_rate": 9.396234120286356e-06, + "loss": 0.4041, + "step": 1984 + }, + { + "epoch": 0.1588, + "grad_norm": 1.663399577140808, + "learning_rate": 9.39563488217349e-06, + "loss": 0.3734, + "step": 1985 + }, + { + "epoch": 0.15888, + "grad_norm": 2.0649588108062744, + "learning_rate": 9.395035365963514e-06, + "loss": 0.4112, + "step": 1986 + }, + { + "epoch": 0.15896, + "grad_norm": 1.6770638227462769, + "learning_rate": 9.394435571694356e-06, + "loss": 0.3357, + "step": 1987 + }, + { + "epoch": 0.15904, + "grad_norm": 2.003188371658325, + "learning_rate": 9.393835499403963e-06, + "loss": 0.4054, + "step": 1988 + }, + { + "epoch": 0.15912, + "grad_norm": 1.657409906387329, + "learning_rate": 9.393235149130299e-06, + "loss": 0.3624, + "step": 1989 + }, + { + "epoch": 0.1592, + "grad_norm": 1.6382983922958374, + "learning_rate": 9.392634520911348e-06, + "loss": 0.4006, + "step": 1990 + }, + { + "epoch": 0.15928, + "grad_norm": 2.041346549987793, + "learning_rate": 9.392033614785108e-06, + "loss": 0.3929, + "step": 1991 + }, + { + "epoch": 0.15936, + "grad_norm": 1.2810925245285034, + "learning_rate": 9.391432430789597e-06, + "loss": 0.2469, + "step": 1992 + }, + { + "epoch": 0.15944, + "grad_norm": 1.6841486692428589, + "learning_rate": 9.390830968962849e-06, + "loss": 0.3423, + "step": 1993 + }, + { + "epoch": 0.15952, + "grad_norm": 1.7549775838851929, + "learning_rate": 9.390229229342918e-06, + "loss": 0.3976, + "step": 1994 + }, + { + "epoch": 0.1596, + "grad_norm": 1.5098248720169067, + "learning_rate": 9.389627211967874e-06, + "loss": 0.3533, + "step": 1995 + }, + { + "epoch": 0.15968, + "grad_norm": 1.392735242843628, + "learning_rate": 9.389024916875805e-06, + "loss": 0.4128, + "step": 1996 + }, + { + "epoch": 0.15976, + "grad_norm": 1.8983796834945679, + "learning_rate": 9.388422344104812e-06, + "loss": 0.4626, + "step": 1997 + }, + { + "epoch": 0.15984, + "grad_norm": 1.4891161918640137, + "learning_rate": 9.387819493693025e-06, + "loss": 0.3736, + "step": 1998 + }, + { + "epoch": 0.15992, + "grad_norm": 1.8602221012115479, + "learning_rate": 9.387216365678578e-06, + "loss": 0.4016, + "step": 1999 + }, + { + "epoch": 0.16, + "grad_norm": 1.5026849508285522, + "learning_rate": 9.38661296009963e-06, + "loss": 0.3062, + "step": 2000 + }, + { + "epoch": 0.16008, + "grad_norm": 1.7527680397033691, + "learning_rate": 9.38600927699436e-06, + "loss": 0.3501, + "step": 2001 + }, + { + "epoch": 0.16016, + "grad_norm": 1.6725231409072876, + "learning_rate": 9.385405316400957e-06, + "loss": 0.4831, + "step": 2002 + }, + { + "epoch": 0.16024, + "grad_norm": 1.6204967498779297, + "learning_rate": 9.384801078357635e-06, + "loss": 0.3016, + "step": 2003 + }, + { + "epoch": 0.16032, + "grad_norm": 1.6982675790786743, + "learning_rate": 9.38419656290262e-06, + "loss": 0.5008, + "step": 2004 + }, + { + "epoch": 0.1604, + "grad_norm": 1.5643174648284912, + "learning_rate": 9.383591770074156e-06, + "loss": 0.302, + "step": 2005 + }, + { + "epoch": 0.16048, + "grad_norm": 1.7776808738708496, + "learning_rate": 9.38298669991051e-06, + "loss": 0.3796, + "step": 2006 + }, + { + "epoch": 0.16056, + "grad_norm": 1.6092027425765991, + "learning_rate": 9.38238135244996e-06, + "loss": 0.3388, + "step": 2007 + }, + { + "epoch": 0.16064, + "grad_norm": 1.8656328916549683, + "learning_rate": 9.381775727730807e-06, + "loss": 0.3574, + "step": 2008 + }, + { + "epoch": 0.16072, + "grad_norm": 1.6534117460250854, + "learning_rate": 9.381169825791364e-06, + "loss": 0.4165, + "step": 2009 + }, + { + "epoch": 0.1608, + "grad_norm": 1.7371861934661865, + "learning_rate": 9.380563646669967e-06, + "loss": 0.41, + "step": 2010 + }, + { + "epoch": 0.16088, + "grad_norm": 1.6235700845718384, + "learning_rate": 9.379957190404966e-06, + "loss": 0.3675, + "step": 2011 + }, + { + "epoch": 0.16096, + "grad_norm": 1.9247312545776367, + "learning_rate": 9.379350457034726e-06, + "loss": 0.4458, + "step": 2012 + }, + { + "epoch": 0.16104, + "grad_norm": 1.2080788612365723, + "learning_rate": 9.378743446597635e-06, + "loss": 0.2824, + "step": 2013 + }, + { + "epoch": 0.16112, + "grad_norm": 1.8663572072982788, + "learning_rate": 9.378136159132101e-06, + "loss": 0.3204, + "step": 2014 + }, + { + "epoch": 0.1612, + "grad_norm": 1.4532349109649658, + "learning_rate": 9.37752859467654e-06, + "loss": 0.3409, + "step": 2015 + }, + { + "epoch": 0.16128, + "grad_norm": 1.467343807220459, + "learning_rate": 9.376920753269391e-06, + "loss": 0.3027, + "step": 2016 + }, + { + "epoch": 0.16136, + "grad_norm": 1.7154619693756104, + "learning_rate": 9.376312634949114e-06, + "loss": 0.3916, + "step": 2017 + }, + { + "epoch": 0.16144, + "grad_norm": 1.4569969177246094, + "learning_rate": 9.375704239754178e-06, + "loss": 0.316, + "step": 2018 + }, + { + "epoch": 0.16152, + "grad_norm": 1.6265618801116943, + "learning_rate": 9.375095567723076e-06, + "loss": 0.29, + "step": 2019 + }, + { + "epoch": 0.1616, + "grad_norm": 1.7646688222885132, + "learning_rate": 9.374486618894316e-06, + "loss": 0.4374, + "step": 2020 + }, + { + "epoch": 0.16168, + "grad_norm": 1.7426738739013672, + "learning_rate": 9.373877393306426e-06, + "loss": 0.3223, + "step": 2021 + }, + { + "epoch": 0.16176, + "grad_norm": 1.8202601671218872, + "learning_rate": 9.373267890997949e-06, + "loss": 0.3962, + "step": 2022 + }, + { + "epoch": 0.16184, + "grad_norm": 1.5177546739578247, + "learning_rate": 9.372658112007442e-06, + "loss": 0.2866, + "step": 2023 + }, + { + "epoch": 0.16192, + "grad_norm": 1.7819550037384033, + "learning_rate": 9.37204805637349e-06, + "loss": 0.3551, + "step": 2024 + }, + { + "epoch": 0.162, + "grad_norm": 1.8583236932754517, + "learning_rate": 9.371437724134687e-06, + "loss": 0.3201, + "step": 2025 + }, + { + "epoch": 0.16208, + "grad_norm": 1.4678624868392944, + "learning_rate": 9.370827115329644e-06, + "loss": 0.3144, + "step": 2026 + }, + { + "epoch": 0.16216, + "grad_norm": 1.5797638893127441, + "learning_rate": 9.370216229996995e-06, + "loss": 0.2976, + "step": 2027 + }, + { + "epoch": 0.16224, + "grad_norm": 1.471345067024231, + "learning_rate": 9.369605068175388e-06, + "loss": 0.2832, + "step": 2028 + }, + { + "epoch": 0.16232, + "grad_norm": 2.2264575958251953, + "learning_rate": 9.368993629903489e-06, + "loss": 0.4478, + "step": 2029 + }, + { + "epoch": 0.1624, + "grad_norm": 2.1455178260803223, + "learning_rate": 9.368381915219982e-06, + "loss": 0.455, + "step": 2030 + }, + { + "epoch": 0.16248, + "grad_norm": 1.47684907913208, + "learning_rate": 9.367769924163568e-06, + "loss": 0.2801, + "step": 2031 + }, + { + "epoch": 0.16256, + "grad_norm": 1.5183435678482056, + "learning_rate": 9.367157656772965e-06, + "loss": 0.367, + "step": 2032 + }, + { + "epoch": 0.16264, + "grad_norm": 1.7493641376495361, + "learning_rate": 9.366545113086909e-06, + "loss": 0.3876, + "step": 2033 + }, + { + "epoch": 0.16272, + "grad_norm": 1.9206058979034424, + "learning_rate": 9.365932293144156e-06, + "loss": 0.3943, + "step": 2034 + }, + { + "epoch": 0.1628, + "grad_norm": 1.4696857929229736, + "learning_rate": 9.365319196983474e-06, + "loss": 0.316, + "step": 2035 + }, + { + "epoch": 0.16288, + "grad_norm": 1.558495044708252, + "learning_rate": 9.364705824643653e-06, + "loss": 0.3151, + "step": 2036 + }, + { + "epoch": 0.16296, + "grad_norm": 1.9205808639526367, + "learning_rate": 9.364092176163499e-06, + "loss": 0.3735, + "step": 2037 + }, + { + "epoch": 0.16304, + "grad_norm": 2.3546855449676514, + "learning_rate": 9.363478251581834e-06, + "loss": 0.4612, + "step": 2038 + }, + { + "epoch": 0.16312, + "grad_norm": 1.5820739269256592, + "learning_rate": 9.362864050937503e-06, + "loss": 0.3454, + "step": 2039 + }, + { + "epoch": 0.1632, + "grad_norm": 1.4024479389190674, + "learning_rate": 9.36224957426936e-06, + "loss": 0.284, + "step": 2040 + }, + { + "epoch": 0.16328, + "grad_norm": 1.346354603767395, + "learning_rate": 9.361634821616284e-06, + "loss": 0.3249, + "step": 2041 + }, + { + "epoch": 0.16336, + "grad_norm": 1.496208667755127, + "learning_rate": 9.361019793017164e-06, + "loss": 0.4657, + "step": 2042 + }, + { + "epoch": 0.16344, + "grad_norm": 1.449812650680542, + "learning_rate": 9.360404488510916e-06, + "loss": 0.332, + "step": 2043 + }, + { + "epoch": 0.16352, + "grad_norm": 1.5065429210662842, + "learning_rate": 9.359788908136464e-06, + "loss": 0.3212, + "step": 2044 + }, + { + "epoch": 0.1636, + "grad_norm": 1.5385485887527466, + "learning_rate": 9.359173051932758e-06, + "loss": 0.35, + "step": 2045 + }, + { + "epoch": 0.16368, + "grad_norm": 1.4125738143920898, + "learning_rate": 9.358556919938759e-06, + "loss": 0.3793, + "step": 2046 + }, + { + "epoch": 0.16376, + "grad_norm": 1.4307483434677124, + "learning_rate": 9.357940512193446e-06, + "loss": 0.3367, + "step": 2047 + }, + { + "epoch": 0.16384, + "grad_norm": 1.469079613685608, + "learning_rate": 9.357323828735818e-06, + "loss": 0.3201, + "step": 2048 + }, + { + "epoch": 0.16392, + "grad_norm": 1.462432861328125, + "learning_rate": 9.356706869604892e-06, + "loss": 0.2956, + "step": 2049 + }, + { + "epoch": 0.164, + "grad_norm": 1.8476369380950928, + "learning_rate": 9.3560896348397e-06, + "loss": 0.4874, + "step": 2050 + }, + { + "epoch": 0.16408, + "grad_norm": 1.6221674680709839, + "learning_rate": 9.355472124479292e-06, + "loss": 0.2875, + "step": 2051 + }, + { + "epoch": 0.16416, + "grad_norm": 1.772452473640442, + "learning_rate": 9.354854338562735e-06, + "loss": 0.3395, + "step": 2052 + }, + { + "epoch": 0.16424, + "grad_norm": 1.8067185878753662, + "learning_rate": 9.354236277129119e-06, + "loss": 0.4001, + "step": 2053 + }, + { + "epoch": 0.16432, + "grad_norm": 1.7058254480361938, + "learning_rate": 9.35361794021754e-06, + "loss": 0.48, + "step": 2054 + }, + { + "epoch": 0.1644, + "grad_norm": 1.4677443504333496, + "learning_rate": 9.352999327867122e-06, + "loss": 0.268, + "step": 2055 + }, + { + "epoch": 0.16448, + "grad_norm": 1.252163052558899, + "learning_rate": 9.352380440117002e-06, + "loss": 0.2939, + "step": 2056 + }, + { + "epoch": 0.16456, + "grad_norm": 1.5425444841384888, + "learning_rate": 9.351761277006332e-06, + "loss": 0.3803, + "step": 2057 + }, + { + "epoch": 0.16464, + "grad_norm": 1.7046698331832886, + "learning_rate": 9.351141838574291e-06, + "loss": 0.3808, + "step": 2058 + }, + { + "epoch": 0.16472, + "grad_norm": 1.4355201721191406, + "learning_rate": 9.350522124860063e-06, + "loss": 0.2472, + "step": 2059 + }, + { + "epoch": 0.1648, + "grad_norm": 1.3044764995574951, + "learning_rate": 9.349902135902857e-06, + "loss": 0.27, + "step": 2060 + }, + { + "epoch": 0.16488, + "grad_norm": 1.7062314748764038, + "learning_rate": 9.349281871741898e-06, + "loss": 0.3372, + "step": 2061 + }, + { + "epoch": 0.16496, + "grad_norm": 1.5979080200195312, + "learning_rate": 9.348661332416428e-06, + "loss": 0.3478, + "step": 2062 + }, + { + "epoch": 0.16504, + "grad_norm": 1.3202630281448364, + "learning_rate": 9.348040517965704e-06, + "loss": 0.2525, + "step": 2063 + }, + { + "epoch": 0.16512, + "grad_norm": 1.7079370021820068, + "learning_rate": 9.347419428429007e-06, + "loss": 0.4255, + "step": 2064 + }, + { + "epoch": 0.1652, + "grad_norm": 2.1258163452148438, + "learning_rate": 9.34679806384563e-06, + "loss": 0.4046, + "step": 2065 + }, + { + "epoch": 0.16528, + "grad_norm": 1.4599392414093018, + "learning_rate": 9.346176424254883e-06, + "loss": 0.3442, + "step": 2066 + }, + { + "epoch": 0.16536, + "grad_norm": 1.5100538730621338, + "learning_rate": 9.345554509696096e-06, + "loss": 0.314, + "step": 2067 + }, + { + "epoch": 0.16544, + "grad_norm": 1.472424030303955, + "learning_rate": 9.344932320208615e-06, + "loss": 0.3324, + "step": 2068 + }, + { + "epoch": 0.16552, + "grad_norm": 1.7998536825180054, + "learning_rate": 9.344309855831806e-06, + "loss": 0.4959, + "step": 2069 + }, + { + "epoch": 0.1656, + "grad_norm": 1.8450273275375366, + "learning_rate": 9.343687116605045e-06, + "loss": 0.3331, + "step": 2070 + }, + { + "epoch": 0.16568, + "grad_norm": 1.7891483306884766, + "learning_rate": 9.343064102567738e-06, + "loss": 0.3411, + "step": 2071 + }, + { + "epoch": 0.16576, + "grad_norm": 1.251073956489563, + "learning_rate": 9.342440813759294e-06, + "loss": 0.2926, + "step": 2072 + }, + { + "epoch": 0.16584, + "grad_norm": 1.9193722009658813, + "learning_rate": 9.341817250219153e-06, + "loss": 0.4983, + "step": 2073 + }, + { + "epoch": 0.16592, + "grad_norm": 1.5404998064041138, + "learning_rate": 9.34119341198676e-06, + "loss": 0.4395, + "step": 2074 + }, + { + "epoch": 0.166, + "grad_norm": 1.6261802911758423, + "learning_rate": 9.340569299101584e-06, + "loss": 0.382, + "step": 2075 + }, + { + "epoch": 0.16608, + "grad_norm": 1.4044370651245117, + "learning_rate": 9.339944911603116e-06, + "loss": 0.3091, + "step": 2076 + }, + { + "epoch": 0.16616, + "grad_norm": 1.707542061805725, + "learning_rate": 9.339320249530851e-06, + "loss": 0.3379, + "step": 2077 + }, + { + "epoch": 0.16624, + "grad_norm": 2.2420706748962402, + "learning_rate": 9.338695312924317e-06, + "loss": 0.4235, + "step": 2078 + }, + { + "epoch": 0.16632, + "grad_norm": 1.7224174737930298, + "learning_rate": 9.338070101823046e-06, + "loss": 0.3715, + "step": 2079 + }, + { + "epoch": 0.1664, + "grad_norm": 1.4423843622207642, + "learning_rate": 9.337444616266595e-06, + "loss": 0.3355, + "step": 2080 + }, + { + "epoch": 0.16648, + "grad_norm": 1.2990514039993286, + "learning_rate": 9.336818856294535e-06, + "loss": 0.2665, + "step": 2081 + }, + { + "epoch": 0.16656, + "grad_norm": 1.7929611206054688, + "learning_rate": 9.336192821946459e-06, + "loss": 0.34, + "step": 2082 + }, + { + "epoch": 0.16664, + "grad_norm": 1.742226243019104, + "learning_rate": 9.33556651326197e-06, + "loss": 0.3516, + "step": 2083 + }, + { + "epoch": 0.16672, + "grad_norm": 1.5436040163040161, + "learning_rate": 9.334939930280698e-06, + "loss": 0.2721, + "step": 2084 + }, + { + "epoch": 0.1668, + "grad_norm": 2.2682416439056396, + "learning_rate": 9.334313073042279e-06, + "loss": 0.4935, + "step": 2085 + }, + { + "epoch": 0.16688, + "grad_norm": 1.4342129230499268, + "learning_rate": 9.333685941586375e-06, + "loss": 0.382, + "step": 2086 + }, + { + "epoch": 0.16696, + "grad_norm": 1.3763701915740967, + "learning_rate": 9.333058535952661e-06, + "loss": 0.2874, + "step": 2087 + }, + { + "epoch": 0.16704, + "grad_norm": 1.4731640815734863, + "learning_rate": 9.332430856180831e-06, + "loss": 0.3329, + "step": 2088 + }, + { + "epoch": 0.16712, + "grad_norm": 1.6573126316070557, + "learning_rate": 9.3318029023106e-06, + "loss": 0.4924, + "step": 2089 + }, + { + "epoch": 0.1672, + "grad_norm": 1.5075602531433105, + "learning_rate": 9.331174674381692e-06, + "loss": 0.3175, + "step": 2090 + }, + { + "epoch": 0.16728, + "grad_norm": 1.938504934310913, + "learning_rate": 9.330546172433855e-06, + "loss": 0.4211, + "step": 2091 + }, + { + "epoch": 0.16736, + "grad_norm": 1.2835865020751953, + "learning_rate": 9.329917396506851e-06, + "loss": 0.2607, + "step": 2092 + }, + { + "epoch": 0.16744, + "grad_norm": 1.6493488550186157, + "learning_rate": 9.329288346640462e-06, + "loss": 0.3207, + "step": 2093 + }, + { + "epoch": 0.16752, + "grad_norm": 1.4919483661651611, + "learning_rate": 9.328659022874486e-06, + "loss": 0.3066, + "step": 2094 + }, + { + "epoch": 0.1676, + "grad_norm": 1.7278863191604614, + "learning_rate": 9.328029425248736e-06, + "loss": 0.4222, + "step": 2095 + }, + { + "epoch": 0.16768, + "grad_norm": 1.574093222618103, + "learning_rate": 9.327399553803047e-06, + "loss": 0.3571, + "step": 2096 + }, + { + "epoch": 0.16776, + "grad_norm": 1.94347083568573, + "learning_rate": 9.326769408577266e-06, + "loss": 0.4714, + "step": 2097 + }, + { + "epoch": 0.16784, + "grad_norm": 1.8207807540893555, + "learning_rate": 9.326138989611265e-06, + "loss": 0.3273, + "step": 2098 + }, + { + "epoch": 0.16792, + "grad_norm": 1.5437902212142944, + "learning_rate": 9.325508296944922e-06, + "loss": 0.3561, + "step": 2099 + }, + { + "epoch": 0.168, + "grad_norm": 1.638817310333252, + "learning_rate": 9.324877330618143e-06, + "loss": 0.3146, + "step": 2100 + }, + { + "epoch": 0.16808, + "grad_norm": 1.3537306785583496, + "learning_rate": 9.324246090670847e-06, + "loss": 0.263, + "step": 2101 + }, + { + "epoch": 0.16816, + "grad_norm": 1.4852831363677979, + "learning_rate": 9.32361457714297e-06, + "loss": 0.2881, + "step": 2102 + }, + { + "epoch": 0.16824, + "grad_norm": 1.9877256155014038, + "learning_rate": 9.322982790074467e-06, + "loss": 0.425, + "step": 2103 + }, + { + "epoch": 0.16832, + "grad_norm": 1.4754945039749146, + "learning_rate": 9.322350729505305e-06, + "loss": 0.2753, + "step": 2104 + }, + { + "epoch": 0.1684, + "grad_norm": 1.820770502090454, + "learning_rate": 9.321718395475475e-06, + "loss": 0.3354, + "step": 2105 + }, + { + "epoch": 0.16848, + "grad_norm": 1.3193517923355103, + "learning_rate": 9.321085788024983e-06, + "loss": 0.2598, + "step": 2106 + }, + { + "epoch": 0.16856, + "grad_norm": 1.3211908340454102, + "learning_rate": 9.320452907193854e-06, + "loss": 0.2607, + "step": 2107 + }, + { + "epoch": 0.16864, + "grad_norm": 2.045539140701294, + "learning_rate": 9.319819753022123e-06, + "loss": 0.3862, + "step": 2108 + }, + { + "epoch": 0.16872, + "grad_norm": 1.5828810930252075, + "learning_rate": 9.319186325549851e-06, + "loss": 0.478, + "step": 2109 + }, + { + "epoch": 0.1688, + "grad_norm": 1.4170411825180054, + "learning_rate": 9.318552624817114e-06, + "loss": 0.323, + "step": 2110 + }, + { + "epoch": 0.16888, + "grad_norm": 1.40409255027771, + "learning_rate": 9.317918650864e-06, + "loss": 0.2817, + "step": 2111 + }, + { + "epoch": 0.16896, + "grad_norm": 1.6362119913101196, + "learning_rate": 9.317284403730622e-06, + "loss": 0.3455, + "step": 2112 + }, + { + "epoch": 0.16904, + "grad_norm": 1.4493852853775024, + "learning_rate": 9.316649883457104e-06, + "loss": 0.3114, + "step": 2113 + }, + { + "epoch": 0.16912, + "grad_norm": 1.9049961566925049, + "learning_rate": 9.316015090083595e-06, + "loss": 0.3715, + "step": 2114 + }, + { + "epoch": 0.1692, + "grad_norm": 1.3683356046676636, + "learning_rate": 9.315380023650248e-06, + "loss": 0.2945, + "step": 2115 + }, + { + "epoch": 0.16928, + "grad_norm": 2.296933889389038, + "learning_rate": 9.31474468419725e-06, + "loss": 0.445, + "step": 2116 + }, + { + "epoch": 0.16936, + "grad_norm": 1.487442970275879, + "learning_rate": 9.314109071764793e-06, + "loss": 0.2976, + "step": 2117 + }, + { + "epoch": 0.16944, + "grad_norm": 1.581235647201538, + "learning_rate": 9.313473186393087e-06, + "loss": 0.3736, + "step": 2118 + }, + { + "epoch": 0.16952, + "grad_norm": 1.685365080833435, + "learning_rate": 9.312837028122368e-06, + "loss": 0.3519, + "step": 2119 + }, + { + "epoch": 0.1696, + "grad_norm": 1.5775948762893677, + "learning_rate": 9.312200596992879e-06, + "loss": 0.3245, + "step": 2120 + }, + { + "epoch": 0.16968, + "grad_norm": 1.5085936784744263, + "learning_rate": 9.311563893044888e-06, + "loss": 0.2838, + "step": 2121 + }, + { + "epoch": 0.16976, + "grad_norm": 1.522886037826538, + "learning_rate": 9.310926916318677e-06, + "loss": 0.3361, + "step": 2122 + }, + { + "epoch": 0.16984, + "grad_norm": 1.5970796346664429, + "learning_rate": 9.310289666854543e-06, + "loss": 0.4171, + "step": 2123 + }, + { + "epoch": 0.16992, + "grad_norm": 1.9791122674942017, + "learning_rate": 9.309652144692804e-06, + "loss": 0.4402, + "step": 2124 + }, + { + "epoch": 0.17, + "grad_norm": 2.1077051162719727, + "learning_rate": 9.309014349873795e-06, + "loss": 0.5385, + "step": 2125 + }, + { + "epoch": 0.17008, + "grad_norm": 1.4200516939163208, + "learning_rate": 9.308376282437866e-06, + "loss": 0.306, + "step": 2126 + }, + { + "epoch": 0.17016, + "grad_norm": 1.521388053894043, + "learning_rate": 9.307737942425385e-06, + "loss": 0.3083, + "step": 2127 + }, + { + "epoch": 0.17024, + "grad_norm": 2.1909584999084473, + "learning_rate": 9.307099329876736e-06, + "loss": 0.3912, + "step": 2128 + }, + { + "epoch": 0.17032, + "grad_norm": 1.6918203830718994, + "learning_rate": 9.306460444832327e-06, + "loss": 0.3992, + "step": 2129 + }, + { + "epoch": 0.1704, + "grad_norm": 1.7204002141952515, + "learning_rate": 9.305821287332575e-06, + "loss": 0.3076, + "step": 2130 + }, + { + "epoch": 0.17048, + "grad_norm": 1.634256362915039, + "learning_rate": 9.305181857417917e-06, + "loss": 0.3281, + "step": 2131 + }, + { + "epoch": 0.17056, + "grad_norm": 1.616892695426941, + "learning_rate": 9.304542155128806e-06, + "loss": 0.3026, + "step": 2132 + }, + { + "epoch": 0.17064, + "grad_norm": 1.3723448514938354, + "learning_rate": 9.30390218050572e-06, + "loss": 0.3969, + "step": 2133 + }, + { + "epoch": 0.17072, + "grad_norm": 1.7421725988388062, + "learning_rate": 9.303261933589141e-06, + "loss": 0.3514, + "step": 2134 + }, + { + "epoch": 0.1708, + "grad_norm": 1.6025031805038452, + "learning_rate": 9.302621414419577e-06, + "loss": 0.3093, + "step": 2135 + }, + { + "epoch": 0.17088, + "grad_norm": 1.4779335260391235, + "learning_rate": 9.301980623037556e-06, + "loss": 0.2883, + "step": 2136 + }, + { + "epoch": 0.17096, + "grad_norm": 1.6824702024459839, + "learning_rate": 9.301339559483614e-06, + "loss": 0.3437, + "step": 2137 + }, + { + "epoch": 0.17104, + "grad_norm": 1.7231637239456177, + "learning_rate": 9.30069822379831e-06, + "loss": 0.5239, + "step": 2138 + }, + { + "epoch": 0.17112, + "grad_norm": 1.4851043224334717, + "learning_rate": 9.30005661602222e-06, + "loss": 0.3925, + "step": 2139 + }, + { + "epoch": 0.1712, + "grad_norm": 1.611282229423523, + "learning_rate": 9.299414736195936e-06, + "loss": 0.316, + "step": 2140 + }, + { + "epoch": 0.17128, + "grad_norm": 1.8075153827667236, + "learning_rate": 9.298772584360068e-06, + "loss": 0.4018, + "step": 2141 + }, + { + "epoch": 0.17136, + "grad_norm": 1.6185144186019897, + "learning_rate": 9.298130160555241e-06, + "loss": 0.3886, + "step": 2142 + }, + { + "epoch": 0.17144, + "grad_norm": 1.4245575666427612, + "learning_rate": 9.297487464822101e-06, + "loss": 0.2893, + "step": 2143 + }, + { + "epoch": 0.17152, + "grad_norm": 1.7352980375289917, + "learning_rate": 9.296844497201309e-06, + "loss": 0.3129, + "step": 2144 + }, + { + "epoch": 0.1716, + "grad_norm": 1.7901087999343872, + "learning_rate": 9.296201257733542e-06, + "loss": 0.4773, + "step": 2145 + }, + { + "epoch": 0.17168, + "grad_norm": 1.4795407056808472, + "learning_rate": 9.295557746459498e-06, + "loss": 0.2931, + "step": 2146 + }, + { + "epoch": 0.17176, + "grad_norm": 1.349307894706726, + "learning_rate": 9.294913963419887e-06, + "loss": 0.2764, + "step": 2147 + }, + { + "epoch": 0.17184, + "grad_norm": 1.5399342775344849, + "learning_rate": 9.29426990865544e-06, + "loss": 0.3619, + "step": 2148 + }, + { + "epoch": 0.17192, + "grad_norm": 1.2979024648666382, + "learning_rate": 9.293625582206907e-06, + "loss": 0.2828, + "step": 2149 + }, + { + "epoch": 0.172, + "grad_norm": 1.999293565750122, + "learning_rate": 9.292980984115048e-06, + "loss": 0.4908, + "step": 2150 + }, + { + "epoch": 0.17208, + "grad_norm": 1.910706877708435, + "learning_rate": 9.292336114420645e-06, + "loss": 0.3672, + "step": 2151 + }, + { + "epoch": 0.17216, + "grad_norm": 1.7465935945510864, + "learning_rate": 9.2916909731645e-06, + "loss": 0.447, + "step": 2152 + }, + { + "epoch": 0.17224, + "grad_norm": 1.3406716585159302, + "learning_rate": 9.291045560387428e-06, + "loss": 0.3121, + "step": 2153 + }, + { + "epoch": 0.17232, + "grad_norm": 1.4294350147247314, + "learning_rate": 9.290399876130261e-06, + "loss": 0.3287, + "step": 2154 + }, + { + "epoch": 0.1724, + "grad_norm": 1.1393554210662842, + "learning_rate": 9.289753920433848e-06, + "loss": 0.2707, + "step": 2155 + }, + { + "epoch": 0.17248, + "grad_norm": 1.8590903282165527, + "learning_rate": 9.28910769333906e-06, + "loss": 0.4045, + "step": 2156 + }, + { + "epoch": 0.17256, + "grad_norm": 1.8060451745986938, + "learning_rate": 9.288461194886778e-06, + "loss": 0.4324, + "step": 2157 + }, + { + "epoch": 0.17264, + "grad_norm": 1.4243879318237305, + "learning_rate": 9.287814425117907e-06, + "loss": 0.2985, + "step": 2158 + }, + { + "epoch": 0.17272, + "grad_norm": 1.8122689723968506, + "learning_rate": 9.287167384073364e-06, + "loss": 0.4555, + "step": 2159 + }, + { + "epoch": 0.1728, + "grad_norm": 1.4533016681671143, + "learning_rate": 9.286520071794085e-06, + "loss": 0.2583, + "step": 2160 + }, + { + "epoch": 0.17288, + "grad_norm": 1.7077553272247314, + "learning_rate": 9.285872488321023e-06, + "loss": 0.475, + "step": 2161 + }, + { + "epoch": 0.17296, + "grad_norm": 1.8486095666885376, + "learning_rate": 9.285224633695151e-06, + "loss": 0.4598, + "step": 2162 + }, + { + "epoch": 0.17304, + "grad_norm": 1.693713903427124, + "learning_rate": 9.284576507957454e-06, + "loss": 0.3301, + "step": 2163 + }, + { + "epoch": 0.17312, + "grad_norm": 1.5107775926589966, + "learning_rate": 9.283928111148937e-06, + "loss": 0.3226, + "step": 2164 + }, + { + "epoch": 0.1732, + "grad_norm": 1.395003318786621, + "learning_rate": 9.283279443310623e-06, + "loss": 0.3354, + "step": 2165 + }, + { + "epoch": 0.17328, + "grad_norm": 1.4116548299789429, + "learning_rate": 9.28263050448355e-06, + "loss": 0.2977, + "step": 2166 + }, + { + "epoch": 0.17336, + "grad_norm": 1.5333601236343384, + "learning_rate": 9.281981294708775e-06, + "loss": 0.4171, + "step": 2167 + }, + { + "epoch": 0.17344, + "grad_norm": 1.510820746421814, + "learning_rate": 9.281331814027372e-06, + "loss": 0.3506, + "step": 2168 + }, + { + "epoch": 0.17352, + "grad_norm": 1.3123133182525635, + "learning_rate": 9.28068206248043e-06, + "loss": 0.2977, + "step": 2169 + }, + { + "epoch": 0.1736, + "grad_norm": 1.606671929359436, + "learning_rate": 9.280032040109057e-06, + "loss": 0.3311, + "step": 2170 + }, + { + "epoch": 0.17368, + "grad_norm": 1.8891907930374146, + "learning_rate": 9.279381746954378e-06, + "loss": 0.5067, + "step": 2171 + }, + { + "epoch": 0.17376, + "grad_norm": 1.3156824111938477, + "learning_rate": 9.278731183057533e-06, + "loss": 0.3077, + "step": 2172 + }, + { + "epoch": 0.17384, + "grad_norm": 1.8756389617919922, + "learning_rate": 9.278080348459684e-06, + "loss": 0.3977, + "step": 2173 + }, + { + "epoch": 0.17392, + "grad_norm": 1.0538018941879272, + "learning_rate": 9.277429243202007e-06, + "loss": 0.244, + "step": 2174 + }, + { + "epoch": 0.174, + "grad_norm": 1.7108577489852905, + "learning_rate": 9.276777867325693e-06, + "loss": 0.4015, + "step": 2175 + }, + { + "epoch": 0.17408, + "grad_norm": 1.6555908918380737, + "learning_rate": 9.276126220871952e-06, + "loss": 0.4131, + "step": 2176 + }, + { + "epoch": 0.17416, + "grad_norm": 1.5826455354690552, + "learning_rate": 9.275474303882016e-06, + "loss": 0.3234, + "step": 2177 + }, + { + "epoch": 0.17424, + "grad_norm": 1.338919758796692, + "learning_rate": 9.274822116397124e-06, + "loss": 0.3157, + "step": 2178 + }, + { + "epoch": 0.17432, + "grad_norm": 1.8238811492919922, + "learning_rate": 9.274169658458543e-06, + "loss": 0.3202, + "step": 2179 + }, + { + "epoch": 0.1744, + "grad_norm": 1.4607664346694946, + "learning_rate": 9.273516930107547e-06, + "loss": 0.3517, + "step": 2180 + }, + { + "epoch": 0.17448, + "grad_norm": 1.8293615579605103, + "learning_rate": 9.272863931385434e-06, + "loss": 0.3754, + "step": 2181 + }, + { + "epoch": 0.17456, + "grad_norm": 2.1700384616851807, + "learning_rate": 9.272210662333518e-06, + "loss": 0.5072, + "step": 2182 + }, + { + "epoch": 0.17464, + "grad_norm": 1.8814268112182617, + "learning_rate": 9.27155712299313e-06, + "loss": 0.366, + "step": 2183 + }, + { + "epoch": 0.17472, + "grad_norm": 1.32944655418396, + "learning_rate": 9.270903313405612e-06, + "loss": 0.287, + "step": 2184 + }, + { + "epoch": 0.1748, + "grad_norm": 1.4097766876220703, + "learning_rate": 9.270249233612334e-06, + "loss": 0.3646, + "step": 2185 + }, + { + "epoch": 0.17488, + "grad_norm": 2.0853865146636963, + "learning_rate": 9.269594883654673e-06, + "loss": 0.383, + "step": 2186 + }, + { + "epoch": 0.17496, + "grad_norm": 2.0160350799560547, + "learning_rate": 9.268940263574032e-06, + "loss": 0.5086, + "step": 2187 + }, + { + "epoch": 0.17504, + "grad_norm": 1.8854643106460571, + "learning_rate": 9.268285373411823e-06, + "loss": 0.4704, + "step": 2188 + }, + { + "epoch": 0.17512, + "grad_norm": 1.7649178504943848, + "learning_rate": 9.267630213209482e-06, + "loss": 0.3193, + "step": 2189 + }, + { + "epoch": 0.1752, + "grad_norm": 1.8833215236663818, + "learning_rate": 9.266974783008456e-06, + "loss": 0.4477, + "step": 2190 + }, + { + "epoch": 0.17528, + "grad_norm": 1.488447666168213, + "learning_rate": 9.266319082850212e-06, + "loss": 0.3464, + "step": 2191 + }, + { + "epoch": 0.17536, + "grad_norm": 1.2886351346969604, + "learning_rate": 9.265663112776237e-06, + "loss": 0.3003, + "step": 2192 + }, + { + "epoch": 0.17544, + "grad_norm": 2.210524320602417, + "learning_rate": 9.265006872828028e-06, + "loss": 0.4336, + "step": 2193 + }, + { + "epoch": 0.17552, + "grad_norm": 1.7764190435409546, + "learning_rate": 9.264350363047105e-06, + "loss": 0.3219, + "step": 2194 + }, + { + "epoch": 0.1756, + "grad_norm": 1.7255445718765259, + "learning_rate": 9.263693583475003e-06, + "loss": 0.4066, + "step": 2195 + }, + { + "epoch": 0.17568, + "grad_norm": 1.7819242477416992, + "learning_rate": 9.263036534153276e-06, + "loss": 0.3992, + "step": 2196 + }, + { + "epoch": 0.17576, + "grad_norm": 1.5859037637710571, + "learning_rate": 9.262379215123489e-06, + "loss": 0.4164, + "step": 2197 + }, + { + "epoch": 0.17584, + "grad_norm": 1.5518395900726318, + "learning_rate": 9.261721626427233e-06, + "loss": 0.2977, + "step": 2198 + }, + { + "epoch": 0.17592, + "grad_norm": 1.689637541770935, + "learning_rate": 9.26106376810611e-06, + "loss": 0.4156, + "step": 2199 + }, + { + "epoch": 0.176, + "grad_norm": 1.8921533823013306, + "learning_rate": 9.260405640201737e-06, + "loss": 0.4744, + "step": 2200 + }, + { + "epoch": 0.17608, + "grad_norm": 1.4645209312438965, + "learning_rate": 9.259747242755757e-06, + "loss": 0.3006, + "step": 2201 + }, + { + "epoch": 0.17616, + "grad_norm": 1.5621235370635986, + "learning_rate": 9.25908857580982e-06, + "loss": 0.3133, + "step": 2202 + }, + { + "epoch": 0.17624, + "grad_norm": 1.5191184282302856, + "learning_rate": 9.258429639405602e-06, + "loss": 0.3818, + "step": 2203 + }, + { + "epoch": 0.17632, + "grad_norm": 1.3126143217086792, + "learning_rate": 9.25777043358479e-06, + "loss": 0.2928, + "step": 2204 + }, + { + "epoch": 0.1764, + "grad_norm": 1.6664056777954102, + "learning_rate": 9.257110958389088e-06, + "loss": 0.3457, + "step": 2205 + }, + { + "epoch": 0.17648, + "grad_norm": 1.7404066324234009, + "learning_rate": 9.25645121386022e-06, + "loss": 0.5175, + "step": 2206 + }, + { + "epoch": 0.17656, + "grad_norm": 1.5736061334609985, + "learning_rate": 9.255791200039925e-06, + "loss": 0.3344, + "step": 2207 + }, + { + "epoch": 0.17664, + "grad_norm": 1.7708841562271118, + "learning_rate": 9.255130916969962e-06, + "loss": 0.3816, + "step": 2208 + }, + { + "epoch": 0.17672, + "grad_norm": 1.5940806865692139, + "learning_rate": 9.254470364692103e-06, + "loss": 0.2876, + "step": 2209 + }, + { + "epoch": 0.1768, + "grad_norm": 1.2727887630462646, + "learning_rate": 9.253809543248139e-06, + "loss": 0.3499, + "step": 2210 + }, + { + "epoch": 0.17688, + "grad_norm": 1.7963303327560425, + "learning_rate": 9.253148452679878e-06, + "loss": 0.4021, + "step": 2211 + }, + { + "epoch": 0.17696, + "grad_norm": 1.2776786088943481, + "learning_rate": 9.252487093029149e-06, + "loss": 0.2827, + "step": 2212 + }, + { + "epoch": 0.17704, + "grad_norm": 1.625144600868225, + "learning_rate": 9.251825464337785e-06, + "loss": 0.3478, + "step": 2213 + }, + { + "epoch": 0.17712, + "grad_norm": 1.4690876007080078, + "learning_rate": 9.251163566647655e-06, + "loss": 0.3588, + "step": 2214 + }, + { + "epoch": 0.1772, + "grad_norm": 1.4391189813613892, + "learning_rate": 9.250501400000628e-06, + "loss": 0.3249, + "step": 2215 + }, + { + "epoch": 0.17728, + "grad_norm": 1.5366137027740479, + "learning_rate": 9.249838964438602e-06, + "loss": 0.35, + "step": 2216 + }, + { + "epoch": 0.17736, + "grad_norm": 1.5445330142974854, + "learning_rate": 9.249176260003482e-06, + "loss": 0.3531, + "step": 2217 + }, + { + "epoch": 0.17744, + "grad_norm": 1.41277277469635, + "learning_rate": 9.248513286737199e-06, + "loss": 0.3252, + "step": 2218 + }, + { + "epoch": 0.17752, + "grad_norm": 1.4927526712417603, + "learning_rate": 9.247850044681698e-06, + "loss": 0.3272, + "step": 2219 + }, + { + "epoch": 0.1776, + "grad_norm": 1.9397213459014893, + "learning_rate": 9.247186533878936e-06, + "loss": 0.4147, + "step": 2220 + }, + { + "epoch": 0.17768, + "grad_norm": 1.734050989151001, + "learning_rate": 9.246522754370893e-06, + "loss": 0.3629, + "step": 2221 + }, + { + "epoch": 0.17776, + "grad_norm": 1.57426118850708, + "learning_rate": 9.245858706199565e-06, + "loss": 0.3241, + "step": 2222 + }, + { + "epoch": 0.17784, + "grad_norm": 1.3740696907043457, + "learning_rate": 9.245194389406961e-06, + "loss": 0.2866, + "step": 2223 + }, + { + "epoch": 0.17792, + "grad_norm": 1.3346598148345947, + "learning_rate": 9.244529804035116e-06, + "loss": 0.2516, + "step": 2224 + }, + { + "epoch": 0.178, + "grad_norm": 1.5931025743484497, + "learning_rate": 9.24386495012607e-06, + "loss": 0.4599, + "step": 2225 + }, + { + "epoch": 0.17808, + "grad_norm": 1.608982801437378, + "learning_rate": 9.24319982772189e-06, + "loss": 0.3688, + "step": 2226 + }, + { + "epoch": 0.17816, + "grad_norm": 1.6699520349502563, + "learning_rate": 9.242534436864654e-06, + "loss": 0.3289, + "step": 2227 + }, + { + "epoch": 0.17824, + "grad_norm": 1.524670958518982, + "learning_rate": 9.24186877759646e-06, + "loss": 0.3446, + "step": 2228 + }, + { + "epoch": 0.17832, + "grad_norm": 2.181579113006592, + "learning_rate": 9.241202849959422e-06, + "loss": 0.4242, + "step": 2229 + }, + { + "epoch": 0.1784, + "grad_norm": 1.4129284620285034, + "learning_rate": 9.240536653995671e-06, + "loss": 0.3339, + "step": 2230 + }, + { + "epoch": 0.17848, + "grad_norm": 1.5664548873901367, + "learning_rate": 9.239870189747355e-06, + "loss": 0.3472, + "step": 2231 + }, + { + "epoch": 0.17856, + "grad_norm": 2.093280076980591, + "learning_rate": 9.239203457256636e-06, + "loss": 0.4444, + "step": 2232 + }, + { + "epoch": 0.17864, + "grad_norm": 1.6080608367919922, + "learning_rate": 9.238536456565702e-06, + "loss": 0.4602, + "step": 2233 + }, + { + "epoch": 0.17872, + "grad_norm": 1.671555519104004, + "learning_rate": 9.237869187716747e-06, + "loss": 0.4864, + "step": 2234 + }, + { + "epoch": 0.1788, + "grad_norm": 1.5570496320724487, + "learning_rate": 9.237201650751987e-06, + "loss": 0.4289, + "step": 2235 + }, + { + "epoch": 0.17888, + "grad_norm": 1.9579259157180786, + "learning_rate": 9.23653384571366e-06, + "loss": 0.373, + "step": 2236 + }, + { + "epoch": 0.17896, + "grad_norm": 1.2836438417434692, + "learning_rate": 9.23586577264401e-06, + "loss": 0.2913, + "step": 2237 + }, + { + "epoch": 0.17904, + "grad_norm": 1.651989221572876, + "learning_rate": 9.235197431585305e-06, + "loss": 0.3415, + "step": 2238 + }, + { + "epoch": 0.17912, + "grad_norm": 1.5639806985855103, + "learning_rate": 9.23452882257983e-06, + "loss": 0.4252, + "step": 2239 + }, + { + "epoch": 0.1792, + "grad_norm": 1.5605380535125732, + "learning_rate": 9.233859945669888e-06, + "loss": 0.303, + "step": 2240 + }, + { + "epoch": 0.17928, + "grad_norm": 1.6003406047821045, + "learning_rate": 9.23319080089779e-06, + "loss": 0.3035, + "step": 2241 + }, + { + "epoch": 0.17936, + "grad_norm": 1.3260300159454346, + "learning_rate": 9.232521388305876e-06, + "loss": 0.2764, + "step": 2242 + }, + { + "epoch": 0.17944, + "grad_norm": 1.5889723300933838, + "learning_rate": 9.231851707936495e-06, + "loss": 0.3757, + "step": 2243 + }, + { + "epoch": 0.17952, + "grad_norm": 1.5519212484359741, + "learning_rate": 9.231181759832017e-06, + "loss": 0.3367, + "step": 2244 + }, + { + "epoch": 0.1796, + "grad_norm": 1.8289942741394043, + "learning_rate": 9.230511544034826e-06, + "loss": 0.3846, + "step": 2245 + }, + { + "epoch": 0.17968, + "grad_norm": 1.6226454973220825, + "learning_rate": 9.229841060587326e-06, + "loss": 0.3169, + "step": 2246 + }, + { + "epoch": 0.17976, + "grad_norm": 1.7149393558502197, + "learning_rate": 9.229170309531934e-06, + "loss": 0.3524, + "step": 2247 + }, + { + "epoch": 0.17984, + "grad_norm": 1.6193126440048218, + "learning_rate": 9.228499290911088e-06, + "loss": 0.3369, + "step": 2248 + }, + { + "epoch": 0.17992, + "grad_norm": 2.0542750358581543, + "learning_rate": 9.22782800476724e-06, + "loss": 0.3477, + "step": 2249 + }, + { + "epoch": 0.18, + "grad_norm": 1.3201754093170166, + "learning_rate": 9.227156451142863e-06, + "loss": 0.3066, + "step": 2250 + }, + { + "epoch": 0.18008, + "grad_norm": 1.7556428909301758, + "learning_rate": 9.226484630080439e-06, + "loss": 0.3791, + "step": 2251 + }, + { + "epoch": 0.18016, + "grad_norm": 1.8350175619125366, + "learning_rate": 9.225812541622474e-06, + "loss": 0.3668, + "step": 2252 + }, + { + "epoch": 0.18024, + "grad_norm": 1.2991888523101807, + "learning_rate": 9.22514018581149e-06, + "loss": 0.2949, + "step": 2253 + }, + { + "epoch": 0.18032, + "grad_norm": 1.7130646705627441, + "learning_rate": 9.224467562690022e-06, + "loss": 0.381, + "step": 2254 + }, + { + "epoch": 0.1804, + "grad_norm": 1.361568570137024, + "learning_rate": 9.22379467230063e-06, + "loss": 0.2958, + "step": 2255 + }, + { + "epoch": 0.18048, + "grad_norm": 1.8371490240097046, + "learning_rate": 9.22312151468588e-06, + "loss": 0.3656, + "step": 2256 + }, + { + "epoch": 0.18056, + "grad_norm": 1.5410816669464111, + "learning_rate": 9.22244808988836e-06, + "loss": 0.3638, + "step": 2257 + }, + { + "epoch": 0.18064, + "grad_norm": 1.7891284227371216, + "learning_rate": 9.22177439795068e-06, + "loss": 0.4046, + "step": 2258 + }, + { + "epoch": 0.18072, + "grad_norm": 1.0117267370224, + "learning_rate": 9.221100438915462e-06, + "loss": 0.2083, + "step": 2259 + }, + { + "epoch": 0.1808, + "grad_norm": 1.6379694938659668, + "learning_rate": 9.22042621282534e-06, + "loss": 0.3407, + "step": 2260 + }, + { + "epoch": 0.18088, + "grad_norm": 1.8293073177337646, + "learning_rate": 9.219751719722974e-06, + "loss": 0.3578, + "step": 2261 + }, + { + "epoch": 0.18096, + "grad_norm": 1.6874363422393799, + "learning_rate": 9.219076959651037e-06, + "loss": 0.438, + "step": 2262 + }, + { + "epoch": 0.18104, + "grad_norm": 1.3425878286361694, + "learning_rate": 9.218401932652217e-06, + "loss": 0.2945, + "step": 2263 + }, + { + "epoch": 0.18112, + "grad_norm": 1.3295488357543945, + "learning_rate": 9.21772663876922e-06, + "loss": 0.3008, + "step": 2264 + }, + { + "epoch": 0.1812, + "grad_norm": 1.479942798614502, + "learning_rate": 9.217051078044773e-06, + "loss": 0.3512, + "step": 2265 + }, + { + "epoch": 0.18128, + "grad_norm": 1.6754070520401, + "learning_rate": 9.216375250521614e-06, + "loss": 0.3106, + "step": 2266 + }, + { + "epoch": 0.18136, + "grad_norm": 1.2535821199417114, + "learning_rate": 9.215699156242501e-06, + "loss": 0.298, + "step": 2267 + }, + { + "epoch": 0.18144, + "grad_norm": 1.4116896390914917, + "learning_rate": 9.215022795250209e-06, + "loss": 0.3316, + "step": 2268 + }, + { + "epoch": 0.18152, + "grad_norm": 1.704809546470642, + "learning_rate": 9.214346167587529e-06, + "loss": 0.4056, + "step": 2269 + }, + { + "epoch": 0.1816, + "grad_norm": 1.3820918798446655, + "learning_rate": 9.213669273297266e-06, + "loss": 0.3467, + "step": 2270 + }, + { + "epoch": 0.18168, + "grad_norm": 1.1921701431274414, + "learning_rate": 9.212992112422248e-06, + "loss": 0.277, + "step": 2271 + }, + { + "epoch": 0.18176, + "grad_norm": 1.5377929210662842, + "learning_rate": 9.212314685005314e-06, + "loss": 0.3292, + "step": 2272 + }, + { + "epoch": 0.18184, + "grad_norm": 1.6128257513046265, + "learning_rate": 9.211636991089328e-06, + "loss": 0.3684, + "step": 2273 + }, + { + "epoch": 0.18192, + "grad_norm": 2.1811368465423584, + "learning_rate": 9.210959030717158e-06, + "loss": 0.4364, + "step": 2274 + }, + { + "epoch": 0.182, + "grad_norm": 1.9046738147735596, + "learning_rate": 9.210280803931702e-06, + "loss": 0.3347, + "step": 2275 + }, + { + "epoch": 0.18208, + "grad_norm": 1.7492754459381104, + "learning_rate": 9.209602310775868e-06, + "loss": 0.3851, + "step": 2276 + }, + { + "epoch": 0.18216, + "grad_norm": 1.6498044729232788, + "learning_rate": 9.208923551292578e-06, + "loss": 0.3078, + "step": 2277 + }, + { + "epoch": 0.18224, + "grad_norm": 1.6144391298294067, + "learning_rate": 9.208244525524782e-06, + "loss": 0.4138, + "step": 2278 + }, + { + "epoch": 0.18232, + "grad_norm": 1.871065616607666, + "learning_rate": 9.207565233515434e-06, + "loss": 0.4951, + "step": 2279 + }, + { + "epoch": 0.1824, + "grad_norm": 1.9937913417816162, + "learning_rate": 9.20688567530751e-06, + "loss": 0.4494, + "step": 2280 + }, + { + "epoch": 0.18248, + "grad_norm": 1.3968439102172852, + "learning_rate": 9.206205850944009e-06, + "loss": 0.2798, + "step": 2281 + }, + { + "epoch": 0.18256, + "grad_norm": 1.554082989692688, + "learning_rate": 9.205525760467937e-06, + "loss": 0.3656, + "step": 2282 + }, + { + "epoch": 0.18264, + "grad_norm": 2.032961130142212, + "learning_rate": 9.204845403922321e-06, + "loss": 0.4478, + "step": 2283 + }, + { + "epoch": 0.18272, + "grad_norm": 1.621554970741272, + "learning_rate": 9.204164781350207e-06, + "loss": 0.3502, + "step": 2284 + }, + { + "epoch": 0.1828, + "grad_norm": 1.4644954204559326, + "learning_rate": 9.203483892794652e-06, + "loss": 0.3379, + "step": 2285 + }, + { + "epoch": 0.18288, + "grad_norm": 1.294154167175293, + "learning_rate": 9.202802738298738e-06, + "loss": 0.2404, + "step": 2286 + }, + { + "epoch": 0.18296, + "grad_norm": 2.0851991176605225, + "learning_rate": 9.202121317905557e-06, + "loss": 0.3982, + "step": 2287 + }, + { + "epoch": 0.18304, + "grad_norm": 1.1855297088623047, + "learning_rate": 9.20143963165822e-06, + "loss": 0.2492, + "step": 2288 + }, + { + "epoch": 0.18312, + "grad_norm": 1.5839331150054932, + "learning_rate": 9.200757679599857e-06, + "loss": 0.2824, + "step": 2289 + }, + { + "epoch": 0.1832, + "grad_norm": 1.5208096504211426, + "learning_rate": 9.20007546177361e-06, + "loss": 0.3692, + "step": 2290 + }, + { + "epoch": 0.18328, + "grad_norm": 2.2178423404693604, + "learning_rate": 9.199392978222644e-06, + "loss": 0.436, + "step": 2291 + }, + { + "epoch": 0.18336, + "grad_norm": 1.7429447174072266, + "learning_rate": 9.198710228990132e-06, + "loss": 0.4171, + "step": 2292 + }, + { + "epoch": 0.18344, + "grad_norm": 1.6876320838928223, + "learning_rate": 9.198027214119275e-06, + "loss": 0.4079, + "step": 2293 + }, + { + "epoch": 0.18352, + "grad_norm": 2.200977087020874, + "learning_rate": 9.197343933653283e-06, + "loss": 0.4152, + "step": 2294 + }, + { + "epoch": 0.1836, + "grad_norm": 1.5130945444107056, + "learning_rate": 9.196660387635384e-06, + "loss": 0.2694, + "step": 2295 + }, + { + "epoch": 0.18368, + "grad_norm": 1.4931762218475342, + "learning_rate": 9.195976576108825e-06, + "loss": 0.4342, + "step": 2296 + }, + { + "epoch": 0.18376, + "grad_norm": 1.4749504327774048, + "learning_rate": 9.195292499116868e-06, + "loss": 0.3459, + "step": 2297 + }, + { + "epoch": 0.18384, + "grad_norm": 1.4674580097198486, + "learning_rate": 9.19460815670279e-06, + "loss": 0.3511, + "step": 2298 + }, + { + "epoch": 0.18392, + "grad_norm": 1.4509178400039673, + "learning_rate": 9.193923548909891e-06, + "loss": 0.279, + "step": 2299 + }, + { + "epoch": 0.184, + "grad_norm": 1.370303988456726, + "learning_rate": 9.193238675781482e-06, + "loss": 0.2597, + "step": 2300 + }, + { + "epoch": 0.18408, + "grad_norm": 1.7368767261505127, + "learning_rate": 9.19255353736089e-06, + "loss": 0.387, + "step": 2301 + }, + { + "epoch": 0.18416, + "grad_norm": 1.8166148662567139, + "learning_rate": 9.191868133691467e-06, + "loss": 0.3739, + "step": 2302 + }, + { + "epoch": 0.18424, + "grad_norm": 1.7067928314208984, + "learning_rate": 9.191182464816572e-06, + "loss": 0.3617, + "step": 2303 + }, + { + "epoch": 0.18432, + "grad_norm": 1.0539000034332275, + "learning_rate": 9.190496530779587e-06, + "loss": 0.2726, + "step": 2304 + }, + { + "epoch": 0.1844, + "grad_norm": 1.555287480354309, + "learning_rate": 9.189810331623908e-06, + "loss": 0.332, + "step": 2305 + }, + { + "epoch": 0.18448, + "grad_norm": 1.7620913982391357, + "learning_rate": 9.189123867392947e-06, + "loss": 0.4206, + "step": 2306 + }, + { + "epoch": 0.18456, + "grad_norm": 1.1607882976531982, + "learning_rate": 9.188437138130138e-06, + "loss": 0.242, + "step": 2307 + }, + { + "epoch": 0.18464, + "grad_norm": 1.4528453350067139, + "learning_rate": 9.187750143878924e-06, + "loss": 0.2846, + "step": 2308 + }, + { + "epoch": 0.18472, + "grad_norm": 1.9288702011108398, + "learning_rate": 9.187062884682772e-06, + "loss": 0.4231, + "step": 2309 + }, + { + "epoch": 0.1848, + "grad_norm": 1.4445855617523193, + "learning_rate": 9.186375360585159e-06, + "loss": 0.3683, + "step": 2310 + }, + { + "epoch": 0.18488, + "grad_norm": 1.6469016075134277, + "learning_rate": 9.185687571629587e-06, + "loss": 0.4188, + "step": 2311 + }, + { + "epoch": 0.18496, + "grad_norm": 1.7436647415161133, + "learning_rate": 9.184999517859566e-06, + "loss": 0.3646, + "step": 2312 + }, + { + "epoch": 0.18504, + "grad_norm": 2.044245719909668, + "learning_rate": 9.18431119931863e-06, + "loss": 0.4894, + "step": 2313 + }, + { + "epoch": 0.18512, + "grad_norm": 1.396704077720642, + "learning_rate": 9.183622616050323e-06, + "loss": 0.3058, + "step": 2314 + }, + { + "epoch": 0.1852, + "grad_norm": 1.5335050821304321, + "learning_rate": 9.182933768098213e-06, + "loss": 0.3617, + "step": 2315 + }, + { + "epoch": 0.18528, + "grad_norm": 1.3849726915359497, + "learning_rate": 9.18224465550588e-06, + "loss": 0.2886, + "step": 2316 + }, + { + "epoch": 0.18536, + "grad_norm": 1.3755298852920532, + "learning_rate": 9.181555278316921e-06, + "loss": 0.2749, + "step": 2317 + }, + { + "epoch": 0.18544, + "grad_norm": 1.4264538288116455, + "learning_rate": 9.180865636574951e-06, + "loss": 0.369, + "step": 2318 + }, + { + "epoch": 0.18552, + "grad_norm": 1.5905702114105225, + "learning_rate": 9.180175730323602e-06, + "loss": 0.3422, + "step": 2319 + }, + { + "epoch": 0.1856, + "grad_norm": 2.0537757873535156, + "learning_rate": 9.17948555960652e-06, + "loss": 0.3696, + "step": 2320 + }, + { + "epoch": 0.18568, + "grad_norm": 1.9900577068328857, + "learning_rate": 9.178795124467372e-06, + "loss": 0.5154, + "step": 2321 + }, + { + "epoch": 0.18576, + "grad_norm": 1.8288344144821167, + "learning_rate": 9.17810442494984e-06, + "loss": 0.3817, + "step": 2322 + }, + { + "epoch": 0.18584, + "grad_norm": 2.499269723892212, + "learning_rate": 9.17741346109762e-06, + "loss": 0.4914, + "step": 2323 + }, + { + "epoch": 0.18592, + "grad_norm": 1.8195888996124268, + "learning_rate": 9.176722232954426e-06, + "loss": 0.3946, + "step": 2324 + }, + { + "epoch": 0.186, + "grad_norm": 2.1742544174194336, + "learning_rate": 9.176030740563994e-06, + "loss": 0.465, + "step": 2325 + }, + { + "epoch": 0.18608, + "grad_norm": 2.0490312576293945, + "learning_rate": 9.175338983970071e-06, + "loss": 0.3448, + "step": 2326 + }, + { + "epoch": 0.18616, + "grad_norm": 1.3807387351989746, + "learning_rate": 9.17464696321642e-06, + "loss": 0.3351, + "step": 2327 + }, + { + "epoch": 0.18624, + "grad_norm": 1.89095938205719, + "learning_rate": 9.173954678346823e-06, + "loss": 0.336, + "step": 2328 + }, + { + "epoch": 0.18632, + "grad_norm": 1.5240287780761719, + "learning_rate": 9.173262129405081e-06, + "loss": 0.3218, + "step": 2329 + }, + { + "epoch": 0.1864, + "grad_norm": 1.6663240194320679, + "learning_rate": 9.172569316435008e-06, + "loss": 0.3771, + "step": 2330 + }, + { + "epoch": 0.18648, + "grad_norm": 1.4801119565963745, + "learning_rate": 9.171876239480435e-06, + "loss": 0.3371, + "step": 2331 + }, + { + "epoch": 0.18656, + "grad_norm": 1.4088448286056519, + "learning_rate": 9.17118289858521e-06, + "loss": 0.351, + "step": 2332 + }, + { + "epoch": 0.18664, + "grad_norm": 1.6157206296920776, + "learning_rate": 9.170489293793203e-06, + "loss": 0.3401, + "step": 2333 + }, + { + "epoch": 0.18672, + "grad_norm": 1.7214024066925049, + "learning_rate": 9.16979542514829e-06, + "loss": 0.3182, + "step": 2334 + }, + { + "epoch": 0.1868, + "grad_norm": 1.4012519121170044, + "learning_rate": 9.169101292694376e-06, + "loss": 0.3879, + "step": 2335 + }, + { + "epoch": 0.18688, + "grad_norm": 1.5938009023666382, + "learning_rate": 9.168406896475372e-06, + "loss": 0.3759, + "step": 2336 + }, + { + "epoch": 0.18696, + "grad_norm": 1.4147008657455444, + "learning_rate": 9.167712236535209e-06, + "loss": 0.2956, + "step": 2337 + }, + { + "epoch": 0.18704, + "grad_norm": 1.8677122592926025, + "learning_rate": 9.16701731291784e-06, + "loss": 0.4634, + "step": 2338 + }, + { + "epoch": 0.18712, + "grad_norm": 1.4628612995147705, + "learning_rate": 9.166322125667229e-06, + "loss": 0.2652, + "step": 2339 + }, + { + "epoch": 0.1872, + "grad_norm": 1.6055920124053955, + "learning_rate": 9.165626674827355e-06, + "loss": 0.4288, + "step": 2340 + }, + { + "epoch": 0.18728, + "grad_norm": 1.2751909494400024, + "learning_rate": 9.164930960442222e-06, + "loss": 0.249, + "step": 2341 + }, + { + "epoch": 0.18736, + "grad_norm": 1.565497875213623, + "learning_rate": 9.164234982555841e-06, + "loss": 0.3797, + "step": 2342 + }, + { + "epoch": 0.18744, + "grad_norm": 1.4904969930648804, + "learning_rate": 9.163538741212247e-06, + "loss": 0.3217, + "step": 2343 + }, + { + "epoch": 0.18752, + "grad_norm": 1.5334428548812866, + "learning_rate": 9.16284223645549e-06, + "loss": 0.3089, + "step": 2344 + }, + { + "epoch": 0.1876, + "grad_norm": 1.3491618633270264, + "learning_rate": 9.16214546832963e-06, + "loss": 0.3263, + "step": 2345 + }, + { + "epoch": 0.18768, + "grad_norm": 1.757620930671692, + "learning_rate": 9.161448436878755e-06, + "loss": 0.3392, + "step": 2346 + }, + { + "epoch": 0.18776, + "grad_norm": 1.591558814048767, + "learning_rate": 9.160751142146962e-06, + "loss": 0.3605, + "step": 2347 + }, + { + "epoch": 0.18784, + "grad_norm": 1.4992103576660156, + "learning_rate": 9.160053584178365e-06, + "loss": 0.3162, + "step": 2348 + }, + { + "epoch": 0.18792, + "grad_norm": 1.847265601158142, + "learning_rate": 9.159355763017099e-06, + "loss": 0.4105, + "step": 2349 + }, + { + "epoch": 0.188, + "grad_norm": 1.6127371788024902, + "learning_rate": 9.15865767870731e-06, + "loss": 0.345, + "step": 2350 + }, + { + "epoch": 0.18808, + "grad_norm": 1.6684445142745972, + "learning_rate": 9.157959331293165e-06, + "loss": 0.3925, + "step": 2351 + }, + { + "epoch": 0.18816, + "grad_norm": 1.3808155059814453, + "learning_rate": 9.157260720818843e-06, + "loss": 0.3334, + "step": 2352 + }, + { + "epoch": 0.18824, + "grad_norm": 1.176571249961853, + "learning_rate": 9.15656184732855e-06, + "loss": 0.2693, + "step": 2353 + }, + { + "epoch": 0.18832, + "grad_norm": 1.5287673473358154, + "learning_rate": 9.155862710866493e-06, + "loss": 0.3012, + "step": 2354 + }, + { + "epoch": 0.1884, + "grad_norm": 2.0192978382110596, + "learning_rate": 9.15516331147691e-06, + "loss": 0.5501, + "step": 2355 + }, + { + "epoch": 0.18848, + "grad_norm": 1.7251996994018555, + "learning_rate": 9.154463649204046e-06, + "loss": 0.4322, + "step": 2356 + }, + { + "epoch": 0.18856, + "grad_norm": 1.4777201414108276, + "learning_rate": 9.153763724092169e-06, + "loss": 0.3243, + "step": 2357 + }, + { + "epoch": 0.18864, + "grad_norm": 1.573727011680603, + "learning_rate": 9.15306353618556e-06, + "loss": 0.3615, + "step": 2358 + }, + { + "epoch": 0.18872, + "grad_norm": 1.976193904876709, + "learning_rate": 9.152363085528516e-06, + "loss": 0.3525, + "step": 2359 + }, + { + "epoch": 0.1888, + "grad_norm": 1.72635817527771, + "learning_rate": 9.151662372165354e-06, + "loss": 0.357, + "step": 2360 + }, + { + "epoch": 0.18888, + "grad_norm": 1.5015325546264648, + "learning_rate": 9.150961396140405e-06, + "loss": 0.3098, + "step": 2361 + }, + { + "epoch": 0.18896, + "grad_norm": 1.459358811378479, + "learning_rate": 9.15026015749802e-06, + "loss": 0.3554, + "step": 2362 + }, + { + "epoch": 0.18904, + "grad_norm": 1.7636947631835938, + "learning_rate": 9.149558656282557e-06, + "loss": 0.4678, + "step": 2363 + }, + { + "epoch": 0.18912, + "grad_norm": 1.615004539489746, + "learning_rate": 9.148856892538406e-06, + "loss": 0.3788, + "step": 2364 + }, + { + "epoch": 0.1892, + "grad_norm": 1.5762476921081543, + "learning_rate": 9.14815486630996e-06, + "loss": 0.3102, + "step": 2365 + }, + { + "epoch": 0.18928, + "grad_norm": 1.6073107719421387, + "learning_rate": 9.147452577641635e-06, + "loss": 0.3938, + "step": 2366 + }, + { + "epoch": 0.18936, + "grad_norm": 1.3423787355422974, + "learning_rate": 9.146750026577865e-06, + "loss": 0.2977, + "step": 2367 + }, + { + "epoch": 0.18944, + "grad_norm": 1.3909164667129517, + "learning_rate": 9.146047213163094e-06, + "loss": 0.256, + "step": 2368 + }, + { + "epoch": 0.18952, + "grad_norm": 1.6559034585952759, + "learning_rate": 9.14534413744179e-06, + "loss": 0.3731, + "step": 2369 + }, + { + "epoch": 0.1896, + "grad_norm": 1.2788715362548828, + "learning_rate": 9.14464079945843e-06, + "loss": 0.2787, + "step": 2370 + }, + { + "epoch": 0.18968, + "grad_norm": 1.9036865234375, + "learning_rate": 9.143937199257518e-06, + "loss": 0.3465, + "step": 2371 + }, + { + "epoch": 0.18976, + "grad_norm": 1.7238149642944336, + "learning_rate": 9.143233336883563e-06, + "loss": 0.3865, + "step": 2372 + }, + { + "epoch": 0.18984, + "grad_norm": 1.2368091344833374, + "learning_rate": 9.142529212381098e-06, + "loss": 0.2267, + "step": 2373 + }, + { + "epoch": 0.18992, + "grad_norm": 1.7011198997497559, + "learning_rate": 9.141824825794672e-06, + "loss": 0.3792, + "step": 2374 + }, + { + "epoch": 0.19, + "grad_norm": 1.9078706502914429, + "learning_rate": 9.141120177168846e-06, + "loss": 0.3637, + "step": 2375 + }, + { + "epoch": 0.19008, + "grad_norm": 2.0864484310150146, + "learning_rate": 9.140415266548203e-06, + "loss": 0.4524, + "step": 2376 + }, + { + "epoch": 0.19016, + "grad_norm": 1.4254752397537231, + "learning_rate": 9.139710093977342e-06, + "loss": 0.2662, + "step": 2377 + }, + { + "epoch": 0.19024, + "grad_norm": 1.6436759233474731, + "learning_rate": 9.139004659500874e-06, + "loss": 0.3361, + "step": 2378 + }, + { + "epoch": 0.19032, + "grad_norm": 2.105926752090454, + "learning_rate": 9.138298963163429e-06, + "loss": 0.3339, + "step": 2379 + }, + { + "epoch": 0.1904, + "grad_norm": 1.5537519454956055, + "learning_rate": 9.137593005009657e-06, + "loss": 0.3319, + "step": 2380 + }, + { + "epoch": 0.19048, + "grad_norm": 2.272524833679199, + "learning_rate": 9.136886785084217e-06, + "loss": 0.4195, + "step": 2381 + }, + { + "epoch": 0.19056, + "grad_norm": 1.2556356191635132, + "learning_rate": 9.136180303431797e-06, + "loss": 0.3392, + "step": 2382 + }, + { + "epoch": 0.19064, + "grad_norm": 1.5762921571731567, + "learning_rate": 9.135473560097086e-06, + "loss": 0.3597, + "step": 2383 + }, + { + "epoch": 0.19072, + "grad_norm": 1.3007557392120361, + "learning_rate": 9.1347665551248e-06, + "loss": 0.2813, + "step": 2384 + }, + { + "epoch": 0.1908, + "grad_norm": 1.295035719871521, + "learning_rate": 9.134059288559669e-06, + "loss": 0.3211, + "step": 2385 + }, + { + "epoch": 0.19088, + "grad_norm": 1.5391051769256592, + "learning_rate": 9.13335176044644e-06, + "loss": 0.3058, + "step": 2386 + }, + { + "epoch": 0.19096, + "grad_norm": 1.6031337976455688, + "learning_rate": 9.132643970829876e-06, + "loss": 0.3478, + "step": 2387 + }, + { + "epoch": 0.19104, + "grad_norm": 1.572896122932434, + "learning_rate": 9.131935919754755e-06, + "loss": 0.4024, + "step": 2388 + }, + { + "epoch": 0.19112, + "grad_norm": 1.4550867080688477, + "learning_rate": 9.131227607265874e-06, + "loss": 0.2725, + "step": 2389 + }, + { + "epoch": 0.1912, + "grad_norm": 2.0746469497680664, + "learning_rate": 9.130519033408045e-06, + "loss": 0.4477, + "step": 2390 + }, + { + "epoch": 0.19128, + "grad_norm": 1.5469022989273071, + "learning_rate": 9.129810198226099e-06, + "loss": 0.3219, + "step": 2391 + }, + { + "epoch": 0.19136, + "grad_norm": 1.4501341581344604, + "learning_rate": 9.12910110176488e-06, + "loss": 0.4573, + "step": 2392 + }, + { + "epoch": 0.19144, + "grad_norm": 1.1760106086730957, + "learning_rate": 9.128391744069248e-06, + "loss": 0.2149, + "step": 2393 + }, + { + "epoch": 0.19152, + "grad_norm": 1.8944004774093628, + "learning_rate": 9.127682125184085e-06, + "loss": 0.4243, + "step": 2394 + }, + { + "epoch": 0.1916, + "grad_norm": 1.382332682609558, + "learning_rate": 9.126972245154287e-06, + "loss": 0.2722, + "step": 2395 + }, + { + "epoch": 0.19168, + "grad_norm": 1.5882385969161987, + "learning_rate": 9.12626210402476e-06, + "loss": 0.4348, + "step": 2396 + }, + { + "epoch": 0.19176, + "grad_norm": 1.7748380899429321, + "learning_rate": 9.12555170184044e-06, + "loss": 0.3083, + "step": 2397 + }, + { + "epoch": 0.19184, + "grad_norm": 1.7226957082748413, + "learning_rate": 9.124841038646268e-06, + "loss": 0.3241, + "step": 2398 + }, + { + "epoch": 0.19192, + "grad_norm": 1.642120599746704, + "learning_rate": 9.124130114487203e-06, + "loss": 0.3724, + "step": 2399 + }, + { + "epoch": 0.192, + "grad_norm": 1.916398048400879, + "learning_rate": 9.123418929408225e-06, + "loss": 0.4925, + "step": 2400 + }, + { + "epoch": 0.19208, + "grad_norm": 1.2868858575820923, + "learning_rate": 9.12270748345433e-06, + "loss": 0.3048, + "step": 2401 + }, + { + "epoch": 0.19216, + "grad_norm": 1.6964247226715088, + "learning_rate": 9.121995776670527e-06, + "loss": 0.3746, + "step": 2402 + }, + { + "epoch": 0.19224, + "grad_norm": 1.710965871810913, + "learning_rate": 9.121283809101843e-06, + "loss": 0.4557, + "step": 2403 + }, + { + "epoch": 0.19232, + "grad_norm": 1.4680863618850708, + "learning_rate": 9.120571580793322e-06, + "loss": 0.303, + "step": 2404 + }, + { + "epoch": 0.1924, + "grad_norm": 1.802053451538086, + "learning_rate": 9.119859091790025e-06, + "loss": 0.3215, + "step": 2405 + }, + { + "epoch": 0.19248, + "grad_norm": 1.676236867904663, + "learning_rate": 9.119146342137029e-06, + "loss": 0.4425, + "step": 2406 + }, + { + "epoch": 0.19256, + "grad_norm": 1.6125231981277466, + "learning_rate": 9.118433331879424e-06, + "loss": 0.3079, + "step": 2407 + }, + { + "epoch": 0.19264, + "grad_norm": 1.313044548034668, + "learning_rate": 9.117720061062324e-06, + "loss": 0.3317, + "step": 2408 + }, + { + "epoch": 0.19272, + "grad_norm": 1.5678797960281372, + "learning_rate": 9.117006529730853e-06, + "loss": 0.311, + "step": 2409 + }, + { + "epoch": 0.1928, + "grad_norm": 1.5004569292068481, + "learning_rate": 9.116292737930156e-06, + "loss": 0.3152, + "step": 2410 + }, + { + "epoch": 0.19288, + "grad_norm": 1.86920964717865, + "learning_rate": 9.115578685705391e-06, + "loss": 0.3537, + "step": 2411 + }, + { + "epoch": 0.19296, + "grad_norm": 1.8753119707107544, + "learning_rate": 9.114864373101733e-06, + "loss": 0.4428, + "step": 2412 + }, + { + "epoch": 0.19304, + "grad_norm": 1.3418453931808472, + "learning_rate": 9.114149800164372e-06, + "loss": 0.2649, + "step": 2413 + }, + { + "epoch": 0.19312, + "grad_norm": 1.6189359426498413, + "learning_rate": 9.11343496693852e-06, + "loss": 0.3543, + "step": 2414 + }, + { + "epoch": 0.1932, + "grad_norm": 1.6323224306106567, + "learning_rate": 9.112719873469403e-06, + "loss": 0.3309, + "step": 2415 + }, + { + "epoch": 0.19328, + "grad_norm": 1.9282023906707764, + "learning_rate": 9.11200451980226e-06, + "loss": 0.3991, + "step": 2416 + }, + { + "epoch": 0.19336, + "grad_norm": 1.5333116054534912, + "learning_rate": 9.11128890598235e-06, + "loss": 0.3514, + "step": 2417 + }, + { + "epoch": 0.19344, + "grad_norm": 1.4084820747375488, + "learning_rate": 9.110573032054947e-06, + "loss": 0.3691, + "step": 2418 + }, + { + "epoch": 0.19352, + "grad_norm": 1.6476877927780151, + "learning_rate": 9.109856898065343e-06, + "loss": 0.3209, + "step": 2419 + }, + { + "epoch": 0.1936, + "grad_norm": 1.7577415704727173, + "learning_rate": 9.109140504058843e-06, + "loss": 0.3974, + "step": 2420 + }, + { + "epoch": 0.19368, + "grad_norm": 1.454849123954773, + "learning_rate": 9.108423850080774e-06, + "loss": 0.2629, + "step": 2421 + }, + { + "epoch": 0.19376, + "grad_norm": 1.6696566343307495, + "learning_rate": 9.107706936176474e-06, + "loss": 0.3151, + "step": 2422 + }, + { + "epoch": 0.19384, + "grad_norm": 2.0093162059783936, + "learning_rate": 9.1069897623913e-06, + "loss": 0.3786, + "step": 2423 + }, + { + "epoch": 0.19392, + "grad_norm": 1.7907012701034546, + "learning_rate": 9.106272328770627e-06, + "loss": 0.3346, + "step": 2424 + }, + { + "epoch": 0.194, + "grad_norm": 1.5353763103485107, + "learning_rate": 9.105554635359843e-06, + "loss": 0.3275, + "step": 2425 + }, + { + "epoch": 0.19408, + "grad_norm": 1.6136126518249512, + "learning_rate": 9.104836682204354e-06, + "loss": 0.3316, + "step": 2426 + }, + { + "epoch": 0.19416, + "grad_norm": 1.877288579940796, + "learning_rate": 9.104118469349585e-06, + "loss": 0.3735, + "step": 2427 + }, + { + "epoch": 0.19424, + "grad_norm": 1.869982361793518, + "learning_rate": 9.103399996840972e-06, + "loss": 0.4767, + "step": 2428 + }, + { + "epoch": 0.19432, + "grad_norm": 1.6566518545150757, + "learning_rate": 9.102681264723969e-06, + "loss": 0.3472, + "step": 2429 + }, + { + "epoch": 0.1944, + "grad_norm": 1.631504774093628, + "learning_rate": 9.101962273044053e-06, + "loss": 0.3334, + "step": 2430 + }, + { + "epoch": 0.19448, + "grad_norm": 1.7339836359024048, + "learning_rate": 9.101243021846705e-06, + "loss": 0.4071, + "step": 2431 + }, + { + "epoch": 0.19456, + "grad_norm": 1.6289665699005127, + "learning_rate": 9.10052351117744e-06, + "loss": 0.4196, + "step": 2432 + }, + { + "epoch": 0.19464, + "grad_norm": 1.76044762134552, + "learning_rate": 9.099803741081767e-06, + "loss": 0.4187, + "step": 2433 + }, + { + "epoch": 0.19472, + "grad_norm": 1.4212559461593628, + "learning_rate": 9.099083711605233e-06, + "loss": 0.3407, + "step": 2434 + }, + { + "epoch": 0.1948, + "grad_norm": 1.439171314239502, + "learning_rate": 9.098363422793387e-06, + "loss": 0.2902, + "step": 2435 + }, + { + "epoch": 0.19488, + "grad_norm": 1.539538025856018, + "learning_rate": 9.0976428746918e-06, + "loss": 0.3686, + "step": 2436 + }, + { + "epoch": 0.19496, + "grad_norm": 1.2634087800979614, + "learning_rate": 9.09692206734606e-06, + "loss": 0.2648, + "step": 2437 + }, + { + "epoch": 0.19504, + "grad_norm": 1.2132208347320557, + "learning_rate": 9.096201000801768e-06, + "loss": 0.2667, + "step": 2438 + }, + { + "epoch": 0.19512, + "grad_norm": 1.5921686887741089, + "learning_rate": 9.095479675104543e-06, + "loss": 0.3618, + "step": 2439 + }, + { + "epoch": 0.1952, + "grad_norm": 1.5952152013778687, + "learning_rate": 9.094758090300026e-06, + "loss": 0.3662, + "step": 2440 + }, + { + "epoch": 0.19528, + "grad_norm": 1.3034418821334839, + "learning_rate": 9.094036246433863e-06, + "loss": 0.2761, + "step": 2441 + }, + { + "epoch": 0.19536, + "grad_norm": 1.6235853433609009, + "learning_rate": 9.093314143551728e-06, + "loss": 0.4082, + "step": 2442 + }, + { + "epoch": 0.19544, + "grad_norm": 1.4469619989395142, + "learning_rate": 9.092591781699302e-06, + "loss": 0.3711, + "step": 2443 + }, + { + "epoch": 0.19552, + "grad_norm": 1.574036717414856, + "learning_rate": 9.09186916092229e-06, + "loss": 0.399, + "step": 2444 + }, + { + "epoch": 0.1956, + "grad_norm": 1.363416314125061, + "learning_rate": 9.091146281266403e-06, + "loss": 0.3059, + "step": 2445 + }, + { + "epoch": 0.19568, + "grad_norm": 1.4807307720184326, + "learning_rate": 9.090423142777383e-06, + "loss": 0.3728, + "step": 2446 + }, + { + "epoch": 0.19576, + "grad_norm": 1.6184738874435425, + "learning_rate": 9.089699745500977e-06, + "loss": 0.3375, + "step": 2447 + }, + { + "epoch": 0.19584, + "grad_norm": 1.5262713432312012, + "learning_rate": 9.08897608948295e-06, + "loss": 0.3389, + "step": 2448 + }, + { + "epoch": 0.19592, + "grad_norm": 1.2660731077194214, + "learning_rate": 9.088252174769092e-06, + "loss": 0.2845, + "step": 2449 + }, + { + "epoch": 0.196, + "grad_norm": 2.0074665546417236, + "learning_rate": 9.087528001405194e-06, + "loss": 0.3709, + "step": 2450 + }, + { + "epoch": 0.19608, + "grad_norm": 1.678610920906067, + "learning_rate": 9.08680356943708e-06, + "loss": 0.3842, + "step": 2451 + }, + { + "epoch": 0.19616, + "grad_norm": 1.604011058807373, + "learning_rate": 9.086078878910576e-06, + "loss": 0.305, + "step": 2452 + }, + { + "epoch": 0.19624, + "grad_norm": 1.7445040941238403, + "learning_rate": 9.085353929871534e-06, + "loss": 0.3833, + "step": 2453 + }, + { + "epoch": 0.19632, + "grad_norm": 1.6903281211853027, + "learning_rate": 9.084628722365817e-06, + "loss": 0.3579, + "step": 2454 + }, + { + "epoch": 0.1964, + "grad_norm": 1.587699294090271, + "learning_rate": 9.08390325643931e-06, + "loss": 0.324, + "step": 2455 + }, + { + "epoch": 0.19648, + "grad_norm": 1.681836724281311, + "learning_rate": 9.083177532137909e-06, + "loss": 0.3006, + "step": 2456 + }, + { + "epoch": 0.19656, + "grad_norm": 1.259889006614685, + "learning_rate": 9.082451549507528e-06, + "loss": 0.3757, + "step": 2457 + }, + { + "epoch": 0.19664, + "grad_norm": 1.4303218126296997, + "learning_rate": 9.081725308594096e-06, + "loss": 0.3122, + "step": 2458 + }, + { + "epoch": 0.19672, + "grad_norm": 1.956416130065918, + "learning_rate": 9.080998809443563e-06, + "loss": 0.5083, + "step": 2459 + }, + { + "epoch": 0.1968, + "grad_norm": 1.0173094272613525, + "learning_rate": 9.080272052101888e-06, + "loss": 0.204, + "step": 2460 + }, + { + "epoch": 0.19688, + "grad_norm": 1.6032788753509521, + "learning_rate": 9.079545036615054e-06, + "loss": 0.4081, + "step": 2461 + }, + { + "epoch": 0.19696, + "grad_norm": 1.881056547164917, + "learning_rate": 9.078817763029054e-06, + "loss": 0.4511, + "step": 2462 + }, + { + "epoch": 0.19704, + "grad_norm": 1.7417298555374146, + "learning_rate": 9.078090231389904e-06, + "loss": 0.307, + "step": 2463 + }, + { + "epoch": 0.19712, + "grad_norm": 1.7411683797836304, + "learning_rate": 9.077362441743632e-06, + "loss": 0.3968, + "step": 2464 + }, + { + "epoch": 0.1972, + "grad_norm": 1.9373418092727661, + "learning_rate": 9.076634394136279e-06, + "loss": 0.5019, + "step": 2465 + }, + { + "epoch": 0.19728, + "grad_norm": 1.9146742820739746, + "learning_rate": 9.075906088613909e-06, + "loss": 0.421, + "step": 2466 + }, + { + "epoch": 0.19736, + "grad_norm": 1.7911179065704346, + "learning_rate": 9.075177525222597e-06, + "loss": 0.3551, + "step": 2467 + }, + { + "epoch": 0.19744, + "grad_norm": 2.1474406719207764, + "learning_rate": 9.074448704008441e-06, + "loss": 0.4482, + "step": 2468 + }, + { + "epoch": 0.19752, + "grad_norm": 1.610866665840149, + "learning_rate": 9.073719625017548e-06, + "loss": 0.4025, + "step": 2469 + }, + { + "epoch": 0.1976, + "grad_norm": 1.5338889360427856, + "learning_rate": 9.072990288296044e-06, + "loss": 0.3913, + "step": 2470 + }, + { + "epoch": 0.19768, + "grad_norm": 1.8097115755081177, + "learning_rate": 9.072260693890073e-06, + "loss": 0.4809, + "step": 2471 + }, + { + "epoch": 0.19776, + "grad_norm": 1.3825757503509521, + "learning_rate": 9.071530841845794e-06, + "loss": 0.3793, + "step": 2472 + }, + { + "epoch": 0.19784, + "grad_norm": 1.5398048162460327, + "learning_rate": 9.070800732209382e-06, + "loss": 0.3685, + "step": 2473 + }, + { + "epoch": 0.19792, + "grad_norm": 1.705269694328308, + "learning_rate": 9.070070365027029e-06, + "loss": 0.5052, + "step": 2474 + }, + { + "epoch": 0.198, + "grad_norm": 1.5767617225646973, + "learning_rate": 9.069339740344943e-06, + "loss": 0.3937, + "step": 2475 + }, + { + "epoch": 0.19808, + "grad_norm": 1.8536933660507202, + "learning_rate": 9.068608858209347e-06, + "loss": 0.5109, + "step": 2476 + }, + { + "epoch": 0.19816, + "grad_norm": 1.6715956926345825, + "learning_rate": 9.067877718666482e-06, + "loss": 0.3558, + "step": 2477 + }, + { + "epoch": 0.19824, + "grad_norm": 1.6765884160995483, + "learning_rate": 9.067146321762603e-06, + "loss": 0.3681, + "step": 2478 + }, + { + "epoch": 0.19832, + "grad_norm": 1.7231676578521729, + "learning_rate": 9.066414667543988e-06, + "loss": 0.3274, + "step": 2479 + }, + { + "epoch": 0.1984, + "grad_norm": 1.4541349411010742, + "learning_rate": 9.065682756056922e-06, + "loss": 0.3247, + "step": 2480 + }, + { + "epoch": 0.19848, + "grad_norm": 1.4403581619262695, + "learning_rate": 9.064950587347711e-06, + "loss": 0.4417, + "step": 2481 + }, + { + "epoch": 0.19856, + "grad_norm": 1.7160850763320923, + "learning_rate": 9.06421816146268e-06, + "loss": 0.3751, + "step": 2482 + }, + { + "epoch": 0.19864, + "grad_norm": 1.7260886430740356, + "learning_rate": 9.063485478448164e-06, + "loss": 0.3181, + "step": 2483 + }, + { + "epoch": 0.19872, + "grad_norm": 2.1682159900665283, + "learning_rate": 9.062752538350517e-06, + "loss": 0.4809, + "step": 2484 + }, + { + "epoch": 0.1988, + "grad_norm": 1.7218812704086304, + "learning_rate": 9.062019341216112e-06, + "loss": 0.3777, + "step": 2485 + }, + { + "epoch": 0.19888, + "grad_norm": 1.914481520652771, + "learning_rate": 9.061285887091334e-06, + "loss": 0.4097, + "step": 2486 + }, + { + "epoch": 0.19896, + "grad_norm": 1.4902898073196411, + "learning_rate": 9.060552176022587e-06, + "loss": 0.3261, + "step": 2487 + }, + { + "epoch": 0.19904, + "grad_norm": 1.306275725364685, + "learning_rate": 9.059818208056293e-06, + "loss": 0.2747, + "step": 2488 + }, + { + "epoch": 0.19912, + "grad_norm": 1.6835832595825195, + "learning_rate": 9.059083983238882e-06, + "loss": 0.3671, + "step": 2489 + }, + { + "epoch": 0.1992, + "grad_norm": 1.5236090421676636, + "learning_rate": 9.05834950161681e-06, + "loss": 0.359, + "step": 2490 + }, + { + "epoch": 0.19928, + "grad_norm": 1.5683189630508423, + "learning_rate": 9.057614763236545e-06, + "loss": 0.3818, + "step": 2491 + }, + { + "epoch": 0.19936, + "grad_norm": 1.9629745483398438, + "learning_rate": 9.056879768144572e-06, + "loss": 0.4597, + "step": 2492 + }, + { + "epoch": 0.19944, + "grad_norm": 1.8202420473098755, + "learning_rate": 9.056144516387387e-06, + "loss": 0.3821, + "step": 2493 + }, + { + "epoch": 0.19952, + "grad_norm": 2.0477683544158936, + "learning_rate": 9.055409008011513e-06, + "loss": 0.5704, + "step": 2494 + }, + { + "epoch": 0.1996, + "grad_norm": 1.5534504652023315, + "learning_rate": 9.05467324306348e-06, + "loss": 0.2988, + "step": 2495 + }, + { + "epoch": 0.19968, + "grad_norm": 1.8985226154327393, + "learning_rate": 9.053937221589837e-06, + "loss": 0.4126, + "step": 2496 + }, + { + "epoch": 0.19976, + "grad_norm": 1.7892488241195679, + "learning_rate": 9.05320094363715e-06, + "loss": 0.4152, + "step": 2497 + }, + { + "epoch": 0.19984, + "grad_norm": 1.563428282737732, + "learning_rate": 9.052464409252003e-06, + "loss": 0.3499, + "step": 2498 + }, + { + "epoch": 0.19992, + "grad_norm": 1.7737313508987427, + "learning_rate": 9.051727618480992e-06, + "loss": 0.4627, + "step": 2499 + }, + { + "epoch": 0.2, + "grad_norm": 1.461615800857544, + "learning_rate": 9.050990571370731e-06, + "loss": 0.3934, + "step": 2500 + }, + { + "epoch": 0.20008, + "grad_norm": 1.1548340320587158, + "learning_rate": 9.050253267967852e-06, + "loss": 0.2507, + "step": 2501 + }, + { + "epoch": 0.20016, + "grad_norm": 1.709346055984497, + "learning_rate": 9.049515708319001e-06, + "loss": 0.3237, + "step": 2502 + }, + { + "epoch": 0.20024, + "grad_norm": 1.4969236850738525, + "learning_rate": 9.048777892470841e-06, + "loss": 0.2986, + "step": 2503 + }, + { + "epoch": 0.20032, + "grad_norm": 1.687313199043274, + "learning_rate": 9.048039820470049e-06, + "loss": 0.3521, + "step": 2504 + }, + { + "epoch": 0.2004, + "grad_norm": 1.6837893724441528, + "learning_rate": 9.047301492363325e-06, + "loss": 0.376, + "step": 2505 + }, + { + "epoch": 0.20048, + "grad_norm": 1.6461684703826904, + "learning_rate": 9.046562908197376e-06, + "loss": 0.3472, + "step": 2506 + }, + { + "epoch": 0.20056, + "grad_norm": 1.2622382640838623, + "learning_rate": 9.045824068018934e-06, + "loss": 0.259, + "step": 2507 + }, + { + "epoch": 0.20064, + "grad_norm": 1.7232669591903687, + "learning_rate": 9.045084971874738e-06, + "loss": 0.4805, + "step": 2508 + }, + { + "epoch": 0.20072, + "grad_norm": 1.287067174911499, + "learning_rate": 9.044345619811552e-06, + "loss": 0.2888, + "step": 2509 + }, + { + "epoch": 0.2008, + "grad_norm": 1.4580618143081665, + "learning_rate": 9.04360601187615e-06, + "loss": 0.3821, + "step": 2510 + }, + { + "epoch": 0.20088, + "grad_norm": 1.244520902633667, + "learning_rate": 9.042866148115325e-06, + "loss": 0.276, + "step": 2511 + }, + { + "epoch": 0.20096, + "grad_norm": 1.3449219465255737, + "learning_rate": 9.042126028575889e-06, + "loss": 0.2915, + "step": 2512 + }, + { + "epoch": 0.20104, + "grad_norm": 1.5387424230575562, + "learning_rate": 9.041385653304664e-06, + "loss": 0.3284, + "step": 2513 + }, + { + "epoch": 0.20112, + "grad_norm": 2.1138205528259277, + "learning_rate": 9.04064502234849e-06, + "loss": 0.4097, + "step": 2514 + }, + { + "epoch": 0.2012, + "grad_norm": 1.5106236934661865, + "learning_rate": 9.039904135754225e-06, + "loss": 0.3061, + "step": 2515 + }, + { + "epoch": 0.20128, + "grad_norm": 1.434637427330017, + "learning_rate": 9.039162993568743e-06, + "loss": 0.3543, + "step": 2516 + }, + { + "epoch": 0.20136, + "grad_norm": 1.5437572002410889, + "learning_rate": 9.038421595838934e-06, + "loss": 0.3407, + "step": 2517 + }, + { + "epoch": 0.20144, + "grad_norm": 1.3405263423919678, + "learning_rate": 9.037679942611704e-06, + "loss": 0.2682, + "step": 2518 + }, + { + "epoch": 0.20152, + "grad_norm": 1.4590741395950317, + "learning_rate": 9.036938033933973e-06, + "loss": 0.3135, + "step": 2519 + }, + { + "epoch": 0.2016, + "grad_norm": 1.43552565574646, + "learning_rate": 9.03619586985268e-06, + "loss": 0.2935, + "step": 2520 + }, + { + "epoch": 0.20168, + "grad_norm": 1.7763866186141968, + "learning_rate": 9.035453450414779e-06, + "loss": 0.3757, + "step": 2521 + }, + { + "epoch": 0.20176, + "grad_norm": 1.8434540033340454, + "learning_rate": 9.034710775667242e-06, + "loss": 0.4078, + "step": 2522 + }, + { + "epoch": 0.20184, + "grad_norm": 1.806249737739563, + "learning_rate": 9.033967845657054e-06, + "loss": 0.4245, + "step": 2523 + }, + { + "epoch": 0.20192, + "grad_norm": 1.451335072517395, + "learning_rate": 9.033224660431219e-06, + "loss": 0.3327, + "step": 2524 + }, + { + "epoch": 0.202, + "grad_norm": 1.4611728191375732, + "learning_rate": 9.032481220036754e-06, + "loss": 0.3051, + "step": 2525 + }, + { + "epoch": 0.20208, + "grad_norm": 1.524274230003357, + "learning_rate": 9.031737524520697e-06, + "loss": 0.3235, + "step": 2526 + }, + { + "epoch": 0.20216, + "grad_norm": 1.7024296522140503, + "learning_rate": 9.030993573930094e-06, + "loss": 0.3785, + "step": 2527 + }, + { + "epoch": 0.20224, + "grad_norm": 0.9294398427009583, + "learning_rate": 9.030249368312015e-06, + "loss": 0.1819, + "step": 2528 + }, + { + "epoch": 0.20232, + "grad_norm": 1.809221625328064, + "learning_rate": 9.029504907713547e-06, + "loss": 0.3686, + "step": 2529 + }, + { + "epoch": 0.2024, + "grad_norm": 1.8182200193405151, + "learning_rate": 9.028760192181785e-06, + "loss": 0.3476, + "step": 2530 + }, + { + "epoch": 0.20248, + "grad_norm": 1.5775073766708374, + "learning_rate": 9.028015221763844e-06, + "loss": 0.3117, + "step": 2531 + }, + { + "epoch": 0.20256, + "grad_norm": 1.9079387187957764, + "learning_rate": 9.02726999650686e-06, + "loss": 0.4509, + "step": 2532 + }, + { + "epoch": 0.20264, + "grad_norm": 1.451041579246521, + "learning_rate": 9.026524516457977e-06, + "loss": 0.3606, + "step": 2533 + }, + { + "epoch": 0.20272, + "grad_norm": 1.1723933219909668, + "learning_rate": 9.025778781664361e-06, + "loss": 0.2418, + "step": 2534 + }, + { + "epoch": 0.2028, + "grad_norm": 1.693428635597229, + "learning_rate": 9.025032792173193e-06, + "loss": 0.3352, + "step": 2535 + }, + { + "epoch": 0.20288, + "grad_norm": 1.216301441192627, + "learning_rate": 9.024286548031666e-06, + "loss": 0.334, + "step": 2536 + }, + { + "epoch": 0.20296, + "grad_norm": 1.6559041738510132, + "learning_rate": 9.023540049286996e-06, + "loss": 0.3532, + "step": 2537 + }, + { + "epoch": 0.20304, + "grad_norm": 1.55605149269104, + "learning_rate": 9.02279329598641e-06, + "loss": 0.3522, + "step": 2538 + }, + { + "epoch": 0.20312, + "grad_norm": 1.8087921142578125, + "learning_rate": 9.022046288177153e-06, + "loss": 0.3933, + "step": 2539 + }, + { + "epoch": 0.2032, + "grad_norm": 1.6193270683288574, + "learning_rate": 9.021299025906482e-06, + "loss": 0.3471, + "step": 2540 + }, + { + "epoch": 0.20328, + "grad_norm": 1.2611744403839111, + "learning_rate": 9.02055150922168e-06, + "loss": 0.2635, + "step": 2541 + }, + { + "epoch": 0.20336, + "grad_norm": 1.7183457612991333, + "learning_rate": 9.019803738170036e-06, + "loss": 0.3218, + "step": 2542 + }, + { + "epoch": 0.20344, + "grad_norm": 1.3315821886062622, + "learning_rate": 9.01905571279886e-06, + "loss": 0.2472, + "step": 2543 + }, + { + "epoch": 0.20352, + "grad_norm": 1.7462687492370605, + "learning_rate": 9.018307433155477e-06, + "loss": 0.3837, + "step": 2544 + }, + { + "epoch": 0.2036, + "grad_norm": 2.0826213359832764, + "learning_rate": 9.017558899287226e-06, + "loss": 0.4974, + "step": 2545 + }, + { + "epoch": 0.20368, + "grad_norm": 1.5876009464263916, + "learning_rate": 9.01681011124147e-06, + "loss": 0.3448, + "step": 2546 + }, + { + "epoch": 0.20376, + "grad_norm": 1.6092311143875122, + "learning_rate": 9.016061069065576e-06, + "loss": 0.2966, + "step": 2547 + }, + { + "epoch": 0.20384, + "grad_norm": 1.5083656311035156, + "learning_rate": 9.015311772806937e-06, + "loss": 0.3039, + "step": 2548 + }, + { + "epoch": 0.20392, + "grad_norm": 1.6449142694473267, + "learning_rate": 9.014562222512954e-06, + "loss": 0.3372, + "step": 2549 + }, + { + "epoch": 0.204, + "grad_norm": 1.5135823488235474, + "learning_rate": 9.013812418231055e-06, + "loss": 0.2915, + "step": 2550 + }, + { + "epoch": 0.20408, + "grad_norm": 2.0886871814727783, + "learning_rate": 9.013062360008675e-06, + "loss": 0.3916, + "step": 2551 + }, + { + "epoch": 0.20416, + "grad_norm": 1.2576608657836914, + "learning_rate": 9.012312047893265e-06, + "loss": 0.2639, + "step": 2552 + }, + { + "epoch": 0.20424, + "grad_norm": 1.7450748682022095, + "learning_rate": 9.011561481932301e-06, + "loss": 0.3559, + "step": 2553 + }, + { + "epoch": 0.20432, + "grad_norm": 1.6265859603881836, + "learning_rate": 9.010810662173262e-06, + "loss": 0.3201, + "step": 2554 + }, + { + "epoch": 0.2044, + "grad_norm": 1.57313072681427, + "learning_rate": 9.010059588663651e-06, + "loss": 0.4484, + "step": 2555 + }, + { + "epoch": 0.20448, + "grad_norm": 1.5142831802368164, + "learning_rate": 9.00930826145099e-06, + "loss": 0.3172, + "step": 2556 + }, + { + "epoch": 0.20456, + "grad_norm": 1.7187414169311523, + "learning_rate": 9.00855668058281e-06, + "loss": 0.3237, + "step": 2557 + }, + { + "epoch": 0.20464, + "grad_norm": 1.5038609504699707, + "learning_rate": 9.007804846106662e-06, + "loss": 0.3068, + "step": 2558 + }, + { + "epoch": 0.20472, + "grad_norm": 1.3688982725143433, + "learning_rate": 9.007052758070111e-06, + "loss": 0.2904, + "step": 2559 + }, + { + "epoch": 0.2048, + "grad_norm": 1.5706552267074585, + "learning_rate": 9.00630041652074e-06, + "loss": 0.3624, + "step": 2560 + }, + { + "epoch": 0.20488, + "grad_norm": 1.5629266500473022, + "learning_rate": 9.005547821506145e-06, + "loss": 0.3069, + "step": 2561 + }, + { + "epoch": 0.20496, + "grad_norm": 1.3994823694229126, + "learning_rate": 9.004794973073943e-06, + "loss": 0.2774, + "step": 2562 + }, + { + "epoch": 0.20504, + "grad_norm": 1.459088921546936, + "learning_rate": 9.004041871271763e-06, + "loss": 0.3553, + "step": 2563 + }, + { + "epoch": 0.20512, + "grad_norm": 1.7317893505096436, + "learning_rate": 9.003288516147253e-06, + "loss": 0.3853, + "step": 2564 + }, + { + "epoch": 0.2052, + "grad_norm": 2.0651421546936035, + "learning_rate": 9.002534907748071e-06, + "loss": 0.6532, + "step": 2565 + }, + { + "epoch": 0.20528, + "grad_norm": 1.663630485534668, + "learning_rate": 9.0017810461219e-06, + "loss": 0.3526, + "step": 2566 + }, + { + "epoch": 0.20536, + "grad_norm": 1.511698842048645, + "learning_rate": 9.00102693131643e-06, + "loss": 0.2987, + "step": 2567 + }, + { + "epoch": 0.20544, + "grad_norm": 1.929552674293518, + "learning_rate": 9.000272563379375e-06, + "loss": 0.4855, + "step": 2568 + }, + { + "epoch": 0.20552, + "grad_norm": 1.6176848411560059, + "learning_rate": 8.99951794235846e-06, + "loss": 0.3513, + "step": 2569 + }, + { + "epoch": 0.2056, + "grad_norm": 1.647544264793396, + "learning_rate": 8.998763068301428e-06, + "loss": 0.4105, + "step": 2570 + }, + { + "epoch": 0.20568, + "grad_norm": 1.4232338666915894, + "learning_rate": 8.998007941256035e-06, + "loss": 0.3615, + "step": 2571 + }, + { + "epoch": 0.20576, + "grad_norm": 1.769791841506958, + "learning_rate": 8.997252561270058e-06, + "loss": 0.3636, + "step": 2572 + }, + { + "epoch": 0.20584, + "grad_norm": 1.6207672357559204, + "learning_rate": 8.996496928391285e-06, + "loss": 0.3445, + "step": 2573 + }, + { + "epoch": 0.20592, + "grad_norm": 1.8386582136154175, + "learning_rate": 8.995741042667524e-06, + "loss": 0.3672, + "step": 2574 + }, + { + "epoch": 0.206, + "grad_norm": 1.6057279109954834, + "learning_rate": 8.994984904146599e-06, + "loss": 0.344, + "step": 2575 + }, + { + "epoch": 0.20608, + "grad_norm": 1.5600706338882446, + "learning_rate": 8.994228512876345e-06, + "loss": 0.346, + "step": 2576 + }, + { + "epoch": 0.20616, + "grad_norm": 1.798862338066101, + "learning_rate": 8.993471868904617e-06, + "loss": 0.3838, + "step": 2577 + }, + { + "epoch": 0.20624, + "grad_norm": 1.6833524703979492, + "learning_rate": 8.992714972279285e-06, + "loss": 0.4041, + "step": 2578 + }, + { + "epoch": 0.20632, + "grad_norm": 1.3995176553726196, + "learning_rate": 8.99195782304824e-06, + "loss": 0.3753, + "step": 2579 + }, + { + "epoch": 0.2064, + "grad_norm": 1.242598295211792, + "learning_rate": 8.991200421259378e-06, + "loss": 0.2893, + "step": 2580 + }, + { + "epoch": 0.20648, + "grad_norm": 1.356940746307373, + "learning_rate": 8.990442766960622e-06, + "loss": 0.2868, + "step": 2581 + }, + { + "epoch": 0.20656, + "grad_norm": 1.8150062561035156, + "learning_rate": 8.989684860199903e-06, + "loss": 0.4231, + "step": 2582 + }, + { + "epoch": 0.20664, + "grad_norm": 1.3813666105270386, + "learning_rate": 8.988926701025171e-06, + "loss": 0.33, + "step": 2583 + }, + { + "epoch": 0.20672, + "grad_norm": 1.7560433149337769, + "learning_rate": 8.988168289484396e-06, + "loss": 0.34, + "step": 2584 + }, + { + "epoch": 0.2068, + "grad_norm": 1.6420555114746094, + "learning_rate": 8.987409625625556e-06, + "loss": 0.3671, + "step": 2585 + }, + { + "epoch": 0.20688, + "grad_norm": 1.5787367820739746, + "learning_rate": 8.986650709496652e-06, + "loss": 0.3619, + "step": 2586 + }, + { + "epoch": 0.20696, + "grad_norm": 1.6930251121520996, + "learning_rate": 8.985891541145696e-06, + "loss": 0.3442, + "step": 2587 + }, + { + "epoch": 0.20704, + "grad_norm": 1.488715410232544, + "learning_rate": 8.98513212062072e-06, + "loss": 0.4164, + "step": 2588 + }, + { + "epoch": 0.20712, + "grad_norm": 1.5280767679214478, + "learning_rate": 8.98437244796977e-06, + "loss": 0.2627, + "step": 2589 + }, + { + "epoch": 0.2072, + "grad_norm": 1.939414381980896, + "learning_rate": 8.983612523240903e-06, + "loss": 0.3893, + "step": 2590 + }, + { + "epoch": 0.20728, + "grad_norm": 1.3220232725143433, + "learning_rate": 8.982852346482205e-06, + "loss": 0.3041, + "step": 2591 + }, + { + "epoch": 0.20736, + "grad_norm": 1.585889458656311, + "learning_rate": 8.982091917741764e-06, + "loss": 0.3324, + "step": 2592 + }, + { + "epoch": 0.20744, + "grad_norm": 1.9471094608306885, + "learning_rate": 8.981331237067691e-06, + "loss": 0.4718, + "step": 2593 + }, + { + "epoch": 0.20752, + "grad_norm": 1.783488392829895, + "learning_rate": 8.980570304508114e-06, + "loss": 0.4395, + "step": 2594 + }, + { + "epoch": 0.2076, + "grad_norm": 1.2568445205688477, + "learning_rate": 8.97980912011117e-06, + "loss": 0.2889, + "step": 2595 + }, + { + "epoch": 0.20768, + "grad_norm": 1.5438101291656494, + "learning_rate": 8.979047683925022e-06, + "loss": 0.4005, + "step": 2596 + }, + { + "epoch": 0.20776, + "grad_norm": 1.58918297290802, + "learning_rate": 8.978285995997839e-06, + "loss": 0.2668, + "step": 2597 + }, + { + "epoch": 0.20784, + "grad_norm": 1.4388453960418701, + "learning_rate": 8.977524056377814e-06, + "loss": 0.2873, + "step": 2598 + }, + { + "epoch": 0.20792, + "grad_norm": 2.0037522315979004, + "learning_rate": 8.97676186511315e-06, + "loss": 0.3798, + "step": 2599 + }, + { + "epoch": 0.208, + "grad_norm": 1.6372380256652832, + "learning_rate": 8.975999422252071e-06, + "loss": 0.3634, + "step": 2600 + }, + { + "epoch": 0.20808, + "grad_norm": 1.2802543640136719, + "learning_rate": 8.97523672784281e-06, + "loss": 0.3416, + "step": 2601 + }, + { + "epoch": 0.20816, + "grad_norm": 1.6268035173416138, + "learning_rate": 8.974473781933623e-06, + "loss": 0.3532, + "step": 2602 + }, + { + "epoch": 0.20824, + "grad_norm": 1.604090690612793, + "learning_rate": 8.97371058457278e-06, + "loss": 0.3794, + "step": 2603 + }, + { + "epoch": 0.20832, + "grad_norm": 1.903481125831604, + "learning_rate": 8.97294713580856e-06, + "loss": 0.4268, + "step": 2604 + }, + { + "epoch": 0.2084, + "grad_norm": 1.351362705230713, + "learning_rate": 8.972183435689273e-06, + "loss": 0.2761, + "step": 2605 + }, + { + "epoch": 0.20848, + "grad_norm": 1.6017656326293945, + "learning_rate": 8.97141948426323e-06, + "loss": 0.3673, + "step": 2606 + }, + { + "epoch": 0.20856, + "grad_norm": 1.882969856262207, + "learning_rate": 8.970655281578762e-06, + "loss": 0.3843, + "step": 2607 + }, + { + "epoch": 0.20864, + "grad_norm": 2.003662347793579, + "learning_rate": 8.969890827684222e-06, + "loss": 0.4264, + "step": 2608 + }, + { + "epoch": 0.20872, + "grad_norm": 1.6131091117858887, + "learning_rate": 8.969126122627973e-06, + "loss": 0.3647, + "step": 2609 + }, + { + "epoch": 0.2088, + "grad_norm": 1.922102451324463, + "learning_rate": 8.968361166458395e-06, + "loss": 0.3897, + "step": 2610 + }, + { + "epoch": 0.20888, + "grad_norm": 1.5826460123062134, + "learning_rate": 8.967595959223882e-06, + "loss": 0.3342, + "step": 2611 + }, + { + "epoch": 0.20896, + "grad_norm": 1.6929792165756226, + "learning_rate": 8.966830500972852e-06, + "loss": 0.4384, + "step": 2612 + }, + { + "epoch": 0.20904, + "grad_norm": 1.5595133304595947, + "learning_rate": 8.966064791753727e-06, + "loss": 0.3149, + "step": 2613 + }, + { + "epoch": 0.20912, + "grad_norm": 1.4859492778778076, + "learning_rate": 8.965298831614952e-06, + "loss": 0.2991, + "step": 2614 + }, + { + "epoch": 0.2092, + "grad_norm": 2.3546998500823975, + "learning_rate": 8.96453262060499e-06, + "loss": 0.6323, + "step": 2615 + }, + { + "epoch": 0.20928, + "grad_norm": 1.4650485515594482, + "learning_rate": 8.963766158772314e-06, + "loss": 0.2931, + "step": 2616 + }, + { + "epoch": 0.20936, + "grad_norm": 1.8523716926574707, + "learning_rate": 8.962999446165417e-06, + "loss": 0.3536, + "step": 2617 + }, + { + "epoch": 0.20944, + "grad_norm": 1.4977569580078125, + "learning_rate": 8.962232482832803e-06, + "loss": 0.356, + "step": 2618 + }, + { + "epoch": 0.20952, + "grad_norm": 1.681130051612854, + "learning_rate": 8.961465268822997e-06, + "loss": 0.2998, + "step": 2619 + }, + { + "epoch": 0.2096, + "grad_norm": 1.994041919708252, + "learning_rate": 8.960697804184541e-06, + "loss": 0.4286, + "step": 2620 + }, + { + "epoch": 0.20968, + "grad_norm": 1.5844112634658813, + "learning_rate": 8.959930088965987e-06, + "loss": 0.4088, + "step": 2621 + }, + { + "epoch": 0.20976, + "grad_norm": 1.7651921510696411, + "learning_rate": 8.959162123215906e-06, + "loss": 0.4029, + "step": 2622 + }, + { + "epoch": 0.20984, + "grad_norm": 1.6215308904647827, + "learning_rate": 8.958393906982885e-06, + "loss": 0.4116, + "step": 2623 + }, + { + "epoch": 0.20992, + "grad_norm": 1.4633090496063232, + "learning_rate": 8.957625440315524e-06, + "loss": 0.4603, + "step": 2624 + }, + { + "epoch": 0.21, + "grad_norm": 1.4941697120666504, + "learning_rate": 8.956856723262445e-06, + "loss": 0.3767, + "step": 2625 + }, + { + "epoch": 0.21008, + "grad_norm": 2.0093963146209717, + "learning_rate": 8.956087755872283e-06, + "loss": 0.3451, + "step": 2626 + }, + { + "epoch": 0.21016, + "grad_norm": 1.8792312145233154, + "learning_rate": 8.955318538193684e-06, + "loss": 0.3411, + "step": 2627 + }, + { + "epoch": 0.21024, + "grad_norm": 1.4969063997268677, + "learning_rate": 8.954549070275316e-06, + "loss": 0.331, + "step": 2628 + }, + { + "epoch": 0.21032, + "grad_norm": 1.5657908916473389, + "learning_rate": 8.953779352165859e-06, + "loss": 0.4029, + "step": 2629 + }, + { + "epoch": 0.2104, + "grad_norm": 1.5333744287490845, + "learning_rate": 8.953009383914012e-06, + "loss": 0.2624, + "step": 2630 + }, + { + "epoch": 0.21048, + "grad_norm": 1.338257908821106, + "learning_rate": 8.95223916556849e-06, + "loss": 0.3209, + "step": 2631 + }, + { + "epoch": 0.21056, + "grad_norm": 1.6348015069961548, + "learning_rate": 8.95146869717802e-06, + "loss": 0.3488, + "step": 2632 + }, + { + "epoch": 0.21064, + "grad_norm": 1.488411784172058, + "learning_rate": 8.950697978791345e-06, + "loss": 0.3626, + "step": 2633 + }, + { + "epoch": 0.21072, + "grad_norm": 1.8199793100357056, + "learning_rate": 8.94992701045723e-06, + "loss": 0.3665, + "step": 2634 + }, + { + "epoch": 0.2108, + "grad_norm": 1.5185476541519165, + "learning_rate": 8.949155792224448e-06, + "loss": 0.3566, + "step": 2635 + }, + { + "epoch": 0.21088, + "grad_norm": 1.959179401397705, + "learning_rate": 8.948384324141794e-06, + "loss": 0.3719, + "step": 2636 + }, + { + "epoch": 0.21096, + "grad_norm": 1.65220046043396, + "learning_rate": 8.947612606258076e-06, + "loss": 0.3149, + "step": 2637 + }, + { + "epoch": 0.21104, + "grad_norm": 1.392369270324707, + "learning_rate": 8.946840638622117e-06, + "loss": 0.2671, + "step": 2638 + }, + { + "epoch": 0.21112, + "grad_norm": 1.6883777379989624, + "learning_rate": 8.946068421282754e-06, + "loss": 0.4213, + "step": 2639 + }, + { + "epoch": 0.2112, + "grad_norm": 1.4826557636260986, + "learning_rate": 8.945295954288848e-06, + "loss": 0.312, + "step": 2640 + }, + { + "epoch": 0.21128, + "grad_norm": 1.6768203973770142, + "learning_rate": 8.944523237689268e-06, + "loss": 0.3766, + "step": 2641 + }, + { + "epoch": 0.21136, + "grad_norm": 1.9561296701431274, + "learning_rate": 8.9437502715329e-06, + "loss": 0.3921, + "step": 2642 + }, + { + "epoch": 0.21144, + "grad_norm": 1.2287523746490479, + "learning_rate": 8.94297705586865e-06, + "loss": 0.3746, + "step": 2643 + }, + { + "epoch": 0.21152, + "grad_norm": 1.383736252784729, + "learning_rate": 8.942203590745433e-06, + "loss": 0.3747, + "step": 2644 + }, + { + "epoch": 0.2116, + "grad_norm": 2.0053927898406982, + "learning_rate": 8.941429876212187e-06, + "loss": 0.2964, + "step": 2645 + }, + { + "epoch": 0.21168, + "grad_norm": 1.3449758291244507, + "learning_rate": 8.94065591231786e-06, + "loss": 0.2413, + "step": 2646 + }, + { + "epoch": 0.21176, + "grad_norm": 1.6508148908615112, + "learning_rate": 8.939881699111418e-06, + "loss": 0.3681, + "step": 2647 + }, + { + "epoch": 0.21184, + "grad_norm": 1.292067050933838, + "learning_rate": 8.939107236641845e-06, + "loss": 0.3007, + "step": 2648 + }, + { + "epoch": 0.21192, + "grad_norm": 1.5809190273284912, + "learning_rate": 8.938332524958137e-06, + "loss": 0.3361, + "step": 2649 + }, + { + "epoch": 0.212, + "grad_norm": 1.343364953994751, + "learning_rate": 8.937557564109307e-06, + "loss": 0.2596, + "step": 2650 + }, + { + "epoch": 0.21208, + "grad_norm": 1.7212300300598145, + "learning_rate": 8.936782354144387e-06, + "loss": 0.3525, + "step": 2651 + }, + { + "epoch": 0.21216, + "grad_norm": 2.104867935180664, + "learning_rate": 8.93600689511242e-06, + "loss": 0.4524, + "step": 2652 + }, + { + "epoch": 0.21224, + "grad_norm": 1.942893624305725, + "learning_rate": 8.935231187062465e-06, + "loss": 0.4223, + "step": 2653 + }, + { + "epoch": 0.21232, + "grad_norm": 1.5138806104660034, + "learning_rate": 8.9344552300436e-06, + "loss": 0.2939, + "step": 2654 + }, + { + "epoch": 0.2124, + "grad_norm": 1.6802589893341064, + "learning_rate": 8.93367902410492e-06, + "loss": 0.3564, + "step": 2655 + }, + { + "epoch": 0.21248, + "grad_norm": 1.7636982202529907, + "learning_rate": 8.932902569295527e-06, + "loss": 0.4802, + "step": 2656 + }, + { + "epoch": 0.21256, + "grad_norm": 1.8842812776565552, + "learning_rate": 8.932125865664549e-06, + "loss": 0.4784, + "step": 2657 + }, + { + "epoch": 0.21264, + "grad_norm": 1.495451807975769, + "learning_rate": 8.931348913261125e-06, + "loss": 0.2886, + "step": 2658 + }, + { + "epoch": 0.21272, + "grad_norm": 1.4396910667419434, + "learning_rate": 8.93057171213441e-06, + "loss": 0.2712, + "step": 2659 + }, + { + "epoch": 0.2128, + "grad_norm": 1.7895922660827637, + "learning_rate": 8.929794262333574e-06, + "loss": 0.3548, + "step": 2660 + }, + { + "epoch": 0.21288, + "grad_norm": 2.094775438308716, + "learning_rate": 8.929016563907805e-06, + "loss": 0.4313, + "step": 2661 + }, + { + "epoch": 0.21296, + "grad_norm": 1.7981663942337036, + "learning_rate": 8.928238616906302e-06, + "loss": 0.3359, + "step": 2662 + }, + { + "epoch": 0.21304, + "grad_norm": 1.6360666751861572, + "learning_rate": 8.927460421378287e-06, + "loss": 0.3905, + "step": 2663 + }, + { + "epoch": 0.21312, + "grad_norm": 2.0702054500579834, + "learning_rate": 8.926681977372993e-06, + "loss": 0.4009, + "step": 2664 + }, + { + "epoch": 0.2132, + "grad_norm": 1.4153721332550049, + "learning_rate": 8.92590328493967e-06, + "loss": 0.3303, + "step": 2665 + }, + { + "epoch": 0.21328, + "grad_norm": 1.5334463119506836, + "learning_rate": 8.92512434412758e-06, + "loss": 0.3581, + "step": 2666 + }, + { + "epoch": 0.21336, + "grad_norm": 1.445220708847046, + "learning_rate": 8.924345154986008e-06, + "loss": 0.2794, + "step": 2667 + }, + { + "epoch": 0.21344, + "grad_norm": 1.4006540775299072, + "learning_rate": 8.923565717564247e-06, + "loss": 0.2452, + "step": 2668 + }, + { + "epoch": 0.21352, + "grad_norm": 1.3207964897155762, + "learning_rate": 8.922786031911613e-06, + "loss": 0.2685, + "step": 2669 + }, + { + "epoch": 0.2136, + "grad_norm": 1.587815523147583, + "learning_rate": 8.922006098077432e-06, + "loss": 0.3331, + "step": 2670 + }, + { + "epoch": 0.21368, + "grad_norm": 1.743464469909668, + "learning_rate": 8.921225916111048e-06, + "loss": 0.3479, + "step": 2671 + }, + { + "epoch": 0.21376, + "grad_norm": 1.6636502742767334, + "learning_rate": 8.920445486061822e-06, + "loss": 0.3878, + "step": 2672 + }, + { + "epoch": 0.21384, + "grad_norm": 1.9191046953201294, + "learning_rate": 8.919664807979126e-06, + "loss": 0.4291, + "step": 2673 + }, + { + "epoch": 0.21392, + "grad_norm": 1.7954356670379639, + "learning_rate": 8.918883881912353e-06, + "loss": 0.3258, + "step": 2674 + }, + { + "epoch": 0.214, + "grad_norm": 1.4741268157958984, + "learning_rate": 8.91810270791091e-06, + "loss": 0.2867, + "step": 2675 + }, + { + "epoch": 0.21408, + "grad_norm": 1.7667266130447388, + "learning_rate": 8.917321286024218e-06, + "loss": 0.4209, + "step": 2676 + }, + { + "epoch": 0.21416, + "grad_norm": 1.924230694770813, + "learning_rate": 8.916539616301718e-06, + "loss": 0.3504, + "step": 2677 + }, + { + "epoch": 0.21424, + "grad_norm": 1.4865533113479614, + "learning_rate": 8.91575769879286e-06, + "loss": 0.3856, + "step": 2678 + }, + { + "epoch": 0.21432, + "grad_norm": 1.7114523649215698, + "learning_rate": 8.914975533547114e-06, + "loss": 0.4292, + "step": 2679 + }, + { + "epoch": 0.2144, + "grad_norm": 1.5713534355163574, + "learning_rate": 8.914193120613966e-06, + "loss": 0.3665, + "step": 2680 + }, + { + "epoch": 0.21448, + "grad_norm": 1.5431712865829468, + "learning_rate": 8.913410460042915e-06, + "loss": 0.3065, + "step": 2681 + }, + { + "epoch": 0.21456, + "grad_norm": 1.625646710395813, + "learning_rate": 8.91262755188348e-06, + "loss": 0.4117, + "step": 2682 + }, + { + "epoch": 0.21464, + "grad_norm": 1.5041875839233398, + "learning_rate": 8.911844396185192e-06, + "loss": 0.3816, + "step": 2683 + }, + { + "epoch": 0.21472, + "grad_norm": 1.8179059028625488, + "learning_rate": 8.911060992997596e-06, + "loss": 0.3031, + "step": 2684 + }, + { + "epoch": 0.2148, + "grad_norm": 1.7004882097244263, + "learning_rate": 8.910277342370259e-06, + "loss": 0.4804, + "step": 2685 + }, + { + "epoch": 0.21488, + "grad_norm": 2.1051254272460938, + "learning_rate": 8.909493444352757e-06, + "loss": 0.4295, + "step": 2686 + }, + { + "epoch": 0.21496, + "grad_norm": 1.3784528970718384, + "learning_rate": 8.908709298994686e-06, + "loss": 0.3112, + "step": 2687 + }, + { + "epoch": 0.21504, + "grad_norm": 1.7142976522445679, + "learning_rate": 8.907924906345659e-06, + "loss": 0.3797, + "step": 2688 + }, + { + "epoch": 0.21512, + "grad_norm": 1.7396079301834106, + "learning_rate": 8.907140266455297e-06, + "loss": 0.4081, + "step": 2689 + }, + { + "epoch": 0.2152, + "grad_norm": 0.8471266627311707, + "learning_rate": 8.906355379373243e-06, + "loss": 0.2378, + "step": 2690 + }, + { + "epoch": 0.21528, + "grad_norm": 1.6248892545700073, + "learning_rate": 8.905570245149156e-06, + "loss": 0.3695, + "step": 2691 + }, + { + "epoch": 0.21536, + "grad_norm": 1.4380123615264893, + "learning_rate": 8.904784863832708e-06, + "loss": 0.3377, + "step": 2692 + }, + { + "epoch": 0.21544, + "grad_norm": 1.8028794527053833, + "learning_rate": 8.903999235473586e-06, + "loss": 0.5013, + "step": 2693 + }, + { + "epoch": 0.21552, + "grad_norm": 1.2009968757629395, + "learning_rate": 8.903213360121496e-06, + "loss": 0.2609, + "step": 2694 + }, + { + "epoch": 0.2156, + "grad_norm": 1.5965559482574463, + "learning_rate": 8.902427237826157e-06, + "loss": 0.3731, + "step": 2695 + }, + { + "epoch": 0.21568, + "grad_norm": 1.466230869293213, + "learning_rate": 8.901640868637304e-06, + "loss": 0.3941, + "step": 2696 + }, + { + "epoch": 0.21576, + "grad_norm": 1.538789987564087, + "learning_rate": 8.900854252604689e-06, + "loss": 0.2974, + "step": 2697 + }, + { + "epoch": 0.21584, + "grad_norm": 1.5418365001678467, + "learning_rate": 8.900067389778075e-06, + "loss": 0.38, + "step": 2698 + }, + { + "epoch": 0.21592, + "grad_norm": 1.4428279399871826, + "learning_rate": 8.89928028020725e-06, + "loss": 0.3617, + "step": 2699 + }, + { + "epoch": 0.216, + "grad_norm": 1.5055745840072632, + "learning_rate": 8.898492923942007e-06, + "loss": 0.2906, + "step": 2700 + }, + { + "epoch": 0.21608, + "grad_norm": 1.5170435905456543, + "learning_rate": 8.897705321032162e-06, + "loss": 0.3001, + "step": 2701 + }, + { + "epoch": 0.21616, + "grad_norm": 1.5084747076034546, + "learning_rate": 8.896917471527542e-06, + "loss": 0.3537, + "step": 2702 + }, + { + "epoch": 0.21624, + "grad_norm": 1.590378761291504, + "learning_rate": 8.896129375477993e-06, + "loss": 0.2969, + "step": 2703 + }, + { + "epoch": 0.21632, + "grad_norm": 1.5651493072509766, + "learning_rate": 8.895341032933376e-06, + "loss": 0.3104, + "step": 2704 + }, + { + "epoch": 0.2164, + "grad_norm": 1.545362949371338, + "learning_rate": 8.894552443943564e-06, + "loss": 0.3274, + "step": 2705 + }, + { + "epoch": 0.21648, + "grad_norm": 1.7417821884155273, + "learning_rate": 8.893763608558453e-06, + "loss": 0.4584, + "step": 2706 + }, + { + "epoch": 0.21656, + "grad_norm": 1.2758499383926392, + "learning_rate": 8.892974526827944e-06, + "loss": 0.2997, + "step": 2707 + }, + { + "epoch": 0.21664, + "grad_norm": 1.4422128200531006, + "learning_rate": 8.892185198801963e-06, + "loss": 0.3044, + "step": 2708 + }, + { + "epoch": 0.21672, + "grad_norm": 1.5123414993286133, + "learning_rate": 8.891395624530449e-06, + "loss": 0.3265, + "step": 2709 + }, + { + "epoch": 0.2168, + "grad_norm": 1.5700615644454956, + "learning_rate": 8.890605804063353e-06, + "loss": 0.3064, + "step": 2710 + }, + { + "epoch": 0.21688, + "grad_norm": 1.971044659614563, + "learning_rate": 8.889815737450648e-06, + "loss": 0.3514, + "step": 2711 + }, + { + "epoch": 0.21696, + "grad_norm": 1.5621892213821411, + "learning_rate": 8.889025424742314e-06, + "loss": 0.3242, + "step": 2712 + }, + { + "epoch": 0.21704, + "grad_norm": 1.4305284023284912, + "learning_rate": 8.888234865988356e-06, + "loss": 0.3518, + "step": 2713 + }, + { + "epoch": 0.21712, + "grad_norm": 1.436153769493103, + "learning_rate": 8.887444061238787e-06, + "loss": 0.3249, + "step": 2714 + }, + { + "epoch": 0.2172, + "grad_norm": 1.4724273681640625, + "learning_rate": 8.886653010543641e-06, + "loss": 0.303, + "step": 2715 + }, + { + "epoch": 0.21728, + "grad_norm": 2.188133716583252, + "learning_rate": 8.885861713952964e-06, + "loss": 0.4513, + "step": 2716 + }, + { + "epoch": 0.21736, + "grad_norm": 1.3821355104446411, + "learning_rate": 8.885070171516816e-06, + "loss": 0.2599, + "step": 2717 + }, + { + "epoch": 0.21744, + "grad_norm": 1.943583369255066, + "learning_rate": 8.88427838328528e-06, + "loss": 0.5347, + "step": 2718 + }, + { + "epoch": 0.21752, + "grad_norm": 1.9702650308609009, + "learning_rate": 8.883486349308446e-06, + "loss": 0.387, + "step": 2719 + }, + { + "epoch": 0.2176, + "grad_norm": 1.7490559816360474, + "learning_rate": 8.882694069636426e-06, + "loss": 0.3813, + "step": 2720 + }, + { + "epoch": 0.21768, + "grad_norm": 1.3750137090682983, + "learning_rate": 8.881901544319345e-06, + "loss": 0.3268, + "step": 2721 + }, + { + "epoch": 0.21776, + "grad_norm": 1.3257077932357788, + "learning_rate": 8.881108773407338e-06, + "loss": 0.2837, + "step": 2722 + }, + { + "epoch": 0.21784, + "grad_norm": 1.7465240955352783, + "learning_rate": 8.88031575695057e-06, + "loss": 0.3157, + "step": 2723 + }, + { + "epoch": 0.21792, + "grad_norm": 1.619113564491272, + "learning_rate": 8.879522494999204e-06, + "loss": 0.3192, + "step": 2724 + }, + { + "epoch": 0.218, + "grad_norm": 1.9502321481704712, + "learning_rate": 8.878728987603433e-06, + "loss": 0.3442, + "step": 2725 + }, + { + "epoch": 0.21808, + "grad_norm": 1.1939575672149658, + "learning_rate": 8.877935234813455e-06, + "loss": 0.2337, + "step": 2726 + }, + { + "epoch": 0.21816, + "grad_norm": 1.9584698677062988, + "learning_rate": 8.877141236679492e-06, + "loss": 0.3656, + "step": 2727 + }, + { + "epoch": 0.21824, + "grad_norm": 1.442341685295105, + "learning_rate": 8.876346993251777e-06, + "loss": 0.352, + "step": 2728 + }, + { + "epoch": 0.21832, + "grad_norm": 1.8673537969589233, + "learning_rate": 8.875552504580556e-06, + "loss": 0.391, + "step": 2729 + }, + { + "epoch": 0.2184, + "grad_norm": 1.3063445091247559, + "learning_rate": 8.874757770716096e-06, + "loss": 0.3345, + "step": 2730 + }, + { + "epoch": 0.21848, + "grad_norm": 1.615033507347107, + "learning_rate": 8.873962791708676e-06, + "loss": 0.3744, + "step": 2731 + }, + { + "epoch": 0.21856, + "grad_norm": 1.8220925331115723, + "learning_rate": 8.873167567608594e-06, + "loss": 0.3211, + "step": 2732 + }, + { + "epoch": 0.21864, + "grad_norm": 1.4469846487045288, + "learning_rate": 8.872372098466159e-06, + "loss": 0.4197, + "step": 2733 + }, + { + "epoch": 0.21872, + "grad_norm": 1.1662476062774658, + "learning_rate": 8.871576384331699e-06, + "loss": 0.263, + "step": 2734 + }, + { + "epoch": 0.2188, + "grad_norm": 1.2721196413040161, + "learning_rate": 8.870780425255554e-06, + "loss": 0.2601, + "step": 2735 + }, + { + "epoch": 0.21888, + "grad_norm": 1.9340859651565552, + "learning_rate": 8.869984221288085e-06, + "loss": 0.3647, + "step": 2736 + }, + { + "epoch": 0.21896, + "grad_norm": 1.5645490884780884, + "learning_rate": 8.869187772479661e-06, + "loss": 0.3446, + "step": 2737 + }, + { + "epoch": 0.21904, + "grad_norm": 1.3666660785675049, + "learning_rate": 8.868391078880677e-06, + "loss": 0.3243, + "step": 2738 + }, + { + "epoch": 0.21912, + "grad_norm": 1.340316653251648, + "learning_rate": 8.86759414054153e-06, + "loss": 0.2668, + "step": 2739 + }, + { + "epoch": 0.2192, + "grad_norm": 1.5360743999481201, + "learning_rate": 8.866796957512642e-06, + "loss": 0.3216, + "step": 2740 + }, + { + "epoch": 0.21928, + "grad_norm": 1.6655339002609253, + "learning_rate": 8.865999529844452e-06, + "loss": 0.3079, + "step": 2741 + }, + { + "epoch": 0.21936, + "grad_norm": 1.4910820722579956, + "learning_rate": 8.865201857587405e-06, + "loss": 0.2896, + "step": 2742 + }, + { + "epoch": 0.21944, + "grad_norm": 1.905479907989502, + "learning_rate": 8.864403940791969e-06, + "loss": 0.4271, + "step": 2743 + }, + { + "epoch": 0.21952, + "grad_norm": 1.5210912227630615, + "learning_rate": 8.863605779508627e-06, + "loss": 0.319, + "step": 2744 + }, + { + "epoch": 0.2196, + "grad_norm": 1.7995033264160156, + "learning_rate": 8.862807373787876e-06, + "loss": 0.3828, + "step": 2745 + }, + { + "epoch": 0.21968, + "grad_norm": 1.853249430656433, + "learning_rate": 8.862008723680225e-06, + "loss": 0.374, + "step": 2746 + }, + { + "epoch": 0.21976, + "grad_norm": 1.831558108329773, + "learning_rate": 8.861209829236206e-06, + "loss": 0.4955, + "step": 2747 + }, + { + "epoch": 0.21984, + "grad_norm": 1.945634365081787, + "learning_rate": 8.860410690506361e-06, + "loss": 0.3997, + "step": 2748 + }, + { + "epoch": 0.21992, + "grad_norm": 1.5275148153305054, + "learning_rate": 8.859611307541247e-06, + "loss": 0.3866, + "step": 2749 + }, + { + "epoch": 0.22, + "grad_norm": 1.7615596055984497, + "learning_rate": 8.858811680391442e-06, + "loss": 0.3599, + "step": 2750 + }, + { + "epoch": 0.22008, + "grad_norm": 1.431030511856079, + "learning_rate": 8.858011809107532e-06, + "loss": 0.2883, + "step": 2751 + }, + { + "epoch": 0.22016, + "grad_norm": 1.2879825830459595, + "learning_rate": 8.857211693740125e-06, + "loss": 0.317, + "step": 2752 + }, + { + "epoch": 0.22024, + "grad_norm": 1.4592652320861816, + "learning_rate": 8.856411334339841e-06, + "loss": 0.2936, + "step": 2753 + }, + { + "epoch": 0.22032, + "grad_norm": 1.5514662265777588, + "learning_rate": 8.855610730957313e-06, + "loss": 0.4303, + "step": 2754 + }, + { + "epoch": 0.2204, + "grad_norm": 1.5528576374053955, + "learning_rate": 8.854809883643197e-06, + "loss": 0.3552, + "step": 2755 + }, + { + "epoch": 0.22048, + "grad_norm": 1.8523775339126587, + "learning_rate": 8.854008792448156e-06, + "loss": 0.4036, + "step": 2756 + }, + { + "epoch": 0.22056, + "grad_norm": 1.6846541166305542, + "learning_rate": 8.853207457422877e-06, + "loss": 0.3821, + "step": 2757 + }, + { + "epoch": 0.22064, + "grad_norm": 1.7963849306106567, + "learning_rate": 8.852405878618052e-06, + "loss": 0.3123, + "step": 2758 + }, + { + "epoch": 0.22072, + "grad_norm": 1.451002597808838, + "learning_rate": 8.8516040560844e-06, + "loss": 0.3076, + "step": 2759 + }, + { + "epoch": 0.2208, + "grad_norm": 1.808860421180725, + "learning_rate": 8.850801989872644e-06, + "loss": 0.3846, + "step": 2760 + }, + { + "epoch": 0.22088, + "grad_norm": 1.5843877792358398, + "learning_rate": 8.849999680033535e-06, + "loss": 0.286, + "step": 2761 + }, + { + "epoch": 0.22096, + "grad_norm": 1.2339478731155396, + "learning_rate": 8.849197126617824e-06, + "loss": 0.3054, + "step": 2762 + }, + { + "epoch": 0.22104, + "grad_norm": 1.9607781171798706, + "learning_rate": 8.848394329676294e-06, + "loss": 0.3739, + "step": 2763 + }, + { + "epoch": 0.22112, + "grad_norm": 1.4922035932540894, + "learning_rate": 8.847591289259729e-06, + "loss": 0.3911, + "step": 2764 + }, + { + "epoch": 0.2212, + "grad_norm": 1.7909282445907593, + "learning_rate": 8.846788005418938e-06, + "loss": 0.401, + "step": 2765 + }, + { + "epoch": 0.22128, + "grad_norm": 1.6844979524612427, + "learning_rate": 8.845984478204742e-06, + "loss": 0.4258, + "step": 2766 + }, + { + "epoch": 0.22136, + "grad_norm": 1.6638482809066772, + "learning_rate": 8.845180707667975e-06, + "loss": 0.3188, + "step": 2767 + }, + { + "epoch": 0.22144, + "grad_norm": 1.4212157726287842, + "learning_rate": 8.84437669385949e-06, + "loss": 0.3071, + "step": 2768 + }, + { + "epoch": 0.22152, + "grad_norm": 1.4176979064941406, + "learning_rate": 8.843572436830157e-06, + "loss": 0.3029, + "step": 2769 + }, + { + "epoch": 0.2216, + "grad_norm": 1.8087478876113892, + "learning_rate": 8.842767936630857e-06, + "loss": 0.4011, + "step": 2770 + }, + { + "epoch": 0.22168, + "grad_norm": 1.3983772993087769, + "learning_rate": 8.841963193312487e-06, + "loss": 0.347, + "step": 2771 + }, + { + "epoch": 0.22176, + "grad_norm": 1.5463216304779053, + "learning_rate": 8.841158206925959e-06, + "loss": 0.3349, + "step": 2772 + }, + { + "epoch": 0.22184, + "grad_norm": 1.3587409257888794, + "learning_rate": 8.840352977522206e-06, + "loss": 0.3083, + "step": 2773 + }, + { + "epoch": 0.22192, + "grad_norm": 1.3786388635635376, + "learning_rate": 8.83954750515217e-06, + "loss": 0.3388, + "step": 2774 + }, + { + "epoch": 0.222, + "grad_norm": 1.750858187675476, + "learning_rate": 8.83874178986681e-06, + "loss": 0.4101, + "step": 2775 + }, + { + "epoch": 0.22208, + "grad_norm": 1.5796808004379272, + "learning_rate": 8.837935831717102e-06, + "loss": 0.3572, + "step": 2776 + }, + { + "epoch": 0.22216, + "grad_norm": 1.397741675376892, + "learning_rate": 8.837129630754034e-06, + "loss": 0.3428, + "step": 2777 + }, + { + "epoch": 0.22224, + "grad_norm": 1.4873077869415283, + "learning_rate": 8.836323187028615e-06, + "loss": 0.3411, + "step": 2778 + }, + { + "epoch": 0.22232, + "grad_norm": 1.351516604423523, + "learning_rate": 8.835516500591863e-06, + "loss": 0.3341, + "step": 2779 + }, + { + "epoch": 0.2224, + "grad_norm": 1.303882360458374, + "learning_rate": 8.834709571494817e-06, + "loss": 0.3025, + "step": 2780 + }, + { + "epoch": 0.22248, + "grad_norm": 1.4679828882217407, + "learning_rate": 8.833902399788527e-06, + "loss": 0.3012, + "step": 2781 + }, + { + "epoch": 0.22256, + "grad_norm": 1.4130665063858032, + "learning_rate": 8.83309498552406e-06, + "loss": 0.2994, + "step": 2782 + }, + { + "epoch": 0.22264, + "grad_norm": 1.885979413986206, + "learning_rate": 8.832287328752499e-06, + "loss": 0.4008, + "step": 2783 + }, + { + "epoch": 0.22272, + "grad_norm": 2.2439332008361816, + "learning_rate": 8.83147942952494e-06, + "loss": 0.5255, + "step": 2784 + }, + { + "epoch": 0.2228, + "grad_norm": 1.7513844966888428, + "learning_rate": 8.8306712878925e-06, + "loss": 0.3906, + "step": 2785 + }, + { + "epoch": 0.22288, + "grad_norm": 1.585735559463501, + "learning_rate": 8.829862903906306e-06, + "loss": 0.3534, + "step": 2786 + }, + { + "epoch": 0.22296, + "grad_norm": 1.6071237325668335, + "learning_rate": 8.829054277617499e-06, + "loss": 0.3327, + "step": 2787 + }, + { + "epoch": 0.22304, + "grad_norm": 1.4853163957595825, + "learning_rate": 8.828245409077241e-06, + "loss": 0.3353, + "step": 2788 + }, + { + "epoch": 0.22312, + "grad_norm": 1.8716695308685303, + "learning_rate": 8.827436298336703e-06, + "loss": 0.4146, + "step": 2789 + }, + { + "epoch": 0.2232, + "grad_norm": 1.6231932640075684, + "learning_rate": 8.826626945447079e-06, + "loss": 0.3451, + "step": 2790 + }, + { + "epoch": 0.22328, + "grad_norm": 1.5309462547302246, + "learning_rate": 8.825817350459571e-06, + "loss": 0.3078, + "step": 2791 + }, + { + "epoch": 0.22336, + "grad_norm": 1.340057134628296, + "learning_rate": 8.825007513425401e-06, + "loss": 0.2812, + "step": 2792 + }, + { + "epoch": 0.22344, + "grad_norm": 1.6794257164001465, + "learning_rate": 8.824197434395805e-06, + "loss": 0.3899, + "step": 2793 + }, + { + "epoch": 0.22352, + "grad_norm": 1.566836953163147, + "learning_rate": 8.823387113422034e-06, + "loss": 0.4213, + "step": 2794 + }, + { + "epoch": 0.2236, + "grad_norm": 1.4850705862045288, + "learning_rate": 8.82257655055535e-06, + "loss": 0.3569, + "step": 2795 + }, + { + "epoch": 0.22368, + "grad_norm": 1.5087475776672363, + "learning_rate": 8.82176574584704e-06, + "loss": 0.3534, + "step": 2796 + }, + { + "epoch": 0.22376, + "grad_norm": 1.5350658893585205, + "learning_rate": 8.820954699348399e-06, + "loss": 0.3551, + "step": 2797 + }, + { + "epoch": 0.22384, + "grad_norm": 1.1860790252685547, + "learning_rate": 8.820143411110737e-06, + "loss": 0.214, + "step": 2798 + }, + { + "epoch": 0.22392, + "grad_norm": 2.052689552307129, + "learning_rate": 8.819331881185387e-06, + "loss": 0.5796, + "step": 2799 + }, + { + "epoch": 0.224, + "grad_norm": 1.7889010906219482, + "learning_rate": 8.818520109623687e-06, + "loss": 0.441, + "step": 2800 + }, + { + "epoch": 0.22408, + "grad_norm": 1.6740270853042603, + "learning_rate": 8.817708096476996e-06, + "loss": 0.3098, + "step": 2801 + }, + { + "epoch": 0.22416, + "grad_norm": 1.4243390560150146, + "learning_rate": 8.81689584179669e-06, + "loss": 0.2933, + "step": 2802 + }, + { + "epoch": 0.22424, + "grad_norm": 1.7533442974090576, + "learning_rate": 8.816083345634153e-06, + "loss": 0.3185, + "step": 2803 + }, + { + "epoch": 0.22432, + "grad_norm": 1.8434312343597412, + "learning_rate": 8.815270608040792e-06, + "loss": 0.4669, + "step": 2804 + }, + { + "epoch": 0.2244, + "grad_norm": 1.609260082244873, + "learning_rate": 8.814457629068025e-06, + "loss": 0.304, + "step": 2805 + }, + { + "epoch": 0.22448, + "grad_norm": 1.258373737335205, + "learning_rate": 8.813644408767287e-06, + "loss": 0.2606, + "step": 2806 + }, + { + "epoch": 0.22456, + "grad_norm": 1.3866522312164307, + "learning_rate": 8.812830947190028e-06, + "loss": 0.2768, + "step": 2807 + }, + { + "epoch": 0.22464, + "grad_norm": 1.794108271598816, + "learning_rate": 8.812017244387714e-06, + "loss": 0.4018, + "step": 2808 + }, + { + "epoch": 0.22472, + "grad_norm": 1.3862345218658447, + "learning_rate": 8.811203300411823e-06, + "loss": 0.3217, + "step": 2809 + }, + { + "epoch": 0.2248, + "grad_norm": 2.0301730632781982, + "learning_rate": 8.81038911531385e-06, + "loss": 0.3777, + "step": 2810 + }, + { + "epoch": 0.22488, + "grad_norm": 1.378750205039978, + "learning_rate": 8.80957468914531e-06, + "loss": 0.3752, + "step": 2811 + }, + { + "epoch": 0.22496, + "grad_norm": 1.7173678874969482, + "learning_rate": 8.808760021957725e-06, + "loss": 0.3415, + "step": 2812 + }, + { + "epoch": 0.22504, + "grad_norm": 1.7305114269256592, + "learning_rate": 8.807945113802638e-06, + "loss": 0.4127, + "step": 2813 + }, + { + "epoch": 0.22512, + "grad_norm": 1.6595813035964966, + "learning_rate": 8.807129964731604e-06, + "loss": 0.3818, + "step": 2814 + }, + { + "epoch": 0.2252, + "grad_norm": 1.9219270944595337, + "learning_rate": 8.806314574796198e-06, + "loss": 0.4096, + "step": 2815 + }, + { + "epoch": 0.22528, + "grad_norm": 1.5241590738296509, + "learning_rate": 8.805498944048003e-06, + "loss": 0.3071, + "step": 2816 + }, + { + "epoch": 0.22536, + "grad_norm": 1.5931185483932495, + "learning_rate": 8.804683072538623e-06, + "loss": 0.3009, + "step": 2817 + }, + { + "epoch": 0.22544, + "grad_norm": 1.7457307577133179, + "learning_rate": 8.803866960319676e-06, + "loss": 0.2971, + "step": 2818 + }, + { + "epoch": 0.22552, + "grad_norm": 1.769567847251892, + "learning_rate": 8.803050607442794e-06, + "loss": 0.4444, + "step": 2819 + }, + { + "epoch": 0.2256, + "grad_norm": 2.1283857822418213, + "learning_rate": 8.802234013959626e-06, + "loss": 0.4656, + "step": 2820 + }, + { + "epoch": 0.22568, + "grad_norm": 1.2949507236480713, + "learning_rate": 8.801417179921834e-06, + "loss": 0.2726, + "step": 2821 + }, + { + "epoch": 0.22576, + "grad_norm": 1.8464694023132324, + "learning_rate": 8.800600105381097e-06, + "loss": 0.4021, + "step": 2822 + }, + { + "epoch": 0.22584, + "grad_norm": 1.674856185913086, + "learning_rate": 8.799782790389107e-06, + "loss": 0.3484, + "step": 2823 + }, + { + "epoch": 0.22592, + "grad_norm": 1.2917641401290894, + "learning_rate": 8.798965234997574e-06, + "loss": 0.2873, + "step": 2824 + }, + { + "epoch": 0.226, + "grad_norm": 1.5360853672027588, + "learning_rate": 8.798147439258222e-06, + "loss": 0.3182, + "step": 2825 + }, + { + "epoch": 0.22608, + "grad_norm": 1.5402098894119263, + "learning_rate": 8.797329403222791e-06, + "loss": 0.3192, + "step": 2826 + }, + { + "epoch": 0.22616, + "grad_norm": 1.3484301567077637, + "learning_rate": 8.796511126943032e-06, + "loss": 0.2512, + "step": 2827 + }, + { + "epoch": 0.22624, + "grad_norm": 1.4177839756011963, + "learning_rate": 8.79569261047072e-06, + "loss": 0.3263, + "step": 2828 + }, + { + "epoch": 0.22632, + "grad_norm": 1.4625004529953003, + "learning_rate": 8.794873853857638e-06, + "loss": 0.3261, + "step": 2829 + }, + { + "epoch": 0.2264, + "grad_norm": 1.3922655582427979, + "learning_rate": 8.794054857155582e-06, + "loss": 0.3243, + "step": 2830 + }, + { + "epoch": 0.22648, + "grad_norm": 1.4068520069122314, + "learning_rate": 8.793235620416372e-06, + "loss": 0.2632, + "step": 2831 + }, + { + "epoch": 0.22656, + "grad_norm": 1.9339982271194458, + "learning_rate": 8.792416143691836e-06, + "loss": 0.4937, + "step": 2832 + }, + { + "epoch": 0.22664, + "grad_norm": 1.5683056116104126, + "learning_rate": 8.791596427033818e-06, + "loss": 0.3712, + "step": 2833 + }, + { + "epoch": 0.22672, + "grad_norm": 1.811155915260315, + "learning_rate": 8.790776470494183e-06, + "loss": 0.3811, + "step": 2834 + }, + { + "epoch": 0.2268, + "grad_norm": 1.855803370475769, + "learning_rate": 8.789956274124805e-06, + "loss": 0.4298, + "step": 2835 + }, + { + "epoch": 0.22688, + "grad_norm": 2.0151405334472656, + "learning_rate": 8.789135837977573e-06, + "loss": 0.4112, + "step": 2836 + }, + { + "epoch": 0.22696, + "grad_norm": 1.684455394744873, + "learning_rate": 8.788315162104396e-06, + "loss": 0.2819, + "step": 2837 + }, + { + "epoch": 0.22704, + "grad_norm": 1.5724467039108276, + "learning_rate": 8.787494246557195e-06, + "loss": 0.3237, + "step": 2838 + }, + { + "epoch": 0.22712, + "grad_norm": 1.4349114894866943, + "learning_rate": 8.786673091387906e-06, + "loss": 0.3576, + "step": 2839 + }, + { + "epoch": 0.2272, + "grad_norm": 1.4084806442260742, + "learning_rate": 8.78585169664848e-06, + "loss": 0.2697, + "step": 2840 + }, + { + "epoch": 0.22728, + "grad_norm": 1.615369200706482, + "learning_rate": 8.785030062390885e-06, + "loss": 0.3478, + "step": 2841 + }, + { + "epoch": 0.22736, + "grad_norm": 1.2227230072021484, + "learning_rate": 8.784208188667102e-06, + "loss": 0.2652, + "step": 2842 + }, + { + "epoch": 0.22744, + "grad_norm": 1.4531816244125366, + "learning_rate": 8.78338607552913e-06, + "loss": 0.3219, + "step": 2843 + }, + { + "epoch": 0.22752, + "grad_norm": 1.391142725944519, + "learning_rate": 8.782563723028979e-06, + "loss": 0.3095, + "step": 2844 + }, + { + "epoch": 0.2276, + "grad_norm": 1.6999260187149048, + "learning_rate": 8.781741131218678e-06, + "loss": 0.3403, + "step": 2845 + }, + { + "epoch": 0.22768, + "grad_norm": 1.1833946704864502, + "learning_rate": 8.78091830015027e-06, + "loss": 0.2708, + "step": 2846 + }, + { + "epoch": 0.22776, + "grad_norm": 1.4600259065628052, + "learning_rate": 8.780095229875813e-06, + "loss": 0.4186, + "step": 2847 + }, + { + "epoch": 0.22784, + "grad_norm": 1.6836893558502197, + "learning_rate": 8.779271920447378e-06, + "loss": 0.4746, + "step": 2848 + }, + { + "epoch": 0.22792, + "grad_norm": 1.6797387599945068, + "learning_rate": 8.778448371917055e-06, + "loss": 0.3224, + "step": 2849 + }, + { + "epoch": 0.228, + "grad_norm": 1.6203384399414062, + "learning_rate": 8.777624584336944e-06, + "loss": 0.4012, + "step": 2850 + }, + { + "epoch": 0.22808, + "grad_norm": 1.385426640510559, + "learning_rate": 8.776800557759167e-06, + "loss": 0.3187, + "step": 2851 + }, + { + "epoch": 0.22816, + "grad_norm": 1.3712928295135498, + "learning_rate": 8.775976292235857e-06, + "loss": 0.3654, + "step": 2852 + }, + { + "epoch": 0.22824, + "grad_norm": 1.6508076190948486, + "learning_rate": 8.775151787819159e-06, + "loss": 0.4184, + "step": 2853 + }, + { + "epoch": 0.22832, + "grad_norm": 1.793189525604248, + "learning_rate": 8.77432704456124e-06, + "loss": 0.4786, + "step": 2854 + }, + { + "epoch": 0.2284, + "grad_norm": 1.7186195850372314, + "learning_rate": 8.77350206251428e-06, + "loss": 0.3742, + "step": 2855 + }, + { + "epoch": 0.22848, + "grad_norm": 1.5993332862854004, + "learning_rate": 8.772676841730468e-06, + "loss": 0.397, + "step": 2856 + }, + { + "epoch": 0.22856, + "grad_norm": 1.4774996042251587, + "learning_rate": 8.771851382262016e-06, + "loss": 0.3349, + "step": 2857 + }, + { + "epoch": 0.22864, + "grad_norm": 1.5428051948547363, + "learning_rate": 8.771025684161147e-06, + "loss": 0.3738, + "step": 2858 + }, + { + "epoch": 0.22872, + "grad_norm": 1.4334566593170166, + "learning_rate": 8.770199747480105e-06, + "loss": 0.2782, + "step": 2859 + }, + { + "epoch": 0.2288, + "grad_norm": 2.1138930320739746, + "learning_rate": 8.769373572271137e-06, + "loss": 0.4068, + "step": 2860 + }, + { + "epoch": 0.22888, + "grad_norm": 1.580213189125061, + "learning_rate": 8.768547158586514e-06, + "loss": 0.304, + "step": 2861 + }, + { + "epoch": 0.22896, + "grad_norm": 1.601637840270996, + "learning_rate": 8.767720506478523e-06, + "loss": 0.3489, + "step": 2862 + }, + { + "epoch": 0.22904, + "grad_norm": 1.425022840499878, + "learning_rate": 8.766893615999463e-06, + "loss": 0.2925, + "step": 2863 + }, + { + "epoch": 0.22912, + "grad_norm": 1.401037573814392, + "learning_rate": 8.766066487201648e-06, + "loss": 0.3418, + "step": 2864 + }, + { + "epoch": 0.2292, + "grad_norm": 1.5454732179641724, + "learning_rate": 8.765239120137407e-06, + "loss": 0.3042, + "step": 2865 + }, + { + "epoch": 0.22928, + "grad_norm": 1.3919222354888916, + "learning_rate": 8.764411514859086e-06, + "loss": 0.3628, + "step": 2866 + }, + { + "epoch": 0.22936, + "grad_norm": 1.6965492963790894, + "learning_rate": 8.763583671419045e-06, + "loss": 0.3109, + "step": 2867 + }, + { + "epoch": 0.22944, + "grad_norm": 1.665880799293518, + "learning_rate": 8.762755589869655e-06, + "loss": 0.3084, + "step": 2868 + }, + { + "epoch": 0.22952, + "grad_norm": 1.277369737625122, + "learning_rate": 8.761927270263313e-06, + "loss": 0.2969, + "step": 2869 + }, + { + "epoch": 0.2296, + "grad_norm": 1.5863888263702393, + "learning_rate": 8.761098712652418e-06, + "loss": 0.342, + "step": 2870 + }, + { + "epoch": 0.22968, + "grad_norm": 1.9805275201797485, + "learning_rate": 8.760269917089392e-06, + "loss": 0.4074, + "step": 2871 + }, + { + "epoch": 0.22976, + "grad_norm": 1.5686451196670532, + "learning_rate": 8.75944088362667e-06, + "loss": 0.3272, + "step": 2872 + }, + { + "epoch": 0.22984, + "grad_norm": 1.4256458282470703, + "learning_rate": 8.758611612316704e-06, + "loss": 0.2976, + "step": 2873 + }, + { + "epoch": 0.22992, + "grad_norm": 1.0142706632614136, + "learning_rate": 8.757782103211958e-06, + "loss": 0.2594, + "step": 2874 + }, + { + "epoch": 0.23, + "grad_norm": 1.2783145904541016, + "learning_rate": 8.756952356364909e-06, + "loss": 0.3521, + "step": 2875 + }, + { + "epoch": 0.23008, + "grad_norm": 1.5316811800003052, + "learning_rate": 8.756122371828058e-06, + "loss": 0.3306, + "step": 2876 + }, + { + "epoch": 0.23016, + "grad_norm": 1.5814586877822876, + "learning_rate": 8.75529214965391e-06, + "loss": 0.2806, + "step": 2877 + }, + { + "epoch": 0.23024, + "grad_norm": 1.5847948789596558, + "learning_rate": 8.754461689894995e-06, + "loss": 0.4181, + "step": 2878 + }, + { + "epoch": 0.23032, + "grad_norm": 1.3571077585220337, + "learning_rate": 8.75363099260385e-06, + "loss": 0.3639, + "step": 2879 + }, + { + "epoch": 0.2304, + "grad_norm": 1.354897379875183, + "learning_rate": 8.752800057833033e-06, + "loss": 0.3231, + "step": 2880 + }, + { + "epoch": 0.23048, + "grad_norm": 2.0537118911743164, + "learning_rate": 8.751968885635115e-06, + "loss": 0.3662, + "step": 2881 + }, + { + "epoch": 0.23056, + "grad_norm": 1.585959792137146, + "learning_rate": 8.751137476062677e-06, + "loss": 0.392, + "step": 2882 + }, + { + "epoch": 0.23064, + "grad_norm": 2.2385966777801514, + "learning_rate": 8.750305829168321e-06, + "loss": 0.4388, + "step": 2883 + }, + { + "epoch": 0.23072, + "grad_norm": 1.3772207498550415, + "learning_rate": 8.749473945004665e-06, + "loss": 0.3189, + "step": 2884 + }, + { + "epoch": 0.2308, + "grad_norm": 1.298640251159668, + "learning_rate": 8.74864182362434e-06, + "loss": 0.2753, + "step": 2885 + }, + { + "epoch": 0.23088, + "grad_norm": 1.3288183212280273, + "learning_rate": 8.747809465079988e-06, + "loss": 0.297, + "step": 2886 + }, + { + "epoch": 0.23096, + "grad_norm": 1.2760984897613525, + "learning_rate": 8.746976869424272e-06, + "loss": 0.2857, + "step": 2887 + }, + { + "epoch": 0.23104, + "grad_norm": 1.6551486253738403, + "learning_rate": 8.746144036709867e-06, + "loss": 0.3029, + "step": 2888 + }, + { + "epoch": 0.23112, + "grad_norm": 1.5374372005462646, + "learning_rate": 8.745310966989463e-06, + "loss": 0.3214, + "step": 2889 + }, + { + "epoch": 0.2312, + "grad_norm": 2.0101284980773926, + "learning_rate": 8.744477660315767e-06, + "loss": 0.3779, + "step": 2890 + }, + { + "epoch": 0.23128, + "grad_norm": 1.6186890602111816, + "learning_rate": 8.743644116741497e-06, + "loss": 0.3475, + "step": 2891 + }, + { + "epoch": 0.23136, + "grad_norm": 1.7061872482299805, + "learning_rate": 8.742810336319391e-06, + "loss": 0.3468, + "step": 2892 + }, + { + "epoch": 0.23144, + "grad_norm": 1.7789872884750366, + "learning_rate": 8.741976319102198e-06, + "loss": 0.4333, + "step": 2893 + }, + { + "epoch": 0.23152, + "grad_norm": 1.6100202798843384, + "learning_rate": 8.741142065142683e-06, + "loss": 0.39, + "step": 2894 + }, + { + "epoch": 0.2316, + "grad_norm": 1.5869725942611694, + "learning_rate": 8.74030757449363e-06, + "loss": 0.3075, + "step": 2895 + }, + { + "epoch": 0.23168, + "grad_norm": 1.3948324918746948, + "learning_rate": 8.73947284720783e-06, + "loss": 0.2776, + "step": 2896 + }, + { + "epoch": 0.23176, + "grad_norm": 2.0415258407592773, + "learning_rate": 8.738637883338097e-06, + "loss": 0.3917, + "step": 2897 + }, + { + "epoch": 0.23184, + "grad_norm": 1.994744062423706, + "learning_rate": 8.737802682937253e-06, + "loss": 0.407, + "step": 2898 + }, + { + "epoch": 0.23192, + "grad_norm": 1.5355931520462036, + "learning_rate": 8.736967246058139e-06, + "loss": 0.3687, + "step": 2899 + }, + { + "epoch": 0.232, + "grad_norm": 1.7060344219207764, + "learning_rate": 8.736131572753614e-06, + "loss": 0.326, + "step": 2900 + }, + { + "epoch": 0.23208, + "grad_norm": 1.27048921585083, + "learning_rate": 8.735295663076545e-06, + "loss": 0.2716, + "step": 2901 + }, + { + "epoch": 0.23216, + "grad_norm": 1.5977073907852173, + "learning_rate": 8.734459517079815e-06, + "loss": 0.3342, + "step": 2902 + }, + { + "epoch": 0.23224, + "grad_norm": 2.4220876693725586, + "learning_rate": 8.733623134816329e-06, + "loss": 0.4348, + "step": 2903 + }, + { + "epoch": 0.23232, + "grad_norm": 1.7467727661132812, + "learning_rate": 8.732786516339e-06, + "loss": 0.346, + "step": 2904 + }, + { + "epoch": 0.2324, + "grad_norm": 1.4733282327651978, + "learning_rate": 8.731949661700759e-06, + "loss": 0.3384, + "step": 2905 + }, + { + "epoch": 0.23248, + "grad_norm": 1.3760995864868164, + "learning_rate": 8.731112570954547e-06, + "loss": 0.3029, + "step": 2906 + }, + { + "epoch": 0.23256, + "grad_norm": 1.9718873500823975, + "learning_rate": 8.73027524415333e-06, + "loss": 0.395, + "step": 2907 + }, + { + "epoch": 0.23264, + "grad_norm": 1.4080829620361328, + "learning_rate": 8.729437681350078e-06, + "loss": 0.4309, + "step": 2908 + }, + { + "epoch": 0.23272, + "grad_norm": 1.7443673610687256, + "learning_rate": 8.728599882597784e-06, + "loss": 0.373, + "step": 2909 + }, + { + "epoch": 0.2328, + "grad_norm": 1.4104093313217163, + "learning_rate": 8.72776184794945e-06, + "loss": 0.2877, + "step": 2910 + }, + { + "epoch": 0.23288, + "grad_norm": 1.8802305459976196, + "learning_rate": 8.726923577458097e-06, + "loss": 0.2943, + "step": 2911 + }, + { + "epoch": 0.23296, + "grad_norm": 1.4886568784713745, + "learning_rate": 8.726085071176761e-06, + "loss": 0.3797, + "step": 2912 + }, + { + "epoch": 0.23304, + "grad_norm": 1.259099006652832, + "learning_rate": 8.72524632915849e-06, + "loss": 0.3082, + "step": 2913 + }, + { + "epoch": 0.23312, + "grad_norm": 1.6729494333267212, + "learning_rate": 8.724407351456348e-06, + "loss": 0.3901, + "step": 2914 + }, + { + "epoch": 0.2332, + "grad_norm": 2.0305817127227783, + "learning_rate": 8.723568138123414e-06, + "loss": 0.6386, + "step": 2915 + }, + { + "epoch": 0.23328, + "grad_norm": 1.870473027229309, + "learning_rate": 8.722728689212785e-06, + "loss": 0.4489, + "step": 2916 + }, + { + "epoch": 0.23336, + "grad_norm": 1.5763081312179565, + "learning_rate": 8.721889004777566e-06, + "loss": 0.3733, + "step": 2917 + }, + { + "epoch": 0.23344, + "grad_norm": 1.4396545886993408, + "learning_rate": 8.721049084870883e-06, + "loss": 0.3985, + "step": 2918 + }, + { + "epoch": 0.23352, + "grad_norm": 1.4830892086029053, + "learning_rate": 8.720208929545876e-06, + "loss": 0.2998, + "step": 2919 + }, + { + "epoch": 0.2336, + "grad_norm": 1.6304142475128174, + "learning_rate": 8.719368538855699e-06, + "loss": 0.4229, + "step": 2920 + }, + { + "epoch": 0.23368, + "grad_norm": 1.3104946613311768, + "learning_rate": 8.718527912853518e-06, + "loss": 0.2697, + "step": 2921 + }, + { + "epoch": 0.23376, + "grad_norm": 1.498120665550232, + "learning_rate": 8.717687051592518e-06, + "loss": 0.3604, + "step": 2922 + }, + { + "epoch": 0.23384, + "grad_norm": 2.0636587142944336, + "learning_rate": 8.716845955125899e-06, + "loss": 0.3904, + "step": 2923 + }, + { + "epoch": 0.23392, + "grad_norm": 1.651100516319275, + "learning_rate": 8.716004623506872e-06, + "loss": 0.3936, + "step": 2924 + }, + { + "epoch": 0.234, + "grad_norm": 1.8705461025238037, + "learning_rate": 8.715163056788666e-06, + "loss": 0.4764, + "step": 2925 + }, + { + "epoch": 0.23408, + "grad_norm": 1.6736024618148804, + "learning_rate": 8.714321255024525e-06, + "loss": 0.3558, + "step": 2926 + }, + { + "epoch": 0.23416, + "grad_norm": 1.4450464248657227, + "learning_rate": 8.713479218267707e-06, + "loss": 0.3201, + "step": 2927 + }, + { + "epoch": 0.23424, + "grad_norm": 1.6315438747406006, + "learning_rate": 8.712636946571484e-06, + "loss": 0.3774, + "step": 2928 + }, + { + "epoch": 0.23432, + "grad_norm": 2.0019845962524414, + "learning_rate": 8.711794439989142e-06, + "loss": 0.3845, + "step": 2929 + }, + { + "epoch": 0.2344, + "grad_norm": 1.9739534854888916, + "learning_rate": 8.710951698573987e-06, + "loss": 0.3859, + "step": 2930 + }, + { + "epoch": 0.23448, + "grad_norm": 1.6644498109817505, + "learning_rate": 8.710108722379335e-06, + "loss": 0.3413, + "step": 2931 + }, + { + "epoch": 0.23456, + "grad_norm": 1.7164329290390015, + "learning_rate": 8.709265511458518e-06, + "loss": 0.3603, + "step": 2932 + }, + { + "epoch": 0.23464, + "grad_norm": 1.5060267448425293, + "learning_rate": 8.708422065864884e-06, + "loss": 0.3308, + "step": 2933 + }, + { + "epoch": 0.23472, + "grad_norm": 1.650381326675415, + "learning_rate": 8.707578385651795e-06, + "loss": 0.3483, + "step": 2934 + }, + { + "epoch": 0.2348, + "grad_norm": 1.5644901990890503, + "learning_rate": 8.706734470872624e-06, + "loss": 0.2845, + "step": 2935 + }, + { + "epoch": 0.23488, + "grad_norm": 1.9320694208145142, + "learning_rate": 8.705890321580768e-06, + "loss": 0.3227, + "step": 2936 + }, + { + "epoch": 0.23496, + "grad_norm": 1.542862892150879, + "learning_rate": 8.70504593782963e-06, + "loss": 0.2855, + "step": 2937 + }, + { + "epoch": 0.23504, + "grad_norm": 1.6313329935073853, + "learning_rate": 8.704201319672635e-06, + "loss": 0.3229, + "step": 2938 + }, + { + "epoch": 0.23512, + "grad_norm": 2.1460344791412354, + "learning_rate": 8.703356467163214e-06, + "loss": 0.5213, + "step": 2939 + }, + { + "epoch": 0.2352, + "grad_norm": 1.938667893409729, + "learning_rate": 8.702511380354822e-06, + "loss": 0.4258, + "step": 2940 + }, + { + "epoch": 0.23528, + "grad_norm": 1.58048415184021, + "learning_rate": 8.701666059300924e-06, + "loss": 0.3398, + "step": 2941 + }, + { + "epoch": 0.23536, + "grad_norm": 1.4565937519073486, + "learning_rate": 8.700820504055e-06, + "loss": 0.3913, + "step": 2942 + }, + { + "epoch": 0.23544, + "grad_norm": 1.457868218421936, + "learning_rate": 8.699974714670544e-06, + "loss": 0.3067, + "step": 2943 + }, + { + "epoch": 0.23552, + "grad_norm": 1.6874083280563354, + "learning_rate": 8.699128691201071e-06, + "loss": 0.3866, + "step": 2944 + }, + { + "epoch": 0.2356, + "grad_norm": 1.3682913780212402, + "learning_rate": 8.698282433700102e-06, + "loss": 0.2925, + "step": 2945 + }, + { + "epoch": 0.23568, + "grad_norm": 1.2929877042770386, + "learning_rate": 8.697435942221178e-06, + "loss": 0.2813, + "step": 2946 + }, + { + "epoch": 0.23576, + "grad_norm": 1.2776330709457397, + "learning_rate": 8.696589216817852e-06, + "loss": 0.2464, + "step": 2947 + }, + { + "epoch": 0.23584, + "grad_norm": 1.6202374696731567, + "learning_rate": 8.695742257543697e-06, + "loss": 0.3886, + "step": 2948 + }, + { + "epoch": 0.23592, + "grad_norm": 1.200810432434082, + "learning_rate": 8.694895064452294e-06, + "loss": 0.3024, + "step": 2949 + }, + { + "epoch": 0.236, + "grad_norm": 1.6930382251739502, + "learning_rate": 8.694047637597245e-06, + "loss": 0.3178, + "step": 2950 + }, + { + "epoch": 0.23608, + "grad_norm": 1.5066219568252563, + "learning_rate": 8.693199977032161e-06, + "loss": 0.322, + "step": 2951 + }, + { + "epoch": 0.23616, + "grad_norm": 1.304224967956543, + "learning_rate": 8.692352082810673e-06, + "loss": 0.3451, + "step": 2952 + }, + { + "epoch": 0.23624, + "grad_norm": 1.763218641281128, + "learning_rate": 8.691503954986422e-06, + "loss": 0.4435, + "step": 2953 + }, + { + "epoch": 0.23632, + "grad_norm": 1.316672444343567, + "learning_rate": 8.690655593613068e-06, + "loss": 0.3426, + "step": 2954 + }, + { + "epoch": 0.2364, + "grad_norm": 1.6360975503921509, + "learning_rate": 8.689806998744284e-06, + "loss": 0.3094, + "step": 2955 + }, + { + "epoch": 0.23648, + "grad_norm": 1.7397687435150146, + "learning_rate": 8.688958170433757e-06, + "loss": 0.4201, + "step": 2956 + }, + { + "epoch": 0.23656, + "grad_norm": 1.3750033378601074, + "learning_rate": 8.68810910873519e-06, + "loss": 0.2625, + "step": 2957 + }, + { + "epoch": 0.23664, + "grad_norm": 1.3705966472625732, + "learning_rate": 8.687259813702301e-06, + "loss": 0.3107, + "step": 2958 + }, + { + "epoch": 0.23672, + "grad_norm": 1.367472529411316, + "learning_rate": 8.686410285388818e-06, + "loss": 0.2897, + "step": 2959 + }, + { + "epoch": 0.2368, + "grad_norm": 1.3883439302444458, + "learning_rate": 8.685560523848494e-06, + "loss": 0.3076, + "step": 2960 + }, + { + "epoch": 0.23688, + "grad_norm": 1.2411656379699707, + "learning_rate": 8.684710529135088e-06, + "loss": 0.3676, + "step": 2961 + }, + { + "epoch": 0.23696, + "grad_norm": 1.4780466556549072, + "learning_rate": 8.683860301302373e-06, + "loss": 0.2878, + "step": 2962 + }, + { + "epoch": 0.23704, + "grad_norm": 1.2274631261825562, + "learning_rate": 8.683009840404145e-06, + "loss": 0.3174, + "step": 2963 + }, + { + "epoch": 0.23712, + "grad_norm": 1.504817247390747, + "learning_rate": 8.682159146494208e-06, + "loss": 0.291, + "step": 2964 + }, + { + "epoch": 0.2372, + "grad_norm": 1.6749637126922607, + "learning_rate": 8.681308219626381e-06, + "loss": 0.3828, + "step": 2965 + }, + { + "epoch": 0.23728, + "grad_norm": 1.835526943206787, + "learning_rate": 8.680457059854502e-06, + "loss": 0.3636, + "step": 2966 + }, + { + "epoch": 0.23736, + "grad_norm": 1.364960789680481, + "learning_rate": 8.679605667232421e-06, + "loss": 0.3038, + "step": 2967 + }, + { + "epoch": 0.23744, + "grad_norm": 1.8709944486618042, + "learning_rate": 8.678754041813996e-06, + "loss": 0.4753, + "step": 2968 + }, + { + "epoch": 0.23752, + "grad_norm": 1.5966829061508179, + "learning_rate": 8.677902183653117e-06, + "loss": 0.3359, + "step": 2969 + }, + { + "epoch": 0.2376, + "grad_norm": 1.4219958782196045, + "learning_rate": 8.677050092803671e-06, + "loss": 0.2726, + "step": 2970 + }, + { + "epoch": 0.23768, + "grad_norm": 1.6112638711929321, + "learning_rate": 8.67619776931957e-06, + "loss": 0.4072, + "step": 2971 + }, + { + "epoch": 0.23776, + "grad_norm": 1.330270767211914, + "learning_rate": 8.675345213254739e-06, + "loss": 0.3817, + "step": 2972 + }, + { + "epoch": 0.23784, + "grad_norm": 1.7107291221618652, + "learning_rate": 8.67449242466311e-06, + "loss": 0.3458, + "step": 2973 + }, + { + "epoch": 0.23792, + "grad_norm": 1.2672098875045776, + "learning_rate": 8.67363940359864e-06, + "loss": 0.2455, + "step": 2974 + }, + { + "epoch": 0.238, + "grad_norm": 1.589351773262024, + "learning_rate": 8.6727861501153e-06, + "loss": 0.3935, + "step": 2975 + }, + { + "epoch": 0.23808, + "grad_norm": 1.4220237731933594, + "learning_rate": 8.67193266426707e-06, + "loss": 0.3488, + "step": 2976 + }, + { + "epoch": 0.23816, + "grad_norm": 1.346812129020691, + "learning_rate": 8.671078946107942e-06, + "loss": 0.2996, + "step": 2977 + }, + { + "epoch": 0.23824, + "grad_norm": 1.5532695055007935, + "learning_rate": 8.670224995691937e-06, + "loss": 0.2827, + "step": 2978 + }, + { + "epoch": 0.23832, + "grad_norm": 1.6328967809677124, + "learning_rate": 8.669370813073076e-06, + "loss": 0.3164, + "step": 2979 + }, + { + "epoch": 0.2384, + "grad_norm": 1.4983983039855957, + "learning_rate": 8.6685163983054e-06, + "loss": 0.3014, + "step": 2980 + }, + { + "epoch": 0.23848, + "grad_norm": 1.8855558633804321, + "learning_rate": 8.667661751442967e-06, + "loss": 0.3764, + "step": 2981 + }, + { + "epoch": 0.23856, + "grad_norm": 1.2733697891235352, + "learning_rate": 8.666806872539848e-06, + "loss": 0.2521, + "step": 2982 + }, + { + "epoch": 0.23864, + "grad_norm": 1.7222651243209839, + "learning_rate": 8.665951761650126e-06, + "loss": 0.451, + "step": 2983 + }, + { + "epoch": 0.23872, + "grad_norm": 1.8912527561187744, + "learning_rate": 8.665096418827902e-06, + "loss": 0.4282, + "step": 2984 + }, + { + "epoch": 0.2388, + "grad_norm": 1.8017947673797607, + "learning_rate": 8.664240844127294e-06, + "loss": 0.414, + "step": 2985 + }, + { + "epoch": 0.23888, + "grad_norm": 1.7017481327056885, + "learning_rate": 8.663385037602425e-06, + "loss": 0.3998, + "step": 2986 + }, + { + "epoch": 0.23896, + "grad_norm": 1.7164137363433838, + "learning_rate": 8.662528999307445e-06, + "loss": 0.4701, + "step": 2987 + }, + { + "epoch": 0.23904, + "grad_norm": 1.832329273223877, + "learning_rate": 8.661672729296508e-06, + "loss": 0.368, + "step": 2988 + }, + { + "epoch": 0.23912, + "grad_norm": 1.7810895442962646, + "learning_rate": 8.660816227623791e-06, + "loss": 0.3759, + "step": 2989 + }, + { + "epoch": 0.2392, + "grad_norm": 1.597602128982544, + "learning_rate": 8.65995949434348e-06, + "loss": 0.3215, + "step": 2990 + }, + { + "epoch": 0.23928, + "grad_norm": 1.6633431911468506, + "learning_rate": 8.659102529509777e-06, + "loss": 0.3171, + "step": 2991 + }, + { + "epoch": 0.23936, + "grad_norm": 1.16764497756958, + "learning_rate": 8.6582453331769e-06, + "loss": 0.2655, + "step": 2992 + }, + { + "epoch": 0.23944, + "grad_norm": 1.6470662355422974, + "learning_rate": 8.657387905399085e-06, + "loss": 0.3766, + "step": 2993 + }, + { + "epoch": 0.23952, + "grad_norm": 1.707756519317627, + "learning_rate": 8.65653024623057e-06, + "loss": 0.3319, + "step": 2994 + }, + { + "epoch": 0.2396, + "grad_norm": 1.4462788105010986, + "learning_rate": 8.655672355725624e-06, + "loss": 0.3719, + "step": 2995 + }, + { + "epoch": 0.23968, + "grad_norm": 1.4448779821395874, + "learning_rate": 8.65481423393852e-06, + "loss": 0.3045, + "step": 2996 + }, + { + "epoch": 0.23976, + "grad_norm": 1.3186711072921753, + "learning_rate": 8.653955880923548e-06, + "loss": 0.2988, + "step": 2997 + }, + { + "epoch": 0.23984, + "grad_norm": 1.8978686332702637, + "learning_rate": 8.653097296735013e-06, + "loss": 0.4307, + "step": 2998 + }, + { + "epoch": 0.23992, + "grad_norm": 1.8591535091400146, + "learning_rate": 8.652238481427236e-06, + "loss": 0.4093, + "step": 2999 + }, + { + "epoch": 0.24, + "grad_norm": 1.7549595832824707, + "learning_rate": 8.65137943505455e-06, + "loss": 0.3208, + "step": 3000 + }, + { + "epoch": 0.24008, + "grad_norm": 1.3385145664215088, + "learning_rate": 8.650520157671305e-06, + "loss": 0.3473, + "step": 3001 + }, + { + "epoch": 0.24016, + "grad_norm": 1.6653660535812378, + "learning_rate": 8.649660649331866e-06, + "loss": 0.4244, + "step": 3002 + }, + { + "epoch": 0.24024, + "grad_norm": 1.7460511922836304, + "learning_rate": 8.648800910090607e-06, + "loss": 0.4825, + "step": 3003 + }, + { + "epoch": 0.24032, + "grad_norm": 1.650795817375183, + "learning_rate": 8.647940940001925e-06, + "loss": 0.3525, + "step": 3004 + }, + { + "epoch": 0.2404, + "grad_norm": 1.116976261138916, + "learning_rate": 8.647080739120224e-06, + "loss": 0.2398, + "step": 3005 + }, + { + "epoch": 0.24048, + "grad_norm": 2.5539703369140625, + "learning_rate": 8.64622030749993e-06, + "loss": 0.5062, + "step": 3006 + }, + { + "epoch": 0.24056, + "grad_norm": 1.4344453811645508, + "learning_rate": 8.645359645195475e-06, + "loss": 0.4, + "step": 3007 + }, + { + "epoch": 0.24064, + "grad_norm": 1.8538168668746948, + "learning_rate": 8.644498752261314e-06, + "loss": 0.3884, + "step": 3008 + }, + { + "epoch": 0.24072, + "grad_norm": 1.4812930822372437, + "learning_rate": 8.643637628751912e-06, + "loss": 0.379, + "step": 3009 + }, + { + "epoch": 0.2408, + "grad_norm": 2.2699289321899414, + "learning_rate": 8.642776274721747e-06, + "loss": 0.4227, + "step": 3010 + }, + { + "epoch": 0.24088, + "grad_norm": 1.7083232402801514, + "learning_rate": 8.64191469022532e-06, + "loss": 0.4064, + "step": 3011 + }, + { + "epoch": 0.24096, + "grad_norm": 1.5670645236968994, + "learning_rate": 8.641052875317134e-06, + "loss": 0.4143, + "step": 3012 + }, + { + "epoch": 0.24104, + "grad_norm": 1.5622729063034058, + "learning_rate": 8.640190830051714e-06, + "loss": 0.3417, + "step": 3013 + }, + { + "epoch": 0.24112, + "grad_norm": 1.4791356325149536, + "learning_rate": 8.639328554483602e-06, + "loss": 0.3159, + "step": 3014 + }, + { + "epoch": 0.2412, + "grad_norm": 1.7441151142120361, + "learning_rate": 8.63846604866735e-06, + "loss": 0.3908, + "step": 3015 + }, + { + "epoch": 0.24128, + "grad_norm": 1.3369641304016113, + "learning_rate": 8.637603312657523e-06, + "loss": 0.3106, + "step": 3016 + }, + { + "epoch": 0.24136, + "grad_norm": 1.8151953220367432, + "learning_rate": 8.636740346508708e-06, + "loss": 0.3418, + "step": 3017 + }, + { + "epoch": 0.24144, + "grad_norm": 1.3240052461624146, + "learning_rate": 8.635877150275498e-06, + "loss": 0.2562, + "step": 3018 + }, + { + "epoch": 0.24152, + "grad_norm": 1.6432435512542725, + "learning_rate": 8.63501372401251e-06, + "loss": 0.4784, + "step": 3019 + }, + { + "epoch": 0.2416, + "grad_norm": 1.8531707525253296, + "learning_rate": 8.634150067774363e-06, + "loss": 0.4161, + "step": 3020 + }, + { + "epoch": 0.24168, + "grad_norm": 1.6908509731292725, + "learning_rate": 8.633286181615701e-06, + "loss": 0.3549, + "step": 3021 + }, + { + "epoch": 0.24176, + "grad_norm": 1.5330978631973267, + "learning_rate": 8.632422065591181e-06, + "loss": 0.3241, + "step": 3022 + }, + { + "epoch": 0.24184, + "grad_norm": 1.633192777633667, + "learning_rate": 8.63155771975547e-06, + "loss": 0.3212, + "step": 3023 + }, + { + "epoch": 0.24192, + "grad_norm": 1.6121563911437988, + "learning_rate": 8.630693144163255e-06, + "loss": 0.3842, + "step": 3024 + }, + { + "epoch": 0.242, + "grad_norm": 1.674614667892456, + "learning_rate": 8.629828338869232e-06, + "loss": 0.3152, + "step": 3025 + }, + { + "epoch": 0.24208, + "grad_norm": 1.8287391662597656, + "learning_rate": 8.628963303928115e-06, + "loss": 0.5112, + "step": 3026 + }, + { + "epoch": 0.24216, + "grad_norm": 1.853173017501831, + "learning_rate": 8.628098039394632e-06, + "loss": 0.3848, + "step": 3027 + }, + { + "epoch": 0.24224, + "grad_norm": 1.7024120092391968, + "learning_rate": 8.627232545323527e-06, + "loss": 0.3737, + "step": 3028 + }, + { + "epoch": 0.24232, + "grad_norm": 1.4395607709884644, + "learning_rate": 8.626366821769556e-06, + "loss": 0.3088, + "step": 3029 + }, + { + "epoch": 0.2424, + "grad_norm": 1.6949063539505005, + "learning_rate": 8.625500868787488e-06, + "loss": 0.4567, + "step": 3030 + }, + { + "epoch": 0.24248, + "grad_norm": 2.09765887260437, + "learning_rate": 8.624634686432112e-06, + "loss": 0.2887, + "step": 3031 + }, + { + "epoch": 0.24256, + "grad_norm": 1.5703338384628296, + "learning_rate": 8.623768274758228e-06, + "loss": 0.3327, + "step": 3032 + }, + { + "epoch": 0.24264, + "grad_norm": 1.6307145357131958, + "learning_rate": 8.62290163382065e-06, + "loss": 0.3305, + "step": 3033 + }, + { + "epoch": 0.24272, + "grad_norm": 1.5243542194366455, + "learning_rate": 8.622034763674207e-06, + "loss": 0.3326, + "step": 3034 + }, + { + "epoch": 0.2428, + "grad_norm": 1.9055639505386353, + "learning_rate": 8.621167664373746e-06, + "loss": 0.4019, + "step": 3035 + }, + { + "epoch": 0.24288, + "grad_norm": 1.4484702348709106, + "learning_rate": 8.620300335974122e-06, + "loss": 0.2865, + "step": 3036 + }, + { + "epoch": 0.24296, + "grad_norm": 1.721147894859314, + "learning_rate": 8.61943277853021e-06, + "loss": 0.4413, + "step": 3037 + }, + { + "epoch": 0.24304, + "grad_norm": 1.7040313482284546, + "learning_rate": 8.618564992096896e-06, + "loss": 0.2903, + "step": 3038 + }, + { + "epoch": 0.24312, + "grad_norm": 1.6943026781082153, + "learning_rate": 8.617696976729082e-06, + "loss": 0.3307, + "step": 3039 + }, + { + "epoch": 0.2432, + "grad_norm": 1.560989260673523, + "learning_rate": 8.616828732481687e-06, + "loss": 0.4324, + "step": 3040 + }, + { + "epoch": 0.24328, + "grad_norm": 1.2807221412658691, + "learning_rate": 8.61596025940964e-06, + "loss": 0.3329, + "step": 3041 + }, + { + "epoch": 0.24336, + "grad_norm": 1.504556655883789, + "learning_rate": 8.615091557567884e-06, + "loss": 0.303, + "step": 3042 + }, + { + "epoch": 0.24344, + "grad_norm": 1.388826847076416, + "learning_rate": 8.614222627011382e-06, + "loss": 0.3595, + "step": 3043 + }, + { + "epoch": 0.24352, + "grad_norm": 1.3632359504699707, + "learning_rate": 8.613353467795108e-06, + "loss": 0.2863, + "step": 3044 + }, + { + "epoch": 0.2436, + "grad_norm": 1.6326030492782593, + "learning_rate": 8.612484079974053e-06, + "loss": 0.3433, + "step": 3045 + }, + { + "epoch": 0.24368, + "grad_norm": 1.4086010456085205, + "learning_rate": 8.611614463603215e-06, + "loss": 0.4176, + "step": 3046 + }, + { + "epoch": 0.24376, + "grad_norm": 1.719990611076355, + "learning_rate": 8.610744618737614e-06, + "loss": 0.4066, + "step": 3047 + }, + { + "epoch": 0.24384, + "grad_norm": 1.6791647672653198, + "learning_rate": 8.609874545432285e-06, + "loss": 0.3398, + "step": 3048 + }, + { + "epoch": 0.24392, + "grad_norm": 1.7768301963806152, + "learning_rate": 8.60900424374227e-06, + "loss": 0.4071, + "step": 3049 + }, + { + "epoch": 0.244, + "grad_norm": 1.4605388641357422, + "learning_rate": 8.608133713722631e-06, + "loss": 0.3426, + "step": 3050 + }, + { + "epoch": 0.24408, + "grad_norm": 1.4044535160064697, + "learning_rate": 8.607262955428449e-06, + "loss": 0.3431, + "step": 3051 + }, + { + "epoch": 0.24416, + "grad_norm": 1.5170437097549438, + "learning_rate": 8.606391968914807e-06, + "loss": 0.3865, + "step": 3052 + }, + { + "epoch": 0.24424, + "grad_norm": 1.5312057733535767, + "learning_rate": 8.60552075423681e-06, + "loss": 0.391, + "step": 3053 + }, + { + "epoch": 0.24432, + "grad_norm": 2.062711715698242, + "learning_rate": 8.604649311449583e-06, + "loss": 0.5652, + "step": 3054 + }, + { + "epoch": 0.2444, + "grad_norm": 1.4739586114883423, + "learning_rate": 8.603777640608254e-06, + "loss": 0.3014, + "step": 3055 + }, + { + "epoch": 0.24448, + "grad_norm": 1.3225152492523193, + "learning_rate": 8.60290574176797e-06, + "loss": 0.311, + "step": 3056 + }, + { + "epoch": 0.24456, + "grad_norm": 1.7784897089004517, + "learning_rate": 8.602033614983898e-06, + "loss": 0.3726, + "step": 3057 + }, + { + "epoch": 0.24464, + "grad_norm": 1.7232739925384521, + "learning_rate": 8.60116126031121e-06, + "loss": 0.3162, + "step": 3058 + }, + { + "epoch": 0.24472, + "grad_norm": 1.1945064067840576, + "learning_rate": 8.6002886778051e-06, + "loss": 0.2356, + "step": 3059 + }, + { + "epoch": 0.2448, + "grad_norm": 1.5598511695861816, + "learning_rate": 8.59941586752077e-06, + "loss": 0.3743, + "step": 3060 + }, + { + "epoch": 0.24488, + "grad_norm": 1.2169926166534424, + "learning_rate": 8.598542829513444e-06, + "loss": 0.2614, + "step": 3061 + }, + { + "epoch": 0.24496, + "grad_norm": 1.6568950414657593, + "learning_rate": 8.597669563838353e-06, + "loss": 0.3015, + "step": 3062 + }, + { + "epoch": 0.24504, + "grad_norm": 1.6429921388626099, + "learning_rate": 8.596796070550746e-06, + "loss": 0.3532, + "step": 3063 + }, + { + "epoch": 0.24512, + "grad_norm": 1.6444114446640015, + "learning_rate": 8.595922349705886e-06, + "loss": 0.4508, + "step": 3064 + }, + { + "epoch": 0.2452, + "grad_norm": 1.5652337074279785, + "learning_rate": 8.595048401359052e-06, + "loss": 0.2787, + "step": 3065 + }, + { + "epoch": 0.24528, + "grad_norm": 2.0153632164001465, + "learning_rate": 8.594174225565535e-06, + "loss": 0.3774, + "step": 3066 + }, + { + "epoch": 0.24536, + "grad_norm": 1.5180140733718872, + "learning_rate": 8.59329982238064e-06, + "loss": 0.3183, + "step": 3067 + }, + { + "epoch": 0.24544, + "grad_norm": 1.8348703384399414, + "learning_rate": 8.592425191859687e-06, + "loss": 0.4139, + "step": 3068 + }, + { + "epoch": 0.24552, + "grad_norm": 1.6924543380737305, + "learning_rate": 8.591550334058015e-06, + "loss": 0.3641, + "step": 3069 + }, + { + "epoch": 0.2456, + "grad_norm": 1.5413908958435059, + "learning_rate": 8.59067524903097e-06, + "loss": 0.3124, + "step": 3070 + }, + { + "epoch": 0.24568, + "grad_norm": 1.7611223459243774, + "learning_rate": 8.589799936833916e-06, + "loss": 0.397, + "step": 3071 + }, + { + "epoch": 0.24576, + "grad_norm": 1.5739551782608032, + "learning_rate": 8.588924397522231e-06, + "loss": 0.2906, + "step": 3072 + }, + { + "epoch": 0.24584, + "grad_norm": 1.6537935733795166, + "learning_rate": 8.58804863115131e-06, + "loss": 0.3932, + "step": 3073 + }, + { + "epoch": 0.24592, + "grad_norm": 1.7853214740753174, + "learning_rate": 8.587172637776558e-06, + "loss": 0.3289, + "step": 3074 + }, + { + "epoch": 0.246, + "grad_norm": 1.2469711303710938, + "learning_rate": 8.586296417453393e-06, + "loss": 0.3285, + "step": 3075 + }, + { + "epoch": 0.24608, + "grad_norm": 1.7327982187271118, + "learning_rate": 8.585419970237255e-06, + "loss": 0.5739, + "step": 3076 + }, + { + "epoch": 0.24616, + "grad_norm": 1.298710823059082, + "learning_rate": 8.584543296183591e-06, + "loss": 0.2513, + "step": 3077 + }, + { + "epoch": 0.24624, + "grad_norm": 1.2274887561798096, + "learning_rate": 8.583666395347869e-06, + "loss": 0.2949, + "step": 3078 + }, + { + "epoch": 0.24632, + "grad_norm": 1.9516730308532715, + "learning_rate": 8.582789267785563e-06, + "loss": 0.4306, + "step": 3079 + }, + { + "epoch": 0.2464, + "grad_norm": 1.424416422843933, + "learning_rate": 8.58191191355217e-06, + "loss": 0.3194, + "step": 3080 + }, + { + "epoch": 0.24648, + "grad_norm": 1.8527755737304688, + "learning_rate": 8.581034332703194e-06, + "loss": 0.3808, + "step": 3081 + }, + { + "epoch": 0.24656, + "grad_norm": 1.4947212934494019, + "learning_rate": 8.58015652529416e-06, + "loss": 0.2791, + "step": 3082 + }, + { + "epoch": 0.24664, + "grad_norm": 1.3371108770370483, + "learning_rate": 8.579278491380598e-06, + "loss": 0.2267, + "step": 3083 + }, + { + "epoch": 0.24672, + "grad_norm": 1.5755013227462769, + "learning_rate": 8.578400231018064e-06, + "loss": 0.3123, + "step": 3084 + }, + { + "epoch": 0.2468, + "grad_norm": 1.653031349182129, + "learning_rate": 8.577521744262123e-06, + "loss": 0.4084, + "step": 3085 + }, + { + "epoch": 0.24688, + "grad_norm": 1.827981948852539, + "learning_rate": 8.576643031168349e-06, + "loss": 0.3714, + "step": 3086 + }, + { + "epoch": 0.24696, + "grad_norm": 1.6476799249649048, + "learning_rate": 8.575764091792339e-06, + "loss": 0.2948, + "step": 3087 + }, + { + "epoch": 0.24704, + "grad_norm": 1.5807143449783325, + "learning_rate": 8.574884926189699e-06, + "loss": 0.4274, + "step": 3088 + }, + { + "epoch": 0.24712, + "grad_norm": 1.801981806755066, + "learning_rate": 8.57400553441605e-06, + "loss": 0.4747, + "step": 3089 + }, + { + "epoch": 0.2472, + "grad_norm": 1.1952424049377441, + "learning_rate": 8.573125916527031e-06, + "loss": 0.2432, + "step": 3090 + }, + { + "epoch": 0.24728, + "grad_norm": 1.6386176347732544, + "learning_rate": 8.572246072578292e-06, + "loss": 0.3894, + "step": 3091 + }, + { + "epoch": 0.24736, + "grad_norm": 1.36851966381073, + "learning_rate": 8.571366002625495e-06, + "loss": 0.2686, + "step": 3092 + }, + { + "epoch": 0.24744, + "grad_norm": 1.3977208137512207, + "learning_rate": 8.570485706724322e-06, + "loss": 0.3077, + "step": 3093 + }, + { + "epoch": 0.24752, + "grad_norm": 1.9915634393692017, + "learning_rate": 8.569605184930466e-06, + "loss": 0.3775, + "step": 3094 + }, + { + "epoch": 0.2476, + "grad_norm": 1.7627999782562256, + "learning_rate": 8.568724437299631e-06, + "loss": 0.3882, + "step": 3095 + }, + { + "epoch": 0.24768, + "grad_norm": 1.5895828008651733, + "learning_rate": 8.567843463887543e-06, + "loss": 0.2905, + "step": 3096 + }, + { + "epoch": 0.24776, + "grad_norm": 1.7350879907608032, + "learning_rate": 8.566962264749938e-06, + "loss": 0.3652, + "step": 3097 + }, + { + "epoch": 0.24784, + "grad_norm": 1.7022144794464111, + "learning_rate": 8.566080839942566e-06, + "loss": 0.3455, + "step": 3098 + }, + { + "epoch": 0.24792, + "grad_norm": 1.8170467615127563, + "learning_rate": 8.565199189521189e-06, + "loss": 0.3867, + "step": 3099 + }, + { + "epoch": 0.248, + "grad_norm": 1.3393418788909912, + "learning_rate": 8.56431731354159e-06, + "loss": 0.2957, + "step": 3100 + }, + { + "epoch": 0.24808, + "grad_norm": 1.4289052486419678, + "learning_rate": 8.563435212059561e-06, + "loss": 0.3263, + "step": 3101 + }, + { + "epoch": 0.24816, + "grad_norm": 1.3951023817062378, + "learning_rate": 8.56255288513091e-06, + "loss": 0.2906, + "step": 3102 + }, + { + "epoch": 0.24824, + "grad_norm": 1.709173560142517, + "learning_rate": 8.561670332811458e-06, + "loss": 0.2987, + "step": 3103 + }, + { + "epoch": 0.24832, + "grad_norm": 2.1143503189086914, + "learning_rate": 8.56078755515704e-06, + "loss": 0.4867, + "step": 3104 + }, + { + "epoch": 0.2484, + "grad_norm": 1.202492356300354, + "learning_rate": 8.55990455222351e-06, + "loss": 0.2769, + "step": 3105 + }, + { + "epoch": 0.24848, + "grad_norm": 1.666141390800476, + "learning_rate": 8.559021324066728e-06, + "loss": 0.3279, + "step": 3106 + }, + { + "epoch": 0.24856, + "grad_norm": 1.3046966791152954, + "learning_rate": 8.558137870742578e-06, + "loss": 0.3452, + "step": 3107 + }, + { + "epoch": 0.24864, + "grad_norm": 1.476283073425293, + "learning_rate": 8.557254192306948e-06, + "loss": 0.3225, + "step": 3108 + }, + { + "epoch": 0.24872, + "grad_norm": 1.7318915128707886, + "learning_rate": 8.55637028881575e-06, + "loss": 0.3913, + "step": 3109 + }, + { + "epoch": 0.2488, + "grad_norm": 1.5203393697738647, + "learning_rate": 8.555486160324902e-06, + "loss": 0.3077, + "step": 3110 + }, + { + "epoch": 0.24888, + "grad_norm": 1.610252857208252, + "learning_rate": 8.554601806890342e-06, + "loss": 0.3474, + "step": 3111 + }, + { + "epoch": 0.24896, + "grad_norm": 1.8684039115905762, + "learning_rate": 8.553717228568018e-06, + "loss": 0.361, + "step": 3112 + }, + { + "epoch": 0.24904, + "grad_norm": 1.8766143321990967, + "learning_rate": 8.552832425413897e-06, + "loss": 0.3453, + "step": 3113 + }, + { + "epoch": 0.24912, + "grad_norm": 1.7946739196777344, + "learning_rate": 8.551947397483957e-06, + "loss": 0.4859, + "step": 3114 + }, + { + "epoch": 0.2492, + "grad_norm": 1.4358093738555908, + "learning_rate": 8.551062144834189e-06, + "loss": 0.3495, + "step": 3115 + }, + { + "epoch": 0.24928, + "grad_norm": 1.4946582317352295, + "learning_rate": 8.5501766675206e-06, + "loss": 0.361, + "step": 3116 + }, + { + "epoch": 0.24936, + "grad_norm": 1.362602710723877, + "learning_rate": 8.549290965599214e-06, + "loss": 0.3362, + "step": 3117 + }, + { + "epoch": 0.24944, + "grad_norm": 1.9473742246627808, + "learning_rate": 8.548405039126064e-06, + "loss": 0.4263, + "step": 3118 + }, + { + "epoch": 0.24952, + "grad_norm": 1.4761608839035034, + "learning_rate": 8.5475188881572e-06, + "loss": 0.3398, + "step": 3119 + }, + { + "epoch": 0.2496, + "grad_norm": 1.3862004280090332, + "learning_rate": 8.546632512748685e-06, + "loss": 0.361, + "step": 3120 + }, + { + "epoch": 0.24968, + "grad_norm": 1.507232904434204, + "learning_rate": 8.5457459129566e-06, + "loss": 0.2987, + "step": 3121 + }, + { + "epoch": 0.24976, + "grad_norm": 1.8091908693313599, + "learning_rate": 8.544859088837034e-06, + "loss": 0.3306, + "step": 3122 + }, + { + "epoch": 0.24984, + "grad_norm": 1.4990782737731934, + "learning_rate": 8.543972040446094e-06, + "loss": 0.2953, + "step": 3123 + }, + { + "epoch": 0.24992, + "grad_norm": 1.6368714570999146, + "learning_rate": 8.543084767839903e-06, + "loss": 0.2681, + "step": 3124 + }, + { + "epoch": 0.25, + "grad_norm": 1.792515516281128, + "learning_rate": 8.542197271074593e-06, + "loss": 0.3714, + "step": 3125 + }, + { + "epoch": 0.25008, + "grad_norm": 1.680100679397583, + "learning_rate": 8.541309550206314e-06, + "loss": 0.4091, + "step": 3126 + }, + { + "epoch": 0.25016, + "grad_norm": 1.2095801830291748, + "learning_rate": 8.540421605291228e-06, + "loss": 0.2428, + "step": 3127 + }, + { + "epoch": 0.25024, + "grad_norm": 1.3905729055404663, + "learning_rate": 8.539533436385515e-06, + "loss": 0.2728, + "step": 3128 + }, + { + "epoch": 0.25032, + "grad_norm": 1.7047653198242188, + "learning_rate": 8.538645043545364e-06, + "loss": 0.3631, + "step": 3129 + }, + { + "epoch": 0.2504, + "grad_norm": 1.328326940536499, + "learning_rate": 8.537756426826981e-06, + "loss": 0.2764, + "step": 3130 + }, + { + "epoch": 0.25048, + "grad_norm": 1.6265760660171509, + "learning_rate": 8.536867586286586e-06, + "loss": 0.3045, + "step": 3131 + }, + { + "epoch": 0.25056, + "grad_norm": 1.6370584964752197, + "learning_rate": 8.535978521980414e-06, + "loss": 0.3782, + "step": 3132 + }, + { + "epoch": 0.25064, + "grad_norm": 1.4649031162261963, + "learning_rate": 8.535089233964712e-06, + "loss": 0.3088, + "step": 3133 + }, + { + "epoch": 0.25072, + "grad_norm": 1.2999662160873413, + "learning_rate": 8.534199722295744e-06, + "loss": 0.2541, + "step": 3134 + }, + { + "epoch": 0.2508, + "grad_norm": 1.5978385210037231, + "learning_rate": 8.533309987029783e-06, + "loss": 0.3905, + "step": 3135 + }, + { + "epoch": 0.25088, + "grad_norm": 1.4170856475830078, + "learning_rate": 8.532420028223122e-06, + "loss": 0.3053, + "step": 3136 + }, + { + "epoch": 0.25096, + "grad_norm": 1.6570608615875244, + "learning_rate": 8.531529845932066e-06, + "loss": 0.3299, + "step": 3137 + }, + { + "epoch": 0.25104, + "grad_norm": 1.5469874143600464, + "learning_rate": 8.530639440212934e-06, + "loss": 0.3261, + "step": 3138 + }, + { + "epoch": 0.25112, + "grad_norm": 1.5680407285690308, + "learning_rate": 8.529748811122056e-06, + "loss": 0.3176, + "step": 3139 + }, + { + "epoch": 0.2512, + "grad_norm": 1.4539591073989868, + "learning_rate": 8.528857958715783e-06, + "loss": 0.3613, + "step": 3140 + }, + { + "epoch": 0.25128, + "grad_norm": 1.8934903144836426, + "learning_rate": 8.527966883050474e-06, + "loss": 0.445, + "step": 3141 + }, + { + "epoch": 0.25136, + "grad_norm": 1.4876171350479126, + "learning_rate": 8.527075584182505e-06, + "loss": 0.2891, + "step": 3142 + }, + { + "epoch": 0.25144, + "grad_norm": 1.2558088302612305, + "learning_rate": 8.526184062168264e-06, + "loss": 0.3297, + "step": 3143 + }, + { + "epoch": 0.25152, + "grad_norm": 1.8519783020019531, + "learning_rate": 8.525292317064157e-06, + "loss": 0.3826, + "step": 3144 + }, + { + "epoch": 0.2516, + "grad_norm": 1.5815104246139526, + "learning_rate": 8.524400348926602e-06, + "loss": 0.3588, + "step": 3145 + }, + { + "epoch": 0.25168, + "grad_norm": 1.5304956436157227, + "learning_rate": 8.523508157812029e-06, + "loss": 0.3735, + "step": 3146 + }, + { + "epoch": 0.25176, + "grad_norm": 1.5469937324523926, + "learning_rate": 8.522615743776885e-06, + "loss": 0.3096, + "step": 3147 + }, + { + "epoch": 0.25184, + "grad_norm": 1.3774428367614746, + "learning_rate": 8.52172310687763e-06, + "loss": 0.248, + "step": 3148 + }, + { + "epoch": 0.25192, + "grad_norm": 1.1401811838150024, + "learning_rate": 8.520830247170735e-06, + "loss": 0.2834, + "step": 3149 + }, + { + "epoch": 0.252, + "grad_norm": 1.574325680732727, + "learning_rate": 8.519937164712691e-06, + "loss": 0.3037, + "step": 3150 + }, + { + "epoch": 0.25208, + "grad_norm": 1.6470081806182861, + "learning_rate": 8.519043859560004e-06, + "loss": 0.3799, + "step": 3151 + }, + { + "epoch": 0.25216, + "grad_norm": 1.6649773120880127, + "learning_rate": 8.518150331769184e-06, + "loss": 0.3009, + "step": 3152 + }, + { + "epoch": 0.25224, + "grad_norm": 1.4246584177017212, + "learning_rate": 8.517256581396764e-06, + "loss": 0.3222, + "step": 3153 + }, + { + "epoch": 0.25232, + "grad_norm": 1.5069494247436523, + "learning_rate": 8.51636260849929e-06, + "loss": 0.3699, + "step": 3154 + }, + { + "epoch": 0.2524, + "grad_norm": 1.4292433261871338, + "learning_rate": 8.51546841313332e-06, + "loss": 0.4126, + "step": 3155 + }, + { + "epoch": 0.25248, + "grad_norm": 1.3793959617614746, + "learning_rate": 8.514573995355426e-06, + "loss": 0.2454, + "step": 3156 + }, + { + "epoch": 0.25256, + "grad_norm": 1.3602819442749023, + "learning_rate": 8.513679355222195e-06, + "loss": 0.3414, + "step": 3157 + }, + { + "epoch": 0.25264, + "grad_norm": 1.6190898418426514, + "learning_rate": 8.512784492790227e-06, + "loss": 0.3326, + "step": 3158 + }, + { + "epoch": 0.25272, + "grad_norm": 2.0170743465423584, + "learning_rate": 8.511889408116138e-06, + "loss": 0.3803, + "step": 3159 + }, + { + "epoch": 0.2528, + "grad_norm": 1.572218418121338, + "learning_rate": 8.510994101256557e-06, + "loss": 0.3277, + "step": 3160 + }, + { + "epoch": 0.25288, + "grad_norm": 1.3365064859390259, + "learning_rate": 8.510098572268129e-06, + "loss": 0.2734, + "step": 3161 + }, + { + "epoch": 0.25296, + "grad_norm": 1.7582436800003052, + "learning_rate": 8.509202821207508e-06, + "loss": 0.4216, + "step": 3162 + }, + { + "epoch": 0.25304, + "grad_norm": 1.665613055229187, + "learning_rate": 8.508306848131367e-06, + "loss": 0.3075, + "step": 3163 + }, + { + "epoch": 0.25312, + "grad_norm": 1.382347822189331, + "learning_rate": 8.50741065309639e-06, + "loss": 0.3831, + "step": 3164 + }, + { + "epoch": 0.2532, + "grad_norm": 1.4488331079483032, + "learning_rate": 8.506514236159276e-06, + "loss": 0.353, + "step": 3165 + }, + { + "epoch": 0.25328, + "grad_norm": 1.5848164558410645, + "learning_rate": 8.505617597376739e-06, + "loss": 0.4441, + "step": 3166 + }, + { + "epoch": 0.25336, + "grad_norm": 1.5356500148773193, + "learning_rate": 8.504720736805507e-06, + "loss": 0.3197, + "step": 3167 + }, + { + "epoch": 0.25344, + "grad_norm": 1.2928869724273682, + "learning_rate": 8.50382365450232e-06, + "loss": 0.2967, + "step": 3168 + }, + { + "epoch": 0.25352, + "grad_norm": 1.418664813041687, + "learning_rate": 8.502926350523937e-06, + "loss": 0.2878, + "step": 3169 + }, + { + "epoch": 0.2536, + "grad_norm": 2.2894961833953857, + "learning_rate": 8.502028824927123e-06, + "loss": 0.5619, + "step": 3170 + }, + { + "epoch": 0.25368, + "grad_norm": 1.8666496276855469, + "learning_rate": 8.501131077768664e-06, + "loss": 0.3883, + "step": 3171 + }, + { + "epoch": 0.25376, + "grad_norm": 1.9164226055145264, + "learning_rate": 8.500233109105354e-06, + "loss": 0.3319, + "step": 3172 + }, + { + "epoch": 0.25384, + "grad_norm": 1.4399014711380005, + "learning_rate": 8.499334918994008e-06, + "loss": 0.3641, + "step": 3173 + }, + { + "epoch": 0.25392, + "grad_norm": 1.7369312047958374, + "learning_rate": 8.498436507491452e-06, + "loss": 0.3709, + "step": 3174 + }, + { + "epoch": 0.254, + "grad_norm": 1.5380709171295166, + "learning_rate": 8.497537874654523e-06, + "loss": 0.3893, + "step": 3175 + }, + { + "epoch": 0.25408, + "grad_norm": 1.6948041915893555, + "learning_rate": 8.496639020540074e-06, + "loss": 0.3261, + "step": 3176 + }, + { + "epoch": 0.25416, + "grad_norm": 1.0962127447128296, + "learning_rate": 8.495739945204975e-06, + "loss": 0.2629, + "step": 3177 + }, + { + "epoch": 0.25424, + "grad_norm": 1.5826691389083862, + "learning_rate": 8.494840648706107e-06, + "loss": 0.4263, + "step": 3178 + }, + { + "epoch": 0.25432, + "grad_norm": 1.7018965482711792, + "learning_rate": 8.493941131100365e-06, + "loss": 0.471, + "step": 3179 + }, + { + "epoch": 0.2544, + "grad_norm": 2.042285680770874, + "learning_rate": 8.49304139244466e-06, + "loss": 0.4853, + "step": 3180 + }, + { + "epoch": 0.25448, + "grad_norm": 1.3213742971420288, + "learning_rate": 8.492141432795911e-06, + "loss": 0.2682, + "step": 3181 + }, + { + "epoch": 0.25456, + "grad_norm": 1.6487033367156982, + "learning_rate": 8.491241252211058e-06, + "loss": 0.3989, + "step": 3182 + }, + { + "epoch": 0.25464, + "grad_norm": 1.4631835222244263, + "learning_rate": 8.490340850747055e-06, + "loss": 0.3634, + "step": 3183 + }, + { + "epoch": 0.25472, + "grad_norm": 1.5799041986465454, + "learning_rate": 8.489440228460864e-06, + "loss": 0.4085, + "step": 3184 + }, + { + "epoch": 0.2548, + "grad_norm": 1.6044838428497314, + "learning_rate": 8.488539385409464e-06, + "loss": 0.3399, + "step": 3185 + }, + { + "epoch": 0.25488, + "grad_norm": 1.7360122203826904, + "learning_rate": 8.48763832164985e-06, + "loss": 0.3473, + "step": 3186 + }, + { + "epoch": 0.25496, + "grad_norm": 1.6504510641098022, + "learning_rate": 8.486737037239032e-06, + "loss": 0.3314, + "step": 3187 + }, + { + "epoch": 0.25504, + "grad_norm": 1.5512864589691162, + "learning_rate": 8.485835532234027e-06, + "loss": 0.3203, + "step": 3188 + }, + { + "epoch": 0.25512, + "grad_norm": 1.515463948249817, + "learning_rate": 8.48493380669187e-06, + "loss": 0.3878, + "step": 3189 + }, + { + "epoch": 0.2552, + "grad_norm": 1.3933261632919312, + "learning_rate": 8.484031860669612e-06, + "loss": 0.3515, + "step": 3190 + }, + { + "epoch": 0.25528, + "grad_norm": 1.280632495880127, + "learning_rate": 8.483129694224319e-06, + "loss": 0.3116, + "step": 3191 + }, + { + "epoch": 0.25536, + "grad_norm": 1.7904398441314697, + "learning_rate": 8.482227307413063e-06, + "loss": 0.4518, + "step": 3192 + }, + { + "epoch": 0.25544, + "grad_norm": 1.5730119943618774, + "learning_rate": 8.481324700292934e-06, + "loss": 0.3031, + "step": 3193 + }, + { + "epoch": 0.25552, + "grad_norm": 1.5001622438430786, + "learning_rate": 8.480421872921042e-06, + "loss": 0.2442, + "step": 3194 + }, + { + "epoch": 0.2556, + "grad_norm": 1.639999508857727, + "learning_rate": 8.479518825354504e-06, + "loss": 0.3146, + "step": 3195 + }, + { + "epoch": 0.25568, + "grad_norm": 1.664425253868103, + "learning_rate": 8.478615557650453e-06, + "loss": 0.2826, + "step": 3196 + }, + { + "epoch": 0.25576, + "grad_norm": 1.624760389328003, + "learning_rate": 8.477712069866033e-06, + "loss": 0.3324, + "step": 3197 + }, + { + "epoch": 0.25584, + "grad_norm": 1.328769564628601, + "learning_rate": 8.47680836205841e-06, + "loss": 0.296, + "step": 3198 + }, + { + "epoch": 0.25592, + "grad_norm": 1.3259910345077515, + "learning_rate": 8.475904434284752e-06, + "loss": 0.2968, + "step": 3199 + }, + { + "epoch": 0.256, + "grad_norm": 1.7926405668258667, + "learning_rate": 8.475000286602254e-06, + "loss": 0.447, + "step": 3200 + }, + { + "epoch": 0.25608, + "grad_norm": 1.6081920862197876, + "learning_rate": 8.47409591906811e-06, + "loss": 0.3946, + "step": 3201 + }, + { + "epoch": 0.25616, + "grad_norm": 1.7535005807876587, + "learning_rate": 8.473191331739547e-06, + "loss": 0.4252, + "step": 3202 + }, + { + "epoch": 0.25624, + "grad_norm": 1.5326111316680908, + "learning_rate": 8.472286524673787e-06, + "loss": 0.423, + "step": 3203 + }, + { + "epoch": 0.25632, + "grad_norm": 1.3280184268951416, + "learning_rate": 8.471381497928079e-06, + "loss": 0.2994, + "step": 3204 + }, + { + "epoch": 0.2564, + "grad_norm": 1.3200703859329224, + "learning_rate": 8.470476251559677e-06, + "loss": 0.3823, + "step": 3205 + }, + { + "epoch": 0.25648, + "grad_norm": 1.8249784708023071, + "learning_rate": 8.469570785625856e-06, + "loss": 0.4093, + "step": 3206 + }, + { + "epoch": 0.25656, + "grad_norm": 1.7080583572387695, + "learning_rate": 8.4686651001839e-06, + "loss": 0.4029, + "step": 3207 + }, + { + "epoch": 0.25664, + "grad_norm": 1.537379503250122, + "learning_rate": 8.467759195291108e-06, + "loss": 0.3258, + "step": 3208 + }, + { + "epoch": 0.25672, + "grad_norm": 1.783140778541565, + "learning_rate": 8.466853071004797e-06, + "loss": 0.3672, + "step": 3209 + }, + { + "epoch": 0.2568, + "grad_norm": 1.411805272102356, + "learning_rate": 8.465946727382293e-06, + "loss": 0.2902, + "step": 3210 + }, + { + "epoch": 0.25688, + "grad_norm": 1.7587497234344482, + "learning_rate": 8.465040164480934e-06, + "loss": 0.3364, + "step": 3211 + }, + { + "epoch": 0.25696, + "grad_norm": 2.055811882019043, + "learning_rate": 8.46413338235808e-06, + "loss": 0.4274, + "step": 3212 + }, + { + "epoch": 0.25704, + "grad_norm": 1.577392339706421, + "learning_rate": 8.463226381071095e-06, + "loss": 0.2775, + "step": 3213 + }, + { + "epoch": 0.25712, + "grad_norm": 1.8780300617218018, + "learning_rate": 8.462319160677368e-06, + "loss": 0.3771, + "step": 3214 + }, + { + "epoch": 0.2572, + "grad_norm": 1.4582608938217163, + "learning_rate": 8.461411721234292e-06, + "loss": 0.4151, + "step": 3215 + }, + { + "epoch": 0.25728, + "grad_norm": 1.8028523921966553, + "learning_rate": 8.460504062799277e-06, + "loss": 0.3596, + "step": 3216 + }, + { + "epoch": 0.25736, + "grad_norm": 1.2800477743148804, + "learning_rate": 8.459596185429751e-06, + "loss": 0.2512, + "step": 3217 + }, + { + "epoch": 0.25744, + "grad_norm": 1.3028494119644165, + "learning_rate": 8.458688089183147e-06, + "loss": 0.3026, + "step": 3218 + }, + { + "epoch": 0.25752, + "grad_norm": 1.5475552082061768, + "learning_rate": 8.457779774116924e-06, + "loss": 0.2819, + "step": 3219 + }, + { + "epoch": 0.2576, + "grad_norm": 1.6385855674743652, + "learning_rate": 8.456871240288542e-06, + "loss": 0.4, + "step": 3220 + }, + { + "epoch": 0.25768, + "grad_norm": 1.2522298097610474, + "learning_rate": 8.455962487755484e-06, + "loss": 0.3563, + "step": 3221 + }, + { + "epoch": 0.25776, + "grad_norm": 1.9725730419158936, + "learning_rate": 8.455053516575243e-06, + "loss": 0.3775, + "step": 3222 + }, + { + "epoch": 0.25784, + "grad_norm": 1.417087197303772, + "learning_rate": 8.454144326805328e-06, + "loss": 0.3495, + "step": 3223 + }, + { + "epoch": 0.25792, + "grad_norm": 1.4763423204421997, + "learning_rate": 8.453234918503257e-06, + "loss": 0.3413, + "step": 3224 + }, + { + "epoch": 0.258, + "grad_norm": 1.2496310472488403, + "learning_rate": 8.452325291726567e-06, + "loss": 0.2642, + "step": 3225 + }, + { + "epoch": 0.25808, + "grad_norm": 2.0870649814605713, + "learning_rate": 8.45141544653281e-06, + "loss": 0.4066, + "step": 3226 + }, + { + "epoch": 0.25816, + "grad_norm": 1.2118961811065674, + "learning_rate": 8.450505382979544e-06, + "loss": 0.2908, + "step": 3227 + }, + { + "epoch": 0.25824, + "grad_norm": 1.6424560546875, + "learning_rate": 8.449595101124349e-06, + "loss": 0.2963, + "step": 3228 + }, + { + "epoch": 0.25832, + "grad_norm": 1.646569013595581, + "learning_rate": 8.448684601024812e-06, + "loss": 0.3907, + "step": 3229 + }, + { + "epoch": 0.2584, + "grad_norm": 1.6771202087402344, + "learning_rate": 8.447773882738542e-06, + "loss": 0.3657, + "step": 3230 + }, + { + "epoch": 0.25848, + "grad_norm": 2.0412819385528564, + "learning_rate": 8.446862946323154e-06, + "loss": 0.4464, + "step": 3231 + }, + { + "epoch": 0.25856, + "grad_norm": 1.4120683670043945, + "learning_rate": 8.44595179183628e-06, + "loss": 0.3551, + "step": 3232 + }, + { + "epoch": 0.25864, + "grad_norm": 1.4363163709640503, + "learning_rate": 8.445040419335569e-06, + "loss": 0.3243, + "step": 3233 + }, + { + "epoch": 0.25872, + "grad_norm": 1.8272966146469116, + "learning_rate": 8.444128828878676e-06, + "loss": 0.3876, + "step": 3234 + }, + { + "epoch": 0.2588, + "grad_norm": 1.5481266975402832, + "learning_rate": 8.443217020523275e-06, + "loss": 0.4025, + "step": 3235 + }, + { + "epoch": 0.25888, + "grad_norm": 1.6646552085876465, + "learning_rate": 8.442304994327055e-06, + "loss": 0.3478, + "step": 3236 + }, + { + "epoch": 0.25896, + "grad_norm": 1.440777063369751, + "learning_rate": 8.441392750347716e-06, + "loss": 0.2645, + "step": 3237 + }, + { + "epoch": 0.25904, + "grad_norm": 1.4575594663619995, + "learning_rate": 8.440480288642972e-06, + "loss": 0.3257, + "step": 3238 + }, + { + "epoch": 0.25912, + "grad_norm": 1.6355897188186646, + "learning_rate": 8.439567609270554e-06, + "loss": 0.3219, + "step": 3239 + }, + { + "epoch": 0.2592, + "grad_norm": 1.4267311096191406, + "learning_rate": 8.438654712288202e-06, + "loss": 0.3609, + "step": 3240 + }, + { + "epoch": 0.25928, + "grad_norm": 1.911621332168579, + "learning_rate": 8.43774159775367e-06, + "loss": 0.4765, + "step": 3241 + }, + { + "epoch": 0.25936, + "grad_norm": 2.092682123184204, + "learning_rate": 8.43682826572473e-06, + "loss": 0.4551, + "step": 3242 + }, + { + "epoch": 0.25944, + "grad_norm": 1.7436370849609375, + "learning_rate": 8.435914716259166e-06, + "loss": 0.3244, + "step": 3243 + }, + { + "epoch": 0.25952, + "grad_norm": 1.278278112411499, + "learning_rate": 8.435000949414775e-06, + "loss": 0.2589, + "step": 3244 + }, + { + "epoch": 0.2596, + "grad_norm": 1.5885848999023438, + "learning_rate": 8.434086965249368e-06, + "loss": 0.445, + "step": 3245 + }, + { + "epoch": 0.25968, + "grad_norm": 1.544396996498108, + "learning_rate": 8.433172763820767e-06, + "loss": 0.406, + "step": 3246 + }, + { + "epoch": 0.25976, + "grad_norm": 1.5454270839691162, + "learning_rate": 8.432258345186815e-06, + "loss": 0.3159, + "step": 3247 + }, + { + "epoch": 0.25984, + "grad_norm": 1.3528130054473877, + "learning_rate": 8.43134370940536e-06, + "loss": 0.2816, + "step": 3248 + }, + { + "epoch": 0.25992, + "grad_norm": 1.8098217248916626, + "learning_rate": 8.430428856534271e-06, + "loss": 0.3795, + "step": 3249 + }, + { + "epoch": 0.26, + "grad_norm": 1.2492495775222778, + "learning_rate": 8.429513786631428e-06, + "loss": 0.2786, + "step": 3250 + }, + { + "epoch": 0.26008, + "grad_norm": 1.5359301567077637, + "learning_rate": 8.42859849975472e-06, + "loss": 0.3571, + "step": 3251 + }, + { + "epoch": 0.26016, + "grad_norm": 1.8178164958953857, + "learning_rate": 8.427682995962058e-06, + "loss": 0.41, + "step": 3252 + }, + { + "epoch": 0.26024, + "grad_norm": 1.6950284242630005, + "learning_rate": 8.426767275311361e-06, + "loss": 0.2952, + "step": 3253 + }, + { + "epoch": 0.26032, + "grad_norm": 1.3474059104919434, + "learning_rate": 8.425851337860566e-06, + "loss": 0.3301, + "step": 3254 + }, + { + "epoch": 0.2604, + "grad_norm": 1.5049055814743042, + "learning_rate": 8.42493518366762e-06, + "loss": 0.3448, + "step": 3255 + }, + { + "epoch": 0.26048, + "grad_norm": 1.99711275100708, + "learning_rate": 8.424018812790484e-06, + "loss": 0.3592, + "step": 3256 + }, + { + "epoch": 0.26056, + "grad_norm": 1.4437679052352905, + "learning_rate": 8.423102225287135e-06, + "loss": 0.3078, + "step": 3257 + }, + { + "epoch": 0.26064, + "grad_norm": 2.1142630577087402, + "learning_rate": 8.42218542121556e-06, + "loss": 0.5487, + "step": 3258 + }, + { + "epoch": 0.26072, + "grad_norm": 1.680812120437622, + "learning_rate": 8.421268400633766e-06, + "loss": 0.4302, + "step": 3259 + }, + { + "epoch": 0.2608, + "grad_norm": 1.1650234460830688, + "learning_rate": 8.420351163599768e-06, + "loss": 0.2241, + "step": 3260 + }, + { + "epoch": 0.26088, + "grad_norm": 1.5910313129425049, + "learning_rate": 8.419433710171596e-06, + "loss": 0.2969, + "step": 3261 + }, + { + "epoch": 0.26096, + "grad_norm": 1.5094048976898193, + "learning_rate": 8.418516040407295e-06, + "loss": 0.4483, + "step": 3262 + }, + { + "epoch": 0.26104, + "grad_norm": 1.2878527641296387, + "learning_rate": 8.417598154364923e-06, + "loss": 0.2714, + "step": 3263 + }, + { + "epoch": 0.26112, + "grad_norm": 1.6986520290374756, + "learning_rate": 8.416680052102549e-06, + "loss": 0.388, + "step": 3264 + }, + { + "epoch": 0.2612, + "grad_norm": 1.396224021911621, + "learning_rate": 8.415761733678262e-06, + "loss": 0.3414, + "step": 3265 + }, + { + "epoch": 0.26128, + "grad_norm": 1.9210736751556396, + "learning_rate": 8.414843199150159e-06, + "loss": 0.4595, + "step": 3266 + }, + { + "epoch": 0.26136, + "grad_norm": 1.2947478294372559, + "learning_rate": 8.413924448576351e-06, + "loss": 0.2695, + "step": 3267 + }, + { + "epoch": 0.26144, + "grad_norm": 1.2264394760131836, + "learning_rate": 8.41300548201497e-06, + "loss": 0.3017, + "step": 3268 + }, + { + "epoch": 0.26152, + "grad_norm": 1.5077235698699951, + "learning_rate": 8.412086299524149e-06, + "loss": 0.3803, + "step": 3269 + }, + { + "epoch": 0.2616, + "grad_norm": 1.603265643119812, + "learning_rate": 8.411166901162046e-06, + "loss": 0.3499, + "step": 3270 + }, + { + "epoch": 0.26168, + "grad_norm": 1.7666727304458618, + "learning_rate": 8.410247286986827e-06, + "loss": 0.3375, + "step": 3271 + }, + { + "epoch": 0.26176, + "grad_norm": 1.5865931510925293, + "learning_rate": 8.409327457056673e-06, + "loss": 0.3626, + "step": 3272 + }, + { + "epoch": 0.26184, + "grad_norm": 1.4339234828948975, + "learning_rate": 8.408407411429777e-06, + "loss": 0.3711, + "step": 3273 + }, + { + "epoch": 0.26192, + "grad_norm": 1.5342092514038086, + "learning_rate": 8.40748715016435e-06, + "loss": 0.3039, + "step": 3274 + }, + { + "epoch": 0.262, + "grad_norm": 1.602797269821167, + "learning_rate": 8.406566673318613e-06, + "loss": 0.3913, + "step": 3275 + }, + { + "epoch": 0.26208, + "grad_norm": 1.554508924484253, + "learning_rate": 8.405645980950799e-06, + "loss": 0.3879, + "step": 3276 + }, + { + "epoch": 0.26216, + "grad_norm": 1.9122850894927979, + "learning_rate": 8.40472507311916e-06, + "loss": 0.4212, + "step": 3277 + }, + { + "epoch": 0.26224, + "grad_norm": 1.8808695077896118, + "learning_rate": 8.403803949881958e-06, + "loss": 0.4648, + "step": 3278 + }, + { + "epoch": 0.26232, + "grad_norm": 1.2711478471755981, + "learning_rate": 8.402882611297469e-06, + "loss": 0.2795, + "step": 3279 + }, + { + "epoch": 0.2624, + "grad_norm": 1.6705055236816406, + "learning_rate": 8.401961057423984e-06, + "loss": 0.4656, + "step": 3280 + }, + { + "epoch": 0.26248, + "grad_norm": 1.4070430994033813, + "learning_rate": 8.401039288319805e-06, + "loss": 0.3143, + "step": 3281 + }, + { + "epoch": 0.26256, + "grad_norm": 1.4044073820114136, + "learning_rate": 8.400117304043252e-06, + "loss": 0.2966, + "step": 3282 + }, + { + "epoch": 0.26264, + "grad_norm": 1.5478609800338745, + "learning_rate": 8.399195104652652e-06, + "loss": 0.371, + "step": 3283 + }, + { + "epoch": 0.26272, + "grad_norm": 1.9816876649856567, + "learning_rate": 8.398272690206353e-06, + "loss": 0.3998, + "step": 3284 + }, + { + "epoch": 0.2628, + "grad_norm": 1.6786158084869385, + "learning_rate": 8.39735006076271e-06, + "loss": 0.3962, + "step": 3285 + }, + { + "epoch": 0.26288, + "grad_norm": 1.7463001012802124, + "learning_rate": 8.396427216380096e-06, + "loss": 0.3922, + "step": 3286 + }, + { + "epoch": 0.26296, + "grad_norm": 1.5134419202804565, + "learning_rate": 8.395504157116899e-06, + "loss": 0.2912, + "step": 3287 + }, + { + "epoch": 0.26304, + "grad_norm": 1.6220186948776245, + "learning_rate": 8.394580883031512e-06, + "loss": 0.4388, + "step": 3288 + }, + { + "epoch": 0.26312, + "grad_norm": 2.0490458011627197, + "learning_rate": 8.393657394182354e-06, + "loss": 0.524, + "step": 3289 + }, + { + "epoch": 0.2632, + "grad_norm": 1.5475538969039917, + "learning_rate": 8.392733690627846e-06, + "loss": 0.475, + "step": 3290 + }, + { + "epoch": 0.26328, + "grad_norm": 1.4094207286834717, + "learning_rate": 8.391809772426429e-06, + "loss": 0.3468, + "step": 3291 + }, + { + "epoch": 0.26336, + "grad_norm": 1.0840815305709839, + "learning_rate": 8.39088563963656e-06, + "loss": 0.2516, + "step": 3292 + }, + { + "epoch": 0.26344, + "grad_norm": 1.739927053451538, + "learning_rate": 8.389961292316699e-06, + "loss": 0.4647, + "step": 3293 + }, + { + "epoch": 0.26352, + "grad_norm": 1.712862491607666, + "learning_rate": 8.389036730525331e-06, + "loss": 0.3649, + "step": 3294 + }, + { + "epoch": 0.2636, + "grad_norm": 1.7053585052490234, + "learning_rate": 8.38811195432095e-06, + "loss": 0.4469, + "step": 3295 + }, + { + "epoch": 0.26368, + "grad_norm": 1.1163685321807861, + "learning_rate": 8.38718696376206e-06, + "loss": 0.2379, + "step": 3296 + }, + { + "epoch": 0.26376, + "grad_norm": 1.718429446220398, + "learning_rate": 8.386261758907187e-06, + "loss": 0.4064, + "step": 3297 + }, + { + "epoch": 0.26384, + "grad_norm": 1.4358896017074585, + "learning_rate": 8.38533633981486e-06, + "loss": 0.2584, + "step": 3298 + }, + { + "epoch": 0.26392, + "grad_norm": 1.6465083360671997, + "learning_rate": 8.384410706543632e-06, + "loss": 0.3218, + "step": 3299 + }, + { + "epoch": 0.264, + "grad_norm": 1.5639690160751343, + "learning_rate": 8.383484859152062e-06, + "loss": 0.4031, + "step": 3300 + }, + { + "epoch": 0.26408, + "grad_norm": 1.5496549606323242, + "learning_rate": 8.382558797698728e-06, + "loss": 0.3324, + "step": 3301 + }, + { + "epoch": 0.26416, + "grad_norm": 1.8462837934494019, + "learning_rate": 8.381632522242215e-06, + "loss": 0.3972, + "step": 3302 + }, + { + "epoch": 0.26424, + "grad_norm": 1.6575276851654053, + "learning_rate": 8.380706032841128e-06, + "loss": 0.4347, + "step": 3303 + }, + { + "epoch": 0.26432, + "grad_norm": 1.0943249464035034, + "learning_rate": 8.379779329554082e-06, + "loss": 0.2716, + "step": 3304 + }, + { + "epoch": 0.2644, + "grad_norm": 2.2363994121551514, + "learning_rate": 8.378852412439706e-06, + "loss": 0.3581, + "step": 3305 + }, + { + "epoch": 0.26448, + "grad_norm": 1.4077491760253906, + "learning_rate": 8.377925281556645e-06, + "loss": 0.2796, + "step": 3306 + }, + { + "epoch": 0.26456, + "grad_norm": 1.9200001955032349, + "learning_rate": 8.376997936963553e-06, + "loss": 0.431, + "step": 3307 + }, + { + "epoch": 0.26464, + "grad_norm": 1.444860577583313, + "learning_rate": 8.3760703787191e-06, + "loss": 0.2795, + "step": 3308 + }, + { + "epoch": 0.26472, + "grad_norm": 1.153676152229309, + "learning_rate": 8.375142606881973e-06, + "loss": 0.2588, + "step": 3309 + }, + { + "epoch": 0.2648, + "grad_norm": 2.0317533016204834, + "learning_rate": 8.374214621510866e-06, + "loss": 0.4834, + "step": 3310 + }, + { + "epoch": 0.26488, + "grad_norm": 1.2731088399887085, + "learning_rate": 8.373286422664489e-06, + "loss": 0.2232, + "step": 3311 + }, + { + "epoch": 0.26496, + "grad_norm": 1.506540298461914, + "learning_rate": 8.372358010401568e-06, + "loss": 0.3614, + "step": 3312 + }, + { + "epoch": 0.26504, + "grad_norm": 1.827217698097229, + "learning_rate": 8.371429384780839e-06, + "loss": 0.4667, + "step": 3313 + }, + { + "epoch": 0.26512, + "grad_norm": 1.9941977262496948, + "learning_rate": 8.370500545861053e-06, + "loss": 0.413, + "step": 3314 + }, + { + "epoch": 0.2652, + "grad_norm": 1.5264027118682861, + "learning_rate": 8.369571493700976e-06, + "loss": 0.3107, + "step": 3315 + }, + { + "epoch": 0.26528, + "grad_norm": 1.9555363655090332, + "learning_rate": 8.368642228359384e-06, + "loss": 0.3539, + "step": 3316 + }, + { + "epoch": 0.26536, + "grad_norm": 1.4721167087554932, + "learning_rate": 8.367712749895072e-06, + "loss": 0.3781, + "step": 3317 + }, + { + "epoch": 0.26544, + "grad_norm": 1.2694438695907593, + "learning_rate": 8.366783058366843e-06, + "loss": 0.279, + "step": 3318 + }, + { + "epoch": 0.26552, + "grad_norm": 1.8869990110397339, + "learning_rate": 8.365853153833511e-06, + "loss": 0.3966, + "step": 3319 + }, + { + "epoch": 0.2656, + "grad_norm": 1.7108945846557617, + "learning_rate": 8.364923036353915e-06, + "loss": 0.3471, + "step": 3320 + }, + { + "epoch": 0.26568, + "grad_norm": 1.5163145065307617, + "learning_rate": 8.3639927059869e-06, + "loss": 0.2747, + "step": 3321 + }, + { + "epoch": 0.26576, + "grad_norm": 1.5786588191986084, + "learning_rate": 8.36306216279132e-06, + "loss": 0.3084, + "step": 3322 + }, + { + "epoch": 0.26584, + "grad_norm": 1.5403141975402832, + "learning_rate": 8.362131406826049e-06, + "loss": 0.3665, + "step": 3323 + }, + { + "epoch": 0.26592, + "grad_norm": 1.248185634613037, + "learning_rate": 8.361200438149975e-06, + "loss": 0.2885, + "step": 3324 + }, + { + "epoch": 0.266, + "grad_norm": 1.7556192874908447, + "learning_rate": 8.360269256821994e-06, + "loss": 0.3611, + "step": 3325 + }, + { + "epoch": 0.26608, + "grad_norm": 1.7010356187820435, + "learning_rate": 8.359337862901023e-06, + "loss": 0.4007, + "step": 3326 + }, + { + "epoch": 0.26616, + "grad_norm": 1.3344570398330688, + "learning_rate": 8.358406256445985e-06, + "loss": 0.2836, + "step": 3327 + }, + { + "epoch": 0.26624, + "grad_norm": 1.2783137559890747, + "learning_rate": 8.357474437515819e-06, + "loss": 0.3394, + "step": 3328 + }, + { + "epoch": 0.26632, + "grad_norm": 1.2508137226104736, + "learning_rate": 8.356542406169481e-06, + "loss": 0.301, + "step": 3329 + }, + { + "epoch": 0.2664, + "grad_norm": 1.4016166925430298, + "learning_rate": 8.355610162465935e-06, + "loss": 0.2951, + "step": 3330 + }, + { + "epoch": 0.26648, + "grad_norm": 1.7911545038223267, + "learning_rate": 8.354677706464162e-06, + "loss": 0.4171, + "step": 3331 + }, + { + "epoch": 0.26656, + "grad_norm": 1.3079147338867188, + "learning_rate": 8.353745038223155e-06, + "loss": 0.2591, + "step": 3332 + }, + { + "epoch": 0.26664, + "grad_norm": 1.5401325225830078, + "learning_rate": 8.352812157801923e-06, + "loss": 0.3765, + "step": 3333 + }, + { + "epoch": 0.26672, + "grad_norm": 1.556061863899231, + "learning_rate": 8.351879065259484e-06, + "loss": 0.3329, + "step": 3334 + }, + { + "epoch": 0.2668, + "grad_norm": 1.2529242038726807, + "learning_rate": 8.35094576065487e-06, + "loss": 0.2728, + "step": 3335 + }, + { + "epoch": 0.26688, + "grad_norm": 1.6587908267974854, + "learning_rate": 8.350012244047132e-06, + "loss": 0.3042, + "step": 3336 + }, + { + "epoch": 0.26696, + "grad_norm": 1.5234625339508057, + "learning_rate": 8.349078515495327e-06, + "loss": 0.3707, + "step": 3337 + }, + { + "epoch": 0.26704, + "grad_norm": 1.4544942378997803, + "learning_rate": 8.348144575058531e-06, + "loss": 0.3395, + "step": 3338 + }, + { + "epoch": 0.26712, + "grad_norm": 1.9101989269256592, + "learning_rate": 8.347210422795831e-06, + "loss": 0.3822, + "step": 3339 + }, + { + "epoch": 0.2672, + "grad_norm": 1.5211460590362549, + "learning_rate": 8.346276058766327e-06, + "loss": 0.3944, + "step": 3340 + }, + { + "epoch": 0.26728, + "grad_norm": 1.3909361362457275, + "learning_rate": 8.345341483029134e-06, + "loss": 0.2667, + "step": 3341 + }, + { + "epoch": 0.26736, + "grad_norm": 1.3863193988800049, + "learning_rate": 8.344406695643378e-06, + "loss": 0.2917, + "step": 3342 + }, + { + "epoch": 0.26744, + "grad_norm": 1.6209913492202759, + "learning_rate": 8.343471696668202e-06, + "loss": 0.3255, + "step": 3343 + }, + { + "epoch": 0.26752, + "grad_norm": 1.6851791143417358, + "learning_rate": 8.342536486162758e-06, + "loss": 0.4018, + "step": 3344 + }, + { + "epoch": 0.2676, + "grad_norm": 1.5533232688903809, + "learning_rate": 8.341601064186215e-06, + "loss": 0.3628, + "step": 3345 + }, + { + "epoch": 0.26768, + "grad_norm": 1.481484055519104, + "learning_rate": 8.340665430797752e-06, + "loss": 0.3407, + "step": 3346 + }, + { + "epoch": 0.26776, + "grad_norm": 1.3775691986083984, + "learning_rate": 8.339729586056567e-06, + "loss": 0.3287, + "step": 3347 + }, + { + "epoch": 0.26784, + "grad_norm": 1.6615140438079834, + "learning_rate": 8.338793530021866e-06, + "loss": 0.4026, + "step": 3348 + }, + { + "epoch": 0.26792, + "grad_norm": 1.7780884504318237, + "learning_rate": 8.337857262752869e-06, + "loss": 0.3734, + "step": 3349 + }, + { + "epoch": 0.268, + "grad_norm": 1.2541722059249878, + "learning_rate": 8.336920784308814e-06, + "loss": 0.232, + "step": 3350 + }, + { + "epoch": 0.26808, + "grad_norm": 1.472259521484375, + "learning_rate": 8.335984094748944e-06, + "loss": 0.2674, + "step": 3351 + }, + { + "epoch": 0.26816, + "grad_norm": 1.723530888557434, + "learning_rate": 8.335047194132522e-06, + "loss": 0.3773, + "step": 3352 + }, + { + "epoch": 0.26824, + "grad_norm": 1.5939034223556519, + "learning_rate": 8.334110082518825e-06, + "loss": 0.4006, + "step": 3353 + }, + { + "epoch": 0.26832, + "grad_norm": 1.7733458280563354, + "learning_rate": 8.333172759967137e-06, + "loss": 0.3455, + "step": 3354 + }, + { + "epoch": 0.2684, + "grad_norm": 1.562965750694275, + "learning_rate": 8.332235226536762e-06, + "loss": 0.2928, + "step": 3355 + }, + { + "epoch": 0.26848, + "grad_norm": 1.6739492416381836, + "learning_rate": 8.331297482287016e-06, + "loss": 0.3414, + "step": 3356 + }, + { + "epoch": 0.26856, + "grad_norm": 1.6316815614700317, + "learning_rate": 8.330359527277224e-06, + "loss": 0.3977, + "step": 3357 + }, + { + "epoch": 0.26864, + "grad_norm": 1.0808318853378296, + "learning_rate": 8.329421361566724e-06, + "loss": 0.2338, + "step": 3358 + }, + { + "epoch": 0.26872, + "grad_norm": 1.3807647228240967, + "learning_rate": 8.328482985214879e-06, + "loss": 0.246, + "step": 3359 + }, + { + "epoch": 0.2688, + "grad_norm": 1.602565050125122, + "learning_rate": 8.327544398281053e-06, + "loss": 0.3755, + "step": 3360 + }, + { + "epoch": 0.26888, + "grad_norm": 1.6400891542434692, + "learning_rate": 8.326605600824625e-06, + "loss": 0.3009, + "step": 3361 + }, + { + "epoch": 0.26896, + "grad_norm": 1.5679333209991455, + "learning_rate": 8.325666592904993e-06, + "loss": 0.3595, + "step": 3362 + }, + { + "epoch": 0.26904, + "grad_norm": 1.984908103942871, + "learning_rate": 8.324727374581563e-06, + "loss": 0.4163, + "step": 3363 + }, + { + "epoch": 0.26912, + "grad_norm": 1.620694875717163, + "learning_rate": 8.323787945913755e-06, + "loss": 0.3158, + "step": 3364 + }, + { + "epoch": 0.2692, + "grad_norm": 1.4381059408187866, + "learning_rate": 8.322848306961007e-06, + "loss": 0.2875, + "step": 3365 + }, + { + "epoch": 0.26928, + "grad_norm": 2.2146494388580322, + "learning_rate": 8.321908457782764e-06, + "loss": 0.419, + "step": 3366 + }, + { + "epoch": 0.26936, + "grad_norm": 1.2770941257476807, + "learning_rate": 8.320968398438487e-06, + "loss": 0.3038, + "step": 3367 + }, + { + "epoch": 0.26944, + "grad_norm": 1.5118519067764282, + "learning_rate": 8.320028128987653e-06, + "loss": 0.3433, + "step": 3368 + }, + { + "epoch": 0.26952, + "grad_norm": 1.8498775959014893, + "learning_rate": 8.319087649489747e-06, + "loss": 0.3517, + "step": 3369 + }, + { + "epoch": 0.2696, + "grad_norm": 1.4128763675689697, + "learning_rate": 8.318146960004273e-06, + "loss": 0.2989, + "step": 3370 + }, + { + "epoch": 0.26968, + "grad_norm": 1.828460931777954, + "learning_rate": 8.31720606059074e-06, + "loss": 0.4235, + "step": 3371 + }, + { + "epoch": 0.26976, + "grad_norm": 1.5870622396469116, + "learning_rate": 8.316264951308682e-06, + "loss": 0.307, + "step": 3372 + }, + { + "epoch": 0.26984, + "grad_norm": 1.6679601669311523, + "learning_rate": 8.315323632217636e-06, + "loss": 0.3352, + "step": 3373 + }, + { + "epoch": 0.26992, + "grad_norm": 1.5351213216781616, + "learning_rate": 8.314382103377158e-06, + "loss": 0.3193, + "step": 3374 + }, + { + "epoch": 0.27, + "grad_norm": 1.365196704864502, + "learning_rate": 8.313440364846811e-06, + "loss": 0.2916, + "step": 3375 + }, + { + "epoch": 0.27008, + "grad_norm": 1.6591503620147705, + "learning_rate": 8.312498416686183e-06, + "loss": 0.3194, + "step": 3376 + }, + { + "epoch": 0.27016, + "grad_norm": 1.4480193853378296, + "learning_rate": 8.31155625895486e-06, + "loss": 0.3788, + "step": 3377 + }, + { + "epoch": 0.27024, + "grad_norm": 1.819912314414978, + "learning_rate": 8.310613891712455e-06, + "loss": 0.3924, + "step": 3378 + }, + { + "epoch": 0.27032, + "grad_norm": 1.4778650999069214, + "learning_rate": 8.309671315018587e-06, + "loss": 0.4356, + "step": 3379 + }, + { + "epoch": 0.2704, + "grad_norm": 1.509982705116272, + "learning_rate": 8.308728528932889e-06, + "loss": 0.3287, + "step": 3380 + }, + { + "epoch": 0.27048, + "grad_norm": 1.2625203132629395, + "learning_rate": 8.307785533515007e-06, + "loss": 0.2483, + "step": 3381 + }, + { + "epoch": 0.27056, + "grad_norm": 1.5584619045257568, + "learning_rate": 8.306842328824602e-06, + "loss": 0.3089, + "step": 3382 + }, + { + "epoch": 0.27064, + "grad_norm": 1.7761331796646118, + "learning_rate": 8.305898914921348e-06, + "loss": 0.3591, + "step": 3383 + }, + { + "epoch": 0.27072, + "grad_norm": 1.5493628978729248, + "learning_rate": 8.304955291864932e-06, + "loss": 0.3878, + "step": 3384 + }, + { + "epoch": 0.2708, + "grad_norm": 1.6428436040878296, + "learning_rate": 8.304011459715052e-06, + "loss": 0.3434, + "step": 3385 + }, + { + "epoch": 0.27088, + "grad_norm": 1.5427204370498657, + "learning_rate": 8.303067418531424e-06, + "loss": 0.3383, + "step": 3386 + }, + { + "epoch": 0.27096, + "grad_norm": 1.6140514612197876, + "learning_rate": 8.302123168373771e-06, + "loss": 0.3603, + "step": 3387 + }, + { + "epoch": 0.27104, + "grad_norm": 1.098120927810669, + "learning_rate": 8.301178709301833e-06, + "loss": 0.2703, + "step": 3388 + }, + { + "epoch": 0.27112, + "grad_norm": 1.3397828340530396, + "learning_rate": 8.300234041375366e-06, + "loss": 0.3006, + "step": 3389 + }, + { + "epoch": 0.2712, + "grad_norm": 2.0523359775543213, + "learning_rate": 8.299289164654132e-06, + "loss": 0.321, + "step": 3390 + }, + { + "epoch": 0.27128, + "grad_norm": 1.3592901229858398, + "learning_rate": 8.298344079197913e-06, + "loss": 0.2744, + "step": 3391 + }, + { + "epoch": 0.27136, + "grad_norm": 1.4901467561721802, + "learning_rate": 8.297398785066501e-06, + "loss": 0.4224, + "step": 3392 + }, + { + "epoch": 0.27144, + "grad_norm": 1.6656169891357422, + "learning_rate": 8.2964532823197e-06, + "loss": 0.3217, + "step": 3393 + }, + { + "epoch": 0.27152, + "grad_norm": 1.3976407051086426, + "learning_rate": 8.29550757101733e-06, + "loss": 0.3364, + "step": 3394 + }, + { + "epoch": 0.2716, + "grad_norm": 1.6198853254318237, + "learning_rate": 8.294561651219223e-06, + "loss": 0.4076, + "step": 3395 + }, + { + "epoch": 0.27168, + "grad_norm": 1.8720680475234985, + "learning_rate": 8.293615522985224e-06, + "loss": 0.3663, + "step": 3396 + }, + { + "epoch": 0.27176, + "grad_norm": 1.2774734497070312, + "learning_rate": 8.292669186375192e-06, + "loss": 0.3451, + "step": 3397 + }, + { + "epoch": 0.27184, + "grad_norm": 1.308482050895691, + "learning_rate": 8.291722641448995e-06, + "loss": 0.2712, + "step": 3398 + }, + { + "epoch": 0.27192, + "grad_norm": 1.2496081590652466, + "learning_rate": 8.290775888266525e-06, + "loss": 0.2932, + "step": 3399 + }, + { + "epoch": 0.272, + "grad_norm": 1.3944826126098633, + "learning_rate": 8.289828926887673e-06, + "loss": 0.2683, + "step": 3400 + }, + { + "epoch": 0.27208, + "grad_norm": 1.0699915885925293, + "learning_rate": 8.288881757372352e-06, + "loss": 0.2491, + "step": 3401 + }, + { + "epoch": 0.27216, + "grad_norm": 1.4618251323699951, + "learning_rate": 8.287934379780489e-06, + "loss": 0.3656, + "step": 3402 + }, + { + "epoch": 0.27224, + "grad_norm": 1.899168610572815, + "learning_rate": 8.286986794172017e-06, + "loss": 0.3084, + "step": 3403 + }, + { + "epoch": 0.27232, + "grad_norm": 1.8475021123886108, + "learning_rate": 8.286039000606889e-06, + "loss": 0.32, + "step": 3404 + }, + { + "epoch": 0.2724, + "grad_norm": 1.7992068529129028, + "learning_rate": 8.28509099914507e-06, + "loss": 0.3602, + "step": 3405 + }, + { + "epoch": 0.27248, + "grad_norm": 1.4878534078598022, + "learning_rate": 8.284142789846535e-06, + "loss": 0.3161, + "step": 3406 + }, + { + "epoch": 0.27256, + "grad_norm": 1.534593105316162, + "learning_rate": 8.283194372771274e-06, + "loss": 0.3459, + "step": 3407 + }, + { + "epoch": 0.27264, + "grad_norm": 1.5597976446151733, + "learning_rate": 8.28224574797929e-06, + "loss": 0.2994, + "step": 3408 + }, + { + "epoch": 0.27272, + "grad_norm": 1.2504432201385498, + "learning_rate": 8.281296915530602e-06, + "loss": 0.2811, + "step": 3409 + }, + { + "epoch": 0.2728, + "grad_norm": 1.392411231994629, + "learning_rate": 8.280347875485236e-06, + "loss": 0.2981, + "step": 3410 + }, + { + "epoch": 0.27288, + "grad_norm": 1.4544750452041626, + "learning_rate": 8.279398627903235e-06, + "loss": 0.2969, + "step": 3411 + }, + { + "epoch": 0.27296, + "grad_norm": 1.6616743803024292, + "learning_rate": 8.278449172844656e-06, + "loss": 0.3423, + "step": 3412 + }, + { + "epoch": 0.27304, + "grad_norm": 1.702860951423645, + "learning_rate": 8.27749951036957e-06, + "loss": 0.4418, + "step": 3413 + }, + { + "epoch": 0.27312, + "grad_norm": 1.229211449623108, + "learning_rate": 8.276549640538055e-06, + "loss": 0.2837, + "step": 3414 + }, + { + "epoch": 0.2732, + "grad_norm": 1.7604984045028687, + "learning_rate": 8.275599563410209e-06, + "loss": 0.3789, + "step": 3415 + }, + { + "epoch": 0.27328, + "grad_norm": 1.5034135580062866, + "learning_rate": 8.274649279046137e-06, + "loss": 0.2756, + "step": 3416 + }, + { + "epoch": 0.27336, + "grad_norm": 1.3147361278533936, + "learning_rate": 8.273698787505962e-06, + "loss": 0.2322, + "step": 3417 + }, + { + "epoch": 0.27344, + "grad_norm": 1.2072352170944214, + "learning_rate": 8.272748088849818e-06, + "loss": 0.2772, + "step": 3418 + }, + { + "epoch": 0.27352, + "grad_norm": 1.5700079202651978, + "learning_rate": 8.271797183137855e-06, + "loss": 0.3311, + "step": 3419 + }, + { + "epoch": 0.2736, + "grad_norm": 1.3132052421569824, + "learning_rate": 8.27084607043023e-06, + "loss": 0.271, + "step": 3420 + }, + { + "epoch": 0.27368, + "grad_norm": 1.235514760017395, + "learning_rate": 8.26989475078712e-06, + "loss": 0.2534, + "step": 3421 + }, + { + "epoch": 0.27376, + "grad_norm": 1.5095601081848145, + "learning_rate": 8.26894322426871e-06, + "loss": 0.294, + "step": 3422 + }, + { + "epoch": 0.27384, + "grad_norm": 1.3389074802398682, + "learning_rate": 8.267991490935199e-06, + "loss": 0.2859, + "step": 3423 + }, + { + "epoch": 0.27392, + "grad_norm": 1.3496683835983276, + "learning_rate": 8.267039550846802e-06, + "loss": 0.2212, + "step": 3424 + }, + { + "epoch": 0.274, + "grad_norm": 1.800253987312317, + "learning_rate": 8.266087404063743e-06, + "loss": 0.4047, + "step": 3425 + }, + { + "epoch": 0.27408, + "grad_norm": 1.3180961608886719, + "learning_rate": 8.265135050646262e-06, + "loss": 0.3051, + "step": 3426 + }, + { + "epoch": 0.27416, + "grad_norm": 1.7630568742752075, + "learning_rate": 8.264182490654614e-06, + "loss": 0.3703, + "step": 3427 + }, + { + "epoch": 0.27424, + "grad_norm": 1.518088459968567, + "learning_rate": 8.26322972414906e-06, + "loss": 0.3771, + "step": 3428 + }, + { + "epoch": 0.27432, + "grad_norm": 1.6900966167449951, + "learning_rate": 8.262276751189882e-06, + "loss": 0.3216, + "step": 3429 + }, + { + "epoch": 0.2744, + "grad_norm": 1.6469087600708008, + "learning_rate": 8.261323571837367e-06, + "loss": 0.3232, + "step": 3430 + }, + { + "epoch": 0.27448, + "grad_norm": 1.3444794416427612, + "learning_rate": 8.260370186151822e-06, + "loss": 0.2931, + "step": 3431 + }, + { + "epoch": 0.27456, + "grad_norm": 1.5142230987548828, + "learning_rate": 8.259416594193566e-06, + "loss": 0.3601, + "step": 3432 + }, + { + "epoch": 0.27464, + "grad_norm": 1.472582221031189, + "learning_rate": 8.25846279602293e-06, + "loss": 0.3059, + "step": 3433 + }, + { + "epoch": 0.27472, + "grad_norm": 1.7719913721084595, + "learning_rate": 8.257508791700253e-06, + "loss": 0.4288, + "step": 3434 + }, + { + "epoch": 0.2748, + "grad_norm": 1.7561125755310059, + "learning_rate": 8.256554581285895e-06, + "loss": 0.3458, + "step": 3435 + }, + { + "epoch": 0.27488, + "grad_norm": 1.25856614112854, + "learning_rate": 8.255600164840226e-06, + "loss": 0.2884, + "step": 3436 + }, + { + "epoch": 0.27496, + "grad_norm": 1.5106571912765503, + "learning_rate": 8.254645542423627e-06, + "loss": 0.314, + "step": 3437 + }, + { + "epoch": 0.27504, + "grad_norm": 1.3592396974563599, + "learning_rate": 8.253690714096494e-06, + "loss": 0.2891, + "step": 3438 + }, + { + "epoch": 0.27512, + "grad_norm": 1.3193227052688599, + "learning_rate": 8.252735679919238e-06, + "loss": 0.3048, + "step": 3439 + }, + { + "epoch": 0.2752, + "grad_norm": 1.72906494140625, + "learning_rate": 8.251780439952277e-06, + "loss": 0.3843, + "step": 3440 + }, + { + "epoch": 0.27528, + "grad_norm": 1.8231126070022583, + "learning_rate": 8.250824994256048e-06, + "loss": 0.4098, + "step": 3441 + }, + { + "epoch": 0.27536, + "grad_norm": 1.2340192794799805, + "learning_rate": 8.249869342891001e-06, + "loss": 0.245, + "step": 3442 + }, + { + "epoch": 0.27544, + "grad_norm": 1.667550802230835, + "learning_rate": 8.248913485917593e-06, + "loss": 0.4117, + "step": 3443 + }, + { + "epoch": 0.27552, + "grad_norm": 1.5927138328552246, + "learning_rate": 8.247957423396302e-06, + "loss": 0.4634, + "step": 3444 + }, + { + "epoch": 0.2756, + "grad_norm": 1.898964285850525, + "learning_rate": 8.24700115538761e-06, + "loss": 0.4804, + "step": 3445 + }, + { + "epoch": 0.27568, + "grad_norm": 1.5059139728546143, + "learning_rate": 8.246044681952022e-06, + "loss": 0.3067, + "step": 3446 + }, + { + "epoch": 0.27576, + "grad_norm": 1.2551331520080566, + "learning_rate": 8.245088003150047e-06, + "loss": 0.2362, + "step": 3447 + }, + { + "epoch": 0.27584, + "grad_norm": 1.5307579040527344, + "learning_rate": 8.244131119042211e-06, + "loss": 0.3315, + "step": 3448 + }, + { + "epoch": 0.27592, + "grad_norm": 1.7814452648162842, + "learning_rate": 8.243174029689055e-06, + "loss": 0.3452, + "step": 3449 + }, + { + "epoch": 0.276, + "grad_norm": 1.68953275680542, + "learning_rate": 8.242216735151131e-06, + "loss": 0.3785, + "step": 3450 + }, + { + "epoch": 0.27608, + "grad_norm": 1.7366198301315308, + "learning_rate": 8.241259235489001e-06, + "loss": 0.4651, + "step": 3451 + }, + { + "epoch": 0.27616, + "grad_norm": 1.8493014574050903, + "learning_rate": 8.240301530763244e-06, + "loss": 0.3389, + "step": 3452 + }, + { + "epoch": 0.27624, + "grad_norm": 1.3720508813858032, + "learning_rate": 8.239343621034452e-06, + "loss": 0.306, + "step": 3453 + }, + { + "epoch": 0.27632, + "grad_norm": 1.694554328918457, + "learning_rate": 8.23838550636323e-06, + "loss": 0.4108, + "step": 3454 + }, + { + "epoch": 0.2764, + "grad_norm": 1.6249841451644897, + "learning_rate": 8.23742718681019e-06, + "loss": 0.3531, + "step": 3455 + }, + { + "epoch": 0.27648, + "grad_norm": 1.7144601345062256, + "learning_rate": 8.236468662435964e-06, + "loss": 0.4359, + "step": 3456 + }, + { + "epoch": 0.27656, + "grad_norm": 1.3996644020080566, + "learning_rate": 8.235509933301197e-06, + "loss": 0.3071, + "step": 3457 + }, + { + "epoch": 0.27664, + "grad_norm": 1.3151053190231323, + "learning_rate": 8.234550999466542e-06, + "loss": 0.2705, + "step": 3458 + }, + { + "epoch": 0.27672, + "grad_norm": 1.8970770835876465, + "learning_rate": 8.233591860992667e-06, + "loss": 0.4059, + "step": 3459 + }, + { + "epoch": 0.2768, + "grad_norm": 1.183557391166687, + "learning_rate": 8.232632517940255e-06, + "loss": 0.226, + "step": 3460 + }, + { + "epoch": 0.27688, + "grad_norm": 2.460853099822998, + "learning_rate": 8.23167297037e-06, + "loss": 0.5838, + "step": 3461 + }, + { + "epoch": 0.27696, + "grad_norm": 1.3229994773864746, + "learning_rate": 8.230713218342611e-06, + "loss": 0.2763, + "step": 3462 + }, + { + "epoch": 0.27704, + "grad_norm": 1.5857011079788208, + "learning_rate": 8.229753261918805e-06, + "loss": 0.3172, + "step": 3463 + }, + { + "epoch": 0.27712, + "grad_norm": 1.5672333240509033, + "learning_rate": 8.228793101159318e-06, + "loss": 0.3418, + "step": 3464 + }, + { + "epoch": 0.2772, + "grad_norm": 1.766082763671875, + "learning_rate": 8.227832736124895e-06, + "loss": 0.4143, + "step": 3465 + }, + { + "epoch": 0.27728, + "grad_norm": 1.8570321798324585, + "learning_rate": 8.226872166876293e-06, + "loss": 0.4841, + "step": 3466 + }, + { + "epoch": 0.27736, + "grad_norm": 1.8656039237976074, + "learning_rate": 8.225911393474289e-06, + "loss": 0.4092, + "step": 3467 + }, + { + "epoch": 0.27744, + "grad_norm": 1.710142731666565, + "learning_rate": 8.224950415979664e-06, + "loss": 0.3453, + "step": 3468 + }, + { + "epoch": 0.27752, + "grad_norm": 2.0013701915740967, + "learning_rate": 8.223989234453215e-06, + "loss": 0.4788, + "step": 3469 + }, + { + "epoch": 0.2776, + "grad_norm": 1.8113597631454468, + "learning_rate": 8.223027848955757e-06, + "loss": 0.4269, + "step": 3470 + }, + { + "epoch": 0.27768, + "grad_norm": 1.8451036214828491, + "learning_rate": 8.222066259548111e-06, + "loss": 0.3931, + "step": 3471 + }, + { + "epoch": 0.27776, + "grad_norm": 2.501964569091797, + "learning_rate": 8.221104466291112e-06, + "loss": 0.5903, + "step": 3472 + }, + { + "epoch": 0.27784, + "grad_norm": 1.5321495532989502, + "learning_rate": 8.220142469245613e-06, + "loss": 0.3115, + "step": 3473 + }, + { + "epoch": 0.27792, + "grad_norm": 1.721560001373291, + "learning_rate": 8.219180268472476e-06, + "loss": 0.304, + "step": 3474 + }, + { + "epoch": 0.278, + "grad_norm": 1.3853156566619873, + "learning_rate": 8.218217864032572e-06, + "loss": 0.2682, + "step": 3475 + }, + { + "epoch": 0.27808, + "grad_norm": 1.5545319318771362, + "learning_rate": 8.217255255986794e-06, + "loss": 0.3532, + "step": 3476 + }, + { + "epoch": 0.27816, + "grad_norm": 1.3426564931869507, + "learning_rate": 8.216292444396038e-06, + "loss": 0.301, + "step": 3477 + }, + { + "epoch": 0.27824, + "grad_norm": 1.5732641220092773, + "learning_rate": 8.215329429321224e-06, + "loss": 0.4052, + "step": 3478 + }, + { + "epoch": 0.27832, + "grad_norm": 1.5441874265670776, + "learning_rate": 8.214366210823274e-06, + "loss": 0.2982, + "step": 3479 + }, + { + "epoch": 0.2784, + "grad_norm": 1.5156362056732178, + "learning_rate": 8.21340278896313e-06, + "loss": 0.4654, + "step": 3480 + }, + { + "epoch": 0.27848, + "grad_norm": 1.3966670036315918, + "learning_rate": 8.212439163801743e-06, + "loss": 0.2895, + "step": 3481 + }, + { + "epoch": 0.27856, + "grad_norm": 1.8723714351654053, + "learning_rate": 8.211475335400079e-06, + "loss": 0.3717, + "step": 3482 + }, + { + "epoch": 0.27864, + "grad_norm": 1.6649876832962036, + "learning_rate": 8.210511303819116e-06, + "loss": 0.3856, + "step": 3483 + }, + { + "epoch": 0.27872, + "grad_norm": 1.9745384454727173, + "learning_rate": 8.209547069119845e-06, + "loss": 0.4151, + "step": 3484 + }, + { + "epoch": 0.2788, + "grad_norm": 1.885003685951233, + "learning_rate": 8.20858263136327e-06, + "loss": 0.3486, + "step": 3485 + }, + { + "epoch": 0.27888, + "grad_norm": 1.5459332466125488, + "learning_rate": 8.20761799061041e-06, + "loss": 0.2786, + "step": 3486 + }, + { + "epoch": 0.27896, + "grad_norm": 0.8227094411849976, + "learning_rate": 8.20665314692229e-06, + "loss": 0.1874, + "step": 3487 + }, + { + "epoch": 0.27904, + "grad_norm": 1.5767573118209839, + "learning_rate": 8.205688100359956e-06, + "loss": 0.3388, + "step": 3488 + }, + { + "epoch": 0.27912, + "grad_norm": 1.772367238998413, + "learning_rate": 8.20472285098446e-06, + "loss": 0.3531, + "step": 3489 + }, + { + "epoch": 0.2792, + "grad_norm": 1.7118393182754517, + "learning_rate": 8.203757398856875e-06, + "loss": 0.312, + "step": 3490 + }, + { + "epoch": 0.27928, + "grad_norm": 1.6346728801727295, + "learning_rate": 8.202791744038278e-06, + "loss": 0.3631, + "step": 3491 + }, + { + "epoch": 0.27936, + "grad_norm": 1.87623131275177, + "learning_rate": 8.201825886589765e-06, + "loss": 0.5274, + "step": 3492 + }, + { + "epoch": 0.27944, + "grad_norm": 1.3338699340820312, + "learning_rate": 8.20085982657244e-06, + "loss": 0.2704, + "step": 3493 + }, + { + "epoch": 0.27952, + "grad_norm": 1.6640794277191162, + "learning_rate": 8.199893564047425e-06, + "loss": 0.3691, + "step": 3494 + }, + { + "epoch": 0.2796, + "grad_norm": 1.3115794658660889, + "learning_rate": 8.19892709907585e-06, + "loss": 0.3016, + "step": 3495 + }, + { + "epoch": 0.27968, + "grad_norm": 1.4660612344741821, + "learning_rate": 8.197960431718862e-06, + "loss": 0.3437, + "step": 3496 + }, + { + "epoch": 0.27976, + "grad_norm": 1.5010005235671997, + "learning_rate": 8.196993562037618e-06, + "loss": 0.4072, + "step": 3497 + }, + { + "epoch": 0.27984, + "grad_norm": 1.6829041242599487, + "learning_rate": 8.196026490093289e-06, + "loss": 0.4016, + "step": 3498 + }, + { + "epoch": 0.27992, + "grad_norm": 1.4259425401687622, + "learning_rate": 8.195059215947057e-06, + "loss": 0.3122, + "step": 3499 + }, + { + "epoch": 0.28, + "grad_norm": 1.4573216438293457, + "learning_rate": 8.194091739660119e-06, + "loss": 0.3861, + "step": 3500 + }, + { + "epoch": 0.28008, + "grad_norm": 1.3331818580627441, + "learning_rate": 8.193124061293684e-06, + "loss": 0.2762, + "step": 3501 + }, + { + "epoch": 0.28016, + "grad_norm": 1.7647954225540161, + "learning_rate": 8.192156180908974e-06, + "loss": 0.3346, + "step": 3502 + }, + { + "epoch": 0.28024, + "grad_norm": 1.65705144405365, + "learning_rate": 8.191188098567224e-06, + "loss": 0.3713, + "step": 3503 + }, + { + "epoch": 0.28032, + "grad_norm": 1.42042076587677, + "learning_rate": 8.190219814329681e-06, + "loss": 0.3316, + "step": 3504 + }, + { + "epoch": 0.2804, + "grad_norm": 1.3383320569992065, + "learning_rate": 8.189251328257604e-06, + "loss": 0.3086, + "step": 3505 + }, + { + "epoch": 0.28048, + "grad_norm": 1.7345651388168335, + "learning_rate": 8.188282640412267e-06, + "loss": 0.3242, + "step": 3506 + }, + { + "epoch": 0.28056, + "grad_norm": 1.544132113456726, + "learning_rate": 8.187313750854956e-06, + "loss": 0.3949, + "step": 3507 + }, + { + "epoch": 0.28064, + "grad_norm": 1.5500211715698242, + "learning_rate": 8.186344659646966e-06, + "loss": 0.4493, + "step": 3508 + }, + { + "epoch": 0.28072, + "grad_norm": 1.221084475517273, + "learning_rate": 8.185375366849613e-06, + "loss": 0.2511, + "step": 3509 + }, + { + "epoch": 0.2808, + "grad_norm": 1.6258049011230469, + "learning_rate": 8.184405872524219e-06, + "loss": 0.3507, + "step": 3510 + }, + { + "epoch": 0.28088, + "grad_norm": 1.774895191192627, + "learning_rate": 8.18343617673212e-06, + "loss": 0.3715, + "step": 3511 + }, + { + "epoch": 0.28096, + "grad_norm": 1.593440055847168, + "learning_rate": 8.182466279534666e-06, + "loss": 0.3073, + "step": 3512 + }, + { + "epoch": 0.28104, + "grad_norm": 1.700151801109314, + "learning_rate": 8.18149618099322e-06, + "loss": 0.3785, + "step": 3513 + }, + { + "epoch": 0.28112, + "grad_norm": 1.4120454788208008, + "learning_rate": 8.180525881169155e-06, + "loss": 0.326, + "step": 3514 + }, + { + "epoch": 0.2812, + "grad_norm": 1.9900215864181519, + "learning_rate": 8.17955538012386e-06, + "loss": 0.4358, + "step": 3515 + }, + { + "epoch": 0.28128, + "grad_norm": 1.1549736261367798, + "learning_rate": 8.178584677918734e-06, + "loss": 0.2993, + "step": 3516 + }, + { + "epoch": 0.28136, + "grad_norm": 1.4273407459259033, + "learning_rate": 8.177613774615193e-06, + "loss": 0.322, + "step": 3517 + }, + { + "epoch": 0.28144, + "grad_norm": 1.7175918817520142, + "learning_rate": 8.17664267027466e-06, + "loss": 0.3311, + "step": 3518 + }, + { + "epoch": 0.28152, + "grad_norm": 1.2839045524597168, + "learning_rate": 8.175671364958573e-06, + "loss": 0.304, + "step": 3519 + }, + { + "epoch": 0.2816, + "grad_norm": 1.3330825567245483, + "learning_rate": 8.174699858728386e-06, + "loss": 0.3253, + "step": 3520 + }, + { + "epoch": 0.28168, + "grad_norm": 1.5143156051635742, + "learning_rate": 8.173728151645561e-06, + "loss": 0.2915, + "step": 3521 + }, + { + "epoch": 0.28176, + "grad_norm": 1.783462405204773, + "learning_rate": 8.172756243771575e-06, + "loss": 0.4024, + "step": 3522 + }, + { + "epoch": 0.28184, + "grad_norm": 1.8704419136047363, + "learning_rate": 8.171784135167917e-06, + "loss": 0.3364, + "step": 3523 + }, + { + "epoch": 0.28192, + "grad_norm": 1.4959416389465332, + "learning_rate": 8.17081182589609e-06, + "loss": 0.3278, + "step": 3524 + }, + { + "epoch": 0.282, + "grad_norm": 2.0997822284698486, + "learning_rate": 8.169839316017609e-06, + "loss": 0.4619, + "step": 3525 + }, + { + "epoch": 0.28208, + "grad_norm": 1.242011547088623, + "learning_rate": 8.168866605594001e-06, + "loss": 0.2991, + "step": 3526 + }, + { + "epoch": 0.28216, + "grad_norm": 1.39487624168396, + "learning_rate": 8.167893694686805e-06, + "loss": 0.3224, + "step": 3527 + }, + { + "epoch": 0.28224, + "grad_norm": 1.8157869577407837, + "learning_rate": 8.166920583357575e-06, + "loss": 0.3669, + "step": 3528 + }, + { + "epoch": 0.28232, + "grad_norm": 1.2828103303909302, + "learning_rate": 8.165947271667875e-06, + "loss": 0.2376, + "step": 3529 + }, + { + "epoch": 0.2824, + "grad_norm": 1.9789408445358276, + "learning_rate": 8.164973759679287e-06, + "loss": 0.4204, + "step": 3530 + }, + { + "epoch": 0.28248, + "grad_norm": 1.2703884840011597, + "learning_rate": 8.164000047453398e-06, + "loss": 0.2719, + "step": 3531 + }, + { + "epoch": 0.28256, + "grad_norm": 1.5081241130828857, + "learning_rate": 8.163026135051813e-06, + "loss": 0.3015, + "step": 3532 + }, + { + "epoch": 0.28264, + "grad_norm": 1.3452731370925903, + "learning_rate": 8.162052022536148e-06, + "loss": 0.2942, + "step": 3533 + }, + { + "epoch": 0.28272, + "grad_norm": 1.6602774858474731, + "learning_rate": 8.16107770996803e-06, + "loss": 0.4434, + "step": 3534 + }, + { + "epoch": 0.2828, + "grad_norm": 2.201460361480713, + "learning_rate": 8.160103197409104e-06, + "loss": 0.3866, + "step": 3535 + }, + { + "epoch": 0.28288, + "grad_norm": 1.59469735622406, + "learning_rate": 8.159128484921022e-06, + "loss": 0.4352, + "step": 3536 + }, + { + "epoch": 0.28296, + "grad_norm": 1.86858332157135, + "learning_rate": 8.158153572565452e-06, + "loss": 0.4111, + "step": 3537 + }, + { + "epoch": 0.28304, + "grad_norm": 1.9216097593307495, + "learning_rate": 8.157178460404071e-06, + "loss": 0.4455, + "step": 3538 + }, + { + "epoch": 0.28312, + "grad_norm": 1.8076080083847046, + "learning_rate": 8.156203148498575e-06, + "loss": 0.4287, + "step": 3539 + }, + { + "epoch": 0.2832, + "grad_norm": 1.820548415184021, + "learning_rate": 8.155227636910665e-06, + "loss": 0.3744, + "step": 3540 + }, + { + "epoch": 0.28328, + "grad_norm": 1.749031662940979, + "learning_rate": 8.15425192570206e-06, + "loss": 0.4377, + "step": 3541 + }, + { + "epoch": 0.28336, + "grad_norm": 1.2342244386672974, + "learning_rate": 8.15327601493449e-06, + "loss": 0.2596, + "step": 3542 + }, + { + "epoch": 0.28344, + "grad_norm": 1.6412309408187866, + "learning_rate": 8.152299904669698e-06, + "loss": 0.344, + "step": 3543 + }, + { + "epoch": 0.28352, + "grad_norm": 1.5343546867370605, + "learning_rate": 8.151323594969438e-06, + "loss": 0.3667, + "step": 3544 + }, + { + "epoch": 0.2836, + "grad_norm": 1.3292542695999146, + "learning_rate": 8.150347085895479e-06, + "loss": 0.3124, + "step": 3545 + }, + { + "epoch": 0.28368, + "grad_norm": 1.3630868196487427, + "learning_rate": 8.1493703775096e-06, + "loss": 0.3347, + "step": 3546 + }, + { + "epoch": 0.28376, + "grad_norm": 1.5773979425430298, + "learning_rate": 8.148393469873596e-06, + "loss": 0.3451, + "step": 3547 + }, + { + "epoch": 0.28384, + "grad_norm": 1.5990095138549805, + "learning_rate": 8.147416363049271e-06, + "loss": 0.3509, + "step": 3548 + }, + { + "epoch": 0.28392, + "grad_norm": 1.6553959846496582, + "learning_rate": 8.146439057098446e-06, + "loss": 0.3553, + "step": 3549 + }, + { + "epoch": 0.284, + "grad_norm": 1.6501647233963013, + "learning_rate": 8.145461552082948e-06, + "loss": 0.3922, + "step": 3550 + }, + { + "epoch": 0.28408, + "grad_norm": 1.6862366199493408, + "learning_rate": 8.144483848064621e-06, + "loss": 0.5672, + "step": 3551 + }, + { + "epoch": 0.28416, + "grad_norm": 1.5185807943344116, + "learning_rate": 8.143505945105325e-06, + "loss": 0.2501, + "step": 3552 + }, + { + "epoch": 0.28424, + "grad_norm": 1.547339916229248, + "learning_rate": 8.142527843266924e-06, + "loss": 0.3314, + "step": 3553 + }, + { + "epoch": 0.28432, + "grad_norm": 1.4977695941925049, + "learning_rate": 8.141549542611302e-06, + "loss": 0.351, + "step": 3554 + }, + { + "epoch": 0.2844, + "grad_norm": 1.4256455898284912, + "learning_rate": 8.140571043200354e-06, + "loss": 0.313, + "step": 3555 + }, + { + "epoch": 0.28448, + "grad_norm": 1.908646821975708, + "learning_rate": 8.139592345095982e-06, + "loss": 0.4029, + "step": 3556 + }, + { + "epoch": 0.28456, + "grad_norm": 1.7634451389312744, + "learning_rate": 8.138613448360108e-06, + "loss": 0.4724, + "step": 3557 + }, + { + "epoch": 0.28464, + "grad_norm": 1.4443376064300537, + "learning_rate": 8.137634353054664e-06, + "loss": 0.2718, + "step": 3558 + }, + { + "epoch": 0.28472, + "grad_norm": 1.3614927530288696, + "learning_rate": 8.13665505924159e-06, + "loss": 0.2782, + "step": 3559 + }, + { + "epoch": 0.2848, + "grad_norm": 1.582389235496521, + "learning_rate": 8.13567556698285e-06, + "loss": 0.4158, + "step": 3560 + }, + { + "epoch": 0.28488, + "grad_norm": 1.4175881147384644, + "learning_rate": 8.134695876340406e-06, + "loss": 0.3379, + "step": 3561 + }, + { + "epoch": 0.28496, + "grad_norm": 1.4460728168487549, + "learning_rate": 8.133715987376245e-06, + "loss": 0.2862, + "step": 3562 + }, + { + "epoch": 0.28504, + "grad_norm": 1.5810394287109375, + "learning_rate": 8.132735900152357e-06, + "loss": 0.288, + "step": 3563 + }, + { + "epoch": 0.28512, + "grad_norm": 1.2738404273986816, + "learning_rate": 8.131755614730752e-06, + "loss": 0.3385, + "step": 3564 + }, + { + "epoch": 0.2852, + "grad_norm": 1.656475305557251, + "learning_rate": 8.130775131173447e-06, + "loss": 0.3369, + "step": 3565 + }, + { + "epoch": 0.28528, + "grad_norm": 1.2260715961456299, + "learning_rate": 8.129794449542474e-06, + "loss": 0.2934, + "step": 3566 + }, + { + "epoch": 0.28536, + "grad_norm": 1.6491786241531372, + "learning_rate": 8.12881356989988e-06, + "loss": 0.4359, + "step": 3567 + }, + { + "epoch": 0.28544, + "grad_norm": 1.270279049873352, + "learning_rate": 8.127832492307722e-06, + "loss": 0.277, + "step": 3568 + }, + { + "epoch": 0.28552, + "grad_norm": 1.3211328983306885, + "learning_rate": 8.126851216828065e-06, + "loss": 0.2417, + "step": 3569 + }, + { + "epoch": 0.2856, + "grad_norm": 1.2138761281967163, + "learning_rate": 8.125869743522997e-06, + "loss": 0.2868, + "step": 3570 + }, + { + "epoch": 0.28568, + "grad_norm": 1.6802400350570679, + "learning_rate": 8.124888072454607e-06, + "loss": 0.3408, + "step": 3571 + }, + { + "epoch": 0.28576, + "grad_norm": 1.6839826107025146, + "learning_rate": 8.123906203685007e-06, + "loss": 0.3029, + "step": 3572 + }, + { + "epoch": 0.28584, + "grad_norm": 1.53984534740448, + "learning_rate": 8.122924137276311e-06, + "loss": 0.3264, + "step": 3573 + }, + { + "epoch": 0.28592, + "grad_norm": 1.4914592504501343, + "learning_rate": 8.121941873290655e-06, + "loss": 0.3296, + "step": 3574 + }, + { + "epoch": 0.286, + "grad_norm": 1.3114702701568604, + "learning_rate": 8.120959411790184e-06, + "loss": 0.3089, + "step": 3575 + }, + { + "epoch": 0.28608, + "grad_norm": 1.9156404733657837, + "learning_rate": 8.119976752837054e-06, + "loss": 0.3684, + "step": 3576 + }, + { + "epoch": 0.28616, + "grad_norm": 2.184542179107666, + "learning_rate": 8.118993896493433e-06, + "loss": 0.4615, + "step": 3577 + }, + { + "epoch": 0.28624, + "grad_norm": 1.3617297410964966, + "learning_rate": 8.118010842821504e-06, + "loss": 0.3079, + "step": 3578 + }, + { + "epoch": 0.28632, + "grad_norm": 1.7069450616836548, + "learning_rate": 8.117027591883463e-06, + "loss": 0.3893, + "step": 3579 + }, + { + "epoch": 0.2864, + "grad_norm": 1.5273637771606445, + "learning_rate": 8.116044143741517e-06, + "loss": 0.3337, + "step": 3580 + }, + { + "epoch": 0.28648, + "grad_norm": 1.6013727188110352, + "learning_rate": 8.115060498457882e-06, + "loss": 0.3929, + "step": 3581 + }, + { + "epoch": 0.28656, + "grad_norm": 1.5870674848556519, + "learning_rate": 8.114076656094794e-06, + "loss": 0.3977, + "step": 3582 + }, + { + "epoch": 0.28664, + "grad_norm": 1.2662407159805298, + "learning_rate": 8.113092616714494e-06, + "loss": 0.3117, + "step": 3583 + }, + { + "epoch": 0.28672, + "grad_norm": 1.407432198524475, + "learning_rate": 8.112108380379242e-06, + "loss": 0.2418, + "step": 3584 + }, + { + "epoch": 0.2868, + "grad_norm": 1.5326191186904907, + "learning_rate": 8.111123947151305e-06, + "loss": 0.3517, + "step": 3585 + }, + { + "epoch": 0.28688, + "grad_norm": 2.2068679332733154, + "learning_rate": 8.110139317092966e-06, + "loss": 0.5408, + "step": 3586 + }, + { + "epoch": 0.28696, + "grad_norm": 1.5989627838134766, + "learning_rate": 8.10915449026652e-06, + "loss": 0.4202, + "step": 3587 + }, + { + "epoch": 0.28704, + "grad_norm": 1.3212885856628418, + "learning_rate": 8.108169466734271e-06, + "loss": 0.2814, + "step": 3588 + }, + { + "epoch": 0.28712, + "grad_norm": 1.6807013750076294, + "learning_rate": 8.107184246558542e-06, + "loss": 0.3642, + "step": 3589 + }, + { + "epoch": 0.2872, + "grad_norm": 1.4167367219924927, + "learning_rate": 8.10619882980166e-06, + "loss": 0.3045, + "step": 3590 + }, + { + "epoch": 0.28728, + "grad_norm": 1.8939565420150757, + "learning_rate": 8.105213216525974e-06, + "loss": 0.3761, + "step": 3591 + }, + { + "epoch": 0.28736, + "grad_norm": 1.1385818719863892, + "learning_rate": 8.104227406793834e-06, + "loss": 0.2466, + "step": 3592 + }, + { + "epoch": 0.28744, + "grad_norm": 1.3388983011245728, + "learning_rate": 8.103241400667617e-06, + "loss": 0.3011, + "step": 3593 + }, + { + "epoch": 0.28752, + "grad_norm": 1.2075142860412598, + "learning_rate": 8.102255198209696e-06, + "loss": 0.263, + "step": 3594 + }, + { + "epoch": 0.2876, + "grad_norm": 1.6224541664123535, + "learning_rate": 8.101268799482472e-06, + "loss": 0.3047, + "step": 3595 + }, + { + "epoch": 0.28768, + "grad_norm": 1.7167912721633911, + "learning_rate": 8.100282204548347e-06, + "loss": 0.4803, + "step": 3596 + }, + { + "epoch": 0.28776, + "grad_norm": 1.5466722249984741, + "learning_rate": 8.09929541346974e-06, + "loss": 0.4716, + "step": 3597 + }, + { + "epoch": 0.28784, + "grad_norm": 1.9157578945159912, + "learning_rate": 8.098308426309082e-06, + "loss": 0.3921, + "step": 3598 + }, + { + "epoch": 0.28792, + "grad_norm": 1.7497869729995728, + "learning_rate": 8.097321243128817e-06, + "loss": 0.3694, + "step": 3599 + }, + { + "epoch": 0.288, + "grad_norm": 1.7529637813568115, + "learning_rate": 8.096333863991402e-06, + "loss": 0.3621, + "step": 3600 + }, + { + "epoch": 0.28808, + "grad_norm": 1.7688055038452148, + "learning_rate": 8.095346288959303e-06, + "loss": 0.3573, + "step": 3601 + }, + { + "epoch": 0.28816, + "grad_norm": 2.178952217102051, + "learning_rate": 8.094358518095002e-06, + "loss": 0.5357, + "step": 3602 + }, + { + "epoch": 0.28824, + "grad_norm": 2.0295703411102295, + "learning_rate": 8.09337055146099e-06, + "loss": 0.4133, + "step": 3603 + }, + { + "epoch": 0.28832, + "grad_norm": 1.6052531003952026, + "learning_rate": 8.092382389119775e-06, + "loss": 0.2952, + "step": 3604 + }, + { + "epoch": 0.2884, + "grad_norm": 1.9326715469360352, + "learning_rate": 8.091394031133872e-06, + "loss": 0.3914, + "step": 3605 + }, + { + "epoch": 0.28848, + "grad_norm": 1.7421461343765259, + "learning_rate": 8.090405477565814e-06, + "loss": 0.3181, + "step": 3606 + }, + { + "epoch": 0.28856, + "grad_norm": 1.501417875289917, + "learning_rate": 8.089416728478141e-06, + "loss": 0.314, + "step": 3607 + }, + { + "epoch": 0.28864, + "grad_norm": 1.6021887063980103, + "learning_rate": 8.088427783933408e-06, + "loss": 0.3401, + "step": 3608 + }, + { + "epoch": 0.28872, + "grad_norm": 1.6619521379470825, + "learning_rate": 8.087438643994185e-06, + "loss": 0.3583, + "step": 3609 + }, + { + "epoch": 0.2888, + "grad_norm": 1.458363652229309, + "learning_rate": 8.086449308723048e-06, + "loss": 0.3985, + "step": 3610 + }, + { + "epoch": 0.28888, + "grad_norm": 1.698525309562683, + "learning_rate": 8.085459778182591e-06, + "loss": 0.3954, + "step": 3611 + }, + { + "epoch": 0.28896, + "grad_norm": 1.6027504205703735, + "learning_rate": 8.084470052435419e-06, + "loss": 0.3274, + "step": 3612 + }, + { + "epoch": 0.28904, + "grad_norm": 1.7605338096618652, + "learning_rate": 8.083480131544146e-06, + "loss": 0.3009, + "step": 3613 + }, + { + "epoch": 0.28912, + "grad_norm": 1.6151217222213745, + "learning_rate": 8.082490015571403e-06, + "loss": 0.3801, + "step": 3614 + }, + { + "epoch": 0.2892, + "grad_norm": 1.507580041885376, + "learning_rate": 8.08149970457983e-06, + "loss": 0.3096, + "step": 3615 + }, + { + "epoch": 0.28928, + "grad_norm": 1.56270432472229, + "learning_rate": 8.080509198632082e-06, + "loss": 0.2935, + "step": 3616 + }, + { + "epoch": 0.28936, + "grad_norm": 1.5426692962646484, + "learning_rate": 8.079518497790825e-06, + "loss": 0.3052, + "step": 3617 + }, + { + "epoch": 0.28944, + "grad_norm": 1.736161231994629, + "learning_rate": 8.078527602118735e-06, + "loss": 0.4002, + "step": 3618 + }, + { + "epoch": 0.28952, + "grad_norm": 1.4631191492080688, + "learning_rate": 8.077536511678506e-06, + "loss": 0.2789, + "step": 3619 + }, + { + "epoch": 0.2896, + "grad_norm": 1.389417052268982, + "learning_rate": 8.076545226532839e-06, + "loss": 0.2832, + "step": 3620 + }, + { + "epoch": 0.28968, + "grad_norm": 1.6184295415878296, + "learning_rate": 8.07555374674445e-06, + "loss": 0.3232, + "step": 3621 + }, + { + "epoch": 0.28976, + "grad_norm": 1.4686223268508911, + "learning_rate": 8.074562072376067e-06, + "loss": 0.3091, + "step": 3622 + }, + { + "epoch": 0.28984, + "grad_norm": 1.418705940246582, + "learning_rate": 8.073570203490428e-06, + "loss": 0.3433, + "step": 3623 + }, + { + "epoch": 0.28992, + "grad_norm": 2.270691394805908, + "learning_rate": 8.072578140150286e-06, + "loss": 0.5489, + "step": 3624 + }, + { + "epoch": 0.29, + "grad_norm": 2.0474798679351807, + "learning_rate": 8.071585882418406e-06, + "loss": 0.3902, + "step": 3625 + }, + { + "epoch": 0.29008, + "grad_norm": 1.4669115543365479, + "learning_rate": 8.070593430357565e-06, + "loss": 0.3705, + "step": 3626 + }, + { + "epoch": 0.29016, + "grad_norm": 1.3906588554382324, + "learning_rate": 8.069600784030553e-06, + "loss": 0.2618, + "step": 3627 + }, + { + "epoch": 0.29024, + "grad_norm": 2.14176869392395, + "learning_rate": 8.068607943500168e-06, + "loss": 0.6205, + "step": 3628 + }, + { + "epoch": 0.29032, + "grad_norm": 1.6090953350067139, + "learning_rate": 8.067614908829229e-06, + "loss": 0.3588, + "step": 3629 + }, + { + "epoch": 0.2904, + "grad_norm": 1.9249868392944336, + "learning_rate": 8.066621680080557e-06, + "loss": 0.3878, + "step": 3630 + }, + { + "epoch": 0.29048, + "grad_norm": 1.986382007598877, + "learning_rate": 8.065628257316993e-06, + "loss": 0.4614, + "step": 3631 + }, + { + "epoch": 0.29056, + "grad_norm": 1.3387824296951294, + "learning_rate": 8.064634640601386e-06, + "loss": 0.3331, + "step": 3632 + }, + { + "epoch": 0.29064, + "grad_norm": 1.2659355401992798, + "learning_rate": 8.0636408299966e-06, + "loss": 0.311, + "step": 3633 + }, + { + "epoch": 0.29072, + "grad_norm": 1.6108455657958984, + "learning_rate": 8.06264682556551e-06, + "loss": 0.4001, + "step": 3634 + }, + { + "epoch": 0.2908, + "grad_norm": 1.5894638299942017, + "learning_rate": 8.061652627371003e-06, + "loss": 0.291, + "step": 3635 + }, + { + "epoch": 0.29088, + "grad_norm": 2.1120431423187256, + "learning_rate": 8.060658235475978e-06, + "loss": 0.5541, + "step": 3636 + }, + { + "epoch": 0.29096, + "grad_norm": 1.6898366212844849, + "learning_rate": 8.059663649943348e-06, + "loss": 0.3644, + "step": 3637 + }, + { + "epoch": 0.29104, + "grad_norm": 1.6504249572753906, + "learning_rate": 8.058668870836035e-06, + "loss": 0.4423, + "step": 3638 + }, + { + "epoch": 0.29112, + "grad_norm": 1.3013478517532349, + "learning_rate": 8.05767389821698e-06, + "loss": 0.3372, + "step": 3639 + }, + { + "epoch": 0.2912, + "grad_norm": 1.45965576171875, + "learning_rate": 8.056678732149125e-06, + "loss": 0.2493, + "step": 3640 + }, + { + "epoch": 0.29128, + "grad_norm": 1.4169219732284546, + "learning_rate": 8.055683372695437e-06, + "loss": 0.3208, + "step": 3641 + }, + { + "epoch": 0.29136, + "grad_norm": 1.294084072113037, + "learning_rate": 8.054687819918884e-06, + "loss": 0.2904, + "step": 3642 + }, + { + "epoch": 0.29144, + "grad_norm": 1.3888262510299683, + "learning_rate": 8.053692073882456e-06, + "loss": 0.3211, + "step": 3643 + }, + { + "epoch": 0.29152, + "grad_norm": 1.5449509620666504, + "learning_rate": 8.052696134649147e-06, + "loss": 0.2892, + "step": 3644 + }, + { + "epoch": 0.2916, + "grad_norm": 1.308077096939087, + "learning_rate": 8.051700002281967e-06, + "loss": 0.2521, + "step": 3645 + }, + { + "epoch": 0.29168, + "grad_norm": 1.7512633800506592, + "learning_rate": 8.05070367684394e-06, + "loss": 0.3653, + "step": 3646 + }, + { + "epoch": 0.29176, + "grad_norm": 2.0131380558013916, + "learning_rate": 8.0497071583981e-06, + "loss": 0.4186, + "step": 3647 + }, + { + "epoch": 0.29184, + "grad_norm": 2.0268020629882812, + "learning_rate": 8.048710447007491e-06, + "loss": 0.4878, + "step": 3648 + }, + { + "epoch": 0.29192, + "grad_norm": 1.627658486366272, + "learning_rate": 8.047713542735173e-06, + "loss": 0.4156, + "step": 3649 + }, + { + "epoch": 0.292, + "grad_norm": 1.7770787477493286, + "learning_rate": 8.046716445644217e-06, + "loss": 0.3509, + "step": 3650 + }, + { + "epoch": 0.29208, + "grad_norm": 1.3930143117904663, + "learning_rate": 8.045719155797708e-06, + "loss": 0.2563, + "step": 3651 + }, + { + "epoch": 0.29216, + "grad_norm": 1.5242277383804321, + "learning_rate": 8.044721673258736e-06, + "loss": 0.4064, + "step": 3652 + }, + { + "epoch": 0.29224, + "grad_norm": 1.174406886100769, + "learning_rate": 8.043723998090413e-06, + "loss": 0.2569, + "step": 3653 + }, + { + "epoch": 0.29232, + "grad_norm": 1.1028757095336914, + "learning_rate": 8.042726130355856e-06, + "loss": 0.2104, + "step": 3654 + }, + { + "epoch": 0.2924, + "grad_norm": 1.654245138168335, + "learning_rate": 8.041728070118198e-06, + "loss": 0.4263, + "step": 3655 + }, + { + "epoch": 0.29248, + "grad_norm": 1.5495033264160156, + "learning_rate": 8.040729817440584e-06, + "loss": 0.3549, + "step": 3656 + }, + { + "epoch": 0.29256, + "grad_norm": 1.5318132638931274, + "learning_rate": 8.039731372386168e-06, + "loss": 0.3307, + "step": 3657 + }, + { + "epoch": 0.29264, + "grad_norm": 1.3253992795944214, + "learning_rate": 8.038732735018118e-06, + "loss": 0.3224, + "step": 3658 + }, + { + "epoch": 0.29272, + "grad_norm": 1.608940601348877, + "learning_rate": 8.037733905399616e-06, + "loss": 0.3678, + "step": 3659 + }, + { + "epoch": 0.2928, + "grad_norm": 1.910239577293396, + "learning_rate": 8.036734883593852e-06, + "loss": 0.3916, + "step": 3660 + }, + { + "epoch": 0.29288, + "grad_norm": 1.2812817096710205, + "learning_rate": 8.035735669664037e-06, + "loss": 0.296, + "step": 3661 + }, + { + "epoch": 0.29296, + "grad_norm": 1.475081205368042, + "learning_rate": 8.034736263673382e-06, + "loss": 0.3323, + "step": 3662 + }, + { + "epoch": 0.29304, + "grad_norm": 1.463401436805725, + "learning_rate": 8.033736665685119e-06, + "loss": 0.3216, + "step": 3663 + }, + { + "epoch": 0.29312, + "grad_norm": 1.6229404211044312, + "learning_rate": 8.032736875762486e-06, + "loss": 0.3206, + "step": 3664 + }, + { + "epoch": 0.2932, + "grad_norm": 1.4625945091247559, + "learning_rate": 8.031736893968738e-06, + "loss": 0.3063, + "step": 3665 + }, + { + "epoch": 0.29328, + "grad_norm": 1.762115716934204, + "learning_rate": 8.030736720367143e-06, + "loss": 0.3884, + "step": 3666 + }, + { + "epoch": 0.29336, + "grad_norm": 1.6735233068466187, + "learning_rate": 8.029736355020975e-06, + "loss": 0.4038, + "step": 3667 + }, + { + "epoch": 0.29344, + "grad_norm": 1.6050176620483398, + "learning_rate": 8.028735797993528e-06, + "loss": 0.4044, + "step": 3668 + }, + { + "epoch": 0.29352, + "grad_norm": 1.3272747993469238, + "learning_rate": 8.027735049348099e-06, + "loss": 0.2959, + "step": 3669 + }, + { + "epoch": 0.2936, + "grad_norm": 1.7667698860168457, + "learning_rate": 8.026734109148005e-06, + "loss": 0.423, + "step": 3670 + }, + { + "epoch": 0.29368, + "grad_norm": 1.7235229015350342, + "learning_rate": 8.02573297745657e-06, + "loss": 0.3251, + "step": 3671 + }, + { + "epoch": 0.29376, + "grad_norm": 1.4288824796676636, + "learning_rate": 8.024731654337134e-06, + "loss": 0.3099, + "step": 3672 + }, + { + "epoch": 0.29384, + "grad_norm": 1.4889461994171143, + "learning_rate": 8.023730139853049e-06, + "loss": 0.271, + "step": 3673 + }, + { + "epoch": 0.29392, + "grad_norm": 1.5079691410064697, + "learning_rate": 8.022728434067675e-06, + "loss": 0.3027, + "step": 3674 + }, + { + "epoch": 0.294, + "grad_norm": 1.52550208568573, + "learning_rate": 8.021726537044385e-06, + "loss": 0.4014, + "step": 3675 + }, + { + "epoch": 0.29408, + "grad_norm": 1.9863523244857788, + "learning_rate": 8.020724448846569e-06, + "loss": 0.4531, + "step": 3676 + }, + { + "epoch": 0.29416, + "grad_norm": 1.5718297958374023, + "learning_rate": 8.019722169537624e-06, + "loss": 0.4036, + "step": 3677 + }, + { + "epoch": 0.29424, + "grad_norm": 1.8104145526885986, + "learning_rate": 8.018719699180961e-06, + "loss": 0.3399, + "step": 3678 + }, + { + "epoch": 0.29432, + "grad_norm": 1.6687524318695068, + "learning_rate": 8.017717037840005e-06, + "loss": 0.3931, + "step": 3679 + }, + { + "epoch": 0.2944, + "grad_norm": 1.5040487051010132, + "learning_rate": 8.016714185578189e-06, + "loss": 0.3671, + "step": 3680 + }, + { + "epoch": 0.29448, + "grad_norm": 1.4653910398483276, + "learning_rate": 8.01571114245896e-06, + "loss": 0.3147, + "step": 3681 + }, + { + "epoch": 0.29456, + "grad_norm": 1.5103580951690674, + "learning_rate": 8.014707908545776e-06, + "loss": 0.3386, + "step": 3682 + }, + { + "epoch": 0.29464, + "grad_norm": 1.5704234838485718, + "learning_rate": 8.013704483902112e-06, + "loss": 0.2921, + "step": 3683 + }, + { + "epoch": 0.29472, + "grad_norm": 1.7446131706237793, + "learning_rate": 8.012700868591449e-06, + "loss": 0.3766, + "step": 3684 + }, + { + "epoch": 0.2948, + "grad_norm": 1.2436984777450562, + "learning_rate": 8.011697062677282e-06, + "loss": 0.2697, + "step": 3685 + }, + { + "epoch": 0.29488, + "grad_norm": 1.496191382408142, + "learning_rate": 8.01069306622312e-06, + "loss": 0.325, + "step": 3686 + }, + { + "epoch": 0.29496, + "grad_norm": 1.426172137260437, + "learning_rate": 8.00968887929248e-06, + "loss": 0.2959, + "step": 3687 + }, + { + "epoch": 0.29504, + "grad_norm": 1.596839427947998, + "learning_rate": 8.008684501948895e-06, + "loss": 0.3525, + "step": 3688 + }, + { + "epoch": 0.29512, + "grad_norm": 1.2332483530044556, + "learning_rate": 8.00767993425591e-06, + "loss": 0.2837, + "step": 3689 + }, + { + "epoch": 0.2952, + "grad_norm": 1.593542218208313, + "learning_rate": 8.00667517627708e-06, + "loss": 0.3293, + "step": 3690 + }, + { + "epoch": 0.29528, + "grad_norm": 1.504442572593689, + "learning_rate": 8.00567022807597e-06, + "loss": 0.3502, + "step": 3691 + }, + { + "epoch": 0.29536, + "grad_norm": 1.763466477394104, + "learning_rate": 8.004665089716162e-06, + "loss": 0.3728, + "step": 3692 + }, + { + "epoch": 0.29544, + "grad_norm": 1.6120593547821045, + "learning_rate": 8.003659761261248e-06, + "loss": 0.314, + "step": 3693 + }, + { + "epoch": 0.29552, + "grad_norm": 1.3458003997802734, + "learning_rate": 8.00265424277483e-06, + "loss": 0.3373, + "step": 3694 + }, + { + "epoch": 0.2956, + "grad_norm": 1.3163591623306274, + "learning_rate": 8.001648534320526e-06, + "loss": 0.325, + "step": 3695 + }, + { + "epoch": 0.29568, + "grad_norm": 1.616263747215271, + "learning_rate": 8.000642635961963e-06, + "loss": 0.4398, + "step": 3696 + }, + { + "epoch": 0.29576, + "grad_norm": 1.4315654039382935, + "learning_rate": 7.99963654776278e-06, + "loss": 0.3828, + "step": 3697 + }, + { + "epoch": 0.29584, + "grad_norm": 1.517011284828186, + "learning_rate": 7.99863026978663e-06, + "loss": 0.3207, + "step": 3698 + }, + { + "epoch": 0.29592, + "grad_norm": 1.4280073642730713, + "learning_rate": 7.997623802097176e-06, + "loss": 0.3207, + "step": 3699 + }, + { + "epoch": 0.296, + "grad_norm": 1.3090413808822632, + "learning_rate": 7.996617144758094e-06, + "loss": 0.3191, + "step": 3700 + }, + { + "epoch": 0.29608, + "grad_norm": 1.6390103101730347, + "learning_rate": 7.995610297833072e-06, + "loss": 0.3944, + "step": 3701 + }, + { + "epoch": 0.29616, + "grad_norm": 1.4760853052139282, + "learning_rate": 7.994603261385809e-06, + "loss": 0.3428, + "step": 3702 + }, + { + "epoch": 0.29624, + "grad_norm": 1.1745274066925049, + "learning_rate": 7.99359603548002e-06, + "loss": 0.2517, + "step": 3703 + }, + { + "epoch": 0.29632, + "grad_norm": 1.4406036138534546, + "learning_rate": 7.992588620179424e-06, + "loss": 0.3696, + "step": 3704 + }, + { + "epoch": 0.2964, + "grad_norm": 1.278671383857727, + "learning_rate": 7.99158101554776e-06, + "loss": 0.249, + "step": 3705 + }, + { + "epoch": 0.29648, + "grad_norm": 1.420791506767273, + "learning_rate": 7.990573221648775e-06, + "loss": 0.2905, + "step": 3706 + }, + { + "epoch": 0.29656, + "grad_norm": 1.7047336101531982, + "learning_rate": 7.989565238546228e-06, + "loss": 0.4367, + "step": 3707 + }, + { + "epoch": 0.29664, + "grad_norm": 2.036435127258301, + "learning_rate": 7.988557066303892e-06, + "loss": 0.3923, + "step": 3708 + }, + { + "epoch": 0.29672, + "grad_norm": 1.4706825017929077, + "learning_rate": 7.987548704985553e-06, + "loss": 0.429, + "step": 3709 + }, + { + "epoch": 0.2968, + "grad_norm": 1.6444703340530396, + "learning_rate": 7.986540154655e-06, + "loss": 0.3174, + "step": 3710 + }, + { + "epoch": 0.29688, + "grad_norm": 1.5745984315872192, + "learning_rate": 7.985531415376046e-06, + "loss": 0.2996, + "step": 3711 + }, + { + "epoch": 0.29696, + "grad_norm": 1.4535049200057983, + "learning_rate": 7.984522487212509e-06, + "loss": 0.3336, + "step": 3712 + }, + { + "epoch": 0.29704, + "grad_norm": 1.7702298164367676, + "learning_rate": 7.98351337022822e-06, + "loss": 0.388, + "step": 3713 + }, + { + "epoch": 0.29712, + "grad_norm": 1.5240532159805298, + "learning_rate": 7.982504064487022e-06, + "loss": 0.3296, + "step": 3714 + }, + { + "epoch": 0.2972, + "grad_norm": 1.6705554723739624, + "learning_rate": 7.981494570052775e-06, + "loss": 0.3306, + "step": 3715 + }, + { + "epoch": 0.29728, + "grad_norm": 1.8875733613967896, + "learning_rate": 7.980484886989338e-06, + "loss": 0.3438, + "step": 3716 + }, + { + "epoch": 0.29736, + "grad_norm": 1.5699726343154907, + "learning_rate": 7.979475015360599e-06, + "loss": 0.3333, + "step": 3717 + }, + { + "epoch": 0.29744, + "grad_norm": 1.580299973487854, + "learning_rate": 7.978464955230442e-06, + "loss": 0.3609, + "step": 3718 + }, + { + "epoch": 0.29752, + "grad_norm": 1.7409504652023315, + "learning_rate": 7.977454706662775e-06, + "loss": 0.3506, + "step": 3719 + }, + { + "epoch": 0.2976, + "grad_norm": 2.2885334491729736, + "learning_rate": 7.97644426972151e-06, + "loss": 0.4091, + "step": 3720 + }, + { + "epoch": 0.29768, + "grad_norm": 1.6000932455062866, + "learning_rate": 7.975433644470576e-06, + "loss": 0.3848, + "step": 3721 + }, + { + "epoch": 0.29776, + "grad_norm": 1.8004785776138306, + "learning_rate": 7.974422830973912e-06, + "loss": 0.3568, + "step": 3722 + }, + { + "epoch": 0.29784, + "grad_norm": 1.5973020792007446, + "learning_rate": 7.973411829295466e-06, + "loss": 0.3497, + "step": 3723 + }, + { + "epoch": 0.29792, + "grad_norm": 1.4873243570327759, + "learning_rate": 7.972400639499204e-06, + "loss": 0.3358, + "step": 3724 + }, + { + "epoch": 0.298, + "grad_norm": 1.824233055114746, + "learning_rate": 7.971389261649099e-06, + "loss": 0.4382, + "step": 3725 + }, + { + "epoch": 0.29808, + "grad_norm": 1.824907660484314, + "learning_rate": 7.970377695809138e-06, + "loss": 0.4416, + "step": 3726 + }, + { + "epoch": 0.29816, + "grad_norm": 1.4309464693069458, + "learning_rate": 7.96936594204332e-06, + "loss": 0.2923, + "step": 3727 + }, + { + "epoch": 0.29824, + "grad_norm": 1.8320714235305786, + "learning_rate": 7.968354000415652e-06, + "loss": 0.4268, + "step": 3728 + }, + { + "epoch": 0.29832, + "grad_norm": 1.6846635341644287, + "learning_rate": 7.967341870990159e-06, + "loss": 0.297, + "step": 3729 + }, + { + "epoch": 0.2984, + "grad_norm": 1.6068602800369263, + "learning_rate": 7.966329553830876e-06, + "loss": 0.3604, + "step": 3730 + }, + { + "epoch": 0.29848, + "grad_norm": 1.7175391912460327, + "learning_rate": 7.965317049001847e-06, + "loss": 0.394, + "step": 3731 + }, + { + "epoch": 0.29856, + "grad_norm": 1.2361955642700195, + "learning_rate": 7.96430435656713e-06, + "loss": 0.2636, + "step": 3732 + }, + { + "epoch": 0.29864, + "grad_norm": 1.6340537071228027, + "learning_rate": 7.963291476590795e-06, + "loss": 0.3437, + "step": 3733 + }, + { + "epoch": 0.29872, + "grad_norm": 1.4597103595733643, + "learning_rate": 7.962278409136924e-06, + "loss": 0.307, + "step": 3734 + }, + { + "epoch": 0.2988, + "grad_norm": 1.6404805183410645, + "learning_rate": 7.961265154269608e-06, + "loss": 0.5112, + "step": 3735 + }, + { + "epoch": 0.29888, + "grad_norm": 1.6717686653137207, + "learning_rate": 7.960251712052955e-06, + "loss": 0.3567, + "step": 3736 + }, + { + "epoch": 0.29896, + "grad_norm": 1.874589204788208, + "learning_rate": 7.959238082551081e-06, + "loss": 0.4107, + "step": 3737 + }, + { + "epoch": 0.29904, + "grad_norm": 1.2089720964431763, + "learning_rate": 7.958224265828118e-06, + "loss": 0.3063, + "step": 3738 + }, + { + "epoch": 0.29912, + "grad_norm": 1.2900773286819458, + "learning_rate": 7.957210261948201e-06, + "loss": 0.2518, + "step": 3739 + }, + { + "epoch": 0.2992, + "grad_norm": 1.737890362739563, + "learning_rate": 7.956196070975485e-06, + "loss": 0.3588, + "step": 3740 + }, + { + "epoch": 0.29928, + "grad_norm": 1.681058406829834, + "learning_rate": 7.955181692974138e-06, + "loss": 0.3226, + "step": 3741 + }, + { + "epoch": 0.29936, + "grad_norm": 1.6634056568145752, + "learning_rate": 7.954167128008332e-06, + "loss": 0.3443, + "step": 3742 + }, + { + "epoch": 0.29944, + "grad_norm": 1.8775358200073242, + "learning_rate": 7.953152376142255e-06, + "loss": 0.3255, + "step": 3743 + }, + { + "epoch": 0.29952, + "grad_norm": 1.728934645652771, + "learning_rate": 7.95213743744011e-06, + "loss": 0.3183, + "step": 3744 + }, + { + "epoch": 0.2996, + "grad_norm": 1.514907717704773, + "learning_rate": 7.951122311966108e-06, + "loss": 0.3914, + "step": 3745 + }, + { + "epoch": 0.29968, + "grad_norm": 1.5943530797958374, + "learning_rate": 7.95010699978447e-06, + "loss": 0.349, + "step": 3746 + }, + { + "epoch": 0.29976, + "grad_norm": 1.815433382987976, + "learning_rate": 7.949091500959434e-06, + "loss": 0.4143, + "step": 3747 + }, + { + "epoch": 0.29984, + "grad_norm": 1.7097547054290771, + "learning_rate": 7.948075815555246e-06, + "loss": 0.3884, + "step": 3748 + }, + { + "epoch": 0.29992, + "grad_norm": 1.619554877281189, + "learning_rate": 7.947059943636166e-06, + "loss": 0.3996, + "step": 3749 + }, + { + "epoch": 0.3, + "grad_norm": 1.394123911857605, + "learning_rate": 7.946043885266465e-06, + "loss": 0.2721, + "step": 3750 + }, + { + "epoch": 0.30008, + "grad_norm": 1.3874683380126953, + "learning_rate": 7.945027640510423e-06, + "loss": 0.3412, + "step": 3751 + }, + { + "epoch": 0.30016, + "grad_norm": 1.5768663883209229, + "learning_rate": 7.944011209432336e-06, + "loss": 0.3391, + "step": 3752 + }, + { + "epoch": 0.30024, + "grad_norm": 1.4615979194641113, + "learning_rate": 7.942994592096513e-06, + "loss": 0.2966, + "step": 3753 + }, + { + "epoch": 0.30032, + "grad_norm": 1.3654316663742065, + "learning_rate": 7.941977788567267e-06, + "loss": 0.3807, + "step": 3754 + }, + { + "epoch": 0.3004, + "grad_norm": 1.8147246837615967, + "learning_rate": 7.940960798908933e-06, + "loss": 0.3741, + "step": 3755 + }, + { + "epoch": 0.30048, + "grad_norm": 1.5056862831115723, + "learning_rate": 7.939943623185847e-06, + "loss": 0.3309, + "step": 3756 + }, + { + "epoch": 0.30056, + "grad_norm": 1.4294978380203247, + "learning_rate": 7.938926261462366e-06, + "loss": 0.3607, + "step": 3757 + }, + { + "epoch": 0.30064, + "grad_norm": 1.4605019092559814, + "learning_rate": 7.937908713802855e-06, + "loss": 0.3732, + "step": 3758 + }, + { + "epoch": 0.30072, + "grad_norm": 1.7275772094726562, + "learning_rate": 7.936890980271688e-06, + "loss": 0.3738, + "step": 3759 + }, + { + "epoch": 0.3008, + "grad_norm": 1.480759620666504, + "learning_rate": 7.935873060933257e-06, + "loss": 0.3517, + "step": 3760 + }, + { + "epoch": 0.30088, + "grad_norm": 1.655974268913269, + "learning_rate": 7.934854955851961e-06, + "loss": 0.3499, + "step": 3761 + }, + { + "epoch": 0.30096, + "grad_norm": 1.35233736038208, + "learning_rate": 7.933836665092212e-06, + "loss": 0.3247, + "step": 3762 + }, + { + "epoch": 0.30104, + "grad_norm": 1.2412519454956055, + "learning_rate": 7.932818188718433e-06, + "loss": 0.2889, + "step": 3763 + }, + { + "epoch": 0.30112, + "grad_norm": 1.5485812425613403, + "learning_rate": 7.931799526795062e-06, + "loss": 0.3616, + "step": 3764 + }, + { + "epoch": 0.3012, + "grad_norm": 1.448938250541687, + "learning_rate": 7.930780679386542e-06, + "loss": 0.3128, + "step": 3765 + }, + { + "epoch": 0.30128, + "grad_norm": 1.4666680097579956, + "learning_rate": 7.929761646557337e-06, + "loss": 0.3414, + "step": 3766 + }, + { + "epoch": 0.30136, + "grad_norm": 1.6032980680465698, + "learning_rate": 7.928742428371916e-06, + "loss": 0.3667, + "step": 3767 + }, + { + "epoch": 0.30144, + "grad_norm": 1.5190998315811157, + "learning_rate": 7.92772302489476e-06, + "loss": 0.3274, + "step": 3768 + }, + { + "epoch": 0.30152, + "grad_norm": 1.6384193897247314, + "learning_rate": 7.926703436190363e-06, + "loss": 0.371, + "step": 3769 + }, + { + "epoch": 0.3016, + "grad_norm": 1.7404835224151611, + "learning_rate": 7.925683662323235e-06, + "loss": 0.3464, + "step": 3770 + }, + { + "epoch": 0.30168, + "grad_norm": 1.719647765159607, + "learning_rate": 7.924663703357892e-06, + "loss": 0.371, + "step": 3771 + }, + { + "epoch": 0.30176, + "grad_norm": 1.545730471611023, + "learning_rate": 7.923643559358858e-06, + "loss": 0.3684, + "step": 3772 + }, + { + "epoch": 0.30184, + "grad_norm": 1.717699646949768, + "learning_rate": 7.922623230390682e-06, + "loss": 0.3934, + "step": 3773 + }, + { + "epoch": 0.30192, + "grad_norm": 1.8281782865524292, + "learning_rate": 7.921602716517914e-06, + "loss": 0.508, + "step": 3774 + }, + { + "epoch": 0.302, + "grad_norm": 2.0022242069244385, + "learning_rate": 7.920582017805114e-06, + "loss": 0.36, + "step": 3775 + }, + { + "epoch": 0.30208, + "grad_norm": 1.4404866695404053, + "learning_rate": 7.919561134316865e-06, + "loss": 0.2932, + "step": 3776 + }, + { + "epoch": 0.30216, + "grad_norm": 1.687625527381897, + "learning_rate": 7.918540066117752e-06, + "loss": 0.3467, + "step": 3777 + }, + { + "epoch": 0.30224, + "grad_norm": 1.2726261615753174, + "learning_rate": 7.917518813272373e-06, + "loss": 0.3044, + "step": 3778 + }, + { + "epoch": 0.30232, + "grad_norm": 2.043890953063965, + "learning_rate": 7.916497375845342e-06, + "loss": 0.3605, + "step": 3779 + }, + { + "epoch": 0.3024, + "grad_norm": 1.251383900642395, + "learning_rate": 7.91547575390128e-06, + "loss": 0.3175, + "step": 3780 + }, + { + "epoch": 0.30248, + "grad_norm": 1.544699788093567, + "learning_rate": 7.914453947504822e-06, + "loss": 0.3783, + "step": 3781 + }, + { + "epoch": 0.30256, + "grad_norm": 1.4141426086425781, + "learning_rate": 7.913431956720615e-06, + "loss": 0.26, + "step": 3782 + }, + { + "epoch": 0.30264, + "grad_norm": 1.7382965087890625, + "learning_rate": 7.912409781613317e-06, + "loss": 0.3664, + "step": 3783 + }, + { + "epoch": 0.30272, + "grad_norm": 1.1553996801376343, + "learning_rate": 7.911387422247596e-06, + "loss": 0.2621, + "step": 3784 + }, + { + "epoch": 0.3028, + "grad_norm": 1.534834623336792, + "learning_rate": 7.910364878688135e-06, + "loss": 0.3252, + "step": 3785 + }, + { + "epoch": 0.30288, + "grad_norm": 1.4503859281539917, + "learning_rate": 7.909342150999626e-06, + "loss": 0.3371, + "step": 3786 + }, + { + "epoch": 0.30296, + "grad_norm": 1.7136962413787842, + "learning_rate": 7.908319239246774e-06, + "loss": 0.3998, + "step": 3787 + }, + { + "epoch": 0.30304, + "grad_norm": 1.6705702543258667, + "learning_rate": 7.907296143494293e-06, + "loss": 0.4058, + "step": 3788 + }, + { + "epoch": 0.30312, + "grad_norm": 1.4927536249160767, + "learning_rate": 7.906272863806916e-06, + "loss": 0.3728, + "step": 3789 + }, + { + "epoch": 0.3032, + "grad_norm": 1.1272435188293457, + "learning_rate": 7.905249400249377e-06, + "loss": 0.2479, + "step": 3790 + }, + { + "epoch": 0.30328, + "grad_norm": 1.3360072374343872, + "learning_rate": 7.904225752886433e-06, + "loss": 0.269, + "step": 3791 + }, + { + "epoch": 0.30336, + "grad_norm": 1.445589542388916, + "learning_rate": 7.90320192178284e-06, + "loss": 0.311, + "step": 3792 + }, + { + "epoch": 0.30344, + "grad_norm": 2.228759527206421, + "learning_rate": 7.902177907003375e-06, + "loss": 0.4801, + "step": 3793 + }, + { + "epoch": 0.30352, + "grad_norm": 1.5473653078079224, + "learning_rate": 7.901153708612825e-06, + "loss": 0.2992, + "step": 3794 + }, + { + "epoch": 0.3036, + "grad_norm": 1.85487699508667, + "learning_rate": 7.90012932667599e-06, + "loss": 0.3764, + "step": 3795 + }, + { + "epoch": 0.30368, + "grad_norm": 1.6702008247375488, + "learning_rate": 7.899104761257673e-06, + "loss": 0.3616, + "step": 3796 + }, + { + "epoch": 0.30376, + "grad_norm": 1.6418166160583496, + "learning_rate": 7.8980800124227e-06, + "loss": 0.3333, + "step": 3797 + }, + { + "epoch": 0.30384, + "grad_norm": 1.5101503133773804, + "learning_rate": 7.8970550802359e-06, + "loss": 0.3233, + "step": 3798 + }, + { + "epoch": 0.30392, + "grad_norm": 1.378827452659607, + "learning_rate": 7.896029964762119e-06, + "loss": 0.2467, + "step": 3799 + }, + { + "epoch": 0.304, + "grad_norm": 1.2998595237731934, + "learning_rate": 7.895004666066214e-06, + "loss": 0.2874, + "step": 3800 + }, + { + "epoch": 0.30408, + "grad_norm": 1.455960750579834, + "learning_rate": 7.893979184213049e-06, + "loss": 0.2777, + "step": 3801 + }, + { + "epoch": 0.30416, + "grad_norm": 1.5600316524505615, + "learning_rate": 7.892953519267506e-06, + "loss": 0.3268, + "step": 3802 + }, + { + "epoch": 0.30424, + "grad_norm": 1.37201726436615, + "learning_rate": 7.891927671294472e-06, + "loss": 0.3666, + "step": 3803 + }, + { + "epoch": 0.30432, + "grad_norm": 1.4936875104904175, + "learning_rate": 7.890901640358852e-06, + "loss": 0.2798, + "step": 3804 + }, + { + "epoch": 0.3044, + "grad_norm": 1.624596357345581, + "learning_rate": 7.889875426525557e-06, + "loss": 0.4087, + "step": 3805 + }, + { + "epoch": 0.30448, + "grad_norm": 2.078230381011963, + "learning_rate": 7.888849029859513e-06, + "loss": 0.4298, + "step": 3806 + }, + { + "epoch": 0.30456, + "grad_norm": 1.4350990056991577, + "learning_rate": 7.887822450425658e-06, + "loss": 0.3688, + "step": 3807 + }, + { + "epoch": 0.30464, + "grad_norm": 1.392849087715149, + "learning_rate": 7.886795688288937e-06, + "loss": 0.3813, + "step": 3808 + }, + { + "epoch": 0.30472, + "grad_norm": 1.645775556564331, + "learning_rate": 7.885768743514316e-06, + "loss": 0.3152, + "step": 3809 + }, + { + "epoch": 0.3048, + "grad_norm": 1.6383919715881348, + "learning_rate": 7.88474161616676e-06, + "loss": 0.3558, + "step": 3810 + }, + { + "epoch": 0.30488, + "grad_norm": 2.151557445526123, + "learning_rate": 7.883714306311255e-06, + "loss": 0.4004, + "step": 3811 + }, + { + "epoch": 0.30496, + "grad_norm": 1.6432582139968872, + "learning_rate": 7.882686814012792e-06, + "loss": 0.4162, + "step": 3812 + }, + { + "epoch": 0.30504, + "grad_norm": 1.6074105501174927, + "learning_rate": 7.881659139336383e-06, + "loss": 0.3813, + "step": 3813 + }, + { + "epoch": 0.30512, + "grad_norm": 1.7226243019104004, + "learning_rate": 7.880631282347042e-06, + "loss": 0.446, + "step": 3814 + }, + { + "epoch": 0.3052, + "grad_norm": 1.3685539960861206, + "learning_rate": 7.879603243109799e-06, + "loss": 0.2664, + "step": 3815 + }, + { + "epoch": 0.30528, + "grad_norm": 2.002601146697998, + "learning_rate": 7.878575021689693e-06, + "loss": 0.402, + "step": 3816 + }, + { + "epoch": 0.30536, + "grad_norm": 1.7490203380584717, + "learning_rate": 7.877546618151776e-06, + "loss": 0.3748, + "step": 3817 + }, + { + "epoch": 0.30544, + "grad_norm": 1.197702169418335, + "learning_rate": 7.876518032561113e-06, + "loss": 0.2688, + "step": 3818 + }, + { + "epoch": 0.30552, + "grad_norm": 1.7372758388519287, + "learning_rate": 7.875489264982781e-06, + "loss": 0.377, + "step": 3819 + }, + { + "epoch": 0.3056, + "grad_norm": 1.6473854780197144, + "learning_rate": 7.874460315481863e-06, + "loss": 0.3239, + "step": 3820 + }, + { + "epoch": 0.30568, + "grad_norm": 1.2986738681793213, + "learning_rate": 7.87343118412346e-06, + "loss": 0.3873, + "step": 3821 + }, + { + "epoch": 0.30576, + "grad_norm": 2.0069668292999268, + "learning_rate": 7.872401870972679e-06, + "loss": 0.3118, + "step": 3822 + }, + { + "epoch": 0.30584, + "grad_norm": 1.1595829725265503, + "learning_rate": 7.871372376094642e-06, + "loss": 0.3185, + "step": 3823 + }, + { + "epoch": 0.30592, + "grad_norm": 1.917394995689392, + "learning_rate": 7.870342699554484e-06, + "loss": 0.4492, + "step": 3824 + }, + { + "epoch": 0.306, + "grad_norm": 1.6679039001464844, + "learning_rate": 7.869312841417346e-06, + "loss": 0.4018, + "step": 3825 + }, + { + "epoch": 0.30608, + "grad_norm": 1.2022053003311157, + "learning_rate": 7.868282801748389e-06, + "loss": 0.2811, + "step": 3826 + }, + { + "epoch": 0.30616, + "grad_norm": 1.3668828010559082, + "learning_rate": 7.867252580612775e-06, + "loss": 0.2921, + "step": 3827 + }, + { + "epoch": 0.30624, + "grad_norm": 1.7087875604629517, + "learning_rate": 7.866222178075681e-06, + "loss": 0.3836, + "step": 3828 + }, + { + "epoch": 0.30632, + "grad_norm": 1.1265833377838135, + "learning_rate": 7.865191594202302e-06, + "loss": 0.2809, + "step": 3829 + }, + { + "epoch": 0.3064, + "grad_norm": 1.4156330823898315, + "learning_rate": 7.864160829057838e-06, + "loss": 0.3044, + "step": 3830 + }, + { + "epoch": 0.30648, + "grad_norm": 1.5269126892089844, + "learning_rate": 7.863129882707504e-06, + "loss": 0.3253, + "step": 3831 + }, + { + "epoch": 0.30656, + "grad_norm": 1.6441112756729126, + "learning_rate": 7.862098755216519e-06, + "loss": 0.3304, + "step": 3832 + }, + { + "epoch": 0.30664, + "grad_norm": 1.804513931274414, + "learning_rate": 7.861067446650124e-06, + "loss": 0.3199, + "step": 3833 + }, + { + "epoch": 0.30672, + "grad_norm": 1.4178650379180908, + "learning_rate": 7.860035957073566e-06, + "loss": 0.326, + "step": 3834 + }, + { + "epoch": 0.3068, + "grad_norm": 1.4408081769943237, + "learning_rate": 7.8590042865521e-06, + "loss": 0.3487, + "step": 3835 + }, + { + "epoch": 0.30688, + "grad_norm": 1.7407159805297852, + "learning_rate": 7.857972435151e-06, + "loss": 0.3687, + "step": 3836 + }, + { + "epoch": 0.30696, + "grad_norm": 1.395830512046814, + "learning_rate": 7.856940402935547e-06, + "loss": 0.2618, + "step": 3837 + }, + { + "epoch": 0.30704, + "grad_norm": 2.0533602237701416, + "learning_rate": 7.855908189971036e-06, + "loss": 0.3489, + "step": 3838 + }, + { + "epoch": 0.30712, + "grad_norm": 1.6558175086975098, + "learning_rate": 7.854875796322767e-06, + "loss": 0.3476, + "step": 3839 + }, + { + "epoch": 0.3072, + "grad_norm": 1.524119257926941, + "learning_rate": 7.853843222056059e-06, + "loss": 0.4397, + "step": 3840 + }, + { + "epoch": 0.30728, + "grad_norm": 1.5205790996551514, + "learning_rate": 7.85281046723624e-06, + "loss": 0.3255, + "step": 3841 + }, + { + "epoch": 0.30736, + "grad_norm": 1.547203779220581, + "learning_rate": 7.851777531928648e-06, + "loss": 0.369, + "step": 3842 + }, + { + "epoch": 0.30744, + "grad_norm": 1.3715342283248901, + "learning_rate": 7.850744416198635e-06, + "loss": 0.2742, + "step": 3843 + }, + { + "epoch": 0.30752, + "grad_norm": 1.7399414777755737, + "learning_rate": 7.849711120111561e-06, + "loss": 0.3737, + "step": 3844 + }, + { + "epoch": 0.3076, + "grad_norm": 1.67813241481781, + "learning_rate": 7.848677643732799e-06, + "loss": 0.3005, + "step": 3845 + }, + { + "epoch": 0.30768, + "grad_norm": 1.495720386505127, + "learning_rate": 7.847643987127733e-06, + "loss": 0.3383, + "step": 3846 + }, + { + "epoch": 0.30776, + "grad_norm": 1.943218469619751, + "learning_rate": 7.846610150361761e-06, + "loss": 0.4522, + "step": 3847 + }, + { + "epoch": 0.30784, + "grad_norm": 2.0549914836883545, + "learning_rate": 7.84557613350029e-06, + "loss": 0.4285, + "step": 3848 + }, + { + "epoch": 0.30792, + "grad_norm": 2.2861602306365967, + "learning_rate": 7.844541936608738e-06, + "loss": 0.5222, + "step": 3849 + }, + { + "epoch": 0.308, + "grad_norm": 1.814911961555481, + "learning_rate": 7.843507559752537e-06, + "loss": 0.4032, + "step": 3850 + }, + { + "epoch": 0.30808, + "grad_norm": 1.2270601987838745, + "learning_rate": 7.842473002997127e-06, + "loss": 0.2968, + "step": 3851 + }, + { + "epoch": 0.30816, + "grad_norm": 1.692413091659546, + "learning_rate": 7.841438266407959e-06, + "loss": 0.4398, + "step": 3852 + }, + { + "epoch": 0.30824, + "grad_norm": 1.7804385423660278, + "learning_rate": 7.8404033500505e-06, + "loss": 0.4585, + "step": 3853 + }, + { + "epoch": 0.30832, + "grad_norm": 1.3115049600601196, + "learning_rate": 7.839368253990224e-06, + "loss": 0.2864, + "step": 3854 + }, + { + "epoch": 0.3084, + "grad_norm": 1.490980625152588, + "learning_rate": 7.838332978292622e-06, + "loss": 0.3292, + "step": 3855 + }, + { + "epoch": 0.30848, + "grad_norm": 1.78223717212677, + "learning_rate": 7.837297523023189e-06, + "loss": 0.4908, + "step": 3856 + }, + { + "epoch": 0.30856, + "grad_norm": 1.4501593112945557, + "learning_rate": 7.836261888247434e-06, + "loss": 0.3176, + "step": 3857 + }, + { + "epoch": 0.30864, + "grad_norm": 1.7654818296432495, + "learning_rate": 7.83522607403088e-06, + "loss": 0.3421, + "step": 3858 + }, + { + "epoch": 0.30872, + "grad_norm": 1.4273486137390137, + "learning_rate": 7.834190080439058e-06, + "loss": 0.3251, + "step": 3859 + }, + { + "epoch": 0.3088, + "grad_norm": 1.3740286827087402, + "learning_rate": 7.833153907537515e-06, + "loss": 0.3186, + "step": 3860 + }, + { + "epoch": 0.30888, + "grad_norm": 1.4216959476470947, + "learning_rate": 7.832117555391803e-06, + "loss": 0.3298, + "step": 3861 + }, + { + "epoch": 0.30896, + "grad_norm": 1.8324718475341797, + "learning_rate": 7.831081024067489e-06, + "loss": 0.4795, + "step": 3862 + }, + { + "epoch": 0.30904, + "grad_norm": 1.479018211364746, + "learning_rate": 7.83004431363015e-06, + "loss": 0.32, + "step": 3863 + }, + { + "epoch": 0.30912, + "grad_norm": 1.3187556266784668, + "learning_rate": 7.829007424145379e-06, + "loss": 0.2529, + "step": 3864 + }, + { + "epoch": 0.3092, + "grad_norm": 1.7161333560943604, + "learning_rate": 7.827970355678774e-06, + "loss": 0.3463, + "step": 3865 + }, + { + "epoch": 0.30928, + "grad_norm": 1.6725596189498901, + "learning_rate": 7.826933108295947e-06, + "loss": 0.4048, + "step": 3866 + }, + { + "epoch": 0.30936, + "grad_norm": 1.5978217124938965, + "learning_rate": 7.82589568206252e-06, + "loss": 0.333, + "step": 3867 + }, + { + "epoch": 0.30944, + "grad_norm": 1.6118669509887695, + "learning_rate": 7.82485807704413e-06, + "loss": 0.4502, + "step": 3868 + }, + { + "epoch": 0.30952, + "grad_norm": 1.6804618835449219, + "learning_rate": 7.823820293306421e-06, + "loss": 0.2939, + "step": 3869 + }, + { + "epoch": 0.3096, + "grad_norm": 1.3810745477676392, + "learning_rate": 7.82278233091505e-06, + "loss": 0.2853, + "step": 3870 + }, + { + "epoch": 0.30968, + "grad_norm": 1.4669585227966309, + "learning_rate": 7.821744189935687e-06, + "loss": 0.3352, + "step": 3871 + }, + { + "epoch": 0.30976, + "grad_norm": 1.5521767139434814, + "learning_rate": 7.82070587043401e-06, + "loss": 0.3828, + "step": 3872 + }, + { + "epoch": 0.30984, + "grad_norm": 2.0385186672210693, + "learning_rate": 7.81966737247571e-06, + "loss": 0.4688, + "step": 3873 + }, + { + "epoch": 0.30992, + "grad_norm": 1.3120101690292358, + "learning_rate": 7.81862869612649e-06, + "loss": 0.2976, + "step": 3874 + }, + { + "epoch": 0.31, + "grad_norm": 1.5607187747955322, + "learning_rate": 7.817589841452065e-06, + "loss": 0.2853, + "step": 3875 + }, + { + "epoch": 0.31008, + "grad_norm": 1.4162628650665283, + "learning_rate": 7.816550808518158e-06, + "loss": 0.3474, + "step": 3876 + }, + { + "epoch": 0.31016, + "grad_norm": 1.9220107793807983, + "learning_rate": 7.815511597390506e-06, + "loss": 0.3522, + "step": 3877 + }, + { + "epoch": 0.31024, + "grad_norm": 1.6877249479293823, + "learning_rate": 7.814472208134855e-06, + "loss": 0.331, + "step": 3878 + }, + { + "epoch": 0.31032, + "grad_norm": 1.6581977605819702, + "learning_rate": 7.813432640816965e-06, + "loss": 0.4292, + "step": 3879 + }, + { + "epoch": 0.3104, + "grad_norm": 1.860425591468811, + "learning_rate": 7.812392895502605e-06, + "loss": 0.4297, + "step": 3880 + }, + { + "epoch": 0.31048, + "grad_norm": 1.4564399719238281, + "learning_rate": 7.811352972257558e-06, + "loss": 0.3113, + "step": 3881 + }, + { + "epoch": 0.31056, + "grad_norm": 1.661933183670044, + "learning_rate": 7.810312871147614e-06, + "loss": 0.381, + "step": 3882 + }, + { + "epoch": 0.31064, + "grad_norm": 1.430387258529663, + "learning_rate": 7.809272592238579e-06, + "loss": 0.2592, + "step": 3883 + }, + { + "epoch": 0.31072, + "grad_norm": 1.3695735931396484, + "learning_rate": 7.808232135596267e-06, + "loss": 0.3324, + "step": 3884 + }, + { + "epoch": 0.3108, + "grad_norm": 1.8727529048919678, + "learning_rate": 7.807191501286505e-06, + "loss": 0.4287, + "step": 3885 + }, + { + "epoch": 0.31088, + "grad_norm": 1.6684083938598633, + "learning_rate": 7.806150689375126e-06, + "loss": 0.3635, + "step": 3886 + }, + { + "epoch": 0.31096, + "grad_norm": 1.6127012968063354, + "learning_rate": 7.805109699927985e-06, + "loss": 0.3655, + "step": 3887 + }, + { + "epoch": 0.31104, + "grad_norm": 1.7159547805786133, + "learning_rate": 7.804068533010938e-06, + "loss": 0.3361, + "step": 3888 + }, + { + "epoch": 0.31112, + "grad_norm": 1.4053112268447876, + "learning_rate": 7.803027188689859e-06, + "loss": 0.3538, + "step": 3889 + }, + { + "epoch": 0.3112, + "grad_norm": 1.524722695350647, + "learning_rate": 7.801985667030628e-06, + "loss": 0.4273, + "step": 3890 + }, + { + "epoch": 0.31128, + "grad_norm": 1.4059796333312988, + "learning_rate": 7.800943968099139e-06, + "loss": 0.3329, + "step": 3891 + }, + { + "epoch": 0.31136, + "grad_norm": 1.8556325435638428, + "learning_rate": 7.799902091961298e-06, + "loss": 0.4473, + "step": 3892 + }, + { + "epoch": 0.31144, + "grad_norm": 1.5466563701629639, + "learning_rate": 7.798860038683019e-06, + "loss": 0.3742, + "step": 3893 + }, + { + "epoch": 0.31152, + "grad_norm": 1.2450904846191406, + "learning_rate": 7.79781780833023e-06, + "loss": 0.2928, + "step": 3894 + }, + { + "epoch": 0.3116, + "grad_norm": 1.7872172594070435, + "learning_rate": 7.796775400968871e-06, + "loss": 0.415, + "step": 3895 + }, + { + "epoch": 0.31168, + "grad_norm": 1.6194071769714355, + "learning_rate": 7.79573281666489e-06, + "loss": 0.3123, + "step": 3896 + }, + { + "epoch": 0.31176, + "grad_norm": 1.4827814102172852, + "learning_rate": 7.794690055484249e-06, + "loss": 0.2888, + "step": 3897 + }, + { + "epoch": 0.31184, + "grad_norm": 1.2528879642486572, + "learning_rate": 7.793647117492919e-06, + "loss": 0.2737, + "step": 3898 + }, + { + "epoch": 0.31192, + "grad_norm": 1.405361294746399, + "learning_rate": 7.792604002756882e-06, + "loss": 0.3119, + "step": 3899 + }, + { + "epoch": 0.312, + "grad_norm": 1.7106093168258667, + "learning_rate": 7.791560711342134e-06, + "loss": 0.3068, + "step": 3900 + }, + { + "epoch": 0.31208, + "grad_norm": 2.097712755203247, + "learning_rate": 7.790517243314682e-06, + "loss": 0.5649, + "step": 3901 + }, + { + "epoch": 0.31216, + "grad_norm": 1.1007894277572632, + "learning_rate": 7.78947359874054e-06, + "loss": 0.2437, + "step": 3902 + }, + { + "epoch": 0.31224, + "grad_norm": 1.5791645050048828, + "learning_rate": 7.788429777685737e-06, + "loss": 0.3363, + "step": 3903 + }, + { + "epoch": 0.31232, + "grad_norm": 1.4127541780471802, + "learning_rate": 7.787385780216313e-06, + "loss": 0.3199, + "step": 3904 + }, + { + "epoch": 0.3124, + "grad_norm": 1.4177106618881226, + "learning_rate": 7.786341606398318e-06, + "loss": 0.3293, + "step": 3905 + }, + { + "epoch": 0.31248, + "grad_norm": 1.3515725135803223, + "learning_rate": 7.785297256297811e-06, + "loss": 0.2774, + "step": 3906 + }, + { + "epoch": 0.31256, + "grad_norm": 1.4087157249450684, + "learning_rate": 7.784252729980866e-06, + "loss": 0.3314, + "step": 3907 + }, + { + "epoch": 0.31264, + "grad_norm": 1.7299154996871948, + "learning_rate": 7.783208027513569e-06, + "loss": 0.3618, + "step": 3908 + }, + { + "epoch": 0.31272, + "grad_norm": 1.4551969766616821, + "learning_rate": 7.78216314896201e-06, + "loss": 0.3251, + "step": 3909 + }, + { + "epoch": 0.3128, + "grad_norm": 1.9065160751342773, + "learning_rate": 7.7811180943923e-06, + "loss": 0.5049, + "step": 3910 + }, + { + "epoch": 0.31288, + "grad_norm": 1.8001066446304321, + "learning_rate": 7.780072863870551e-06, + "loss": 0.4066, + "step": 3911 + }, + { + "epoch": 0.31296, + "grad_norm": 1.3444501161575317, + "learning_rate": 7.779027457462896e-06, + "loss": 0.3117, + "step": 3912 + }, + { + "epoch": 0.31304, + "grad_norm": 1.6955708265304565, + "learning_rate": 7.777981875235472e-06, + "loss": 0.4009, + "step": 3913 + }, + { + "epoch": 0.31312, + "grad_norm": 2.015012264251709, + "learning_rate": 7.776936117254432e-06, + "loss": 0.4429, + "step": 3914 + }, + { + "epoch": 0.3132, + "grad_norm": 1.5816140174865723, + "learning_rate": 7.775890183585932e-06, + "loss": 0.3805, + "step": 3915 + }, + { + "epoch": 0.31328, + "grad_norm": 1.2359122037887573, + "learning_rate": 7.77484407429615e-06, + "loss": 0.2843, + "step": 3916 + }, + { + "epoch": 0.31336, + "grad_norm": 1.453830361366272, + "learning_rate": 7.77379778945127e-06, + "loss": 0.3051, + "step": 3917 + }, + { + "epoch": 0.31344, + "grad_norm": 1.4686434268951416, + "learning_rate": 7.772751329117481e-06, + "loss": 0.3015, + "step": 3918 + }, + { + "epoch": 0.31352, + "grad_norm": 1.454747200012207, + "learning_rate": 7.771704693360996e-06, + "loss": 0.3953, + "step": 3919 + }, + { + "epoch": 0.3136, + "grad_norm": 1.7359517812728882, + "learning_rate": 7.77065788224803e-06, + "loss": 0.2675, + "step": 3920 + }, + { + "epoch": 0.31368, + "grad_norm": 1.285766839981079, + "learning_rate": 7.769610895844808e-06, + "loss": 0.2614, + "step": 3921 + }, + { + "epoch": 0.31376, + "grad_norm": 1.5808966159820557, + "learning_rate": 7.768563734217572e-06, + "loss": 0.2836, + "step": 3922 + }, + { + "epoch": 0.31384, + "grad_norm": 1.6945921182632446, + "learning_rate": 7.767516397432574e-06, + "loss": 0.3945, + "step": 3923 + }, + { + "epoch": 0.31392, + "grad_norm": 1.5754956007003784, + "learning_rate": 7.766468885556072e-06, + "loss": 0.3035, + "step": 3924 + }, + { + "epoch": 0.314, + "grad_norm": 1.4004344940185547, + "learning_rate": 7.76542119865434e-06, + "loss": 0.3592, + "step": 3925 + }, + { + "epoch": 0.31408, + "grad_norm": 1.5656622648239136, + "learning_rate": 7.764373336793662e-06, + "loss": 0.3183, + "step": 3926 + }, + { + "epoch": 0.31416, + "grad_norm": 1.9068760871887207, + "learning_rate": 7.763325300040333e-06, + "loss": 0.3495, + "step": 3927 + }, + { + "epoch": 0.31424, + "grad_norm": 1.5390478372573853, + "learning_rate": 7.762277088460659e-06, + "loss": 0.3439, + "step": 3928 + }, + { + "epoch": 0.31432, + "grad_norm": 1.4874063730239868, + "learning_rate": 7.761228702120956e-06, + "loss": 0.3257, + "step": 3929 + }, + { + "epoch": 0.3144, + "grad_norm": 1.8362594842910767, + "learning_rate": 7.760180141087552e-06, + "loss": 0.5189, + "step": 3930 + }, + { + "epoch": 0.31448, + "grad_norm": 1.7211016416549683, + "learning_rate": 7.759131405426785e-06, + "loss": 0.3418, + "step": 3931 + }, + { + "epoch": 0.31456, + "grad_norm": 1.237994909286499, + "learning_rate": 7.758082495205007e-06, + "loss": 0.2734, + "step": 3932 + }, + { + "epoch": 0.31464, + "grad_norm": 1.7199347019195557, + "learning_rate": 7.757033410488577e-06, + "loss": 0.4034, + "step": 3933 + }, + { + "epoch": 0.31472, + "grad_norm": 1.5101882219314575, + "learning_rate": 7.75598415134387e-06, + "loss": 0.288, + "step": 3934 + }, + { + "epoch": 0.3148, + "grad_norm": 1.4008091688156128, + "learning_rate": 7.754934717837267e-06, + "loss": 0.3252, + "step": 3935 + }, + { + "epoch": 0.31488, + "grad_norm": 1.382588505744934, + "learning_rate": 7.753885110035161e-06, + "loss": 0.2844, + "step": 3936 + }, + { + "epoch": 0.31496, + "grad_norm": 1.5880348682403564, + "learning_rate": 7.75283532800396e-06, + "loss": 0.2899, + "step": 3937 + }, + { + "epoch": 0.31504, + "grad_norm": 1.820833444595337, + "learning_rate": 7.751785371810079e-06, + "loss": 0.4055, + "step": 3938 + }, + { + "epoch": 0.31512, + "grad_norm": 1.5325983762741089, + "learning_rate": 7.750735241519945e-06, + "loss": 0.3185, + "step": 3939 + }, + { + "epoch": 0.3152, + "grad_norm": 1.4334882497787476, + "learning_rate": 7.749684937199995e-06, + "loss": 0.3512, + "step": 3940 + }, + { + "epoch": 0.31528, + "grad_norm": 1.317023754119873, + "learning_rate": 7.74863445891668e-06, + "loss": 0.2794, + "step": 3941 + }, + { + "epoch": 0.31536, + "grad_norm": 1.646425724029541, + "learning_rate": 7.74758380673646e-06, + "loss": 0.4987, + "step": 3942 + }, + { + "epoch": 0.31544, + "grad_norm": 1.4610693454742432, + "learning_rate": 7.746532980725806e-06, + "loss": 0.3181, + "step": 3943 + }, + { + "epoch": 0.31552, + "grad_norm": 1.675247311592102, + "learning_rate": 7.7454819809512e-06, + "loss": 0.3994, + "step": 3944 + }, + { + "epoch": 0.3156, + "grad_norm": 1.453763723373413, + "learning_rate": 7.744430807479136e-06, + "loss": 0.3001, + "step": 3945 + }, + { + "epoch": 0.31568, + "grad_norm": 1.418670415878296, + "learning_rate": 7.743379460376117e-06, + "loss": 0.2949, + "step": 3946 + }, + { + "epoch": 0.31576, + "grad_norm": 1.5007295608520508, + "learning_rate": 7.74232793970866e-06, + "loss": 0.4157, + "step": 3947 + }, + { + "epoch": 0.31584, + "grad_norm": 1.3415838479995728, + "learning_rate": 7.741276245543287e-06, + "loss": 0.3165, + "step": 3948 + }, + { + "epoch": 0.31592, + "grad_norm": 1.6637834310531616, + "learning_rate": 7.740224377946543e-06, + "loss": 0.419, + "step": 3949 + }, + { + "epoch": 0.316, + "grad_norm": 1.9902657270431519, + "learning_rate": 7.739172336984969e-06, + "loss": 0.3941, + "step": 3950 + }, + { + "epoch": 0.31608, + "grad_norm": 1.3266291618347168, + "learning_rate": 7.738120122725126e-06, + "loss": 0.3171, + "step": 3951 + }, + { + "epoch": 0.31616, + "grad_norm": 1.462837815284729, + "learning_rate": 7.737067735233586e-06, + "loss": 0.3602, + "step": 3952 + }, + { + "epoch": 0.31624, + "grad_norm": 1.3161622285842896, + "learning_rate": 7.736015174576927e-06, + "loss": 0.2974, + "step": 3953 + }, + { + "epoch": 0.31632, + "grad_norm": 1.4802780151367188, + "learning_rate": 7.734962440821742e-06, + "loss": 0.3562, + "step": 3954 + }, + { + "epoch": 0.3164, + "grad_norm": 1.5523854494094849, + "learning_rate": 7.733909534034638e-06, + "loss": 0.3115, + "step": 3955 + }, + { + "epoch": 0.31648, + "grad_norm": 1.815574288368225, + "learning_rate": 7.732856454282223e-06, + "loss": 0.3293, + "step": 3956 + }, + { + "epoch": 0.31656, + "grad_norm": 2.250521183013916, + "learning_rate": 7.731803201631125e-06, + "loss": 0.597, + "step": 3957 + }, + { + "epoch": 0.31664, + "grad_norm": 1.4423999786376953, + "learning_rate": 7.730749776147978e-06, + "loss": 0.3607, + "step": 3958 + }, + { + "epoch": 0.31672, + "grad_norm": 1.7350084781646729, + "learning_rate": 7.729696177899432e-06, + "loss": 0.4186, + "step": 3959 + }, + { + "epoch": 0.3168, + "grad_norm": 1.1580049991607666, + "learning_rate": 7.72864240695214e-06, + "loss": 0.2433, + "step": 3960 + }, + { + "epoch": 0.31688, + "grad_norm": 1.309934139251709, + "learning_rate": 7.727588463372775e-06, + "loss": 0.2844, + "step": 3961 + }, + { + "epoch": 0.31696, + "grad_norm": 1.4602081775665283, + "learning_rate": 7.726534347228013e-06, + "loss": 0.4136, + "step": 3962 + }, + { + "epoch": 0.31704, + "grad_norm": 1.3389019966125488, + "learning_rate": 7.725480058584547e-06, + "loss": 0.2797, + "step": 3963 + }, + { + "epoch": 0.31712, + "grad_norm": 1.8739509582519531, + "learning_rate": 7.724425597509077e-06, + "loss": 0.4236, + "step": 3964 + }, + { + "epoch": 0.3172, + "grad_norm": 0.9077721834182739, + "learning_rate": 7.723370964068315e-06, + "loss": 0.2111, + "step": 3965 + }, + { + "epoch": 0.31728, + "grad_norm": 1.4916582107543945, + "learning_rate": 7.722316158328983e-06, + "loss": 0.3819, + "step": 3966 + }, + { + "epoch": 0.31736, + "grad_norm": 1.4052239656448364, + "learning_rate": 7.72126118035782e-06, + "loss": 0.2928, + "step": 3967 + }, + { + "epoch": 0.31744, + "grad_norm": 1.968377947807312, + "learning_rate": 7.720206030221567e-06, + "loss": 0.3799, + "step": 3968 + }, + { + "epoch": 0.31752, + "grad_norm": 1.677701473236084, + "learning_rate": 7.719150707986977e-06, + "loss": 0.4268, + "step": 3969 + }, + { + "epoch": 0.3176, + "grad_norm": 1.2914515733718872, + "learning_rate": 7.718095213720822e-06, + "loss": 0.2356, + "step": 3970 + }, + { + "epoch": 0.31768, + "grad_norm": 1.7517569065093994, + "learning_rate": 7.717039547489877e-06, + "loss": 0.3818, + "step": 3971 + }, + { + "epoch": 0.31776, + "grad_norm": 1.708135962486267, + "learning_rate": 7.715983709360932e-06, + "loss": 0.3412, + "step": 3972 + }, + { + "epoch": 0.31784, + "grad_norm": 1.5727461576461792, + "learning_rate": 7.714927699400785e-06, + "loss": 0.3977, + "step": 3973 + }, + { + "epoch": 0.31792, + "grad_norm": 1.4590837955474854, + "learning_rate": 7.713871517676247e-06, + "loss": 0.3593, + "step": 3974 + }, + { + "epoch": 0.318, + "grad_norm": 1.291216492652893, + "learning_rate": 7.712815164254138e-06, + "loss": 0.2984, + "step": 3975 + }, + { + "epoch": 0.31808, + "grad_norm": 2.102529525756836, + "learning_rate": 7.71175863920129e-06, + "loss": 0.392, + "step": 3976 + }, + { + "epoch": 0.31816, + "grad_norm": 2.003671169281006, + "learning_rate": 7.710701942584549e-06, + "loss": 0.3816, + "step": 3977 + }, + { + "epoch": 0.31824, + "grad_norm": 1.6712678670883179, + "learning_rate": 7.709645074470764e-06, + "loss": 0.3481, + "step": 3978 + }, + { + "epoch": 0.31832, + "grad_norm": 1.2910213470458984, + "learning_rate": 7.708588034926802e-06, + "loss": 0.2783, + "step": 3979 + }, + { + "epoch": 0.3184, + "grad_norm": 2.094609260559082, + "learning_rate": 7.70753082401954e-06, + "loss": 0.6013, + "step": 3980 + }, + { + "epoch": 0.31848, + "grad_norm": 1.9006894826889038, + "learning_rate": 7.70647344181586e-06, + "loss": 0.3544, + "step": 3981 + }, + { + "epoch": 0.31856, + "grad_norm": 1.320239543914795, + "learning_rate": 7.705415888382662e-06, + "loss": 0.3105, + "step": 3982 + }, + { + "epoch": 0.31864, + "grad_norm": 1.9996472597122192, + "learning_rate": 7.704358163786852e-06, + "loss": 0.4681, + "step": 3983 + }, + { + "epoch": 0.31872, + "grad_norm": 1.3153249025344849, + "learning_rate": 7.70330026809535e-06, + "loss": 0.2759, + "step": 3984 + }, + { + "epoch": 0.3188, + "grad_norm": 1.4559813737869263, + "learning_rate": 7.702242201375087e-06, + "loss": 0.3541, + "step": 3985 + }, + { + "epoch": 0.31888, + "grad_norm": 1.5201846361160278, + "learning_rate": 7.701183963693e-06, + "loss": 0.323, + "step": 3986 + }, + { + "epoch": 0.31896, + "grad_norm": 1.1571747064590454, + "learning_rate": 7.700125555116043e-06, + "loss": 0.2244, + "step": 3987 + }, + { + "epoch": 0.31904, + "grad_norm": 1.2063324451446533, + "learning_rate": 7.699066975711176e-06, + "loss": 0.3063, + "step": 3988 + }, + { + "epoch": 0.31912, + "grad_norm": 1.5614107847213745, + "learning_rate": 7.698008225545372e-06, + "loss": 0.3284, + "step": 3989 + }, + { + "epoch": 0.3192, + "grad_norm": 1.713163137435913, + "learning_rate": 7.696949304685616e-06, + "loss": 0.384, + "step": 3990 + }, + { + "epoch": 0.31928, + "grad_norm": 1.4818403720855713, + "learning_rate": 7.695890213198902e-06, + "loss": 0.3769, + "step": 3991 + }, + { + "epoch": 0.31936, + "grad_norm": 1.8636175394058228, + "learning_rate": 7.694830951152233e-06, + "loss": 0.3745, + "step": 3992 + }, + { + "epoch": 0.31944, + "grad_norm": 1.5398733615875244, + "learning_rate": 7.693771518612627e-06, + "loss": 0.3747, + "step": 3993 + }, + { + "epoch": 0.31952, + "grad_norm": 1.2448610067367554, + "learning_rate": 7.69271191564711e-06, + "loss": 0.2528, + "step": 3994 + }, + { + "epoch": 0.3196, + "grad_norm": 1.3927911520004272, + "learning_rate": 7.691652142322723e-06, + "loss": 0.3104, + "step": 3995 + }, + { + "epoch": 0.31968, + "grad_norm": 1.3788466453552246, + "learning_rate": 7.690592198706508e-06, + "loss": 0.3017, + "step": 3996 + }, + { + "epoch": 0.31976, + "grad_norm": 1.8893440961837769, + "learning_rate": 7.68953208486553e-06, + "loss": 0.4398, + "step": 3997 + }, + { + "epoch": 0.31984, + "grad_norm": 1.4432144165039062, + "learning_rate": 7.688471800866855e-06, + "loss": 0.3526, + "step": 3998 + }, + { + "epoch": 0.31992, + "grad_norm": 1.7225422859191895, + "learning_rate": 7.687411346777564e-06, + "loss": 0.3249, + "step": 3999 + }, + { + "epoch": 0.32, + "grad_norm": 1.5686019659042358, + "learning_rate": 7.686350722664751e-06, + "loss": 0.2928, + "step": 4000 + }, + { + "epoch": 0.32008, + "grad_norm": 1.5652544498443604, + "learning_rate": 7.685289928595514e-06, + "loss": 0.3355, + "step": 4001 + }, + { + "epoch": 0.32016, + "grad_norm": 1.6727375984191895, + "learning_rate": 7.68422896463697e-06, + "loss": 0.3482, + "step": 4002 + }, + { + "epoch": 0.32024, + "grad_norm": 1.4307200908660889, + "learning_rate": 7.68316783085624e-06, + "loss": 0.34, + "step": 4003 + }, + { + "epoch": 0.32032, + "grad_norm": 1.5332558155059814, + "learning_rate": 7.682106527320458e-06, + "loss": 0.3796, + "step": 4004 + }, + { + "epoch": 0.3204, + "grad_norm": 1.9234296083450317, + "learning_rate": 7.681045054096773e-06, + "loss": 0.3732, + "step": 4005 + }, + { + "epoch": 0.32048, + "grad_norm": 1.8026024103164673, + "learning_rate": 7.679983411252336e-06, + "loss": 0.3543, + "step": 4006 + }, + { + "epoch": 0.32056, + "grad_norm": 1.5684106349945068, + "learning_rate": 7.678921598854316e-06, + "loss": 0.3274, + "step": 4007 + }, + { + "epoch": 0.32064, + "grad_norm": 1.4080421924591064, + "learning_rate": 7.677859616969892e-06, + "loss": 0.346, + "step": 4008 + }, + { + "epoch": 0.32072, + "grad_norm": 1.667383074760437, + "learning_rate": 7.67679746566625e-06, + "loss": 0.3846, + "step": 4009 + }, + { + "epoch": 0.3208, + "grad_norm": 1.5758278369903564, + "learning_rate": 7.67573514501059e-06, + "loss": 0.396, + "step": 4010 + }, + { + "epoch": 0.32088, + "grad_norm": 1.7792264223098755, + "learning_rate": 7.674672655070117e-06, + "loss": 0.441, + "step": 4011 + }, + { + "epoch": 0.32096, + "grad_norm": 1.3458983898162842, + "learning_rate": 7.673609995912056e-06, + "loss": 0.2971, + "step": 4012 + }, + { + "epoch": 0.32104, + "grad_norm": 1.343664526939392, + "learning_rate": 7.672547167603638e-06, + "loss": 0.3028, + "step": 4013 + }, + { + "epoch": 0.32112, + "grad_norm": 1.9842946529388428, + "learning_rate": 7.6714841702121e-06, + "loss": 0.3945, + "step": 4014 + }, + { + "epoch": 0.3212, + "grad_norm": 1.2746963500976562, + "learning_rate": 7.6704210038047e-06, + "loss": 0.2742, + "step": 4015 + }, + { + "epoch": 0.32128, + "grad_norm": 1.8831450939178467, + "learning_rate": 7.669357668448695e-06, + "loss": 0.3977, + "step": 4016 + }, + { + "epoch": 0.32136, + "grad_norm": 1.4678828716278076, + "learning_rate": 7.668294164211365e-06, + "loss": 0.3188, + "step": 4017 + }, + { + "epoch": 0.32144, + "grad_norm": 2.1258130073547363, + "learning_rate": 7.667230491159992e-06, + "loss": 0.3754, + "step": 4018 + }, + { + "epoch": 0.32152, + "grad_norm": 2.3288698196411133, + "learning_rate": 7.666166649361868e-06, + "loss": 0.6207, + "step": 4019 + }, + { + "epoch": 0.3216, + "grad_norm": 1.5426217317581177, + "learning_rate": 7.665102638884303e-06, + "loss": 0.331, + "step": 4020 + }, + { + "epoch": 0.32168, + "grad_norm": 1.523314356803894, + "learning_rate": 7.664038459794612e-06, + "loss": 0.3148, + "step": 4021 + }, + { + "epoch": 0.32176, + "grad_norm": 1.3950161933898926, + "learning_rate": 7.66297411216012e-06, + "loss": 0.2575, + "step": 4022 + }, + { + "epoch": 0.32184, + "grad_norm": 1.7832293510437012, + "learning_rate": 7.661909596048167e-06, + "loss": 0.3661, + "step": 4023 + }, + { + "epoch": 0.32192, + "grad_norm": 1.4251043796539307, + "learning_rate": 7.6608449115261e-06, + "loss": 0.3476, + "step": 4024 + }, + { + "epoch": 0.322, + "grad_norm": 1.7455947399139404, + "learning_rate": 7.659780058661278e-06, + "loss": 0.4794, + "step": 4025 + }, + { + "epoch": 0.32208, + "grad_norm": 1.5899649858474731, + "learning_rate": 7.658715037521074e-06, + "loss": 0.3369, + "step": 4026 + }, + { + "epoch": 0.32216, + "grad_norm": 1.148187279701233, + "learning_rate": 7.657649848172864e-06, + "loss": 0.2667, + "step": 4027 + }, + { + "epoch": 0.32224, + "grad_norm": 1.3705857992172241, + "learning_rate": 7.65658449068404e-06, + "loss": 0.2696, + "step": 4028 + }, + { + "epoch": 0.32232, + "grad_norm": 1.8258371353149414, + "learning_rate": 7.655518965122004e-06, + "loss": 0.3487, + "step": 4029 + }, + { + "epoch": 0.3224, + "grad_norm": 1.5610127449035645, + "learning_rate": 7.65445327155417e-06, + "loss": 0.353, + "step": 4030 + }, + { + "epoch": 0.32248, + "grad_norm": 1.566694974899292, + "learning_rate": 7.653387410047958e-06, + "loss": 0.3971, + "step": 4031 + }, + { + "epoch": 0.32256, + "grad_norm": 1.6996960639953613, + "learning_rate": 7.652321380670805e-06, + "loss": 0.3876, + "step": 4032 + }, + { + "epoch": 0.32264, + "grad_norm": 1.460684061050415, + "learning_rate": 7.65125518349015e-06, + "loss": 0.3302, + "step": 4033 + }, + { + "epoch": 0.32272, + "grad_norm": 1.5468209981918335, + "learning_rate": 7.650188818573452e-06, + "loss": 0.3631, + "step": 4034 + }, + { + "epoch": 0.3228, + "grad_norm": 1.5066543817520142, + "learning_rate": 7.649122285988175e-06, + "loss": 0.3756, + "step": 4035 + }, + { + "epoch": 0.32288, + "grad_norm": 1.4689096212387085, + "learning_rate": 7.648055585801795e-06, + "loss": 0.3488, + "step": 4036 + }, + { + "epoch": 0.32296, + "grad_norm": 1.2508280277252197, + "learning_rate": 7.646988718081799e-06, + "loss": 0.2692, + "step": 4037 + }, + { + "epoch": 0.32304, + "grad_norm": 1.4631690979003906, + "learning_rate": 7.645921682895684e-06, + "loss": 0.3593, + "step": 4038 + }, + { + "epoch": 0.32312, + "grad_norm": 1.4271583557128906, + "learning_rate": 7.644854480310955e-06, + "loss": 0.3081, + "step": 4039 + }, + { + "epoch": 0.3232, + "grad_norm": 1.613905906677246, + "learning_rate": 7.643787110395135e-06, + "loss": 0.4458, + "step": 4040 + }, + { + "epoch": 0.32328, + "grad_norm": 1.3938994407653809, + "learning_rate": 7.642719573215748e-06, + "loss": 0.3038, + "step": 4041 + }, + { + "epoch": 0.32336, + "grad_norm": 1.4477261304855347, + "learning_rate": 7.641651868840338e-06, + "loss": 0.3112, + "step": 4042 + }, + { + "epoch": 0.32344, + "grad_norm": 1.692185401916504, + "learning_rate": 7.640583997336455e-06, + "loss": 0.4446, + "step": 4043 + }, + { + "epoch": 0.32352, + "grad_norm": 1.4064701795578003, + "learning_rate": 7.639515958771656e-06, + "loss": 0.2536, + "step": 4044 + }, + { + "epoch": 0.3236, + "grad_norm": 1.7881546020507812, + "learning_rate": 7.638447753213516e-06, + "loss": 0.4048, + "step": 4045 + }, + { + "epoch": 0.32368, + "grad_norm": 1.3405356407165527, + "learning_rate": 7.637379380729612e-06, + "loss": 0.2965, + "step": 4046 + }, + { + "epoch": 0.32376, + "grad_norm": 1.182118535041809, + "learning_rate": 7.636310841387541e-06, + "loss": 0.2477, + "step": 4047 + }, + { + "epoch": 0.32384, + "grad_norm": 1.4148246049880981, + "learning_rate": 7.635242135254903e-06, + "loss": 0.2628, + "step": 4048 + }, + { + "epoch": 0.32392, + "grad_norm": 1.527912974357605, + "learning_rate": 7.634173262399315e-06, + "loss": 0.3746, + "step": 4049 + }, + { + "epoch": 0.324, + "grad_norm": 1.4282273054122925, + "learning_rate": 7.633104222888396e-06, + "loss": 0.2963, + "step": 4050 + }, + { + "epoch": 0.32408, + "grad_norm": 1.5053621530532837, + "learning_rate": 7.632035016789785e-06, + "loss": 0.3858, + "step": 4051 + }, + { + "epoch": 0.32416, + "grad_norm": 1.633727788925171, + "learning_rate": 7.630965644171124e-06, + "loss": 0.4096, + "step": 4052 + }, + { + "epoch": 0.32424, + "grad_norm": 1.6081035137176514, + "learning_rate": 7.62989610510007e-06, + "loss": 0.296, + "step": 4053 + }, + { + "epoch": 0.32432, + "grad_norm": 1.2334622144699097, + "learning_rate": 7.628826399644292e-06, + "loss": 0.2918, + "step": 4054 + }, + { + "epoch": 0.3244, + "grad_norm": 1.6431307792663574, + "learning_rate": 7.627756527871459e-06, + "loss": 0.3395, + "step": 4055 + }, + { + "epoch": 0.32448, + "grad_norm": 1.6120516061782837, + "learning_rate": 7.626686489849266e-06, + "loss": 0.3045, + "step": 4056 + }, + { + "epoch": 0.32456, + "grad_norm": 1.4079047441482544, + "learning_rate": 7.625616285645408e-06, + "loss": 0.316, + "step": 4057 + }, + { + "epoch": 0.32464, + "grad_norm": 1.4125021696090698, + "learning_rate": 7.624545915327593e-06, + "loss": 0.3104, + "step": 4058 + }, + { + "epoch": 0.32472, + "grad_norm": 1.4613399505615234, + "learning_rate": 7.6234753789635374e-06, + "loss": 0.4579, + "step": 4059 + }, + { + "epoch": 0.3248, + "grad_norm": 1.8771039247512817, + "learning_rate": 7.622404676620974e-06, + "loss": 0.3648, + "step": 4060 + }, + { + "epoch": 0.32488, + "grad_norm": 1.4824306964874268, + "learning_rate": 7.621333808367643e-06, + "loss": 0.2957, + "step": 4061 + }, + { + "epoch": 0.32496, + "grad_norm": 1.4490859508514404, + "learning_rate": 7.62026277427129e-06, + "loss": 0.2975, + "step": 4062 + }, + { + "epoch": 0.32504, + "grad_norm": 1.4416332244873047, + "learning_rate": 7.61919157439968e-06, + "loss": 0.4024, + "step": 4063 + }, + { + "epoch": 0.32512, + "grad_norm": 1.0941718816757202, + "learning_rate": 7.618120208820583e-06, + "loss": 0.2395, + "step": 4064 + }, + { + "epoch": 0.3252, + "grad_norm": 1.8790380954742432, + "learning_rate": 7.617048677601781e-06, + "loss": 0.4126, + "step": 4065 + }, + { + "epoch": 0.32528, + "grad_norm": 1.4821059703826904, + "learning_rate": 7.615976980811067e-06, + "loss": 0.3245, + "step": 4066 + }, + { + "epoch": 0.32536, + "grad_norm": 1.9037108421325684, + "learning_rate": 7.61490511851624e-06, + "loss": 0.4843, + "step": 4067 + }, + { + "epoch": 0.32544, + "grad_norm": 1.4991943836212158, + "learning_rate": 7.613833090785117e-06, + "loss": 0.3384, + "step": 4068 + }, + { + "epoch": 0.32552, + "grad_norm": 1.4513674974441528, + "learning_rate": 7.612760897685519e-06, + "loss": 0.3904, + "step": 4069 + }, + { + "epoch": 0.3256, + "grad_norm": 1.277267575263977, + "learning_rate": 7.611688539285283e-06, + "loss": 0.2926, + "step": 4070 + }, + { + "epoch": 0.32568, + "grad_norm": 1.3457417488098145, + "learning_rate": 7.61061601565225e-06, + "loss": 0.2733, + "step": 4071 + }, + { + "epoch": 0.32576, + "grad_norm": 1.9008210897445679, + "learning_rate": 7.609543326854278e-06, + "loss": 0.4293, + "step": 4072 + }, + { + "epoch": 0.32584, + "grad_norm": 1.4903104305267334, + "learning_rate": 7.608470472959233e-06, + "loss": 0.3002, + "step": 4073 + }, + { + "epoch": 0.32592, + "grad_norm": 1.4304206371307373, + "learning_rate": 7.607397454034986e-06, + "loss": 0.3073, + "step": 4074 + }, + { + "epoch": 0.326, + "grad_norm": 1.7637519836425781, + "learning_rate": 7.606324270149428e-06, + "loss": 0.3062, + "step": 4075 + }, + { + "epoch": 0.32608, + "grad_norm": 1.7573155164718628, + "learning_rate": 7.605250921370454e-06, + "loss": 0.3944, + "step": 4076 + }, + { + "epoch": 0.32616, + "grad_norm": 1.3438637256622314, + "learning_rate": 7.604177407765972e-06, + "loss": 0.2591, + "step": 4077 + }, + { + "epoch": 0.32624, + "grad_norm": 1.9403959512710571, + "learning_rate": 7.603103729403898e-06, + "loss": 0.5459, + "step": 4078 + }, + { + "epoch": 0.32632, + "grad_norm": 1.421299934387207, + "learning_rate": 7.602029886352163e-06, + "loss": 0.3475, + "step": 4079 + }, + { + "epoch": 0.3264, + "grad_norm": 1.635377287864685, + "learning_rate": 7.600955878678702e-06, + "loss": 0.3548, + "step": 4080 + }, + { + "epoch": 0.32648, + "grad_norm": 1.75666344165802, + "learning_rate": 7.599881706451464e-06, + "loss": 0.5445, + "step": 4081 + }, + { + "epoch": 0.32656, + "grad_norm": 1.7399910688400269, + "learning_rate": 7.598807369738411e-06, + "loss": 0.3803, + "step": 4082 + }, + { + "epoch": 0.32664, + "grad_norm": 1.6762953996658325, + "learning_rate": 7.597732868607511e-06, + "loss": 0.3633, + "step": 4083 + }, + { + "epoch": 0.32672, + "grad_norm": 1.6950074434280396, + "learning_rate": 7.5966582031267455e-06, + "loss": 0.4613, + "step": 4084 + }, + { + "epoch": 0.3268, + "grad_norm": 1.7901536226272583, + "learning_rate": 7.595583373364104e-06, + "loss": 0.3687, + "step": 4085 + }, + { + "epoch": 0.32688, + "grad_norm": 1.0144829750061035, + "learning_rate": 7.594508379387586e-06, + "loss": 0.2249, + "step": 4086 + }, + { + "epoch": 0.32696, + "grad_norm": 1.360308051109314, + "learning_rate": 7.593433221265205e-06, + "loss": 0.3309, + "step": 4087 + }, + { + "epoch": 0.32704, + "grad_norm": 1.6082754135131836, + "learning_rate": 7.59235789906498e-06, + "loss": 0.3511, + "step": 4088 + }, + { + "epoch": 0.32712, + "grad_norm": 1.4917949438095093, + "learning_rate": 7.591282412854947e-06, + "loss": 0.3855, + "step": 4089 + }, + { + "epoch": 0.3272, + "grad_norm": 1.895923376083374, + "learning_rate": 7.590206762703145e-06, + "loss": 0.3869, + "step": 4090 + }, + { + "epoch": 0.32728, + "grad_norm": 1.37894606590271, + "learning_rate": 7.58913094867763e-06, + "loss": 0.3502, + "step": 4091 + }, + { + "epoch": 0.32736, + "grad_norm": 1.5004284381866455, + "learning_rate": 7.588054970846461e-06, + "loss": 0.3698, + "step": 4092 + }, + { + "epoch": 0.32744, + "grad_norm": 1.6978641748428345, + "learning_rate": 7.586978829277714e-06, + "loss": 0.3473, + "step": 4093 + }, + { + "epoch": 0.32752, + "grad_norm": 1.7193430662155151, + "learning_rate": 7.5859025240394725e-06, + "loss": 0.3368, + "step": 4094 + }, + { + "epoch": 0.3276, + "grad_norm": 1.4797018766403198, + "learning_rate": 7.584826055199831e-06, + "loss": 0.2682, + "step": 4095 + }, + { + "epoch": 0.32768, + "grad_norm": 1.4176084995269775, + "learning_rate": 7.583749422826894e-06, + "loss": 0.2922, + "step": 4096 + }, + { + "epoch": 0.32776, + "grad_norm": 1.4341297149658203, + "learning_rate": 7.582672626988776e-06, + "loss": 0.3214, + "step": 4097 + }, + { + "epoch": 0.32784, + "grad_norm": 1.48208749294281, + "learning_rate": 7.581595667753603e-06, + "loss": 0.3318, + "step": 4098 + }, + { + "epoch": 0.32792, + "grad_norm": 1.4464733600616455, + "learning_rate": 7.580518545189509e-06, + "loss": 0.358, + "step": 4099 + }, + { + "epoch": 0.328, + "grad_norm": 1.890705943107605, + "learning_rate": 7.579441259364643e-06, + "loss": 0.5347, + "step": 4100 + }, + { + "epoch": 0.32808, + "grad_norm": 1.4273359775543213, + "learning_rate": 7.578363810347158e-06, + "loss": 0.3098, + "step": 4101 + }, + { + "epoch": 0.32816, + "grad_norm": 1.5816519260406494, + "learning_rate": 7.577286198205223e-06, + "loss": 0.3446, + "step": 4102 + }, + { + "epoch": 0.32824, + "grad_norm": 1.684738278388977, + "learning_rate": 7.576208423007012e-06, + "loss": 0.4043, + "step": 4103 + }, + { + "epoch": 0.32832, + "grad_norm": 1.6417845487594604, + "learning_rate": 7.575130484820715e-06, + "loss": 0.3435, + "step": 4104 + }, + { + "epoch": 0.3284, + "grad_norm": 1.5224345922470093, + "learning_rate": 7.57405238371453e-06, + "loss": 0.4164, + "step": 4105 + }, + { + "epoch": 0.32848, + "grad_norm": 1.3996219635009766, + "learning_rate": 7.5729741197566604e-06, + "loss": 0.3818, + "step": 4106 + }, + { + "epoch": 0.32856, + "grad_norm": 1.8475956916809082, + "learning_rate": 7.571895693015329e-06, + "loss": 0.5135, + "step": 4107 + }, + { + "epoch": 0.32864, + "grad_norm": 2.0243968963623047, + "learning_rate": 7.570817103558764e-06, + "loss": 0.3888, + "step": 4108 + }, + { + "epoch": 0.32872, + "grad_norm": 2.109097480773926, + "learning_rate": 7.569738351455203e-06, + "loss": 0.3905, + "step": 4109 + }, + { + "epoch": 0.3288, + "grad_norm": 1.4075511693954468, + "learning_rate": 7.568659436772892e-06, + "loss": 0.2856, + "step": 4110 + }, + { + "epoch": 0.32888, + "grad_norm": 1.384484052658081, + "learning_rate": 7.567580359580095e-06, + "loss": 0.3404, + "step": 4111 + }, + { + "epoch": 0.32896, + "grad_norm": 1.7616000175476074, + "learning_rate": 7.56650111994508e-06, + "loss": 0.3693, + "step": 4112 + }, + { + "epoch": 0.32904, + "grad_norm": 1.8043757677078247, + "learning_rate": 7.565421717936127e-06, + "loss": 0.3948, + "step": 4113 + }, + { + "epoch": 0.32912, + "grad_norm": 1.4485535621643066, + "learning_rate": 7.564342153621525e-06, + "loss": 0.3289, + "step": 4114 + }, + { + "epoch": 0.3292, + "grad_norm": 1.7356315851211548, + "learning_rate": 7.5632624270695755e-06, + "loss": 0.3701, + "step": 4115 + }, + { + "epoch": 0.32928, + "grad_norm": 1.4794209003448486, + "learning_rate": 7.562182538348589e-06, + "loss": 0.3695, + "step": 4116 + }, + { + "epoch": 0.32936, + "grad_norm": 1.806711196899414, + "learning_rate": 7.5611024875268856e-06, + "loss": 0.3913, + "step": 4117 + }, + { + "epoch": 0.32944, + "grad_norm": 1.9311702251434326, + "learning_rate": 7.560022274672798e-06, + "loss": 0.3523, + "step": 4118 + }, + { + "epoch": 0.32952, + "grad_norm": 1.8792706727981567, + "learning_rate": 7.5589418998546675e-06, + "loss": 0.477, + "step": 4119 + }, + { + "epoch": 0.3296, + "grad_norm": 1.8583472967147827, + "learning_rate": 7.5578613631408444e-06, + "loss": 0.4641, + "step": 4120 + }, + { + "epoch": 0.32968, + "grad_norm": 1.8453302383422852, + "learning_rate": 7.556780664599692e-06, + "loss": 0.365, + "step": 4121 + }, + { + "epoch": 0.32976, + "grad_norm": 1.3581293821334839, + "learning_rate": 7.555699804299581e-06, + "loss": 0.3245, + "step": 4122 + }, + { + "epoch": 0.32984, + "grad_norm": 1.490319848060608, + "learning_rate": 7.554618782308897e-06, + "loss": 0.2991, + "step": 4123 + }, + { + "epoch": 0.32992, + "grad_norm": 1.4145665168762207, + "learning_rate": 7.553537598696028e-06, + "loss": 0.2676, + "step": 4124 + }, + { + "epoch": 0.33, + "grad_norm": 2.0299646854400635, + "learning_rate": 7.552456253529381e-06, + "loss": 0.3604, + "step": 4125 + }, + { + "epoch": 0.33008, + "grad_norm": 1.8027198314666748, + "learning_rate": 7.551374746877367e-06, + "loss": 0.5369, + "step": 4126 + }, + { + "epoch": 0.33016, + "grad_norm": 1.3814693689346313, + "learning_rate": 7.550293078808408e-06, + "loss": 0.3013, + "step": 4127 + }, + { + "epoch": 0.33024, + "grad_norm": 1.5172748565673828, + "learning_rate": 7.549211249390942e-06, + "loss": 0.3157, + "step": 4128 + }, + { + "epoch": 0.33032, + "grad_norm": 1.5251926183700562, + "learning_rate": 7.548129258693407e-06, + "loss": 0.3171, + "step": 4129 + }, + { + "epoch": 0.3304, + "grad_norm": 1.377307415008545, + "learning_rate": 7.547047106784262e-06, + "loss": 0.3191, + "step": 4130 + }, + { + "epoch": 0.33048, + "grad_norm": 1.4914178848266602, + "learning_rate": 7.545964793731968e-06, + "loss": 0.3543, + "step": 4131 + }, + { + "epoch": 0.33056, + "grad_norm": 1.3784854412078857, + "learning_rate": 7.544882319605e-06, + "loss": 0.2985, + "step": 4132 + }, + { + "epoch": 0.33064, + "grad_norm": 1.4411064386367798, + "learning_rate": 7.543799684471845e-06, + "loss": 0.3399, + "step": 4133 + }, + { + "epoch": 0.33072, + "grad_norm": 0.9017268419265747, + "learning_rate": 7.542716888400994e-06, + "loss": 0.2339, + "step": 4134 + }, + { + "epoch": 0.3308, + "grad_norm": 1.7791081666946411, + "learning_rate": 7.541633931460953e-06, + "loss": 0.4546, + "step": 4135 + }, + { + "epoch": 0.33088, + "grad_norm": 1.6857120990753174, + "learning_rate": 7.540550813720238e-06, + "loss": 0.3708, + "step": 4136 + }, + { + "epoch": 0.33096, + "grad_norm": 1.3614606857299805, + "learning_rate": 7.5394675352473735e-06, + "loss": 0.2817, + "step": 4137 + }, + { + "epoch": 0.33104, + "grad_norm": 1.4521223306655884, + "learning_rate": 7.538384096110896e-06, + "loss": 0.3394, + "step": 4138 + }, + { + "epoch": 0.33112, + "grad_norm": 1.4563080072402954, + "learning_rate": 7.5373004963793496e-06, + "loss": 0.2831, + "step": 4139 + }, + { + "epoch": 0.3312, + "grad_norm": 1.2740604877471924, + "learning_rate": 7.53621673612129e-06, + "loss": 0.3314, + "step": 4140 + }, + { + "epoch": 0.33128, + "grad_norm": 1.6157991886138916, + "learning_rate": 7.5351328154052835e-06, + "loss": 0.334, + "step": 4141 + }, + { + "epoch": 0.33136, + "grad_norm": 1.7817760705947876, + "learning_rate": 7.534048734299908e-06, + "loss": 0.3722, + "step": 4142 + }, + { + "epoch": 0.33144, + "grad_norm": 1.3499414920806885, + "learning_rate": 7.532964492873747e-06, + "loss": 0.2966, + "step": 4143 + }, + { + "epoch": 0.33152, + "grad_norm": 1.4766731262207031, + "learning_rate": 7.531880091195398e-06, + "loss": 0.3477, + "step": 4144 + }, + { + "epoch": 0.3316, + "grad_norm": 1.642026424407959, + "learning_rate": 7.530795529333468e-06, + "loss": 0.3435, + "step": 4145 + }, + { + "epoch": 0.33168, + "grad_norm": 1.4514563083648682, + "learning_rate": 7.529710807356572e-06, + "loss": 0.2747, + "step": 4146 + }, + { + "epoch": 0.33176, + "grad_norm": 1.4751369953155518, + "learning_rate": 7.528625925333337e-06, + "loss": 0.3201, + "step": 4147 + }, + { + "epoch": 0.33184, + "grad_norm": 1.3058736324310303, + "learning_rate": 7.5275408833324025e-06, + "loss": 0.2439, + "step": 4148 + }, + { + "epoch": 0.33192, + "grad_norm": 1.7958042621612549, + "learning_rate": 7.526455681422413e-06, + "loss": 0.344, + "step": 4149 + }, + { + "epoch": 0.332, + "grad_norm": 1.462531328201294, + "learning_rate": 7.525370319672025e-06, + "loss": 0.2951, + "step": 4150 + }, + { + "epoch": 0.33208, + "grad_norm": 1.6958011388778687, + "learning_rate": 7.5242847981499064e-06, + "loss": 0.4044, + "step": 4151 + }, + { + "epoch": 0.33216, + "grad_norm": 1.6132330894470215, + "learning_rate": 7.5231991169247355e-06, + "loss": 0.5282, + "step": 4152 + }, + { + "epoch": 0.33224, + "grad_norm": 1.6550413370132446, + "learning_rate": 7.522113276065199e-06, + "loss": 0.3572, + "step": 4153 + }, + { + "epoch": 0.33232, + "grad_norm": 1.4192287921905518, + "learning_rate": 7.521027275639996e-06, + "loss": 0.3088, + "step": 4154 + }, + { + "epoch": 0.3324, + "grad_norm": 1.506937861442566, + "learning_rate": 7.5199411157178316e-06, + "loss": 0.2994, + "step": 4155 + }, + { + "epoch": 0.33248, + "grad_norm": 1.411502480506897, + "learning_rate": 7.518854796367424e-06, + "loss": 0.2824, + "step": 4156 + }, + { + "epoch": 0.33256, + "grad_norm": 1.7876211404800415, + "learning_rate": 7.517768317657503e-06, + "loss": 0.3639, + "step": 4157 + }, + { + "epoch": 0.33264, + "grad_norm": 1.318710207939148, + "learning_rate": 7.516681679656804e-06, + "loss": 0.2662, + "step": 4158 + }, + { + "epoch": 0.33272, + "grad_norm": 1.4276552200317383, + "learning_rate": 7.515594882434076e-06, + "loss": 0.3084, + "step": 4159 + }, + { + "epoch": 0.3328, + "grad_norm": 1.7345210313796997, + "learning_rate": 7.514507926058077e-06, + "loss": 0.3099, + "step": 4160 + }, + { + "epoch": 0.33288, + "grad_norm": 1.3311537504196167, + "learning_rate": 7.513420810597576e-06, + "loss": 0.2696, + "step": 4161 + }, + { + "epoch": 0.33296, + "grad_norm": 1.3819355964660645, + "learning_rate": 7.512333536121349e-06, + "loss": 0.3543, + "step": 4162 + }, + { + "epoch": 0.33304, + "grad_norm": 1.6310116052627563, + "learning_rate": 7.5112461026981855e-06, + "loss": 0.385, + "step": 4163 + }, + { + "epoch": 0.33312, + "grad_norm": 1.8169513940811157, + "learning_rate": 7.510158510396883e-06, + "loss": 0.4667, + "step": 4164 + }, + { + "epoch": 0.3332, + "grad_norm": 1.5222045183181763, + "learning_rate": 7.509070759286252e-06, + "loss": 0.3944, + "step": 4165 + }, + { + "epoch": 0.33328, + "grad_norm": 1.5854097604751587, + "learning_rate": 7.507982849435109e-06, + "loss": 0.3542, + "step": 4166 + }, + { + "epoch": 0.33336, + "grad_norm": 1.7982919216156006, + "learning_rate": 7.5068947809122835e-06, + "loss": 0.4572, + "step": 4167 + }, + { + "epoch": 0.33344, + "grad_norm": 1.4023102521896362, + "learning_rate": 7.505806553786613e-06, + "loss": 0.2961, + "step": 4168 + }, + { + "epoch": 0.33352, + "grad_norm": 1.5655568838119507, + "learning_rate": 7.504718168126947e-06, + "loss": 0.451, + "step": 4169 + }, + { + "epoch": 0.3336, + "grad_norm": 1.6367048025131226, + "learning_rate": 7.5036296240021425e-06, + "loss": 0.3058, + "step": 4170 + }, + { + "epoch": 0.33368, + "grad_norm": 1.4759256839752197, + "learning_rate": 7.5025409214810676e-06, + "loss": 0.3068, + "step": 4171 + }, + { + "epoch": 0.33376, + "grad_norm": 2.158902406692505, + "learning_rate": 7.501452060632603e-06, + "loss": 0.4489, + "step": 4172 + }, + { + "epoch": 0.33384, + "grad_norm": 1.8630950450897217, + "learning_rate": 7.500363041525638e-06, + "loss": 0.4119, + "step": 4173 + }, + { + "epoch": 0.33392, + "grad_norm": 1.9762685298919678, + "learning_rate": 7.499273864229069e-06, + "loss": 0.491, + "step": 4174 + }, + { + "epoch": 0.334, + "grad_norm": 1.4855585098266602, + "learning_rate": 7.4981845288118026e-06, + "loss": 0.2987, + "step": 4175 + }, + { + "epoch": 0.33408, + "grad_norm": 1.9851583242416382, + "learning_rate": 7.497095035342762e-06, + "loss": 0.3656, + "step": 4176 + }, + { + "epoch": 0.33416, + "grad_norm": 1.7855724096298218, + "learning_rate": 7.496005383890874e-06, + "loss": 0.344, + "step": 4177 + }, + { + "epoch": 0.33424, + "grad_norm": 1.5612376928329468, + "learning_rate": 7.4949155745250765e-06, + "loss": 0.3669, + "step": 4178 + }, + { + "epoch": 0.33432, + "grad_norm": 1.6685304641723633, + "learning_rate": 7.493825607314319e-06, + "loss": 0.4009, + "step": 4179 + }, + { + "epoch": 0.3344, + "grad_norm": 1.3716298341751099, + "learning_rate": 7.49273548232756e-06, + "loss": 0.3536, + "step": 4180 + }, + { + "epoch": 0.33448, + "grad_norm": 1.7887216806411743, + "learning_rate": 7.491645199633768e-06, + "loss": 0.4492, + "step": 4181 + }, + { + "epoch": 0.33456, + "grad_norm": 1.4562711715698242, + "learning_rate": 7.4905547593019205e-06, + "loss": 0.3379, + "step": 4182 + }, + { + "epoch": 0.33464, + "grad_norm": 1.4248229265213013, + "learning_rate": 7.4894641614010065e-06, + "loss": 0.3323, + "step": 4183 + }, + { + "epoch": 0.33472, + "grad_norm": 1.4600697755813599, + "learning_rate": 7.488373406000024e-06, + "loss": 0.3882, + "step": 4184 + }, + { + "epoch": 0.3348, + "grad_norm": 1.9589154720306396, + "learning_rate": 7.487282493167985e-06, + "loss": 0.4016, + "step": 4185 + }, + { + "epoch": 0.33488, + "grad_norm": 1.8087719678878784, + "learning_rate": 7.486191422973903e-06, + "loss": 0.3524, + "step": 4186 + }, + { + "epoch": 0.33496, + "grad_norm": 1.154066801071167, + "learning_rate": 7.485100195486808e-06, + "loss": 0.2329, + "step": 4187 + }, + { + "epoch": 0.33504, + "grad_norm": 2.5276176929473877, + "learning_rate": 7.48400881077574e-06, + "loss": 0.5058, + "step": 4188 + }, + { + "epoch": 0.33512, + "grad_norm": 2.2821571826934814, + "learning_rate": 7.482917268909746e-06, + "loss": 0.5157, + "step": 4189 + }, + { + "epoch": 0.3352, + "grad_norm": 1.4835360050201416, + "learning_rate": 7.481825569957886e-06, + "loss": 0.3223, + "step": 4190 + }, + { + "epoch": 0.33528, + "grad_norm": 1.7583729028701782, + "learning_rate": 7.4807337139892235e-06, + "loss": 0.3243, + "step": 4191 + }, + { + "epoch": 0.33536, + "grad_norm": 1.958134412765503, + "learning_rate": 7.479641701072841e-06, + "loss": 0.448, + "step": 4192 + }, + { + "epoch": 0.33544, + "grad_norm": 1.5461760759353638, + "learning_rate": 7.478549531277824e-06, + "loss": 0.2942, + "step": 4193 + }, + { + "epoch": 0.33552, + "grad_norm": 1.5328489542007446, + "learning_rate": 7.477457204673272e-06, + "loss": 0.327, + "step": 4194 + }, + { + "epoch": 0.3356, + "grad_norm": 1.610932469367981, + "learning_rate": 7.476364721328292e-06, + "loss": 0.3925, + "step": 4195 + }, + { + "epoch": 0.33568, + "grad_norm": 1.6235414743423462, + "learning_rate": 7.475272081312003e-06, + "loss": 0.3026, + "step": 4196 + }, + { + "epoch": 0.33576, + "grad_norm": 1.615761399269104, + "learning_rate": 7.4741792846935304e-06, + "loss": 0.3267, + "step": 4197 + }, + { + "epoch": 0.33584, + "grad_norm": 1.9006683826446533, + "learning_rate": 7.473086331542013e-06, + "loss": 0.3666, + "step": 4198 + }, + { + "epoch": 0.33592, + "grad_norm": 1.3763328790664673, + "learning_rate": 7.471993221926599e-06, + "loss": 0.2783, + "step": 4199 + }, + { + "epoch": 0.336, + "grad_norm": 1.5629264116287231, + "learning_rate": 7.4708999559164455e-06, + "loss": 0.3361, + "step": 4200 + }, + { + "epoch": 0.33608, + "grad_norm": 1.5894604921340942, + "learning_rate": 7.469806533580719e-06, + "loss": 0.3866, + "step": 4201 + }, + { + "epoch": 0.33616, + "grad_norm": 1.707032322883606, + "learning_rate": 7.468712954988597e-06, + "loss": 0.337, + "step": 4202 + }, + { + "epoch": 0.33624, + "grad_norm": 1.5144277811050415, + "learning_rate": 7.467619220209268e-06, + "loss": 0.4724, + "step": 4203 + }, + { + "epoch": 0.33632, + "grad_norm": 1.9514338970184326, + "learning_rate": 7.466525329311927e-06, + "loss": 0.4198, + "step": 4204 + }, + { + "epoch": 0.3364, + "grad_norm": 1.3086645603179932, + "learning_rate": 7.4654312823657805e-06, + "loss": 0.2661, + "step": 4205 + }, + { + "epoch": 0.33648, + "grad_norm": 1.8274202346801758, + "learning_rate": 7.464337079440046e-06, + "loss": 0.3858, + "step": 4206 + }, + { + "epoch": 0.33656, + "grad_norm": 1.772868037223816, + "learning_rate": 7.4632427206039505e-06, + "loss": 0.3487, + "step": 4207 + }, + { + "epoch": 0.33664, + "grad_norm": 1.3562657833099365, + "learning_rate": 7.4621482059267315e-06, + "loss": 0.3073, + "step": 4208 + }, + { + "epoch": 0.33672, + "grad_norm": 1.3614493608474731, + "learning_rate": 7.461053535477632e-06, + "loss": 0.3029, + "step": 4209 + }, + { + "epoch": 0.3368, + "grad_norm": 1.4911876916885376, + "learning_rate": 7.459958709325911e-06, + "loss": 0.4744, + "step": 4210 + }, + { + "epoch": 0.33688, + "grad_norm": 1.3329766988754272, + "learning_rate": 7.458863727540832e-06, + "loss": 0.2758, + "step": 4211 + }, + { + "epoch": 0.33696, + "grad_norm": 1.798316478729248, + "learning_rate": 7.457768590191674e-06, + "loss": 0.3735, + "step": 4212 + }, + { + "epoch": 0.33704, + "grad_norm": 1.8065177202224731, + "learning_rate": 7.456673297347721e-06, + "loss": 0.3828, + "step": 4213 + }, + { + "epoch": 0.33712, + "grad_norm": 1.4247829914093018, + "learning_rate": 7.455577849078269e-06, + "loss": 0.3492, + "step": 4214 + }, + { + "epoch": 0.3372, + "grad_norm": 1.6128746271133423, + "learning_rate": 7.454482245452621e-06, + "loss": 0.4182, + "step": 4215 + }, + { + "epoch": 0.33728, + "grad_norm": 1.8242870569229126, + "learning_rate": 7.453386486540095e-06, + "loss": 0.4306, + "step": 4216 + }, + { + "epoch": 0.33736, + "grad_norm": 1.6618531942367554, + "learning_rate": 7.4522905724100155e-06, + "loss": 0.3258, + "step": 4217 + }, + { + "epoch": 0.33744, + "grad_norm": 2.2641119956970215, + "learning_rate": 7.451194503131715e-06, + "loss": 0.4345, + "step": 4218 + }, + { + "epoch": 0.33752, + "grad_norm": 1.4390738010406494, + "learning_rate": 7.450098278774542e-06, + "loss": 0.2435, + "step": 4219 + }, + { + "epoch": 0.3376, + "grad_norm": 1.6688294410705566, + "learning_rate": 7.449001899407849e-06, + "loss": 0.3547, + "step": 4220 + }, + { + "epoch": 0.33768, + "grad_norm": 1.3295515775680542, + "learning_rate": 7.447905365101e-06, + "loss": 0.3372, + "step": 4221 + }, + { + "epoch": 0.33776, + "grad_norm": 1.7421692609786987, + "learning_rate": 7.446808675923369e-06, + "loss": 0.417, + "step": 4222 + }, + { + "epoch": 0.33784, + "grad_norm": 1.7241042852401733, + "learning_rate": 7.44571183194434e-06, + "loss": 0.436, + "step": 4223 + }, + { + "epoch": 0.33792, + "grad_norm": 1.7911804914474487, + "learning_rate": 7.4446148332333065e-06, + "loss": 0.4371, + "step": 4224 + }, + { + "epoch": 0.338, + "grad_norm": 1.1841015815734863, + "learning_rate": 7.4435176798596735e-06, + "loss": 0.3404, + "step": 4225 + }, + { + "epoch": 0.33808, + "grad_norm": 2.037875175476074, + "learning_rate": 7.442420371892852e-06, + "loss": 0.373, + "step": 4226 + }, + { + "epoch": 0.33816, + "grad_norm": 1.385729432106018, + "learning_rate": 7.4413229094022655e-06, + "loss": 0.297, + "step": 4227 + }, + { + "epoch": 0.33824, + "grad_norm": 1.7231978178024292, + "learning_rate": 7.440225292457348e-06, + "loss": 0.4303, + "step": 4228 + }, + { + "epoch": 0.33832, + "grad_norm": 1.3325543403625488, + "learning_rate": 7.43912752112754e-06, + "loss": 0.243, + "step": 4229 + }, + { + "epoch": 0.3384, + "grad_norm": 1.3618645668029785, + "learning_rate": 7.438029595482297e-06, + "loss": 0.3506, + "step": 4230 + }, + { + "epoch": 0.33848, + "grad_norm": 1.3418787717819214, + "learning_rate": 7.436931515591077e-06, + "loss": 0.2405, + "step": 4231 + }, + { + "epoch": 0.33856, + "grad_norm": 1.4859660863876343, + "learning_rate": 7.435833281523356e-06, + "loss": 0.3964, + "step": 4232 + }, + { + "epoch": 0.33864, + "grad_norm": 1.6270725727081299, + "learning_rate": 7.434734893348612e-06, + "loss": 0.3465, + "step": 4233 + }, + { + "epoch": 0.33872, + "grad_norm": 1.3810713291168213, + "learning_rate": 7.433636351136338e-06, + "loss": 0.3105, + "step": 4234 + }, + { + "epoch": 0.3388, + "grad_norm": 1.595778465270996, + "learning_rate": 7.432537654956036e-06, + "loss": 0.3078, + "step": 4235 + }, + { + "epoch": 0.33888, + "grad_norm": 1.770795464515686, + "learning_rate": 7.431438804877216e-06, + "loss": 0.3117, + "step": 4236 + }, + { + "epoch": 0.33896, + "grad_norm": 1.5661720037460327, + "learning_rate": 7.4303398009694e-06, + "loss": 0.3066, + "step": 4237 + }, + { + "epoch": 0.33904, + "grad_norm": 1.6826071739196777, + "learning_rate": 7.429240643302114e-06, + "loss": 0.2934, + "step": 4238 + }, + { + "epoch": 0.33912, + "grad_norm": 1.8159326314926147, + "learning_rate": 7.428141331944901e-06, + "loss": 0.3794, + "step": 4239 + }, + { + "epoch": 0.3392, + "grad_norm": 1.6127220392227173, + "learning_rate": 7.42704186696731e-06, + "loss": 0.3804, + "step": 4240 + }, + { + "epoch": 0.33928, + "grad_norm": 1.844524621963501, + "learning_rate": 7.425942248438902e-06, + "loss": 0.4147, + "step": 4241 + }, + { + "epoch": 0.33936, + "grad_norm": 1.4185341596603394, + "learning_rate": 7.424842476429246e-06, + "loss": 0.247, + "step": 4242 + }, + { + "epoch": 0.33944, + "grad_norm": 1.1865193843841553, + "learning_rate": 7.423742551007919e-06, + "loss": 0.2286, + "step": 4243 + }, + { + "epoch": 0.33952, + "grad_norm": 1.284876823425293, + "learning_rate": 7.422642472244511e-06, + "loss": 0.2601, + "step": 4244 + }, + { + "epoch": 0.3396, + "grad_norm": 1.4666966199874878, + "learning_rate": 7.42154224020862e-06, + "loss": 0.315, + "step": 4245 + }, + { + "epoch": 0.33968, + "grad_norm": 1.2044739723205566, + "learning_rate": 7.420441854969853e-06, + "loss": 0.2234, + "step": 4246 + }, + { + "epoch": 0.33976, + "grad_norm": 1.548977255821228, + "learning_rate": 7.41934131659783e-06, + "loss": 0.3675, + "step": 4247 + }, + { + "epoch": 0.33984, + "grad_norm": 1.2386624813079834, + "learning_rate": 7.418240625162178e-06, + "loss": 0.2938, + "step": 4248 + }, + { + "epoch": 0.33992, + "grad_norm": 1.4964457750320435, + "learning_rate": 7.4171397807325314e-06, + "loss": 0.3568, + "step": 4249 + }, + { + "epoch": 0.34, + "grad_norm": 1.543899416923523, + "learning_rate": 7.416038783378539e-06, + "loss": 0.4038, + "step": 4250 + }, + { + "epoch": 0.34008, + "grad_norm": 1.7202891111373901, + "learning_rate": 7.414937633169857e-06, + "loss": 0.4456, + "step": 4251 + }, + { + "epoch": 0.34016, + "grad_norm": 1.3532027006149292, + "learning_rate": 7.413836330176149e-06, + "loss": 0.2768, + "step": 4252 + }, + { + "epoch": 0.34024, + "grad_norm": 1.1683356761932373, + "learning_rate": 7.412734874467096e-06, + "loss": 0.3269, + "step": 4253 + }, + { + "epoch": 0.34032, + "grad_norm": 1.4227070808410645, + "learning_rate": 7.411633266112379e-06, + "loss": 0.3285, + "step": 4254 + }, + { + "epoch": 0.3404, + "grad_norm": 1.7242900133132935, + "learning_rate": 7.410531505181697e-06, + "loss": 0.4367, + "step": 4255 + }, + { + "epoch": 0.34048, + "grad_norm": 1.3391542434692383, + "learning_rate": 7.40942959174475e-06, + "loss": 0.3332, + "step": 4256 + }, + { + "epoch": 0.34056, + "grad_norm": 1.4209461212158203, + "learning_rate": 7.408327525871255e-06, + "loss": 0.3338, + "step": 4257 + }, + { + "epoch": 0.34064, + "grad_norm": 2.0780012607574463, + "learning_rate": 7.4072253076309375e-06, + "loss": 0.51, + "step": 4258 + }, + { + "epoch": 0.34072, + "grad_norm": 2.3870983123779297, + "learning_rate": 7.4061229370935275e-06, + "loss": 0.4668, + "step": 4259 + }, + { + "epoch": 0.3408, + "grad_norm": 1.4779009819030762, + "learning_rate": 7.405020414328771e-06, + "loss": 0.2749, + "step": 4260 + }, + { + "epoch": 0.34088, + "grad_norm": 1.7060835361480713, + "learning_rate": 7.4039177394064196e-06, + "loss": 0.3663, + "step": 4261 + }, + { + "epoch": 0.34096, + "grad_norm": 1.5579965114593506, + "learning_rate": 7.4028149123962365e-06, + "loss": 0.3155, + "step": 4262 + }, + { + "epoch": 0.34104, + "grad_norm": 1.6776325702667236, + "learning_rate": 7.401711933367994e-06, + "loss": 0.3946, + "step": 4263 + }, + { + "epoch": 0.34112, + "grad_norm": 2.1484310626983643, + "learning_rate": 7.4006088023914735e-06, + "loss": 0.541, + "step": 4264 + }, + { + "epoch": 0.3412, + "grad_norm": 1.8193962574005127, + "learning_rate": 7.399505519536465e-06, + "loss": 0.4295, + "step": 4265 + }, + { + "epoch": 0.34128, + "grad_norm": 1.7744461297988892, + "learning_rate": 7.398402084872773e-06, + "loss": 0.3818, + "step": 4266 + }, + { + "epoch": 0.34136, + "grad_norm": 1.5246503353118896, + "learning_rate": 7.397298498470206e-06, + "loss": 0.3992, + "step": 4267 + }, + { + "epoch": 0.34144, + "grad_norm": 1.3743784427642822, + "learning_rate": 7.396194760398584e-06, + "loss": 0.3579, + "step": 4268 + }, + { + "epoch": 0.34152, + "grad_norm": 1.4627759456634521, + "learning_rate": 7.395090870727737e-06, + "loss": 0.4135, + "step": 4269 + }, + { + "epoch": 0.3416, + "grad_norm": 1.4635417461395264, + "learning_rate": 7.393986829527506e-06, + "loss": 0.3232, + "step": 4270 + }, + { + "epoch": 0.34168, + "grad_norm": 1.804682731628418, + "learning_rate": 7.392882636867738e-06, + "loss": 0.3609, + "step": 4271 + }, + { + "epoch": 0.34176, + "grad_norm": 1.6113015413284302, + "learning_rate": 7.39177829281829e-06, + "loss": 0.319, + "step": 4272 + }, + { + "epoch": 0.34184, + "grad_norm": 1.5073604583740234, + "learning_rate": 7.390673797449035e-06, + "loss": 0.3688, + "step": 4273 + }, + { + "epoch": 0.34192, + "grad_norm": 1.9971095323562622, + "learning_rate": 7.389569150829847e-06, + "loss": 0.4176, + "step": 4274 + }, + { + "epoch": 0.342, + "grad_norm": 1.400302767753601, + "learning_rate": 7.3884643530306146e-06, + "loss": 0.281, + "step": 4275 + }, + { + "epoch": 0.34208, + "grad_norm": 1.5128846168518066, + "learning_rate": 7.387359404121234e-06, + "loss": 0.3501, + "step": 4276 + }, + { + "epoch": 0.34216, + "grad_norm": 1.645851969718933, + "learning_rate": 7.386254304171612e-06, + "loss": 0.3803, + "step": 4277 + }, + { + "epoch": 0.34224, + "grad_norm": 1.6219562292099, + "learning_rate": 7.385149053251664e-06, + "loss": 0.3261, + "step": 4278 + }, + { + "epoch": 0.34232, + "grad_norm": 2.3219873905181885, + "learning_rate": 7.3840436514313185e-06, + "loss": 0.5242, + "step": 4279 + }, + { + "epoch": 0.3424, + "grad_norm": 1.3534795045852661, + "learning_rate": 7.382938098780505e-06, + "loss": 0.2412, + "step": 4280 + }, + { + "epoch": 0.34248, + "grad_norm": 1.458430290222168, + "learning_rate": 7.381832395369175e-06, + "loss": 0.303, + "step": 4281 + }, + { + "epoch": 0.34256, + "grad_norm": 1.8676531314849854, + "learning_rate": 7.380726541267276e-06, + "loss": 0.4078, + "step": 4282 + }, + { + "epoch": 0.34264, + "grad_norm": 1.8521391153335571, + "learning_rate": 7.379620536544776e-06, + "loss": 0.5314, + "step": 4283 + }, + { + "epoch": 0.34272, + "grad_norm": 1.4564507007598877, + "learning_rate": 7.378514381271646e-06, + "loss": 0.3131, + "step": 4284 + }, + { + "epoch": 0.3428, + "grad_norm": 1.7293351888656616, + "learning_rate": 7.377408075517871e-06, + "loss": 0.3745, + "step": 4285 + }, + { + "epoch": 0.34288, + "grad_norm": 1.8894083499908447, + "learning_rate": 7.376301619353441e-06, + "loss": 0.4519, + "step": 4286 + }, + { + "epoch": 0.34296, + "grad_norm": 1.4705125093460083, + "learning_rate": 7.375195012848359e-06, + "loss": 0.3943, + "step": 4287 + }, + { + "epoch": 0.34304, + "grad_norm": 1.75259530544281, + "learning_rate": 7.374088256072635e-06, + "loss": 0.3682, + "step": 4288 + }, + { + "epoch": 0.34312, + "grad_norm": 1.3996787071228027, + "learning_rate": 7.372981349096291e-06, + "loss": 0.2633, + "step": 4289 + }, + { + "epoch": 0.3432, + "grad_norm": 1.3005949258804321, + "learning_rate": 7.371874291989358e-06, + "loss": 0.2874, + "step": 4290 + }, + { + "epoch": 0.34328, + "grad_norm": 1.8466405868530273, + "learning_rate": 7.370767084821875e-06, + "loss": 0.4616, + "step": 4291 + }, + { + "epoch": 0.34336, + "grad_norm": 1.9775362014770508, + "learning_rate": 7.369659727663889e-06, + "loss": 0.4652, + "step": 4292 + }, + { + "epoch": 0.34344, + "grad_norm": 1.816942811012268, + "learning_rate": 7.368552220585464e-06, + "loss": 0.5346, + "step": 4293 + }, + { + "epoch": 0.34352, + "grad_norm": 1.3692759275436401, + "learning_rate": 7.367444563656663e-06, + "loss": 0.4167, + "step": 4294 + }, + { + "epoch": 0.3436, + "grad_norm": 1.5143382549285889, + "learning_rate": 7.3663367569475665e-06, + "loss": 0.2954, + "step": 4295 + }, + { + "epoch": 0.34368, + "grad_norm": 1.7048492431640625, + "learning_rate": 7.365228800528263e-06, + "loss": 0.3539, + "step": 4296 + }, + { + "epoch": 0.34376, + "grad_norm": 1.409371256828308, + "learning_rate": 7.364120694468845e-06, + "loss": 0.3441, + "step": 4297 + }, + { + "epoch": 0.34384, + "grad_norm": 1.3780807256698608, + "learning_rate": 7.363012438839421e-06, + "loss": 0.3102, + "step": 4298 + }, + { + "epoch": 0.34392, + "grad_norm": 1.3144261837005615, + "learning_rate": 7.361904033710108e-06, + "loss": 0.2998, + "step": 4299 + }, + { + "epoch": 0.344, + "grad_norm": 1.5273244380950928, + "learning_rate": 7.360795479151029e-06, + "loss": 0.3879, + "step": 4300 + }, + { + "epoch": 0.34408, + "grad_norm": 1.3805782794952393, + "learning_rate": 7.359686775232318e-06, + "loss": 0.3264, + "step": 4301 + }, + { + "epoch": 0.34416, + "grad_norm": 1.8906997442245483, + "learning_rate": 7.358577922024123e-06, + "loss": 0.496, + "step": 4302 + }, + { + "epoch": 0.34424, + "grad_norm": 1.627760410308838, + "learning_rate": 7.357468919596593e-06, + "loss": 0.3229, + "step": 4303 + }, + { + "epoch": 0.34432, + "grad_norm": 1.7278337478637695, + "learning_rate": 7.356359768019894e-06, + "loss": 0.4291, + "step": 4304 + }, + { + "epoch": 0.3444, + "grad_norm": 1.5394865274429321, + "learning_rate": 7.355250467364196e-06, + "loss": 0.321, + "step": 4305 + }, + { + "epoch": 0.34448, + "grad_norm": 1.5800862312316895, + "learning_rate": 7.354141017699681e-06, + "loss": 0.3424, + "step": 4306 + }, + { + "epoch": 0.34456, + "grad_norm": 1.81803297996521, + "learning_rate": 7.35303141909654e-06, + "loss": 0.3574, + "step": 4307 + }, + { + "epoch": 0.34464, + "grad_norm": 1.6725924015045166, + "learning_rate": 7.351921671624977e-06, + "loss": 0.3973, + "step": 4308 + }, + { + "epoch": 0.34472, + "grad_norm": 1.7673554420471191, + "learning_rate": 7.350811775355197e-06, + "loss": 0.4279, + "step": 4309 + }, + { + "epoch": 0.3448, + "grad_norm": 1.319742202758789, + "learning_rate": 7.349701730357424e-06, + "loss": 0.2885, + "step": 4310 + }, + { + "epoch": 0.34488, + "grad_norm": 1.64664888381958, + "learning_rate": 7.348591536701882e-06, + "loss": 0.4351, + "step": 4311 + }, + { + "epoch": 0.34496, + "grad_norm": 2.0317161083221436, + "learning_rate": 7.347481194458813e-06, + "loss": 0.4843, + "step": 4312 + }, + { + "epoch": 0.34504, + "grad_norm": 1.2819757461547852, + "learning_rate": 7.346370703698464e-06, + "loss": 0.2645, + "step": 4313 + }, + { + "epoch": 0.34512, + "grad_norm": 1.667986512184143, + "learning_rate": 7.345260064491092e-06, + "loss": 0.4948, + "step": 4314 + }, + { + "epoch": 0.3452, + "grad_norm": 1.6327108144760132, + "learning_rate": 7.344149276906962e-06, + "loss": 0.3498, + "step": 4315 + }, + { + "epoch": 0.34528, + "grad_norm": 1.8304697275161743, + "learning_rate": 7.343038341016352e-06, + "loss": 0.364, + "step": 4316 + }, + { + "epoch": 0.34536, + "grad_norm": 1.6535539627075195, + "learning_rate": 7.341927256889545e-06, + "loss": 0.3881, + "step": 4317 + }, + { + "epoch": 0.34544, + "grad_norm": 1.561083197593689, + "learning_rate": 7.340816024596838e-06, + "loss": 0.4237, + "step": 4318 + }, + { + "epoch": 0.34552, + "grad_norm": 1.6609004735946655, + "learning_rate": 7.339704644208531e-06, + "loss": 0.3515, + "step": 4319 + }, + { + "epoch": 0.3456, + "grad_norm": 1.5211528539657593, + "learning_rate": 7.338593115794942e-06, + "loss": 0.3339, + "step": 4320 + }, + { + "epoch": 0.34568, + "grad_norm": 1.4612761735916138, + "learning_rate": 7.33748143942639e-06, + "loss": 0.3212, + "step": 4321 + }, + { + "epoch": 0.34576, + "grad_norm": 1.503321647644043, + "learning_rate": 7.336369615173209e-06, + "loss": 0.4281, + "step": 4322 + }, + { + "epoch": 0.34584, + "grad_norm": 1.4206557273864746, + "learning_rate": 7.33525764310574e-06, + "loss": 0.309, + "step": 4323 + }, + { + "epoch": 0.34592, + "grad_norm": 1.1924537420272827, + "learning_rate": 7.334145523294334e-06, + "loss": 0.2613, + "step": 4324 + }, + { + "epoch": 0.346, + "grad_norm": 1.3378185033798218, + "learning_rate": 7.333033255809351e-06, + "loss": 0.2799, + "step": 4325 + }, + { + "epoch": 0.34608, + "grad_norm": 1.678286075592041, + "learning_rate": 7.33192084072116e-06, + "loss": 0.3864, + "step": 4326 + }, + { + "epoch": 0.34616, + "grad_norm": 1.57291841506958, + "learning_rate": 7.330808278100141e-06, + "loss": 0.3265, + "step": 4327 + }, + { + "epoch": 0.34624, + "grad_norm": 1.6815236806869507, + "learning_rate": 7.329695568016679e-06, + "loss": 0.3757, + "step": 4328 + }, + { + "epoch": 0.34632, + "grad_norm": 1.3270570039749146, + "learning_rate": 7.328582710541174e-06, + "loss": 0.2978, + "step": 4329 + }, + { + "epoch": 0.3464, + "grad_norm": 1.3988102674484253, + "learning_rate": 7.327469705744034e-06, + "loss": 0.3347, + "step": 4330 + }, + { + "epoch": 0.34648, + "grad_norm": 1.7103873491287231, + "learning_rate": 7.3263565536956725e-06, + "loss": 0.3344, + "step": 4331 + }, + { + "epoch": 0.34656, + "grad_norm": 1.768531322479248, + "learning_rate": 7.325243254466516e-06, + "loss": 0.3403, + "step": 4332 + }, + { + "epoch": 0.34664, + "grad_norm": 1.3608916997909546, + "learning_rate": 7.3241298081269995e-06, + "loss": 0.2652, + "step": 4333 + }, + { + "epoch": 0.34672, + "grad_norm": 2.1497552394866943, + "learning_rate": 7.323016214747566e-06, + "loss": 0.404, + "step": 4334 + }, + { + "epoch": 0.3468, + "grad_norm": 1.8790044784545898, + "learning_rate": 7.321902474398669e-06, + "loss": 0.3277, + "step": 4335 + }, + { + "epoch": 0.34688, + "grad_norm": 1.679563045501709, + "learning_rate": 7.3207885871507715e-06, + "loss": 0.3758, + "step": 4336 + }, + { + "epoch": 0.34696, + "grad_norm": 1.669385552406311, + "learning_rate": 7.319674553074347e-06, + "loss": 0.3434, + "step": 4337 + }, + { + "epoch": 0.34704, + "grad_norm": 1.5998573303222656, + "learning_rate": 7.318560372239873e-06, + "loss": 0.4371, + "step": 4338 + }, + { + "epoch": 0.34712, + "grad_norm": 1.4684016704559326, + "learning_rate": 7.317446044717845e-06, + "loss": 0.2922, + "step": 4339 + }, + { + "epoch": 0.3472, + "grad_norm": 1.7351469993591309, + "learning_rate": 7.316331570578759e-06, + "loss": 0.41, + "step": 4340 + }, + { + "epoch": 0.34728, + "grad_norm": 1.467858910560608, + "learning_rate": 7.315216949893124e-06, + "loss": 0.353, + "step": 4341 + }, + { + "epoch": 0.34736, + "grad_norm": 1.418001651763916, + "learning_rate": 7.314102182731459e-06, + "loss": 0.3429, + "step": 4342 + }, + { + "epoch": 0.34744, + "grad_norm": 1.5263274908065796, + "learning_rate": 7.3129872691642935e-06, + "loss": 0.4126, + "step": 4343 + }, + { + "epoch": 0.34752, + "grad_norm": 1.2809735536575317, + "learning_rate": 7.3118722092621615e-06, + "loss": 0.2515, + "step": 4344 + }, + { + "epoch": 0.3476, + "grad_norm": 1.377831220626831, + "learning_rate": 7.31075700309561e-06, + "loss": 0.2804, + "step": 4345 + }, + { + "epoch": 0.34768, + "grad_norm": 1.5023661851882935, + "learning_rate": 7.309641650735195e-06, + "loss": 0.3874, + "step": 4346 + }, + { + "epoch": 0.34776, + "grad_norm": 1.7414848804473877, + "learning_rate": 7.308526152251482e-06, + "loss": 0.346, + "step": 4347 + }, + { + "epoch": 0.34784, + "grad_norm": 1.4180262088775635, + "learning_rate": 7.307410507715044e-06, + "loss": 0.3632, + "step": 4348 + }, + { + "epoch": 0.34792, + "grad_norm": 1.8054182529449463, + "learning_rate": 7.3062947171964626e-06, + "loss": 0.3541, + "step": 4349 + }, + { + "epoch": 0.348, + "grad_norm": 1.9470112323760986, + "learning_rate": 7.305178780766332e-06, + "loss": 0.3118, + "step": 4350 + }, + { + "epoch": 0.34808, + "grad_norm": 1.3836066722869873, + "learning_rate": 7.304062698495253e-06, + "loss": 0.2609, + "step": 4351 + }, + { + "epoch": 0.34816, + "grad_norm": 1.3901145458221436, + "learning_rate": 7.302946470453835e-06, + "loss": 0.3282, + "step": 4352 + }, + { + "epoch": 0.34824, + "grad_norm": 1.5342923402786255, + "learning_rate": 7.301830096712701e-06, + "loss": 0.284, + "step": 4353 + }, + { + "epoch": 0.34832, + "grad_norm": 1.7975738048553467, + "learning_rate": 7.300713577342477e-06, + "loss": 0.415, + "step": 4354 + }, + { + "epoch": 0.3484, + "grad_norm": 1.4693024158477783, + "learning_rate": 7.2995969124138054e-06, + "loss": 0.3352, + "step": 4355 + }, + { + "epoch": 0.34848, + "grad_norm": 1.8647490739822388, + "learning_rate": 7.298480101997331e-06, + "loss": 0.399, + "step": 4356 + }, + { + "epoch": 0.34856, + "grad_norm": 1.3949613571166992, + "learning_rate": 7.297363146163711e-06, + "loss": 0.4118, + "step": 4357 + }, + { + "epoch": 0.34864, + "grad_norm": 1.5804004669189453, + "learning_rate": 7.296246044983611e-06, + "loss": 0.3258, + "step": 4358 + }, + { + "epoch": 0.34872, + "grad_norm": 1.418825626373291, + "learning_rate": 7.295128798527708e-06, + "loss": 0.324, + "step": 4359 + }, + { + "epoch": 0.3488, + "grad_norm": 1.6800353527069092, + "learning_rate": 7.294011406866686e-06, + "loss": 0.4247, + "step": 4360 + }, + { + "epoch": 0.34888, + "grad_norm": 1.2685750722885132, + "learning_rate": 7.292893870071238e-06, + "loss": 0.2687, + "step": 4361 + }, + { + "epoch": 0.34896, + "grad_norm": 1.8051155805587769, + "learning_rate": 7.2917761882120655e-06, + "loss": 0.4082, + "step": 4362 + }, + { + "epoch": 0.34904, + "grad_norm": 1.8455771207809448, + "learning_rate": 7.290658361359883e-06, + "loss": 0.3075, + "step": 4363 + }, + { + "epoch": 0.34912, + "grad_norm": 1.6396703720092773, + "learning_rate": 7.28954038958541e-06, + "loss": 0.4184, + "step": 4364 + }, + { + "epoch": 0.3492, + "grad_norm": 1.6618815660476685, + "learning_rate": 7.2884222729593765e-06, + "loss": 0.3217, + "step": 4365 + }, + { + "epoch": 0.34928, + "grad_norm": 1.307723045349121, + "learning_rate": 7.287304011552524e-06, + "loss": 0.3596, + "step": 4366 + }, + { + "epoch": 0.34936, + "grad_norm": 1.5229804515838623, + "learning_rate": 7.2861856054356e-06, + "loss": 0.3199, + "step": 4367 + }, + { + "epoch": 0.34944, + "grad_norm": 1.6636910438537598, + "learning_rate": 7.285067054679362e-06, + "loss": 0.2866, + "step": 4368 + }, + { + "epoch": 0.34952, + "grad_norm": 2.0462241172790527, + "learning_rate": 7.283948359354578e-06, + "loss": 0.4956, + "step": 4369 + }, + { + "epoch": 0.3496, + "grad_norm": 1.820526361465454, + "learning_rate": 7.282829519532022e-06, + "loss": 0.369, + "step": 4370 + }, + { + "epoch": 0.34968, + "grad_norm": 1.3398878574371338, + "learning_rate": 7.281710535282482e-06, + "loss": 0.3695, + "step": 4371 + }, + { + "epoch": 0.34976, + "grad_norm": 1.5608426332473755, + "learning_rate": 7.280591406676751e-06, + "loss": 0.34, + "step": 4372 + }, + { + "epoch": 0.34984, + "grad_norm": 1.7027915716171265, + "learning_rate": 7.279472133785633e-06, + "loss": 0.3301, + "step": 4373 + }, + { + "epoch": 0.34992, + "grad_norm": 1.7262037992477417, + "learning_rate": 7.278352716679939e-06, + "loss": 0.3066, + "step": 4374 + }, + { + "epoch": 0.35, + "grad_norm": 1.5613858699798584, + "learning_rate": 7.277233155430492e-06, + "loss": 0.3567, + "step": 4375 + }, + { + "epoch": 0.35008, + "grad_norm": 1.6144390106201172, + "learning_rate": 7.2761134501081246e-06, + "loss": 0.4078, + "step": 4376 + }, + { + "epoch": 0.35016, + "grad_norm": 1.7558294534683228, + "learning_rate": 7.274993600783673e-06, + "loss": 0.4121, + "step": 4377 + }, + { + "epoch": 0.35024, + "grad_norm": 1.5915411710739136, + "learning_rate": 7.27387360752799e-06, + "loss": 0.3196, + "step": 4378 + }, + { + "epoch": 0.35032, + "grad_norm": 2.133772850036621, + "learning_rate": 7.272753470411931e-06, + "loss": 0.4273, + "step": 4379 + }, + { + "epoch": 0.3504, + "grad_norm": 1.4743467569351196, + "learning_rate": 7.271633189506366e-06, + "loss": 0.2437, + "step": 4380 + }, + { + "epoch": 0.35048, + "grad_norm": 1.521664023399353, + "learning_rate": 7.270512764882168e-06, + "loss": 0.3741, + "step": 4381 + }, + { + "epoch": 0.35056, + "grad_norm": 1.5323858261108398, + "learning_rate": 7.269392196610226e-06, + "loss": 0.3979, + "step": 4382 + }, + { + "epoch": 0.35064, + "grad_norm": 1.616807222366333, + "learning_rate": 7.268271484761433e-06, + "loss": 0.3299, + "step": 4383 + }, + { + "epoch": 0.35072, + "grad_norm": 1.7666263580322266, + "learning_rate": 7.267150629406694e-06, + "loss": 0.4004, + "step": 4384 + }, + { + "epoch": 0.3508, + "grad_norm": 1.719724178314209, + "learning_rate": 7.266029630616918e-06, + "loss": 0.4081, + "step": 4385 + }, + { + "epoch": 0.35088, + "grad_norm": 1.9632675647735596, + "learning_rate": 7.2649084884630305e-06, + "loss": 0.3993, + "step": 4386 + }, + { + "epoch": 0.35096, + "grad_norm": 1.2466901540756226, + "learning_rate": 7.2637872030159616e-06, + "loss": 0.2621, + "step": 4387 + }, + { + "epoch": 0.35104, + "grad_norm": 1.5172184705734253, + "learning_rate": 7.262665774346651e-06, + "loss": 0.3656, + "step": 4388 + }, + { + "epoch": 0.35112, + "grad_norm": 1.544595718383789, + "learning_rate": 7.261544202526047e-06, + "loss": 0.4009, + "step": 4389 + }, + { + "epoch": 0.3512, + "grad_norm": 1.762475609779358, + "learning_rate": 7.260422487625109e-06, + "loss": 0.4339, + "step": 4390 + }, + { + "epoch": 0.35128, + "grad_norm": 1.3777724504470825, + "learning_rate": 7.259300629714805e-06, + "loss": 0.3057, + "step": 4391 + }, + { + "epoch": 0.35136, + "grad_norm": 1.9480410814285278, + "learning_rate": 7.258178628866108e-06, + "loss": 0.493, + "step": 4392 + }, + { + "epoch": 0.35144, + "grad_norm": 1.656561255455017, + "learning_rate": 7.257056485150004e-06, + "loss": 0.5685, + "step": 4393 + }, + { + "epoch": 0.35152, + "grad_norm": 1.3406410217285156, + "learning_rate": 7.255934198637492e-06, + "loss": 0.321, + "step": 4394 + }, + { + "epoch": 0.3516, + "grad_norm": 1.745924472808838, + "learning_rate": 7.254811769399569e-06, + "loss": 0.4519, + "step": 4395 + }, + { + "epoch": 0.35168, + "grad_norm": 1.6247973442077637, + "learning_rate": 7.253689197507252e-06, + "loss": 0.3926, + "step": 4396 + }, + { + "epoch": 0.35176, + "grad_norm": 1.585526943206787, + "learning_rate": 7.252566483031558e-06, + "loss": 0.32, + "step": 4397 + }, + { + "epoch": 0.35184, + "grad_norm": 1.8269659280776978, + "learning_rate": 7.251443626043521e-06, + "loss": 0.442, + "step": 4398 + }, + { + "epoch": 0.35192, + "grad_norm": 2.0274364948272705, + "learning_rate": 7.250320626614178e-06, + "loss": 0.3676, + "step": 4399 + }, + { + "epoch": 0.352, + "grad_norm": 1.7257592678070068, + "learning_rate": 7.249197484814579e-06, + "loss": 0.4199, + "step": 4400 + }, + { + "epoch": 0.35208, + "grad_norm": 2.1232810020446777, + "learning_rate": 7.2480742007157815e-06, + "loss": 0.3919, + "step": 4401 + }, + { + "epoch": 0.35216, + "grad_norm": 1.9855226278305054, + "learning_rate": 7.246950774388851e-06, + "loss": 0.5002, + "step": 4402 + }, + { + "epoch": 0.35224, + "grad_norm": 1.150065541267395, + "learning_rate": 7.245827205904864e-06, + "loss": 0.2116, + "step": 4403 + }, + { + "epoch": 0.35232, + "grad_norm": 1.5630650520324707, + "learning_rate": 7.244703495334904e-06, + "loss": 0.2761, + "step": 4404 + }, + { + "epoch": 0.3524, + "grad_norm": 1.6066936254501343, + "learning_rate": 7.243579642750064e-06, + "loss": 0.3575, + "step": 4405 + }, + { + "epoch": 0.35248, + "grad_norm": 1.361370325088501, + "learning_rate": 7.242455648221447e-06, + "loss": 0.2968, + "step": 4406 + }, + { + "epoch": 0.35256, + "grad_norm": 1.8183329105377197, + "learning_rate": 7.241331511820165e-06, + "loss": 0.3925, + "step": 4407 + }, + { + "epoch": 0.35264, + "grad_norm": 1.5168720483779907, + "learning_rate": 7.240207233617338e-06, + "loss": 0.3869, + "step": 4408 + }, + { + "epoch": 0.35272, + "grad_norm": 1.3194420337677002, + "learning_rate": 7.239082813684095e-06, + "loss": 0.2716, + "step": 4409 + }, + { + "epoch": 0.3528, + "grad_norm": 1.8594125509262085, + "learning_rate": 7.237958252091573e-06, + "loss": 0.4105, + "step": 4410 + }, + { + "epoch": 0.35288, + "grad_norm": 1.588093638420105, + "learning_rate": 7.236833548910922e-06, + "loss": 0.3534, + "step": 4411 + }, + { + "epoch": 0.35296, + "grad_norm": 1.6373640298843384, + "learning_rate": 7.235708704213297e-06, + "loss": 0.3045, + "step": 4412 + }, + { + "epoch": 0.35304, + "grad_norm": 2.0101304054260254, + "learning_rate": 7.234583718069862e-06, + "loss": 0.4869, + "step": 4413 + }, + { + "epoch": 0.35312, + "grad_norm": 1.357629656791687, + "learning_rate": 7.233458590551793e-06, + "loss": 0.2507, + "step": 4414 + }, + { + "epoch": 0.3532, + "grad_norm": 1.851575255393982, + "learning_rate": 7.232333321730271e-06, + "loss": 0.3451, + "step": 4415 + }, + { + "epoch": 0.35328, + "grad_norm": 1.6210441589355469, + "learning_rate": 7.2312079116764895e-06, + "loss": 0.3416, + "step": 4416 + }, + { + "epoch": 0.35336, + "grad_norm": 1.2965114116668701, + "learning_rate": 7.230082360461651e-06, + "loss": 0.2877, + "step": 4417 + }, + { + "epoch": 0.35344, + "grad_norm": 1.6795660257339478, + "learning_rate": 7.228956668156961e-06, + "loss": 0.4307, + "step": 4418 + }, + { + "epoch": 0.35352, + "grad_norm": 1.4039353132247925, + "learning_rate": 7.2278308348336425e-06, + "loss": 0.3429, + "step": 4419 + }, + { + "epoch": 0.3536, + "grad_norm": 1.2655251026153564, + "learning_rate": 7.226704860562921e-06, + "loss": 0.3212, + "step": 4420 + }, + { + "epoch": 0.35368, + "grad_norm": 1.6746938228607178, + "learning_rate": 7.225578745416033e-06, + "loss": 0.3959, + "step": 4421 + }, + { + "epoch": 0.35376, + "grad_norm": 2.1023738384246826, + "learning_rate": 7.224452489464224e-06, + "loss": 0.3883, + "step": 4422 + }, + { + "epoch": 0.35384, + "grad_norm": 1.2368510961532593, + "learning_rate": 7.2233260927787495e-06, + "loss": 0.241, + "step": 4423 + }, + { + "epoch": 0.35392, + "grad_norm": 1.6504780054092407, + "learning_rate": 7.222199555430872e-06, + "loss": 0.5443, + "step": 4424 + }, + { + "epoch": 0.354, + "grad_norm": 1.4802947044372559, + "learning_rate": 7.221072877491866e-06, + "loss": 0.3644, + "step": 4425 + }, + { + "epoch": 0.35408, + "grad_norm": 1.553727626800537, + "learning_rate": 7.219946059033009e-06, + "loss": 0.3166, + "step": 4426 + }, + { + "epoch": 0.35416, + "grad_norm": 1.4172799587249756, + "learning_rate": 7.2188191001255935e-06, + "loss": 0.3096, + "step": 4427 + }, + { + "epoch": 0.35424, + "grad_norm": 1.5616695880889893, + "learning_rate": 7.2176920008409175e-06, + "loss": 0.3423, + "step": 4428 + }, + { + "epoch": 0.35432, + "grad_norm": 1.895857810974121, + "learning_rate": 7.216564761250289e-06, + "loss": 0.3599, + "step": 4429 + }, + { + "epoch": 0.3544, + "grad_norm": 1.2682089805603027, + "learning_rate": 7.2154373814250246e-06, + "loss": 0.2516, + "step": 4430 + }, + { + "epoch": 0.35448, + "grad_norm": 1.800615906715393, + "learning_rate": 7.2143098614364504e-06, + "loss": 0.5255, + "step": 4431 + }, + { + "epoch": 0.35456, + "grad_norm": 1.2873491048812866, + "learning_rate": 7.2131822013559e-06, + "loss": 0.2427, + "step": 4432 + }, + { + "epoch": 0.35464, + "grad_norm": 1.6039749383926392, + "learning_rate": 7.212054401254718e-06, + "loss": 0.3177, + "step": 4433 + }, + { + "epoch": 0.35472, + "grad_norm": 1.0338603258132935, + "learning_rate": 7.210926461204254e-06, + "loss": 0.2171, + "step": 4434 + }, + { + "epoch": 0.3548, + "grad_norm": 1.7069857120513916, + "learning_rate": 7.209798381275871e-06, + "loss": 0.3947, + "step": 4435 + }, + { + "epoch": 0.35488, + "grad_norm": 1.2922322750091553, + "learning_rate": 7.208670161540938e-06, + "loss": 0.2809, + "step": 4436 + }, + { + "epoch": 0.35496, + "grad_norm": 1.5950067043304443, + "learning_rate": 7.207541802070836e-06, + "loss": 0.3408, + "step": 4437 + }, + { + "epoch": 0.35504, + "grad_norm": 2.009056806564331, + "learning_rate": 7.206413302936948e-06, + "loss": 0.423, + "step": 4438 + }, + { + "epoch": 0.35512, + "grad_norm": 1.1245336532592773, + "learning_rate": 7.2052846642106754e-06, + "loss": 0.2455, + "step": 4439 + }, + { + "epoch": 0.3552, + "grad_norm": 1.768202781677246, + "learning_rate": 7.204155885963421e-06, + "loss": 0.3644, + "step": 4440 + }, + { + "epoch": 0.35528, + "grad_norm": 1.4672619104385376, + "learning_rate": 7.203026968266598e-06, + "loss": 0.4002, + "step": 4441 + }, + { + "epoch": 0.35536, + "grad_norm": 1.898700475692749, + "learning_rate": 7.201897911191629e-06, + "loss": 0.6915, + "step": 4442 + }, + { + "epoch": 0.35544, + "grad_norm": 1.4047620296478271, + "learning_rate": 7.200768714809949e-06, + "loss": 0.2622, + "step": 4443 + }, + { + "epoch": 0.35552, + "grad_norm": 1.597065806388855, + "learning_rate": 7.199639379192994e-06, + "loss": 0.3354, + "step": 4444 + }, + { + "epoch": 0.3556, + "grad_norm": 1.5272079706192017, + "learning_rate": 7.198509904412216e-06, + "loss": 0.3772, + "step": 4445 + }, + { + "epoch": 0.35568, + "grad_norm": 1.6030325889587402, + "learning_rate": 7.197380290539073e-06, + "loss": 0.4348, + "step": 4446 + }, + { + "epoch": 0.35576, + "grad_norm": 1.7476811408996582, + "learning_rate": 7.1962505376450305e-06, + "loss": 0.3492, + "step": 4447 + }, + { + "epoch": 0.35584, + "grad_norm": 1.6694729328155518, + "learning_rate": 7.195120645801567e-06, + "loss": 0.2928, + "step": 4448 + }, + { + "epoch": 0.35592, + "grad_norm": 1.4422017335891724, + "learning_rate": 7.193990615080165e-06, + "loss": 0.3645, + "step": 4449 + }, + { + "epoch": 0.356, + "grad_norm": 1.421806812286377, + "learning_rate": 7.192860445552317e-06, + "loss": 0.3291, + "step": 4450 + }, + { + "epoch": 0.35608, + "grad_norm": 1.5557352304458618, + "learning_rate": 7.1917301372895265e-06, + "loss": 0.321, + "step": 4451 + }, + { + "epoch": 0.35616, + "grad_norm": 1.34674870967865, + "learning_rate": 7.190599690363303e-06, + "loss": 0.3187, + "step": 4452 + }, + { + "epoch": 0.35624, + "grad_norm": 1.7821531295776367, + "learning_rate": 7.189469104845167e-06, + "loss": 0.3548, + "step": 4453 + }, + { + "epoch": 0.35632, + "grad_norm": 1.5039072036743164, + "learning_rate": 7.1883383808066474e-06, + "loss": 0.3209, + "step": 4454 + }, + { + "epoch": 0.3564, + "grad_norm": 1.5421476364135742, + "learning_rate": 7.187207518319281e-06, + "loss": 0.3517, + "step": 4455 + }, + { + "epoch": 0.35648, + "grad_norm": 1.7966395616531372, + "learning_rate": 7.186076517454612e-06, + "loss": 0.5422, + "step": 4456 + }, + { + "epoch": 0.35656, + "grad_norm": 1.0441017150878906, + "learning_rate": 7.184945378284196e-06, + "loss": 0.2174, + "step": 4457 + }, + { + "epoch": 0.35664, + "grad_norm": 1.6243857145309448, + "learning_rate": 7.1838141008795985e-06, + "loss": 0.3314, + "step": 4458 + }, + { + "epoch": 0.35672, + "grad_norm": 1.5501540899276733, + "learning_rate": 7.182682685312389e-06, + "loss": 0.3994, + "step": 4459 + }, + { + "epoch": 0.3568, + "grad_norm": 1.3322809934616089, + "learning_rate": 7.181551131654149e-06, + "loss": 0.2615, + "step": 4460 + }, + { + "epoch": 0.35688, + "grad_norm": 1.8182624578475952, + "learning_rate": 7.1804194399764695e-06, + "loss": 0.4124, + "step": 4461 + }, + { + "epoch": 0.35696, + "grad_norm": 1.7917581796646118, + "learning_rate": 7.179287610350947e-06, + "loss": 0.3101, + "step": 4462 + }, + { + "epoch": 0.35704, + "grad_norm": 1.4548834562301636, + "learning_rate": 7.1781556428491895e-06, + "loss": 0.3088, + "step": 4463 + }, + { + "epoch": 0.35712, + "grad_norm": 1.8527675867080688, + "learning_rate": 7.177023537542812e-06, + "loss": 0.4651, + "step": 4464 + }, + { + "epoch": 0.3572, + "grad_norm": 2.1842334270477295, + "learning_rate": 7.17589129450344e-06, + "loss": 0.4734, + "step": 4465 + }, + { + "epoch": 0.35728, + "grad_norm": 1.522463321685791, + "learning_rate": 7.174758913802707e-06, + "loss": 0.3419, + "step": 4466 + }, + { + "epoch": 0.35736, + "grad_norm": 1.5129618644714355, + "learning_rate": 7.173626395512253e-06, + "loss": 0.3018, + "step": 4467 + }, + { + "epoch": 0.35744, + "grad_norm": 1.5700559616088867, + "learning_rate": 7.172493739703731e-06, + "loss": 0.3354, + "step": 4468 + }, + { + "epoch": 0.35752, + "grad_norm": 1.571109414100647, + "learning_rate": 7.171360946448799e-06, + "loss": 0.4211, + "step": 4469 + }, + { + "epoch": 0.3576, + "grad_norm": 1.8297759294509888, + "learning_rate": 7.170228015819125e-06, + "loss": 0.4264, + "step": 4470 + }, + { + "epoch": 0.35768, + "grad_norm": 1.6647297143936157, + "learning_rate": 7.169094947886386e-06, + "loss": 0.3417, + "step": 4471 + }, + { + "epoch": 0.35776, + "grad_norm": 1.950027585029602, + "learning_rate": 7.167961742722268e-06, + "loss": 0.3798, + "step": 4472 + }, + { + "epoch": 0.35784, + "grad_norm": 1.2567858695983887, + "learning_rate": 7.166828400398465e-06, + "loss": 0.2779, + "step": 4473 + }, + { + "epoch": 0.35792, + "grad_norm": 1.0137466192245483, + "learning_rate": 7.165694920986679e-06, + "loss": 0.2356, + "step": 4474 + }, + { + "epoch": 0.358, + "grad_norm": 1.2423343658447266, + "learning_rate": 7.16456130455862e-06, + "loss": 0.2513, + "step": 4475 + }, + { + "epoch": 0.35808, + "grad_norm": 1.8165005445480347, + "learning_rate": 7.163427551186012e-06, + "loss": 0.3987, + "step": 4476 + }, + { + "epoch": 0.35816, + "grad_norm": 1.666602373123169, + "learning_rate": 7.1622936609405804e-06, + "loss": 0.3845, + "step": 4477 + }, + { + "epoch": 0.35824, + "grad_norm": 1.850458025932312, + "learning_rate": 7.161159633894065e-06, + "loss": 0.3629, + "step": 4478 + }, + { + "epoch": 0.35832, + "grad_norm": 1.5257556438446045, + "learning_rate": 7.1600254701182106e-06, + "loss": 0.4075, + "step": 4479 + }, + { + "epoch": 0.3584, + "grad_norm": 2.071976661682129, + "learning_rate": 7.158891169684772e-06, + "loss": 0.394, + "step": 4480 + }, + { + "epoch": 0.35848, + "grad_norm": 1.8309593200683594, + "learning_rate": 7.157756732665512e-06, + "loss": 0.3949, + "step": 4481 + }, + { + "epoch": 0.35856, + "grad_norm": 1.468177080154419, + "learning_rate": 7.156622159132204e-06, + "loss": 0.3528, + "step": 4482 + }, + { + "epoch": 0.35864, + "grad_norm": 1.4701149463653564, + "learning_rate": 7.1554874491566274e-06, + "loss": 0.334, + "step": 4483 + }, + { + "epoch": 0.35872, + "grad_norm": 1.1661218404769897, + "learning_rate": 7.1543526028105735e-06, + "loss": 0.2672, + "step": 4484 + }, + { + "epoch": 0.3588, + "grad_norm": 1.2732799053192139, + "learning_rate": 7.153217620165838e-06, + "loss": 0.2915, + "step": 4485 + }, + { + "epoch": 0.35888, + "grad_norm": 1.7382800579071045, + "learning_rate": 7.15208250129423e-06, + "loss": 0.296, + "step": 4486 + }, + { + "epoch": 0.35896, + "grad_norm": 1.537358283996582, + "learning_rate": 7.150947246267561e-06, + "loss": 0.3903, + "step": 4487 + }, + { + "epoch": 0.35904, + "grad_norm": 1.4270464181900024, + "learning_rate": 7.1498118551576574e-06, + "loss": 0.3617, + "step": 4488 + }, + { + "epoch": 0.35912, + "grad_norm": 1.366553544998169, + "learning_rate": 7.148676328036352e-06, + "loss": 0.3077, + "step": 4489 + }, + { + "epoch": 0.3592, + "grad_norm": 1.4373116493225098, + "learning_rate": 7.1475406649754845e-06, + "loss": 0.3871, + "step": 4490 + }, + { + "epoch": 0.35928, + "grad_norm": 1.6480387449264526, + "learning_rate": 7.146404866046903e-06, + "loss": 0.3344, + "step": 4491 + }, + { + "epoch": 0.35936, + "grad_norm": 1.5638269186019897, + "learning_rate": 7.145268931322469e-06, + "loss": 0.301, + "step": 4492 + }, + { + "epoch": 0.35944, + "grad_norm": 1.2544904947280884, + "learning_rate": 7.144132860874047e-06, + "loss": 0.2363, + "step": 4493 + }, + { + "epoch": 0.35952, + "grad_norm": 1.566571593284607, + "learning_rate": 7.142996654773514e-06, + "loss": 0.3634, + "step": 4494 + }, + { + "epoch": 0.3596, + "grad_norm": 2.4747540950775146, + "learning_rate": 7.141860313092753e-06, + "loss": 0.5304, + "step": 4495 + }, + { + "epoch": 0.35968, + "grad_norm": 1.7263084650039673, + "learning_rate": 7.1407238359036565e-06, + "loss": 0.3384, + "step": 4496 + }, + { + "epoch": 0.35976, + "grad_norm": 1.6196907758712769, + "learning_rate": 7.139587223278127e-06, + "loss": 0.3003, + "step": 4497 + }, + { + "epoch": 0.35984, + "grad_norm": 0.9107118844985962, + "learning_rate": 7.138450475288072e-06, + "loss": 0.163, + "step": 4498 + }, + { + "epoch": 0.35992, + "grad_norm": 1.3455191850662231, + "learning_rate": 7.1373135920054104e-06, + "loss": 0.308, + "step": 4499 + }, + { + "epoch": 0.36, + "grad_norm": 1.6801081895828247, + "learning_rate": 7.1361765735020695e-06, + "loss": 0.3765, + "step": 4500 + }, + { + "epoch": 0.36008, + "grad_norm": 1.4999953508377075, + "learning_rate": 7.135039419849984e-06, + "loss": 0.2996, + "step": 4501 + }, + { + "epoch": 0.36016, + "grad_norm": 1.4781824350357056, + "learning_rate": 7.1339021311211e-06, + "loss": 0.3048, + "step": 4502 + }, + { + "epoch": 0.36024, + "grad_norm": 1.7782272100448608, + "learning_rate": 7.1327647073873665e-06, + "loss": 0.4415, + "step": 4503 + }, + { + "epoch": 0.36032, + "grad_norm": 1.6102408170700073, + "learning_rate": 7.131627148720746e-06, + "loss": 0.3111, + "step": 4504 + }, + { + "epoch": 0.3604, + "grad_norm": 1.8156652450561523, + "learning_rate": 7.130489455193208e-06, + "loss": 0.2951, + "step": 4505 + }, + { + "epoch": 0.36048, + "grad_norm": 1.3658769130706787, + "learning_rate": 7.129351626876733e-06, + "loss": 0.348, + "step": 4506 + }, + { + "epoch": 0.36056, + "grad_norm": 1.5961403846740723, + "learning_rate": 7.128213663843304e-06, + "loss": 0.3197, + "step": 4507 + }, + { + "epoch": 0.36064, + "grad_norm": 1.7644686698913574, + "learning_rate": 7.127075566164919e-06, + "loss": 0.3424, + "step": 4508 + }, + { + "epoch": 0.36072, + "grad_norm": 1.4091618061065674, + "learning_rate": 7.125937333913577e-06, + "loss": 0.3399, + "step": 4509 + }, + { + "epoch": 0.3608, + "grad_norm": 2.0317022800445557, + "learning_rate": 7.124798967161296e-06, + "loss": 0.4621, + "step": 4510 + }, + { + "epoch": 0.36088, + "grad_norm": 2.1083295345306396, + "learning_rate": 7.123660465980093e-06, + "loss": 0.4287, + "step": 4511 + }, + { + "epoch": 0.36096, + "grad_norm": 2.015735626220703, + "learning_rate": 7.122521830441998e-06, + "loss": 0.4459, + "step": 4512 + }, + { + "epoch": 0.36104, + "grad_norm": 1.581146001815796, + "learning_rate": 7.121383060619048e-06, + "loss": 0.3427, + "step": 4513 + }, + { + "epoch": 0.36112, + "grad_norm": 1.9534862041473389, + "learning_rate": 7.120244156583291e-06, + "loss": 0.4336, + "step": 4514 + }, + { + "epoch": 0.3612, + "grad_norm": 1.0503820180892944, + "learning_rate": 7.11910511840678e-06, + "loss": 0.2304, + "step": 4515 + }, + { + "epoch": 0.36128, + "grad_norm": 1.4276210069656372, + "learning_rate": 7.11796594616158e-06, + "loss": 0.3487, + "step": 4516 + }, + { + "epoch": 0.36136, + "grad_norm": 1.9091681241989136, + "learning_rate": 7.116826639919761e-06, + "loss": 0.4407, + "step": 4517 + }, + { + "epoch": 0.36144, + "grad_norm": 1.8984267711639404, + "learning_rate": 7.115687199753403e-06, + "loss": 0.4895, + "step": 4518 + }, + { + "epoch": 0.36152, + "grad_norm": 1.8324373960494995, + "learning_rate": 7.114547625734593e-06, + "loss": 0.384, + "step": 4519 + }, + { + "epoch": 0.3616, + "grad_norm": 1.296120524406433, + "learning_rate": 7.113407917935433e-06, + "loss": 0.2754, + "step": 4520 + }, + { + "epoch": 0.36168, + "grad_norm": 1.7863622903823853, + "learning_rate": 7.112268076428025e-06, + "loss": 0.3302, + "step": 4521 + }, + { + "epoch": 0.36176, + "grad_norm": 2.1770036220550537, + "learning_rate": 7.1111281012844825e-06, + "loss": 0.3736, + "step": 4522 + }, + { + "epoch": 0.36184, + "grad_norm": 1.410852074623108, + "learning_rate": 7.109987992576929e-06, + "loss": 0.3158, + "step": 4523 + }, + { + "epoch": 0.36192, + "grad_norm": 1.6189733743667603, + "learning_rate": 7.108847750377494e-06, + "loss": 0.4493, + "step": 4524 + }, + { + "epoch": 0.362, + "grad_norm": 1.5762578248977661, + "learning_rate": 7.107707374758321e-06, + "loss": 0.3384, + "step": 4525 + }, + { + "epoch": 0.36208, + "grad_norm": 1.3969612121582031, + "learning_rate": 7.106566865791553e-06, + "loss": 0.3198, + "step": 4526 + }, + { + "epoch": 0.36216, + "grad_norm": 1.4169987440109253, + "learning_rate": 7.105426223549349e-06, + "loss": 0.3192, + "step": 4527 + }, + { + "epoch": 0.36224, + "grad_norm": 1.601337194442749, + "learning_rate": 7.104285448103871e-06, + "loss": 0.4275, + "step": 4528 + }, + { + "epoch": 0.36232, + "grad_norm": 1.4484046697616577, + "learning_rate": 7.103144539527295e-06, + "loss": 0.2617, + "step": 4529 + }, + { + "epoch": 0.3624, + "grad_norm": 1.1574383974075317, + "learning_rate": 7.1020034978918006e-06, + "loss": 0.248, + "step": 4530 + }, + { + "epoch": 0.36248, + "grad_norm": 1.2772170305252075, + "learning_rate": 7.100862323269579e-06, + "loss": 0.2806, + "step": 4531 + }, + { + "epoch": 0.36256, + "grad_norm": 1.5458264350891113, + "learning_rate": 7.099721015732828e-06, + "loss": 0.4107, + "step": 4532 + }, + { + "epoch": 0.36264, + "grad_norm": 1.3658666610717773, + "learning_rate": 7.0985795753537525e-06, + "loss": 0.3861, + "step": 4533 + }, + { + "epoch": 0.36272, + "grad_norm": 1.4563086032867432, + "learning_rate": 7.097438002204568e-06, + "loss": 0.3225, + "step": 4534 + }, + { + "epoch": 0.3628, + "grad_norm": 1.7949572801589966, + "learning_rate": 7.096296296357502e-06, + "loss": 0.5492, + "step": 4535 + }, + { + "epoch": 0.36288, + "grad_norm": 1.9033316373825073, + "learning_rate": 7.095154457884782e-06, + "loss": 0.3523, + "step": 4536 + }, + { + "epoch": 0.36296, + "grad_norm": 1.7623441219329834, + "learning_rate": 7.094012486858652e-06, + "loss": 0.4402, + "step": 4537 + }, + { + "epoch": 0.36304, + "grad_norm": 1.6129101514816284, + "learning_rate": 7.092870383351355e-06, + "loss": 0.4225, + "step": 4538 + }, + { + "epoch": 0.36312, + "grad_norm": 1.721638798713684, + "learning_rate": 7.091728147435154e-06, + "loss": 0.3564, + "step": 4539 + }, + { + "epoch": 0.3632, + "grad_norm": 1.588778018951416, + "learning_rate": 7.090585779182311e-06, + "loss": 0.3925, + "step": 4540 + }, + { + "epoch": 0.36328, + "grad_norm": 1.32707941532135, + "learning_rate": 7.089443278665102e-06, + "loss": 0.324, + "step": 4541 + }, + { + "epoch": 0.36336, + "grad_norm": 2.0091614723205566, + "learning_rate": 7.088300645955808e-06, + "loss": 0.405, + "step": 4542 + }, + { + "epoch": 0.36344, + "grad_norm": 1.9140995740890503, + "learning_rate": 7.087157881126719e-06, + "loss": 0.4189, + "step": 4543 + }, + { + "epoch": 0.36352, + "grad_norm": 1.6873313188552856, + "learning_rate": 7.0860149842501345e-06, + "loss": 0.2981, + "step": 4544 + }, + { + "epoch": 0.3636, + "grad_norm": 1.7330468893051147, + "learning_rate": 7.084871955398361e-06, + "loss": 0.4079, + "step": 4545 + }, + { + "epoch": 0.36368, + "grad_norm": 1.319456934928894, + "learning_rate": 7.083728794643716e-06, + "loss": 0.2838, + "step": 4546 + }, + { + "epoch": 0.36376, + "grad_norm": 1.4083654880523682, + "learning_rate": 7.082585502058522e-06, + "loss": 0.2926, + "step": 4547 + }, + { + "epoch": 0.36384, + "grad_norm": 1.7500334978103638, + "learning_rate": 7.0814420777151115e-06, + "loss": 0.4215, + "step": 4548 + }, + { + "epoch": 0.36392, + "grad_norm": 2.052196979522705, + "learning_rate": 7.080298521685826e-06, + "loss": 0.4126, + "step": 4549 + }, + { + "epoch": 0.364, + "grad_norm": 1.5984829664230347, + "learning_rate": 7.0791548340430125e-06, + "loss": 0.3908, + "step": 4550 + }, + { + "epoch": 0.36408, + "grad_norm": 1.5750199556350708, + "learning_rate": 7.0780110148590305e-06, + "loss": 0.4188, + "step": 4551 + }, + { + "epoch": 0.36416, + "grad_norm": 1.4217820167541504, + "learning_rate": 7.076867064206244e-06, + "loss": 0.2855, + "step": 4552 + }, + { + "epoch": 0.36424, + "grad_norm": 1.2456365823745728, + "learning_rate": 7.0757229821570285e-06, + "loss": 0.3137, + "step": 4553 + }, + { + "epoch": 0.36432, + "grad_norm": 1.511091947555542, + "learning_rate": 7.074578768783764e-06, + "loss": 0.3978, + "step": 4554 + }, + { + "epoch": 0.3644, + "grad_norm": 1.2580100297927856, + "learning_rate": 7.073434424158845e-06, + "loss": 0.2876, + "step": 4555 + }, + { + "epoch": 0.36448, + "grad_norm": 1.751238465309143, + "learning_rate": 7.072289948354665e-06, + "loss": 0.3838, + "step": 4556 + }, + { + "epoch": 0.36456, + "grad_norm": 1.7836511135101318, + "learning_rate": 7.071145341443635e-06, + "loss": 0.418, + "step": 4557 + }, + { + "epoch": 0.36464, + "grad_norm": 1.676069974899292, + "learning_rate": 7.070000603498169e-06, + "loss": 0.4017, + "step": 4558 + }, + { + "epoch": 0.36472, + "grad_norm": 1.334571123123169, + "learning_rate": 7.06885573459069e-06, + "loss": 0.2776, + "step": 4559 + }, + { + "epoch": 0.3648, + "grad_norm": 1.4708949327468872, + "learning_rate": 7.067710734793631e-06, + "loss": 0.2896, + "step": 4560 + }, + { + "epoch": 0.36488, + "grad_norm": 1.422624945640564, + "learning_rate": 7.066565604179435e-06, + "loss": 0.3168, + "step": 4561 + }, + { + "epoch": 0.36496, + "grad_norm": 1.2501122951507568, + "learning_rate": 7.065420342820546e-06, + "loss": 0.2806, + "step": 4562 + }, + { + "epoch": 0.36504, + "grad_norm": 1.5056577920913696, + "learning_rate": 7.064274950789424e-06, + "loss": 0.2764, + "step": 4563 + }, + { + "epoch": 0.36512, + "grad_norm": 1.5189259052276611, + "learning_rate": 7.063129428158533e-06, + "loss": 0.3423, + "step": 4564 + }, + { + "epoch": 0.3652, + "grad_norm": 1.5131878852844238, + "learning_rate": 7.061983775000345e-06, + "loss": 0.3619, + "step": 4565 + }, + { + "epoch": 0.36528, + "grad_norm": 1.3569074869155884, + "learning_rate": 7.0608379913873444e-06, + "loss": 0.2639, + "step": 4566 + }, + { + "epoch": 0.36536, + "grad_norm": 1.1969329118728638, + "learning_rate": 7.059692077392018e-06, + "loss": 0.2341, + "step": 4567 + }, + { + "epoch": 0.36544, + "grad_norm": 1.7154649496078491, + "learning_rate": 7.058546033086867e-06, + "loss": 0.4684, + "step": 4568 + }, + { + "epoch": 0.36552, + "grad_norm": 1.4121015071868896, + "learning_rate": 7.057399858544396e-06, + "loss": 0.3091, + "step": 4569 + }, + { + "epoch": 0.3656, + "grad_norm": 1.7438020706176758, + "learning_rate": 7.056253553837119e-06, + "loss": 0.4294, + "step": 4570 + }, + { + "epoch": 0.36568, + "grad_norm": 1.392918586730957, + "learning_rate": 7.0551071190375605e-06, + "loss": 0.3318, + "step": 4571 + }, + { + "epoch": 0.36576, + "grad_norm": 1.6059353351593018, + "learning_rate": 7.053960554218252e-06, + "loss": 0.3442, + "step": 4572 + }, + { + "epoch": 0.36584, + "grad_norm": 1.400192379951477, + "learning_rate": 7.052813859451731e-06, + "loss": 0.3341, + "step": 4573 + }, + { + "epoch": 0.36592, + "grad_norm": 1.4998594522476196, + "learning_rate": 7.0516670348105455e-06, + "loss": 0.365, + "step": 4574 + }, + { + "epoch": 0.366, + "grad_norm": 1.7953191995620728, + "learning_rate": 7.050520080367254e-06, + "loss": 0.3027, + "step": 4575 + }, + { + "epoch": 0.36608, + "grad_norm": 2.084156036376953, + "learning_rate": 7.049372996194415e-06, + "loss": 0.43, + "step": 4576 + }, + { + "epoch": 0.36616, + "grad_norm": 1.0851385593414307, + "learning_rate": 7.048225782364605e-06, + "loss": 0.244, + "step": 4577 + }, + { + "epoch": 0.36624, + "grad_norm": 1.446227788925171, + "learning_rate": 7.047078438950403e-06, + "loss": 0.3054, + "step": 4578 + }, + { + "epoch": 0.36632, + "grad_norm": 1.2726980447769165, + "learning_rate": 7.0459309660243976e-06, + "loss": 0.292, + "step": 4579 + }, + { + "epoch": 0.3664, + "grad_norm": 1.6056383848190308, + "learning_rate": 7.044783363659185e-06, + "loss": 0.3092, + "step": 4580 + }, + { + "epoch": 0.36648, + "grad_norm": 1.5209200382232666, + "learning_rate": 7.043635631927372e-06, + "loss": 0.3111, + "step": 4581 + }, + { + "epoch": 0.36656, + "grad_norm": 1.5532723665237427, + "learning_rate": 7.04248777090157e-06, + "loss": 0.339, + "step": 4582 + }, + { + "epoch": 0.36664, + "grad_norm": 2.0113229751586914, + "learning_rate": 7.041339780654401e-06, + "loss": 0.3921, + "step": 4583 + }, + { + "epoch": 0.36672, + "grad_norm": 1.3075275421142578, + "learning_rate": 7.040191661258495e-06, + "loss": 0.346, + "step": 4584 + }, + { + "epoch": 0.3668, + "grad_norm": 1.2967735528945923, + "learning_rate": 7.039043412786489e-06, + "loss": 0.3008, + "step": 4585 + }, + { + "epoch": 0.36688, + "grad_norm": 1.4599136114120483, + "learning_rate": 7.037895035311029e-06, + "loss": 0.337, + "step": 4586 + }, + { + "epoch": 0.36696, + "grad_norm": 1.6987855434417725, + "learning_rate": 7.0367465289047685e-06, + "loss": 0.3539, + "step": 4587 + }, + { + "epoch": 0.36704, + "grad_norm": 1.5327343940734863, + "learning_rate": 7.03559789364037e-06, + "loss": 0.4788, + "step": 4588 + }, + { + "epoch": 0.36712, + "grad_norm": 1.7550090551376343, + "learning_rate": 7.034449129590504e-06, + "loss": 0.3794, + "step": 4589 + }, + { + "epoch": 0.3672, + "grad_norm": 1.623400092124939, + "learning_rate": 7.033300236827849e-06, + "loss": 0.323, + "step": 4590 + }, + { + "epoch": 0.36728, + "grad_norm": 1.9743831157684326, + "learning_rate": 7.032151215425092e-06, + "loss": 0.3837, + "step": 4591 + }, + { + "epoch": 0.36736, + "grad_norm": 1.4057821035385132, + "learning_rate": 7.0310020654549264e-06, + "loss": 0.289, + "step": 4592 + }, + { + "epoch": 0.36744, + "grad_norm": 1.7358758449554443, + "learning_rate": 7.029852786990056e-06, + "loss": 0.3107, + "step": 4593 + }, + { + "epoch": 0.36752, + "grad_norm": 1.3099263906478882, + "learning_rate": 7.028703380103192e-06, + "loss": 0.2482, + "step": 4594 + }, + { + "epoch": 0.3676, + "grad_norm": 1.3868745565414429, + "learning_rate": 7.027553844867052e-06, + "loss": 0.3824, + "step": 4595 + }, + { + "epoch": 0.36768, + "grad_norm": 1.7561724185943604, + "learning_rate": 7.0264041813543655e-06, + "loss": 0.3757, + "step": 4596 + }, + { + "epoch": 0.36776, + "grad_norm": 1.4879529476165771, + "learning_rate": 7.0252543896378666e-06, + "loss": 0.3778, + "step": 4597 + }, + { + "epoch": 0.36784, + "grad_norm": 1.437186360359192, + "learning_rate": 7.024104469790301e-06, + "loss": 0.3313, + "step": 4598 + }, + { + "epoch": 0.36792, + "grad_norm": 1.691285252571106, + "learning_rate": 7.022954421884415e-06, + "loss": 0.4338, + "step": 4599 + }, + { + "epoch": 0.368, + "grad_norm": 1.7200967073440552, + "learning_rate": 7.021804245992973e-06, + "loss": 0.3275, + "step": 4600 + }, + { + "epoch": 0.36808, + "grad_norm": 1.6157788038253784, + "learning_rate": 7.020653942188741e-06, + "loss": 0.4735, + "step": 4601 + }, + { + "epoch": 0.36816, + "grad_norm": 1.5696213245391846, + "learning_rate": 7.019503510544496e-06, + "loss": 0.3177, + "step": 4602 + }, + { + "epoch": 0.36824, + "grad_norm": 1.3805010318756104, + "learning_rate": 7.01835295113302e-06, + "loss": 0.2877, + "step": 4603 + }, + { + "epoch": 0.36832, + "grad_norm": 2.140430212020874, + "learning_rate": 7.017202264027108e-06, + "loss": 0.5055, + "step": 4604 + }, + { + "epoch": 0.3684, + "grad_norm": 1.4156843423843384, + "learning_rate": 7.016051449299556e-06, + "loss": 0.347, + "step": 4605 + }, + { + "epoch": 0.36848, + "grad_norm": 1.9364955425262451, + "learning_rate": 7.014900507023175e-06, + "loss": 0.3778, + "step": 4606 + }, + { + "epoch": 0.36856, + "grad_norm": 1.7083888053894043, + "learning_rate": 7.013749437270781e-06, + "loss": 0.3028, + "step": 4607 + }, + { + "epoch": 0.36864, + "grad_norm": 1.3670393228530884, + "learning_rate": 7.012598240115201e-06, + "loss": 0.2461, + "step": 4608 + }, + { + "epoch": 0.36872, + "grad_norm": 1.3490629196166992, + "learning_rate": 7.011446915629261e-06, + "loss": 0.2549, + "step": 4609 + }, + { + "epoch": 0.3688, + "grad_norm": 1.7474075555801392, + "learning_rate": 7.0102954638858065e-06, + "loss": 0.4072, + "step": 4610 + }, + { + "epoch": 0.36888, + "grad_norm": 1.9252272844314575, + "learning_rate": 7.009143884957684e-06, + "loss": 0.3497, + "step": 4611 + }, + { + "epoch": 0.36896, + "grad_norm": 1.2210733890533447, + "learning_rate": 7.007992178917751e-06, + "loss": 0.2395, + "step": 4612 + }, + { + "epoch": 0.36904, + "grad_norm": 1.4923193454742432, + "learning_rate": 7.0068403458388715e-06, + "loss": 0.3063, + "step": 4613 + }, + { + "epoch": 0.36912, + "grad_norm": 1.8423126935958862, + "learning_rate": 7.0056883857939174e-06, + "loss": 0.5195, + "step": 4614 + }, + { + "epoch": 0.3692, + "grad_norm": 1.8047568798065186, + "learning_rate": 7.004536298855771e-06, + "loss": 0.4048, + "step": 4615 + }, + { + "epoch": 0.36928, + "grad_norm": 1.594499111175537, + "learning_rate": 7.003384085097319e-06, + "loss": 0.3086, + "step": 4616 + }, + { + "epoch": 0.36936, + "grad_norm": 1.4673727750778198, + "learning_rate": 7.00223174459146e-06, + "loss": 0.2774, + "step": 4617 + }, + { + "epoch": 0.36944, + "grad_norm": 1.760596752166748, + "learning_rate": 7.001079277411098e-06, + "loss": 0.3534, + "step": 4618 + }, + { + "epoch": 0.36952, + "grad_norm": 1.7721569538116455, + "learning_rate": 6.9999266836291464e-06, + "loss": 0.3507, + "step": 4619 + }, + { + "epoch": 0.3696, + "grad_norm": 1.4745303392410278, + "learning_rate": 6.9987739633185245e-06, + "loss": 0.35, + "step": 4620 + }, + { + "epoch": 0.36968, + "grad_norm": 1.7846648693084717, + "learning_rate": 6.9976211165521635e-06, + "loss": 0.4137, + "step": 4621 + }, + { + "epoch": 0.36976, + "grad_norm": 1.6463090181350708, + "learning_rate": 6.996468143402997e-06, + "loss": 0.3458, + "step": 4622 + }, + { + "epoch": 0.36984, + "grad_norm": 1.7248306274414062, + "learning_rate": 6.995315043943971e-06, + "loss": 0.3225, + "step": 4623 + }, + { + "epoch": 0.36992, + "grad_norm": 2.039926528930664, + "learning_rate": 6.9941618182480384e-06, + "loss": 0.4484, + "step": 4624 + }, + { + "epoch": 0.37, + "grad_norm": 1.9294219017028809, + "learning_rate": 6.993008466388161e-06, + "loss": 0.407, + "step": 4625 + }, + { + "epoch": 0.37008, + "grad_norm": 1.5425759553909302, + "learning_rate": 6.991854988437307e-06, + "loss": 0.3614, + "step": 4626 + }, + { + "epoch": 0.37016, + "grad_norm": 1.3891255855560303, + "learning_rate": 6.990701384468451e-06, + "loss": 0.2851, + "step": 4627 + }, + { + "epoch": 0.37024, + "grad_norm": 2.027620792388916, + "learning_rate": 6.9895476545545804e-06, + "loss": 0.3724, + "step": 4628 + }, + { + "epoch": 0.37032, + "grad_norm": 1.5156015157699585, + "learning_rate": 6.988393798768685e-06, + "loss": 0.3557, + "step": 4629 + }, + { + "epoch": 0.3704, + "grad_norm": 1.4785630702972412, + "learning_rate": 6.987239817183769e-06, + "loss": 0.3771, + "step": 4630 + }, + { + "epoch": 0.37048, + "grad_norm": 1.4804576635360718, + "learning_rate": 6.98608570987284e-06, + "loss": 0.4071, + "step": 4631 + }, + { + "epoch": 0.37056, + "grad_norm": 1.5395426750183105, + "learning_rate": 6.984931476908911e-06, + "loss": 0.337, + "step": 4632 + }, + { + "epoch": 0.37064, + "grad_norm": 1.7918102741241455, + "learning_rate": 6.983777118365011e-06, + "loss": 0.4346, + "step": 4633 + }, + { + "epoch": 0.37072, + "grad_norm": 1.6034590005874634, + "learning_rate": 6.982622634314171e-06, + "loss": 0.3478, + "step": 4634 + }, + { + "epoch": 0.3708, + "grad_norm": 1.6067657470703125, + "learning_rate": 6.981468024829428e-06, + "loss": 0.4161, + "step": 4635 + }, + { + "epoch": 0.37088, + "grad_norm": 1.1507148742675781, + "learning_rate": 6.980313289983836e-06, + "loss": 0.2877, + "step": 4636 + }, + { + "epoch": 0.37096, + "grad_norm": 1.345964789390564, + "learning_rate": 6.979158429850448e-06, + "loss": 0.2516, + "step": 4637 + }, + { + "epoch": 0.37104, + "grad_norm": 1.6395801305770874, + "learning_rate": 6.978003444502326e-06, + "loss": 0.4643, + "step": 4638 + }, + { + "epoch": 0.37112, + "grad_norm": 1.3134948015213013, + "learning_rate": 6.976848334012546e-06, + "loss": 0.3187, + "step": 4639 + }, + { + "epoch": 0.3712, + "grad_norm": 1.9457036256790161, + "learning_rate": 6.975693098454186e-06, + "loss": 0.4266, + "step": 4640 + }, + { + "epoch": 0.37128, + "grad_norm": 1.6113479137420654, + "learning_rate": 6.974537737900336e-06, + "loss": 0.3117, + "step": 4641 + }, + { + "epoch": 0.37136, + "grad_norm": 1.1665359735488892, + "learning_rate": 6.973382252424088e-06, + "loss": 0.2507, + "step": 4642 + }, + { + "epoch": 0.37144, + "grad_norm": 1.7264834642410278, + "learning_rate": 6.9722266420985495e-06, + "loss": 0.4146, + "step": 4643 + }, + { + "epoch": 0.37152, + "grad_norm": 2.008244276046753, + "learning_rate": 6.97107090699683e-06, + "loss": 0.3615, + "step": 4644 + }, + { + "epoch": 0.3716, + "grad_norm": 1.2290860414505005, + "learning_rate": 6.969915047192049e-06, + "loss": 0.2695, + "step": 4645 + }, + { + "epoch": 0.37168, + "grad_norm": 1.5274767875671387, + "learning_rate": 6.968759062757334e-06, + "loss": 0.281, + "step": 4646 + }, + { + "epoch": 0.37176, + "grad_norm": 1.9756278991699219, + "learning_rate": 6.967602953765821e-06, + "loss": 0.3648, + "step": 4647 + }, + { + "epoch": 0.37184, + "grad_norm": 1.4363765716552734, + "learning_rate": 6.966446720290652e-06, + "loss": 0.3549, + "step": 4648 + }, + { + "epoch": 0.37192, + "grad_norm": 1.5073497295379639, + "learning_rate": 6.9652903624049804e-06, + "loss": 0.3297, + "step": 4649 + }, + { + "epoch": 0.372, + "grad_norm": 1.4804567098617554, + "learning_rate": 6.964133880181962e-06, + "loss": 0.3181, + "step": 4650 + }, + { + "epoch": 0.37208, + "grad_norm": 1.5861263275146484, + "learning_rate": 6.962977273694765e-06, + "loss": 0.3307, + "step": 4651 + }, + { + "epoch": 0.37216, + "grad_norm": 2.007474422454834, + "learning_rate": 6.961820543016565e-06, + "loss": 0.3815, + "step": 4652 + }, + { + "epoch": 0.37224, + "grad_norm": 1.6723278760910034, + "learning_rate": 6.960663688220543e-06, + "loss": 0.3828, + "step": 4653 + }, + { + "epoch": 0.37232, + "grad_norm": 1.481465458869934, + "learning_rate": 6.959506709379891e-06, + "loss": 0.2932, + "step": 4654 + }, + { + "epoch": 0.3724, + "grad_norm": 1.6673827171325684, + "learning_rate": 6.958349606567806e-06, + "loss": 0.3636, + "step": 4655 + }, + { + "epoch": 0.37248, + "grad_norm": 2.048527956008911, + "learning_rate": 6.957192379857493e-06, + "loss": 0.45, + "step": 4656 + }, + { + "epoch": 0.37256, + "grad_norm": 1.8853121995925903, + "learning_rate": 6.9560350293221655e-06, + "loss": 0.3628, + "step": 4657 + }, + { + "epoch": 0.37264, + "grad_norm": 1.4432258605957031, + "learning_rate": 6.954877555035049e-06, + "loss": 0.2615, + "step": 4658 + }, + { + "epoch": 0.37272, + "grad_norm": 1.38565993309021, + "learning_rate": 6.953719957069369e-06, + "loss": 0.2794, + "step": 4659 + }, + { + "epoch": 0.3728, + "grad_norm": 2.1929290294647217, + "learning_rate": 6.952562235498366e-06, + "loss": 0.5944, + "step": 4660 + }, + { + "epoch": 0.37288, + "grad_norm": 1.4748060703277588, + "learning_rate": 6.951404390395282e-06, + "loss": 0.2881, + "step": 4661 + }, + { + "epoch": 0.37296, + "grad_norm": 1.4496852159500122, + "learning_rate": 6.950246421833373e-06, + "loss": 0.2684, + "step": 4662 + }, + { + "epoch": 0.37304, + "grad_norm": 1.4460296630859375, + "learning_rate": 6.949088329885898e-06, + "loss": 0.2797, + "step": 4663 + }, + { + "epoch": 0.37312, + "grad_norm": 1.7262259721755981, + "learning_rate": 6.947930114626125e-06, + "loss": 0.3871, + "step": 4664 + }, + { + "epoch": 0.3732, + "grad_norm": 1.6757460832595825, + "learning_rate": 6.946771776127334e-06, + "loss": 0.2729, + "step": 4665 + }, + { + "epoch": 0.37328, + "grad_norm": 1.40213143825531, + "learning_rate": 6.945613314462804e-06, + "loss": 0.3466, + "step": 4666 + }, + { + "epoch": 0.37336, + "grad_norm": 2.0440454483032227, + "learning_rate": 6.9444547297058315e-06, + "loss": 0.4531, + "step": 4667 + }, + { + "epoch": 0.37344, + "grad_norm": 1.950875163078308, + "learning_rate": 6.943296021929713e-06, + "loss": 0.4574, + "step": 4668 + }, + { + "epoch": 0.37352, + "grad_norm": 1.5116314888000488, + "learning_rate": 6.9421371912077585e-06, + "loss": 0.4164, + "step": 4669 + }, + { + "epoch": 0.3736, + "grad_norm": 2.183765172958374, + "learning_rate": 6.9409782376132805e-06, + "loss": 0.5813, + "step": 4670 + }, + { + "epoch": 0.37368, + "grad_norm": 1.54366934299469, + "learning_rate": 6.9398191612196055e-06, + "loss": 0.3999, + "step": 4671 + }, + { + "epoch": 0.37376, + "grad_norm": 1.3263498544692993, + "learning_rate": 6.938659962100064e-06, + "loss": 0.2936, + "step": 4672 + }, + { + "epoch": 0.37384, + "grad_norm": 1.3171329498291016, + "learning_rate": 6.937500640327992e-06, + "loss": 0.3129, + "step": 4673 + }, + { + "epoch": 0.37392, + "grad_norm": 1.4824219942092896, + "learning_rate": 6.936341195976737e-06, + "loss": 0.2798, + "step": 4674 + }, + { + "epoch": 0.374, + "grad_norm": 1.6193792819976807, + "learning_rate": 6.935181629119654e-06, + "loss": 0.3232, + "step": 4675 + }, + { + "epoch": 0.37408, + "grad_norm": 2.0140795707702637, + "learning_rate": 6.934021939830104e-06, + "loss": 0.4507, + "step": 4676 + }, + { + "epoch": 0.37416, + "grad_norm": 1.3184332847595215, + "learning_rate": 6.932862128181459e-06, + "loss": 0.3604, + "step": 4677 + }, + { + "epoch": 0.37424, + "grad_norm": 1.5860306024551392, + "learning_rate": 6.931702194247094e-06, + "loss": 0.3256, + "step": 4678 + }, + { + "epoch": 0.37432, + "grad_norm": 1.9520553350448608, + "learning_rate": 6.930542138100393e-06, + "loss": 0.416, + "step": 4679 + }, + { + "epoch": 0.3744, + "grad_norm": 1.1498091220855713, + "learning_rate": 6.929381959814751e-06, + "loss": 0.3029, + "step": 4680 + }, + { + "epoch": 0.37448, + "grad_norm": 1.5657634735107422, + "learning_rate": 6.928221659463568e-06, + "loss": 0.3745, + "step": 4681 + }, + { + "epoch": 0.37456, + "grad_norm": 1.403543472290039, + "learning_rate": 6.9270612371202524e-06, + "loss": 0.3247, + "step": 4682 + }, + { + "epoch": 0.37464, + "grad_norm": 1.869012475013733, + "learning_rate": 6.925900692858222e-06, + "loss": 0.3587, + "step": 4683 + }, + { + "epoch": 0.37472, + "grad_norm": 1.9943881034851074, + "learning_rate": 6.924740026750898e-06, + "loss": 0.4173, + "step": 4684 + }, + { + "epoch": 0.3748, + "grad_norm": 1.7604013681411743, + "learning_rate": 6.923579238871712e-06, + "loss": 0.3363, + "step": 4685 + }, + { + "epoch": 0.37488, + "grad_norm": 1.7223265171051025, + "learning_rate": 6.922418329294104e-06, + "loss": 0.3656, + "step": 4686 + }, + { + "epoch": 0.37496, + "grad_norm": 1.5790249109268188, + "learning_rate": 6.921257298091522e-06, + "loss": 0.3046, + "step": 4687 + }, + { + "epoch": 0.37504, + "grad_norm": 1.5063668489456177, + "learning_rate": 6.920096145337418e-06, + "loss": 0.2886, + "step": 4688 + }, + { + "epoch": 0.37512, + "grad_norm": 1.7410809993743896, + "learning_rate": 6.9189348711052565e-06, + "loss": 0.3991, + "step": 4689 + }, + { + "epoch": 0.3752, + "grad_norm": 1.309981346130371, + "learning_rate": 6.9177734754685055e-06, + "loss": 0.2905, + "step": 4690 + }, + { + "epoch": 0.37528, + "grad_norm": 1.7682570219039917, + "learning_rate": 6.916611958500644e-06, + "loss": 0.3659, + "step": 4691 + }, + { + "epoch": 0.37536, + "grad_norm": 1.6350557804107666, + "learning_rate": 6.9154503202751564e-06, + "loss": 0.3403, + "step": 4692 + }, + { + "epoch": 0.37544, + "grad_norm": 1.6600843667984009, + "learning_rate": 6.914288560865536e-06, + "loss": 0.3286, + "step": 4693 + }, + { + "epoch": 0.37552, + "grad_norm": 1.66802179813385, + "learning_rate": 6.913126680345285e-06, + "loss": 0.4507, + "step": 4694 + }, + { + "epoch": 0.3756, + "grad_norm": 2.0192372798919678, + "learning_rate": 6.911964678787908e-06, + "loss": 0.459, + "step": 4695 + }, + { + "epoch": 0.37568, + "grad_norm": 1.459499478340149, + "learning_rate": 6.910802556266927e-06, + "loss": 0.279, + "step": 4696 + }, + { + "epoch": 0.37576, + "grad_norm": 2.2443490028381348, + "learning_rate": 6.909640312855859e-06, + "loss": 0.3926, + "step": 4697 + }, + { + "epoch": 0.37584, + "grad_norm": 1.5917470455169678, + "learning_rate": 6.9084779486282385e-06, + "loss": 0.3271, + "step": 4698 + }, + { + "epoch": 0.37592, + "grad_norm": 1.4331306219100952, + "learning_rate": 6.907315463657603e-06, + "loss": 0.2767, + "step": 4699 + }, + { + "epoch": 0.376, + "grad_norm": 1.7702385187149048, + "learning_rate": 6.906152858017502e-06, + "loss": 0.3622, + "step": 4700 + }, + { + "epoch": 0.37608, + "grad_norm": 1.6627930402755737, + "learning_rate": 6.904990131781486e-06, + "loss": 0.4196, + "step": 4701 + }, + { + "epoch": 0.37616, + "grad_norm": 1.6581454277038574, + "learning_rate": 6.90382728502312e-06, + "loss": 0.3285, + "step": 4702 + }, + { + "epoch": 0.37624, + "grad_norm": 1.6215664148330688, + "learning_rate": 6.9026643178159714e-06, + "loss": 0.3781, + "step": 4703 + }, + { + "epoch": 0.37632, + "grad_norm": 1.4504485130310059, + "learning_rate": 6.901501230233617e-06, + "loss": 0.3214, + "step": 4704 + }, + { + "epoch": 0.3764, + "grad_norm": 1.4753299951553345, + "learning_rate": 6.900338022349643e-06, + "loss": 0.3267, + "step": 4705 + }, + { + "epoch": 0.37648, + "grad_norm": 1.9508014917373657, + "learning_rate": 6.89917469423764e-06, + "loss": 0.4347, + "step": 4706 + }, + { + "epoch": 0.37656, + "grad_norm": 1.515520453453064, + "learning_rate": 6.89801124597121e-06, + "loss": 0.3053, + "step": 4707 + }, + { + "epoch": 0.37664, + "grad_norm": 1.8075672388076782, + "learning_rate": 6.896847677623959e-06, + "loss": 0.3736, + "step": 4708 + }, + { + "epoch": 0.37672, + "grad_norm": 1.3629133701324463, + "learning_rate": 6.8956839892695015e-06, + "loss": 0.2908, + "step": 4709 + }, + { + "epoch": 0.3768, + "grad_norm": 1.7496662139892578, + "learning_rate": 6.894520180981461e-06, + "loss": 0.3834, + "step": 4710 + }, + { + "epoch": 0.37688, + "grad_norm": 2.0693397521972656, + "learning_rate": 6.893356252833469e-06, + "loss": 0.3402, + "step": 4711 + }, + { + "epoch": 0.37696, + "grad_norm": 1.662286400794983, + "learning_rate": 6.892192204899161e-06, + "loss": 0.3915, + "step": 4712 + }, + { + "epoch": 0.37704, + "grad_norm": 1.633707880973816, + "learning_rate": 6.8910280372521834e-06, + "loss": 0.2893, + "step": 4713 + }, + { + "epoch": 0.37712, + "grad_norm": 1.2424566745758057, + "learning_rate": 6.8898637499661906e-06, + "loss": 0.2871, + "step": 4714 + }, + { + "epoch": 0.3772, + "grad_norm": 1.8725136518478394, + "learning_rate": 6.8886993431148395e-06, + "loss": 0.406, + "step": 4715 + }, + { + "epoch": 0.37728, + "grad_norm": 1.4558591842651367, + "learning_rate": 6.887534816771802e-06, + "loss": 0.2686, + "step": 4716 + }, + { + "epoch": 0.37736, + "grad_norm": 1.2151341438293457, + "learning_rate": 6.886370171010752e-06, + "loss": 0.2585, + "step": 4717 + }, + { + "epoch": 0.37744, + "grad_norm": 1.3010770082473755, + "learning_rate": 6.885205405905373e-06, + "loss": 0.2596, + "step": 4718 + }, + { + "epoch": 0.37752, + "grad_norm": 1.303667664527893, + "learning_rate": 6.884040521529356e-06, + "loss": 0.2937, + "step": 4719 + }, + { + "epoch": 0.3776, + "grad_norm": 1.5519078969955444, + "learning_rate": 6.8828755179564e-06, + "loss": 0.3064, + "step": 4720 + }, + { + "epoch": 0.37768, + "grad_norm": 2.096662759780884, + "learning_rate": 6.88171039526021e-06, + "loss": 0.469, + "step": 4721 + }, + { + "epoch": 0.37776, + "grad_norm": 1.5880454778671265, + "learning_rate": 6.880545153514498e-06, + "loss": 0.3606, + "step": 4722 + }, + { + "epoch": 0.37784, + "grad_norm": 1.351523756980896, + "learning_rate": 6.879379792792988e-06, + "loss": 0.3292, + "step": 4723 + }, + { + "epoch": 0.37792, + "grad_norm": 1.5579664707183838, + "learning_rate": 6.878214313169407e-06, + "loss": 0.3451, + "step": 4724 + }, + { + "epoch": 0.378, + "grad_norm": 1.906211018562317, + "learning_rate": 6.87704871471749e-06, + "loss": 0.3471, + "step": 4725 + }, + { + "epoch": 0.37808, + "grad_norm": 1.8630839586257935, + "learning_rate": 6.875882997510982e-06, + "loss": 0.4373, + "step": 4726 + }, + { + "epoch": 0.37816, + "grad_norm": 1.5359853506088257, + "learning_rate": 6.874717161623633e-06, + "loss": 0.323, + "step": 4727 + }, + { + "epoch": 0.37824, + "grad_norm": 1.6257649660110474, + "learning_rate": 6.8735512071292024e-06, + "loss": 0.4539, + "step": 4728 + }, + { + "epoch": 0.37832, + "grad_norm": 1.5281004905700684, + "learning_rate": 6.872385134101454e-06, + "loss": 0.3276, + "step": 4729 + }, + { + "epoch": 0.3784, + "grad_norm": 1.4861382246017456, + "learning_rate": 6.871218942614165e-06, + "loss": 0.398, + "step": 4730 + }, + { + "epoch": 0.37848, + "grad_norm": 1.907596230506897, + "learning_rate": 6.8700526327411155e-06, + "loss": 0.3705, + "step": 4731 + }, + { + "epoch": 0.37856, + "grad_norm": 1.6404950618743896, + "learning_rate": 6.868886204556092e-06, + "loss": 0.4075, + "step": 4732 + }, + { + "epoch": 0.37864, + "grad_norm": 1.4109077453613281, + "learning_rate": 6.867719658132892e-06, + "loss": 0.3178, + "step": 4733 + }, + { + "epoch": 0.37872, + "grad_norm": 1.863938808441162, + "learning_rate": 6.866552993545319e-06, + "loss": 0.3065, + "step": 4734 + }, + { + "epoch": 0.3788, + "grad_norm": 2.2239177227020264, + "learning_rate": 6.865386210867182e-06, + "loss": 0.4868, + "step": 4735 + }, + { + "epoch": 0.37888, + "grad_norm": 1.7756961584091187, + "learning_rate": 6.864219310172302e-06, + "loss": 0.3413, + "step": 4736 + }, + { + "epoch": 0.37896, + "grad_norm": 1.6238456964492798, + "learning_rate": 6.863052291534505e-06, + "loss": 0.3501, + "step": 4737 + }, + { + "epoch": 0.37904, + "grad_norm": 1.5405563116073608, + "learning_rate": 6.8618851550276225e-06, + "loss": 0.3405, + "step": 4738 + }, + { + "epoch": 0.37912, + "grad_norm": 1.663956642150879, + "learning_rate": 6.860717900725495e-06, + "loss": 0.378, + "step": 4739 + }, + { + "epoch": 0.3792, + "grad_norm": 1.5309809446334839, + "learning_rate": 6.859550528701972e-06, + "loss": 0.3202, + "step": 4740 + }, + { + "epoch": 0.37928, + "grad_norm": 1.7055078744888306, + "learning_rate": 6.858383039030911e-06, + "loss": 0.3591, + "step": 4741 + }, + { + "epoch": 0.37936, + "grad_norm": 1.452616572380066, + "learning_rate": 6.857215431786172e-06, + "loss": 0.3181, + "step": 4742 + }, + { + "epoch": 0.37944, + "grad_norm": 1.454076886177063, + "learning_rate": 6.856047707041628e-06, + "loss": 0.319, + "step": 4743 + }, + { + "epoch": 0.37952, + "grad_norm": 1.9659279584884644, + "learning_rate": 6.854879864871155e-06, + "loss": 0.4883, + "step": 4744 + }, + { + "epoch": 0.3796, + "grad_norm": 1.0691478252410889, + "learning_rate": 6.85371190534864e-06, + "loss": 0.2573, + "step": 4745 + }, + { + "epoch": 0.37968, + "grad_norm": 1.682841420173645, + "learning_rate": 6.8525438285479755e-06, + "loss": 0.2741, + "step": 4746 + }, + { + "epoch": 0.37976, + "grad_norm": 1.7029143571853638, + "learning_rate": 6.85137563454306e-06, + "loss": 0.3921, + "step": 4747 + }, + { + "epoch": 0.37984, + "grad_norm": 1.5727744102478027, + "learning_rate": 6.850207323407803e-06, + "loss": 0.3508, + "step": 4748 + }, + { + "epoch": 0.37992, + "grad_norm": 1.4125081300735474, + "learning_rate": 6.8490388952161215e-06, + "loss": 0.3447, + "step": 4749 + }, + { + "epoch": 0.38, + "grad_norm": 1.9592466354370117, + "learning_rate": 6.847870350041934e-06, + "loss": 0.3249, + "step": 4750 + }, + { + "epoch": 0.38008, + "grad_norm": 1.4628976583480835, + "learning_rate": 6.846701687959173e-06, + "loss": 0.2875, + "step": 4751 + }, + { + "epoch": 0.38016, + "grad_norm": 1.5076208114624023, + "learning_rate": 6.845532909041775e-06, + "loss": 0.3145, + "step": 4752 + }, + { + "epoch": 0.38024, + "grad_norm": 1.3270679712295532, + "learning_rate": 6.8443640133636845e-06, + "loss": 0.3381, + "step": 4753 + }, + { + "epoch": 0.38032, + "grad_norm": 2.057631731033325, + "learning_rate": 6.8431950009988565e-06, + "loss": 0.3674, + "step": 4754 + }, + { + "epoch": 0.3804, + "grad_norm": 1.4664708375930786, + "learning_rate": 6.842025872021246e-06, + "loss": 0.3211, + "step": 4755 + }, + { + "epoch": 0.38048, + "grad_norm": 1.819962739944458, + "learning_rate": 6.840856626504822e-06, + "loss": 0.3592, + "step": 4756 + }, + { + "epoch": 0.38056, + "grad_norm": 1.3360856771469116, + "learning_rate": 6.8396872645235615e-06, + "loss": 0.2608, + "step": 4757 + }, + { + "epoch": 0.38064, + "grad_norm": 1.8079757690429688, + "learning_rate": 6.838517786151441e-06, + "loss": 0.4004, + "step": 4758 + }, + { + "epoch": 0.38072, + "grad_norm": 1.4537521600723267, + "learning_rate": 6.837348191462452e-06, + "loss": 0.288, + "step": 4759 + }, + { + "epoch": 0.3808, + "grad_norm": 1.3475890159606934, + "learning_rate": 6.8361784805305905e-06, + "loss": 0.2607, + "step": 4760 + }, + { + "epoch": 0.38088, + "grad_norm": 1.749821424484253, + "learning_rate": 6.835008653429862e-06, + "loss": 0.3755, + "step": 4761 + }, + { + "epoch": 0.38096, + "grad_norm": 1.7868049144744873, + "learning_rate": 6.833838710234274e-06, + "loss": 0.5124, + "step": 4762 + }, + { + "epoch": 0.38104, + "grad_norm": 1.7680143117904663, + "learning_rate": 6.8326686510178475e-06, + "loss": 0.3669, + "step": 4763 + }, + { + "epoch": 0.38112, + "grad_norm": 1.4955719709396362, + "learning_rate": 6.831498475854607e-06, + "loss": 0.3469, + "step": 4764 + }, + { + "epoch": 0.3812, + "grad_norm": 1.3218600749969482, + "learning_rate": 6.8303281848185864e-06, + "loss": 0.3065, + "step": 4765 + }, + { + "epoch": 0.38128, + "grad_norm": 1.1174854040145874, + "learning_rate": 6.829157777983828e-06, + "loss": 0.2463, + "step": 4766 + }, + { + "epoch": 0.38136, + "grad_norm": 1.5425974130630493, + "learning_rate": 6.827987255424375e-06, + "loss": 0.3862, + "step": 4767 + }, + { + "epoch": 0.38144, + "grad_norm": 1.8264150619506836, + "learning_rate": 6.826816617214287e-06, + "loss": 0.4402, + "step": 4768 + }, + { + "epoch": 0.38152, + "grad_norm": 1.5018960237503052, + "learning_rate": 6.8256458634276216e-06, + "loss": 0.3495, + "step": 4769 + }, + { + "epoch": 0.3816, + "grad_norm": 1.9828064441680908, + "learning_rate": 6.824474994138453e-06, + "loss": 0.4644, + "step": 4770 + }, + { + "epoch": 0.38168, + "grad_norm": 1.6333398818969727, + "learning_rate": 6.823304009420855e-06, + "loss": 0.3376, + "step": 4771 + }, + { + "epoch": 0.38176, + "grad_norm": 1.5310996770858765, + "learning_rate": 6.822132909348913e-06, + "loss": 0.3445, + "step": 4772 + }, + { + "epoch": 0.38184, + "grad_norm": 1.3783681392669678, + "learning_rate": 6.820961693996719e-06, + "loss": 0.2656, + "step": 4773 + }, + { + "epoch": 0.38192, + "grad_norm": 1.7104060649871826, + "learning_rate": 6.819790363438372e-06, + "loss": 0.3171, + "step": 4774 + }, + { + "epoch": 0.382, + "grad_norm": 1.5256901979446411, + "learning_rate": 6.818618917747977e-06, + "loss": 0.3218, + "step": 4775 + }, + { + "epoch": 0.38208, + "grad_norm": 1.4606126546859741, + "learning_rate": 6.817447356999649e-06, + "loss": 0.2958, + "step": 4776 + }, + { + "epoch": 0.38216, + "grad_norm": 1.5362343788146973, + "learning_rate": 6.816275681267507e-06, + "loss": 0.3253, + "step": 4777 + }, + { + "epoch": 0.38224, + "grad_norm": 1.2135709524154663, + "learning_rate": 6.815103890625682e-06, + "loss": 0.2384, + "step": 4778 + }, + { + "epoch": 0.38232, + "grad_norm": 1.9567755460739136, + "learning_rate": 6.813931985148306e-06, + "loss": 0.4436, + "step": 4779 + }, + { + "epoch": 0.3824, + "grad_norm": 1.4235215187072754, + "learning_rate": 6.812759964909522e-06, + "loss": 0.2772, + "step": 4780 + }, + { + "epoch": 0.38248, + "grad_norm": 1.557592749595642, + "learning_rate": 6.811587829983481e-06, + "loss": 0.3733, + "step": 4781 + }, + { + "epoch": 0.38256, + "grad_norm": 1.7264912128448486, + "learning_rate": 6.810415580444339e-06, + "loss": 0.3979, + "step": 4782 + }, + { + "epoch": 0.38264, + "grad_norm": 1.3333731889724731, + "learning_rate": 6.809243216366261e-06, + "loss": 0.3936, + "step": 4783 + }, + { + "epoch": 0.38272, + "grad_norm": 1.5328093767166138, + "learning_rate": 6.80807073782342e-06, + "loss": 0.3305, + "step": 4784 + }, + { + "epoch": 0.3828, + "grad_norm": 1.6207122802734375, + "learning_rate": 6.806898144889992e-06, + "loss": 0.3319, + "step": 4785 + }, + { + "epoch": 0.38288, + "grad_norm": 1.7416425943374634, + "learning_rate": 6.8057254376401635e-06, + "loss": 0.4054, + "step": 4786 + }, + { + "epoch": 0.38296, + "grad_norm": 1.8004319667816162, + "learning_rate": 6.80455261614813e-06, + "loss": 0.4405, + "step": 4787 + }, + { + "epoch": 0.38304, + "grad_norm": 1.8042232990264893, + "learning_rate": 6.803379680488089e-06, + "loss": 0.3759, + "step": 4788 + }, + { + "epoch": 0.38312, + "grad_norm": 1.7607706785202026, + "learning_rate": 6.802206630734252e-06, + "loss": 0.4345, + "step": 4789 + }, + { + "epoch": 0.3832, + "grad_norm": 1.388512372970581, + "learning_rate": 6.80103346696083e-06, + "loss": 0.2903, + "step": 4790 + }, + { + "epoch": 0.38328, + "grad_norm": 1.2507061958312988, + "learning_rate": 6.799860189242049e-06, + "loss": 0.2915, + "step": 4791 + }, + { + "epoch": 0.38336, + "grad_norm": 1.4583854675292969, + "learning_rate": 6.798686797652134e-06, + "loss": 0.3479, + "step": 4792 + }, + { + "epoch": 0.38344, + "grad_norm": 1.3498902320861816, + "learning_rate": 6.797513292265323e-06, + "loss": 0.3524, + "step": 4793 + }, + { + "epoch": 0.38352, + "grad_norm": 1.3097642660140991, + "learning_rate": 6.7963396731558615e-06, + "loss": 0.2726, + "step": 4794 + }, + { + "epoch": 0.3836, + "grad_norm": 1.6635525226593018, + "learning_rate": 6.795165940398e-06, + "loss": 0.3982, + "step": 4795 + }, + { + "epoch": 0.38368, + "grad_norm": 1.2881489992141724, + "learning_rate": 6.7939920940659966e-06, + "loss": 0.2503, + "step": 4796 + }, + { + "epoch": 0.38376, + "grad_norm": 1.7574576139450073, + "learning_rate": 6.792818134234115e-06, + "loss": 0.4502, + "step": 4797 + }, + { + "epoch": 0.38384, + "grad_norm": 1.5125421285629272, + "learning_rate": 6.791644060976629e-06, + "loss": 0.401, + "step": 4798 + }, + { + "epoch": 0.38392, + "grad_norm": 1.5928010940551758, + "learning_rate": 6.7904698743678175e-06, + "loss": 0.3318, + "step": 4799 + }, + { + "epoch": 0.384, + "grad_norm": 1.3006750345230103, + "learning_rate": 6.789295574481969e-06, + "loss": 0.2972, + "step": 4800 + }, + { + "epoch": 0.38408, + "grad_norm": 1.9647408723831177, + "learning_rate": 6.788121161393376e-06, + "loss": 0.4477, + "step": 4801 + }, + { + "epoch": 0.38416, + "grad_norm": 1.5676745176315308, + "learning_rate": 6.7869466351763405e-06, + "loss": 0.3331, + "step": 4802 + }, + { + "epoch": 0.38424, + "grad_norm": 1.4142674207687378, + "learning_rate": 6.78577199590517e-06, + "loss": 0.2803, + "step": 4803 + }, + { + "epoch": 0.38432, + "grad_norm": 1.6045714616775513, + "learning_rate": 6.784597243654182e-06, + "loss": 0.3822, + "step": 4804 + }, + { + "epoch": 0.3844, + "grad_norm": 1.3804314136505127, + "learning_rate": 6.783422378497696e-06, + "loss": 0.362, + "step": 4805 + }, + { + "epoch": 0.38448, + "grad_norm": 1.6979622840881348, + "learning_rate": 6.7822474005100435e-06, + "loss": 0.413, + "step": 4806 + }, + { + "epoch": 0.38456, + "grad_norm": 1.5267022848129272, + "learning_rate": 6.781072309765563e-06, + "loss": 0.3208, + "step": 4807 + }, + { + "epoch": 0.38464, + "grad_norm": 1.3634998798370361, + "learning_rate": 6.779897106338595e-06, + "loss": 0.2869, + "step": 4808 + }, + { + "epoch": 0.38472, + "grad_norm": 1.6127568483352661, + "learning_rate": 6.778721790303494e-06, + "loss": 0.3086, + "step": 4809 + }, + { + "epoch": 0.3848, + "grad_norm": 1.515779733657837, + "learning_rate": 6.777546361734616e-06, + "loss": 0.3614, + "step": 4810 + }, + { + "epoch": 0.38488, + "grad_norm": 1.6880332231521606, + "learning_rate": 6.7763708207063286e-06, + "loss": 0.3785, + "step": 4811 + }, + { + "epoch": 0.38496, + "grad_norm": 1.5529166460037231, + "learning_rate": 6.775195167293003e-06, + "loss": 0.3199, + "step": 4812 + }, + { + "epoch": 0.38504, + "grad_norm": 1.7941169738769531, + "learning_rate": 6.7740194015690175e-06, + "loss": 0.4966, + "step": 4813 + }, + { + "epoch": 0.38512, + "grad_norm": 1.9801031351089478, + "learning_rate": 6.772843523608762e-06, + "loss": 0.3773, + "step": 4814 + }, + { + "epoch": 0.3852, + "grad_norm": 1.4930901527404785, + "learning_rate": 6.771667533486628e-06, + "loss": 0.3426, + "step": 4815 + }, + { + "epoch": 0.38528, + "grad_norm": 1.4776076078414917, + "learning_rate": 6.770491431277017e-06, + "loss": 0.3437, + "step": 4816 + }, + { + "epoch": 0.38536, + "grad_norm": 1.2407801151275635, + "learning_rate": 6.7693152170543365e-06, + "loss": 0.2532, + "step": 4817 + }, + { + "epoch": 0.38544, + "grad_norm": 1.7486261129379272, + "learning_rate": 6.768138890893004e-06, + "loss": 0.353, + "step": 4818 + }, + { + "epoch": 0.38552, + "grad_norm": 2.0305092334747314, + "learning_rate": 6.766962452867439e-06, + "loss": 0.4327, + "step": 4819 + }, + { + "epoch": 0.3856, + "grad_norm": 1.3625537157058716, + "learning_rate": 6.765785903052072e-06, + "loss": 0.2888, + "step": 4820 + }, + { + "epoch": 0.38568, + "grad_norm": 1.9757248163223267, + "learning_rate": 6.764609241521339e-06, + "loss": 0.4204, + "step": 4821 + }, + { + "epoch": 0.38576, + "grad_norm": 1.4327483177185059, + "learning_rate": 6.763432468349684e-06, + "loss": 0.297, + "step": 4822 + }, + { + "epoch": 0.38584, + "grad_norm": 1.9168870449066162, + "learning_rate": 6.762255583611557e-06, + "loss": 0.435, + "step": 4823 + }, + { + "epoch": 0.38592, + "grad_norm": 1.9956098794937134, + "learning_rate": 6.761078587381416e-06, + "loss": 0.3668, + "step": 4824 + }, + { + "epoch": 0.386, + "grad_norm": 1.5235965251922607, + "learning_rate": 6.759901479733727e-06, + "loss": 0.3124, + "step": 4825 + }, + { + "epoch": 0.38608, + "grad_norm": 1.2701911926269531, + "learning_rate": 6.7587242607429585e-06, + "loss": 0.2314, + "step": 4826 + }, + { + "epoch": 0.38616, + "grad_norm": 2.225844144821167, + "learning_rate": 6.7575469304835905e-06, + "loss": 0.467, + "step": 4827 + }, + { + "epoch": 0.38624, + "grad_norm": 1.8088279962539673, + "learning_rate": 6.756369489030109e-06, + "loss": 0.4096, + "step": 4828 + }, + { + "epoch": 0.38632, + "grad_norm": 1.5828858613967896, + "learning_rate": 6.755191936457006e-06, + "loss": 0.3111, + "step": 4829 + }, + { + "epoch": 0.3864, + "grad_norm": 2.1753506660461426, + "learning_rate": 6.754014272838782e-06, + "loss": 0.3957, + "step": 4830 + }, + { + "epoch": 0.38648, + "grad_norm": 1.830126166343689, + "learning_rate": 6.752836498249946e-06, + "loss": 0.3539, + "step": 4831 + }, + { + "epoch": 0.38656, + "grad_norm": 1.4874694347381592, + "learning_rate": 6.751658612765008e-06, + "loss": 0.3003, + "step": 4832 + }, + { + "epoch": 0.38664, + "grad_norm": 1.2147022485733032, + "learning_rate": 6.750480616458492e-06, + "loss": 0.2779, + "step": 4833 + }, + { + "epoch": 0.38672, + "grad_norm": 1.595517873764038, + "learning_rate": 6.749302509404924e-06, + "loss": 0.3439, + "step": 4834 + }, + { + "epoch": 0.3868, + "grad_norm": 1.3618996143341064, + "learning_rate": 6.748124291678839e-06, + "loss": 0.3388, + "step": 4835 + }, + { + "epoch": 0.38688, + "grad_norm": 1.43161940574646, + "learning_rate": 6.74694596335478e-06, + "loss": 0.2954, + "step": 4836 + }, + { + "epoch": 0.38696, + "grad_norm": 1.5240548849105835, + "learning_rate": 6.745767524507296e-06, + "loss": 0.2675, + "step": 4837 + }, + { + "epoch": 0.38704, + "grad_norm": 1.5682686567306519, + "learning_rate": 6.744588975210939e-06, + "loss": 0.3627, + "step": 4838 + }, + { + "epoch": 0.38712, + "grad_norm": 1.8606730699539185, + "learning_rate": 6.743410315540277e-06, + "loss": 0.3254, + "step": 4839 + }, + { + "epoch": 0.3872, + "grad_norm": 1.5660977363586426, + "learning_rate": 6.742231545569879e-06, + "loss": 0.4426, + "step": 4840 + }, + { + "epoch": 0.38728, + "grad_norm": 1.6271204948425293, + "learning_rate": 6.741052665374318e-06, + "loss": 0.3316, + "step": 4841 + }, + { + "epoch": 0.38736, + "grad_norm": 1.4664791822433472, + "learning_rate": 6.739873675028182e-06, + "loss": 0.3391, + "step": 4842 + }, + { + "epoch": 0.38744, + "grad_norm": 1.467913269996643, + "learning_rate": 6.738694574606059e-06, + "loss": 0.2995, + "step": 4843 + }, + { + "epoch": 0.38752, + "grad_norm": 1.3580753803253174, + "learning_rate": 6.73751536418255e-06, + "loss": 0.282, + "step": 4844 + }, + { + "epoch": 0.3876, + "grad_norm": 1.404686689376831, + "learning_rate": 6.736336043832255e-06, + "loss": 0.3058, + "step": 4845 + }, + { + "epoch": 0.38768, + "grad_norm": 1.6138893365859985, + "learning_rate": 6.73515661362979e-06, + "loss": 0.3093, + "step": 4846 + }, + { + "epoch": 0.38776, + "grad_norm": 1.5155889987945557, + "learning_rate": 6.733977073649774e-06, + "loss": 0.2914, + "step": 4847 + }, + { + "epoch": 0.38784, + "grad_norm": 1.9643943309783936, + "learning_rate": 6.732797423966828e-06, + "loss": 0.3926, + "step": 4848 + }, + { + "epoch": 0.38792, + "grad_norm": 1.6930899620056152, + "learning_rate": 6.731617664655586e-06, + "loss": 0.4257, + "step": 4849 + }, + { + "epoch": 0.388, + "grad_norm": 1.435917854309082, + "learning_rate": 6.73043779579069e-06, + "loss": 0.3335, + "step": 4850 + }, + { + "epoch": 0.38808, + "grad_norm": 1.7660568952560425, + "learning_rate": 6.7292578174467835e-06, + "loss": 0.3879, + "step": 4851 + }, + { + "epoch": 0.38816, + "grad_norm": 1.6072431802749634, + "learning_rate": 6.728077729698521e-06, + "loss": 0.3515, + "step": 4852 + }, + { + "epoch": 0.38824, + "grad_norm": 1.5983455181121826, + "learning_rate": 6.726897532620564e-06, + "loss": 0.3655, + "step": 4853 + }, + { + "epoch": 0.38832, + "grad_norm": 1.5852428674697876, + "learning_rate": 6.725717226287578e-06, + "loss": 0.3414, + "step": 4854 + }, + { + "epoch": 0.3884, + "grad_norm": 1.3663932085037231, + "learning_rate": 6.724536810774237e-06, + "loss": 0.2862, + "step": 4855 + }, + { + "epoch": 0.38848, + "grad_norm": 1.508460521697998, + "learning_rate": 6.723356286155223e-06, + "loss": 0.3473, + "step": 4856 + }, + { + "epoch": 0.38856, + "grad_norm": 2.041231870651245, + "learning_rate": 6.722175652505222e-06, + "loss": 0.3408, + "step": 4857 + }, + { + "epoch": 0.38864, + "grad_norm": 1.6656372547149658, + "learning_rate": 6.72099490989893e-06, + "loss": 0.3285, + "step": 4858 + }, + { + "epoch": 0.38872, + "grad_norm": 1.4684637784957886, + "learning_rate": 6.719814058411049e-06, + "loss": 0.3015, + "step": 4859 + }, + { + "epoch": 0.3888, + "grad_norm": 1.4412018060684204, + "learning_rate": 6.718633098116288e-06, + "loss": 0.3504, + "step": 4860 + }, + { + "epoch": 0.38888, + "grad_norm": 1.2994134426116943, + "learning_rate": 6.7174520290893594e-06, + "loss": 0.3174, + "step": 4861 + }, + { + "epoch": 0.38896, + "grad_norm": 2.318960428237915, + "learning_rate": 6.716270851404989e-06, + "loss": 0.5163, + "step": 4862 + }, + { + "epoch": 0.38904, + "grad_norm": 1.9144383668899536, + "learning_rate": 6.715089565137904e-06, + "loss": 0.3362, + "step": 4863 + }, + { + "epoch": 0.38912, + "grad_norm": 1.4348660707473755, + "learning_rate": 6.713908170362841e-06, + "loss": 0.383, + "step": 4864 + }, + { + "epoch": 0.3892, + "grad_norm": 1.6656911373138428, + "learning_rate": 6.712726667154542e-06, + "loss": 0.3147, + "step": 4865 + }, + { + "epoch": 0.38928, + "grad_norm": 1.9064844846725464, + "learning_rate": 6.711545055587759e-06, + "loss": 0.5062, + "step": 4866 + }, + { + "epoch": 0.38936, + "grad_norm": 1.5904337167739868, + "learning_rate": 6.710363335737246e-06, + "loss": 0.3469, + "step": 4867 + }, + { + "epoch": 0.38944, + "grad_norm": 1.8707817792892456, + "learning_rate": 6.709181507677769e-06, + "loss": 0.3365, + "step": 4868 + }, + { + "epoch": 0.38952, + "grad_norm": 1.9314450025558472, + "learning_rate": 6.7079995714840965e-06, + "loss": 0.3962, + "step": 4869 + }, + { + "epoch": 0.3896, + "grad_norm": 1.5333346128463745, + "learning_rate": 6.706817527231006e-06, + "loss": 0.304, + "step": 4870 + }, + { + "epoch": 0.38968, + "grad_norm": 1.4997177124023438, + "learning_rate": 6.70563537499328e-06, + "loss": 0.3568, + "step": 4871 + }, + { + "epoch": 0.38976, + "grad_norm": 2.006237745285034, + "learning_rate": 6.704453114845712e-06, + "loss": 0.4563, + "step": 4872 + }, + { + "epoch": 0.38984, + "grad_norm": 2.0449886322021484, + "learning_rate": 6.7032707468631e-06, + "loss": 0.4209, + "step": 4873 + }, + { + "epoch": 0.38992, + "grad_norm": 1.588484525680542, + "learning_rate": 6.702088271120245e-06, + "loss": 0.3102, + "step": 4874 + }, + { + "epoch": 0.39, + "grad_norm": 1.401896595954895, + "learning_rate": 6.700905687691961e-06, + "loss": 0.3116, + "step": 4875 + }, + { + "epoch": 0.39008, + "grad_norm": 1.8815094232559204, + "learning_rate": 6.699722996653065e-06, + "loss": 0.481, + "step": 4876 + }, + { + "epoch": 0.39016, + "grad_norm": 1.3775348663330078, + "learning_rate": 6.6985401980783826e-06, + "loss": 0.2989, + "step": 4877 + }, + { + "epoch": 0.39024, + "grad_norm": 1.5223547220230103, + "learning_rate": 6.697357292042746e-06, + "loss": 0.3186, + "step": 4878 + }, + { + "epoch": 0.39032, + "grad_norm": 1.3824423551559448, + "learning_rate": 6.6961742786209925e-06, + "loss": 0.2951, + "step": 4879 + }, + { + "epoch": 0.3904, + "grad_norm": 1.2650786638259888, + "learning_rate": 6.694991157887967e-06, + "loss": 0.2914, + "step": 4880 + }, + { + "epoch": 0.39048, + "grad_norm": 1.40303373336792, + "learning_rate": 6.693807929918523e-06, + "loss": 0.3228, + "step": 4881 + }, + { + "epoch": 0.39056, + "grad_norm": 1.308904767036438, + "learning_rate": 6.692624594787519e-06, + "loss": 0.248, + "step": 4882 + }, + { + "epoch": 0.39064, + "grad_norm": 1.8516181707382202, + "learning_rate": 6.69144115256982e-06, + "loss": 0.4687, + "step": 4883 + }, + { + "epoch": 0.39072, + "grad_norm": 1.5992012023925781, + "learning_rate": 6.6902576033403e-06, + "loss": 0.363, + "step": 4884 + }, + { + "epoch": 0.3908, + "grad_norm": 2.104926109313965, + "learning_rate": 6.689073947173835e-06, + "loss": 0.3872, + "step": 4885 + }, + { + "epoch": 0.39088, + "grad_norm": 1.384028434753418, + "learning_rate": 6.6878901841453135e-06, + "loss": 0.3222, + "step": 4886 + }, + { + "epoch": 0.39096, + "grad_norm": 1.4685722589492798, + "learning_rate": 6.6867063143296285e-06, + "loss": 0.3079, + "step": 4887 + }, + { + "epoch": 0.39104, + "grad_norm": 1.832414984703064, + "learning_rate": 6.685522337801678e-06, + "loss": 0.3503, + "step": 4888 + }, + { + "epoch": 0.39112, + "grad_norm": 1.4985992908477783, + "learning_rate": 6.68433825463637e-06, + "loss": 0.343, + "step": 4889 + }, + { + "epoch": 0.3912, + "grad_norm": 1.4093996286392212, + "learning_rate": 6.6831540649086165e-06, + "loss": 0.3069, + "step": 4890 + }, + { + "epoch": 0.39128, + "grad_norm": 1.5814567804336548, + "learning_rate": 6.681969768693336e-06, + "loss": 0.3293, + "step": 4891 + }, + { + "epoch": 0.39136, + "grad_norm": 1.4069743156433105, + "learning_rate": 6.680785366065458e-06, + "loss": 0.3107, + "step": 4892 + }, + { + "epoch": 0.39144, + "grad_norm": 1.08096182346344, + "learning_rate": 6.679600857099913e-06, + "loss": 0.2361, + "step": 4893 + }, + { + "epoch": 0.39152, + "grad_norm": 1.1872847080230713, + "learning_rate": 6.67841624187164e-06, + "loss": 0.3018, + "step": 4894 + }, + { + "epoch": 0.3916, + "grad_norm": 1.4051975011825562, + "learning_rate": 6.6772315204555896e-06, + "loss": 0.3002, + "step": 4895 + }, + { + "epoch": 0.39168, + "grad_norm": 1.3737307786941528, + "learning_rate": 6.676046692926712e-06, + "loss": 0.2727, + "step": 4896 + }, + { + "epoch": 0.39176, + "grad_norm": 1.3034765720367432, + "learning_rate": 6.674861759359967e-06, + "loss": 0.3119, + "step": 4897 + }, + { + "epoch": 0.39184, + "grad_norm": 1.5516210794448853, + "learning_rate": 6.6736767198303245e-06, + "loss": 0.2893, + "step": 4898 + }, + { + "epoch": 0.39192, + "grad_norm": 1.2979193925857544, + "learning_rate": 6.672491574412755e-06, + "loss": 0.2884, + "step": 4899 + }, + { + "epoch": 0.392, + "grad_norm": 1.7181899547576904, + "learning_rate": 6.671306323182239e-06, + "loss": 0.3425, + "step": 4900 + }, + { + "epoch": 0.39208, + "grad_norm": 1.7885940074920654, + "learning_rate": 6.6701209662137665e-06, + "loss": 0.3883, + "step": 4901 + }, + { + "epoch": 0.39216, + "grad_norm": 2.0114247798919678, + "learning_rate": 6.668935503582328e-06, + "loss": 0.347, + "step": 4902 + }, + { + "epoch": 0.39224, + "grad_norm": 1.6095882654190063, + "learning_rate": 6.667749935362922e-06, + "loss": 0.3343, + "step": 4903 + }, + { + "epoch": 0.39232, + "grad_norm": 1.5267553329467773, + "learning_rate": 6.6665642616305615e-06, + "loss": 0.3258, + "step": 4904 + }, + { + "epoch": 0.3924, + "grad_norm": 1.5813835859298706, + "learning_rate": 6.665378482460254e-06, + "loss": 0.3566, + "step": 4905 + }, + { + "epoch": 0.39248, + "grad_norm": 1.3659838438034058, + "learning_rate": 6.664192597927024e-06, + "loss": 0.3428, + "step": 4906 + }, + { + "epoch": 0.39256, + "grad_norm": 1.4735257625579834, + "learning_rate": 6.663006608105897e-06, + "loss": 0.3298, + "step": 4907 + }, + { + "epoch": 0.39264, + "grad_norm": 1.72593092918396, + "learning_rate": 6.6618205130719034e-06, + "loss": 0.3582, + "step": 4908 + }, + { + "epoch": 0.39272, + "grad_norm": 1.6992379426956177, + "learning_rate": 6.660634312900089e-06, + "loss": 0.3377, + "step": 4909 + }, + { + "epoch": 0.3928, + "grad_norm": 1.6187288761138916, + "learning_rate": 6.659448007665496e-06, + "loss": 0.2841, + "step": 4910 + }, + { + "epoch": 0.39288, + "grad_norm": 1.6214263439178467, + "learning_rate": 6.658261597443182e-06, + "loss": 0.4277, + "step": 4911 + }, + { + "epoch": 0.39296, + "grad_norm": 1.5356061458587646, + "learning_rate": 6.657075082308204e-06, + "loss": 0.3734, + "step": 4912 + }, + { + "epoch": 0.39304, + "grad_norm": 1.5965813398361206, + "learning_rate": 6.655888462335631e-06, + "loss": 0.3108, + "step": 4913 + }, + { + "epoch": 0.39312, + "grad_norm": 1.5381505489349365, + "learning_rate": 6.654701737600536e-06, + "loss": 0.3088, + "step": 4914 + }, + { + "epoch": 0.3932, + "grad_norm": 1.7315890789031982, + "learning_rate": 6.653514908177999e-06, + "loss": 0.3554, + "step": 4915 + }, + { + "epoch": 0.39328, + "grad_norm": 1.5875256061553955, + "learning_rate": 6.652327974143106e-06, + "loss": 0.355, + "step": 4916 + }, + { + "epoch": 0.39336, + "grad_norm": 1.443566083908081, + "learning_rate": 6.65114093557095e-06, + "loss": 0.2864, + "step": 4917 + }, + { + "epoch": 0.39344, + "grad_norm": 1.1734968423843384, + "learning_rate": 6.649953792536632e-06, + "loss": 0.2207, + "step": 4918 + }, + { + "epoch": 0.39352, + "grad_norm": 1.580723524093628, + "learning_rate": 6.648766545115258e-06, + "loss": 0.3254, + "step": 4919 + }, + { + "epoch": 0.3936, + "grad_norm": 1.456050157546997, + "learning_rate": 6.647579193381942e-06, + "loss": 0.3356, + "step": 4920 + }, + { + "epoch": 0.39368, + "grad_norm": 0.9805746078491211, + "learning_rate": 6.646391737411803e-06, + "loss": 0.1802, + "step": 4921 + }, + { + "epoch": 0.39376, + "grad_norm": 1.4817471504211426, + "learning_rate": 6.645204177279968e-06, + "loss": 0.4042, + "step": 4922 + }, + { + "epoch": 0.39384, + "grad_norm": 1.3607851266860962, + "learning_rate": 6.644016513061569e-06, + "loss": 0.3145, + "step": 4923 + }, + { + "epoch": 0.39392, + "grad_norm": 1.8702746629714966, + "learning_rate": 6.642828744831746e-06, + "loss": 0.3977, + "step": 4924 + }, + { + "epoch": 0.394, + "grad_norm": 1.4964871406555176, + "learning_rate": 6.641640872665647e-06, + "loss": 0.3943, + "step": 4925 + }, + { + "epoch": 0.39408, + "grad_norm": 1.3008499145507812, + "learning_rate": 6.640452896638421e-06, + "loss": 0.3038, + "step": 4926 + }, + { + "epoch": 0.39416, + "grad_norm": 1.3478739261627197, + "learning_rate": 6.639264816825231e-06, + "loss": 0.3404, + "step": 4927 + }, + { + "epoch": 0.39424, + "grad_norm": 1.33133864402771, + "learning_rate": 6.638076633301239e-06, + "loss": 0.3305, + "step": 4928 + }, + { + "epoch": 0.39432, + "grad_norm": 1.773242712020874, + "learning_rate": 6.63688834614162e-06, + "loss": 0.3097, + "step": 4929 + }, + { + "epoch": 0.3944, + "grad_norm": 1.4814815521240234, + "learning_rate": 6.635699955421553e-06, + "loss": 0.3365, + "step": 4930 + }, + { + "epoch": 0.39448, + "grad_norm": 1.6367100477218628, + "learning_rate": 6.6345114612162235e-06, + "loss": 0.319, + "step": 4931 + }, + { + "epoch": 0.39456, + "grad_norm": 1.5399671792984009, + "learning_rate": 6.633322863600822e-06, + "loss": 0.2835, + "step": 4932 + }, + { + "epoch": 0.39464, + "grad_norm": 1.9397109746932983, + "learning_rate": 6.632134162650547e-06, + "loss": 0.3802, + "step": 4933 + }, + { + "epoch": 0.39472, + "grad_norm": 1.1391005516052246, + "learning_rate": 6.630945358440606e-06, + "loss": 0.2647, + "step": 4934 + }, + { + "epoch": 0.3948, + "grad_norm": 1.9419512748718262, + "learning_rate": 6.6297564510462075e-06, + "loss": 0.3465, + "step": 4935 + }, + { + "epoch": 0.39488, + "grad_norm": 1.6960084438323975, + "learning_rate": 6.6285674405425746e-06, + "loss": 0.3128, + "step": 4936 + }, + { + "epoch": 0.39496, + "grad_norm": 1.3205006122589111, + "learning_rate": 6.627378327004927e-06, + "loss": 0.2925, + "step": 4937 + }, + { + "epoch": 0.39504, + "grad_norm": 1.6510692834854126, + "learning_rate": 6.626189110508498e-06, + "loss": 0.3364, + "step": 4938 + }, + { + "epoch": 0.39512, + "grad_norm": 1.390600323677063, + "learning_rate": 6.624999791128527e-06, + "loss": 0.2794, + "step": 4939 + }, + { + "epoch": 0.3952, + "grad_norm": 1.7261474132537842, + "learning_rate": 6.623810368940254e-06, + "loss": 0.3829, + "step": 4940 + }, + { + "epoch": 0.39528, + "grad_norm": 1.63364839553833, + "learning_rate": 6.6226208440189334e-06, + "loss": 0.3381, + "step": 4941 + }, + { + "epoch": 0.39536, + "grad_norm": 2.4538915157318115, + "learning_rate": 6.621431216439822e-06, + "loss": 0.4066, + "step": 4942 + }, + { + "epoch": 0.39544, + "grad_norm": 1.6599690914154053, + "learning_rate": 6.620241486278181e-06, + "loss": 0.3646, + "step": 4943 + }, + { + "epoch": 0.39552, + "grad_norm": 1.3066409826278687, + "learning_rate": 6.619051653609284e-06, + "loss": 0.2685, + "step": 4944 + }, + { + "epoch": 0.3956, + "grad_norm": 1.3815221786499023, + "learning_rate": 6.6178617185084045e-06, + "loss": 0.2759, + "step": 4945 + }, + { + "epoch": 0.39568, + "grad_norm": 2.038173198699951, + "learning_rate": 6.616671681050829e-06, + "loss": 0.3673, + "step": 4946 + }, + { + "epoch": 0.39576, + "grad_norm": 1.204803466796875, + "learning_rate": 6.615481541311846e-06, + "loss": 0.2506, + "step": 4947 + }, + { + "epoch": 0.39584, + "grad_norm": 1.624482274055481, + "learning_rate": 6.614291299366751e-06, + "loss": 0.3633, + "step": 4948 + }, + { + "epoch": 0.39592, + "grad_norm": 1.5271668434143066, + "learning_rate": 6.613100955290845e-06, + "loss": 0.4242, + "step": 4949 + }, + { + "epoch": 0.396, + "grad_norm": 1.756809949874878, + "learning_rate": 6.611910509159443e-06, + "loss": 0.5438, + "step": 4950 + }, + { + "epoch": 0.39608, + "grad_norm": 1.434124231338501, + "learning_rate": 6.610719961047853e-06, + "loss": 0.2511, + "step": 4951 + }, + { + "epoch": 0.39616, + "grad_norm": 1.770772933959961, + "learning_rate": 6.609529311031402e-06, + "loss": 0.5268, + "step": 4952 + }, + { + "epoch": 0.39624, + "grad_norm": 1.617964267730713, + "learning_rate": 6.608338559185417e-06, + "loss": 0.3534, + "step": 4953 + }, + { + "epoch": 0.39632, + "grad_norm": 1.4315916299819946, + "learning_rate": 6.607147705585233e-06, + "loss": 0.2874, + "step": 4954 + }, + { + "epoch": 0.3964, + "grad_norm": 1.1115981340408325, + "learning_rate": 6.6059567503061905e-06, + "loss": 0.2664, + "step": 4955 + }, + { + "epoch": 0.39648, + "grad_norm": 1.2988982200622559, + "learning_rate": 6.604765693423637e-06, + "loss": 0.2593, + "step": 4956 + }, + { + "epoch": 0.39656, + "grad_norm": 1.7615128755569458, + "learning_rate": 6.6035745350129285e-06, + "loss": 0.4727, + "step": 4957 + }, + { + "epoch": 0.39664, + "grad_norm": 1.4541040658950806, + "learning_rate": 6.602383275149425e-06, + "loss": 0.3868, + "step": 4958 + }, + { + "epoch": 0.39672, + "grad_norm": 1.7334989309310913, + "learning_rate": 6.601191913908495e-06, + "loss": 0.3599, + "step": 4959 + }, + { + "epoch": 0.3968, + "grad_norm": 2.142695188522339, + "learning_rate": 6.600000451365508e-06, + "loss": 0.4301, + "step": 4960 + }, + { + "epoch": 0.39688, + "grad_norm": 1.5532326698303223, + "learning_rate": 6.598808887595847e-06, + "loss": 0.3237, + "step": 4961 + }, + { + "epoch": 0.39696, + "grad_norm": 1.5868544578552246, + "learning_rate": 6.5976172226748965e-06, + "loss": 0.3802, + "step": 4962 + }, + { + "epoch": 0.39704, + "grad_norm": 1.2728503942489624, + "learning_rate": 6.596425456678051e-06, + "loss": 0.3255, + "step": 4963 + }, + { + "epoch": 0.39712, + "grad_norm": 1.2911931276321411, + "learning_rate": 6.595233589680708e-06, + "loss": 0.2739, + "step": 4964 + }, + { + "epoch": 0.3972, + "grad_norm": 1.98240065574646, + "learning_rate": 6.594041621758272e-06, + "loss": 0.4007, + "step": 4965 + }, + { + "epoch": 0.39728, + "grad_norm": 1.2459379434585571, + "learning_rate": 6.592849552986159e-06, + "loss": 0.2915, + "step": 4966 + }, + { + "epoch": 0.39736, + "grad_norm": 1.5010486841201782, + "learning_rate": 6.591657383439782e-06, + "loss": 0.3646, + "step": 4967 + }, + { + "epoch": 0.39744, + "grad_norm": 1.6176382303237915, + "learning_rate": 6.590465113194569e-06, + "loss": 0.2956, + "step": 4968 + }, + { + "epoch": 0.39752, + "grad_norm": 1.6453156471252441, + "learning_rate": 6.58927274232595e-06, + "loss": 0.3022, + "step": 4969 + }, + { + "epoch": 0.3976, + "grad_norm": 1.58073091506958, + "learning_rate": 6.588080270909363e-06, + "loss": 0.3601, + "step": 4970 + }, + { + "epoch": 0.39768, + "grad_norm": 1.4092754125595093, + "learning_rate": 6.586887699020252e-06, + "loss": 0.3454, + "step": 4971 + }, + { + "epoch": 0.39776, + "grad_norm": 1.4815930128097534, + "learning_rate": 6.585695026734065e-06, + "loss": 0.3053, + "step": 4972 + }, + { + "epoch": 0.39784, + "grad_norm": 1.828636884689331, + "learning_rate": 6.584502254126258e-06, + "loss": 0.3526, + "step": 4973 + }, + { + "epoch": 0.39792, + "grad_norm": 1.1700973510742188, + "learning_rate": 6.583309381272296e-06, + "loss": 0.2726, + "step": 4974 + }, + { + "epoch": 0.398, + "grad_norm": 1.965246319770813, + "learning_rate": 6.582116408247647e-06, + "loss": 0.3759, + "step": 4975 + }, + { + "epoch": 0.39808, + "grad_norm": 1.8582028150558472, + "learning_rate": 6.580923335127787e-06, + "loss": 0.3591, + "step": 4976 + }, + { + "epoch": 0.39816, + "grad_norm": 1.556747317314148, + "learning_rate": 6.579730161988197e-06, + "loss": 0.401, + "step": 4977 + }, + { + "epoch": 0.39824, + "grad_norm": 1.0947948694229126, + "learning_rate": 6.578536888904367e-06, + "loss": 0.201, + "step": 4978 + }, + { + "epoch": 0.39832, + "grad_norm": 1.6996358633041382, + "learning_rate": 6.577343515951787e-06, + "loss": 0.3871, + "step": 4979 + }, + { + "epoch": 0.3984, + "grad_norm": 1.6404669284820557, + "learning_rate": 6.576150043205962e-06, + "loss": 0.3518, + "step": 4980 + }, + { + "epoch": 0.39848, + "grad_norm": 1.4954818487167358, + "learning_rate": 6.574956470742398e-06, + "loss": 0.2871, + "step": 4981 + }, + { + "epoch": 0.39856, + "grad_norm": 1.6011344194412231, + "learning_rate": 6.573762798636608e-06, + "loss": 0.3557, + "step": 4982 + }, + { + "epoch": 0.39864, + "grad_norm": 1.716171383857727, + "learning_rate": 6.57256902696411e-06, + "loss": 0.3298, + "step": 4983 + }, + { + "epoch": 0.39872, + "grad_norm": 1.5395945310592651, + "learning_rate": 6.571375155800434e-06, + "loss": 0.4424, + "step": 4984 + }, + { + "epoch": 0.3988, + "grad_norm": 1.4372494220733643, + "learning_rate": 6.570181185221109e-06, + "loss": 0.3702, + "step": 4985 + }, + { + "epoch": 0.39888, + "grad_norm": 1.6280698776245117, + "learning_rate": 6.5689871153016726e-06, + "loss": 0.3452, + "step": 4986 + }, + { + "epoch": 0.39896, + "grad_norm": 1.3518006801605225, + "learning_rate": 6.567792946117673e-06, + "loss": 0.2486, + "step": 4987 + }, + { + "epoch": 0.39904, + "grad_norm": 1.4701176881790161, + "learning_rate": 6.566598677744658e-06, + "loss": 0.3986, + "step": 4988 + }, + { + "epoch": 0.39912, + "grad_norm": 1.6428401470184326, + "learning_rate": 6.565404310258189e-06, + "loss": 0.3209, + "step": 4989 + }, + { + "epoch": 0.3992, + "grad_norm": 1.4474351406097412, + "learning_rate": 6.5642098437338245e-06, + "loss": 0.2907, + "step": 4990 + }, + { + "epoch": 0.39928, + "grad_norm": 1.8276543617248535, + "learning_rate": 6.563015278247138e-06, + "loss": 0.4284, + "step": 4991 + }, + { + "epoch": 0.39936, + "grad_norm": 1.533747673034668, + "learning_rate": 6.561820613873704e-06, + "loss": 0.3259, + "step": 4992 + }, + { + "epoch": 0.39944, + "grad_norm": 1.8883445262908936, + "learning_rate": 6.560625850689106e-06, + "loss": 0.3844, + "step": 4993 + }, + { + "epoch": 0.39952, + "grad_norm": 1.6395256519317627, + "learning_rate": 6.559430988768934e-06, + "loss": 0.348, + "step": 4994 + }, + { + "epoch": 0.3996, + "grad_norm": 1.8606520891189575, + "learning_rate": 6.55823602818878e-06, + "loss": 0.353, + "step": 4995 + }, + { + "epoch": 0.39968, + "grad_norm": 1.542927861213684, + "learning_rate": 6.557040969024246e-06, + "loss": 0.3696, + "step": 4996 + }, + { + "epoch": 0.39976, + "grad_norm": 1.8894641399383545, + "learning_rate": 6.55584581135094e-06, + "loss": 0.3741, + "step": 4997 + }, + { + "epoch": 0.39984, + "grad_norm": 1.6438285112380981, + "learning_rate": 6.554650555244476e-06, + "loss": 0.3712, + "step": 4998 + }, + { + "epoch": 0.39992, + "grad_norm": 1.3953157663345337, + "learning_rate": 6.5534552007804715e-06, + "loss": 0.3146, + "step": 4999 + }, + { + "epoch": 0.4, + "grad_norm": 2.405379295349121, + "learning_rate": 6.5522597480345564e-06, + "loss": 0.6259, + "step": 5000 + }, + { + "epoch": 0.40008, + "grad_norm": 1.4833146333694458, + "learning_rate": 6.551064197082361e-06, + "loss": 0.3241, + "step": 5001 + }, + { + "epoch": 0.40016, + "grad_norm": 1.4547537565231323, + "learning_rate": 6.549868547999523e-06, + "loss": 0.3425, + "step": 5002 + }, + { + "epoch": 0.40024, + "grad_norm": 1.2077940702438354, + "learning_rate": 6.548672800861686e-06, + "loss": 0.2656, + "step": 5003 + }, + { + "epoch": 0.40032, + "grad_norm": 1.5668079853057861, + "learning_rate": 6.547476955744505e-06, + "loss": 0.3011, + "step": 5004 + }, + { + "epoch": 0.4004, + "grad_norm": 1.4270055294036865, + "learning_rate": 6.546281012723634e-06, + "loss": 0.2762, + "step": 5005 + }, + { + "epoch": 0.40048, + "grad_norm": 1.215239405632019, + "learning_rate": 6.545084971874738e-06, + "loss": 0.2474, + "step": 5006 + }, + { + "epoch": 0.40056, + "grad_norm": 1.7257269620895386, + "learning_rate": 6.543888833273486e-06, + "loss": 0.3579, + "step": 5007 + }, + { + "epoch": 0.40064, + "grad_norm": 1.6582391262054443, + "learning_rate": 6.5426925969955524e-06, + "loss": 0.3379, + "step": 5008 + }, + { + "epoch": 0.40072, + "grad_norm": 1.4435465335845947, + "learning_rate": 6.54149626311662e-06, + "loss": 0.3572, + "step": 5009 + }, + { + "epoch": 0.4008, + "grad_norm": 1.3516596555709839, + "learning_rate": 6.5402998317123765e-06, + "loss": 0.3044, + "step": 5010 + }, + { + "epoch": 0.40088, + "grad_norm": 1.3053381443023682, + "learning_rate": 6.539103302858517e-06, + "loss": 0.2901, + "step": 5011 + }, + { + "epoch": 0.40096, + "grad_norm": 1.8313065767288208, + "learning_rate": 6.537906676630741e-06, + "loss": 0.369, + "step": 5012 + }, + { + "epoch": 0.40104, + "grad_norm": 1.388274908065796, + "learning_rate": 6.536709953104756e-06, + "loss": 0.2789, + "step": 5013 + }, + { + "epoch": 0.40112, + "grad_norm": 2.075113296508789, + "learning_rate": 6.535513132356275e-06, + "loss": 0.5173, + "step": 5014 + }, + { + "epoch": 0.4012, + "grad_norm": 1.500080943107605, + "learning_rate": 6.534316214461014e-06, + "loss": 0.4011, + "step": 5015 + }, + { + "epoch": 0.40128, + "grad_norm": 1.7447412014007568, + "learning_rate": 6.533119199494702e-06, + "loss": 0.3899, + "step": 5016 + }, + { + "epoch": 0.40136, + "grad_norm": 1.5773173570632935, + "learning_rate": 6.531922087533067e-06, + "loss": 0.2934, + "step": 5017 + }, + { + "epoch": 0.40144, + "grad_norm": 1.36077880859375, + "learning_rate": 6.530724878651846e-06, + "loss": 0.2573, + "step": 5018 + }, + { + "epoch": 0.40152, + "grad_norm": 1.796042799949646, + "learning_rate": 6.5295275729267874e-06, + "loss": 0.3084, + "step": 5019 + }, + { + "epoch": 0.4016, + "grad_norm": 1.5335108041763306, + "learning_rate": 6.528330170433634e-06, + "loss": 0.3073, + "step": 5020 + }, + { + "epoch": 0.40168, + "grad_norm": 1.5497459173202515, + "learning_rate": 6.527132671248145e-06, + "loss": 0.3406, + "step": 5021 + }, + { + "epoch": 0.40176, + "grad_norm": 1.802089810371399, + "learning_rate": 6.525935075446081e-06, + "loss": 0.5411, + "step": 5022 + }, + { + "epoch": 0.40184, + "grad_norm": 1.59895658493042, + "learning_rate": 6.524737383103212e-06, + "loss": 0.3232, + "step": 5023 + }, + { + "epoch": 0.40192, + "grad_norm": 1.7338372468948364, + "learning_rate": 6.523539594295309e-06, + "loss": 0.3667, + "step": 5024 + }, + { + "epoch": 0.402, + "grad_norm": 1.4909459352493286, + "learning_rate": 6.5223417090981565e-06, + "loss": 0.3944, + "step": 5025 + }, + { + "epoch": 0.40208, + "grad_norm": 1.567449927330017, + "learning_rate": 6.521143727587536e-06, + "loss": 0.3354, + "step": 5026 + }, + { + "epoch": 0.40216, + "grad_norm": 1.2927497625350952, + "learning_rate": 6.519945649839241e-06, + "loss": 0.2782, + "step": 5027 + }, + { + "epoch": 0.40224, + "grad_norm": 1.6907589435577393, + "learning_rate": 6.518747475929073e-06, + "loss": 0.4177, + "step": 5028 + }, + { + "epoch": 0.40232, + "grad_norm": 1.5473442077636719, + "learning_rate": 6.517549205932832e-06, + "loss": 0.4238, + "step": 5029 + }, + { + "epoch": 0.4024, + "grad_norm": 1.1968157291412354, + "learning_rate": 6.51635083992633e-06, + "loss": 0.2689, + "step": 5030 + }, + { + "epoch": 0.40248, + "grad_norm": 1.7409216165542603, + "learning_rate": 6.515152377985385e-06, + "loss": 0.5057, + "step": 5031 + }, + { + "epoch": 0.40256, + "grad_norm": 1.6909297704696655, + "learning_rate": 6.513953820185819e-06, + "loss": 0.3212, + "step": 5032 + }, + { + "epoch": 0.40264, + "grad_norm": 1.6705437898635864, + "learning_rate": 6.512755166603459e-06, + "loss": 0.4307, + "step": 5033 + }, + { + "epoch": 0.40272, + "grad_norm": 1.2356233596801758, + "learning_rate": 6.511556417314142e-06, + "loss": 0.2553, + "step": 5034 + }, + { + "epoch": 0.4028, + "grad_norm": 1.6176491975784302, + "learning_rate": 6.510357572393709e-06, + "loss": 0.3523, + "step": 5035 + }, + { + "epoch": 0.40288, + "grad_norm": 1.7396360635757446, + "learning_rate": 6.509158631918006e-06, + "loss": 0.4734, + "step": 5036 + }, + { + "epoch": 0.40296, + "grad_norm": 1.2196110486984253, + "learning_rate": 6.507959595962885e-06, + "loss": 0.3238, + "step": 5037 + }, + { + "epoch": 0.40304, + "grad_norm": 1.266950249671936, + "learning_rate": 6.506760464604206e-06, + "loss": 0.2835, + "step": 5038 + }, + { + "epoch": 0.40312, + "grad_norm": 1.3458009958267212, + "learning_rate": 6.5055612379178355e-06, + "loss": 0.278, + "step": 5039 + }, + { + "epoch": 0.4032, + "grad_norm": 1.686802864074707, + "learning_rate": 6.504361915979643e-06, + "loss": 0.391, + "step": 5040 + }, + { + "epoch": 0.40328, + "grad_norm": 1.483590841293335, + "learning_rate": 6.503162498865504e-06, + "loss": 0.3368, + "step": 5041 + }, + { + "epoch": 0.40336, + "grad_norm": 1.6863470077514648, + "learning_rate": 6.501962986651305e-06, + "loss": 0.3795, + "step": 5042 + }, + { + "epoch": 0.40344, + "grad_norm": 1.6084065437316895, + "learning_rate": 6.500763379412932e-06, + "loss": 0.4195, + "step": 5043 + }, + { + "epoch": 0.40352, + "grad_norm": 1.6228187084197998, + "learning_rate": 6.499563677226281e-06, + "loss": 0.4213, + "step": 5044 + }, + { + "epoch": 0.4036, + "grad_norm": 1.7429604530334473, + "learning_rate": 6.498363880167256e-06, + "loss": 0.3857, + "step": 5045 + }, + { + "epoch": 0.40368, + "grad_norm": 0.9011700749397278, + "learning_rate": 6.49716398831176e-06, + "loss": 0.2047, + "step": 5046 + }, + { + "epoch": 0.40376, + "grad_norm": 1.3937594890594482, + "learning_rate": 6.4959640017357086e-06, + "loss": 0.331, + "step": 5047 + }, + { + "epoch": 0.40384, + "grad_norm": 1.7457072734832764, + "learning_rate": 6.494763920515021e-06, + "loss": 0.3449, + "step": 5048 + }, + { + "epoch": 0.40392, + "grad_norm": 1.6541334390640259, + "learning_rate": 6.493563744725621e-06, + "loss": 0.3061, + "step": 5049 + }, + { + "epoch": 0.404, + "grad_norm": 1.375489592552185, + "learning_rate": 6.49236347444344e-06, + "loss": 0.3041, + "step": 5050 + }, + { + "epoch": 0.40408, + "grad_norm": 1.7582788467407227, + "learning_rate": 6.491163109744416e-06, + "loss": 0.4728, + "step": 5051 + }, + { + "epoch": 0.40416, + "grad_norm": 1.3825554847717285, + "learning_rate": 6.489962650704491e-06, + "loss": 0.3534, + "step": 5052 + }, + { + "epoch": 0.40424, + "grad_norm": 1.3173959255218506, + "learning_rate": 6.4887620973996145e-06, + "loss": 0.2669, + "step": 5053 + }, + { + "epoch": 0.40432, + "grad_norm": 1.8339534997940063, + "learning_rate": 6.487561449905744e-06, + "loss": 0.3635, + "step": 5054 + }, + { + "epoch": 0.4044, + "grad_norm": 1.564780831336975, + "learning_rate": 6.4863607082988345e-06, + "loss": 0.3529, + "step": 5055 + }, + { + "epoch": 0.40448, + "grad_norm": 1.4042036533355713, + "learning_rate": 6.485159872654858e-06, + "loss": 0.2709, + "step": 5056 + }, + { + "epoch": 0.40456, + "grad_norm": 1.776026725769043, + "learning_rate": 6.483958943049785e-06, + "loss": 0.4495, + "step": 5057 + }, + { + "epoch": 0.40464, + "grad_norm": 1.5129555463790894, + "learning_rate": 6.482757919559594e-06, + "loss": 0.3174, + "step": 5058 + }, + { + "epoch": 0.40472, + "grad_norm": 1.493586540222168, + "learning_rate": 6.481556802260273e-06, + "loss": 0.3843, + "step": 5059 + }, + { + "epoch": 0.4048, + "grad_norm": 1.5436426401138306, + "learning_rate": 6.4803555912278106e-06, + "loss": 0.3938, + "step": 5060 + }, + { + "epoch": 0.40488, + "grad_norm": 1.4713767766952515, + "learning_rate": 6.4791542865382015e-06, + "loss": 0.3512, + "step": 5061 + }, + { + "epoch": 0.40496, + "grad_norm": 1.3419086933135986, + "learning_rate": 6.477952888267451e-06, + "loss": 0.3038, + "step": 5062 + }, + { + "epoch": 0.40504, + "grad_norm": 1.6907343864440918, + "learning_rate": 6.476751396491566e-06, + "loss": 0.3195, + "step": 5063 + }, + { + "epoch": 0.40512, + "grad_norm": 1.6713168621063232, + "learning_rate": 6.475549811286561e-06, + "loss": 0.3558, + "step": 5064 + }, + { + "epoch": 0.4052, + "grad_norm": 1.596295714378357, + "learning_rate": 6.474348132728457e-06, + "loss": 0.3346, + "step": 5065 + }, + { + "epoch": 0.40528, + "grad_norm": 1.508131980895996, + "learning_rate": 6.473146360893281e-06, + "loss": 0.341, + "step": 5066 + }, + { + "epoch": 0.40536, + "grad_norm": 1.456239104270935, + "learning_rate": 6.4719444958570635e-06, + "loss": 0.3222, + "step": 5067 + }, + { + "epoch": 0.40544, + "grad_norm": 1.7773851156234741, + "learning_rate": 6.470742537695842e-06, + "loss": 0.4088, + "step": 5068 + }, + { + "epoch": 0.40552, + "grad_norm": 1.5959703922271729, + "learning_rate": 6.469540486485662e-06, + "loss": 0.3296, + "step": 5069 + }, + { + "epoch": 0.4056, + "grad_norm": 1.6706092357635498, + "learning_rate": 6.468338342302571e-06, + "loss": 0.3701, + "step": 5070 + }, + { + "epoch": 0.40568, + "grad_norm": 1.721103549003601, + "learning_rate": 6.467136105222627e-06, + "loss": 0.3396, + "step": 5071 + }, + { + "epoch": 0.40576, + "grad_norm": 1.8070697784423828, + "learning_rate": 6.465933775321891e-06, + "loss": 0.3382, + "step": 5072 + }, + { + "epoch": 0.40584, + "grad_norm": 1.7369276285171509, + "learning_rate": 6.46473135267643e-06, + "loss": 0.3338, + "step": 5073 + }, + { + "epoch": 0.40592, + "grad_norm": 1.6247727870941162, + "learning_rate": 6.463528837362319e-06, + "loss": 0.3429, + "step": 5074 + }, + { + "epoch": 0.406, + "grad_norm": 1.6333701610565186, + "learning_rate": 6.4623262294556335e-06, + "loss": 0.4485, + "step": 5075 + }, + { + "epoch": 0.40608, + "grad_norm": 1.7924059629440308, + "learning_rate": 6.46112352903246e-06, + "loss": 0.2964, + "step": 5076 + }, + { + "epoch": 0.40616, + "grad_norm": 1.6652660369873047, + "learning_rate": 6.4599207361688895e-06, + "loss": 0.389, + "step": 5077 + }, + { + "epoch": 0.40624, + "grad_norm": 1.426188349723816, + "learning_rate": 6.458717850941021e-06, + "loss": 0.286, + "step": 5078 + }, + { + "epoch": 0.40632, + "grad_norm": 1.797730565071106, + "learning_rate": 6.4575148734249535e-06, + "loss": 0.4004, + "step": 5079 + }, + { + "epoch": 0.4064, + "grad_norm": 1.5713727474212646, + "learning_rate": 6.456311803696797e-06, + "loss": 0.3227, + "step": 5080 + }, + { + "epoch": 0.40648, + "grad_norm": 1.5011001825332642, + "learning_rate": 6.455108641832666e-06, + "loss": 0.3584, + "step": 5081 + }, + { + "epoch": 0.40656, + "grad_norm": 1.5323615074157715, + "learning_rate": 6.453905387908679e-06, + "loss": 0.3302, + "step": 5082 + }, + { + "epoch": 0.40664, + "grad_norm": 1.2687554359436035, + "learning_rate": 6.452702042000964e-06, + "loss": 0.2704, + "step": 5083 + }, + { + "epoch": 0.40672, + "grad_norm": 1.6853008270263672, + "learning_rate": 6.4514986041856506e-06, + "loss": 0.3229, + "step": 5084 + }, + { + "epoch": 0.4068, + "grad_norm": 1.6804182529449463, + "learning_rate": 6.450295074538879e-06, + "loss": 0.3343, + "step": 5085 + }, + { + "epoch": 0.40688, + "grad_norm": 1.2748759984970093, + "learning_rate": 6.449091453136789e-06, + "loss": 0.2909, + "step": 5086 + }, + { + "epoch": 0.40696, + "grad_norm": 1.752908706665039, + "learning_rate": 6.447887740055532e-06, + "loss": 0.3041, + "step": 5087 + }, + { + "epoch": 0.40704, + "grad_norm": 1.6320358514785767, + "learning_rate": 6.446683935371262e-06, + "loss": 0.3439, + "step": 5088 + }, + { + "epoch": 0.40712, + "grad_norm": 1.5598769187927246, + "learning_rate": 6.445480039160141e-06, + "loss": 0.3354, + "step": 5089 + }, + { + "epoch": 0.4072, + "grad_norm": 1.6841180324554443, + "learning_rate": 6.444276051498334e-06, + "loss": 0.3556, + "step": 5090 + }, + { + "epoch": 0.40728, + "grad_norm": 1.4227954149246216, + "learning_rate": 6.443071972462013e-06, + "loss": 0.331, + "step": 5091 + }, + { + "epoch": 0.40736, + "grad_norm": 1.5023025274276733, + "learning_rate": 6.441867802127357e-06, + "loss": 0.3008, + "step": 5092 + }, + { + "epoch": 0.40744, + "grad_norm": 1.3716212511062622, + "learning_rate": 6.44066354057055e-06, + "loss": 0.313, + "step": 5093 + }, + { + "epoch": 0.40752, + "grad_norm": 1.4322949647903442, + "learning_rate": 6.43945918786778e-06, + "loss": 0.3185, + "step": 5094 + }, + { + "epoch": 0.4076, + "grad_norm": 1.3301920890808105, + "learning_rate": 6.438254744095247e-06, + "loss": 0.2555, + "step": 5095 + }, + { + "epoch": 0.40768, + "grad_norm": 1.4562873840332031, + "learning_rate": 6.437050209329147e-06, + "loss": 0.2832, + "step": 5096 + }, + { + "epoch": 0.40776, + "grad_norm": 2.004117965698242, + "learning_rate": 6.435845583645688e-06, + "loss": 0.3892, + "step": 5097 + }, + { + "epoch": 0.40784, + "grad_norm": 1.5688518285751343, + "learning_rate": 6.434640867121084e-06, + "loss": 0.372, + "step": 5098 + }, + { + "epoch": 0.40792, + "grad_norm": 1.229386806488037, + "learning_rate": 6.433436059831552e-06, + "loss": 0.2306, + "step": 5099 + }, + { + "epoch": 0.408, + "grad_norm": 1.6038435697555542, + "learning_rate": 6.432231161853317e-06, + "loss": 0.3113, + "step": 5100 + }, + { + "epoch": 0.40808, + "grad_norm": 1.4195326566696167, + "learning_rate": 6.431026173262609e-06, + "loss": 0.3563, + "step": 5101 + }, + { + "epoch": 0.40816, + "grad_norm": 1.3468478918075562, + "learning_rate": 6.429821094135663e-06, + "loss": 0.3255, + "step": 5102 + }, + { + "epoch": 0.40824, + "grad_norm": 1.5173962116241455, + "learning_rate": 6.42861592454872e-06, + "loss": 0.3577, + "step": 5103 + }, + { + "epoch": 0.40832, + "grad_norm": 1.4830057621002197, + "learning_rate": 6.427410664578029e-06, + "loss": 0.3339, + "step": 5104 + }, + { + "epoch": 0.4084, + "grad_norm": 1.4850959777832031, + "learning_rate": 6.42620531429984e-06, + "loss": 0.3405, + "step": 5105 + }, + { + "epoch": 0.40848, + "grad_norm": 1.410702109336853, + "learning_rate": 6.424999873790414e-06, + "loss": 0.3099, + "step": 5106 + }, + { + "epoch": 0.40856, + "grad_norm": 1.793792486190796, + "learning_rate": 6.423794343126013e-06, + "loss": 0.3515, + "step": 5107 + }, + { + "epoch": 0.40864, + "grad_norm": 1.6600618362426758, + "learning_rate": 6.422588722382909e-06, + "loss": 0.2998, + "step": 5108 + }, + { + "epoch": 0.40872, + "grad_norm": 1.887887954711914, + "learning_rate": 6.4213830116373766e-06, + "loss": 0.3906, + "step": 5109 + }, + { + "epoch": 0.4088, + "grad_norm": 1.5626739263534546, + "learning_rate": 6.4201772109656956e-06, + "loss": 0.3316, + "step": 5110 + }, + { + "epoch": 0.40888, + "grad_norm": 1.23286771774292, + "learning_rate": 6.418971320444155e-06, + "loss": 0.3091, + "step": 5111 + }, + { + "epoch": 0.40896, + "grad_norm": 1.5620896816253662, + "learning_rate": 6.417765340149047e-06, + "loss": 0.3231, + "step": 5112 + }, + { + "epoch": 0.40904, + "grad_norm": 1.6344822645187378, + "learning_rate": 6.41655927015667e-06, + "loss": 0.4167, + "step": 5113 + }, + { + "epoch": 0.40912, + "grad_norm": 2.126511335372925, + "learning_rate": 6.4153531105433276e-06, + "loss": 0.4323, + "step": 5114 + }, + { + "epoch": 0.4092, + "grad_norm": 1.6435238122940063, + "learning_rate": 6.4141468613853295e-06, + "loss": 0.3263, + "step": 5115 + }, + { + "epoch": 0.40928, + "grad_norm": 1.6907066106796265, + "learning_rate": 6.412940522758992e-06, + "loss": 0.3561, + "step": 5116 + }, + { + "epoch": 0.40936, + "grad_norm": 1.7472286224365234, + "learning_rate": 6.411734094740634e-06, + "loss": 0.375, + "step": 5117 + }, + { + "epoch": 0.40944, + "grad_norm": 1.2598026990890503, + "learning_rate": 6.4105275774065846e-06, + "loss": 0.2562, + "step": 5118 + }, + { + "epoch": 0.40952, + "grad_norm": 1.2201982736587524, + "learning_rate": 6.409320970833176e-06, + "loss": 0.271, + "step": 5119 + }, + { + "epoch": 0.4096, + "grad_norm": 2.0412731170654297, + "learning_rate": 6.408114275096743e-06, + "loss": 0.3964, + "step": 5120 + }, + { + "epoch": 0.40968, + "grad_norm": 1.8842438459396362, + "learning_rate": 6.406907490273633e-06, + "loss": 0.427, + "step": 5121 + }, + { + "epoch": 0.40976, + "grad_norm": 1.1666141748428345, + "learning_rate": 6.405700616440191e-06, + "loss": 0.2608, + "step": 5122 + }, + { + "epoch": 0.40984, + "grad_norm": 1.613979697227478, + "learning_rate": 6.404493653672776e-06, + "loss": 0.34, + "step": 5123 + }, + { + "epoch": 0.40992, + "grad_norm": 1.5906524658203125, + "learning_rate": 6.403286602047748e-06, + "loss": 0.4057, + "step": 5124 + }, + { + "epoch": 0.41, + "grad_norm": 1.3950166702270508, + "learning_rate": 6.4020794616414725e-06, + "loss": 0.3133, + "step": 5125 + }, + { + "epoch": 0.41008, + "grad_norm": 1.232621431350708, + "learning_rate": 6.40087223253032e-06, + "loss": 0.2786, + "step": 5126 + }, + { + "epoch": 0.41016, + "grad_norm": 1.6018669605255127, + "learning_rate": 6.3996649147906675e-06, + "loss": 0.2908, + "step": 5127 + }, + { + "epoch": 0.41024, + "grad_norm": 1.5088248252868652, + "learning_rate": 6.3984575084989e-06, + "loss": 0.2802, + "step": 5128 + }, + { + "epoch": 0.41032, + "grad_norm": 1.2472470998764038, + "learning_rate": 6.3972500137314066e-06, + "loss": 0.26, + "step": 5129 + }, + { + "epoch": 0.4104, + "grad_norm": 1.7943204641342163, + "learning_rate": 6.396042430564577e-06, + "loss": 0.3852, + "step": 5130 + }, + { + "epoch": 0.41048, + "grad_norm": 1.9341444969177246, + "learning_rate": 6.394834759074817e-06, + "loss": 0.392, + "step": 5131 + }, + { + "epoch": 0.41056, + "grad_norm": 1.7144229412078857, + "learning_rate": 6.393626999338527e-06, + "loss": 0.3879, + "step": 5132 + }, + { + "epoch": 0.41064, + "grad_norm": 1.494651198387146, + "learning_rate": 6.392419151432121e-06, + "loss": 0.3137, + "step": 5133 + }, + { + "epoch": 0.41072, + "grad_norm": 1.3737579584121704, + "learning_rate": 6.3912112154320135e-06, + "loss": 0.3458, + "step": 5134 + }, + { + "epoch": 0.4108, + "grad_norm": 1.907779574394226, + "learning_rate": 6.390003191414627e-06, + "loss": 0.368, + "step": 5135 + }, + { + "epoch": 0.41088, + "grad_norm": 1.4840914011001587, + "learning_rate": 6.388795079456392e-06, + "loss": 0.3191, + "step": 5136 + }, + { + "epoch": 0.41096, + "grad_norm": 1.8090542554855347, + "learning_rate": 6.387586879633736e-06, + "loss": 0.366, + "step": 5137 + }, + { + "epoch": 0.41104, + "grad_norm": 1.704310417175293, + "learning_rate": 6.386378592023103e-06, + "loss": 0.3767, + "step": 5138 + }, + { + "epoch": 0.41112, + "grad_norm": 1.956494927406311, + "learning_rate": 6.385170216700934e-06, + "loss": 0.3635, + "step": 5139 + }, + { + "epoch": 0.4112, + "grad_norm": 1.283545732498169, + "learning_rate": 6.383961753743681e-06, + "loss": 0.2738, + "step": 5140 + }, + { + "epoch": 0.41128, + "grad_norm": 1.40552818775177, + "learning_rate": 6.382753203227799e-06, + "loss": 0.3511, + "step": 5141 + }, + { + "epoch": 0.41136, + "grad_norm": 1.738274335861206, + "learning_rate": 6.381544565229749e-06, + "loss": 0.4043, + "step": 5142 + }, + { + "epoch": 0.41144, + "grad_norm": 1.6897915601730347, + "learning_rate": 6.380335839825996e-06, + "loss": 0.3846, + "step": 5143 + }, + { + "epoch": 0.41152, + "grad_norm": 1.5609641075134277, + "learning_rate": 6.379127027093013e-06, + "loss": 0.3149, + "step": 5144 + }, + { + "epoch": 0.4116, + "grad_norm": 1.7952966690063477, + "learning_rate": 6.377918127107277e-06, + "loss": 0.3799, + "step": 5145 + }, + { + "epoch": 0.41168, + "grad_norm": 1.743720293045044, + "learning_rate": 6.376709139945273e-06, + "loss": 0.3468, + "step": 5146 + }, + { + "epoch": 0.41176, + "grad_norm": 1.60451078414917, + "learning_rate": 6.375500065683486e-06, + "loss": 0.3882, + "step": 5147 + }, + { + "epoch": 0.41184, + "grad_norm": 1.7727147340774536, + "learning_rate": 6.374290904398416e-06, + "loss": 0.367, + "step": 5148 + }, + { + "epoch": 0.41192, + "grad_norm": 1.4645249843597412, + "learning_rate": 6.373081656166557e-06, + "loss": 0.3483, + "step": 5149 + }, + { + "epoch": 0.412, + "grad_norm": 1.3234875202178955, + "learning_rate": 6.371872321064414e-06, + "loss": 0.2924, + "step": 5150 + }, + { + "epoch": 0.41208, + "grad_norm": 1.859861969947815, + "learning_rate": 6.370662899168501e-06, + "loss": 0.3757, + "step": 5151 + }, + { + "epoch": 0.41216, + "grad_norm": 1.227710485458374, + "learning_rate": 6.3694533905553346e-06, + "loss": 0.3016, + "step": 5152 + }, + { + "epoch": 0.41224, + "grad_norm": 1.643816351890564, + "learning_rate": 6.368243795301432e-06, + "loss": 0.3616, + "step": 5153 + }, + { + "epoch": 0.41232, + "grad_norm": 1.3771861791610718, + "learning_rate": 6.3670341134833235e-06, + "loss": 0.3275, + "step": 5154 + }, + { + "epoch": 0.4124, + "grad_norm": 1.796872854232788, + "learning_rate": 6.3658243451775404e-06, + "loss": 0.3235, + "step": 5155 + }, + { + "epoch": 0.41248, + "grad_norm": 1.5238722562789917, + "learning_rate": 6.3646144904606214e-06, + "loss": 0.3444, + "step": 5156 + }, + { + "epoch": 0.41256, + "grad_norm": 1.4715383052825928, + "learning_rate": 6.363404549409109e-06, + "loss": 0.3818, + "step": 5157 + }, + { + "epoch": 0.41264, + "grad_norm": 1.8105343580245972, + "learning_rate": 6.362194522099553e-06, + "loss": 0.4152, + "step": 5158 + }, + { + "epoch": 0.41272, + "grad_norm": 1.5102286338806152, + "learning_rate": 6.360984408608507e-06, + "loss": 0.3434, + "step": 5159 + }, + { + "epoch": 0.4128, + "grad_norm": 1.4143092632293701, + "learning_rate": 6.359774209012532e-06, + "loss": 0.3415, + "step": 5160 + }, + { + "epoch": 0.41288, + "grad_norm": 1.791418433189392, + "learning_rate": 6.358563923388194e-06, + "loss": 0.3569, + "step": 5161 + }, + { + "epoch": 0.41296, + "grad_norm": 1.825252890586853, + "learning_rate": 6.35735355181206e-06, + "loss": 0.3656, + "step": 5162 + }, + { + "epoch": 0.41304, + "grad_norm": 1.6053016185760498, + "learning_rate": 6.3561430943607105e-06, + "loss": 0.3853, + "step": 5163 + }, + { + "epoch": 0.41312, + "grad_norm": 1.2607011795043945, + "learning_rate": 6.354932551110724e-06, + "loss": 0.259, + "step": 5164 + }, + { + "epoch": 0.4132, + "grad_norm": 1.842266321182251, + "learning_rate": 6.3537219221386885e-06, + "loss": 0.4299, + "step": 5165 + }, + { + "epoch": 0.41328, + "grad_norm": 1.4322572946548462, + "learning_rate": 6.352511207521197e-06, + "loss": 0.3413, + "step": 5166 + }, + { + "epoch": 0.41336, + "grad_norm": 1.5515720844268799, + "learning_rate": 6.3513004073348465e-06, + "loss": 0.2727, + "step": 5167 + }, + { + "epoch": 0.41344, + "grad_norm": 1.2557002305984497, + "learning_rate": 6.35008952165624e-06, + "loss": 0.3055, + "step": 5168 + }, + { + "epoch": 0.41352, + "grad_norm": 1.6650270223617554, + "learning_rate": 6.3488785505619875e-06, + "loss": 0.3785, + "step": 5169 + }, + { + "epoch": 0.4136, + "grad_norm": 1.5979880094528198, + "learning_rate": 6.347667494128702e-06, + "loss": 0.3962, + "step": 5170 + }, + { + "epoch": 0.41368, + "grad_norm": 2.0301644802093506, + "learning_rate": 6.3464563524330034e-06, + "loss": 0.3584, + "step": 5171 + }, + { + "epoch": 0.41376, + "grad_norm": 1.6554670333862305, + "learning_rate": 6.345245125551518e-06, + "loss": 0.3181, + "step": 5172 + }, + { + "epoch": 0.41384, + "grad_norm": 1.6080145835876465, + "learning_rate": 6.344033813560875e-06, + "loss": 0.5606, + "step": 5173 + }, + { + "epoch": 0.41392, + "grad_norm": 1.3876444101333618, + "learning_rate": 6.342822416537708e-06, + "loss": 0.4102, + "step": 5174 + }, + { + "epoch": 0.414, + "grad_norm": 1.4370923042297363, + "learning_rate": 6.341610934558662e-06, + "loss": 0.3051, + "step": 5175 + }, + { + "epoch": 0.41408, + "grad_norm": 1.7648351192474365, + "learning_rate": 6.340399367700379e-06, + "loss": 0.4381, + "step": 5176 + }, + { + "epoch": 0.41416, + "grad_norm": 1.2672674655914307, + "learning_rate": 6.339187716039514e-06, + "loss": 0.2934, + "step": 5177 + }, + { + "epoch": 0.41424, + "grad_norm": 1.697166085243225, + "learning_rate": 6.337975979652723e-06, + "loss": 0.3332, + "step": 5178 + }, + { + "epoch": 0.41432, + "grad_norm": 1.230234980583191, + "learning_rate": 6.336764158616669e-06, + "loss": 0.302, + "step": 5179 + }, + { + "epoch": 0.4144, + "grad_norm": 1.0796769857406616, + "learning_rate": 6.335552253008018e-06, + "loss": 0.2222, + "step": 5180 + }, + { + "epoch": 0.41448, + "grad_norm": 1.8272526264190674, + "learning_rate": 6.3343402629034456e-06, + "loss": 0.4759, + "step": 5181 + }, + { + "epoch": 0.41456, + "grad_norm": 2.0101819038391113, + "learning_rate": 6.333128188379629e-06, + "loss": 0.4156, + "step": 5182 + }, + { + "epoch": 0.41464, + "grad_norm": 1.8950108289718628, + "learning_rate": 6.3319160295132544e-06, + "loss": 0.3522, + "step": 5183 + }, + { + "epoch": 0.41472, + "grad_norm": 1.547398328781128, + "learning_rate": 6.3307037863810075e-06, + "loss": 0.274, + "step": 5184 + }, + { + "epoch": 0.4148, + "grad_norm": 1.70745849609375, + "learning_rate": 6.329491459059584e-06, + "loss": 0.3314, + "step": 5185 + }, + { + "epoch": 0.41488, + "grad_norm": 1.835158348083496, + "learning_rate": 6.328279047625687e-06, + "loss": 0.4241, + "step": 5186 + }, + { + "epoch": 0.41496, + "grad_norm": 1.7435204982757568, + "learning_rate": 6.327066552156018e-06, + "loss": 0.4203, + "step": 5187 + }, + { + "epoch": 0.41504, + "grad_norm": 1.393307089805603, + "learning_rate": 6.325853972727288e-06, + "loss": 0.2993, + "step": 5188 + }, + { + "epoch": 0.41512, + "grad_norm": 1.5650931596755981, + "learning_rate": 6.324641309416215e-06, + "loss": 0.3302, + "step": 5189 + }, + { + "epoch": 0.4152, + "grad_norm": 1.2271121740341187, + "learning_rate": 6.323428562299516e-06, + "loss": 0.2451, + "step": 5190 + }, + { + "epoch": 0.41528, + "grad_norm": 1.5217052698135376, + "learning_rate": 6.322215731453922e-06, + "loss": 0.3315, + "step": 5191 + }, + { + "epoch": 0.41536, + "grad_norm": 1.3272700309753418, + "learning_rate": 6.321002816956162e-06, + "loss": 0.273, + "step": 5192 + }, + { + "epoch": 0.41544, + "grad_norm": 1.313028335571289, + "learning_rate": 6.3197898188829736e-06, + "loss": 0.2515, + "step": 5193 + }, + { + "epoch": 0.41552, + "grad_norm": 1.6728700399398804, + "learning_rate": 6.3185767373111e-06, + "loss": 0.4031, + "step": 5194 + }, + { + "epoch": 0.4156, + "grad_norm": 1.6574065685272217, + "learning_rate": 6.317363572317289e-06, + "loss": 0.3371, + "step": 5195 + }, + { + "epoch": 0.41568, + "grad_norm": 1.8870322704315186, + "learning_rate": 6.316150323978291e-06, + "loss": 0.3752, + "step": 5196 + }, + { + "epoch": 0.41576, + "grad_norm": 1.766973853111267, + "learning_rate": 6.3149369923708656e-06, + "loss": 0.3786, + "step": 5197 + }, + { + "epoch": 0.41584, + "grad_norm": 1.1528550386428833, + "learning_rate": 6.313723577571779e-06, + "loss": 0.2809, + "step": 5198 + }, + { + "epoch": 0.41592, + "grad_norm": 1.3488599061965942, + "learning_rate": 6.3125100796577956e-06, + "loss": 0.3158, + "step": 5199 + }, + { + "epoch": 0.416, + "grad_norm": 1.634043574333191, + "learning_rate": 6.311296498705691e-06, + "loss": 0.3833, + "step": 5200 + }, + { + "epoch": 0.41608, + "grad_norm": 1.544848084449768, + "learning_rate": 6.310082834792246e-06, + "loss": 0.3798, + "step": 5201 + }, + { + "epoch": 0.41616, + "grad_norm": 1.647979736328125, + "learning_rate": 6.308869087994243e-06, + "loss": 0.3992, + "step": 5202 + }, + { + "epoch": 0.41624, + "grad_norm": 1.9953075647354126, + "learning_rate": 6.307655258388471e-06, + "loss": 0.4337, + "step": 5203 + }, + { + "epoch": 0.41632, + "grad_norm": 1.6946043968200684, + "learning_rate": 6.306441346051727e-06, + "loss": 0.4, + "step": 5204 + }, + { + "epoch": 0.4164, + "grad_norm": 1.4132634401321411, + "learning_rate": 6.305227351060809e-06, + "loss": 0.2904, + "step": 5205 + }, + { + "epoch": 0.41648, + "grad_norm": 1.3155604600906372, + "learning_rate": 6.304013273492526e-06, + "loss": 0.2809, + "step": 5206 + }, + { + "epoch": 0.41656, + "grad_norm": 1.562078595161438, + "learning_rate": 6.302799113423686e-06, + "loss": 0.323, + "step": 5207 + }, + { + "epoch": 0.41664, + "grad_norm": 1.5895940065383911, + "learning_rate": 6.3015848709311055e-06, + "loss": 0.332, + "step": 5208 + }, + { + "epoch": 0.41672, + "grad_norm": 1.8904573917388916, + "learning_rate": 6.300370546091605e-06, + "loss": 0.4672, + "step": 5209 + }, + { + "epoch": 0.4168, + "grad_norm": 1.3617215156555176, + "learning_rate": 6.299156138982011e-06, + "loss": 0.2794, + "step": 5210 + }, + { + "epoch": 0.41688, + "grad_norm": 1.2635133266448975, + "learning_rate": 6.2979416496791545e-06, + "loss": 0.2584, + "step": 5211 + }, + { + "epoch": 0.41696, + "grad_norm": 1.8867225646972656, + "learning_rate": 6.2967270782598735e-06, + "loss": 0.4578, + "step": 5212 + }, + { + "epoch": 0.41704, + "grad_norm": 1.5454792976379395, + "learning_rate": 6.295512424801009e-06, + "loss": 0.3402, + "step": 5213 + }, + { + "epoch": 0.41712, + "grad_norm": 1.5802242755889893, + "learning_rate": 6.2942976893794085e-06, + "loss": 0.2926, + "step": 5214 + }, + { + "epoch": 0.4172, + "grad_norm": 1.3554202318191528, + "learning_rate": 6.293082872071923e-06, + "loss": 0.3325, + "step": 5215 + }, + { + "epoch": 0.41728, + "grad_norm": 1.8882520198822021, + "learning_rate": 6.291867972955411e-06, + "loss": 0.463, + "step": 5216 + }, + { + "epoch": 0.41736, + "grad_norm": 1.6655519008636475, + "learning_rate": 6.290652992106735e-06, + "loss": 0.3663, + "step": 5217 + }, + { + "epoch": 0.41744, + "grad_norm": 1.787269115447998, + "learning_rate": 6.289437929602763e-06, + "loss": 0.358, + "step": 5218 + }, + { + "epoch": 0.41752, + "grad_norm": 1.5246529579162598, + "learning_rate": 6.288222785520368e-06, + "loss": 0.3243, + "step": 5219 + }, + { + "epoch": 0.4176, + "grad_norm": 1.75812828540802, + "learning_rate": 6.287007559936426e-06, + "loss": 0.3917, + "step": 5220 + }, + { + "epoch": 0.41768, + "grad_norm": 1.2533934116363525, + "learning_rate": 6.285792252927826e-06, + "loss": 0.3793, + "step": 5221 + }, + { + "epoch": 0.41776, + "grad_norm": 1.364107608795166, + "learning_rate": 6.284576864571449e-06, + "loss": 0.3192, + "step": 5222 + }, + { + "epoch": 0.41784, + "grad_norm": 1.3123117685317993, + "learning_rate": 6.283361394944193e-06, + "loss": 0.3421, + "step": 5223 + }, + { + "epoch": 0.41792, + "grad_norm": 1.1852989196777344, + "learning_rate": 6.282145844122956e-06, + "loss": 0.2568, + "step": 5224 + }, + { + "epoch": 0.418, + "grad_norm": 1.9244329929351807, + "learning_rate": 6.2809302121846415e-06, + "loss": 0.5014, + "step": 5225 + }, + { + "epoch": 0.41808, + "grad_norm": 1.4049652814865112, + "learning_rate": 6.279714499206157e-06, + "loss": 0.3108, + "step": 5226 + }, + { + "epoch": 0.41816, + "grad_norm": 1.9336098432540894, + "learning_rate": 6.27849870526442e-06, + "loss": 0.3474, + "step": 5227 + }, + { + "epoch": 0.41824, + "grad_norm": 1.9458836317062378, + "learning_rate": 6.277282830436346e-06, + "loss": 0.3939, + "step": 5228 + }, + { + "epoch": 0.41832, + "grad_norm": 2.153730869293213, + "learning_rate": 6.276066874798862e-06, + "loss": 0.4109, + "step": 5229 + }, + { + "epoch": 0.4184, + "grad_norm": 1.5371249914169312, + "learning_rate": 6.274850838428896e-06, + "loss": 0.3472, + "step": 5230 + }, + { + "epoch": 0.41848, + "grad_norm": 1.760744571685791, + "learning_rate": 6.273634721403385e-06, + "loss": 0.3729, + "step": 5231 + }, + { + "epoch": 0.41856, + "grad_norm": 1.7278648614883423, + "learning_rate": 6.272418523799266e-06, + "loss": 0.3745, + "step": 5232 + }, + { + "epoch": 0.41864, + "grad_norm": 1.7429898977279663, + "learning_rate": 6.271202245693484e-06, + "loss": 0.3522, + "step": 5233 + }, + { + "epoch": 0.41872, + "grad_norm": 1.5041425228118896, + "learning_rate": 6.269985887162988e-06, + "loss": 0.3114, + "step": 5234 + }, + { + "epoch": 0.4188, + "grad_norm": 1.4007189273834229, + "learning_rate": 6.268769448284736e-06, + "loss": 0.2983, + "step": 5235 + }, + { + "epoch": 0.41888, + "grad_norm": 1.4600435495376587, + "learning_rate": 6.267552929135688e-06, + "loss": 0.3146, + "step": 5236 + }, + { + "epoch": 0.41896, + "grad_norm": 1.7335883378982544, + "learning_rate": 6.266336329792804e-06, + "loss": 0.5255, + "step": 5237 + }, + { + "epoch": 0.41904, + "grad_norm": 1.3453007936477661, + "learning_rate": 6.265119650333059e-06, + "loss": 0.3394, + "step": 5238 + }, + { + "epoch": 0.41912, + "grad_norm": 1.1924803256988525, + "learning_rate": 6.263902890833427e-06, + "loss": 0.2937, + "step": 5239 + }, + { + "epoch": 0.4192, + "grad_norm": 1.4555060863494873, + "learning_rate": 6.2626860513708875e-06, + "loss": 0.2826, + "step": 5240 + }, + { + "epoch": 0.41928, + "grad_norm": 1.6543127298355103, + "learning_rate": 6.261469132022426e-06, + "loss": 0.3925, + "step": 5241 + }, + { + "epoch": 0.41936, + "grad_norm": 1.2936607599258423, + "learning_rate": 6.260252132865035e-06, + "loss": 0.2698, + "step": 5242 + }, + { + "epoch": 0.41944, + "grad_norm": 1.5900158882141113, + "learning_rate": 6.259035053975708e-06, + "loss": 0.35, + "step": 5243 + }, + { + "epoch": 0.41952, + "grad_norm": 1.5317350625991821, + "learning_rate": 6.257817895431446e-06, + "loss": 0.3286, + "step": 5244 + }, + { + "epoch": 0.4196, + "grad_norm": 1.4321229457855225, + "learning_rate": 6.256600657309254e-06, + "loss": 0.2899, + "step": 5245 + }, + { + "epoch": 0.41968, + "grad_norm": 1.6110750436782837, + "learning_rate": 6.255383339686143e-06, + "loss": 0.3403, + "step": 5246 + }, + { + "epoch": 0.41976, + "grad_norm": 1.6724129915237427, + "learning_rate": 6.254165942639128e-06, + "loss": 0.3894, + "step": 5247 + }, + { + "epoch": 0.41984, + "grad_norm": 1.6472164392471313, + "learning_rate": 6.252948466245232e-06, + "loss": 0.4126, + "step": 5248 + }, + { + "epoch": 0.41992, + "grad_norm": 1.4509929418563843, + "learning_rate": 6.251730910581478e-06, + "loss": 0.3197, + "step": 5249 + }, + { + "epoch": 0.42, + "grad_norm": 1.9968012571334839, + "learning_rate": 6.250513275724896e-06, + "loss": 0.3536, + "step": 5250 + }, + { + "epoch": 0.42008, + "grad_norm": 1.2318665981292725, + "learning_rate": 6.249295561752525e-06, + "loss": 0.322, + "step": 5251 + }, + { + "epoch": 0.42016, + "grad_norm": 1.3581112623214722, + "learning_rate": 6.248077768741404e-06, + "loss": 0.2863, + "step": 5252 + }, + { + "epoch": 0.42024, + "grad_norm": 1.4493217468261719, + "learning_rate": 6.246859896768579e-06, + "loss": 0.2835, + "step": 5253 + }, + { + "epoch": 0.42032, + "grad_norm": 1.253295660018921, + "learning_rate": 6.245641945911099e-06, + "loss": 0.2689, + "step": 5254 + }, + { + "epoch": 0.4204, + "grad_norm": 1.5289180278778076, + "learning_rate": 6.244423916246023e-06, + "loss": 0.3176, + "step": 5255 + }, + { + "epoch": 0.42048, + "grad_norm": 1.4782484769821167, + "learning_rate": 6.243205807850408e-06, + "loss": 0.3583, + "step": 5256 + }, + { + "epoch": 0.42056, + "grad_norm": 1.5398263931274414, + "learning_rate": 6.241987620801322e-06, + "loss": 0.3898, + "step": 5257 + }, + { + "epoch": 0.42064, + "grad_norm": 1.4873570203781128, + "learning_rate": 6.240769355175834e-06, + "loss": 0.3556, + "step": 5258 + }, + { + "epoch": 0.42072, + "grad_norm": 1.5727860927581787, + "learning_rate": 6.239551011051021e-06, + "loss": 0.2808, + "step": 5259 + }, + { + "epoch": 0.4208, + "grad_norm": 1.371330738067627, + "learning_rate": 6.2383325885039635e-06, + "loss": 0.2745, + "step": 5260 + }, + { + "epoch": 0.42088, + "grad_norm": 1.5367326736450195, + "learning_rate": 6.237114087611747e-06, + "loss": 0.4118, + "step": 5261 + }, + { + "epoch": 0.42096, + "grad_norm": 1.6745377779006958, + "learning_rate": 6.23589550845146e-06, + "loss": 0.3761, + "step": 5262 + }, + { + "epoch": 0.42104, + "grad_norm": 1.4240877628326416, + "learning_rate": 6.234676851100201e-06, + "loss": 0.3344, + "step": 5263 + }, + { + "epoch": 0.42112, + "grad_norm": 1.6593492031097412, + "learning_rate": 6.233458115635067e-06, + "loss": 0.4413, + "step": 5264 + }, + { + "epoch": 0.4212, + "grad_norm": 1.6215040683746338, + "learning_rate": 6.232239302133167e-06, + "loss": 0.4039, + "step": 5265 + }, + { + "epoch": 0.42128, + "grad_norm": 1.453888177871704, + "learning_rate": 6.23102041067161e-06, + "loss": 0.253, + "step": 5266 + }, + { + "epoch": 0.42136, + "grad_norm": 1.473840355873108, + "learning_rate": 6.22980144132751e-06, + "loss": 0.2827, + "step": 5267 + }, + { + "epoch": 0.42144, + "grad_norm": 2.1219794750213623, + "learning_rate": 6.2285823941779864e-06, + "loss": 0.374, + "step": 5268 + }, + { + "epoch": 0.42152, + "grad_norm": 1.2442418336868286, + "learning_rate": 6.227363269300166e-06, + "loss": 0.2711, + "step": 5269 + }, + { + "epoch": 0.4216, + "grad_norm": 1.3598564863204956, + "learning_rate": 6.226144066771179e-06, + "loss": 0.2622, + "step": 5270 + }, + { + "epoch": 0.42168, + "grad_norm": 2.1796743869781494, + "learning_rate": 6.224924786668161e-06, + "loss": 0.4318, + "step": 5271 + }, + { + "epoch": 0.42176, + "grad_norm": 1.6060525178909302, + "learning_rate": 6.2237054290682475e-06, + "loss": 0.2812, + "step": 5272 + }, + { + "epoch": 0.42184, + "grad_norm": 1.5547593832015991, + "learning_rate": 6.2224859940485874e-06, + "loss": 0.3616, + "step": 5273 + }, + { + "epoch": 0.42192, + "grad_norm": 1.3605103492736816, + "learning_rate": 6.221266481686328e-06, + "loss": 0.2399, + "step": 5274 + }, + { + "epoch": 0.422, + "grad_norm": 1.3548734188079834, + "learning_rate": 6.220046892058626e-06, + "loss": 0.2625, + "step": 5275 + }, + { + "epoch": 0.42208, + "grad_norm": 1.6121068000793457, + "learning_rate": 6.218827225242638e-06, + "loss": 0.3836, + "step": 5276 + }, + { + "epoch": 0.42216, + "grad_norm": 1.4999924898147583, + "learning_rate": 6.217607481315531e-06, + "loss": 0.3994, + "step": 5277 + }, + { + "epoch": 0.42224, + "grad_norm": 1.484944462776184, + "learning_rate": 6.216387660354472e-06, + "loss": 0.3187, + "step": 5278 + }, + { + "epoch": 0.42232, + "grad_norm": 1.4199094772338867, + "learning_rate": 6.215167762436637e-06, + "loss": 0.3337, + "step": 5279 + }, + { + "epoch": 0.4224, + "grad_norm": 1.7457900047302246, + "learning_rate": 6.213947787639203e-06, + "loss": 0.3496, + "step": 5280 + }, + { + "epoch": 0.42248, + "grad_norm": 1.5127125978469849, + "learning_rate": 6.212727736039354e-06, + "loss": 0.3331, + "step": 5281 + }, + { + "epoch": 0.42256, + "grad_norm": 1.618870735168457, + "learning_rate": 6.211507607714277e-06, + "loss": 0.2917, + "step": 5282 + }, + { + "epoch": 0.42264, + "grad_norm": 1.5503782033920288, + "learning_rate": 6.210287402741171e-06, + "loss": 0.3699, + "step": 5283 + }, + { + "epoch": 0.42272, + "grad_norm": 1.6964318752288818, + "learning_rate": 6.209067121197228e-06, + "loss": 0.3575, + "step": 5284 + }, + { + "epoch": 0.4228, + "grad_norm": 1.616135835647583, + "learning_rate": 6.207846763159655e-06, + "loss": 0.3734, + "step": 5285 + }, + { + "epoch": 0.42288, + "grad_norm": 1.5970280170440674, + "learning_rate": 6.206626328705659e-06, + "loss": 0.3787, + "step": 5286 + }, + { + "epoch": 0.42296, + "grad_norm": 1.7362499237060547, + "learning_rate": 6.205405817912452e-06, + "loss": 0.3442, + "step": 5287 + }, + { + "epoch": 0.42304, + "grad_norm": 1.6331170797348022, + "learning_rate": 6.204185230857252e-06, + "loss": 0.3461, + "step": 5288 + }, + { + "epoch": 0.42312, + "grad_norm": 2.0485804080963135, + "learning_rate": 6.202964567617283e-06, + "loss": 0.3591, + "step": 5289 + }, + { + "epoch": 0.4232, + "grad_norm": 1.9766350984573364, + "learning_rate": 6.20174382826977e-06, + "loss": 0.4748, + "step": 5290 + }, + { + "epoch": 0.42328, + "grad_norm": 1.2820550203323364, + "learning_rate": 6.200523012891945e-06, + "loss": 0.2704, + "step": 5291 + }, + { + "epoch": 0.42336, + "grad_norm": 1.3562285900115967, + "learning_rate": 6.199302121561048e-06, + "loss": 0.329, + "step": 5292 + }, + { + "epoch": 0.42344, + "grad_norm": 1.444642186164856, + "learning_rate": 6.198081154354317e-06, + "loss": 0.4026, + "step": 5293 + }, + { + "epoch": 0.42352, + "grad_norm": 1.574292778968811, + "learning_rate": 6.196860111349001e-06, + "loss": 0.299, + "step": 5294 + }, + { + "epoch": 0.4236, + "grad_norm": 1.5647331476211548, + "learning_rate": 6.19563899262235e-06, + "loss": 0.2892, + "step": 5295 + }, + { + "epoch": 0.42368, + "grad_norm": 1.810318112373352, + "learning_rate": 6.194417798251622e-06, + "loss": 0.3837, + "step": 5296 + }, + { + "epoch": 0.42376, + "grad_norm": 1.7629601955413818, + "learning_rate": 6.193196528314073e-06, + "loss": 0.3893, + "step": 5297 + }, + { + "epoch": 0.42384, + "grad_norm": 1.5911134481430054, + "learning_rate": 6.191975182886976e-06, + "loss": 0.3948, + "step": 5298 + }, + { + "epoch": 0.42392, + "grad_norm": 1.661783218383789, + "learning_rate": 6.1907537620475955e-06, + "loss": 0.3691, + "step": 5299 + }, + { + "epoch": 0.424, + "grad_norm": 1.1115741729736328, + "learning_rate": 6.189532265873209e-06, + "loss": 0.2339, + "step": 5300 + }, + { + "epoch": 0.42408, + "grad_norm": 1.4616998434066772, + "learning_rate": 6.188310694441097e-06, + "loss": 0.2663, + "step": 5301 + }, + { + "epoch": 0.42416, + "grad_norm": 1.6125479936599731, + "learning_rate": 6.187089047828542e-06, + "loss": 0.3089, + "step": 5302 + }, + { + "epoch": 0.42424, + "grad_norm": 1.8839313983917236, + "learning_rate": 6.1858673261128364e-06, + "loss": 0.4514, + "step": 5303 + }, + { + "epoch": 0.42432, + "grad_norm": 1.655595302581787, + "learning_rate": 6.184645529371272e-06, + "loss": 0.3448, + "step": 5304 + }, + { + "epoch": 0.4244, + "grad_norm": 1.311193585395813, + "learning_rate": 6.183423657681149e-06, + "loss": 0.2564, + "step": 5305 + }, + { + "epoch": 0.42448, + "grad_norm": 2.025151491165161, + "learning_rate": 6.182201711119771e-06, + "loss": 0.5696, + "step": 5306 + }, + { + "epoch": 0.42456, + "grad_norm": 1.321874737739563, + "learning_rate": 6.180979689764447e-06, + "loss": 0.3092, + "step": 5307 + }, + { + "epoch": 0.42464, + "grad_norm": 2.123602867126465, + "learning_rate": 6.179757593692488e-06, + "loss": 0.4035, + "step": 5308 + }, + { + "epoch": 0.42472, + "grad_norm": 1.5167906284332275, + "learning_rate": 6.178535422981216e-06, + "loss": 0.3193, + "step": 5309 + }, + { + "epoch": 0.4248, + "grad_norm": 1.7157925367355347, + "learning_rate": 6.17731317770795e-06, + "loss": 0.3725, + "step": 5310 + }, + { + "epoch": 0.42488, + "grad_norm": 1.4427510499954224, + "learning_rate": 6.176090857950018e-06, + "loss": 0.2961, + "step": 5311 + }, + { + "epoch": 0.42496, + "grad_norm": 1.3995834589004517, + "learning_rate": 6.174868463784752e-06, + "loss": 0.3033, + "step": 5312 + }, + { + "epoch": 0.42504, + "grad_norm": 1.511171817779541, + "learning_rate": 6.173645995289491e-06, + "loss": 0.3218, + "step": 5313 + }, + { + "epoch": 0.42512, + "grad_norm": 1.7538864612579346, + "learning_rate": 6.172423452541574e-06, + "loss": 0.4702, + "step": 5314 + }, + { + "epoch": 0.4252, + "grad_norm": 1.4485112428665161, + "learning_rate": 6.1712008356183485e-06, + "loss": 0.2855, + "step": 5315 + }, + { + "epoch": 0.42528, + "grad_norm": 1.475102186203003, + "learning_rate": 6.169978144597164e-06, + "loss": 0.3669, + "step": 5316 + }, + { + "epoch": 0.42536, + "grad_norm": 1.569522500038147, + "learning_rate": 6.168755379555378e-06, + "loss": 0.3393, + "step": 5317 + }, + { + "epoch": 0.42544, + "grad_norm": 1.7826415300369263, + "learning_rate": 6.167532540570351e-06, + "loss": 0.3331, + "step": 5318 + }, + { + "epoch": 0.42552, + "grad_norm": 1.3612468242645264, + "learning_rate": 6.166309627719444e-06, + "loss": 0.3009, + "step": 5319 + }, + { + "epoch": 0.4256, + "grad_norm": 1.4943021535873413, + "learning_rate": 6.165086641080032e-06, + "loss": 0.3387, + "step": 5320 + }, + { + "epoch": 0.42568, + "grad_norm": 1.629370927810669, + "learning_rate": 6.163863580729484e-06, + "loss": 0.3885, + "step": 5321 + }, + { + "epoch": 0.42576, + "grad_norm": 2.004427433013916, + "learning_rate": 6.162640446745184e-06, + "loss": 0.4166, + "step": 5322 + }, + { + "epoch": 0.42584, + "grad_norm": 1.5527446269989014, + "learning_rate": 6.161417239204512e-06, + "loss": 0.3396, + "step": 5323 + }, + { + "epoch": 0.42592, + "grad_norm": 1.4790587425231934, + "learning_rate": 6.160193958184858e-06, + "loss": 0.3218, + "step": 5324 + }, + { + "epoch": 0.426, + "grad_norm": 1.5748834609985352, + "learning_rate": 6.158970603763615e-06, + "loss": 0.3968, + "step": 5325 + }, + { + "epoch": 0.42608, + "grad_norm": 1.7657493352890015, + "learning_rate": 6.157747176018177e-06, + "loss": 0.3378, + "step": 5326 + }, + { + "epoch": 0.42616, + "grad_norm": 2.0443902015686035, + "learning_rate": 6.15652367502595e-06, + "loss": 0.4723, + "step": 5327 + }, + { + "epoch": 0.42624, + "grad_norm": 1.6636515855789185, + "learning_rate": 6.155300100864341e-06, + "loss": 0.3116, + "step": 5328 + }, + { + "epoch": 0.42632, + "grad_norm": 1.5545495748519897, + "learning_rate": 6.154076453610759e-06, + "loss": 0.3614, + "step": 5329 + }, + { + "epoch": 0.4264, + "grad_norm": 1.6116411685943604, + "learning_rate": 6.152852733342623e-06, + "loss": 0.3714, + "step": 5330 + }, + { + "epoch": 0.42648, + "grad_norm": 1.3033878803253174, + "learning_rate": 6.151628940137351e-06, + "loss": 0.3168, + "step": 5331 + }, + { + "epoch": 0.42656, + "grad_norm": 1.3937110900878906, + "learning_rate": 6.150405074072369e-06, + "loss": 0.274, + "step": 5332 + }, + { + "epoch": 0.42664, + "grad_norm": 1.9616461992263794, + "learning_rate": 6.1491811352251085e-06, + "loss": 0.4114, + "step": 5333 + }, + { + "epoch": 0.42672, + "grad_norm": 1.5841161012649536, + "learning_rate": 6.1479571236730005e-06, + "loss": 0.3684, + "step": 5334 + }, + { + "epoch": 0.4268, + "grad_norm": 1.403443455696106, + "learning_rate": 6.146733039493487e-06, + "loss": 0.2633, + "step": 5335 + }, + { + "epoch": 0.42688, + "grad_norm": 1.5101227760314941, + "learning_rate": 6.145508882764013e-06, + "loss": 0.2751, + "step": 5336 + }, + { + "epoch": 0.42696, + "grad_norm": 1.6489311456680298, + "learning_rate": 6.144284653562024e-06, + "loss": 0.3991, + "step": 5337 + }, + { + "epoch": 0.42704, + "grad_norm": 1.270580530166626, + "learning_rate": 6.143060351964973e-06, + "loss": 0.2707, + "step": 5338 + }, + { + "epoch": 0.42712, + "grad_norm": 1.7063907384872437, + "learning_rate": 6.141835978050318e-06, + "loss": 0.5227, + "step": 5339 + }, + { + "epoch": 0.4272, + "grad_norm": 1.2827900648117065, + "learning_rate": 6.140611531895522e-06, + "loss": 0.3149, + "step": 5340 + }, + { + "epoch": 0.42728, + "grad_norm": 1.45085608959198, + "learning_rate": 6.139387013578051e-06, + "loss": 0.3797, + "step": 5341 + }, + { + "epoch": 0.42736, + "grad_norm": 1.335211992263794, + "learning_rate": 6.138162423175375e-06, + "loss": 0.3144, + "step": 5342 + }, + { + "epoch": 0.42744, + "grad_norm": 1.3709321022033691, + "learning_rate": 6.136937760764972e-06, + "loss": 0.2786, + "step": 5343 + }, + { + "epoch": 0.42752, + "grad_norm": 1.9491933584213257, + "learning_rate": 6.13571302642432e-06, + "loss": 0.3513, + "step": 5344 + }, + { + "epoch": 0.4276, + "grad_norm": 1.6201810836791992, + "learning_rate": 6.1344882202309075e-06, + "loss": 0.3616, + "step": 5345 + }, + { + "epoch": 0.42768, + "grad_norm": 1.6798819303512573, + "learning_rate": 6.133263342262219e-06, + "loss": 0.3304, + "step": 5346 + }, + { + "epoch": 0.42776, + "grad_norm": 1.485134243965149, + "learning_rate": 6.132038392595751e-06, + "loss": 0.2942, + "step": 5347 + }, + { + "epoch": 0.42784, + "grad_norm": 1.5289043188095093, + "learning_rate": 6.130813371309002e-06, + "loss": 0.311, + "step": 5348 + }, + { + "epoch": 0.42792, + "grad_norm": 1.708020567893982, + "learning_rate": 6.129588278479475e-06, + "loss": 0.3565, + "step": 5349 + }, + { + "epoch": 0.428, + "grad_norm": 1.249941349029541, + "learning_rate": 6.1283631141846755e-06, + "loss": 0.239, + "step": 5350 + }, + { + "epoch": 0.42808, + "grad_norm": 1.3397772312164307, + "learning_rate": 6.127137878502118e-06, + "loss": 0.3618, + "step": 5351 + }, + { + "epoch": 0.42816, + "grad_norm": 1.3979480266571045, + "learning_rate": 6.125912571509319e-06, + "loss": 0.2997, + "step": 5352 + }, + { + "epoch": 0.42824, + "grad_norm": 1.456049919128418, + "learning_rate": 6.124687193283799e-06, + "loss": 0.315, + "step": 5353 + }, + { + "epoch": 0.42832, + "grad_norm": 1.965668797492981, + "learning_rate": 6.123461743903084e-06, + "loss": 0.4174, + "step": 5354 + }, + { + "epoch": 0.4284, + "grad_norm": 1.4012924432754517, + "learning_rate": 6.122236223444703e-06, + "loss": 0.325, + "step": 5355 + }, + { + "epoch": 0.42848, + "grad_norm": 1.5004931688308716, + "learning_rate": 6.121010631986192e-06, + "loss": 0.3237, + "step": 5356 + }, + { + "epoch": 0.42856, + "grad_norm": 1.6268855333328247, + "learning_rate": 6.119784969605088e-06, + "loss": 0.4094, + "step": 5357 + }, + { + "epoch": 0.42864, + "grad_norm": 1.6038322448730469, + "learning_rate": 6.1185592363789355e-06, + "loss": 0.2773, + "step": 5358 + }, + { + "epoch": 0.42872, + "grad_norm": 1.811691403388977, + "learning_rate": 6.117333432385283e-06, + "loss": 0.3442, + "step": 5359 + }, + { + "epoch": 0.4288, + "grad_norm": 1.5288585424423218, + "learning_rate": 6.116107557701685e-06, + "loss": 0.3593, + "step": 5360 + }, + { + "epoch": 0.42888, + "grad_norm": 1.7061458826065063, + "learning_rate": 6.114881612405694e-06, + "loss": 0.4021, + "step": 5361 + }, + { + "epoch": 0.42896, + "grad_norm": 1.6404433250427246, + "learning_rate": 6.1136555965748735e-06, + "loss": 0.3261, + "step": 5362 + }, + { + "epoch": 0.42904, + "grad_norm": 1.6574904918670654, + "learning_rate": 6.11242951028679e-06, + "loss": 0.4901, + "step": 5363 + }, + { + "epoch": 0.42912, + "grad_norm": 1.2963271141052246, + "learning_rate": 6.111203353619014e-06, + "loss": 0.2512, + "step": 5364 + }, + { + "epoch": 0.4292, + "grad_norm": 1.7801556587219238, + "learning_rate": 6.109977126649121e-06, + "loss": 0.3984, + "step": 5365 + }, + { + "epoch": 0.42928, + "grad_norm": 1.7426446676254272, + "learning_rate": 6.108750829454688e-06, + "loss": 0.3063, + "step": 5366 + }, + { + "epoch": 0.42936, + "grad_norm": 1.6115596294403076, + "learning_rate": 6.1075244621133e-06, + "loss": 0.3511, + "step": 5367 + }, + { + "epoch": 0.42944, + "grad_norm": 1.2986208200454712, + "learning_rate": 6.106298024702546e-06, + "loss": 0.3206, + "step": 5368 + }, + { + "epoch": 0.42952, + "grad_norm": 1.9597961902618408, + "learning_rate": 6.105071517300017e-06, + "loss": 0.3735, + "step": 5369 + }, + { + "epoch": 0.4296, + "grad_norm": 1.4968289136886597, + "learning_rate": 6.10384493998331e-06, + "loss": 0.2758, + "step": 5370 + }, + { + "epoch": 0.42968, + "grad_norm": 1.4992220401763916, + "learning_rate": 6.102618292830029e-06, + "loss": 0.3457, + "step": 5371 + }, + { + "epoch": 0.42976, + "grad_norm": 1.5327579975128174, + "learning_rate": 6.1013915759177765e-06, + "loss": 0.3693, + "step": 5372 + }, + { + "epoch": 0.42984, + "grad_norm": 1.6249293088912964, + "learning_rate": 6.1001647893241634e-06, + "loss": 0.3094, + "step": 5373 + }, + { + "epoch": 0.42992, + "grad_norm": 1.7367304563522339, + "learning_rate": 6.098937933126806e-06, + "loss": 0.341, + "step": 5374 + }, + { + "epoch": 0.43, + "grad_norm": 1.3340009450912476, + "learning_rate": 6.097711007403323e-06, + "loss": 0.279, + "step": 5375 + }, + { + "epoch": 0.43008, + "grad_norm": 1.6486116647720337, + "learning_rate": 6.096484012231337e-06, + "loss": 0.3304, + "step": 5376 + }, + { + "epoch": 0.43016, + "grad_norm": 1.1886416673660278, + "learning_rate": 6.095256947688478e-06, + "loss": 0.2639, + "step": 5377 + }, + { + "epoch": 0.43024, + "grad_norm": 1.6678515672683716, + "learning_rate": 6.094029813852376e-06, + "loss": 0.3143, + "step": 5378 + }, + { + "epoch": 0.43032, + "grad_norm": 1.7223871946334839, + "learning_rate": 6.0928026108006675e-06, + "loss": 0.4146, + "step": 5379 + }, + { + "epoch": 0.4304, + "grad_norm": 1.9803104400634766, + "learning_rate": 6.091575338610994e-06, + "loss": 0.4194, + "step": 5380 + }, + { + "epoch": 0.43048, + "grad_norm": 1.469686508178711, + "learning_rate": 6.090347997361002e-06, + "loss": 0.3625, + "step": 5381 + }, + { + "epoch": 0.43056, + "grad_norm": 1.2683942317962646, + "learning_rate": 6.089120587128341e-06, + "loss": 0.258, + "step": 5382 + }, + { + "epoch": 0.43064, + "grad_norm": 1.4345426559448242, + "learning_rate": 6.087893107990665e-06, + "loss": 0.3373, + "step": 5383 + }, + { + "epoch": 0.43072, + "grad_norm": 1.7910370826721191, + "learning_rate": 6.0866655600256305e-06, + "loss": 0.4067, + "step": 5384 + }, + { + "epoch": 0.4308, + "grad_norm": 1.5200092792510986, + "learning_rate": 6.085437943310902e-06, + "loss": 0.2964, + "step": 5385 + }, + { + "epoch": 0.43088, + "grad_norm": 1.9939285516738892, + "learning_rate": 6.084210257924148e-06, + "loss": 0.3602, + "step": 5386 + }, + { + "epoch": 0.43096, + "grad_norm": 1.6643867492675781, + "learning_rate": 6.082982503943038e-06, + "loss": 0.3337, + "step": 5387 + }, + { + "epoch": 0.43104, + "grad_norm": 1.9779936075210571, + "learning_rate": 6.081754681445249e-06, + "loss": 0.3836, + "step": 5388 + }, + { + "epoch": 0.43112, + "grad_norm": 1.9725874662399292, + "learning_rate": 6.080526790508461e-06, + "loss": 0.3404, + "step": 5389 + }, + { + "epoch": 0.4312, + "grad_norm": 1.6242984533309937, + "learning_rate": 6.079298831210357e-06, + "loss": 0.3975, + "step": 5390 + }, + { + "epoch": 0.43128, + "grad_norm": 1.8685696125030518, + "learning_rate": 6.078070803628629e-06, + "loss": 0.3465, + "step": 5391 + }, + { + "epoch": 0.43136, + "grad_norm": 1.4240777492523193, + "learning_rate": 6.076842707840969e-06, + "loss": 0.2717, + "step": 5392 + }, + { + "epoch": 0.43144, + "grad_norm": 1.6460999250411987, + "learning_rate": 6.0756145439250725e-06, + "loss": 0.4144, + "step": 5393 + }, + { + "epoch": 0.43152, + "grad_norm": 1.7656724452972412, + "learning_rate": 6.074386311958643e-06, + "loss": 0.383, + "step": 5394 + }, + { + "epoch": 0.4316, + "grad_norm": 1.7070986032485962, + "learning_rate": 6.073158012019388e-06, + "loss": 0.3203, + "step": 5395 + }, + { + "epoch": 0.43168, + "grad_norm": 2.166088581085205, + "learning_rate": 6.071929644185014e-06, + "loss": 0.454, + "step": 5396 + }, + { + "epoch": 0.43176, + "grad_norm": 1.7778912782669067, + "learning_rate": 6.07070120853324e-06, + "loss": 0.3748, + "step": 5397 + }, + { + "epoch": 0.43184, + "grad_norm": 1.4228088855743408, + "learning_rate": 6.069472705141781e-06, + "loss": 0.3252, + "step": 5398 + }, + { + "epoch": 0.43192, + "grad_norm": 1.510135293006897, + "learning_rate": 6.068244134088363e-06, + "loss": 0.4119, + "step": 5399 + }, + { + "epoch": 0.432, + "grad_norm": 1.453439712524414, + "learning_rate": 6.067015495450715e-06, + "loss": 0.2788, + "step": 5400 + }, + { + "epoch": 0.43208, + "grad_norm": 1.4725075960159302, + "learning_rate": 6.065786789306566e-06, + "loss": 0.3727, + "step": 5401 + }, + { + "epoch": 0.43216, + "grad_norm": 1.9456934928894043, + "learning_rate": 6.064558015733653e-06, + "loss": 0.3712, + "step": 5402 + }, + { + "epoch": 0.43224, + "grad_norm": 1.5379313230514526, + "learning_rate": 6.063329174809715e-06, + "loss": 0.3203, + "step": 5403 + }, + { + "epoch": 0.43232, + "grad_norm": 1.5046453475952148, + "learning_rate": 6.0621002666124995e-06, + "loss": 0.3101, + "step": 5404 + }, + { + "epoch": 0.4324, + "grad_norm": 1.9868191480636597, + "learning_rate": 6.060871291219753e-06, + "loss": 0.3739, + "step": 5405 + }, + { + "epoch": 0.43248, + "grad_norm": 1.5251084566116333, + "learning_rate": 6.0596422487092295e-06, + "loss": 0.322, + "step": 5406 + }, + { + "epoch": 0.43256, + "grad_norm": 1.3554657697677612, + "learning_rate": 6.058413139158687e-06, + "loss": 0.3143, + "step": 5407 + }, + { + "epoch": 0.43264, + "grad_norm": 1.592686414718628, + "learning_rate": 6.0571839626458875e-06, + "loss": 0.3187, + "step": 5408 + }, + { + "epoch": 0.43272, + "grad_norm": 1.3215630054473877, + "learning_rate": 6.055954719248595e-06, + "loss": 0.308, + "step": 5409 + }, + { + "epoch": 0.4328, + "grad_norm": 1.2852091789245605, + "learning_rate": 6.054725409044579e-06, + "loss": 0.2711, + "step": 5410 + }, + { + "epoch": 0.43288, + "grad_norm": 1.8097467422485352, + "learning_rate": 6.0534960321116175e-06, + "loss": 0.3427, + "step": 5411 + }, + { + "epoch": 0.43296, + "grad_norm": 1.3292269706726074, + "learning_rate": 6.052266588527488e-06, + "loss": 0.3537, + "step": 5412 + }, + { + "epoch": 0.43304, + "grad_norm": 1.6236307621002197, + "learning_rate": 6.051037078369972e-06, + "loss": 0.4301, + "step": 5413 + }, + { + "epoch": 0.43312, + "grad_norm": 1.4823275804519653, + "learning_rate": 6.049807501716856e-06, + "loss": 0.3199, + "step": 5414 + }, + { + "epoch": 0.4332, + "grad_norm": 1.5742340087890625, + "learning_rate": 6.048577858645932e-06, + "loss": 0.3312, + "step": 5415 + }, + { + "epoch": 0.43328, + "grad_norm": 1.4714792966842651, + "learning_rate": 6.047348149234995e-06, + "loss": 0.3228, + "step": 5416 + }, + { + "epoch": 0.43336, + "grad_norm": 1.3467466831207275, + "learning_rate": 6.046118373561845e-06, + "loss": 0.3045, + "step": 5417 + }, + { + "epoch": 0.43344, + "grad_norm": 1.4545137882232666, + "learning_rate": 6.044888531704287e-06, + "loss": 0.3308, + "step": 5418 + }, + { + "epoch": 0.43352, + "grad_norm": 1.3145484924316406, + "learning_rate": 6.043658623740127e-06, + "loss": 0.2933, + "step": 5419 + }, + { + "epoch": 0.4336, + "grad_norm": 1.6407479047775269, + "learning_rate": 6.042428649747177e-06, + "loss": 0.3418, + "step": 5420 + }, + { + "epoch": 0.43368, + "grad_norm": 1.6189351081848145, + "learning_rate": 6.041198609803256e-06, + "loss": 0.3938, + "step": 5421 + }, + { + "epoch": 0.43376, + "grad_norm": 1.1933389902114868, + "learning_rate": 6.039968503986182e-06, + "loss": 0.2499, + "step": 5422 + }, + { + "epoch": 0.43384, + "grad_norm": 1.6682744026184082, + "learning_rate": 6.038738332373781e-06, + "loss": 0.3783, + "step": 5423 + }, + { + "epoch": 0.43392, + "grad_norm": 1.450366497039795, + "learning_rate": 6.037508095043881e-06, + "loss": 0.3095, + "step": 5424 + }, + { + "epoch": 0.434, + "grad_norm": 1.6315852403640747, + "learning_rate": 6.036277792074316e-06, + "loss": 0.3228, + "step": 5425 + }, + { + "epoch": 0.43408, + "grad_norm": 1.3595662117004395, + "learning_rate": 6.035047423542922e-06, + "loss": 0.2532, + "step": 5426 + }, + { + "epoch": 0.43416, + "grad_norm": 1.3640127182006836, + "learning_rate": 6.033816989527541e-06, + "loss": 0.3054, + "step": 5427 + }, + { + "epoch": 0.43424, + "grad_norm": 1.3128430843353271, + "learning_rate": 6.032586490106018e-06, + "loss": 0.2737, + "step": 5428 + }, + { + "epoch": 0.43432, + "grad_norm": 1.4516959190368652, + "learning_rate": 6.0313559253562016e-06, + "loss": 0.3047, + "step": 5429 + }, + { + "epoch": 0.4344, + "grad_norm": 1.5541858673095703, + "learning_rate": 6.030125295355949e-06, + "loss": 0.2977, + "step": 5430 + }, + { + "epoch": 0.43448, + "grad_norm": 1.3899632692337036, + "learning_rate": 6.028894600183114e-06, + "loss": 0.2981, + "step": 5431 + }, + { + "epoch": 0.43456, + "grad_norm": 1.6935724020004272, + "learning_rate": 6.027663839915561e-06, + "loss": 0.4806, + "step": 5432 + }, + { + "epoch": 0.43464, + "grad_norm": 1.4013090133666992, + "learning_rate": 6.026433014631155e-06, + "loss": 0.3428, + "step": 5433 + }, + { + "epoch": 0.43472, + "grad_norm": 1.6702803373336792, + "learning_rate": 6.025202124407766e-06, + "loss": 0.3447, + "step": 5434 + }, + { + "epoch": 0.4348, + "grad_norm": 1.4864176511764526, + "learning_rate": 6.023971169323272e-06, + "loss": 0.3252, + "step": 5435 + }, + { + "epoch": 0.43488, + "grad_norm": 1.4419050216674805, + "learning_rate": 6.022740149455547e-06, + "loss": 0.314, + "step": 5436 + }, + { + "epoch": 0.43496, + "grad_norm": 1.489224910736084, + "learning_rate": 6.021509064882473e-06, + "loss": 0.353, + "step": 5437 + }, + { + "epoch": 0.43504, + "grad_norm": 1.4233548641204834, + "learning_rate": 6.0202779156819405e-06, + "loss": 0.3279, + "step": 5438 + }, + { + "epoch": 0.43512, + "grad_norm": 1.631506323814392, + "learning_rate": 6.019046701931836e-06, + "loss": 0.4391, + "step": 5439 + }, + { + "epoch": 0.4352, + "grad_norm": 1.7712832689285278, + "learning_rate": 6.0178154237100575e-06, + "loss": 0.3618, + "step": 5440 + }, + { + "epoch": 0.43528, + "grad_norm": 1.745080828666687, + "learning_rate": 6.016584081094503e-06, + "loss": 0.5306, + "step": 5441 + }, + { + "epoch": 0.43536, + "grad_norm": 1.2281311750411987, + "learning_rate": 6.015352674163075e-06, + "loss": 0.2491, + "step": 5442 + }, + { + "epoch": 0.43544, + "grad_norm": 1.725820779800415, + "learning_rate": 6.014121202993682e-06, + "loss": 0.3775, + "step": 5443 + }, + { + "epoch": 0.43552, + "grad_norm": 1.3258579969406128, + "learning_rate": 6.012889667664231e-06, + "loss": 0.2745, + "step": 5444 + }, + { + "epoch": 0.4356, + "grad_norm": 1.2049994468688965, + "learning_rate": 6.0116580682526415e-06, + "loss": 0.2574, + "step": 5445 + }, + { + "epoch": 0.43568, + "grad_norm": 1.1984772682189941, + "learning_rate": 6.010426404836831e-06, + "loss": 0.283, + "step": 5446 + }, + { + "epoch": 0.43576, + "grad_norm": 1.4736144542694092, + "learning_rate": 6.009194677494723e-06, + "loss": 0.3, + "step": 5447 + }, + { + "epoch": 0.43584, + "grad_norm": 1.5378073453903198, + "learning_rate": 6.007962886304245e-06, + "loss": 0.3541, + "step": 5448 + }, + { + "epoch": 0.43592, + "grad_norm": 1.6598979234695435, + "learning_rate": 6.006731031343327e-06, + "loss": 0.3801, + "step": 5449 + }, + { + "epoch": 0.436, + "grad_norm": 1.6594270467758179, + "learning_rate": 6.0054991126899055e-06, + "loss": 0.4225, + "step": 5450 + }, + { + "epoch": 0.43608, + "grad_norm": 1.6767255067825317, + "learning_rate": 6.004267130421918e-06, + "loss": 0.4542, + "step": 5451 + }, + { + "epoch": 0.43616, + "grad_norm": 1.594008207321167, + "learning_rate": 6.003035084617311e-06, + "loss": 0.2978, + "step": 5452 + }, + { + "epoch": 0.43624, + "grad_norm": 1.3313404321670532, + "learning_rate": 6.0018029753540295e-06, + "loss": 0.2959, + "step": 5453 + }, + { + "epoch": 0.43632, + "grad_norm": 1.4482040405273438, + "learning_rate": 6.0005708027100274e-06, + "loss": 0.4336, + "step": 5454 + }, + { + "epoch": 0.4364, + "grad_norm": 1.7941350936889648, + "learning_rate": 5.999338566763258e-06, + "loss": 0.3423, + "step": 5455 + }, + { + "epoch": 0.43648, + "grad_norm": 1.9843556880950928, + "learning_rate": 5.998106267591679e-06, + "loss": 0.4683, + "step": 5456 + }, + { + "epoch": 0.43656, + "grad_norm": 1.8331342935562134, + "learning_rate": 5.996873905273259e-06, + "loss": 0.4164, + "step": 5457 + }, + { + "epoch": 0.43664, + "grad_norm": 1.5630244016647339, + "learning_rate": 5.995641479885962e-06, + "loss": 0.3497, + "step": 5458 + }, + { + "epoch": 0.43672, + "grad_norm": 1.7044670581817627, + "learning_rate": 5.99440899150776e-06, + "loss": 0.4218, + "step": 5459 + }, + { + "epoch": 0.4368, + "grad_norm": 2.0335018634796143, + "learning_rate": 5.993176440216627e-06, + "loss": 0.3786, + "step": 5460 + }, + { + "epoch": 0.43688, + "grad_norm": 1.4986952543258667, + "learning_rate": 5.991943826090545e-06, + "loss": 0.3274, + "step": 5461 + }, + { + "epoch": 0.43696, + "grad_norm": 1.511153221130371, + "learning_rate": 5.990711149207496e-06, + "loss": 0.3809, + "step": 5462 + }, + { + "epoch": 0.43704, + "grad_norm": 1.593832015991211, + "learning_rate": 5.989478409645466e-06, + "loss": 0.346, + "step": 5463 + }, + { + "epoch": 0.43712, + "grad_norm": 1.7036159038543701, + "learning_rate": 5.988245607482449e-06, + "loss": 0.4385, + "step": 5464 + }, + { + "epoch": 0.4372, + "grad_norm": 1.4725526571273804, + "learning_rate": 5.987012742796441e-06, + "loss": 0.3309, + "step": 5465 + }, + { + "epoch": 0.43728, + "grad_norm": 1.5867393016815186, + "learning_rate": 5.985779815665436e-06, + "loss": 0.3505, + "step": 5466 + }, + { + "epoch": 0.43736, + "grad_norm": 2.0825116634368896, + "learning_rate": 5.9845468261674435e-06, + "loss": 0.4112, + "step": 5467 + }, + { + "epoch": 0.43744, + "grad_norm": 1.8679453134536743, + "learning_rate": 5.9833137743804645e-06, + "loss": 0.4366, + "step": 5468 + }, + { + "epoch": 0.43752, + "grad_norm": 1.5901966094970703, + "learning_rate": 5.982080660382516e-06, + "loss": 0.3359, + "step": 5469 + }, + { + "epoch": 0.4376, + "grad_norm": 1.1409281492233276, + "learning_rate": 5.98084748425161e-06, + "loss": 0.2324, + "step": 5470 + }, + { + "epoch": 0.43768, + "grad_norm": 1.6136082410812378, + "learning_rate": 5.979614246065765e-06, + "loss": 0.3601, + "step": 5471 + }, + { + "epoch": 0.43776, + "grad_norm": 1.8404332399368286, + "learning_rate": 5.978380945903004e-06, + "loss": 0.4764, + "step": 5472 + }, + { + "epoch": 0.43784, + "grad_norm": 1.9501445293426514, + "learning_rate": 5.977147583841354e-06, + "loss": 0.3829, + "step": 5473 + }, + { + "epoch": 0.43792, + "grad_norm": 1.4439702033996582, + "learning_rate": 5.975914159958846e-06, + "loss": 0.317, + "step": 5474 + }, + { + "epoch": 0.438, + "grad_norm": 1.3018183708190918, + "learning_rate": 5.974680674333514e-06, + "loss": 0.2696, + "step": 5475 + }, + { + "epoch": 0.43808, + "grad_norm": 1.4886512756347656, + "learning_rate": 5.973447127043398e-06, + "loss": 0.3454, + "step": 5476 + }, + { + "epoch": 0.43816, + "grad_norm": 1.5890697240829468, + "learning_rate": 5.97221351816654e-06, + "loss": 0.3576, + "step": 5477 + }, + { + "epoch": 0.43824, + "grad_norm": 1.8298115730285645, + "learning_rate": 5.970979847780984e-06, + "loss": 0.3515, + "step": 5478 + }, + { + "epoch": 0.43832, + "grad_norm": 1.5328552722930908, + "learning_rate": 5.969746115964783e-06, + "loss": 0.3744, + "step": 5479 + }, + { + "epoch": 0.4384, + "grad_norm": 1.7203729152679443, + "learning_rate": 5.968512322795991e-06, + "loss": 0.3269, + "step": 5480 + }, + { + "epoch": 0.43848, + "grad_norm": 1.672613501548767, + "learning_rate": 5.967278468352663e-06, + "loss": 0.3997, + "step": 5481 + }, + { + "epoch": 0.43856, + "grad_norm": 1.2868820428848267, + "learning_rate": 5.966044552712864e-06, + "loss": 0.3223, + "step": 5482 + }, + { + "epoch": 0.43864, + "grad_norm": 1.2581580877304077, + "learning_rate": 5.9648105759546595e-06, + "loss": 0.2698, + "step": 5483 + }, + { + "epoch": 0.43872, + "grad_norm": 1.465779185295105, + "learning_rate": 5.963576538156116e-06, + "loss": 0.3777, + "step": 5484 + }, + { + "epoch": 0.4388, + "grad_norm": 1.3545647859573364, + "learning_rate": 5.96234243939531e-06, + "loss": 0.3381, + "step": 5485 + }, + { + "epoch": 0.43888, + "grad_norm": 1.3060506582260132, + "learning_rate": 5.9611082797503175e-06, + "loss": 0.3209, + "step": 5486 + }, + { + "epoch": 0.43896, + "grad_norm": 1.4939861297607422, + "learning_rate": 5.95987405929922e-06, + "loss": 0.3916, + "step": 5487 + }, + { + "epoch": 0.43904, + "grad_norm": 1.9790840148925781, + "learning_rate": 5.9586397781201034e-06, + "loss": 0.516, + "step": 5488 + }, + { + "epoch": 0.43912, + "grad_norm": 1.3512998819351196, + "learning_rate": 5.957405436291055e-06, + "loss": 0.2739, + "step": 5489 + }, + { + "epoch": 0.4392, + "grad_norm": 1.5846513509750366, + "learning_rate": 5.956171033890168e-06, + "loss": 0.4435, + "step": 5490 + }, + { + "epoch": 0.43928, + "grad_norm": 1.4433051347732544, + "learning_rate": 5.95493657099554e-06, + "loss": 0.3871, + "step": 5491 + }, + { + "epoch": 0.43936, + "grad_norm": 1.4104619026184082, + "learning_rate": 5.953702047685271e-06, + "loss": 0.2823, + "step": 5492 + }, + { + "epoch": 0.43944, + "grad_norm": 1.4222887754440308, + "learning_rate": 5.952467464037462e-06, + "loss": 0.3227, + "step": 5493 + }, + { + "epoch": 0.43952, + "grad_norm": 1.3719890117645264, + "learning_rate": 5.951232820130224e-06, + "loss": 0.295, + "step": 5494 + }, + { + "epoch": 0.4396, + "grad_norm": 1.7587943077087402, + "learning_rate": 5.949998116041671e-06, + "loss": 0.4271, + "step": 5495 + }, + { + "epoch": 0.43968, + "grad_norm": 1.418924331665039, + "learning_rate": 5.948763351849913e-06, + "loss": 0.2773, + "step": 5496 + }, + { + "epoch": 0.43976, + "grad_norm": 1.8153401613235474, + "learning_rate": 5.947528527633073e-06, + "loss": 0.3063, + "step": 5497 + }, + { + "epoch": 0.43984, + "grad_norm": 1.4398388862609863, + "learning_rate": 5.946293643469274e-06, + "loss": 0.3319, + "step": 5498 + }, + { + "epoch": 0.43992, + "grad_norm": 1.4004582166671753, + "learning_rate": 5.945058699436641e-06, + "loss": 0.327, + "step": 5499 + }, + { + "epoch": 0.44, + "grad_norm": 1.4153037071228027, + "learning_rate": 5.943823695613308e-06, + "loss": 0.2834, + "step": 5500 + }, + { + "epoch": 0.44008, + "grad_norm": 1.4712837934494019, + "learning_rate": 5.9425886320774086e-06, + "loss": 0.2809, + "step": 5501 + }, + { + "epoch": 0.44016, + "grad_norm": 1.3245508670806885, + "learning_rate": 5.941353508907078e-06, + "loss": 0.224, + "step": 5502 + }, + { + "epoch": 0.44024, + "grad_norm": 1.4912713766098022, + "learning_rate": 5.940118326180463e-06, + "loss": 0.4387, + "step": 5503 + }, + { + "epoch": 0.44032, + "grad_norm": 1.4831585884094238, + "learning_rate": 5.938883083975706e-06, + "loss": 0.303, + "step": 5504 + }, + { + "epoch": 0.4404, + "grad_norm": 0.9887499809265137, + "learning_rate": 5.937647782370957e-06, + "loss": 0.2525, + "step": 5505 + }, + { + "epoch": 0.44048, + "grad_norm": 1.6899783611297607, + "learning_rate": 5.936412421444372e-06, + "loss": 0.3949, + "step": 5506 + }, + { + "epoch": 0.44056, + "grad_norm": 1.812455415725708, + "learning_rate": 5.935177001274105e-06, + "loss": 0.4223, + "step": 5507 + }, + { + "epoch": 0.44064, + "grad_norm": 1.5728967189788818, + "learning_rate": 5.933941521938318e-06, + "loss": 0.3647, + "step": 5508 + }, + { + "epoch": 0.44072, + "grad_norm": 1.8846977949142456, + "learning_rate": 5.932705983515176e-06, + "loss": 0.4741, + "step": 5509 + }, + { + "epoch": 0.4408, + "grad_norm": 1.6265419721603394, + "learning_rate": 5.931470386082847e-06, + "loss": 0.3013, + "step": 5510 + }, + { + "epoch": 0.44088, + "grad_norm": 1.5003933906555176, + "learning_rate": 5.930234729719504e-06, + "loss": 0.2989, + "step": 5511 + }, + { + "epoch": 0.44096, + "grad_norm": 1.764898657798767, + "learning_rate": 5.9289990145033226e-06, + "loss": 0.3484, + "step": 5512 + }, + { + "epoch": 0.44104, + "grad_norm": 1.7658780813217163, + "learning_rate": 5.927763240512482e-06, + "loss": 0.4016, + "step": 5513 + }, + { + "epoch": 0.44112, + "grad_norm": 1.515289545059204, + "learning_rate": 5.926527407825164e-06, + "loss": 0.3122, + "step": 5514 + }, + { + "epoch": 0.4412, + "grad_norm": 1.3999195098876953, + "learning_rate": 5.92529151651956e-06, + "loss": 0.4573, + "step": 5515 + }, + { + "epoch": 0.44128, + "grad_norm": 2.083272695541382, + "learning_rate": 5.924055566673855e-06, + "loss": 0.3821, + "step": 5516 + }, + { + "epoch": 0.44136, + "grad_norm": 1.596103310585022, + "learning_rate": 5.922819558366247e-06, + "loss": 0.4768, + "step": 5517 + }, + { + "epoch": 0.44144, + "grad_norm": 1.2892342805862427, + "learning_rate": 5.921583491674935e-06, + "loss": 0.2956, + "step": 5518 + }, + { + "epoch": 0.44152, + "grad_norm": 1.8127224445343018, + "learning_rate": 5.920347366678117e-06, + "loss": 0.3626, + "step": 5519 + }, + { + "epoch": 0.4416, + "grad_norm": 1.7415847778320312, + "learning_rate": 5.9191111834540006e-06, + "loss": 0.3858, + "step": 5520 + }, + { + "epoch": 0.44168, + "grad_norm": 1.5482152700424194, + "learning_rate": 5.917874942080796e-06, + "loss": 0.3114, + "step": 5521 + }, + { + "epoch": 0.44176, + "grad_norm": 1.4196959733963013, + "learning_rate": 5.916638642636714e-06, + "loss": 0.368, + "step": 5522 + }, + { + "epoch": 0.44184, + "grad_norm": 1.5246591567993164, + "learning_rate": 5.9154022851999725e-06, + "loss": 0.3554, + "step": 5523 + }, + { + "epoch": 0.44192, + "grad_norm": 1.4558786153793335, + "learning_rate": 5.914165869848793e-06, + "loss": 0.3025, + "step": 5524 + }, + { + "epoch": 0.442, + "grad_norm": 1.1743532419204712, + "learning_rate": 5.912929396661396e-06, + "loss": 0.2267, + "step": 5525 + }, + { + "epoch": 0.44208, + "grad_norm": 1.816625952720642, + "learning_rate": 5.911692865716011e-06, + "loss": 0.4033, + "step": 5526 + }, + { + "epoch": 0.44216, + "grad_norm": 1.3450242280960083, + "learning_rate": 5.910456277090869e-06, + "loss": 0.3432, + "step": 5527 + }, + { + "epoch": 0.44224, + "grad_norm": 1.6333425045013428, + "learning_rate": 5.909219630864204e-06, + "loss": 0.3124, + "step": 5528 + }, + { + "epoch": 0.44232, + "grad_norm": 1.9529705047607422, + "learning_rate": 5.907982927114257e-06, + "loss": 0.4574, + "step": 5529 + }, + { + "epoch": 0.4424, + "grad_norm": 1.9013431072235107, + "learning_rate": 5.906746165919267e-06, + "loss": 0.4198, + "step": 5530 + }, + { + "epoch": 0.44248, + "grad_norm": 1.8581068515777588, + "learning_rate": 5.905509347357481e-06, + "loss": 0.4414, + "step": 5531 + }, + { + "epoch": 0.44256, + "grad_norm": 1.7523456811904907, + "learning_rate": 5.904272471507148e-06, + "loss": 0.3963, + "step": 5532 + }, + { + "epoch": 0.44264, + "grad_norm": 1.8803638219833374, + "learning_rate": 5.903035538446524e-06, + "loss": 0.3608, + "step": 5533 + }, + { + "epoch": 0.44272, + "grad_norm": 1.630553126335144, + "learning_rate": 5.901798548253859e-06, + "loss": 0.3344, + "step": 5534 + }, + { + "epoch": 0.4428, + "grad_norm": 1.7994118928909302, + "learning_rate": 5.90056150100742e-06, + "loss": 0.4549, + "step": 5535 + }, + { + "epoch": 0.44288, + "grad_norm": 1.4933981895446777, + "learning_rate": 5.8993243967854685e-06, + "loss": 0.3576, + "step": 5536 + }, + { + "epoch": 0.44296, + "grad_norm": 1.6949869394302368, + "learning_rate": 5.898087235666271e-06, + "loss": 0.3546, + "step": 5537 + }, + { + "epoch": 0.44304, + "grad_norm": 1.6014376878738403, + "learning_rate": 5.8968500177281e-06, + "loss": 0.3171, + "step": 5538 + }, + { + "epoch": 0.44312, + "grad_norm": 1.835667371749878, + "learning_rate": 5.895612743049227e-06, + "loss": 0.509, + "step": 5539 + }, + { + "epoch": 0.4432, + "grad_norm": 1.977283000946045, + "learning_rate": 5.894375411707933e-06, + "loss": 0.4484, + "step": 5540 + }, + { + "epoch": 0.44328, + "grad_norm": 1.4197109937667847, + "learning_rate": 5.8931380237825e-06, + "loss": 0.2965, + "step": 5541 + }, + { + "epoch": 0.44336, + "grad_norm": 1.751121997833252, + "learning_rate": 5.891900579351213e-06, + "loss": 0.367, + "step": 5542 + }, + { + "epoch": 0.44344, + "grad_norm": 1.6477973461151123, + "learning_rate": 5.89066307849236e-06, + "loss": 0.3525, + "step": 5543 + }, + { + "epoch": 0.44352, + "grad_norm": 1.5611004829406738, + "learning_rate": 5.889425521284234e-06, + "loss": 0.3347, + "step": 5544 + }, + { + "epoch": 0.4436, + "grad_norm": 1.138242244720459, + "learning_rate": 5.888187907805132e-06, + "loss": 0.252, + "step": 5545 + }, + { + "epoch": 0.44368, + "grad_norm": 1.4641177654266357, + "learning_rate": 5.8869502381333525e-06, + "loss": 0.3063, + "step": 5546 + }, + { + "epoch": 0.44376, + "grad_norm": 1.6272233724594116, + "learning_rate": 5.8857125123472e-06, + "loss": 0.2902, + "step": 5547 + }, + { + "epoch": 0.44384, + "grad_norm": 1.4607881307601929, + "learning_rate": 5.88447473052498e-06, + "loss": 0.2488, + "step": 5548 + }, + { + "epoch": 0.44392, + "grad_norm": 1.5880279541015625, + "learning_rate": 5.883236892745003e-06, + "loss": 0.426, + "step": 5549 + }, + { + "epoch": 0.444, + "grad_norm": 1.4784557819366455, + "learning_rate": 5.881998999085583e-06, + "loss": 0.2709, + "step": 5550 + }, + { + "epoch": 0.44408, + "grad_norm": 1.4526431560516357, + "learning_rate": 5.880761049625038e-06, + "loss": 0.3239, + "step": 5551 + }, + { + "epoch": 0.44416, + "grad_norm": 1.838352918624878, + "learning_rate": 5.879523044441687e-06, + "loss": 0.3932, + "step": 5552 + }, + { + "epoch": 0.44424, + "grad_norm": 1.5688480138778687, + "learning_rate": 5.878284983613858e-06, + "loss": 0.2671, + "step": 5553 + }, + { + "epoch": 0.44432, + "grad_norm": 1.8419432640075684, + "learning_rate": 5.877046867219876e-06, + "loss": 0.3733, + "step": 5554 + }, + { + "epoch": 0.4444, + "grad_norm": 2.1512904167175293, + "learning_rate": 5.8758086953380725e-06, + "loss": 0.419, + "step": 5555 + }, + { + "epoch": 0.44448, + "grad_norm": 1.3534338474273682, + "learning_rate": 5.874570468046784e-06, + "loss": 0.3388, + "step": 5556 + }, + { + "epoch": 0.44456, + "grad_norm": 2.1905033588409424, + "learning_rate": 5.873332185424348e-06, + "loss": 0.4197, + "step": 5557 + }, + { + "epoch": 0.44464, + "grad_norm": 1.410273790359497, + "learning_rate": 5.872093847549106e-06, + "loss": 0.3875, + "step": 5558 + }, + { + "epoch": 0.44472, + "grad_norm": 1.7642848491668701, + "learning_rate": 5.870855454499407e-06, + "loss": 0.4297, + "step": 5559 + }, + { + "epoch": 0.4448, + "grad_norm": 1.5598673820495605, + "learning_rate": 5.869617006353596e-06, + "loss": 0.3774, + "step": 5560 + }, + { + "epoch": 0.44488, + "grad_norm": 1.347758412361145, + "learning_rate": 5.868378503190027e-06, + "loss": 0.2839, + "step": 5561 + }, + { + "epoch": 0.44496, + "grad_norm": 1.6148842573165894, + "learning_rate": 5.8671399450870535e-06, + "loss": 0.3108, + "step": 5562 + }, + { + "epoch": 0.44504, + "grad_norm": 1.48881196975708, + "learning_rate": 5.8659013321230385e-06, + "loss": 0.3588, + "step": 5563 + }, + { + "epoch": 0.44512, + "grad_norm": 1.3120616674423218, + "learning_rate": 5.8646626643763435e-06, + "loss": 0.2724, + "step": 5564 + }, + { + "epoch": 0.4452, + "grad_norm": 1.1921863555908203, + "learning_rate": 5.863423941925337e-06, + "loss": 0.2622, + "step": 5565 + }, + { + "epoch": 0.44528, + "grad_norm": 1.8236253261566162, + "learning_rate": 5.862185164848384e-06, + "loss": 0.3556, + "step": 5566 + }, + { + "epoch": 0.44536, + "grad_norm": 1.6157597303390503, + "learning_rate": 5.860946333223862e-06, + "loss": 0.369, + "step": 5567 + }, + { + "epoch": 0.44544, + "grad_norm": 1.5578926801681519, + "learning_rate": 5.859707447130144e-06, + "loss": 0.3256, + "step": 5568 + }, + { + "epoch": 0.44552, + "grad_norm": 1.4830211400985718, + "learning_rate": 5.858468506645613e-06, + "loss": 0.3329, + "step": 5569 + }, + { + "epoch": 0.4456, + "grad_norm": 1.290466547012329, + "learning_rate": 5.857229511848655e-06, + "loss": 0.2373, + "step": 5570 + }, + { + "epoch": 0.44568, + "grad_norm": 1.6431125402450562, + "learning_rate": 5.855990462817651e-06, + "loss": 0.3732, + "step": 5571 + }, + { + "epoch": 0.44576, + "grad_norm": 1.8566932678222656, + "learning_rate": 5.854751359630997e-06, + "loss": 0.4304, + "step": 5572 + }, + { + "epoch": 0.44584, + "grad_norm": 1.466186285018921, + "learning_rate": 5.853512202367083e-06, + "loss": 0.328, + "step": 5573 + }, + { + "epoch": 0.44592, + "grad_norm": 1.5933393239974976, + "learning_rate": 5.852272991104308e-06, + "loss": 0.3883, + "step": 5574 + }, + { + "epoch": 0.446, + "grad_norm": 1.850595474243164, + "learning_rate": 5.851033725921073e-06, + "loss": 0.486, + "step": 5575 + }, + { + "epoch": 0.44608, + "grad_norm": 1.4042195081710815, + "learning_rate": 5.84979440689578e-06, + "loss": 0.3212, + "step": 5576 + }, + { + "epoch": 0.44616, + "grad_norm": 1.733971118927002, + "learning_rate": 5.848555034106841e-06, + "loss": 0.3999, + "step": 5577 + }, + { + "epoch": 0.44624, + "grad_norm": 1.5726109743118286, + "learning_rate": 5.847315607632662e-06, + "loss": 0.3346, + "step": 5578 + }, + { + "epoch": 0.44632, + "grad_norm": 1.2969645261764526, + "learning_rate": 5.846076127551661e-06, + "loss": 0.2933, + "step": 5579 + }, + { + "epoch": 0.4464, + "grad_norm": 1.2956920862197876, + "learning_rate": 5.8448365939422534e-06, + "loss": 0.275, + "step": 5580 + }, + { + "epoch": 0.44648, + "grad_norm": 1.5254887342453003, + "learning_rate": 5.8435970068828605e-06, + "loss": 0.3264, + "step": 5581 + }, + { + "epoch": 0.44656, + "grad_norm": 1.3373383283615112, + "learning_rate": 5.842357366451911e-06, + "loss": 0.2972, + "step": 5582 + }, + { + "epoch": 0.44664, + "grad_norm": 1.5913358926773071, + "learning_rate": 5.841117672727827e-06, + "loss": 0.5288, + "step": 5583 + }, + { + "epoch": 0.44672, + "grad_norm": 1.6409987211227417, + "learning_rate": 5.839877925789043e-06, + "loss": 0.3628, + "step": 5584 + }, + { + "epoch": 0.4468, + "grad_norm": 1.7899576425552368, + "learning_rate": 5.8386381257139925e-06, + "loss": 0.3251, + "step": 5585 + }, + { + "epoch": 0.44688, + "grad_norm": 1.5527466535568237, + "learning_rate": 5.837398272581114e-06, + "loss": 0.3213, + "step": 5586 + }, + { + "epoch": 0.44696, + "grad_norm": 1.662186622619629, + "learning_rate": 5.836158366468848e-06, + "loss": 0.3146, + "step": 5587 + }, + { + "epoch": 0.44704, + "grad_norm": 1.686964750289917, + "learning_rate": 5.8349184074556396e-06, + "loss": 0.3395, + "step": 5588 + }, + { + "epoch": 0.44712, + "grad_norm": 1.8355437517166138, + "learning_rate": 5.833678395619939e-06, + "loss": 0.449, + "step": 5589 + }, + { + "epoch": 0.4472, + "grad_norm": 1.7132534980773926, + "learning_rate": 5.832438331040196e-06, + "loss": 0.3185, + "step": 5590 + }, + { + "epoch": 0.44728, + "grad_norm": 1.7874171733856201, + "learning_rate": 5.831198213794863e-06, + "loss": 0.4094, + "step": 5591 + }, + { + "epoch": 0.44736, + "grad_norm": 1.4985543489456177, + "learning_rate": 5.829958043962402e-06, + "loss": 0.4014, + "step": 5592 + }, + { + "epoch": 0.44744, + "grad_norm": 1.901485562324524, + "learning_rate": 5.828717821621272e-06, + "loss": 0.3675, + "step": 5593 + }, + { + "epoch": 0.44752, + "grad_norm": 1.6370835304260254, + "learning_rate": 5.827477546849938e-06, + "loss": 0.3236, + "step": 5594 + }, + { + "epoch": 0.4476, + "grad_norm": 1.7742931842803955, + "learning_rate": 5.826237219726869e-06, + "loss": 0.4238, + "step": 5595 + }, + { + "epoch": 0.44768, + "grad_norm": 1.4299365282058716, + "learning_rate": 5.824996840330536e-06, + "loss": 0.3525, + "step": 5596 + }, + { + "epoch": 0.44776, + "grad_norm": 1.4459290504455566, + "learning_rate": 5.823756408739412e-06, + "loss": 0.2749, + "step": 5597 + }, + { + "epoch": 0.44784, + "grad_norm": 1.5706748962402344, + "learning_rate": 5.822515925031977e-06, + "loss": 0.3489, + "step": 5598 + }, + { + "epoch": 0.44792, + "grad_norm": 0.996229350566864, + "learning_rate": 5.821275389286711e-06, + "loss": 0.2767, + "step": 5599 + }, + { + "epoch": 0.448, + "grad_norm": 1.8221417665481567, + "learning_rate": 5.820034801582101e-06, + "loss": 0.3905, + "step": 5600 + }, + { + "epoch": 0.44808, + "grad_norm": 1.9977222681045532, + "learning_rate": 5.818794161996631e-06, + "loss": 0.3864, + "step": 5601 + }, + { + "epoch": 0.44816, + "grad_norm": 2.152876615524292, + "learning_rate": 5.817553470608795e-06, + "loss": 0.5186, + "step": 5602 + }, + { + "epoch": 0.44824, + "grad_norm": 1.5512678623199463, + "learning_rate": 5.816312727497085e-06, + "loss": 0.2871, + "step": 5603 + }, + { + "epoch": 0.44832, + "grad_norm": 1.9361984729766846, + "learning_rate": 5.815071932740002e-06, + "loss": 0.3828, + "step": 5604 + }, + { + "epoch": 0.4484, + "grad_norm": 1.641486644744873, + "learning_rate": 5.813831086416044e-06, + "loss": 0.4174, + "step": 5605 + }, + { + "epoch": 0.44848, + "grad_norm": 1.4502854347229004, + "learning_rate": 5.812590188603718e-06, + "loss": 0.2888, + "step": 5606 + }, + { + "epoch": 0.44856, + "grad_norm": 1.4937154054641724, + "learning_rate": 5.811349239381528e-06, + "loss": 0.3603, + "step": 5607 + }, + { + "epoch": 0.44864, + "grad_norm": 1.1726678609848022, + "learning_rate": 5.810108238827986e-06, + "loss": 0.2616, + "step": 5608 + }, + { + "epoch": 0.44872, + "grad_norm": 1.500806212425232, + "learning_rate": 5.808867187021607e-06, + "loss": 0.2688, + "step": 5609 + }, + { + "epoch": 0.4488, + "grad_norm": 1.740702509880066, + "learning_rate": 5.8076260840409086e-06, + "loss": 0.4273, + "step": 5610 + }, + { + "epoch": 0.44888, + "grad_norm": 1.5013270378112793, + "learning_rate": 5.806384929964408e-06, + "loss": 0.3114, + "step": 5611 + }, + { + "epoch": 0.44896, + "grad_norm": 1.7903553247451782, + "learning_rate": 5.805143724870633e-06, + "loss": 0.3147, + "step": 5612 + }, + { + "epoch": 0.44904, + "grad_norm": 1.476843237876892, + "learning_rate": 5.8039024688381074e-06, + "loss": 0.3131, + "step": 5613 + }, + { + "epoch": 0.44912, + "grad_norm": 1.402813196182251, + "learning_rate": 5.802661161945363e-06, + "loss": 0.303, + "step": 5614 + }, + { + "epoch": 0.4492, + "grad_norm": 1.5428776741027832, + "learning_rate": 5.801419804270932e-06, + "loss": 0.3801, + "step": 5615 + }, + { + "epoch": 0.44928, + "grad_norm": 1.5787391662597656, + "learning_rate": 5.800178395893353e-06, + "loss": 0.3183, + "step": 5616 + }, + { + "epoch": 0.44936, + "grad_norm": 1.7391003370285034, + "learning_rate": 5.798936936891163e-06, + "loss": 0.3259, + "step": 5617 + }, + { + "epoch": 0.44944, + "grad_norm": 1.6685212850570679, + "learning_rate": 5.797695427342908e-06, + "loss": 0.3274, + "step": 5618 + }, + { + "epoch": 0.44952, + "grad_norm": 1.4090831279754639, + "learning_rate": 5.79645386732713e-06, + "loss": 0.3116, + "step": 5619 + }, + { + "epoch": 0.4496, + "grad_norm": 1.5553460121154785, + "learning_rate": 5.795212256922382e-06, + "loss": 0.3605, + "step": 5620 + }, + { + "epoch": 0.44968, + "grad_norm": 1.7685738801956177, + "learning_rate": 5.793970596207214e-06, + "loss": 0.3015, + "step": 5621 + }, + { + "epoch": 0.44976, + "grad_norm": 1.7890232801437378, + "learning_rate": 5.792728885260184e-06, + "loss": 0.3894, + "step": 5622 + }, + { + "epoch": 0.44984, + "grad_norm": 2.025926113128662, + "learning_rate": 5.79148712415985e-06, + "loss": 0.3836, + "step": 5623 + }, + { + "epoch": 0.44992, + "grad_norm": 1.6190468072891235, + "learning_rate": 5.790245312984775e-06, + "loss": 0.3422, + "step": 5624 + }, + { + "epoch": 0.45, + "grad_norm": 1.7557851076126099, + "learning_rate": 5.789003451813522e-06, + "loss": 0.3468, + "step": 5625 + }, + { + "epoch": 0.45008, + "grad_norm": 1.489891529083252, + "learning_rate": 5.78776154072466e-06, + "loss": 0.3636, + "step": 5626 + }, + { + "epoch": 0.45016, + "grad_norm": 1.5357917547225952, + "learning_rate": 5.786519579796764e-06, + "loss": 0.3056, + "step": 5627 + }, + { + "epoch": 0.45024, + "grad_norm": 1.792183756828308, + "learning_rate": 5.785277569108403e-06, + "loss": 0.3856, + "step": 5628 + }, + { + "epoch": 0.45032, + "grad_norm": 1.6786377429962158, + "learning_rate": 5.7840355087381575e-06, + "loss": 0.4319, + "step": 5629 + }, + { + "epoch": 0.4504, + "grad_norm": 1.8204642534255981, + "learning_rate": 5.7827933987646115e-06, + "loss": 0.3844, + "step": 5630 + }, + { + "epoch": 0.45048, + "grad_norm": 1.8311585187911987, + "learning_rate": 5.781551239266344e-06, + "loss": 0.5054, + "step": 5631 + }, + { + "epoch": 0.45056, + "grad_norm": 1.9697678089141846, + "learning_rate": 5.780309030321945e-06, + "loss": 0.359, + "step": 5632 + }, + { + "epoch": 0.45064, + "grad_norm": 1.9044475555419922, + "learning_rate": 5.779066772010005e-06, + "loss": 0.4049, + "step": 5633 + }, + { + "epoch": 0.45072, + "grad_norm": 1.8143421411514282, + "learning_rate": 5.777824464409117e-06, + "loss": 0.4335, + "step": 5634 + }, + { + "epoch": 0.4508, + "grad_norm": 1.4178447723388672, + "learning_rate": 5.776582107597877e-06, + "loss": 0.3715, + "step": 5635 + }, + { + "epoch": 0.45088, + "grad_norm": 1.598649024963379, + "learning_rate": 5.775339701654887e-06, + "loss": 0.3177, + "step": 5636 + }, + { + "epoch": 0.45096, + "grad_norm": 1.538147211074829, + "learning_rate": 5.7740972466587476e-06, + "loss": 0.3332, + "step": 5637 + }, + { + "epoch": 0.45104, + "grad_norm": 1.4716747999191284, + "learning_rate": 5.772854742688066e-06, + "loss": 0.4715, + "step": 5638 + }, + { + "epoch": 0.45112, + "grad_norm": 1.5471874475479126, + "learning_rate": 5.771612189821451e-06, + "loss": 0.3747, + "step": 5639 + }, + { + "epoch": 0.4512, + "grad_norm": 1.5069724321365356, + "learning_rate": 5.770369588137513e-06, + "loss": 0.2939, + "step": 5640 + }, + { + "epoch": 0.45128, + "grad_norm": 1.0864536762237549, + "learning_rate": 5.76912693771487e-06, + "loss": 0.215, + "step": 5641 + }, + { + "epoch": 0.45136, + "grad_norm": 1.2346502542495728, + "learning_rate": 5.76788423863214e-06, + "loss": 0.2422, + "step": 5642 + }, + { + "epoch": 0.45144, + "grad_norm": 1.443285346031189, + "learning_rate": 5.766641490967942e-06, + "loss": 0.26, + "step": 5643 + }, + { + "epoch": 0.45152, + "grad_norm": 1.8851925134658813, + "learning_rate": 5.765398694800902e-06, + "loss": 0.4111, + "step": 5644 + }, + { + "epoch": 0.4516, + "grad_norm": 1.4499890804290771, + "learning_rate": 5.764155850209649e-06, + "loss": 0.2916, + "step": 5645 + }, + { + "epoch": 0.45168, + "grad_norm": 1.421256422996521, + "learning_rate": 5.7629129572728105e-06, + "loss": 0.3533, + "step": 5646 + }, + { + "epoch": 0.45176, + "grad_norm": 1.4521507024765015, + "learning_rate": 5.761670016069025e-06, + "loss": 0.3407, + "step": 5647 + }, + { + "epoch": 0.45184, + "grad_norm": 1.6597949266433716, + "learning_rate": 5.760427026676923e-06, + "loss": 0.3523, + "step": 5648 + }, + { + "epoch": 0.45192, + "grad_norm": 1.5322909355163574, + "learning_rate": 5.759183989175148e-06, + "loss": 0.2657, + "step": 5649 + }, + { + "epoch": 0.452, + "grad_norm": 1.5093351602554321, + "learning_rate": 5.7579409036423426e-06, + "loss": 0.3397, + "step": 5650 + }, + { + "epoch": 0.45208, + "grad_norm": 1.5804753303527832, + "learning_rate": 5.756697770157152e-06, + "loss": 0.3578, + "step": 5651 + }, + { + "epoch": 0.45216, + "grad_norm": 1.694562315940857, + "learning_rate": 5.755454588798226e-06, + "loss": 0.3719, + "step": 5652 + }, + { + "epoch": 0.45224, + "grad_norm": 1.9576027393341064, + "learning_rate": 5.754211359644217e-06, + "loss": 0.3491, + "step": 5653 + }, + { + "epoch": 0.45232, + "grad_norm": 1.5842691659927368, + "learning_rate": 5.752968082773778e-06, + "loss": 0.3311, + "step": 5654 + }, + { + "epoch": 0.4524, + "grad_norm": 1.2589999437332153, + "learning_rate": 5.751724758265567e-06, + "loss": 0.2414, + "step": 5655 + }, + { + "epoch": 0.45248, + "grad_norm": 2.144315719604492, + "learning_rate": 5.750481386198246e-06, + "loss": 0.4948, + "step": 5656 + }, + { + "epoch": 0.45256, + "grad_norm": 1.865082859992981, + "learning_rate": 5.749237966650478e-06, + "loss": 0.4131, + "step": 5657 + }, + { + "epoch": 0.45264, + "grad_norm": 1.8475942611694336, + "learning_rate": 5.747994499700932e-06, + "loss": 0.3729, + "step": 5658 + }, + { + "epoch": 0.45272, + "grad_norm": 1.8844106197357178, + "learning_rate": 5.746750985428278e-06, + "loss": 0.5364, + "step": 5659 + }, + { + "epoch": 0.4528, + "grad_norm": 1.6912811994552612, + "learning_rate": 5.745507423911185e-06, + "loss": 0.3392, + "step": 5660 + }, + { + "epoch": 0.45288, + "grad_norm": 1.4193228483200073, + "learning_rate": 5.744263815228334e-06, + "loss": 0.3969, + "step": 5661 + }, + { + "epoch": 0.45296, + "grad_norm": 1.7651104927062988, + "learning_rate": 5.743020159458401e-06, + "loss": 0.4356, + "step": 5662 + }, + { + "epoch": 0.45304, + "grad_norm": 1.2144150733947754, + "learning_rate": 5.741776456680068e-06, + "loss": 0.2408, + "step": 5663 + }, + { + "epoch": 0.45312, + "grad_norm": 1.4597735404968262, + "learning_rate": 5.740532706972022e-06, + "loss": 0.3017, + "step": 5664 + }, + { + "epoch": 0.4532, + "grad_norm": 1.3759361505508423, + "learning_rate": 5.739288910412949e-06, + "loss": 0.2679, + "step": 5665 + }, + { + "epoch": 0.45328, + "grad_norm": 1.6209099292755127, + "learning_rate": 5.738045067081539e-06, + "loss": 0.2948, + "step": 5666 + }, + { + "epoch": 0.45336, + "grad_norm": 1.576300859451294, + "learning_rate": 5.736801177056488e-06, + "loss": 0.3133, + "step": 5667 + }, + { + "epoch": 0.45344, + "grad_norm": 1.6057082414627075, + "learning_rate": 5.735557240416492e-06, + "loss": 0.3194, + "step": 5668 + }, + { + "epoch": 0.45352, + "grad_norm": 1.371180772781372, + "learning_rate": 5.73431325724025e-06, + "loss": 0.2877, + "step": 5669 + }, + { + "epoch": 0.4536, + "grad_norm": 1.5031378269195557, + "learning_rate": 5.733069227606466e-06, + "loss": 0.3562, + "step": 5670 + }, + { + "epoch": 0.45368, + "grad_norm": 2.041217803955078, + "learning_rate": 5.731825151593845e-06, + "loss": 0.4971, + "step": 5671 + }, + { + "epoch": 0.45376, + "grad_norm": 1.57797372341156, + "learning_rate": 5.730581029281095e-06, + "loss": 0.3594, + "step": 5672 + }, + { + "epoch": 0.45384, + "grad_norm": 1.516638159751892, + "learning_rate": 5.729336860746928e-06, + "loss": 0.3172, + "step": 5673 + }, + { + "epoch": 0.45392, + "grad_norm": 1.504821538925171, + "learning_rate": 5.728092646070058e-06, + "loss": 0.3502, + "step": 5674 + }, + { + "epoch": 0.454, + "grad_norm": 1.4294888973236084, + "learning_rate": 5.726848385329202e-06, + "loss": 0.3397, + "step": 5675 + }, + { + "epoch": 0.45408, + "grad_norm": 1.1908982992172241, + "learning_rate": 5.725604078603081e-06, + "loss": 0.2893, + "step": 5676 + }, + { + "epoch": 0.45416, + "grad_norm": 1.6021041870117188, + "learning_rate": 5.724359725970419e-06, + "loss": 0.3478, + "step": 5677 + }, + { + "epoch": 0.45424, + "grad_norm": 1.542633056640625, + "learning_rate": 5.72311532750994e-06, + "loss": 0.3476, + "step": 5678 + }, + { + "epoch": 0.45432, + "grad_norm": 1.6992759704589844, + "learning_rate": 5.721870883300374e-06, + "loss": 0.3492, + "step": 5679 + }, + { + "epoch": 0.4544, + "grad_norm": 1.4652698040008545, + "learning_rate": 5.720626393420451e-06, + "loss": 0.3238, + "step": 5680 + }, + { + "epoch": 0.45448, + "grad_norm": 1.3540560007095337, + "learning_rate": 5.719381857948908e-06, + "loss": 0.3257, + "step": 5681 + }, + { + "epoch": 0.45456, + "grad_norm": 1.3179233074188232, + "learning_rate": 5.718137276964481e-06, + "loss": 0.3374, + "step": 5682 + }, + { + "epoch": 0.45464, + "grad_norm": 1.2918938398361206, + "learning_rate": 5.716892650545914e-06, + "loss": 0.2643, + "step": 5683 + }, + { + "epoch": 0.45472, + "grad_norm": 1.5984219312667847, + "learning_rate": 5.715647978771946e-06, + "loss": 0.3463, + "step": 5684 + }, + { + "epoch": 0.4548, + "grad_norm": 1.7894304990768433, + "learning_rate": 5.714403261721327e-06, + "loss": 0.4156, + "step": 5685 + }, + { + "epoch": 0.45488, + "grad_norm": 1.436471939086914, + "learning_rate": 5.713158499472802e-06, + "loss": 0.3837, + "step": 5686 + }, + { + "epoch": 0.45496, + "grad_norm": 1.6427680253982544, + "learning_rate": 5.711913692105126e-06, + "loss": 0.3569, + "step": 5687 + }, + { + "epoch": 0.45504, + "grad_norm": 1.8029788732528687, + "learning_rate": 5.710668839697051e-06, + "loss": 0.3681, + "step": 5688 + }, + { + "epoch": 0.45512, + "grad_norm": 1.501164197921753, + "learning_rate": 5.709423942327339e-06, + "loss": 0.3378, + "step": 5689 + }, + { + "epoch": 0.4552, + "grad_norm": 1.6958562135696411, + "learning_rate": 5.708179000074746e-06, + "loss": 0.3265, + "step": 5690 + }, + { + "epoch": 0.45528, + "grad_norm": 1.5554603338241577, + "learning_rate": 5.7069340130180375e-06, + "loss": 0.3386, + "step": 5691 + }, + { + "epoch": 0.45536, + "grad_norm": 1.462551474571228, + "learning_rate": 5.705688981235979e-06, + "loss": 0.2857, + "step": 5692 + }, + { + "epoch": 0.45544, + "grad_norm": 1.7861984968185425, + "learning_rate": 5.704443904807341e-06, + "loss": 0.426, + "step": 5693 + }, + { + "epoch": 0.45552, + "grad_norm": 2.0433483123779297, + "learning_rate": 5.7031987838108945e-06, + "loss": 0.3355, + "step": 5694 + }, + { + "epoch": 0.4556, + "grad_norm": 1.1957581043243408, + "learning_rate": 5.701953618325413e-06, + "loss": 0.2442, + "step": 5695 + }, + { + "epoch": 0.45568, + "grad_norm": 1.7323002815246582, + "learning_rate": 5.700708408429676e-06, + "loss": 0.3109, + "step": 5696 + }, + { + "epoch": 0.45576, + "grad_norm": 1.2197766304016113, + "learning_rate": 5.699463154202461e-06, + "loss": 0.3155, + "step": 5697 + }, + { + "epoch": 0.45584, + "grad_norm": 1.9804476499557495, + "learning_rate": 5.698217855722553e-06, + "loss": 0.5005, + "step": 5698 + }, + { + "epoch": 0.45592, + "grad_norm": 1.695799708366394, + "learning_rate": 5.696972513068738e-06, + "loss": 0.3383, + "step": 5699 + }, + { + "epoch": 0.456, + "grad_norm": 1.70815110206604, + "learning_rate": 5.695727126319805e-06, + "loss": 0.4038, + "step": 5700 + }, + { + "epoch": 0.45608, + "grad_norm": 1.5041000843048096, + "learning_rate": 5.694481695554542e-06, + "loss": 0.3908, + "step": 5701 + }, + { + "epoch": 0.45616, + "grad_norm": 1.1593267917633057, + "learning_rate": 5.693236220851748e-06, + "loss": 0.2464, + "step": 5702 + }, + { + "epoch": 0.45624, + "grad_norm": 1.5389819145202637, + "learning_rate": 5.691990702290217e-06, + "loss": 0.3828, + "step": 5703 + }, + { + "epoch": 0.45632, + "grad_norm": 1.3315058946609497, + "learning_rate": 5.69074513994875e-06, + "loss": 0.3166, + "step": 5704 + }, + { + "epoch": 0.4564, + "grad_norm": 1.4812344312667847, + "learning_rate": 5.6894995339061484e-06, + "loss": 0.294, + "step": 5705 + }, + { + "epoch": 0.45648, + "grad_norm": 2.301971197128296, + "learning_rate": 5.688253884241221e-06, + "loss": 0.4218, + "step": 5706 + }, + { + "epoch": 0.45656, + "grad_norm": 1.6880017518997192, + "learning_rate": 5.687008191032771e-06, + "loss": 0.3879, + "step": 5707 + }, + { + "epoch": 0.45664, + "grad_norm": 2.1264901161193848, + "learning_rate": 5.685762454359612e-06, + "loss": 0.4719, + "step": 5708 + }, + { + "epoch": 0.45672, + "grad_norm": 1.2993032932281494, + "learning_rate": 5.684516674300557e-06, + "loss": 0.2644, + "step": 5709 + }, + { + "epoch": 0.4568, + "grad_norm": 1.6621084213256836, + "learning_rate": 5.6832708509344215e-06, + "loss": 0.3838, + "step": 5710 + }, + { + "epoch": 0.45688, + "grad_norm": 1.6229028701782227, + "learning_rate": 5.682024984340027e-06, + "loss": 0.2956, + "step": 5711 + }, + { + "epoch": 0.45696, + "grad_norm": 1.6341001987457275, + "learning_rate": 5.6807790745961935e-06, + "loss": 0.4009, + "step": 5712 + }, + { + "epoch": 0.45704, + "grad_norm": 1.3154834508895874, + "learning_rate": 5.679533121781745e-06, + "loss": 0.2954, + "step": 5713 + }, + { + "epoch": 0.45712, + "grad_norm": 1.2968782186508179, + "learning_rate": 5.67828712597551e-06, + "loss": 0.2647, + "step": 5714 + }, + { + "epoch": 0.4572, + "grad_norm": 1.9382835626602173, + "learning_rate": 5.677041087256319e-06, + "loss": 0.4006, + "step": 5715 + }, + { + "epoch": 0.45728, + "grad_norm": 1.3526051044464111, + "learning_rate": 5.675795005703002e-06, + "loss": 0.3288, + "step": 5716 + }, + { + "epoch": 0.45736, + "grad_norm": 1.6821001768112183, + "learning_rate": 5.674548881394398e-06, + "loss": 0.3787, + "step": 5717 + }, + { + "epoch": 0.45744, + "grad_norm": 1.6728947162628174, + "learning_rate": 5.673302714409342e-06, + "loss": 0.3524, + "step": 5718 + }, + { + "epoch": 0.45752, + "grad_norm": 1.9203122854232788, + "learning_rate": 5.672056504826677e-06, + "loss": 0.4147, + "step": 5719 + }, + { + "epoch": 0.4576, + "grad_norm": 1.767820954322815, + "learning_rate": 5.670810252725246e-06, + "loss": 0.5169, + "step": 5720 + }, + { + "epoch": 0.45768, + "grad_norm": 1.085750937461853, + "learning_rate": 5.669563958183893e-06, + "loss": 0.2131, + "step": 5721 + }, + { + "epoch": 0.45776, + "grad_norm": 2.1359403133392334, + "learning_rate": 5.668317621281471e-06, + "loss": 0.4206, + "step": 5722 + }, + { + "epoch": 0.45784, + "grad_norm": 1.5051878690719604, + "learning_rate": 5.667071242096828e-06, + "loss": 0.2954, + "step": 5723 + }, + { + "epoch": 0.45792, + "grad_norm": 1.2976740598678589, + "learning_rate": 5.66582482070882e-06, + "loss": 0.284, + "step": 5724 + }, + { + "epoch": 0.458, + "grad_norm": 1.684295415878296, + "learning_rate": 5.664578357196303e-06, + "loss": 0.3567, + "step": 5725 + }, + { + "epoch": 0.45808, + "grad_norm": 1.7063944339752197, + "learning_rate": 5.663331851638136e-06, + "loss": 0.4559, + "step": 5726 + }, + { + "epoch": 0.45816, + "grad_norm": 1.91475510597229, + "learning_rate": 5.662085304113184e-06, + "loss": 0.4579, + "step": 5727 + }, + { + "epoch": 0.45824, + "grad_norm": 1.6786874532699585, + "learning_rate": 5.66083871470031e-06, + "loss": 0.3606, + "step": 5728 + }, + { + "epoch": 0.45832, + "grad_norm": 2.066380023956299, + "learning_rate": 5.6595920834783815e-06, + "loss": 0.3455, + "step": 5729 + }, + { + "epoch": 0.4584, + "grad_norm": 1.214521884918213, + "learning_rate": 5.658345410526269e-06, + "loss": 0.2417, + "step": 5730 + }, + { + "epoch": 0.45848, + "grad_norm": 1.282861351966858, + "learning_rate": 5.657098695922845e-06, + "loss": 0.3091, + "step": 5731 + }, + { + "epoch": 0.45856, + "grad_norm": 1.3990386724472046, + "learning_rate": 5.655851939746985e-06, + "loss": 0.2745, + "step": 5732 + }, + { + "epoch": 0.45864, + "grad_norm": 2.0859076976776123, + "learning_rate": 5.654605142077567e-06, + "loss": 0.3879, + "step": 5733 + }, + { + "epoch": 0.45872, + "grad_norm": 1.3720123767852783, + "learning_rate": 5.653358302993473e-06, + "loss": 0.2961, + "step": 5734 + }, + { + "epoch": 0.4588, + "grad_norm": 1.6038662195205688, + "learning_rate": 5.652111422573584e-06, + "loss": 0.3159, + "step": 5735 + }, + { + "epoch": 0.45888, + "grad_norm": 1.583141565322876, + "learning_rate": 5.6508645008967885e-06, + "loss": 0.3831, + "step": 5736 + }, + { + "epoch": 0.45896, + "grad_norm": 1.398667573928833, + "learning_rate": 5.649617538041973e-06, + "loss": 0.3121, + "step": 5737 + }, + { + "epoch": 0.45904, + "grad_norm": 1.727538824081421, + "learning_rate": 5.6483705340880305e-06, + "loss": 0.3834, + "step": 5738 + }, + { + "epoch": 0.45912, + "grad_norm": 1.9524167776107788, + "learning_rate": 5.647123489113852e-06, + "loss": 0.4467, + "step": 5739 + }, + { + "epoch": 0.4592, + "grad_norm": 1.4175621271133423, + "learning_rate": 5.645876403198337e-06, + "loss": 0.3502, + "step": 5740 + }, + { + "epoch": 0.45928, + "grad_norm": 1.418363094329834, + "learning_rate": 5.6446292764203825e-06, + "loss": 0.3052, + "step": 5741 + }, + { + "epoch": 0.45936, + "grad_norm": 1.6092485189437866, + "learning_rate": 5.643382108858891e-06, + "loss": 0.3851, + "step": 5742 + }, + { + "epoch": 0.45944, + "grad_norm": 1.4313188791275024, + "learning_rate": 5.642134900592766e-06, + "loss": 0.3024, + "step": 5743 + }, + { + "epoch": 0.45952, + "grad_norm": 1.4228692054748535, + "learning_rate": 5.6408876517009145e-06, + "loss": 0.2845, + "step": 5744 + }, + { + "epoch": 0.4596, + "grad_norm": 1.565037488937378, + "learning_rate": 5.6396403622622455e-06, + "loss": 0.3583, + "step": 5745 + }, + { + "epoch": 0.45968, + "grad_norm": 1.750694751739502, + "learning_rate": 5.638393032355671e-06, + "loss": 0.3198, + "step": 5746 + }, + { + "epoch": 0.45976, + "grad_norm": 1.5013595819473267, + "learning_rate": 5.637145662060106e-06, + "loss": 0.3254, + "step": 5747 + }, + { + "epoch": 0.45984, + "grad_norm": 1.337208867073059, + "learning_rate": 5.635898251454467e-06, + "loss": 0.2618, + "step": 5748 + }, + { + "epoch": 0.45992, + "grad_norm": 1.4910343885421753, + "learning_rate": 5.634650800617672e-06, + "loss": 0.2893, + "step": 5749 + }, + { + "epoch": 0.46, + "grad_norm": 1.4709289073944092, + "learning_rate": 5.633403309628645e-06, + "loss": 0.3244, + "step": 5750 + }, + { + "epoch": 0.46008, + "grad_norm": 1.6060997247695923, + "learning_rate": 5.6321557785663105e-06, + "loss": 0.3631, + "step": 5751 + }, + { + "epoch": 0.46016, + "grad_norm": 1.4144808053970337, + "learning_rate": 5.630908207509596e-06, + "loss": 0.3068, + "step": 5752 + }, + { + "epoch": 0.46024, + "grad_norm": 1.6381995677947998, + "learning_rate": 5.62966059653743e-06, + "loss": 0.3946, + "step": 5753 + }, + { + "epoch": 0.46032, + "grad_norm": 1.3315595388412476, + "learning_rate": 5.628412945728743e-06, + "loss": 0.2966, + "step": 5754 + }, + { + "epoch": 0.4604, + "grad_norm": 1.3688490390777588, + "learning_rate": 5.627165255162472e-06, + "loss": 0.2812, + "step": 5755 + }, + { + "epoch": 0.46048, + "grad_norm": 1.2457916736602783, + "learning_rate": 5.625917524917555e-06, + "loss": 0.316, + "step": 5756 + }, + { + "epoch": 0.46056, + "grad_norm": 1.8323296308517456, + "learning_rate": 5.624669755072929e-06, + "loss": 0.426, + "step": 5757 + }, + { + "epoch": 0.46064, + "grad_norm": 1.5934995412826538, + "learning_rate": 5.623421945707538e-06, + "loss": 0.3284, + "step": 5758 + }, + { + "epoch": 0.46072, + "grad_norm": 1.4549036026000977, + "learning_rate": 5.622174096900328e-06, + "loss": 0.3153, + "step": 5759 + }, + { + "epoch": 0.4608, + "grad_norm": 1.9752590656280518, + "learning_rate": 5.620926208730244e-06, + "loss": 0.3554, + "step": 5760 + }, + { + "epoch": 0.46088, + "grad_norm": 1.4583925008773804, + "learning_rate": 5.619678281276235e-06, + "loss": 0.2659, + "step": 5761 + }, + { + "epoch": 0.46096, + "grad_norm": 1.7075728178024292, + "learning_rate": 5.618430314617256e-06, + "loss": 0.3486, + "step": 5762 + }, + { + "epoch": 0.46104, + "grad_norm": 1.5881019830703735, + "learning_rate": 5.617182308832261e-06, + "loss": 0.3726, + "step": 5763 + }, + { + "epoch": 0.46112, + "grad_norm": 1.6844027042388916, + "learning_rate": 5.615934264000205e-06, + "loss": 0.2897, + "step": 5764 + }, + { + "epoch": 0.4612, + "grad_norm": 1.464428186416626, + "learning_rate": 5.614686180200051e-06, + "loss": 0.2831, + "step": 5765 + }, + { + "epoch": 0.46128, + "grad_norm": 1.572912573814392, + "learning_rate": 5.613438057510757e-06, + "loss": 0.3686, + "step": 5766 + }, + { + "epoch": 0.46136, + "grad_norm": 1.6168537139892578, + "learning_rate": 5.61218989601129e-06, + "loss": 0.4234, + "step": 5767 + }, + { + "epoch": 0.46144, + "grad_norm": 1.732604742050171, + "learning_rate": 5.610941695780616e-06, + "loss": 0.3026, + "step": 5768 + }, + { + "epoch": 0.46152, + "grad_norm": 1.795530915260315, + "learning_rate": 5.6096934568977065e-06, + "loss": 0.4124, + "step": 5769 + }, + { + "epoch": 0.4616, + "grad_norm": 1.4683985710144043, + "learning_rate": 5.60844517944153e-06, + "loss": 0.3594, + "step": 5770 + }, + { + "epoch": 0.46168, + "grad_norm": 1.6536790132522583, + "learning_rate": 5.607196863491067e-06, + "loss": 0.3532, + "step": 5771 + }, + { + "epoch": 0.46176, + "grad_norm": 1.6116361618041992, + "learning_rate": 5.605948509125288e-06, + "loss": 0.3776, + "step": 5772 + }, + { + "epoch": 0.46184, + "grad_norm": 1.5489414930343628, + "learning_rate": 5.604700116423173e-06, + "loss": 0.3541, + "step": 5773 + }, + { + "epoch": 0.46192, + "grad_norm": 1.51923668384552, + "learning_rate": 5.603451685463706e-06, + "loss": 0.3029, + "step": 5774 + }, + { + "epoch": 0.462, + "grad_norm": 1.9836581945419312, + "learning_rate": 5.60220321632587e-06, + "loss": 0.3891, + "step": 5775 + }, + { + "epoch": 0.46208, + "grad_norm": 1.7087833881378174, + "learning_rate": 5.600954709088651e-06, + "loss": 0.4867, + "step": 5776 + }, + { + "epoch": 0.46216, + "grad_norm": 1.7288364171981812, + "learning_rate": 5.5997061638310405e-06, + "loss": 0.321, + "step": 5777 + }, + { + "epoch": 0.46224, + "grad_norm": 1.787791132926941, + "learning_rate": 5.598457580632025e-06, + "loss": 0.4304, + "step": 5778 + }, + { + "epoch": 0.46232, + "grad_norm": 1.1242058277130127, + "learning_rate": 5.597208959570602e-06, + "loss": 0.3531, + "step": 5779 + }, + { + "epoch": 0.4624, + "grad_norm": 1.8064368963241577, + "learning_rate": 5.595960300725765e-06, + "loss": 0.3596, + "step": 5780 + }, + { + "epoch": 0.46248, + "grad_norm": 2.188750982284546, + "learning_rate": 5.594711604176515e-06, + "loss": 0.3838, + "step": 5781 + }, + { + "epoch": 0.46256, + "grad_norm": 1.5916962623596191, + "learning_rate": 5.593462870001851e-06, + "loss": 0.3726, + "step": 5782 + }, + { + "epoch": 0.46264, + "grad_norm": 1.3215919733047485, + "learning_rate": 5.592214098280778e-06, + "loss": 0.3084, + "step": 5783 + }, + { + "epoch": 0.46272, + "grad_norm": 1.3940409421920776, + "learning_rate": 5.5909652890923004e-06, + "loss": 0.3357, + "step": 5784 + }, + { + "epoch": 0.4628, + "grad_norm": 1.4418216943740845, + "learning_rate": 5.589716442515426e-06, + "loss": 0.2619, + "step": 5785 + }, + { + "epoch": 0.46288, + "grad_norm": 1.2304273843765259, + "learning_rate": 5.588467558629167e-06, + "loss": 0.2796, + "step": 5786 + }, + { + "epoch": 0.46296, + "grad_norm": 1.5511798858642578, + "learning_rate": 5.587218637512532e-06, + "loss": 0.2769, + "step": 5787 + }, + { + "epoch": 0.46304, + "grad_norm": 1.0966295003890991, + "learning_rate": 5.58596967924454e-06, + "loss": 0.2145, + "step": 5788 + }, + { + "epoch": 0.46312, + "grad_norm": 1.4968533515930176, + "learning_rate": 5.5847206839042075e-06, + "loss": 0.3548, + "step": 5789 + }, + { + "epoch": 0.4632, + "grad_norm": 1.6240606307983398, + "learning_rate": 5.5834716515705535e-06, + "loss": 0.3444, + "step": 5790 + }, + { + "epoch": 0.46328, + "grad_norm": 1.2611035108566284, + "learning_rate": 5.5822225823226e-06, + "loss": 0.2815, + "step": 5791 + }, + { + "epoch": 0.46336, + "grad_norm": 1.897449016571045, + "learning_rate": 5.580973476239371e-06, + "loss": 0.3436, + "step": 5792 + }, + { + "epoch": 0.46344, + "grad_norm": 1.5465292930603027, + "learning_rate": 5.5797243333998955e-06, + "loss": 0.293, + "step": 5793 + }, + { + "epoch": 0.46352, + "grad_norm": 1.9730859994888306, + "learning_rate": 5.578475153883201e-06, + "loss": 0.3797, + "step": 5794 + }, + { + "epoch": 0.4636, + "grad_norm": 1.8337812423706055, + "learning_rate": 5.577225937768319e-06, + "loss": 0.331, + "step": 5795 + }, + { + "epoch": 0.46368, + "grad_norm": 1.6676514148712158, + "learning_rate": 5.575976685134282e-06, + "loss": 0.3331, + "step": 5796 + }, + { + "epoch": 0.46376, + "grad_norm": 1.2623851299285889, + "learning_rate": 5.57472739606013e-06, + "loss": 0.2177, + "step": 5797 + }, + { + "epoch": 0.46384, + "grad_norm": 1.3866863250732422, + "learning_rate": 5.573478070624897e-06, + "loss": 0.3358, + "step": 5798 + }, + { + "epoch": 0.46392, + "grad_norm": 1.5590800046920776, + "learning_rate": 5.572228708907626e-06, + "loss": 0.3113, + "step": 5799 + }, + { + "epoch": 0.464, + "grad_norm": 1.5001630783081055, + "learning_rate": 5.570979310987359e-06, + "loss": 0.3252, + "step": 5800 + }, + { + "epoch": 0.46408, + "grad_norm": 1.6427888870239258, + "learning_rate": 5.5697298769431404e-06, + "loss": 0.4096, + "step": 5801 + }, + { + "epoch": 0.46416, + "grad_norm": 1.886500358581543, + "learning_rate": 5.56848040685402e-06, + "loss": 0.3298, + "step": 5802 + }, + { + "epoch": 0.46424, + "grad_norm": 1.4350028038024902, + "learning_rate": 5.567230900799046e-06, + "loss": 0.3214, + "step": 5803 + }, + { + "epoch": 0.46432, + "grad_norm": 1.796977162361145, + "learning_rate": 5.565981358857271e-06, + "loss": 0.3826, + "step": 5804 + }, + { + "epoch": 0.4644, + "grad_norm": 1.937929391860962, + "learning_rate": 5.564731781107749e-06, + "loss": 0.5146, + "step": 5805 + }, + { + "epoch": 0.46448, + "grad_norm": 1.6140251159667969, + "learning_rate": 5.563482167629537e-06, + "loss": 0.3653, + "step": 5806 + }, + { + "epoch": 0.46456, + "grad_norm": 1.971123218536377, + "learning_rate": 5.562232518501694e-06, + "loss": 0.481, + "step": 5807 + }, + { + "epoch": 0.46464, + "grad_norm": 1.7507970333099365, + "learning_rate": 5.56098283380328e-06, + "loss": 0.3149, + "step": 5808 + }, + { + "epoch": 0.46472, + "grad_norm": 1.2038670778274536, + "learning_rate": 5.55973311361336e-06, + "loss": 0.2867, + "step": 5809 + }, + { + "epoch": 0.4648, + "grad_norm": 1.8903367519378662, + "learning_rate": 5.558483358010999e-06, + "loss": 0.3769, + "step": 5810 + }, + { + "epoch": 0.46488, + "grad_norm": 1.3014912605285645, + "learning_rate": 5.557233567075263e-06, + "loss": 0.3411, + "step": 5811 + }, + { + "epoch": 0.46496, + "grad_norm": 1.7084940671920776, + "learning_rate": 5.555983740885225e-06, + "loss": 0.3719, + "step": 5812 + }, + { + "epoch": 0.46504, + "grad_norm": 1.7625243663787842, + "learning_rate": 5.554733879519956e-06, + "loss": 0.4562, + "step": 5813 + }, + { + "epoch": 0.46512, + "grad_norm": 1.159744381904602, + "learning_rate": 5.553483983058531e-06, + "loss": 0.2261, + "step": 5814 + }, + { + "epoch": 0.4652, + "grad_norm": 1.9187002182006836, + "learning_rate": 5.552234051580024e-06, + "loss": 0.3823, + "step": 5815 + }, + { + "epoch": 0.46528, + "grad_norm": 1.6461594104766846, + "learning_rate": 5.5509840851635185e-06, + "loss": 0.3652, + "step": 5816 + }, + { + "epoch": 0.46536, + "grad_norm": 1.612166166305542, + "learning_rate": 5.549734083888093e-06, + "loss": 0.3041, + "step": 5817 + }, + { + "epoch": 0.46544, + "grad_norm": 1.890860915184021, + "learning_rate": 5.548484047832833e-06, + "loss": 0.5575, + "step": 5818 + }, + { + "epoch": 0.46552, + "grad_norm": 1.3313186168670654, + "learning_rate": 5.54723397707682e-06, + "loss": 0.2978, + "step": 5819 + }, + { + "epoch": 0.4656, + "grad_norm": 1.85710608959198, + "learning_rate": 5.5459838716991465e-06, + "loss": 0.3179, + "step": 5820 + }, + { + "epoch": 0.46568, + "grad_norm": 1.5691063404083252, + "learning_rate": 5.5447337317788986e-06, + "loss": 0.3356, + "step": 5821 + }, + { + "epoch": 0.46576, + "grad_norm": 1.6753313541412354, + "learning_rate": 5.5434835573951704e-06, + "loss": 0.2848, + "step": 5822 + }, + { + "epoch": 0.46584, + "grad_norm": 1.4183037281036377, + "learning_rate": 5.542233348627056e-06, + "loss": 0.3081, + "step": 5823 + }, + { + "epoch": 0.46592, + "grad_norm": 1.2803384065628052, + "learning_rate": 5.540983105553654e-06, + "loss": 0.3479, + "step": 5824 + }, + { + "epoch": 0.466, + "grad_norm": 1.3356515169143677, + "learning_rate": 5.539732828254059e-06, + "loss": 0.2828, + "step": 5825 + }, + { + "epoch": 0.46608, + "grad_norm": 1.4406170845031738, + "learning_rate": 5.538482516807374e-06, + "loss": 0.3388, + "step": 5826 + }, + { + "epoch": 0.46616, + "grad_norm": 1.526328682899475, + "learning_rate": 5.537232171292702e-06, + "loss": 0.312, + "step": 5827 + }, + { + "epoch": 0.46624, + "grad_norm": 1.2586652040481567, + "learning_rate": 5.535981791789148e-06, + "loss": 0.2932, + "step": 5828 + }, + { + "epoch": 0.46632, + "grad_norm": 1.7249115705490112, + "learning_rate": 5.534731378375819e-06, + "loss": 0.489, + "step": 5829 + }, + { + "epoch": 0.4664, + "grad_norm": 1.484268069267273, + "learning_rate": 5.533480931131827e-06, + "loss": 0.2965, + "step": 5830 + }, + { + "epoch": 0.46648, + "grad_norm": 1.7964249849319458, + "learning_rate": 5.53223045013628e-06, + "loss": 0.4045, + "step": 5831 + }, + { + "epoch": 0.46656, + "grad_norm": 1.7476415634155273, + "learning_rate": 5.530979935468294e-06, + "loss": 0.3451, + "step": 5832 + }, + { + "epoch": 0.46664, + "grad_norm": 1.3235749006271362, + "learning_rate": 5.529729387206983e-06, + "loss": 0.2304, + "step": 5833 + }, + { + "epoch": 0.46672, + "grad_norm": 1.5181303024291992, + "learning_rate": 5.5284788054314665e-06, + "loss": 0.314, + "step": 5834 + }, + { + "epoch": 0.4668, + "grad_norm": 1.5556458234786987, + "learning_rate": 5.527228190220866e-06, + "loss": 0.3762, + "step": 5835 + }, + { + "epoch": 0.46688, + "grad_norm": 1.3270909786224365, + "learning_rate": 5.525977541654299e-06, + "loss": 0.2586, + "step": 5836 + }, + { + "epoch": 0.46696, + "grad_norm": 1.6928744316101074, + "learning_rate": 5.524726859810895e-06, + "loss": 0.3365, + "step": 5837 + }, + { + "epoch": 0.46704, + "grad_norm": 1.3896623849868774, + "learning_rate": 5.523476144769777e-06, + "loss": 0.3052, + "step": 5838 + }, + { + "epoch": 0.46712, + "grad_norm": 1.6852587461471558, + "learning_rate": 5.522225396610076e-06, + "loss": 0.3974, + "step": 5839 + }, + { + "epoch": 0.4672, + "grad_norm": 1.4615007638931274, + "learning_rate": 5.520974615410921e-06, + "loss": 0.3015, + "step": 5840 + }, + { + "epoch": 0.46728, + "grad_norm": 1.2389006614685059, + "learning_rate": 5.519723801251445e-06, + "loss": 0.2909, + "step": 5841 + }, + { + "epoch": 0.46736, + "grad_norm": 1.441849946975708, + "learning_rate": 5.518472954210784e-06, + "loss": 0.3162, + "step": 5842 + }, + { + "epoch": 0.46744, + "grad_norm": 1.5538463592529297, + "learning_rate": 5.5172220743680745e-06, + "loss": 0.3241, + "step": 5843 + }, + { + "epoch": 0.46752, + "grad_norm": 1.311487078666687, + "learning_rate": 5.515971161802454e-06, + "loss": 0.2759, + "step": 5844 + }, + { + "epoch": 0.4676, + "grad_norm": 1.836043119430542, + "learning_rate": 5.514720216593063e-06, + "loss": 0.4221, + "step": 5845 + }, + { + "epoch": 0.46768, + "grad_norm": 1.3168381452560425, + "learning_rate": 5.513469238819048e-06, + "loss": 0.3044, + "step": 5846 + }, + { + "epoch": 0.46776, + "grad_norm": 1.2070047855377197, + "learning_rate": 5.5122182285595525e-06, + "loss": 0.229, + "step": 5847 + }, + { + "epoch": 0.46784, + "grad_norm": 1.5772783756256104, + "learning_rate": 5.510967185893723e-06, + "loss": 0.3556, + "step": 5848 + }, + { + "epoch": 0.46792, + "grad_norm": 1.5532734394073486, + "learning_rate": 5.509716110900709e-06, + "loss": 0.3562, + "step": 5849 + }, + { + "epoch": 0.468, + "grad_norm": 1.7133153676986694, + "learning_rate": 5.508465003659663e-06, + "loss": 0.3732, + "step": 5850 + }, + { + "epoch": 0.46808, + "grad_norm": 1.6906065940856934, + "learning_rate": 5.5072138642497365e-06, + "loss": 0.398, + "step": 5851 + }, + { + "epoch": 0.46816, + "grad_norm": 1.8008275032043457, + "learning_rate": 5.505962692750087e-06, + "loss": 0.3578, + "step": 5852 + }, + { + "epoch": 0.46824, + "grad_norm": 2.004988193511963, + "learning_rate": 5.504711489239871e-06, + "loss": 0.4561, + "step": 5853 + }, + { + "epoch": 0.46832, + "grad_norm": 1.8562557697296143, + "learning_rate": 5.5034602537982485e-06, + "loss": 0.5832, + "step": 5854 + }, + { + "epoch": 0.4684, + "grad_norm": 1.4720324277877808, + "learning_rate": 5.502208986504378e-06, + "loss": 0.2837, + "step": 5855 + }, + { + "epoch": 0.46848, + "grad_norm": 1.579901099205017, + "learning_rate": 5.500957687437427e-06, + "loss": 0.2924, + "step": 5856 + }, + { + "epoch": 0.46856, + "grad_norm": 1.6447190046310425, + "learning_rate": 5.499706356676559e-06, + "loss": 0.325, + "step": 5857 + }, + { + "epoch": 0.46864, + "grad_norm": 1.4578676223754883, + "learning_rate": 5.498454994300941e-06, + "loss": 0.341, + "step": 5858 + }, + { + "epoch": 0.46872, + "grad_norm": 1.5523934364318848, + "learning_rate": 5.497203600389744e-06, + "loss": 0.4472, + "step": 5859 + }, + { + "epoch": 0.4688, + "grad_norm": 1.6504297256469727, + "learning_rate": 5.495952175022139e-06, + "loss": 0.3244, + "step": 5860 + }, + { + "epoch": 0.46888, + "grad_norm": 1.4491195678710938, + "learning_rate": 5.4947007182772995e-06, + "loss": 0.358, + "step": 5861 + }, + { + "epoch": 0.46896, + "grad_norm": 1.6385431289672852, + "learning_rate": 5.4934492302344e-06, + "loss": 0.3625, + "step": 5862 + }, + { + "epoch": 0.46904, + "grad_norm": 1.3259345293045044, + "learning_rate": 5.492197710972618e-06, + "loss": 0.3199, + "step": 5863 + }, + { + "epoch": 0.46912, + "grad_norm": 1.4403702020645142, + "learning_rate": 5.4909461605711365e-06, + "loss": 0.2822, + "step": 5864 + }, + { + "epoch": 0.4692, + "grad_norm": 1.74454665184021, + "learning_rate": 5.489694579109133e-06, + "loss": 0.3237, + "step": 5865 + }, + { + "epoch": 0.46928, + "grad_norm": 1.565183162689209, + "learning_rate": 5.4884429666657925e-06, + "loss": 0.3899, + "step": 5866 + }, + { + "epoch": 0.46936, + "grad_norm": 1.3134307861328125, + "learning_rate": 5.487191323320298e-06, + "loss": 0.421, + "step": 5867 + }, + { + "epoch": 0.46944, + "grad_norm": 1.5535221099853516, + "learning_rate": 5.48593964915184e-06, + "loss": 0.403, + "step": 5868 + }, + { + "epoch": 0.46952, + "grad_norm": 1.4464749097824097, + "learning_rate": 5.484687944239605e-06, + "loss": 0.3067, + "step": 5869 + }, + { + "epoch": 0.4696, + "grad_norm": 1.3258053064346313, + "learning_rate": 5.483436208662787e-06, + "loss": 0.3042, + "step": 5870 + }, + { + "epoch": 0.46968, + "grad_norm": 1.434024453163147, + "learning_rate": 5.482184442500578e-06, + "loss": 0.3709, + "step": 5871 + }, + { + "epoch": 0.46976, + "grad_norm": 1.493878960609436, + "learning_rate": 5.480932645832171e-06, + "loss": 0.2702, + "step": 5872 + }, + { + "epoch": 0.46984, + "grad_norm": 1.5920265913009644, + "learning_rate": 5.479680818736765e-06, + "loss": 0.3356, + "step": 5873 + }, + { + "epoch": 0.46992, + "grad_norm": 1.51193106174469, + "learning_rate": 5.478428961293559e-06, + "loss": 0.3876, + "step": 5874 + }, + { + "epoch": 0.47, + "grad_norm": 1.8197968006134033, + "learning_rate": 5.477177073581754e-06, + "loss": 0.4746, + "step": 5875 + }, + { + "epoch": 0.47008, + "grad_norm": 1.4704205989837646, + "learning_rate": 5.475925155680552e-06, + "loss": 0.2828, + "step": 5876 + }, + { + "epoch": 0.47016, + "grad_norm": 1.455899953842163, + "learning_rate": 5.474673207669159e-06, + "loss": 0.2777, + "step": 5877 + }, + { + "epoch": 0.47024, + "grad_norm": 1.9199295043945312, + "learning_rate": 5.473421229626779e-06, + "loss": 0.4936, + "step": 5878 + }, + { + "epoch": 0.47032, + "grad_norm": 1.642419457435608, + "learning_rate": 5.472169221632622e-06, + "loss": 0.4494, + "step": 5879 + }, + { + "epoch": 0.4704, + "grad_norm": 1.428835391998291, + "learning_rate": 5.470917183765898e-06, + "loss": 0.3226, + "step": 5880 + }, + { + "epoch": 0.47048, + "grad_norm": 1.3991670608520508, + "learning_rate": 5.4696651161058186e-06, + "loss": 0.2963, + "step": 5881 + }, + { + "epoch": 0.47056, + "grad_norm": 1.5357342958450317, + "learning_rate": 5.468413018731601e-06, + "loss": 0.3023, + "step": 5882 + }, + { + "epoch": 0.47064, + "grad_norm": 1.5561589002609253, + "learning_rate": 5.467160891722459e-06, + "loss": 0.3432, + "step": 5883 + }, + { + "epoch": 0.47072, + "grad_norm": 1.3665988445281982, + "learning_rate": 5.465908735157608e-06, + "loss": 0.333, + "step": 5884 + }, + { + "epoch": 0.4708, + "grad_norm": 1.4518331289291382, + "learning_rate": 5.4646565491162716e-06, + "loss": 0.3156, + "step": 5885 + }, + { + "epoch": 0.47088, + "grad_norm": 1.7487683296203613, + "learning_rate": 5.46340433367767e-06, + "loss": 0.3997, + "step": 5886 + }, + { + "epoch": 0.47096, + "grad_norm": 1.7349579334259033, + "learning_rate": 5.462152088921028e-06, + "loss": 0.3941, + "step": 5887 + }, + { + "epoch": 0.47104, + "grad_norm": 1.3524267673492432, + "learning_rate": 5.460899814925567e-06, + "loss": 0.3129, + "step": 5888 + }, + { + "epoch": 0.47112, + "grad_norm": 1.5749634504318237, + "learning_rate": 5.459647511770521e-06, + "loss": 0.4254, + "step": 5889 + }, + { + "epoch": 0.4712, + "grad_norm": 1.641700267791748, + "learning_rate": 5.458395179535112e-06, + "loss": 0.3265, + "step": 5890 + }, + { + "epoch": 0.47128, + "grad_norm": 1.3452178239822388, + "learning_rate": 5.457142818298573e-06, + "loss": 0.3459, + "step": 5891 + }, + { + "epoch": 0.47136, + "grad_norm": 1.5965007543563843, + "learning_rate": 5.455890428140139e-06, + "loss": 0.3374, + "step": 5892 + }, + { + "epoch": 0.47144, + "grad_norm": 1.6195091009140015, + "learning_rate": 5.4546380091390425e-06, + "loss": 0.4122, + "step": 5893 + }, + { + "epoch": 0.47152, + "grad_norm": 2.059089422225952, + "learning_rate": 5.453385561374521e-06, + "loss": 0.5191, + "step": 5894 + }, + { + "epoch": 0.4716, + "grad_norm": 1.6009624004364014, + "learning_rate": 5.452133084925812e-06, + "loss": 0.3699, + "step": 5895 + }, + { + "epoch": 0.47168, + "grad_norm": 1.5404019355773926, + "learning_rate": 5.450880579872156e-06, + "loss": 0.347, + "step": 5896 + }, + { + "epoch": 0.47176, + "grad_norm": 2.4010934829711914, + "learning_rate": 5.449628046292792e-06, + "loss": 0.4262, + "step": 5897 + }, + { + "epoch": 0.47184, + "grad_norm": 1.5734963417053223, + "learning_rate": 5.448375484266968e-06, + "loss": 0.4524, + "step": 5898 + }, + { + "epoch": 0.47192, + "grad_norm": 1.7515249252319336, + "learning_rate": 5.447122893873927e-06, + "loss": 0.4706, + "step": 5899 + }, + { + "epoch": 0.472, + "grad_norm": 1.6091927289962769, + "learning_rate": 5.445870275192918e-06, + "loss": 0.3039, + "step": 5900 + }, + { + "epoch": 0.47208, + "grad_norm": 1.3763673305511475, + "learning_rate": 5.444617628303187e-06, + "loss": 0.2805, + "step": 5901 + }, + { + "epoch": 0.47216, + "grad_norm": 1.7855626344680786, + "learning_rate": 5.443364953283986e-06, + "loss": 0.3412, + "step": 5902 + }, + { + "epoch": 0.47224, + "grad_norm": 1.6489816904067993, + "learning_rate": 5.442112250214569e-06, + "loss": 0.3524, + "step": 5903 + }, + { + "epoch": 0.47232, + "grad_norm": 1.6601721048355103, + "learning_rate": 5.440859519174187e-06, + "loss": 0.3718, + "step": 5904 + }, + { + "epoch": 0.4724, + "grad_norm": 1.6896872520446777, + "learning_rate": 5.4396067602421e-06, + "loss": 0.3397, + "step": 5905 + }, + { + "epoch": 0.47248, + "grad_norm": 1.254325270652771, + "learning_rate": 5.438353973497565e-06, + "loss": 0.2802, + "step": 5906 + }, + { + "epoch": 0.47256, + "grad_norm": 2.1470906734466553, + "learning_rate": 5.43710115901984e-06, + "loss": 0.5072, + "step": 5907 + }, + { + "epoch": 0.47264, + "grad_norm": 1.403336763381958, + "learning_rate": 5.435848316888187e-06, + "loss": 0.2941, + "step": 5908 + }, + { + "epoch": 0.47272, + "grad_norm": 1.7086148262023926, + "learning_rate": 5.4345954471818695e-06, + "loss": 0.3538, + "step": 5909 + }, + { + "epoch": 0.4728, + "grad_norm": 1.3657201528549194, + "learning_rate": 5.433342549980153e-06, + "loss": 0.2622, + "step": 5910 + }, + { + "epoch": 0.47288, + "grad_norm": 1.6097228527069092, + "learning_rate": 5.432089625362302e-06, + "loss": 0.4418, + "step": 5911 + }, + { + "epoch": 0.47296, + "grad_norm": 1.664275050163269, + "learning_rate": 5.430836673407588e-06, + "loss": 0.3672, + "step": 5912 + }, + { + "epoch": 0.47304, + "grad_norm": 1.4426720142364502, + "learning_rate": 5.4295836941952775e-06, + "loss": 0.33, + "step": 5913 + }, + { + "epoch": 0.47312, + "grad_norm": 1.6172068119049072, + "learning_rate": 5.428330687804643e-06, + "loss": 0.4129, + "step": 5914 + }, + { + "epoch": 0.4732, + "grad_norm": 1.322485327720642, + "learning_rate": 5.427077654314961e-06, + "loss": 0.3158, + "step": 5915 + }, + { + "epoch": 0.47328, + "grad_norm": 1.6107004880905151, + "learning_rate": 5.425824593805505e-06, + "loss": 0.3416, + "step": 5916 + }, + { + "epoch": 0.47336, + "grad_norm": 2.201343297958374, + "learning_rate": 5.424571506355552e-06, + "loss": 0.3547, + "step": 5917 + }, + { + "epoch": 0.47344, + "grad_norm": 1.490373969078064, + "learning_rate": 5.42331839204438e-06, + "loss": 0.3098, + "step": 5918 + }, + { + "epoch": 0.47352, + "grad_norm": 1.3307541608810425, + "learning_rate": 5.422065250951268e-06, + "loss": 0.3176, + "step": 5919 + }, + { + "epoch": 0.4736, + "grad_norm": 1.459407925605774, + "learning_rate": 5.420812083155502e-06, + "loss": 0.3212, + "step": 5920 + }, + { + "epoch": 0.47368, + "grad_norm": 1.4235657453536987, + "learning_rate": 5.4195588887363635e-06, + "loss": 0.3285, + "step": 5921 + }, + { + "epoch": 0.47376, + "grad_norm": 1.2670882940292358, + "learning_rate": 5.4183056677731376e-06, + "loss": 0.3147, + "step": 5922 + }, + { + "epoch": 0.47384, + "grad_norm": 1.553990125656128, + "learning_rate": 5.41705242034511e-06, + "loss": 0.4067, + "step": 5923 + }, + { + "epoch": 0.47392, + "grad_norm": 1.3905086517333984, + "learning_rate": 5.415799146531574e-06, + "loss": 0.3246, + "step": 5924 + }, + { + "epoch": 0.474, + "grad_norm": 1.6242258548736572, + "learning_rate": 5.414545846411815e-06, + "loss": 0.3597, + "step": 5925 + }, + { + "epoch": 0.47408, + "grad_norm": 2.0094828605651855, + "learning_rate": 5.413292520065129e-06, + "loss": 0.3506, + "step": 5926 + }, + { + "epoch": 0.47416, + "grad_norm": 1.0113637447357178, + "learning_rate": 5.4120391675708065e-06, + "loss": 0.2126, + "step": 5927 + }, + { + "epoch": 0.47424, + "grad_norm": 1.8587594032287598, + "learning_rate": 5.410785789008145e-06, + "loss": 0.4143, + "step": 5928 + }, + { + "epoch": 0.47432, + "grad_norm": 1.5328631401062012, + "learning_rate": 5.4095323844564425e-06, + "loss": 0.3364, + "step": 5929 + }, + { + "epoch": 0.4744, + "grad_norm": 1.8271329402923584, + "learning_rate": 5.408278953994996e-06, + "loss": 0.3848, + "step": 5930 + }, + { + "epoch": 0.47448, + "grad_norm": 1.4636245965957642, + "learning_rate": 5.407025497703105e-06, + "loss": 0.3291, + "step": 5931 + }, + { + "epoch": 0.47456, + "grad_norm": 1.4129557609558105, + "learning_rate": 5.405772015660072e-06, + "loss": 0.3842, + "step": 5932 + }, + { + "epoch": 0.47464, + "grad_norm": 1.7334128618240356, + "learning_rate": 5.404518507945205e-06, + "loss": 0.4864, + "step": 5933 + }, + { + "epoch": 0.47472, + "grad_norm": 1.5284255743026733, + "learning_rate": 5.403264974637802e-06, + "loss": 0.3454, + "step": 5934 + }, + { + "epoch": 0.4748, + "grad_norm": 1.4403399229049683, + "learning_rate": 5.402011415817176e-06, + "loss": 0.29, + "step": 5935 + }, + { + "epoch": 0.47488, + "grad_norm": 1.4352822303771973, + "learning_rate": 5.400757831562631e-06, + "loss": 0.2549, + "step": 5936 + }, + { + "epoch": 0.47496, + "grad_norm": 1.5623631477355957, + "learning_rate": 5.399504221953478e-06, + "loss": 0.2864, + "step": 5937 + }, + { + "epoch": 0.47504, + "grad_norm": 1.2767705917358398, + "learning_rate": 5.3982505870690316e-06, + "loss": 0.289, + "step": 5938 + }, + { + "epoch": 0.47512, + "grad_norm": 1.3673750162124634, + "learning_rate": 5.396996926988601e-06, + "loss": 0.2931, + "step": 5939 + }, + { + "epoch": 0.4752, + "grad_norm": 1.759554386138916, + "learning_rate": 5.395743241791504e-06, + "loss": 0.3399, + "step": 5940 + }, + { + "epoch": 0.47528, + "grad_norm": 1.6224939823150635, + "learning_rate": 5.394489531557059e-06, + "loss": 0.3372, + "step": 5941 + }, + { + "epoch": 0.47536, + "grad_norm": 1.5486100912094116, + "learning_rate": 5.393235796364578e-06, + "loss": 0.2582, + "step": 5942 + }, + { + "epoch": 0.47544, + "grad_norm": 0.9767140746116638, + "learning_rate": 5.391982036293385e-06, + "loss": 0.1933, + "step": 5943 + }, + { + "epoch": 0.47552, + "grad_norm": 1.7568838596343994, + "learning_rate": 5.390728251422801e-06, + "loss": 0.3333, + "step": 5944 + }, + { + "epoch": 0.4756, + "grad_norm": 1.5353167057037354, + "learning_rate": 5.389474441832148e-06, + "loss": 0.3076, + "step": 5945 + }, + { + "epoch": 0.47568, + "grad_norm": 1.9737414121627808, + "learning_rate": 5.388220607600748e-06, + "loss": 0.434, + "step": 5946 + }, + { + "epoch": 0.47576, + "grad_norm": 1.3293046951293945, + "learning_rate": 5.386966748807932e-06, + "loss": 0.2834, + "step": 5947 + }, + { + "epoch": 0.47584, + "grad_norm": 1.4846619367599487, + "learning_rate": 5.385712865533023e-06, + "loss": 0.3274, + "step": 5948 + }, + { + "epoch": 0.47592, + "grad_norm": 1.7403323650360107, + "learning_rate": 5.384458957855351e-06, + "loss": 0.3113, + "step": 5949 + }, + { + "epoch": 0.476, + "grad_norm": 1.7294260263442993, + "learning_rate": 5.383205025854248e-06, + "loss": 0.3651, + "step": 5950 + }, + { + "epoch": 0.47608, + "grad_norm": 1.7438633441925049, + "learning_rate": 5.381951069609045e-06, + "loss": 0.3918, + "step": 5951 + }, + { + "epoch": 0.47616, + "grad_norm": 1.3631834983825684, + "learning_rate": 5.380697089199075e-06, + "loss": 0.2842, + "step": 5952 + }, + { + "epoch": 0.47624, + "grad_norm": 1.3905704021453857, + "learning_rate": 5.379443084703676e-06, + "loss": 0.2753, + "step": 5953 + }, + { + "epoch": 0.47632, + "grad_norm": 1.5892817974090576, + "learning_rate": 5.378189056202181e-06, + "loss": 0.3348, + "step": 5954 + }, + { + "epoch": 0.4764, + "grad_norm": 1.856491208076477, + "learning_rate": 5.376935003773931e-06, + "loss": 0.3874, + "step": 5955 + }, + { + "epoch": 0.47648, + "grad_norm": 1.2700897455215454, + "learning_rate": 5.375680927498265e-06, + "loss": 0.309, + "step": 5956 + }, + { + "epoch": 0.47656, + "grad_norm": 1.3322601318359375, + "learning_rate": 5.374426827454522e-06, + "loss": 0.2654, + "step": 5957 + }, + { + "epoch": 0.47664, + "grad_norm": 2.0289957523345947, + "learning_rate": 5.373172703722046e-06, + "loss": 0.5895, + "step": 5958 + }, + { + "epoch": 0.47672, + "grad_norm": 1.688299298286438, + "learning_rate": 5.371918556380185e-06, + "loss": 0.4384, + "step": 5959 + }, + { + "epoch": 0.4768, + "grad_norm": 1.5574098825454712, + "learning_rate": 5.370664385508278e-06, + "loss": 0.3618, + "step": 5960 + }, + { + "epoch": 0.47688, + "grad_norm": 1.470866322517395, + "learning_rate": 5.369410191185676e-06, + "loss": 0.3197, + "step": 5961 + }, + { + "epoch": 0.47696, + "grad_norm": 1.5503671169281006, + "learning_rate": 5.368155973491729e-06, + "loss": 0.4128, + "step": 5962 + }, + { + "epoch": 0.47704, + "grad_norm": 1.393202304840088, + "learning_rate": 5.366901732505784e-06, + "loss": 0.2566, + "step": 5963 + }, + { + "epoch": 0.47712, + "grad_norm": 1.478087306022644, + "learning_rate": 5.365647468307193e-06, + "loss": 0.3087, + "step": 5964 + }, + { + "epoch": 0.4772, + "grad_norm": 1.4363526105880737, + "learning_rate": 5.364393180975314e-06, + "loss": 0.2757, + "step": 5965 + }, + { + "epoch": 0.47728, + "grad_norm": 1.8411908149719238, + "learning_rate": 5.363138870589495e-06, + "loss": 0.4493, + "step": 5966 + }, + { + "epoch": 0.47736, + "grad_norm": 1.8688772916793823, + "learning_rate": 5.361884537229095e-06, + "loss": 0.3322, + "step": 5967 + }, + { + "epoch": 0.47744, + "grad_norm": 1.6204429864883423, + "learning_rate": 5.360630180973472e-06, + "loss": 0.321, + "step": 5968 + }, + { + "epoch": 0.47752, + "grad_norm": 1.8783400058746338, + "learning_rate": 5.359375801901982e-06, + "loss": 0.3558, + "step": 5969 + }, + { + "epoch": 0.4776, + "grad_norm": 1.60300874710083, + "learning_rate": 5.358121400093989e-06, + "loss": 0.3528, + "step": 5970 + }, + { + "epoch": 0.47768, + "grad_norm": 1.5977694988250732, + "learning_rate": 5.356866975628854e-06, + "loss": 0.3363, + "step": 5971 + }, + { + "epoch": 0.47776, + "grad_norm": 1.698292851448059, + "learning_rate": 5.355612528585938e-06, + "loss": 0.391, + "step": 5972 + }, + { + "epoch": 0.47784, + "grad_norm": 1.7773863077163696, + "learning_rate": 5.354358059044608e-06, + "loss": 0.3306, + "step": 5973 + }, + { + "epoch": 0.47792, + "grad_norm": 1.5922447443008423, + "learning_rate": 5.353103567084229e-06, + "loss": 0.3131, + "step": 5974 + }, + { + "epoch": 0.478, + "grad_norm": 1.8159767389297485, + "learning_rate": 5.3518490527841685e-06, + "loss": 0.3942, + "step": 5975 + }, + { + "epoch": 0.47808, + "grad_norm": 1.657160997390747, + "learning_rate": 5.350594516223797e-06, + "loss": 0.4472, + "step": 5976 + }, + { + "epoch": 0.47816, + "grad_norm": 2.0365302562713623, + "learning_rate": 5.3493399574824824e-06, + "loss": 0.3914, + "step": 5977 + }, + { + "epoch": 0.47824, + "grad_norm": 1.715842843055725, + "learning_rate": 5.348085376639598e-06, + "loss": 0.3958, + "step": 5978 + }, + { + "epoch": 0.47832, + "grad_norm": 1.8368576765060425, + "learning_rate": 5.3468307737745175e-06, + "loss": 0.3646, + "step": 5979 + }, + { + "epoch": 0.4784, + "grad_norm": 1.8690496683120728, + "learning_rate": 5.345576148966612e-06, + "loss": 0.5069, + "step": 5980 + }, + { + "epoch": 0.47848, + "grad_norm": 1.9646300077438354, + "learning_rate": 5.344321502295262e-06, + "loss": 0.4976, + "step": 5981 + }, + { + "epoch": 0.47856, + "grad_norm": 1.7964569330215454, + "learning_rate": 5.3430668338398425e-06, + "loss": 0.3761, + "step": 5982 + }, + { + "epoch": 0.47864, + "grad_norm": 1.9966002702713013, + "learning_rate": 5.341812143679732e-06, + "loss": 0.4334, + "step": 5983 + }, + { + "epoch": 0.47872, + "grad_norm": 2.4749326705932617, + "learning_rate": 5.34055743189431e-06, + "loss": 0.5564, + "step": 5984 + }, + { + "epoch": 0.4788, + "grad_norm": 1.9291296005249023, + "learning_rate": 5.339302698562959e-06, + "loss": 0.3668, + "step": 5985 + }, + { + "epoch": 0.47888, + "grad_norm": 1.5976835489273071, + "learning_rate": 5.3380479437650625e-06, + "loss": 0.3629, + "step": 5986 + }, + { + "epoch": 0.47896, + "grad_norm": 1.5634150505065918, + "learning_rate": 5.336793167580002e-06, + "loss": 0.3366, + "step": 5987 + }, + { + "epoch": 0.47904, + "grad_norm": 1.9057620763778687, + "learning_rate": 5.3355383700871665e-06, + "loss": 0.3585, + "step": 5988 + }, + { + "epoch": 0.47912, + "grad_norm": 1.6130526065826416, + "learning_rate": 5.334283551365941e-06, + "loss": 0.4087, + "step": 5989 + }, + { + "epoch": 0.4792, + "grad_norm": 2.0664384365081787, + "learning_rate": 5.333028711495713e-06, + "loss": 0.5229, + "step": 5990 + }, + { + "epoch": 0.47928, + "grad_norm": 1.7554843425750732, + "learning_rate": 5.331773850555874e-06, + "loss": 0.3727, + "step": 5991 + }, + { + "epoch": 0.47936, + "grad_norm": 1.2213190793991089, + "learning_rate": 5.330518968625812e-06, + "loss": 0.2522, + "step": 5992 + }, + { + "epoch": 0.47944, + "grad_norm": 1.3879023790359497, + "learning_rate": 5.32926406578492e-06, + "loss": 0.3652, + "step": 5993 + }, + { + "epoch": 0.47952, + "grad_norm": 1.7158325910568237, + "learning_rate": 5.328009142112594e-06, + "loss": 0.3132, + "step": 5994 + }, + { + "epoch": 0.4796, + "grad_norm": 1.8352590799331665, + "learning_rate": 5.326754197688227e-06, + "loss": 0.3482, + "step": 5995 + }, + { + "epoch": 0.47968, + "grad_norm": 1.52081298828125, + "learning_rate": 5.3254992325912144e-06, + "loss": 0.309, + "step": 5996 + }, + { + "epoch": 0.47976, + "grad_norm": 1.6611655950546265, + "learning_rate": 5.324244246900955e-06, + "loss": 0.4123, + "step": 5997 + }, + { + "epoch": 0.47984, + "grad_norm": 1.4265227317810059, + "learning_rate": 5.322989240696846e-06, + "loss": 0.2749, + "step": 5998 + }, + { + "epoch": 0.47992, + "grad_norm": 1.5053348541259766, + "learning_rate": 5.3217342140582895e-06, + "loss": 0.268, + "step": 5999 + }, + { + "epoch": 0.48, + "grad_norm": 1.3345094919204712, + "learning_rate": 5.3204791670646875e-06, + "loss": 0.2847, + "step": 6000 + }, + { + "epoch": 0.48008, + "grad_norm": 1.3959380388259888, + "learning_rate": 5.319224099795438e-06, + "loss": 0.321, + "step": 6001 + }, + { + "epoch": 0.48016, + "grad_norm": 1.5935245752334595, + "learning_rate": 5.31796901232995e-06, + "loss": 0.3813, + "step": 6002 + }, + { + "epoch": 0.48024, + "grad_norm": 1.4641132354736328, + "learning_rate": 5.316713904747626e-06, + "loss": 0.3184, + "step": 6003 + }, + { + "epoch": 0.48032, + "grad_norm": 1.3873530626296997, + "learning_rate": 5.315458777127872e-06, + "loss": 0.2754, + "step": 6004 + }, + { + "epoch": 0.4804, + "grad_norm": 1.330491065979004, + "learning_rate": 5.3142036295500965e-06, + "loss": 0.2641, + "step": 6005 + }, + { + "epoch": 0.48048, + "grad_norm": 1.4904730319976807, + "learning_rate": 5.31294846209371e-06, + "loss": 0.3074, + "step": 6006 + }, + { + "epoch": 0.48056, + "grad_norm": 1.3027527332305908, + "learning_rate": 5.311693274838121e-06, + "loss": 0.3014, + "step": 6007 + }, + { + "epoch": 0.48064, + "grad_norm": 1.5734492540359497, + "learning_rate": 5.310438067862741e-06, + "loss": 0.2831, + "step": 6008 + }, + { + "epoch": 0.48072, + "grad_norm": 2.1448333263397217, + "learning_rate": 5.309182841246984e-06, + "loss": 0.4077, + "step": 6009 + }, + { + "epoch": 0.4808, + "grad_norm": 1.4222567081451416, + "learning_rate": 5.307927595070261e-06, + "loss": 0.3582, + "step": 6010 + }, + { + "epoch": 0.48088, + "grad_norm": 1.7736330032348633, + "learning_rate": 5.306672329411993e-06, + "loss": 0.4902, + "step": 6011 + }, + { + "epoch": 0.48096, + "grad_norm": 1.4863662719726562, + "learning_rate": 5.3054170443515895e-06, + "loss": 0.3672, + "step": 6012 + }, + { + "epoch": 0.48104, + "grad_norm": 1.2911968231201172, + "learning_rate": 5.304161739968474e-06, + "loss": 0.3233, + "step": 6013 + }, + { + "epoch": 0.48112, + "grad_norm": 1.451702356338501, + "learning_rate": 5.302906416342062e-06, + "loss": 0.2513, + "step": 6014 + }, + { + "epoch": 0.4812, + "grad_norm": 1.4997286796569824, + "learning_rate": 5.301651073551774e-06, + "loss": 0.2994, + "step": 6015 + }, + { + "epoch": 0.48128, + "grad_norm": 1.796286702156067, + "learning_rate": 5.300395711677032e-06, + "loss": 0.3123, + "step": 6016 + }, + { + "epoch": 0.48136, + "grad_norm": 1.705010175704956, + "learning_rate": 5.299140330797258e-06, + "loss": 0.4163, + "step": 6017 + }, + { + "epoch": 0.48144, + "grad_norm": 1.3299634456634521, + "learning_rate": 5.297884930991878e-06, + "loss": 0.2821, + "step": 6018 + }, + { + "epoch": 0.48152, + "grad_norm": 1.7674593925476074, + "learning_rate": 5.296629512340313e-06, + "loss": 0.3192, + "step": 6019 + }, + { + "epoch": 0.4816, + "grad_norm": 1.500632405281067, + "learning_rate": 5.295374074921993e-06, + "loss": 0.4209, + "step": 6020 + }, + { + "epoch": 0.48168, + "grad_norm": 1.5047935247421265, + "learning_rate": 5.294118618816342e-06, + "loss": 0.344, + "step": 6021 + }, + { + "epoch": 0.48176, + "grad_norm": 1.670544147491455, + "learning_rate": 5.292863144102791e-06, + "loss": 0.3467, + "step": 6022 + }, + { + "epoch": 0.48184, + "grad_norm": 1.3187230825424194, + "learning_rate": 5.291607650860769e-06, + "loss": 0.3187, + "step": 6023 + }, + { + "epoch": 0.48192, + "grad_norm": 1.355542778968811, + "learning_rate": 5.290352139169708e-06, + "loss": 0.2727, + "step": 6024 + }, + { + "epoch": 0.482, + "grad_norm": 1.7600772380828857, + "learning_rate": 5.289096609109037e-06, + "loss": 0.5256, + "step": 6025 + }, + { + "epoch": 0.48208, + "grad_norm": 1.4076069593429565, + "learning_rate": 5.287841060758191e-06, + "loss": 0.2691, + "step": 6026 + }, + { + "epoch": 0.48216, + "grad_norm": 1.5772290229797363, + "learning_rate": 5.2865854941966036e-06, + "loss": 0.3356, + "step": 6027 + }, + { + "epoch": 0.48224, + "grad_norm": 1.2514276504516602, + "learning_rate": 5.285329909503711e-06, + "loss": 0.2623, + "step": 6028 + }, + { + "epoch": 0.48232, + "grad_norm": 1.582513689994812, + "learning_rate": 5.284074306758951e-06, + "loss": 0.3788, + "step": 6029 + }, + { + "epoch": 0.4824, + "grad_norm": 1.5977832078933716, + "learning_rate": 5.2828186860417594e-06, + "loss": 0.3679, + "step": 6030 + }, + { + "epoch": 0.48248, + "grad_norm": 2.7796149253845215, + "learning_rate": 5.281563047431576e-06, + "loss": 3.9853, + "step": 6031 + }, + { + "epoch": 0.48256, + "grad_norm": 1.570940613746643, + "learning_rate": 5.2803073910078405e-06, + "loss": 0.3789, + "step": 6032 + }, + { + "epoch": 0.48264, + "grad_norm": 1.6044830083847046, + "learning_rate": 5.279051716849993e-06, + "loss": 0.3946, + "step": 6033 + }, + { + "epoch": 0.48272, + "grad_norm": 1.9162310361862183, + "learning_rate": 5.27779602503748e-06, + "loss": 0.3879, + "step": 6034 + }, + { + "epoch": 0.4828, + "grad_norm": 1.536901831626892, + "learning_rate": 5.27654031564974e-06, + "loss": 0.3266, + "step": 6035 + }, + { + "epoch": 0.48288, + "grad_norm": 1.6972392797470093, + "learning_rate": 5.275284588766221e-06, + "loss": 0.3758, + "step": 6036 + }, + { + "epoch": 0.48296, + "grad_norm": 1.5778000354766846, + "learning_rate": 5.274028844466366e-06, + "loss": 0.3237, + "step": 6037 + }, + { + "epoch": 0.48304, + "grad_norm": 1.4701905250549316, + "learning_rate": 5.272773082829623e-06, + "loss": 0.3285, + "step": 6038 + }, + { + "epoch": 0.48312, + "grad_norm": 1.5418689250946045, + "learning_rate": 5.2715173039354395e-06, + "loss": 0.3325, + "step": 6039 + }, + { + "epoch": 0.4832, + "grad_norm": 1.3240424394607544, + "learning_rate": 5.270261507863265e-06, + "loss": 0.2852, + "step": 6040 + }, + { + "epoch": 0.48328, + "grad_norm": 1.316301941871643, + "learning_rate": 5.26900569469255e-06, + "loss": 0.2481, + "step": 6041 + }, + { + "epoch": 0.48336, + "grad_norm": 1.2276591062545776, + "learning_rate": 5.267749864502744e-06, + "loss": 0.3035, + "step": 6042 + }, + { + "epoch": 0.48344, + "grad_norm": 1.5969600677490234, + "learning_rate": 5.266494017373299e-06, + "loss": 0.3646, + "step": 6043 + }, + { + "epoch": 0.48352, + "grad_norm": 1.2858132123947144, + "learning_rate": 5.265238153383669e-06, + "loss": 0.2527, + "step": 6044 + }, + { + "epoch": 0.4836, + "grad_norm": 1.9083600044250488, + "learning_rate": 5.26398227261331e-06, + "loss": 0.3858, + "step": 6045 + }, + { + "epoch": 0.48368, + "grad_norm": 2.1194543838500977, + "learning_rate": 5.2627263751416765e-06, + "loss": 0.4669, + "step": 6046 + }, + { + "epoch": 0.48376, + "grad_norm": 1.6560252904891968, + "learning_rate": 5.261470461048225e-06, + "loss": 0.3027, + "step": 6047 + }, + { + "epoch": 0.48384, + "grad_norm": 1.7517454624176025, + "learning_rate": 5.260214530412409e-06, + "loss": 0.4387, + "step": 6048 + }, + { + "epoch": 0.48392, + "grad_norm": 1.775425910949707, + "learning_rate": 5.258958583313692e-06, + "loss": 0.2734, + "step": 6049 + }, + { + "epoch": 0.484, + "grad_norm": 1.6235383749008179, + "learning_rate": 5.257702619831531e-06, + "loss": 0.2836, + "step": 6050 + }, + { + "epoch": 0.48408, + "grad_norm": 1.6095234155654907, + "learning_rate": 5.25644664004539e-06, + "loss": 0.3566, + "step": 6051 + }, + { + "epoch": 0.48416, + "grad_norm": 2.052558183670044, + "learning_rate": 5.2551906440347254e-06, + "loss": 0.5749, + "step": 6052 + }, + { + "epoch": 0.48424, + "grad_norm": 1.8320833444595337, + "learning_rate": 5.253934631879005e-06, + "loss": 0.3727, + "step": 6053 + }, + { + "epoch": 0.48432, + "grad_norm": 1.698598027229309, + "learning_rate": 5.252678603657689e-06, + "loss": 0.3987, + "step": 6054 + }, + { + "epoch": 0.4844, + "grad_norm": 2.1110999584198, + "learning_rate": 5.251422559450243e-06, + "loss": 0.4145, + "step": 6055 + }, + { + "epoch": 0.48448, + "grad_norm": 1.7855218648910522, + "learning_rate": 5.250166499336132e-06, + "loss": 0.3626, + "step": 6056 + }, + { + "epoch": 0.48456, + "grad_norm": 1.150992512702942, + "learning_rate": 5.248910423394827e-06, + "loss": 0.2392, + "step": 6057 + }, + { + "epoch": 0.48464, + "grad_norm": 1.548820972442627, + "learning_rate": 5.2476543317057896e-06, + "loss": 0.2712, + "step": 6058 + }, + { + "epoch": 0.48472, + "grad_norm": 1.7660967111587524, + "learning_rate": 5.246398224348492e-06, + "loss": 0.4438, + "step": 6059 + }, + { + "epoch": 0.4848, + "grad_norm": 1.4774236679077148, + "learning_rate": 5.245142101402403e-06, + "loss": 0.3322, + "step": 6060 + }, + { + "epoch": 0.48488, + "grad_norm": 1.747956395149231, + "learning_rate": 5.2438859629469926e-06, + "loss": 0.3714, + "step": 6061 + }, + { + "epoch": 0.48496, + "grad_norm": 1.422593116760254, + "learning_rate": 5.242629809061735e-06, + "loss": 0.3844, + "step": 6062 + }, + { + "epoch": 0.48504, + "grad_norm": 1.4882540702819824, + "learning_rate": 5.2413736398260995e-06, + "loss": 0.3285, + "step": 6063 + }, + { + "epoch": 0.48512, + "grad_norm": 1.318860650062561, + "learning_rate": 5.240117455319562e-06, + "loss": 0.2693, + "step": 6064 + }, + { + "epoch": 0.4852, + "grad_norm": 1.7626543045043945, + "learning_rate": 5.238861255621598e-06, + "loss": 0.3453, + "step": 6065 + }, + { + "epoch": 0.48528, + "grad_norm": 1.5399316549301147, + "learning_rate": 5.2376050408116805e-06, + "loss": 0.2871, + "step": 6066 + }, + { + "epoch": 0.48536, + "grad_norm": 1.6021755933761597, + "learning_rate": 5.236348810969287e-06, + "loss": 0.2933, + "step": 6067 + }, + { + "epoch": 0.48544, + "grad_norm": 1.620920181274414, + "learning_rate": 5.235092566173896e-06, + "loss": 0.3498, + "step": 6068 + }, + { + "epoch": 0.48552, + "grad_norm": 1.4818662405014038, + "learning_rate": 5.233836306504983e-06, + "loss": 0.2698, + "step": 6069 + }, + { + "epoch": 0.4856, + "grad_norm": 1.618439793586731, + "learning_rate": 5.232580032042032e-06, + "loss": 0.4304, + "step": 6070 + }, + { + "epoch": 0.48568, + "grad_norm": 1.531455636024475, + "learning_rate": 5.231323742864519e-06, + "loss": 0.3385, + "step": 6071 + }, + { + "epoch": 0.48576, + "grad_norm": 1.6968480348587036, + "learning_rate": 5.230067439051927e-06, + "loss": 0.33, + "step": 6072 + }, + { + "epoch": 0.48584, + "grad_norm": 1.7031455039978027, + "learning_rate": 5.228811120683738e-06, + "loss": 0.3422, + "step": 6073 + }, + { + "epoch": 0.48592, + "grad_norm": 1.5051372051239014, + "learning_rate": 5.2275547878394335e-06, + "loss": 0.2661, + "step": 6074 + }, + { + "epoch": 0.486, + "grad_norm": 1.9860918521881104, + "learning_rate": 5.2262984405985005e-06, + "loss": 0.3608, + "step": 6075 + }, + { + "epoch": 0.48608, + "grad_norm": 1.6986693143844604, + "learning_rate": 5.225042079040424e-06, + "loss": 0.335, + "step": 6076 + }, + { + "epoch": 0.48616, + "grad_norm": 1.692690372467041, + "learning_rate": 5.223785703244685e-06, + "loss": 0.4128, + "step": 6077 + }, + { + "epoch": 0.48624, + "grad_norm": 1.331215739250183, + "learning_rate": 5.222529313290774e-06, + "loss": 0.3168, + "step": 6078 + }, + { + "epoch": 0.48632, + "grad_norm": 1.5098655223846436, + "learning_rate": 5.221272909258178e-06, + "loss": 0.2776, + "step": 6079 + }, + { + "epoch": 0.4864, + "grad_norm": 1.5736565589904785, + "learning_rate": 5.220016491226387e-06, + "loss": 0.4146, + "step": 6080 + }, + { + "epoch": 0.48648, + "grad_norm": 1.3182066679000854, + "learning_rate": 5.2187600592748876e-06, + "loss": 0.2653, + "step": 6081 + }, + { + "epoch": 0.48656, + "grad_norm": 1.3879201412200928, + "learning_rate": 5.217503613483172e-06, + "loss": 0.3127, + "step": 6082 + }, + { + "epoch": 0.48664, + "grad_norm": 1.3078670501708984, + "learning_rate": 5.21624715393073e-06, + "loss": 0.2701, + "step": 6083 + }, + { + "epoch": 0.48672, + "grad_norm": 1.890555739402771, + "learning_rate": 5.214990680697054e-06, + "loss": 0.5105, + "step": 6084 + }, + { + "epoch": 0.4868, + "grad_norm": 1.693420171737671, + "learning_rate": 5.213734193861637e-06, + "loss": 0.3455, + "step": 6085 + }, + { + "epoch": 0.48688, + "grad_norm": 1.6514363288879395, + "learning_rate": 5.212477693503973e-06, + "loss": 0.4083, + "step": 6086 + }, + { + "epoch": 0.48696, + "grad_norm": 1.5142364501953125, + "learning_rate": 5.211221179703555e-06, + "loss": 0.3556, + "step": 6087 + }, + { + "epoch": 0.48704, + "grad_norm": 1.2250992059707642, + "learning_rate": 5.209964652539882e-06, + "loss": 0.2917, + "step": 6088 + }, + { + "epoch": 0.48712, + "grad_norm": 1.4357571601867676, + "learning_rate": 5.2087081120924464e-06, + "loss": 0.2784, + "step": 6089 + }, + { + "epoch": 0.4872, + "grad_norm": 1.5746808052062988, + "learning_rate": 5.207451558440747e-06, + "loss": 0.3645, + "step": 6090 + }, + { + "epoch": 0.48728, + "grad_norm": 1.1087597608566284, + "learning_rate": 5.206194991664283e-06, + "loss": 0.2236, + "step": 6091 + }, + { + "epoch": 0.48736, + "grad_norm": 1.6777054071426392, + "learning_rate": 5.204938411842551e-06, + "loss": 0.4101, + "step": 6092 + }, + { + "epoch": 0.48744, + "grad_norm": 2.015167474746704, + "learning_rate": 5.2036818190550496e-06, + "loss": 0.4252, + "step": 6093 + }, + { + "epoch": 0.48752, + "grad_norm": 1.6957621574401855, + "learning_rate": 5.202425213381284e-06, + "loss": 0.4532, + "step": 6094 + }, + { + "epoch": 0.4876, + "grad_norm": 1.689834475517273, + "learning_rate": 5.2011685949007506e-06, + "loss": 0.322, + "step": 6095 + }, + { + "epoch": 0.48768, + "grad_norm": 1.2407318353652954, + "learning_rate": 5.199911963692953e-06, + "loss": 0.2784, + "step": 6096 + }, + { + "epoch": 0.48776, + "grad_norm": 0.9591389298439026, + "learning_rate": 5.198655319837395e-06, + "loss": 0.232, + "step": 6097 + }, + { + "epoch": 0.48784, + "grad_norm": 1.498170018196106, + "learning_rate": 5.197398663413579e-06, + "loss": 0.2981, + "step": 6098 + }, + { + "epoch": 0.48792, + "grad_norm": 1.4004147052764893, + "learning_rate": 5.1961419945010104e-06, + "loss": 0.3348, + "step": 6099 + }, + { + "epoch": 0.488, + "grad_norm": 1.3157734870910645, + "learning_rate": 5.194885313179195e-06, + "loss": 0.3471, + "step": 6100 + }, + { + "epoch": 0.48808, + "grad_norm": 1.5406330823898315, + "learning_rate": 5.1936286195276374e-06, + "loss": 0.334, + "step": 6101 + }, + { + "epoch": 0.48816, + "grad_norm": 1.4737725257873535, + "learning_rate": 5.192371913625845e-06, + "loss": 0.3255, + "step": 6102 + }, + { + "epoch": 0.48824, + "grad_norm": 1.704572319984436, + "learning_rate": 5.1911151955533254e-06, + "loss": 0.3709, + "step": 6103 + }, + { + "epoch": 0.48832, + "grad_norm": 1.5798331499099731, + "learning_rate": 5.1898584653895865e-06, + "loss": 0.2797, + "step": 6104 + }, + { + "epoch": 0.4884, + "grad_norm": 1.6223543882369995, + "learning_rate": 5.188601723214139e-06, + "loss": 0.3452, + "step": 6105 + }, + { + "epoch": 0.48848, + "grad_norm": 1.5775635242462158, + "learning_rate": 5.18734496910649e-06, + "loss": 0.3373, + "step": 6106 + }, + { + "epoch": 0.48856, + "grad_norm": 1.9005051851272583, + "learning_rate": 5.186088203146152e-06, + "loss": 0.4212, + "step": 6107 + }, + { + "epoch": 0.48864, + "grad_norm": 1.3695884943008423, + "learning_rate": 5.184831425412636e-06, + "loss": 0.3052, + "step": 6108 + }, + { + "epoch": 0.48872, + "grad_norm": 1.3192788362503052, + "learning_rate": 5.1835746359854544e-06, + "loss": 0.2665, + "step": 6109 + }, + { + "epoch": 0.4888, + "grad_norm": 1.645479679107666, + "learning_rate": 5.1823178349441195e-06, + "loss": 0.3848, + "step": 6110 + }, + { + "epoch": 0.48888, + "grad_norm": 1.4701766967773438, + "learning_rate": 5.181061022368145e-06, + "loss": 0.2934, + "step": 6111 + }, + { + "epoch": 0.48896, + "grad_norm": 1.2284507751464844, + "learning_rate": 5.179804198337046e-06, + "loss": 0.2826, + "step": 6112 + }, + { + "epoch": 0.48904, + "grad_norm": 1.8432642221450806, + "learning_rate": 5.178547362930337e-06, + "loss": 0.354, + "step": 6113 + }, + { + "epoch": 0.48912, + "grad_norm": 1.703628420829773, + "learning_rate": 5.1772905162275345e-06, + "loss": 0.3431, + "step": 6114 + }, + { + "epoch": 0.4892, + "grad_norm": 1.5386126041412354, + "learning_rate": 5.176033658308154e-06, + "loss": 0.3512, + "step": 6115 + }, + { + "epoch": 0.48928, + "grad_norm": 1.7287826538085938, + "learning_rate": 5.174776789251712e-06, + "loss": 0.3485, + "step": 6116 + }, + { + "epoch": 0.48936, + "grad_norm": 1.4497462511062622, + "learning_rate": 5.173519909137728e-06, + "loss": 0.3208, + "step": 6117 + }, + { + "epoch": 0.48944, + "grad_norm": 1.2991448640823364, + "learning_rate": 5.17226301804572e-06, + "loss": 0.2982, + "step": 6118 + }, + { + "epoch": 0.48952, + "grad_norm": 1.4704984426498413, + "learning_rate": 5.171006116055206e-06, + "loss": 0.3264, + "step": 6119 + }, + { + "epoch": 0.4896, + "grad_norm": 1.2503539323806763, + "learning_rate": 5.169749203245709e-06, + "loss": 0.2779, + "step": 6120 + }, + { + "epoch": 0.48968, + "grad_norm": 1.3474243879318237, + "learning_rate": 5.168492279696747e-06, + "loss": 0.3001, + "step": 6121 + }, + { + "epoch": 0.48976, + "grad_norm": 1.846341609954834, + "learning_rate": 5.167235345487841e-06, + "loss": 0.3778, + "step": 6122 + }, + { + "epoch": 0.48984, + "grad_norm": 1.60556960105896, + "learning_rate": 5.165978400698516e-06, + "loss": 0.3083, + "step": 6123 + }, + { + "epoch": 0.48992, + "grad_norm": 1.5098539590835571, + "learning_rate": 5.164721445408292e-06, + "loss": 0.2916, + "step": 6124 + }, + { + "epoch": 0.49, + "grad_norm": 1.64169180393219, + "learning_rate": 5.163464479696694e-06, + "loss": 0.3603, + "step": 6125 + }, + { + "epoch": 0.49008, + "grad_norm": 1.6956416368484497, + "learning_rate": 5.162207503643246e-06, + "loss": 0.3837, + "step": 6126 + }, + { + "epoch": 0.49016, + "grad_norm": 1.6618326902389526, + "learning_rate": 5.160950517327471e-06, + "loss": 0.3823, + "step": 6127 + }, + { + "epoch": 0.49024, + "grad_norm": 1.7037644386291504, + "learning_rate": 5.159693520828895e-06, + "loss": 0.3085, + "step": 6128 + }, + { + "epoch": 0.49032, + "grad_norm": 1.144739031791687, + "learning_rate": 5.158436514227045e-06, + "loss": 0.2081, + "step": 6129 + }, + { + "epoch": 0.4904, + "grad_norm": 1.2798271179199219, + "learning_rate": 5.157179497601447e-06, + "loss": 0.3009, + "step": 6130 + }, + { + "epoch": 0.49048, + "grad_norm": 1.7405157089233398, + "learning_rate": 5.155922471031627e-06, + "loss": 0.3543, + "step": 6131 + }, + { + "epoch": 0.49056, + "grad_norm": 1.0767748355865479, + "learning_rate": 5.154665434597115e-06, + "loss": 0.2166, + "step": 6132 + }, + { + "epoch": 0.49064, + "grad_norm": 1.536720633506775, + "learning_rate": 5.153408388377438e-06, + "loss": 0.2829, + "step": 6133 + }, + { + "epoch": 0.49072, + "grad_norm": 1.578690767288208, + "learning_rate": 5.152151332452125e-06, + "loss": 0.3479, + "step": 6134 + }, + { + "epoch": 0.4908, + "grad_norm": 1.888026475906372, + "learning_rate": 5.150894266900708e-06, + "loss": 0.3414, + "step": 6135 + }, + { + "epoch": 0.49088, + "grad_norm": 1.6138415336608887, + "learning_rate": 5.149637191802714e-06, + "loss": 0.2913, + "step": 6136 + }, + { + "epoch": 0.49096, + "grad_norm": 1.5494178533554077, + "learning_rate": 5.148380107237677e-06, + "loss": 0.3035, + "step": 6137 + }, + { + "epoch": 0.49104, + "grad_norm": 1.4410408735275269, + "learning_rate": 5.1471230132851254e-06, + "loss": 0.3202, + "step": 6138 + }, + { + "epoch": 0.49112, + "grad_norm": 1.510621428489685, + "learning_rate": 5.145865910024595e-06, + "loss": 0.2863, + "step": 6139 + }, + { + "epoch": 0.4912, + "grad_norm": 1.8136560916900635, + "learning_rate": 5.144608797535614e-06, + "loss": 0.3635, + "step": 6140 + }, + { + "epoch": 0.49128, + "grad_norm": 1.6779876947402954, + "learning_rate": 5.143351675897721e-06, + "loss": 0.4096, + "step": 6141 + }, + { + "epoch": 0.49136, + "grad_norm": 1.5982199907302856, + "learning_rate": 5.1420945451904455e-06, + "loss": 0.2868, + "step": 6142 + }, + { + "epoch": 0.49144, + "grad_norm": 1.6804651021957397, + "learning_rate": 5.140837405493324e-06, + "loss": 0.3119, + "step": 6143 + }, + { + "epoch": 0.49152, + "grad_norm": 1.7489296197891235, + "learning_rate": 5.13958025688589e-06, + "loss": 0.302, + "step": 6144 + }, + { + "epoch": 0.4916, + "grad_norm": 1.693847417831421, + "learning_rate": 5.138323099447681e-06, + "loss": 0.3248, + "step": 6145 + }, + { + "epoch": 0.49168, + "grad_norm": 1.360862374305725, + "learning_rate": 5.137065933258233e-06, + "loss": 0.2943, + "step": 6146 + }, + { + "epoch": 0.49176, + "grad_norm": 2.244718313217163, + "learning_rate": 5.135808758397082e-06, + "loss": 0.4029, + "step": 6147 + }, + { + "epoch": 0.49184, + "grad_norm": 1.749489426612854, + "learning_rate": 5.134551574943765e-06, + "loss": 0.3362, + "step": 6148 + }, + { + "epoch": 0.49192, + "grad_norm": 1.4736480712890625, + "learning_rate": 5.1332943829778205e-06, + "loss": 0.2781, + "step": 6149 + }, + { + "epoch": 0.492, + "grad_norm": 1.5586931705474854, + "learning_rate": 5.132037182578785e-06, + "loss": 0.2709, + "step": 6150 + }, + { + "epoch": 0.49208, + "grad_norm": 1.2036563158035278, + "learning_rate": 5.130779973826199e-06, + "loss": 0.2957, + "step": 6151 + }, + { + "epoch": 0.49216, + "grad_norm": 2.098465919494629, + "learning_rate": 5.129522756799602e-06, + "loss": 0.4698, + "step": 6152 + }, + { + "epoch": 0.49224, + "grad_norm": 1.3323612213134766, + "learning_rate": 5.128265531578535e-06, + "loss": 0.3015, + "step": 6153 + }, + { + "epoch": 0.49232, + "grad_norm": 1.4440571069717407, + "learning_rate": 5.127008298242535e-06, + "loss": 0.3037, + "step": 6154 + }, + { + "epoch": 0.4924, + "grad_norm": 1.412387728691101, + "learning_rate": 5.125751056871146e-06, + "loss": 0.3701, + "step": 6155 + }, + { + "epoch": 0.49248, + "grad_norm": 1.841917634010315, + "learning_rate": 5.124493807543908e-06, + "loss": 0.3041, + "step": 6156 + }, + { + "epoch": 0.49256, + "grad_norm": 1.612427830696106, + "learning_rate": 5.1232365503403626e-06, + "loss": 0.3198, + "step": 6157 + }, + { + "epoch": 0.49264, + "grad_norm": 1.3332172632217407, + "learning_rate": 5.1219792853400545e-06, + "loss": 0.3406, + "step": 6158 + }, + { + "epoch": 0.49272, + "grad_norm": 1.876379370689392, + "learning_rate": 5.120722012622524e-06, + "loss": 0.4073, + "step": 6159 + }, + { + "epoch": 0.4928, + "grad_norm": 1.5823307037353516, + "learning_rate": 5.119464732267317e-06, + "loss": 0.3825, + "step": 6160 + }, + { + "epoch": 0.49288, + "grad_norm": 1.216632604598999, + "learning_rate": 5.118207444353975e-06, + "loss": 0.3464, + "step": 6161 + }, + { + "epoch": 0.49296, + "grad_norm": 2.3204121589660645, + "learning_rate": 5.116950148962043e-06, + "loss": 0.3724, + "step": 6162 + }, + { + "epoch": 0.49304, + "grad_norm": 1.6256400346755981, + "learning_rate": 5.115692846171067e-06, + "loss": 0.3355, + "step": 6163 + }, + { + "epoch": 0.49312, + "grad_norm": 1.4591851234436035, + "learning_rate": 5.114435536060593e-06, + "loss": 0.309, + "step": 6164 + }, + { + "epoch": 0.4932, + "grad_norm": 2.0748255252838135, + "learning_rate": 5.113178218710164e-06, + "loss": 0.3612, + "step": 6165 + }, + { + "epoch": 0.49328, + "grad_norm": 1.8025262355804443, + "learning_rate": 5.11192089419933e-06, + "loss": 0.4716, + "step": 6166 + }, + { + "epoch": 0.49336, + "grad_norm": 1.4925312995910645, + "learning_rate": 5.110663562607632e-06, + "loss": 0.3646, + "step": 6167 + }, + { + "epoch": 0.49344, + "grad_norm": 1.710847020149231, + "learning_rate": 5.109406224014623e-06, + "loss": 0.353, + "step": 6168 + }, + { + "epoch": 0.49352, + "grad_norm": 1.3324249982833862, + "learning_rate": 5.108148878499847e-06, + "loss": 0.2612, + "step": 6169 + }, + { + "epoch": 0.4936, + "grad_norm": 1.7299015522003174, + "learning_rate": 5.106891526142854e-06, + "loss": 0.3321, + "step": 6170 + }, + { + "epoch": 0.49368, + "grad_norm": 1.7776432037353516, + "learning_rate": 5.105634167023193e-06, + "loss": 0.3346, + "step": 6171 + }, + { + "epoch": 0.49376, + "grad_norm": 1.4523450136184692, + "learning_rate": 5.104376801220411e-06, + "loss": 0.2788, + "step": 6172 + }, + { + "epoch": 0.49384, + "grad_norm": 1.5704206228256226, + "learning_rate": 5.103119428814057e-06, + "loss": 0.3746, + "step": 6173 + }, + { + "epoch": 0.49392, + "grad_norm": 1.2726726531982422, + "learning_rate": 5.1018620498836825e-06, + "loss": 0.2869, + "step": 6174 + }, + { + "epoch": 0.494, + "grad_norm": 1.7216542959213257, + "learning_rate": 5.1006046645088355e-06, + "loss": 0.4106, + "step": 6175 + }, + { + "epoch": 0.49408, + "grad_norm": 1.4835233688354492, + "learning_rate": 5.09934727276907e-06, + "loss": 0.3307, + "step": 6176 + }, + { + "epoch": 0.49416, + "grad_norm": 1.483788251876831, + "learning_rate": 5.0980898747439345e-06, + "loss": 0.3311, + "step": 6177 + }, + { + "epoch": 0.49424, + "grad_norm": 1.5280448198318481, + "learning_rate": 5.096832470512981e-06, + "loss": 0.3275, + "step": 6178 + }, + { + "epoch": 0.49432, + "grad_norm": 1.6087018251419067, + "learning_rate": 5.095575060155761e-06, + "loss": 0.3893, + "step": 6179 + }, + { + "epoch": 0.4944, + "grad_norm": 1.2583895921707153, + "learning_rate": 5.094317643751825e-06, + "loss": 0.269, + "step": 6180 + }, + { + "epoch": 0.49448, + "grad_norm": 2.5767970085144043, + "learning_rate": 5.09306022138073e-06, + "loss": 0.3843, + "step": 6181 + }, + { + "epoch": 0.49456, + "grad_norm": 1.3988746404647827, + "learning_rate": 5.091802793122025e-06, + "loss": 0.2755, + "step": 6182 + }, + { + "epoch": 0.49464, + "grad_norm": 1.3515316247940063, + "learning_rate": 5.090545359055264e-06, + "loss": 0.2335, + "step": 6183 + }, + { + "epoch": 0.49472, + "grad_norm": 1.4173052310943604, + "learning_rate": 5.089287919260002e-06, + "loss": 0.3242, + "step": 6184 + }, + { + "epoch": 0.4948, + "grad_norm": 1.7955999374389648, + "learning_rate": 5.088030473815791e-06, + "loss": 0.4096, + "step": 6185 + }, + { + "epoch": 0.49488, + "grad_norm": 1.374130129814148, + "learning_rate": 5.0867730228021875e-06, + "loss": 0.2919, + "step": 6186 + }, + { + "epoch": 0.49496, + "grad_norm": 1.3074638843536377, + "learning_rate": 5.085515566298744e-06, + "loss": 0.2317, + "step": 6187 + }, + { + "epoch": 0.49504, + "grad_norm": 1.6454929113388062, + "learning_rate": 5.084258104385018e-06, + "loss": 0.286, + "step": 6188 + }, + { + "epoch": 0.49512, + "grad_norm": 1.3493096828460693, + "learning_rate": 5.0830006371405625e-06, + "loss": 0.2989, + "step": 6189 + }, + { + "epoch": 0.4952, + "grad_norm": 1.6204330921173096, + "learning_rate": 5.081743164644935e-06, + "loss": 0.2795, + "step": 6190 + }, + { + "epoch": 0.49528, + "grad_norm": 1.340806245803833, + "learning_rate": 5.080485686977691e-06, + "loss": 0.2935, + "step": 6191 + }, + { + "epoch": 0.49536, + "grad_norm": 1.6534948348999023, + "learning_rate": 5.079228204218387e-06, + "loss": 0.3131, + "step": 6192 + }, + { + "epoch": 0.49544, + "grad_norm": 1.6619195938110352, + "learning_rate": 5.07797071644658e-06, + "loss": 0.4199, + "step": 6193 + }, + { + "epoch": 0.49552, + "grad_norm": 1.7682374715805054, + "learning_rate": 5.0767132237418275e-06, + "loss": 0.3768, + "step": 6194 + }, + { + "epoch": 0.4956, + "grad_norm": 1.9795269966125488, + "learning_rate": 5.075455726183685e-06, + "loss": 0.5178, + "step": 6195 + }, + { + "epoch": 0.49568, + "grad_norm": 1.5913708209991455, + "learning_rate": 5.07419822385171e-06, + "loss": 0.2969, + "step": 6196 + }, + { + "epoch": 0.49576, + "grad_norm": 1.3208245038986206, + "learning_rate": 5.072940716825462e-06, + "loss": 0.2773, + "step": 6197 + }, + { + "epoch": 0.49584, + "grad_norm": 1.503617286682129, + "learning_rate": 5.071683205184499e-06, + "loss": 0.2821, + "step": 6198 + }, + { + "epoch": 0.49592, + "grad_norm": 1.469290852546692, + "learning_rate": 5.070425689008381e-06, + "loss": 0.289, + "step": 6199 + }, + { + "epoch": 0.496, + "grad_norm": 1.5494263172149658, + "learning_rate": 5.069168168376664e-06, + "loss": 0.3019, + "step": 6200 + }, + { + "epoch": 0.49608, + "grad_norm": 1.5420727729797363, + "learning_rate": 5.067910643368908e-06, + "loss": 0.2645, + "step": 6201 + }, + { + "epoch": 0.49616, + "grad_norm": 1.399019718170166, + "learning_rate": 5.066653114064674e-06, + "loss": 0.3092, + "step": 6202 + }, + { + "epoch": 0.49624, + "grad_norm": 1.9305436611175537, + "learning_rate": 5.065395580543519e-06, + "loss": 0.3418, + "step": 6203 + }, + { + "epoch": 0.49632, + "grad_norm": 1.197814702987671, + "learning_rate": 5.064138042885006e-06, + "loss": 0.2597, + "step": 6204 + }, + { + "epoch": 0.4964, + "grad_norm": 1.5163662433624268, + "learning_rate": 5.062880501168693e-06, + "loss": 0.3871, + "step": 6205 + }, + { + "epoch": 0.49648, + "grad_norm": 1.3596129417419434, + "learning_rate": 5.061622955474142e-06, + "loss": 0.3607, + "step": 6206 + }, + { + "epoch": 0.49656, + "grad_norm": 2.0122718811035156, + "learning_rate": 5.060365405880911e-06, + "loss": 0.3757, + "step": 6207 + }, + { + "epoch": 0.49664, + "grad_norm": 1.1948397159576416, + "learning_rate": 5.059107852468565e-06, + "loss": 0.256, + "step": 6208 + }, + { + "epoch": 0.49672, + "grad_norm": 1.5326647758483887, + "learning_rate": 5.057850295316661e-06, + "loss": 0.4097, + "step": 6209 + }, + { + "epoch": 0.4968, + "grad_norm": 1.3815491199493408, + "learning_rate": 5.0565927345047614e-06, + "loss": 0.3258, + "step": 6210 + }, + { + "epoch": 0.49688, + "grad_norm": 1.5262930393218994, + "learning_rate": 5.055335170112432e-06, + "loss": 0.3474, + "step": 6211 + }, + { + "epoch": 0.49696, + "grad_norm": 1.2492114305496216, + "learning_rate": 5.054077602219229e-06, + "loss": 0.2956, + "step": 6212 + }, + { + "epoch": 0.49704, + "grad_norm": 1.3622100353240967, + "learning_rate": 5.052820030904719e-06, + "loss": 0.3019, + "step": 6213 + }, + { + "epoch": 0.49712, + "grad_norm": 1.563354730606079, + "learning_rate": 5.051562456248461e-06, + "loss": 0.4146, + "step": 6214 + }, + { + "epoch": 0.4972, + "grad_norm": 1.7303005456924438, + "learning_rate": 5.05030487833002e-06, + "loss": 0.3324, + "step": 6215 + }, + { + "epoch": 0.49728, + "grad_norm": 1.239866852760315, + "learning_rate": 5.049047297228956e-06, + "loss": 0.2878, + "step": 6216 + }, + { + "epoch": 0.49736, + "grad_norm": 1.492857813835144, + "learning_rate": 5.047789713024836e-06, + "loss": 0.3036, + "step": 6217 + }, + { + "epoch": 0.49744, + "grad_norm": 1.4982048273086548, + "learning_rate": 5.046532125797219e-06, + "loss": 0.3185, + "step": 6218 + }, + { + "epoch": 0.49752, + "grad_norm": 1.7222771644592285, + "learning_rate": 5.0452745356256705e-06, + "loss": 0.4005, + "step": 6219 + }, + { + "epoch": 0.4976, + "grad_norm": 1.1927220821380615, + "learning_rate": 5.044016942589754e-06, + "loss": 0.2655, + "step": 6220 + }, + { + "epoch": 0.49768, + "grad_norm": 1.6021721363067627, + "learning_rate": 5.042759346769031e-06, + "loss": 0.2886, + "step": 6221 + }, + { + "epoch": 0.49776, + "grad_norm": 2.103205442428589, + "learning_rate": 5.041501748243069e-06, + "loss": 0.5297, + "step": 6222 + }, + { + "epoch": 0.49784, + "grad_norm": 1.3879965543746948, + "learning_rate": 5.040244147091431e-06, + "loss": 0.3109, + "step": 6223 + }, + { + "epoch": 0.49792, + "grad_norm": 2.045581817626953, + "learning_rate": 5.0389865433936776e-06, + "loss": 0.3605, + "step": 6224 + }, + { + "epoch": 0.498, + "grad_norm": 1.3637545108795166, + "learning_rate": 5.037728937229378e-06, + "loss": 0.2776, + "step": 6225 + }, + { + "epoch": 0.49808, + "grad_norm": 1.311774492263794, + "learning_rate": 5.036471328678095e-06, + "loss": 0.2672, + "step": 6226 + }, + { + "epoch": 0.49816, + "grad_norm": 1.8853634595870972, + "learning_rate": 5.035213717819393e-06, + "loss": 0.3784, + "step": 6227 + }, + { + "epoch": 0.49824, + "grad_norm": 1.301746129989624, + "learning_rate": 5.033956104732836e-06, + "loss": 0.2905, + "step": 6228 + }, + { + "epoch": 0.49832, + "grad_norm": 1.6232069730758667, + "learning_rate": 5.032698489497991e-06, + "loss": 0.3362, + "step": 6229 + }, + { + "epoch": 0.4984, + "grad_norm": 1.5022000074386597, + "learning_rate": 5.031440872194422e-06, + "loss": 0.2796, + "step": 6230 + }, + { + "epoch": 0.49848, + "grad_norm": 1.7945005893707275, + "learning_rate": 5.030183252901694e-06, + "loss": 0.3596, + "step": 6231 + }, + { + "epoch": 0.49856, + "grad_norm": 1.5525250434875488, + "learning_rate": 5.02892563169937e-06, + "loss": 0.343, + "step": 6232 + }, + { + "epoch": 0.49864, + "grad_norm": 1.631269931793213, + "learning_rate": 5.027668008667022e-06, + "loss": 0.3799, + "step": 6233 + }, + { + "epoch": 0.49872, + "grad_norm": 1.4517936706542969, + "learning_rate": 5.026410383884209e-06, + "loss": 0.338, + "step": 6234 + }, + { + "epoch": 0.4988, + "grad_norm": 1.387960433959961, + "learning_rate": 5.025152757430501e-06, + "loss": 0.3515, + "step": 6235 + }, + { + "epoch": 0.49888, + "grad_norm": 1.4274933338165283, + "learning_rate": 5.023895129385461e-06, + "loss": 0.3152, + "step": 6236 + }, + { + "epoch": 0.49896, + "grad_norm": 1.2925071716308594, + "learning_rate": 5.022637499828656e-06, + "loss": 0.3087, + "step": 6237 + }, + { + "epoch": 0.49904, + "grad_norm": 1.6201573610305786, + "learning_rate": 5.021379868839655e-06, + "loss": 0.3988, + "step": 6238 + }, + { + "epoch": 0.49912, + "grad_norm": 1.1447267532348633, + "learning_rate": 5.020122236498018e-06, + "loss": 0.281, + "step": 6239 + }, + { + "epoch": 0.4992, + "grad_norm": 1.618818998336792, + "learning_rate": 5.018864602883315e-06, + "loss": 0.34, + "step": 6240 + }, + { + "epoch": 0.49928, + "grad_norm": 1.4714045524597168, + "learning_rate": 5.017606968075113e-06, + "loss": 0.3352, + "step": 6241 + }, + { + "epoch": 0.49936, + "grad_norm": 1.8967351913452148, + "learning_rate": 5.016349332152975e-06, + "loss": 0.4835, + "step": 6242 + }, + { + "epoch": 0.49944, + "grad_norm": 1.4806861877441406, + "learning_rate": 5.0150916951964715e-06, + "loss": 0.3297, + "step": 6243 + }, + { + "epoch": 0.49952, + "grad_norm": 1.4988765716552734, + "learning_rate": 5.013834057285165e-06, + "loss": 0.3896, + "step": 6244 + }, + { + "epoch": 0.4996, + "grad_norm": 1.3121827840805054, + "learning_rate": 5.012576418498626e-06, + "loss": 0.3397, + "step": 6245 + }, + { + "epoch": 0.49968, + "grad_norm": 1.7312182188034058, + "learning_rate": 5.0113187789164176e-06, + "loss": 0.3462, + "step": 6246 + }, + { + "epoch": 0.49976, + "grad_norm": 1.460113763809204, + "learning_rate": 5.010061138618109e-06, + "loss": 0.2435, + "step": 6247 + }, + { + "epoch": 0.49984, + "grad_norm": 1.681500792503357, + "learning_rate": 5.008803497683266e-06, + "loss": 0.452, + "step": 6248 + }, + { + "epoch": 0.49992, + "grad_norm": 1.528382658958435, + "learning_rate": 5.007545856191453e-06, + "loss": 0.3297, + "step": 6249 + }, + { + "epoch": 0.5, + "grad_norm": 1.7061470746994019, + "learning_rate": 5.006288214222242e-06, + "loss": 0.4542, + "step": 6250 + } + ], + "logging_steps": 1, + "max_steps": 12500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2449718975467684e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}