diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.4379225568531039, + "epoch": 0.53779963122311, "eval_steps": 500, - "global_step": 5700, + "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -19957,6 +19957,4556 @@ "learning_rate": 1.8248309772587588e-05, "loss": 1.3252, "step": 5700 + }, + { + "epoch": 0.4380762138905962, + "grad_norm": 2.912013530731201, + "learning_rate": 1.8247695144437614e-05, + "loss": 1.3125, + "step": 5702 + }, + { + "epoch": 0.4382298709280885, + "grad_norm": 3.1190555095672607, + "learning_rate": 1.8247080516287648e-05, + "loss": 1.5832, + "step": 5704 + }, + { + "epoch": 0.43838352796558083, + "grad_norm": 3.2913010120391846, + "learning_rate": 1.8246465888137677e-05, + "loss": 1.4592, + "step": 5706 + }, + { + "epoch": 0.43853718500307315, + "grad_norm": 3.0158445835113525, + "learning_rate": 1.8245851259987707e-05, + "loss": 1.258, + "step": 5708 + }, + { + "epoch": 0.43869084204056547, + "grad_norm": 2.7523088455200195, + "learning_rate": 1.824523663183774e-05, + "loss": 1.3033, + "step": 5710 + }, + { + "epoch": 0.4388444990780578, + "grad_norm": 3.2428712844848633, + "learning_rate": 1.824462200368777e-05, + "loss": 1.5227, + "step": 5712 + }, + { + "epoch": 0.4389981561155501, + "grad_norm": 3.1375772953033447, + "learning_rate": 1.8244007375537803e-05, + "loss": 1.2947, + "step": 5714 + }, + { + "epoch": 0.4391518131530424, + "grad_norm": 3.0907890796661377, + "learning_rate": 1.8243392747387832e-05, + "loss": 1.5986, + "step": 5716 + }, + { + "epoch": 0.43930547019053473, + "grad_norm": 2.9191091060638428, + "learning_rate": 1.8242778119237862e-05, + "loss": 1.3305, + "step": 5718 + }, + { + "epoch": 0.43945912722802705, + "grad_norm": 2.765789747238159, + "learning_rate": 1.8242163491087895e-05, + "loss": 1.3996, + "step": 5720 + }, + { + "epoch": 0.43961278426551936, + "grad_norm": 3.196028470993042, + "learning_rate": 1.8241548862937925e-05, + "loss": 1.4643, + "step": 5722 + }, + { + "epoch": 0.4397664413030117, + "grad_norm": 3.159723997116089, + "learning_rate": 1.8240934234787955e-05, + "loss": 1.2982, + "step": 5724 + }, + { + "epoch": 0.439920098340504, + "grad_norm": 4.066509246826172, + "learning_rate": 1.8240319606637988e-05, + "loss": 1.4798, + "step": 5726 + }, + { + "epoch": 0.4400737553779963, + "grad_norm": 3.310293436050415, + "learning_rate": 1.8239704978488014e-05, + "loss": 1.3133, + "step": 5728 + }, + { + "epoch": 0.44022741241548863, + "grad_norm": 3.166078805923462, + "learning_rate": 1.8239090350338047e-05, + "loss": 1.3163, + "step": 5730 + }, + { + "epoch": 0.44038106945298094, + "grad_norm": 3.0429303646087646, + "learning_rate": 1.8238475722188077e-05, + "loss": 1.3178, + "step": 5732 + }, + { + "epoch": 0.44053472649047326, + "grad_norm": 2.984344959259033, + "learning_rate": 1.8237861094038106e-05, + "loss": 1.4449, + "step": 5734 + }, + { + "epoch": 0.4406883835279656, + "grad_norm": 3.379456043243408, + "learning_rate": 1.823724646588814e-05, + "loss": 1.3448, + "step": 5736 + }, + { + "epoch": 0.4408420405654579, + "grad_norm": 3.0715973377227783, + "learning_rate": 1.823663183773817e-05, + "loss": 1.4384, + "step": 5738 + }, + { + "epoch": 0.4409956976029502, + "grad_norm": 3.0291173458099365, + "learning_rate": 1.8236017209588202e-05, + "loss": 1.452, + "step": 5740 + }, + { + "epoch": 0.4411493546404425, + "grad_norm": 3.0475244522094727, + "learning_rate": 1.8235402581438232e-05, + "loss": 1.3113, + "step": 5742 + }, + { + "epoch": 0.44130301167793484, + "grad_norm": 3.0464026927948, + "learning_rate": 1.823478795328826e-05, + "loss": 1.2598, + "step": 5744 + }, + { + "epoch": 0.44145666871542716, + "grad_norm": 3.1016945838928223, + "learning_rate": 1.8234173325138295e-05, + "loss": 1.396, + "step": 5746 + }, + { + "epoch": 0.4416103257529195, + "grad_norm": 3.356527328491211, + "learning_rate": 1.8233558696988324e-05, + "loss": 1.4148, + "step": 5748 + }, + { + "epoch": 0.4417639827904118, + "grad_norm": 2.8270328044891357, + "learning_rate": 1.8232944068838354e-05, + "loss": 1.346, + "step": 5750 + }, + { + "epoch": 0.4419176398279041, + "grad_norm": 3.364140510559082, + "learning_rate": 1.8232329440688387e-05, + "loss": 1.3203, + "step": 5752 + }, + { + "epoch": 0.4420712968653964, + "grad_norm": 3.246849298477173, + "learning_rate": 1.8231714812538413e-05, + "loss": 1.5842, + "step": 5754 + }, + { + "epoch": 0.44222495390288874, + "grad_norm": 3.0349695682525635, + "learning_rate": 1.8231100184388446e-05, + "loss": 1.3516, + "step": 5756 + }, + { + "epoch": 0.44237861094038106, + "grad_norm": 3.3905465602874756, + "learning_rate": 1.8230485556238476e-05, + "loss": 1.3804, + "step": 5758 + }, + { + "epoch": 0.4425322679778734, + "grad_norm": 3.100095510482788, + "learning_rate": 1.822987092808851e-05, + "loss": 1.3149, + "step": 5760 + }, + { + "epoch": 0.4426859250153657, + "grad_norm": 2.685727596282959, + "learning_rate": 1.822925629993854e-05, + "loss": 1.4374, + "step": 5762 + }, + { + "epoch": 0.442839582052858, + "grad_norm": 3.614854335784912, + "learning_rate": 1.822864167178857e-05, + "loss": 1.4598, + "step": 5764 + }, + { + "epoch": 0.4429932390903503, + "grad_norm": 3.2616031169891357, + "learning_rate": 1.82280270436386e-05, + "loss": 1.3823, + "step": 5766 + }, + { + "epoch": 0.44314689612784264, + "grad_norm": 3.1923673152923584, + "learning_rate": 1.822741241548863e-05, + "loss": 1.3283, + "step": 5768 + }, + { + "epoch": 0.44330055316533495, + "grad_norm": 2.938978433609009, + "learning_rate": 1.822679778733866e-05, + "loss": 1.3428, + "step": 5770 + }, + { + "epoch": 0.44345421020282727, + "grad_norm": 2.983696222305298, + "learning_rate": 1.8226183159188694e-05, + "loss": 1.4124, + "step": 5772 + }, + { + "epoch": 0.4436078672403196, + "grad_norm": 2.893930673599243, + "learning_rate": 1.8225568531038724e-05, + "loss": 1.3289, + "step": 5774 + }, + { + "epoch": 0.4437615242778119, + "grad_norm": 2.9682881832122803, + "learning_rate": 1.8224953902888753e-05, + "loss": 1.3341, + "step": 5776 + }, + { + "epoch": 0.4439151813153042, + "grad_norm": 3.4092354774475098, + "learning_rate": 1.8224339274738786e-05, + "loss": 1.4816, + "step": 5778 + }, + { + "epoch": 0.44406883835279654, + "grad_norm": 3.190753698348999, + "learning_rate": 1.8223724646588816e-05, + "loss": 1.4722, + "step": 5780 + }, + { + "epoch": 0.44422249539028885, + "grad_norm": 3.21653413772583, + "learning_rate": 1.8223110018438846e-05, + "loss": 1.3672, + "step": 5782 + }, + { + "epoch": 0.44437615242778117, + "grad_norm": 2.8916025161743164, + "learning_rate": 1.8222495390288876e-05, + "loss": 1.3928, + "step": 5784 + }, + { + "epoch": 0.4445298094652735, + "grad_norm": 3.229156494140625, + "learning_rate": 1.822188076213891e-05, + "loss": 1.3865, + "step": 5786 + }, + { + "epoch": 0.4446834665027658, + "grad_norm": 3.3504526615142822, + "learning_rate": 1.8221266133988938e-05, + "loss": 1.4361, + "step": 5788 + }, + { + "epoch": 0.4448371235402581, + "grad_norm": 3.143098831176758, + "learning_rate": 1.8220651505838968e-05, + "loss": 1.3759, + "step": 5790 + }, + { + "epoch": 0.44499078057775043, + "grad_norm": 3.4050183296203613, + "learning_rate": 1.8220036877689e-05, + "loss": 1.4189, + "step": 5792 + }, + { + "epoch": 0.4451444376152428, + "grad_norm": 2.914482593536377, + "learning_rate": 1.821942224953903e-05, + "loss": 1.4096, + "step": 5794 + }, + { + "epoch": 0.4452980946527351, + "grad_norm": 3.0142135620117188, + "learning_rate": 1.821880762138906e-05, + "loss": 1.3218, + "step": 5796 + }, + { + "epoch": 0.44545175169022744, + "grad_norm": 3.1696937084198, + "learning_rate": 1.8218192993239093e-05, + "loss": 1.3977, + "step": 5798 + }, + { + "epoch": 0.44560540872771975, + "grad_norm": 3.157047986984253, + "learning_rate": 1.8217578365089123e-05, + "loss": 1.4031, + "step": 5800 + }, + { + "epoch": 0.44575906576521207, + "grad_norm": 3.1851911544799805, + "learning_rate": 1.8216963736939153e-05, + "loss": 1.4248, + "step": 5802 + }, + { + "epoch": 0.4459127228027044, + "grad_norm": 2.895350456237793, + "learning_rate": 1.8216349108789183e-05, + "loss": 1.3526, + "step": 5804 + }, + { + "epoch": 0.4460663798401967, + "grad_norm": 3.438236713409424, + "learning_rate": 1.8215734480639216e-05, + "loss": 1.4285, + "step": 5806 + }, + { + "epoch": 0.446220036877689, + "grad_norm": 3.439058780670166, + "learning_rate": 1.8215119852489245e-05, + "loss": 1.3343, + "step": 5808 + }, + { + "epoch": 0.44637369391518134, + "grad_norm": 3.126304864883423, + "learning_rate": 1.8214505224339275e-05, + "loss": 1.4201, + "step": 5810 + }, + { + "epoch": 0.44652735095267365, + "grad_norm": 2.642670154571533, + "learning_rate": 1.8213890596189308e-05, + "loss": 1.3266, + "step": 5812 + }, + { + "epoch": 0.44668100799016597, + "grad_norm": 3.0952022075653076, + "learning_rate": 1.8213275968039338e-05, + "loss": 1.4502, + "step": 5814 + }, + { + "epoch": 0.4468346650276583, + "grad_norm": 3.7239139080047607, + "learning_rate": 1.8212661339889367e-05, + "loss": 1.5575, + "step": 5816 + }, + { + "epoch": 0.4469883220651506, + "grad_norm": 3.190653085708618, + "learning_rate": 1.82120467117394e-05, + "loss": 1.4421, + "step": 5818 + }, + { + "epoch": 0.4471419791026429, + "grad_norm": 3.4979770183563232, + "learning_rate": 1.821143208358943e-05, + "loss": 1.3664, + "step": 5820 + }, + { + "epoch": 0.44729563614013523, + "grad_norm": 3.044233798980713, + "learning_rate": 1.821081745543946e-05, + "loss": 1.3876, + "step": 5822 + }, + { + "epoch": 0.44744929317762755, + "grad_norm": 3.0839719772338867, + "learning_rate": 1.8210202827289493e-05, + "loss": 1.294, + "step": 5824 + }, + { + "epoch": 0.44760295021511987, + "grad_norm": 3.3031883239746094, + "learning_rate": 1.8209588199139523e-05, + "loss": 1.4067, + "step": 5826 + }, + { + "epoch": 0.4477566072526122, + "grad_norm": 3.392949104309082, + "learning_rate": 1.8208973570989552e-05, + "loss": 1.4638, + "step": 5828 + }, + { + "epoch": 0.4479102642901045, + "grad_norm": 3.3899407386779785, + "learning_rate": 1.8208358942839582e-05, + "loss": 1.3866, + "step": 5830 + }, + { + "epoch": 0.4480639213275968, + "grad_norm": 2.626540422439575, + "learning_rate": 1.8207744314689615e-05, + "loss": 1.2544, + "step": 5832 + }, + { + "epoch": 0.44821757836508913, + "grad_norm": 3.6084189414978027, + "learning_rate": 1.8207129686539645e-05, + "loss": 1.3872, + "step": 5834 + }, + { + "epoch": 0.44837123540258145, + "grad_norm": 2.835602045059204, + "learning_rate": 1.8206515058389674e-05, + "loss": 1.3862, + "step": 5836 + }, + { + "epoch": 0.44852489244007376, + "grad_norm": 2.9570329189300537, + "learning_rate": 1.8205900430239707e-05, + "loss": 1.4073, + "step": 5838 + }, + { + "epoch": 0.4486785494775661, + "grad_norm": 2.910385847091675, + "learning_rate": 1.8205285802089737e-05, + "loss": 1.3464, + "step": 5840 + }, + { + "epoch": 0.4488322065150584, + "grad_norm": 3.0790252685546875, + "learning_rate": 1.8204671173939767e-05, + "loss": 1.3937, + "step": 5842 + }, + { + "epoch": 0.4489858635525507, + "grad_norm": 3.0580132007598877, + "learning_rate": 1.82040565457898e-05, + "loss": 1.2775, + "step": 5844 + }, + { + "epoch": 0.44913952059004303, + "grad_norm": 2.8806533813476562, + "learning_rate": 1.820344191763983e-05, + "loss": 1.4138, + "step": 5846 + }, + { + "epoch": 0.44929317762753535, + "grad_norm": 3.8401284217834473, + "learning_rate": 1.820282728948986e-05, + "loss": 1.3532, + "step": 5848 + }, + { + "epoch": 0.44944683466502766, + "grad_norm": 3.237717628479004, + "learning_rate": 1.8202212661339892e-05, + "loss": 1.3316, + "step": 5850 + }, + { + "epoch": 0.44960049170252, + "grad_norm": 2.958611011505127, + "learning_rate": 1.8201598033189922e-05, + "loss": 1.2768, + "step": 5852 + }, + { + "epoch": 0.4497541487400123, + "grad_norm": 3.4589743614196777, + "learning_rate": 1.8200983405039952e-05, + "loss": 1.5463, + "step": 5854 + }, + { + "epoch": 0.4499078057775046, + "grad_norm": 2.9172708988189697, + "learning_rate": 1.820036877688998e-05, + "loss": 1.4517, + "step": 5856 + }, + { + "epoch": 0.4500614628149969, + "grad_norm": 3.213388681411743, + "learning_rate": 1.8199754148740014e-05, + "loss": 1.2623, + "step": 5858 + }, + { + "epoch": 0.45021511985248924, + "grad_norm": 2.901362180709839, + "learning_rate": 1.8199139520590044e-05, + "loss": 1.2491, + "step": 5860 + }, + { + "epoch": 0.45036877688998156, + "grad_norm": 3.0259957313537598, + "learning_rate": 1.8198524892440074e-05, + "loss": 1.5275, + "step": 5862 + }, + { + "epoch": 0.4505224339274739, + "grad_norm": 3.00325870513916, + "learning_rate": 1.8197910264290107e-05, + "loss": 1.2732, + "step": 5864 + }, + { + "epoch": 0.4506760909649662, + "grad_norm": 3.229722023010254, + "learning_rate": 1.8197295636140137e-05, + "loss": 1.4167, + "step": 5866 + }, + { + "epoch": 0.4508297480024585, + "grad_norm": 3.2484633922576904, + "learning_rate": 1.8196681007990166e-05, + "loss": 1.3736, + "step": 5868 + }, + { + "epoch": 0.4509834050399508, + "grad_norm": 2.872192859649658, + "learning_rate": 1.81960663798402e-05, + "loss": 1.4017, + "step": 5870 + }, + { + "epoch": 0.45113706207744314, + "grad_norm": 3.5792553424835205, + "learning_rate": 1.819545175169023e-05, + "loss": 1.3938, + "step": 5872 + }, + { + "epoch": 0.45129071911493546, + "grad_norm": 3.1069889068603516, + "learning_rate": 1.819483712354026e-05, + "loss": 1.3554, + "step": 5874 + }, + { + "epoch": 0.4514443761524278, + "grad_norm": 3.3849682807922363, + "learning_rate": 1.8194222495390292e-05, + "loss": 1.3308, + "step": 5876 + }, + { + "epoch": 0.4515980331899201, + "grad_norm": 3.07407808303833, + "learning_rate": 1.819360786724032e-05, + "loss": 1.3343, + "step": 5878 + }, + { + "epoch": 0.4517516902274124, + "grad_norm": 3.1967318058013916, + "learning_rate": 1.819299323909035e-05, + "loss": 1.3538, + "step": 5880 + }, + { + "epoch": 0.4519053472649047, + "grad_norm": 3.133248805999756, + "learning_rate": 1.819237861094038e-05, + "loss": 1.4287, + "step": 5882 + }, + { + "epoch": 0.45205900430239704, + "grad_norm": 3.287682056427002, + "learning_rate": 1.8191763982790414e-05, + "loss": 1.3651, + "step": 5884 + }, + { + "epoch": 0.45221266133988935, + "grad_norm": 3.5104236602783203, + "learning_rate": 1.8191149354640444e-05, + "loss": 1.3389, + "step": 5886 + }, + { + "epoch": 0.45236631837738167, + "grad_norm": 3.1581168174743652, + "learning_rate": 1.8190534726490473e-05, + "loss": 1.493, + "step": 5888 + }, + { + "epoch": 0.452519975414874, + "grad_norm": 3.408336877822876, + "learning_rate": 1.8189920098340506e-05, + "loss": 1.3866, + "step": 5890 + }, + { + "epoch": 0.4526736324523663, + "grad_norm": 3.840761661529541, + "learning_rate": 1.8189305470190536e-05, + "loss": 1.5172, + "step": 5892 + }, + { + "epoch": 0.4528272894898586, + "grad_norm": 3.4090771675109863, + "learning_rate": 1.8188690842040566e-05, + "loss": 1.3251, + "step": 5894 + }, + { + "epoch": 0.45298094652735094, + "grad_norm": 2.792132616043091, + "learning_rate": 1.81880762138906e-05, + "loss": 1.4536, + "step": 5896 + }, + { + "epoch": 0.45313460356484325, + "grad_norm": 2.6732709407806396, + "learning_rate": 1.818746158574063e-05, + "loss": 1.3353, + "step": 5898 + }, + { + "epoch": 0.45328826060233557, + "grad_norm": 3.2554478645324707, + "learning_rate": 1.8186846957590658e-05, + "loss": 1.3426, + "step": 5900 + }, + { + "epoch": 0.4534419176398279, + "grad_norm": 3.0684661865234375, + "learning_rate": 1.8186232329440688e-05, + "loss": 1.2457, + "step": 5902 + }, + { + "epoch": 0.4535955746773202, + "grad_norm": 2.9400198459625244, + "learning_rate": 1.818561770129072e-05, + "loss": 1.386, + "step": 5904 + }, + { + "epoch": 0.4537492317148125, + "grad_norm": 3.3350679874420166, + "learning_rate": 1.8185003073140754e-05, + "loss": 1.4366, + "step": 5906 + }, + { + "epoch": 0.45390288875230483, + "grad_norm": 2.902355670928955, + "learning_rate": 1.818438844499078e-05, + "loss": 1.2785, + "step": 5908 + }, + { + "epoch": 0.45405654578979715, + "grad_norm": 2.7429327964782715, + "learning_rate": 1.8183773816840813e-05, + "loss": 1.2129, + "step": 5910 + }, + { + "epoch": 0.45421020282728947, + "grad_norm": 3.4028749465942383, + "learning_rate": 1.8183159188690843e-05, + "loss": 1.5344, + "step": 5912 + }, + { + "epoch": 0.4543638598647818, + "grad_norm": 3.4940731525421143, + "learning_rate": 1.8182544560540873e-05, + "loss": 1.3376, + "step": 5914 + }, + { + "epoch": 0.4545175169022741, + "grad_norm": 3.099562644958496, + "learning_rate": 1.8181929932390906e-05, + "loss": 1.3071, + "step": 5916 + }, + { + "epoch": 0.4546711739397664, + "grad_norm": 2.8082468509674072, + "learning_rate": 1.8181315304240935e-05, + "loss": 1.273, + "step": 5918 + }, + { + "epoch": 0.45482483097725873, + "grad_norm": 2.951359510421753, + "learning_rate": 1.8180700676090965e-05, + "loss": 1.2602, + "step": 5920 + }, + { + "epoch": 0.45497848801475105, + "grad_norm": 3.421891212463379, + "learning_rate": 1.8180086047940998e-05, + "loss": 1.4899, + "step": 5922 + }, + { + "epoch": 0.4551321450522434, + "grad_norm": 3.0956802368164062, + "learning_rate": 1.8179471419791028e-05, + "loss": 1.4078, + "step": 5924 + }, + { + "epoch": 0.45528580208973574, + "grad_norm": 3.26657772064209, + "learning_rate": 1.817885679164106e-05, + "loss": 1.4338, + "step": 5926 + }, + { + "epoch": 0.45543945912722805, + "grad_norm": 3.3916943073272705, + "learning_rate": 1.8178242163491087e-05, + "loss": 1.4113, + "step": 5928 + }, + { + "epoch": 0.45559311616472037, + "grad_norm": 3.281963348388672, + "learning_rate": 1.817762753534112e-05, + "loss": 1.3594, + "step": 5930 + }, + { + "epoch": 0.4557467732022127, + "grad_norm": 2.989858627319336, + "learning_rate": 1.817701290719115e-05, + "loss": 1.4444, + "step": 5932 + }, + { + "epoch": 0.455900430239705, + "grad_norm": 2.8269715309143066, + "learning_rate": 1.817639827904118e-05, + "loss": 1.3883, + "step": 5934 + }, + { + "epoch": 0.4560540872771973, + "grad_norm": 3.383490800857544, + "learning_rate": 1.8175783650891213e-05, + "loss": 1.4248, + "step": 5936 + }, + { + "epoch": 0.45620774431468963, + "grad_norm": 3.237833023071289, + "learning_rate": 1.8175169022741242e-05, + "loss": 1.3974, + "step": 5938 + }, + { + "epoch": 0.45636140135218195, + "grad_norm": 3.240793466567993, + "learning_rate": 1.8174554394591272e-05, + "loss": 1.3285, + "step": 5940 + }, + { + "epoch": 0.45651505838967427, + "grad_norm": 3.475192070007324, + "learning_rate": 1.8173939766441305e-05, + "loss": 1.384, + "step": 5942 + }, + { + "epoch": 0.4566687154271666, + "grad_norm": 3.198943853378296, + "learning_rate": 1.8173325138291335e-05, + "loss": 1.4785, + "step": 5944 + }, + { + "epoch": 0.4568223724646589, + "grad_norm": 3.021594524383545, + "learning_rate": 1.8172710510141368e-05, + "loss": 1.3124, + "step": 5946 + }, + { + "epoch": 0.4569760295021512, + "grad_norm": 3.0245521068573, + "learning_rate": 1.8172095881991398e-05, + "loss": 1.439, + "step": 5948 + }, + { + "epoch": 0.45712968653964353, + "grad_norm": 3.3448681831359863, + "learning_rate": 1.8171481253841427e-05, + "loss": 1.401, + "step": 5950 + }, + { + "epoch": 0.45728334357713585, + "grad_norm": 3.0003669261932373, + "learning_rate": 1.817086662569146e-05, + "loss": 1.4555, + "step": 5952 + }, + { + "epoch": 0.45743700061462816, + "grad_norm": 3.3074393272399902, + "learning_rate": 1.8170251997541487e-05, + "loss": 1.3984, + "step": 5954 + }, + { + "epoch": 0.4575906576521205, + "grad_norm": 3.2424259185791016, + "learning_rate": 1.816963736939152e-05, + "loss": 1.3531, + "step": 5956 + }, + { + "epoch": 0.4577443146896128, + "grad_norm": 3.414992094039917, + "learning_rate": 1.816902274124155e-05, + "loss": 1.4555, + "step": 5958 + }, + { + "epoch": 0.4578979717271051, + "grad_norm": 3.0615060329437256, + "learning_rate": 1.816840811309158e-05, + "loss": 1.5183, + "step": 5960 + }, + { + "epoch": 0.45805162876459743, + "grad_norm": 3.127685070037842, + "learning_rate": 1.8167793484941612e-05, + "loss": 1.4996, + "step": 5962 + }, + { + "epoch": 0.45820528580208975, + "grad_norm": 2.8747687339782715, + "learning_rate": 1.8167178856791642e-05, + "loss": 1.363, + "step": 5964 + }, + { + "epoch": 0.45835894283958206, + "grad_norm": 3.215275764465332, + "learning_rate": 1.8166564228641675e-05, + "loss": 1.32, + "step": 5966 + }, + { + "epoch": 0.4585125998770744, + "grad_norm": 2.8502352237701416, + "learning_rate": 1.8165949600491705e-05, + "loss": 1.3669, + "step": 5968 + }, + { + "epoch": 0.4586662569145667, + "grad_norm": 3.226792812347412, + "learning_rate": 1.8165334972341734e-05, + "loss": 1.4084, + "step": 5970 + }, + { + "epoch": 0.458819913952059, + "grad_norm": 2.946282386779785, + "learning_rate": 1.8164720344191767e-05, + "loss": 1.3186, + "step": 5972 + }, + { + "epoch": 0.4589735709895513, + "grad_norm": 2.6620097160339355, + "learning_rate": 1.8164105716041797e-05, + "loss": 1.219, + "step": 5974 + }, + { + "epoch": 0.45912722802704364, + "grad_norm": 3.052964925765991, + "learning_rate": 1.8163491087891827e-05, + "loss": 1.2451, + "step": 5976 + }, + { + "epoch": 0.45928088506453596, + "grad_norm": 2.9971115589141846, + "learning_rate": 1.816287645974186e-05, + "loss": 1.3349, + "step": 5978 + }, + { + "epoch": 0.4594345421020283, + "grad_norm": 2.765615701675415, + "learning_rate": 1.8162261831591886e-05, + "loss": 1.3993, + "step": 5980 + }, + { + "epoch": 0.4595881991395206, + "grad_norm": 3.110050916671753, + "learning_rate": 1.816164720344192e-05, + "loss": 1.3793, + "step": 5982 + }, + { + "epoch": 0.4597418561770129, + "grad_norm": 3.004164934158325, + "learning_rate": 1.816103257529195e-05, + "loss": 1.3538, + "step": 5984 + }, + { + "epoch": 0.4598955132145052, + "grad_norm": 2.9276111125946045, + "learning_rate": 1.8160417947141982e-05, + "loss": 1.4343, + "step": 5986 + }, + { + "epoch": 0.46004917025199754, + "grad_norm": 3.2313802242279053, + "learning_rate": 1.815980331899201e-05, + "loss": 1.4701, + "step": 5988 + }, + { + "epoch": 0.46020282728948986, + "grad_norm": 3.9364774227142334, + "learning_rate": 1.815918869084204e-05, + "loss": 1.4805, + "step": 5990 + }, + { + "epoch": 0.4603564843269822, + "grad_norm": 3.274184465408325, + "learning_rate": 1.8158574062692074e-05, + "loss": 1.3289, + "step": 5992 + }, + { + "epoch": 0.4605101413644745, + "grad_norm": 3.1217334270477295, + "learning_rate": 1.8157959434542104e-05, + "loss": 1.4466, + "step": 5994 + }, + { + "epoch": 0.4606637984019668, + "grad_norm": 2.6427552700042725, + "learning_rate": 1.8157344806392134e-05, + "loss": 1.3293, + "step": 5996 + }, + { + "epoch": 0.4608174554394591, + "grad_norm": 3.3423068523406982, + "learning_rate": 1.8156730178242167e-05, + "loss": 1.3001, + "step": 5998 + }, + { + "epoch": 0.46097111247695144, + "grad_norm": 3.421719551086426, + "learning_rate": 1.8156115550092193e-05, + "loss": 1.2599, + "step": 6000 + }, + { + "epoch": 0.46112476951444376, + "grad_norm": 2.9063069820404053, + "learning_rate": 1.8155500921942226e-05, + "loss": 1.2448, + "step": 6002 + }, + { + "epoch": 0.46127842655193607, + "grad_norm": 3.389843225479126, + "learning_rate": 1.815488629379226e-05, + "loss": 1.465, + "step": 6004 + }, + { + "epoch": 0.4614320835894284, + "grad_norm": 3.1748673915863037, + "learning_rate": 1.8154271665642286e-05, + "loss": 1.442, + "step": 6006 + }, + { + "epoch": 0.4615857406269207, + "grad_norm": 3.1274852752685547, + "learning_rate": 1.815365703749232e-05, + "loss": 1.3625, + "step": 6008 + }, + { + "epoch": 0.461739397664413, + "grad_norm": 2.8795769214630127, + "learning_rate": 1.815304240934235e-05, + "loss": 1.3289, + "step": 6010 + }, + { + "epoch": 0.46189305470190534, + "grad_norm": 2.991797924041748, + "learning_rate": 1.815242778119238e-05, + "loss": 1.3398, + "step": 6012 + }, + { + "epoch": 0.46204671173939765, + "grad_norm": 2.745926856994629, + "learning_rate": 1.815181315304241e-05, + "loss": 1.3411, + "step": 6014 + }, + { + "epoch": 0.46220036877688997, + "grad_norm": 3.4233834743499756, + "learning_rate": 1.815119852489244e-05, + "loss": 1.476, + "step": 6016 + }, + { + "epoch": 0.4623540258143823, + "grad_norm": 3.179094076156616, + "learning_rate": 1.8150583896742474e-05, + "loss": 1.3088, + "step": 6018 + }, + { + "epoch": 0.4625076828518746, + "grad_norm": 3.251293897628784, + "learning_rate": 1.8149969268592504e-05, + "loss": 1.3724, + "step": 6020 + }, + { + "epoch": 0.4626613398893669, + "grad_norm": 3.3762452602386475, + "learning_rate": 1.8149354640442533e-05, + "loss": 1.4215, + "step": 6022 + }, + { + "epoch": 0.46281499692685923, + "grad_norm": 2.5178112983703613, + "learning_rate": 1.8148740012292566e-05, + "loss": 1.2476, + "step": 6024 + }, + { + "epoch": 0.46296865396435155, + "grad_norm": 3.1534478664398193, + "learning_rate": 1.8148125384142593e-05, + "loss": 1.2818, + "step": 6026 + }, + { + "epoch": 0.46312231100184387, + "grad_norm": 2.7992680072784424, + "learning_rate": 1.8147510755992626e-05, + "loss": 1.3746, + "step": 6028 + }, + { + "epoch": 0.4632759680393362, + "grad_norm": 3.016869068145752, + "learning_rate": 1.8146896127842655e-05, + "loss": 1.1759, + "step": 6030 + }, + { + "epoch": 0.4634296250768285, + "grad_norm": 3.1666297912597656, + "learning_rate": 1.814628149969269e-05, + "loss": 1.2996, + "step": 6032 + }, + { + "epoch": 0.4635832821143208, + "grad_norm": 3.2005155086517334, + "learning_rate": 1.8145666871542718e-05, + "loss": 1.354, + "step": 6034 + }, + { + "epoch": 0.46373693915181313, + "grad_norm": 2.9398839473724365, + "learning_rate": 1.8145052243392748e-05, + "loss": 1.3371, + "step": 6036 + }, + { + "epoch": 0.46389059618930545, + "grad_norm": 2.675481081008911, + "learning_rate": 1.814443761524278e-05, + "loss": 1.4086, + "step": 6038 + }, + { + "epoch": 0.46404425322679776, + "grad_norm": 2.6939587593078613, + "learning_rate": 1.814382298709281e-05, + "loss": 1.4703, + "step": 6040 + }, + { + "epoch": 0.4641979102642901, + "grad_norm": 3.3314614295959473, + "learning_rate": 1.814320835894284e-05, + "loss": 1.3802, + "step": 6042 + }, + { + "epoch": 0.4643515673017824, + "grad_norm": 2.7407829761505127, + "learning_rate": 1.8142593730792873e-05, + "loss": 1.2443, + "step": 6044 + }, + { + "epoch": 0.4645052243392747, + "grad_norm": 3.5611183643341064, + "learning_rate": 1.8141979102642903e-05, + "loss": 1.3102, + "step": 6046 + }, + { + "epoch": 0.46465888137676703, + "grad_norm": 3.135925054550171, + "learning_rate": 1.8141364474492933e-05, + "loss": 1.2848, + "step": 6048 + }, + { + "epoch": 0.46481253841425935, + "grad_norm": 2.807861566543579, + "learning_rate": 1.8140749846342966e-05, + "loss": 1.498, + "step": 6050 + }, + { + "epoch": 0.46496619545175166, + "grad_norm": 2.9039602279663086, + "learning_rate": 1.8140135218192995e-05, + "loss": 1.3336, + "step": 6052 + }, + { + "epoch": 0.46511985248924403, + "grad_norm": 3.3446543216705322, + "learning_rate": 1.8139520590043025e-05, + "loss": 1.3786, + "step": 6054 + }, + { + "epoch": 0.46527350952673635, + "grad_norm": 3.125364065170288, + "learning_rate": 1.8138905961893055e-05, + "loss": 1.3462, + "step": 6056 + }, + { + "epoch": 0.46542716656422867, + "grad_norm": 3.407083511352539, + "learning_rate": 1.8138291333743088e-05, + "loss": 1.4261, + "step": 6058 + }, + { + "epoch": 0.465580823601721, + "grad_norm": 2.6646227836608887, + "learning_rate": 1.8137676705593118e-05, + "loss": 1.2944, + "step": 6060 + }, + { + "epoch": 0.4657344806392133, + "grad_norm": 3.1010098457336426, + "learning_rate": 1.8137062077443147e-05, + "loss": 1.4409, + "step": 6062 + }, + { + "epoch": 0.4658881376767056, + "grad_norm": 2.994729518890381, + "learning_rate": 1.813644744929318e-05, + "loss": 1.2887, + "step": 6064 + }, + { + "epoch": 0.46604179471419793, + "grad_norm": 2.98126482963562, + "learning_rate": 1.813583282114321e-05, + "loss": 1.4378, + "step": 6066 + }, + { + "epoch": 0.46619545175169025, + "grad_norm": 3.0406136512756348, + "learning_rate": 1.813521819299324e-05, + "loss": 1.4089, + "step": 6068 + }, + { + "epoch": 0.46634910878918256, + "grad_norm": 2.912886619567871, + "learning_rate": 1.8134603564843273e-05, + "loss": 1.297, + "step": 6070 + }, + { + "epoch": 0.4665027658266749, + "grad_norm": 3.1055288314819336, + "learning_rate": 1.8133988936693302e-05, + "loss": 1.3387, + "step": 6072 + }, + { + "epoch": 0.4666564228641672, + "grad_norm": 2.698050022125244, + "learning_rate": 1.8133374308543332e-05, + "loss": 1.3589, + "step": 6074 + }, + { + "epoch": 0.4668100799016595, + "grad_norm": 3.304744243621826, + "learning_rate": 1.8132759680393365e-05, + "loss": 1.3344, + "step": 6076 + }, + { + "epoch": 0.46696373693915183, + "grad_norm": 3.2374093532562256, + "learning_rate": 1.8132145052243395e-05, + "loss": 1.3627, + "step": 6078 + }, + { + "epoch": 0.46711739397664415, + "grad_norm": 3.138913631439209, + "learning_rate": 1.8131530424093425e-05, + "loss": 1.3703, + "step": 6080 + }, + { + "epoch": 0.46727105101413646, + "grad_norm": 3.517970085144043, + "learning_rate": 1.8130915795943454e-05, + "loss": 1.3284, + "step": 6082 + }, + { + "epoch": 0.4674247080516288, + "grad_norm": 2.9830198287963867, + "learning_rate": 1.8130301167793487e-05, + "loss": 1.3917, + "step": 6084 + }, + { + "epoch": 0.4675783650891211, + "grad_norm": 2.6390559673309326, + "learning_rate": 1.8129686539643517e-05, + "loss": 1.3801, + "step": 6086 + }, + { + "epoch": 0.4677320221266134, + "grad_norm": 3.0859718322753906, + "learning_rate": 1.8129071911493547e-05, + "loss": 1.4656, + "step": 6088 + }, + { + "epoch": 0.46788567916410573, + "grad_norm": 2.984755516052246, + "learning_rate": 1.812845728334358e-05, + "loss": 1.4782, + "step": 6090 + }, + { + "epoch": 0.46803933620159804, + "grad_norm": 3.045100450515747, + "learning_rate": 1.812784265519361e-05, + "loss": 1.4161, + "step": 6092 + }, + { + "epoch": 0.46819299323909036, + "grad_norm": 3.148865222930908, + "learning_rate": 1.812722802704364e-05, + "loss": 1.3084, + "step": 6094 + }, + { + "epoch": 0.4683466502765827, + "grad_norm": 3.177959680557251, + "learning_rate": 1.8126613398893672e-05, + "loss": 1.3234, + "step": 6096 + }, + { + "epoch": 0.468500307314075, + "grad_norm": 2.8266592025756836, + "learning_rate": 1.8125998770743702e-05, + "loss": 1.4313, + "step": 6098 + }, + { + "epoch": 0.4686539643515673, + "grad_norm": 3.3804705142974854, + "learning_rate": 1.812538414259373e-05, + "loss": 1.3802, + "step": 6100 + }, + { + "epoch": 0.4688076213890596, + "grad_norm": 3.3395400047302246, + "learning_rate": 1.8124769514443765e-05, + "loss": 1.2999, + "step": 6102 + }, + { + "epoch": 0.46896127842655194, + "grad_norm": 2.902346134185791, + "learning_rate": 1.8124154886293794e-05, + "loss": 1.3411, + "step": 6104 + }, + { + "epoch": 0.46911493546404426, + "grad_norm": 3.264467477798462, + "learning_rate": 1.8123540258143824e-05, + "loss": 1.4063, + "step": 6106 + }, + { + "epoch": 0.4692685925015366, + "grad_norm": 2.926862955093384, + "learning_rate": 1.8122925629993854e-05, + "loss": 1.2848, + "step": 6108 + }, + { + "epoch": 0.4694222495390289, + "grad_norm": 3.159416913986206, + "learning_rate": 1.8122311001843887e-05, + "loss": 1.526, + "step": 6110 + }, + { + "epoch": 0.4695759065765212, + "grad_norm": 2.464360237121582, + "learning_rate": 1.8121696373693916e-05, + "loss": 1.2014, + "step": 6112 + }, + { + "epoch": 0.4697295636140135, + "grad_norm": 3.0088672637939453, + "learning_rate": 1.8121081745543946e-05, + "loss": 1.4058, + "step": 6114 + }, + { + "epoch": 0.46988322065150584, + "grad_norm": 2.786822557449341, + "learning_rate": 1.812046711739398e-05, + "loss": 1.2714, + "step": 6116 + }, + { + "epoch": 0.47003687768899816, + "grad_norm": 2.730426549911499, + "learning_rate": 1.811985248924401e-05, + "loss": 1.1898, + "step": 6118 + }, + { + "epoch": 0.47019053472649047, + "grad_norm": 2.9403066635131836, + "learning_rate": 1.811923786109404e-05, + "loss": 1.4679, + "step": 6120 + }, + { + "epoch": 0.4703441917639828, + "grad_norm": 2.8969335556030273, + "learning_rate": 1.811862323294407e-05, + "loss": 1.3628, + "step": 6122 + }, + { + "epoch": 0.4704978488014751, + "grad_norm": 3.1321475505828857, + "learning_rate": 1.81180086047941e-05, + "loss": 1.2599, + "step": 6124 + }, + { + "epoch": 0.4706515058389674, + "grad_norm": 2.7310781478881836, + "learning_rate": 1.811739397664413e-05, + "loss": 1.343, + "step": 6126 + }, + { + "epoch": 0.47080516287645974, + "grad_norm": 3.2368597984313965, + "learning_rate": 1.811677934849416e-05, + "loss": 1.452, + "step": 6128 + }, + { + "epoch": 0.47095881991395205, + "grad_norm": 3.1757235527038574, + "learning_rate": 1.8116164720344194e-05, + "loss": 1.2629, + "step": 6130 + }, + { + "epoch": 0.47111247695144437, + "grad_norm": 3.524467706680298, + "learning_rate": 1.8115550092194223e-05, + "loss": 1.3946, + "step": 6132 + }, + { + "epoch": 0.4712661339889367, + "grad_norm": 2.9439032077789307, + "learning_rate": 1.8114935464044253e-05, + "loss": 1.3863, + "step": 6134 + }, + { + "epoch": 0.471419791026429, + "grad_norm": 3.1204442977905273, + "learning_rate": 1.8114320835894286e-05, + "loss": 1.3305, + "step": 6136 + }, + { + "epoch": 0.4715734480639213, + "grad_norm": 2.8491554260253906, + "learning_rate": 1.8113706207744316e-05, + "loss": 1.3718, + "step": 6138 + }, + { + "epoch": 0.47172710510141364, + "grad_norm": 3.3984553813934326, + "learning_rate": 1.8113091579594346e-05, + "loss": 1.3786, + "step": 6140 + }, + { + "epoch": 0.47188076213890595, + "grad_norm": 2.843414545059204, + "learning_rate": 1.811247695144438e-05, + "loss": 1.4403, + "step": 6142 + }, + { + "epoch": 0.47203441917639827, + "grad_norm": 3.248155355453491, + "learning_rate": 1.811186232329441e-05, + "loss": 1.3445, + "step": 6144 + }, + { + "epoch": 0.4721880762138906, + "grad_norm": 2.920718193054199, + "learning_rate": 1.8111247695144438e-05, + "loss": 1.4953, + "step": 6146 + }, + { + "epoch": 0.4723417332513829, + "grad_norm": 2.642434597015381, + "learning_rate": 1.811063306699447e-05, + "loss": 1.4154, + "step": 6148 + }, + { + "epoch": 0.4724953902888752, + "grad_norm": 2.8133106231689453, + "learning_rate": 1.81100184388445e-05, + "loss": 1.2154, + "step": 6150 + }, + { + "epoch": 0.47264904732636753, + "grad_norm": 2.921673059463501, + "learning_rate": 1.810940381069453e-05, + "loss": 1.4372, + "step": 6152 + }, + { + "epoch": 0.47280270436385985, + "grad_norm": 2.9900777339935303, + "learning_rate": 1.810878918254456e-05, + "loss": 1.3489, + "step": 6154 + }, + { + "epoch": 0.47295636140135217, + "grad_norm": 3.1354458332061768, + "learning_rate": 1.8108174554394593e-05, + "loss": 1.3848, + "step": 6156 + }, + { + "epoch": 0.4731100184388445, + "grad_norm": 2.98395037651062, + "learning_rate": 1.8107559926244623e-05, + "loss": 1.1782, + "step": 6158 + }, + { + "epoch": 0.4732636754763368, + "grad_norm": 3.0558385848999023, + "learning_rate": 1.8106945298094653e-05, + "loss": 1.4362, + "step": 6160 + }, + { + "epoch": 0.4734173325138291, + "grad_norm": 2.871683120727539, + "learning_rate": 1.8106330669944686e-05, + "loss": 1.361, + "step": 6162 + }, + { + "epoch": 0.47357098955132143, + "grad_norm": 3.1952767372131348, + "learning_rate": 1.8105716041794715e-05, + "loss": 1.371, + "step": 6164 + }, + { + "epoch": 0.47372464658881375, + "grad_norm": 3.256314754486084, + "learning_rate": 1.8105101413644745e-05, + "loss": 1.5385, + "step": 6166 + }, + { + "epoch": 0.47387830362630606, + "grad_norm": 3.1399924755096436, + "learning_rate": 1.8104486785494778e-05, + "loss": 1.4099, + "step": 6168 + }, + { + "epoch": 0.4740319606637984, + "grad_norm": 2.872492551803589, + "learning_rate": 1.8103872157344808e-05, + "loss": 1.3904, + "step": 6170 + }, + { + "epoch": 0.4741856177012907, + "grad_norm": 3.3130695819854736, + "learning_rate": 1.8103257529194837e-05, + "loss": 1.4766, + "step": 6172 + }, + { + "epoch": 0.474339274738783, + "grad_norm": 3.3041231632232666, + "learning_rate": 1.810264290104487e-05, + "loss": 1.3754, + "step": 6174 + }, + { + "epoch": 0.47449293177627533, + "grad_norm": 2.94787859916687, + "learning_rate": 1.81020282728949e-05, + "loss": 1.407, + "step": 6176 + }, + { + "epoch": 0.47464658881376764, + "grad_norm": 2.9895570278167725, + "learning_rate": 1.8101413644744933e-05, + "loss": 1.4253, + "step": 6178 + }, + { + "epoch": 0.47480024585125996, + "grad_norm": 3.096012830734253, + "learning_rate": 1.810079901659496e-05, + "loss": 1.4598, + "step": 6180 + }, + { + "epoch": 0.4749539028887523, + "grad_norm": 3.4266176223754883, + "learning_rate": 1.8100184388444993e-05, + "loss": 1.3014, + "step": 6182 + }, + { + "epoch": 0.47510755992624465, + "grad_norm": 3.3502449989318848, + "learning_rate": 1.8099569760295022e-05, + "loss": 1.403, + "step": 6184 + }, + { + "epoch": 0.47526121696373697, + "grad_norm": 2.791080951690674, + "learning_rate": 1.8098955132145052e-05, + "loss": 1.3161, + "step": 6186 + }, + { + "epoch": 0.4754148740012293, + "grad_norm": 3.389315366744995, + "learning_rate": 1.8098340503995085e-05, + "loss": 1.3105, + "step": 6188 + }, + { + "epoch": 0.4755685310387216, + "grad_norm": 3.1894092559814453, + "learning_rate": 1.8097725875845115e-05, + "loss": 1.334, + "step": 6190 + }, + { + "epoch": 0.4757221880762139, + "grad_norm": 3.219374418258667, + "learning_rate": 1.8097111247695144e-05, + "loss": 1.2583, + "step": 6192 + }, + { + "epoch": 0.47587584511370623, + "grad_norm": 2.9629125595092773, + "learning_rate": 1.8096496619545178e-05, + "loss": 1.472, + "step": 6194 + }, + { + "epoch": 0.47602950215119855, + "grad_norm": 2.8309929370880127, + "learning_rate": 1.8095881991395207e-05, + "loss": 1.4128, + "step": 6196 + }, + { + "epoch": 0.47618315918869086, + "grad_norm": 3.1029868125915527, + "learning_rate": 1.809526736324524e-05, + "loss": 1.4316, + "step": 6198 + }, + { + "epoch": 0.4763368162261832, + "grad_norm": 3.910332679748535, + "learning_rate": 1.809465273509527e-05, + "loss": 1.4264, + "step": 6200 + }, + { + "epoch": 0.4764904732636755, + "grad_norm": 3.1213109493255615, + "learning_rate": 1.80940381069453e-05, + "loss": 1.3166, + "step": 6202 + }, + { + "epoch": 0.4766441303011678, + "grad_norm": 3.3485963344573975, + "learning_rate": 1.8093423478795333e-05, + "loss": 1.4312, + "step": 6204 + }, + { + "epoch": 0.47679778733866013, + "grad_norm": 2.8326988220214844, + "learning_rate": 1.809280885064536e-05, + "loss": 1.3337, + "step": 6206 + }, + { + "epoch": 0.47695144437615244, + "grad_norm": 3.0213944911956787, + "learning_rate": 1.8092194222495392e-05, + "loss": 1.3796, + "step": 6208 + }, + { + "epoch": 0.47710510141364476, + "grad_norm": 3.090485095977783, + "learning_rate": 1.8091579594345422e-05, + "loss": 1.3794, + "step": 6210 + }, + { + "epoch": 0.4772587584511371, + "grad_norm": 2.931671619415283, + "learning_rate": 1.809096496619545e-05, + "loss": 1.4677, + "step": 6212 + }, + { + "epoch": 0.4774124154886294, + "grad_norm": 3.267240524291992, + "learning_rate": 1.8090350338045485e-05, + "loss": 1.3751, + "step": 6214 + }, + { + "epoch": 0.4775660725261217, + "grad_norm": 2.553067207336426, + "learning_rate": 1.8089735709895514e-05, + "loss": 1.3013, + "step": 6216 + }, + { + "epoch": 0.477719729563614, + "grad_norm": 3.1998229026794434, + "learning_rate": 1.8089121081745547e-05, + "loss": 1.3474, + "step": 6218 + }, + { + "epoch": 0.47787338660110634, + "grad_norm": 11.610280990600586, + "learning_rate": 1.8088506453595577e-05, + "loss": 1.2955, + "step": 6220 + }, + { + "epoch": 0.47802704363859866, + "grad_norm": 2.8859567642211914, + "learning_rate": 1.8087891825445607e-05, + "loss": 1.3495, + "step": 6222 + }, + { + "epoch": 0.478180700676091, + "grad_norm": 2.9452872276306152, + "learning_rate": 1.808727719729564e-05, + "loss": 1.276, + "step": 6224 + }, + { + "epoch": 0.4783343577135833, + "grad_norm": 3.28800892829895, + "learning_rate": 1.8086662569145666e-05, + "loss": 1.4918, + "step": 6226 + }, + { + "epoch": 0.4784880147510756, + "grad_norm": 3.066276788711548, + "learning_rate": 1.80860479409957e-05, + "loss": 1.3771, + "step": 6228 + }, + { + "epoch": 0.4786416717885679, + "grad_norm": 2.8848133087158203, + "learning_rate": 1.8085433312845732e-05, + "loss": 1.3354, + "step": 6230 + }, + { + "epoch": 0.47879532882606024, + "grad_norm": 2.9447460174560547, + "learning_rate": 1.808481868469576e-05, + "loss": 1.4236, + "step": 6232 + }, + { + "epoch": 0.47894898586355256, + "grad_norm": 3.061072826385498, + "learning_rate": 1.808420405654579e-05, + "loss": 1.2652, + "step": 6234 + }, + { + "epoch": 0.4791026429010449, + "grad_norm": 3.3094377517700195, + "learning_rate": 1.808358942839582e-05, + "loss": 1.3274, + "step": 6236 + }, + { + "epoch": 0.4792562999385372, + "grad_norm": 2.868401050567627, + "learning_rate": 1.8082974800245854e-05, + "loss": 1.361, + "step": 6238 + }, + { + "epoch": 0.4794099569760295, + "grad_norm": 2.7703821659088135, + "learning_rate": 1.8082360172095884e-05, + "loss": 1.3177, + "step": 6240 + }, + { + "epoch": 0.4795636140135218, + "grad_norm": 2.5703728199005127, + "learning_rate": 1.8081745543945914e-05, + "loss": 1.3978, + "step": 6242 + }, + { + "epoch": 0.47971727105101414, + "grad_norm": 3.038760185241699, + "learning_rate": 1.8081130915795947e-05, + "loss": 1.3651, + "step": 6244 + }, + { + "epoch": 0.47987092808850645, + "grad_norm": 3.6976478099823, + "learning_rate": 1.8080516287645976e-05, + "loss": 1.4703, + "step": 6246 + }, + { + "epoch": 0.48002458512599877, + "grad_norm": 3.0694258213043213, + "learning_rate": 1.8079901659496006e-05, + "loss": 1.3212, + "step": 6248 + }, + { + "epoch": 0.4801782421634911, + "grad_norm": 3.3709535598754883, + "learning_rate": 1.807928703134604e-05, + "loss": 1.36, + "step": 6250 + }, + { + "epoch": 0.4803318992009834, + "grad_norm": 3.294551134109497, + "learning_rate": 1.8078672403196065e-05, + "loss": 1.5437, + "step": 6252 + }, + { + "epoch": 0.4804855562384757, + "grad_norm": 3.8295395374298096, + "learning_rate": 1.80780577750461e-05, + "loss": 1.2844, + "step": 6254 + }, + { + "epoch": 0.48063921327596804, + "grad_norm": 2.620631217956543, + "learning_rate": 1.8077443146896128e-05, + "loss": 1.2676, + "step": 6256 + }, + { + "epoch": 0.48079287031346035, + "grad_norm": 2.7182457447052, + "learning_rate": 1.8076828518746158e-05, + "loss": 1.4544, + "step": 6258 + }, + { + "epoch": 0.48094652735095267, + "grad_norm": 3.396521806716919, + "learning_rate": 1.807621389059619e-05, + "loss": 1.3121, + "step": 6260 + }, + { + "epoch": 0.481100184388445, + "grad_norm": 2.63437557220459, + "learning_rate": 1.807559926244622e-05, + "loss": 1.234, + "step": 6262 + }, + { + "epoch": 0.4812538414259373, + "grad_norm": 2.776506185531616, + "learning_rate": 1.8074984634296254e-05, + "loss": 1.4487, + "step": 6264 + }, + { + "epoch": 0.4814074984634296, + "grad_norm": 3.227975845336914, + "learning_rate": 1.8074370006146283e-05, + "loss": 1.2383, + "step": 6266 + }, + { + "epoch": 0.48156115550092193, + "grad_norm": 2.9529471397399902, + "learning_rate": 1.8073755377996313e-05, + "loss": 1.3515, + "step": 6268 + }, + { + "epoch": 0.48171481253841425, + "grad_norm": 3.3522536754608154, + "learning_rate": 1.8073140749846346e-05, + "loss": 1.3238, + "step": 6270 + }, + { + "epoch": 0.48186846957590657, + "grad_norm": 3.3435351848602295, + "learning_rate": 1.8072526121696376e-05, + "loss": 1.4359, + "step": 6272 + }, + { + "epoch": 0.4820221266133989, + "grad_norm": 2.8637821674346924, + "learning_rate": 1.8071911493546406e-05, + "loss": 1.2188, + "step": 6274 + }, + { + "epoch": 0.4821757836508912, + "grad_norm": 2.939358949661255, + "learning_rate": 1.807129686539644e-05, + "loss": 1.2918, + "step": 6276 + }, + { + "epoch": 0.4823294406883835, + "grad_norm": 3.1831789016723633, + "learning_rate": 1.8070682237246465e-05, + "loss": 1.4176, + "step": 6278 + }, + { + "epoch": 0.48248309772587583, + "grad_norm": 3.3042044639587402, + "learning_rate": 1.8070067609096498e-05, + "loss": 1.3979, + "step": 6280 + }, + { + "epoch": 0.48263675476336815, + "grad_norm": 3.7356269359588623, + "learning_rate": 1.8069452980946528e-05, + "loss": 1.3986, + "step": 6282 + }, + { + "epoch": 0.48279041180086046, + "grad_norm": 3.174906015396118, + "learning_rate": 1.806883835279656e-05, + "loss": 1.2884, + "step": 6284 + }, + { + "epoch": 0.4829440688383528, + "grad_norm": 2.9945216178894043, + "learning_rate": 1.806822372464659e-05, + "loss": 1.4439, + "step": 6286 + }, + { + "epoch": 0.4830977258758451, + "grad_norm": 3.113851308822632, + "learning_rate": 1.806760909649662e-05, + "loss": 1.3341, + "step": 6288 + }, + { + "epoch": 0.4832513829133374, + "grad_norm": 3.578928232192993, + "learning_rate": 1.8066994468346653e-05, + "loss": 1.4566, + "step": 6290 + }, + { + "epoch": 0.48340503995082973, + "grad_norm": 3.227860450744629, + "learning_rate": 1.8066379840196683e-05, + "loss": 1.3845, + "step": 6292 + }, + { + "epoch": 0.48355869698832205, + "grad_norm": 3.249185562133789, + "learning_rate": 1.8065765212046713e-05, + "loss": 1.3953, + "step": 6294 + }, + { + "epoch": 0.48371235402581436, + "grad_norm": 2.9868662357330322, + "learning_rate": 1.8065150583896746e-05, + "loss": 1.2176, + "step": 6296 + }, + { + "epoch": 0.4838660110633067, + "grad_norm": 2.748054265975952, + "learning_rate": 1.8064535955746775e-05, + "loss": 1.251, + "step": 6298 + }, + { + "epoch": 0.484019668100799, + "grad_norm": 3.2431583404541016, + "learning_rate": 1.8063921327596805e-05, + "loss": 1.3409, + "step": 6300 + }, + { + "epoch": 0.4841733251382913, + "grad_norm": 3.484886407852173, + "learning_rate": 1.8063306699446838e-05, + "loss": 1.3708, + "step": 6302 + }, + { + "epoch": 0.4843269821757836, + "grad_norm": 3.2029964923858643, + "learning_rate": 1.8062692071296868e-05, + "loss": 1.4806, + "step": 6304 + }, + { + "epoch": 0.48448063921327594, + "grad_norm": 2.78397274017334, + "learning_rate": 1.8062077443146897e-05, + "loss": 1.2184, + "step": 6306 + }, + { + "epoch": 0.48463429625076826, + "grad_norm": 2.9794437885284424, + "learning_rate": 1.8061462814996927e-05, + "loss": 1.4035, + "step": 6308 + }, + { + "epoch": 0.4847879532882606, + "grad_norm": 2.6800053119659424, + "learning_rate": 1.806084818684696e-05, + "loss": 1.3018, + "step": 6310 + }, + { + "epoch": 0.4849416103257529, + "grad_norm": 2.774409055709839, + "learning_rate": 1.806023355869699e-05, + "loss": 1.4247, + "step": 6312 + }, + { + "epoch": 0.48509526736324526, + "grad_norm": 3.3923118114471436, + "learning_rate": 1.805961893054702e-05, + "loss": 1.1499, + "step": 6314 + }, + { + "epoch": 0.4852489244007376, + "grad_norm": 2.7925145626068115, + "learning_rate": 1.8059004302397053e-05, + "loss": 1.2022, + "step": 6316 + }, + { + "epoch": 0.4854025814382299, + "grad_norm": 3.3250386714935303, + "learning_rate": 1.8058389674247082e-05, + "loss": 1.3755, + "step": 6318 + }, + { + "epoch": 0.4855562384757222, + "grad_norm": 3.531944990158081, + "learning_rate": 1.8057775046097112e-05, + "loss": 1.4049, + "step": 6320 + }, + { + "epoch": 0.48570989551321453, + "grad_norm": 3.0159146785736084, + "learning_rate": 1.8057160417947145e-05, + "loss": 1.4002, + "step": 6322 + }, + { + "epoch": 0.48586355255070685, + "grad_norm": 3.2475900650024414, + "learning_rate": 1.8056545789797175e-05, + "loss": 1.3624, + "step": 6324 + }, + { + "epoch": 0.48601720958819916, + "grad_norm": 3.2522149085998535, + "learning_rate": 1.8055931161647204e-05, + "loss": 1.3565, + "step": 6326 + }, + { + "epoch": 0.4861708666256915, + "grad_norm": 3.1061174869537354, + "learning_rate": 1.8055316533497237e-05, + "loss": 1.3549, + "step": 6328 + }, + { + "epoch": 0.4863245236631838, + "grad_norm": 3.162954330444336, + "learning_rate": 1.8054701905347267e-05, + "loss": 1.4947, + "step": 6330 + }, + { + "epoch": 0.4864781807006761, + "grad_norm": 3.179232597351074, + "learning_rate": 1.8054087277197297e-05, + "loss": 1.32, + "step": 6332 + }, + { + "epoch": 0.4866318377381684, + "grad_norm": 2.8867878913879395, + "learning_rate": 1.8053472649047327e-05, + "loss": 1.2925, + "step": 6334 + }, + { + "epoch": 0.48678549477566074, + "grad_norm": 3.4606575965881348, + "learning_rate": 1.805285802089736e-05, + "loss": 1.4834, + "step": 6336 + }, + { + "epoch": 0.48693915181315306, + "grad_norm": 3.118943214416504, + "learning_rate": 1.805224339274739e-05, + "loss": 1.227, + "step": 6338 + }, + { + "epoch": 0.4870928088506454, + "grad_norm": 3.1794466972351074, + "learning_rate": 1.805162876459742e-05, + "loss": 1.4092, + "step": 6340 + }, + { + "epoch": 0.4872464658881377, + "grad_norm": 2.719515562057495, + "learning_rate": 1.8051014136447452e-05, + "loss": 1.3534, + "step": 6342 + }, + { + "epoch": 0.48740012292563, + "grad_norm": 3.8740384578704834, + "learning_rate": 1.8050399508297482e-05, + "loss": 1.4607, + "step": 6344 + }, + { + "epoch": 0.4875537799631223, + "grad_norm": 3.121920585632324, + "learning_rate": 1.804978488014751e-05, + "loss": 1.365, + "step": 6346 + }, + { + "epoch": 0.48770743700061464, + "grad_norm": 2.8695993423461914, + "learning_rate": 1.8049170251997544e-05, + "loss": 1.2682, + "step": 6348 + }, + { + "epoch": 0.48786109403810696, + "grad_norm": 2.9940059185028076, + "learning_rate": 1.8048555623847574e-05, + "loss": 1.3376, + "step": 6350 + }, + { + "epoch": 0.4880147510755993, + "grad_norm": 3.0862960815429688, + "learning_rate": 1.8047940995697604e-05, + "loss": 1.3186, + "step": 6352 + }, + { + "epoch": 0.4881684081130916, + "grad_norm": 3.2420318126678467, + "learning_rate": 1.8047326367547634e-05, + "loss": 1.3098, + "step": 6354 + }, + { + "epoch": 0.4883220651505839, + "grad_norm": 2.9130239486694336, + "learning_rate": 1.8046711739397667e-05, + "loss": 1.3989, + "step": 6356 + }, + { + "epoch": 0.4884757221880762, + "grad_norm": 3.1051688194274902, + "learning_rate": 1.8046097111247696e-05, + "loss": 1.3173, + "step": 6358 + }, + { + "epoch": 0.48862937922556854, + "grad_norm": 2.9414143562316895, + "learning_rate": 1.8045482483097726e-05, + "loss": 1.3141, + "step": 6360 + }, + { + "epoch": 0.48878303626306085, + "grad_norm": 2.926856279373169, + "learning_rate": 1.804486785494776e-05, + "loss": 1.3659, + "step": 6362 + }, + { + "epoch": 0.48893669330055317, + "grad_norm": 3.0181140899658203, + "learning_rate": 1.804425322679779e-05, + "loss": 1.4083, + "step": 6364 + }, + { + "epoch": 0.4890903503380455, + "grad_norm": 2.822953462600708, + "learning_rate": 1.804363859864782e-05, + "loss": 1.3084, + "step": 6366 + }, + { + "epoch": 0.4892440073755378, + "grad_norm": 3.2265994548797607, + "learning_rate": 1.804302397049785e-05, + "loss": 1.4421, + "step": 6368 + }, + { + "epoch": 0.4893976644130301, + "grad_norm": 2.932751417160034, + "learning_rate": 1.804240934234788e-05, + "loss": 1.3666, + "step": 6370 + }, + { + "epoch": 0.48955132145052244, + "grad_norm": 3.190852403640747, + "learning_rate": 1.804179471419791e-05, + "loss": 1.4584, + "step": 6372 + }, + { + "epoch": 0.48970497848801475, + "grad_norm": 2.9671835899353027, + "learning_rate": 1.8041180086047944e-05, + "loss": 1.4951, + "step": 6374 + }, + { + "epoch": 0.48985863552550707, + "grad_norm": 3.1517322063446045, + "learning_rate": 1.8040565457897974e-05, + "loss": 1.4506, + "step": 6376 + }, + { + "epoch": 0.4900122925629994, + "grad_norm": 3.0485892295837402, + "learning_rate": 1.8039950829748003e-05, + "loss": 1.3996, + "step": 6378 + }, + { + "epoch": 0.4901659496004917, + "grad_norm": 2.753948450088501, + "learning_rate": 1.8039336201598033e-05, + "loss": 1.4296, + "step": 6380 + }, + { + "epoch": 0.490319606637984, + "grad_norm": 2.8919365406036377, + "learning_rate": 1.8038721573448066e-05, + "loss": 1.3162, + "step": 6382 + }, + { + "epoch": 0.49047326367547633, + "grad_norm": 2.782630205154419, + "learning_rate": 1.8038106945298096e-05, + "loss": 1.3588, + "step": 6384 + }, + { + "epoch": 0.49062692071296865, + "grad_norm": 2.8316001892089844, + "learning_rate": 1.8037492317148125e-05, + "loss": 1.4375, + "step": 6386 + }, + { + "epoch": 0.49078057775046097, + "grad_norm": 2.9996912479400635, + "learning_rate": 1.803687768899816e-05, + "loss": 1.3141, + "step": 6388 + }, + { + "epoch": 0.4909342347879533, + "grad_norm": 2.8670809268951416, + "learning_rate": 1.8036263060848188e-05, + "loss": 1.424, + "step": 6390 + }, + { + "epoch": 0.4910878918254456, + "grad_norm": 3.0059220790863037, + "learning_rate": 1.8035648432698218e-05, + "loss": 1.3933, + "step": 6392 + }, + { + "epoch": 0.4912415488629379, + "grad_norm": 3.1974833011627197, + "learning_rate": 1.803503380454825e-05, + "loss": 1.3125, + "step": 6394 + }, + { + "epoch": 0.49139520590043023, + "grad_norm": 2.683246612548828, + "learning_rate": 1.803441917639828e-05, + "loss": 1.258, + "step": 6396 + }, + { + "epoch": 0.49154886293792255, + "grad_norm": 3.330538034439087, + "learning_rate": 1.803380454824831e-05, + "loss": 1.4212, + "step": 6398 + }, + { + "epoch": 0.49170251997541486, + "grad_norm": 3.116828680038452, + "learning_rate": 1.8033189920098343e-05, + "loss": 1.3991, + "step": 6400 + }, + { + "epoch": 0.4918561770129072, + "grad_norm": 3.45032000541687, + "learning_rate": 1.8032575291948373e-05, + "loss": 1.3591, + "step": 6402 + }, + { + "epoch": 0.4920098340503995, + "grad_norm": 3.455242872238159, + "learning_rate": 1.8031960663798403e-05, + "loss": 1.3312, + "step": 6404 + }, + { + "epoch": 0.4921634910878918, + "grad_norm": 2.821232557296753, + "learning_rate": 1.8031346035648432e-05, + "loss": 1.2594, + "step": 6406 + }, + { + "epoch": 0.49231714812538413, + "grad_norm": 3.163733720779419, + "learning_rate": 1.8030731407498465e-05, + "loss": 1.3575, + "step": 6408 + }, + { + "epoch": 0.49247080516287645, + "grad_norm": 3.2715537548065186, + "learning_rate": 1.8030116779348495e-05, + "loss": 1.4909, + "step": 6410 + }, + { + "epoch": 0.49262446220036876, + "grad_norm": 3.1828911304473877, + "learning_rate": 1.8029502151198525e-05, + "loss": 1.3965, + "step": 6412 + }, + { + "epoch": 0.4927781192378611, + "grad_norm": 3.2268998622894287, + "learning_rate": 1.8028887523048558e-05, + "loss": 1.3785, + "step": 6414 + }, + { + "epoch": 0.4929317762753534, + "grad_norm": 3.106019973754883, + "learning_rate": 1.8028272894898588e-05, + "loss": 1.2841, + "step": 6416 + }, + { + "epoch": 0.4930854333128457, + "grad_norm": 3.220978260040283, + "learning_rate": 1.8027658266748617e-05, + "loss": 1.5033, + "step": 6418 + }, + { + "epoch": 0.493239090350338, + "grad_norm": 3.288722276687622, + "learning_rate": 1.802704363859865e-05, + "loss": 1.3628, + "step": 6420 + }, + { + "epoch": 0.49339274738783034, + "grad_norm": 3.134910821914673, + "learning_rate": 1.802642901044868e-05, + "loss": 1.3892, + "step": 6422 + }, + { + "epoch": 0.49354640442532266, + "grad_norm": 2.868943929672241, + "learning_rate": 1.802581438229871e-05, + "loss": 1.4807, + "step": 6424 + }, + { + "epoch": 0.493700061462815, + "grad_norm": 3.257479190826416, + "learning_rate": 1.8025199754148743e-05, + "loss": 1.3037, + "step": 6426 + }, + { + "epoch": 0.4938537185003073, + "grad_norm": 2.6661643981933594, + "learning_rate": 1.8024585125998772e-05, + "loss": 1.3603, + "step": 6428 + }, + { + "epoch": 0.4940073755377996, + "grad_norm": 3.075345039367676, + "learning_rate": 1.8023970497848806e-05, + "loss": 1.3357, + "step": 6430 + }, + { + "epoch": 0.4941610325752919, + "grad_norm": 3.3657965660095215, + "learning_rate": 1.8023355869698832e-05, + "loss": 1.3358, + "step": 6432 + }, + { + "epoch": 0.49431468961278424, + "grad_norm": 3.102015495300293, + "learning_rate": 1.8022741241548865e-05, + "loss": 1.4151, + "step": 6434 + }, + { + "epoch": 0.49446834665027656, + "grad_norm": 2.722320795059204, + "learning_rate": 1.8022126613398895e-05, + "loss": 1.3975, + "step": 6436 + }, + { + "epoch": 0.4946220036877689, + "grad_norm": 3.373051166534424, + "learning_rate": 1.8021511985248924e-05, + "loss": 1.2497, + "step": 6438 + }, + { + "epoch": 0.4947756607252612, + "grad_norm": 2.7113919258117676, + "learning_rate": 1.8020897357098957e-05, + "loss": 1.3809, + "step": 6440 + }, + { + "epoch": 0.4949293177627535, + "grad_norm": 3.1061747074127197, + "learning_rate": 1.8020282728948987e-05, + "loss": 1.4414, + "step": 6442 + }, + { + "epoch": 0.4950829748002459, + "grad_norm": 3.091012477874756, + "learning_rate": 1.8019668100799017e-05, + "loss": 1.3563, + "step": 6444 + }, + { + "epoch": 0.4952366318377382, + "grad_norm": 2.845564126968384, + "learning_rate": 1.801905347264905e-05, + "loss": 1.3844, + "step": 6446 + }, + { + "epoch": 0.4953902888752305, + "grad_norm": 3.5024054050445557, + "learning_rate": 1.801843884449908e-05, + "loss": 1.4408, + "step": 6448 + }, + { + "epoch": 0.4955439459127228, + "grad_norm": 2.915093421936035, + "learning_rate": 1.8017824216349113e-05, + "loss": 1.2814, + "step": 6450 + }, + { + "epoch": 0.49569760295021514, + "grad_norm": 3.042811632156372, + "learning_rate": 1.801720958819914e-05, + "loss": 1.5103, + "step": 6452 + }, + { + "epoch": 0.49585125998770746, + "grad_norm": 2.851787805557251, + "learning_rate": 1.8016594960049172e-05, + "loss": 1.4586, + "step": 6454 + }, + { + "epoch": 0.4960049170251998, + "grad_norm": 3.1319057941436768, + "learning_rate": 1.80159803318992e-05, + "loss": 1.2987, + "step": 6456 + }, + { + "epoch": 0.4961585740626921, + "grad_norm": 2.650089740753174, + "learning_rate": 1.801536570374923e-05, + "loss": 1.3441, + "step": 6458 + }, + { + "epoch": 0.4963122311001844, + "grad_norm": 3.493260383605957, + "learning_rate": 1.8014751075599264e-05, + "loss": 1.6013, + "step": 6460 + }, + { + "epoch": 0.4964658881376767, + "grad_norm": 2.9392683506011963, + "learning_rate": 1.8014136447449294e-05, + "loss": 1.3622, + "step": 6462 + }, + { + "epoch": 0.49661954517516904, + "grad_norm": 3.106326103210449, + "learning_rate": 1.8013521819299324e-05, + "loss": 1.3347, + "step": 6464 + }, + { + "epoch": 0.49677320221266136, + "grad_norm": 2.914437770843506, + "learning_rate": 1.8012907191149357e-05, + "loss": 1.2599, + "step": 6466 + }, + { + "epoch": 0.4969268592501537, + "grad_norm": 2.910841464996338, + "learning_rate": 1.8012292562999386e-05, + "loss": 1.3622, + "step": 6468 + }, + { + "epoch": 0.497080516287646, + "grad_norm": 2.89884352684021, + "learning_rate": 1.801167793484942e-05, + "loss": 1.3582, + "step": 6470 + }, + { + "epoch": 0.4972341733251383, + "grad_norm": 2.8821210861206055, + "learning_rate": 1.801106330669945e-05, + "loss": 1.4164, + "step": 6472 + }, + { + "epoch": 0.4973878303626306, + "grad_norm": 2.601987838745117, + "learning_rate": 1.801044867854948e-05, + "loss": 1.5184, + "step": 6474 + }, + { + "epoch": 0.49754148740012294, + "grad_norm": 2.958704710006714, + "learning_rate": 1.8009834050399512e-05, + "loss": 1.3903, + "step": 6476 + }, + { + "epoch": 0.49769514443761526, + "grad_norm": 2.9979889392852783, + "learning_rate": 1.8009219422249538e-05, + "loss": 1.2592, + "step": 6478 + }, + { + "epoch": 0.49784880147510757, + "grad_norm": 2.916813611984253, + "learning_rate": 1.800860479409957e-05, + "loss": 1.3302, + "step": 6480 + }, + { + "epoch": 0.4980024585125999, + "grad_norm": 3.3828582763671875, + "learning_rate": 1.80079901659496e-05, + "loss": 1.4222, + "step": 6482 + }, + { + "epoch": 0.4981561155500922, + "grad_norm": 3.1152589321136475, + "learning_rate": 1.800737553779963e-05, + "loss": 1.3731, + "step": 6484 + }, + { + "epoch": 0.4983097725875845, + "grad_norm": 2.974968671798706, + "learning_rate": 1.8006760909649664e-05, + "loss": 1.2744, + "step": 6486 + }, + { + "epoch": 0.49846342962507684, + "grad_norm": 3.652846336364746, + "learning_rate": 1.8006146281499693e-05, + "loss": 1.4291, + "step": 6488 + }, + { + "epoch": 0.49861708666256915, + "grad_norm": 2.827791213989258, + "learning_rate": 1.8005531653349727e-05, + "loss": 1.2982, + "step": 6490 + }, + { + "epoch": 0.49877074370006147, + "grad_norm": 3.1473135948181152, + "learning_rate": 1.8004917025199756e-05, + "loss": 1.3051, + "step": 6492 + }, + { + "epoch": 0.4989244007375538, + "grad_norm": 3.156839370727539, + "learning_rate": 1.8004302397049786e-05, + "loss": 1.4366, + "step": 6494 + }, + { + "epoch": 0.4990780577750461, + "grad_norm": 2.8511626720428467, + "learning_rate": 1.800368776889982e-05, + "loss": 1.3214, + "step": 6496 + }, + { + "epoch": 0.4992317148125384, + "grad_norm": 2.7867062091827393, + "learning_rate": 1.800307314074985e-05, + "loss": 1.3791, + "step": 6498 + }, + { + "epoch": 0.49938537185003073, + "grad_norm": 3.0567257404327393, + "learning_rate": 1.800245851259988e-05, + "loss": 1.3549, + "step": 6500 + }, + { + "epoch": 0.49953902888752305, + "grad_norm": 2.9443788528442383, + "learning_rate": 1.800184388444991e-05, + "loss": 1.3798, + "step": 6502 + }, + { + "epoch": 0.49969268592501537, + "grad_norm": 2.903205156326294, + "learning_rate": 1.8001229256299938e-05, + "loss": 1.3004, + "step": 6504 + }, + { + "epoch": 0.4998463429625077, + "grad_norm": 3.2189865112304688, + "learning_rate": 1.800061462814997e-05, + "loss": 1.3023, + "step": 6506 + }, + { + "epoch": 0.5, + "grad_norm": 3.1087801456451416, + "learning_rate": 1.8e-05, + "loss": 1.508, + "step": 6508 + }, + { + "epoch": 0.5001536570374924, + "grad_norm": 3.1676154136657715, + "learning_rate": 1.799938537185003e-05, + "loss": 1.4198, + "step": 6510 + }, + { + "epoch": 0.5003073140749846, + "grad_norm": 2.717998504638672, + "learning_rate": 1.7998770743700063e-05, + "loss": 1.3673, + "step": 6512 + }, + { + "epoch": 0.500460971112477, + "grad_norm": 2.779644012451172, + "learning_rate": 1.7998156115550093e-05, + "loss": 1.493, + "step": 6514 + }, + { + "epoch": 0.5006146281499693, + "grad_norm": 2.706282377243042, + "learning_rate": 1.7997541487400126e-05, + "loss": 1.2036, + "step": 6516 + }, + { + "epoch": 0.5007682851874616, + "grad_norm": 2.846618175506592, + "learning_rate": 1.7996926859250156e-05, + "loss": 1.431, + "step": 6518 + }, + { + "epoch": 0.5009219422249539, + "grad_norm": 2.9443068504333496, + "learning_rate": 1.7996312231100185e-05, + "loss": 1.2797, + "step": 6520 + }, + { + "epoch": 0.5010755992624463, + "grad_norm": 3.471630334854126, + "learning_rate": 1.799569760295022e-05, + "loss": 1.436, + "step": 6522 + }, + { + "epoch": 0.5012292562999385, + "grad_norm": 3.2783944606781006, + "learning_rate": 1.7995082974800248e-05, + "loss": 1.4904, + "step": 6524 + }, + { + "epoch": 0.5013829133374309, + "grad_norm": 3.51324725151062, + "learning_rate": 1.7994468346650278e-05, + "loss": 1.3418, + "step": 6526 + }, + { + "epoch": 0.5015365703749232, + "grad_norm": 2.648725748062134, + "learning_rate": 1.799385371850031e-05, + "loss": 1.2583, + "step": 6528 + }, + { + "epoch": 0.5016902274124155, + "grad_norm": 2.8676857948303223, + "learning_rate": 1.7993239090350337e-05, + "loss": 1.3348, + "step": 6530 + }, + { + "epoch": 0.5018438844499078, + "grad_norm": 3.1224286556243896, + "learning_rate": 1.799262446220037e-05, + "loss": 1.3221, + "step": 6532 + }, + { + "epoch": 0.5019975414874002, + "grad_norm": 3.003473997116089, + "learning_rate": 1.79920098340504e-05, + "loss": 1.3466, + "step": 6534 + }, + { + "epoch": 0.5021511985248924, + "grad_norm": 2.8612453937530518, + "learning_rate": 1.7991395205900433e-05, + "loss": 1.2505, + "step": 6536 + }, + { + "epoch": 0.5023048555623848, + "grad_norm": 2.9750072956085205, + "learning_rate": 1.7990780577750463e-05, + "loss": 1.3482, + "step": 6538 + }, + { + "epoch": 0.5024585125998771, + "grad_norm": 2.6988656520843506, + "learning_rate": 1.7990165949600492e-05, + "loss": 1.2996, + "step": 6540 + }, + { + "epoch": 0.5026121696373694, + "grad_norm": 3.0278279781341553, + "learning_rate": 1.7989551321450525e-05, + "loss": 1.3919, + "step": 6542 + }, + { + "epoch": 0.5027658266748617, + "grad_norm": 2.9088683128356934, + "learning_rate": 1.7988936693300555e-05, + "loss": 1.4706, + "step": 6544 + }, + { + "epoch": 0.5029194837123541, + "grad_norm": 2.8318252563476562, + "learning_rate": 1.7988322065150585e-05, + "loss": 1.4209, + "step": 6546 + }, + { + "epoch": 0.5030731407498463, + "grad_norm": 2.994474411010742, + "learning_rate": 1.7987707437000618e-05, + "loss": 1.3159, + "step": 6548 + }, + { + "epoch": 0.5032267977873387, + "grad_norm": 2.9801907539367676, + "learning_rate": 1.7987092808850644e-05, + "loss": 1.2645, + "step": 6550 + }, + { + "epoch": 0.503380454824831, + "grad_norm": 3.4261326789855957, + "learning_rate": 1.7986478180700677e-05, + "loss": 1.4416, + "step": 6552 + }, + { + "epoch": 0.5035341118623233, + "grad_norm": 2.65868878364563, + "learning_rate": 1.798586355255071e-05, + "loss": 1.2989, + "step": 6554 + }, + { + "epoch": 0.5036877688998156, + "grad_norm": 3.106370210647583, + "learning_rate": 1.798524892440074e-05, + "loss": 1.3748, + "step": 6556 + }, + { + "epoch": 0.503841425937308, + "grad_norm": 2.6410956382751465, + "learning_rate": 1.798463429625077e-05, + "loss": 1.3152, + "step": 6558 + }, + { + "epoch": 0.5039950829748002, + "grad_norm": 2.8190648555755615, + "learning_rate": 1.79840196681008e-05, + "loss": 1.4078, + "step": 6560 + }, + { + "epoch": 0.5041487400122926, + "grad_norm": 2.9152209758758545, + "learning_rate": 1.7983405039950832e-05, + "loss": 1.2513, + "step": 6562 + }, + { + "epoch": 0.5043023970497849, + "grad_norm": 3.070107936859131, + "learning_rate": 1.7982790411800862e-05, + "loss": 1.4488, + "step": 6564 + }, + { + "epoch": 0.5044560540872772, + "grad_norm": 2.9198601245880127, + "learning_rate": 1.7982175783650892e-05, + "loss": 1.2694, + "step": 6566 + }, + { + "epoch": 0.5046097111247695, + "grad_norm": 3.084261417388916, + "learning_rate": 1.7981561155500925e-05, + "loss": 1.3638, + "step": 6568 + }, + { + "epoch": 0.5047633681622619, + "grad_norm": 3.0732178688049316, + "learning_rate": 1.7980946527350955e-05, + "loss": 1.2477, + "step": 6570 + }, + { + "epoch": 0.5049170251997541, + "grad_norm": 3.3027431964874268, + "learning_rate": 1.7980331899200984e-05, + "loss": 1.335, + "step": 6572 + }, + { + "epoch": 0.5050706822372465, + "grad_norm": 3.3048105239868164, + "learning_rate": 1.7979717271051017e-05, + "loss": 1.4374, + "step": 6574 + }, + { + "epoch": 0.5052243392747388, + "grad_norm": 3.292642831802368, + "learning_rate": 1.7979102642901047e-05, + "loss": 1.353, + "step": 6576 + }, + { + "epoch": 0.5053779963122311, + "grad_norm": 3.257822036743164, + "learning_rate": 1.7978488014751077e-05, + "loss": 1.3081, + "step": 6578 + }, + { + "epoch": 0.5055316533497234, + "grad_norm": 2.796616792678833, + "learning_rate": 1.7977873386601106e-05, + "loss": 1.3497, + "step": 6580 + }, + { + "epoch": 0.5056853103872158, + "grad_norm": 3.194371223449707, + "learning_rate": 1.797725875845114e-05, + "loss": 1.3587, + "step": 6582 + }, + { + "epoch": 0.505838967424708, + "grad_norm": 2.6798551082611084, + "learning_rate": 1.797664413030117e-05, + "loss": 1.3413, + "step": 6584 + }, + { + "epoch": 0.5059926244622004, + "grad_norm": 3.123992681503296, + "learning_rate": 1.79760295021512e-05, + "loss": 1.2649, + "step": 6586 + }, + { + "epoch": 0.5061462814996927, + "grad_norm": 3.01867413520813, + "learning_rate": 1.7975414874001232e-05, + "loss": 1.3729, + "step": 6588 + }, + { + "epoch": 0.506299938537185, + "grad_norm": 3.5075738430023193, + "learning_rate": 1.797480024585126e-05, + "loss": 1.3458, + "step": 6590 + }, + { + "epoch": 0.5064535955746773, + "grad_norm": 3.0404345989227295, + "learning_rate": 1.797418561770129e-05, + "loss": 1.4686, + "step": 6592 + }, + { + "epoch": 0.5066072526121697, + "grad_norm": 2.7560205459594727, + "learning_rate": 1.7973570989551324e-05, + "loss": 1.3492, + "step": 6594 + }, + { + "epoch": 0.5067609096496619, + "grad_norm": 3.127852439880371, + "learning_rate": 1.7972956361401354e-05, + "loss": 1.3995, + "step": 6596 + }, + { + "epoch": 0.5069145666871543, + "grad_norm": 2.963434934616089, + "learning_rate": 1.7972341733251384e-05, + "loss": 1.3773, + "step": 6598 + }, + { + "epoch": 0.5070682237246465, + "grad_norm": 3.146491050720215, + "learning_rate": 1.7971727105101417e-05, + "loss": 1.282, + "step": 6600 + }, + { + "epoch": 0.5072218807621389, + "grad_norm": 2.8593227863311768, + "learning_rate": 1.7971112476951446e-05, + "loss": 1.2804, + "step": 6602 + }, + { + "epoch": 0.5073755377996312, + "grad_norm": 3.011699914932251, + "learning_rate": 1.7970497848801476e-05, + "loss": 1.3739, + "step": 6604 + }, + { + "epoch": 0.5075291948371236, + "grad_norm": 3.7797420024871826, + "learning_rate": 1.7969883220651506e-05, + "loss": 1.4324, + "step": 6606 + }, + { + "epoch": 0.5076828518746158, + "grad_norm": 2.8261046409606934, + "learning_rate": 1.796926859250154e-05, + "loss": 1.3562, + "step": 6608 + }, + { + "epoch": 0.5078365089121082, + "grad_norm": 2.9160585403442383, + "learning_rate": 1.796865396435157e-05, + "loss": 1.5061, + "step": 6610 + }, + { + "epoch": 0.5079901659496004, + "grad_norm": 3.1277146339416504, + "learning_rate": 1.7968039336201598e-05, + "loss": 1.3759, + "step": 6612 + }, + { + "epoch": 0.5081438229870928, + "grad_norm": 3.4111275672912598, + "learning_rate": 1.796742470805163e-05, + "loss": 1.3682, + "step": 6614 + }, + { + "epoch": 0.5082974800245851, + "grad_norm": 3.055345296859741, + "learning_rate": 1.796681007990166e-05, + "loss": 1.4154, + "step": 6616 + }, + { + "epoch": 0.5084511370620775, + "grad_norm": 3.3309273719787598, + "learning_rate": 1.796619545175169e-05, + "loss": 1.3364, + "step": 6618 + }, + { + "epoch": 0.5086047940995697, + "grad_norm": 2.7727885246276855, + "learning_rate": 1.7965580823601724e-05, + "loss": 1.2019, + "step": 6620 + }, + { + "epoch": 0.5087584511370621, + "grad_norm": 3.5600733757019043, + "learning_rate": 1.7964966195451753e-05, + "loss": 1.4411, + "step": 6622 + }, + { + "epoch": 0.5089121081745543, + "grad_norm": 3.1073739528656006, + "learning_rate": 1.7964351567301783e-05, + "loss": 1.4255, + "step": 6624 + }, + { + "epoch": 0.5090657652120467, + "grad_norm": 2.9542529582977295, + "learning_rate": 1.7963736939151816e-05, + "loss": 1.2744, + "step": 6626 + }, + { + "epoch": 0.509219422249539, + "grad_norm": 3.2199599742889404, + "learning_rate": 1.7963122311001846e-05, + "loss": 1.289, + "step": 6628 + }, + { + "epoch": 0.5093730792870313, + "grad_norm": 3.095400333404541, + "learning_rate": 1.7962507682851876e-05, + "loss": 1.3363, + "step": 6630 + }, + { + "epoch": 0.5095267363245236, + "grad_norm": 3.337625503540039, + "learning_rate": 1.7961893054701905e-05, + "loss": 1.4096, + "step": 6632 + }, + { + "epoch": 0.509680393362016, + "grad_norm": 3.3786509037017822, + "learning_rate": 1.7961278426551938e-05, + "loss": 1.3938, + "step": 6634 + }, + { + "epoch": 0.5098340503995082, + "grad_norm": 2.652902364730835, + "learning_rate": 1.7960663798401968e-05, + "loss": 1.2738, + "step": 6636 + }, + { + "epoch": 0.5099877074370006, + "grad_norm": 3.4588985443115234, + "learning_rate": 1.7960049170251998e-05, + "loss": 1.2758, + "step": 6638 + }, + { + "epoch": 0.510141364474493, + "grad_norm": 3.1711127758026123, + "learning_rate": 1.795943454210203e-05, + "loss": 1.3747, + "step": 6640 + }, + { + "epoch": 0.5102950215119852, + "grad_norm": 3.4093017578125, + "learning_rate": 1.795881991395206e-05, + "loss": 1.2861, + "step": 6642 + }, + { + "epoch": 0.5104486785494776, + "grad_norm": 3.0328543186187744, + "learning_rate": 1.795820528580209e-05, + "loss": 1.5437, + "step": 6644 + }, + { + "epoch": 0.5106023355869699, + "grad_norm": 2.8662827014923096, + "learning_rate": 1.7957590657652123e-05, + "loss": 1.3798, + "step": 6646 + }, + { + "epoch": 0.5107559926244623, + "grad_norm": 2.6307129859924316, + "learning_rate": 1.7956976029502153e-05, + "loss": 1.1287, + "step": 6648 + }, + { + "epoch": 0.5109096496619545, + "grad_norm": 3.1315627098083496, + "learning_rate": 1.7956361401352183e-05, + "loss": 1.4458, + "step": 6650 + }, + { + "epoch": 0.5110633066994469, + "grad_norm": 3.4800686836242676, + "learning_rate": 1.7955746773202216e-05, + "loss": 1.5149, + "step": 6652 + }, + { + "epoch": 0.5112169637369391, + "grad_norm": 3.0013492107391357, + "learning_rate": 1.7955132145052245e-05, + "loss": 1.3806, + "step": 6654 + }, + { + "epoch": 0.5113706207744315, + "grad_norm": 3.450124502182007, + "learning_rate": 1.7954517516902275e-05, + "loss": 1.4322, + "step": 6656 + }, + { + "epoch": 0.5115242778119238, + "grad_norm": 2.9068808555603027, + "learning_rate": 1.7953902888752305e-05, + "loss": 1.3998, + "step": 6658 + }, + { + "epoch": 0.5116779348494161, + "grad_norm": 2.6469430923461914, + "learning_rate": 1.7953288260602338e-05, + "loss": 1.4466, + "step": 6660 + }, + { + "epoch": 0.5118315918869084, + "grad_norm": 2.9947774410247803, + "learning_rate": 1.7952673632452367e-05, + "loss": 1.3114, + "step": 6662 + }, + { + "epoch": 0.5119852489244008, + "grad_norm": 3.5101516246795654, + "learning_rate": 1.7952059004302397e-05, + "loss": 1.3693, + "step": 6664 + }, + { + "epoch": 0.512138905961893, + "grad_norm": 3.2445108890533447, + "learning_rate": 1.795144437615243e-05, + "loss": 1.3274, + "step": 6666 + }, + { + "epoch": 0.5122925629993854, + "grad_norm": 3.050546169281006, + "learning_rate": 1.795082974800246e-05, + "loss": 1.3715, + "step": 6668 + }, + { + "epoch": 0.5124462200368777, + "grad_norm": 3.0119810104370117, + "learning_rate": 1.795021511985249e-05, + "loss": 1.3978, + "step": 6670 + }, + { + "epoch": 0.51259987707437, + "grad_norm": 3.514958381652832, + "learning_rate": 1.7949600491702523e-05, + "loss": 1.4204, + "step": 6672 + }, + { + "epoch": 0.5127535341118623, + "grad_norm": 2.9083685874938965, + "learning_rate": 1.7948985863552552e-05, + "loss": 1.2959, + "step": 6674 + }, + { + "epoch": 0.5129071911493547, + "grad_norm": 2.6381547451019287, + "learning_rate": 1.7948371235402582e-05, + "loss": 1.3359, + "step": 6676 + }, + { + "epoch": 0.5130608481868469, + "grad_norm": 3.478752374649048, + "learning_rate": 1.794775660725261e-05, + "loss": 1.4583, + "step": 6678 + }, + { + "epoch": 0.5132145052243393, + "grad_norm": 2.834408760070801, + "learning_rate": 1.7947141979102645e-05, + "loss": 1.3429, + "step": 6680 + }, + { + "epoch": 0.5133681622618316, + "grad_norm": 3.0341265201568604, + "learning_rate": 1.7946527350952674e-05, + "loss": 1.3716, + "step": 6682 + }, + { + "epoch": 0.513521819299324, + "grad_norm": 2.8251383304595947, + "learning_rate": 1.7945912722802704e-05, + "loss": 1.5223, + "step": 6684 + }, + { + "epoch": 0.5136754763368162, + "grad_norm": 3.265641212463379, + "learning_rate": 1.7945298094652737e-05, + "loss": 1.269, + "step": 6686 + }, + { + "epoch": 0.5138291333743086, + "grad_norm": 3.247816562652588, + "learning_rate": 1.7944683466502767e-05, + "loss": 1.2121, + "step": 6688 + }, + { + "epoch": 0.5139827904118008, + "grad_norm": 3.2081658840179443, + "learning_rate": 1.7944068838352797e-05, + "loss": 1.2807, + "step": 6690 + }, + { + "epoch": 0.5141364474492932, + "grad_norm": 3.1236660480499268, + "learning_rate": 1.794345421020283e-05, + "loss": 1.5633, + "step": 6692 + }, + { + "epoch": 0.5142901044867855, + "grad_norm": 2.951101064682007, + "learning_rate": 1.794283958205286e-05, + "loss": 1.3899, + "step": 6694 + }, + { + "epoch": 0.5144437615242778, + "grad_norm": 3.1760647296905518, + "learning_rate": 1.794222495390289e-05, + "loss": 1.2689, + "step": 6696 + }, + { + "epoch": 0.5145974185617701, + "grad_norm": 3.310821294784546, + "learning_rate": 1.7941610325752922e-05, + "loss": 1.4301, + "step": 6698 + }, + { + "epoch": 0.5147510755992625, + "grad_norm": 3.1752665042877197, + "learning_rate": 1.7940995697602952e-05, + "loss": 1.2948, + "step": 6700 + }, + { + "epoch": 0.5149047326367547, + "grad_norm": 3.2648708820343018, + "learning_rate": 1.7940381069452985e-05, + "loss": 1.3254, + "step": 6702 + }, + { + "epoch": 0.5150583896742471, + "grad_norm": 3.0156726837158203, + "learning_rate": 1.793976644130301e-05, + "loss": 1.4047, + "step": 6704 + }, + { + "epoch": 0.5152120467117394, + "grad_norm": 3.873260974884033, + "learning_rate": 1.7939151813153044e-05, + "loss": 1.4925, + "step": 6706 + }, + { + "epoch": 0.5153657037492317, + "grad_norm": 3.176854372024536, + "learning_rate": 1.7938537185003074e-05, + "loss": 1.301, + "step": 6708 + }, + { + "epoch": 0.515519360786724, + "grad_norm": 2.7950966358184814, + "learning_rate": 1.7937922556853104e-05, + "loss": 1.3525, + "step": 6710 + }, + { + "epoch": 0.5156730178242164, + "grad_norm": 3.167912244796753, + "learning_rate": 1.7937307928703137e-05, + "loss": 1.5551, + "step": 6712 + }, + { + "epoch": 0.5158266748617086, + "grad_norm": 3.4793012142181396, + "learning_rate": 1.7936693300553166e-05, + "loss": 1.5019, + "step": 6714 + }, + { + "epoch": 0.515980331899201, + "grad_norm": 2.9153974056243896, + "learning_rate": 1.7936078672403196e-05, + "loss": 1.407, + "step": 6716 + }, + { + "epoch": 0.5161339889366933, + "grad_norm": 2.8722434043884277, + "learning_rate": 1.793546404425323e-05, + "loss": 1.3882, + "step": 6718 + }, + { + "epoch": 0.5162876459741856, + "grad_norm": 2.9194371700286865, + "learning_rate": 1.793484941610326e-05, + "loss": 1.4366, + "step": 6720 + }, + { + "epoch": 0.5164413030116779, + "grad_norm": 3.2603633403778076, + "learning_rate": 1.7934234787953292e-05, + "loss": 1.3387, + "step": 6722 + }, + { + "epoch": 0.5165949600491703, + "grad_norm": 3.0995850563049316, + "learning_rate": 1.793362015980332e-05, + "loss": 1.3719, + "step": 6724 + }, + { + "epoch": 0.5167486170866625, + "grad_norm": 2.8780972957611084, + "learning_rate": 1.793300553165335e-05, + "loss": 1.3721, + "step": 6726 + }, + { + "epoch": 0.5169022741241549, + "grad_norm": 3.1328043937683105, + "learning_rate": 1.7932390903503384e-05, + "loss": 1.2831, + "step": 6728 + }, + { + "epoch": 0.5170559311616472, + "grad_norm": 3.0199432373046875, + "learning_rate": 1.793177627535341e-05, + "loss": 1.3505, + "step": 6730 + }, + { + "epoch": 0.5172095881991395, + "grad_norm": 2.7379181385040283, + "learning_rate": 1.7931161647203444e-05, + "loss": 1.3088, + "step": 6732 + }, + { + "epoch": 0.5173632452366318, + "grad_norm": 2.929868459701538, + "learning_rate": 1.7930547019053473e-05, + "loss": 1.3396, + "step": 6734 + }, + { + "epoch": 0.5175169022741242, + "grad_norm": 2.787508249282837, + "learning_rate": 1.7929932390903503e-05, + "loss": 1.3644, + "step": 6736 + }, + { + "epoch": 0.5176705593116164, + "grad_norm": 2.7956087589263916, + "learning_rate": 1.7929317762753536e-05, + "loss": 1.2359, + "step": 6738 + }, + { + "epoch": 0.5178242163491088, + "grad_norm": 2.53226900100708, + "learning_rate": 1.7928703134603566e-05, + "loss": 1.307, + "step": 6740 + }, + { + "epoch": 0.5179778733866011, + "grad_norm": 2.9932830333709717, + "learning_rate": 1.79280885064536e-05, + "loss": 1.3173, + "step": 6742 + }, + { + "epoch": 0.5181315304240934, + "grad_norm": 2.900578260421753, + "learning_rate": 1.792747387830363e-05, + "loss": 1.4958, + "step": 6744 + }, + { + "epoch": 0.5182851874615857, + "grad_norm": 3.190718173980713, + "learning_rate": 1.7926859250153658e-05, + "loss": 1.3313, + "step": 6746 + }, + { + "epoch": 0.5184388444990781, + "grad_norm": 3.0104870796203613, + "learning_rate": 1.792624462200369e-05, + "loss": 1.3749, + "step": 6748 + }, + { + "epoch": 0.5185925015365703, + "grad_norm": 3.335132598876953, + "learning_rate": 1.792562999385372e-05, + "loss": 1.3441, + "step": 6750 + }, + { + "epoch": 0.5187461585740627, + "grad_norm": 2.9974780082702637, + "learning_rate": 1.792501536570375e-05, + "loss": 1.4476, + "step": 6752 + }, + { + "epoch": 0.518899815611555, + "grad_norm": 2.8957419395446777, + "learning_rate": 1.7924400737553784e-05, + "loss": 1.3322, + "step": 6754 + }, + { + "epoch": 0.5190534726490473, + "grad_norm": 3.018171787261963, + "learning_rate": 1.792378610940381e-05, + "loss": 1.3581, + "step": 6756 + }, + { + "epoch": 0.5192071296865396, + "grad_norm": 2.8184151649475098, + "learning_rate": 1.7923171481253843e-05, + "loss": 1.4037, + "step": 6758 + }, + { + "epoch": 0.519360786724032, + "grad_norm": 2.97739315032959, + "learning_rate": 1.7922556853103873e-05, + "loss": 1.2547, + "step": 6760 + }, + { + "epoch": 0.5195144437615242, + "grad_norm": 3.4361817836761475, + "learning_rate": 1.7921942224953906e-05, + "loss": 1.3367, + "step": 6762 + }, + { + "epoch": 0.5196681007990166, + "grad_norm": 25.288660049438477, + "learning_rate": 1.7921327596803935e-05, + "loss": 1.4986, + "step": 6764 + }, + { + "epoch": 0.5198217578365089, + "grad_norm": 3.100538492202759, + "learning_rate": 1.7920712968653965e-05, + "loss": 1.3149, + "step": 6766 + }, + { + "epoch": 0.5199754148740012, + "grad_norm": 2.8900961875915527, + "learning_rate": 1.7920098340503998e-05, + "loss": 1.352, + "step": 6768 + }, + { + "epoch": 0.5201290719114936, + "grad_norm": 2.9886999130249023, + "learning_rate": 1.7919483712354028e-05, + "loss": 1.2732, + "step": 6770 + }, + { + "epoch": 0.5202827289489859, + "grad_norm": 2.7704336643218994, + "learning_rate": 1.7918869084204058e-05, + "loss": 1.1976, + "step": 6772 + }, + { + "epoch": 0.5204363859864782, + "grad_norm": 2.448591709136963, + "learning_rate": 1.791825445605409e-05, + "loss": 1.2853, + "step": 6774 + }, + { + "epoch": 0.5205900430239705, + "grad_norm": 2.9498369693756104, + "learning_rate": 1.7917639827904117e-05, + "loss": 1.306, + "step": 6776 + }, + { + "epoch": 0.5207437000614629, + "grad_norm": 3.2364501953125, + "learning_rate": 1.791702519975415e-05, + "loss": 1.379, + "step": 6778 + }, + { + "epoch": 0.5208973570989551, + "grad_norm": 3.1900076866149902, + "learning_rate": 1.791641057160418e-05, + "loss": 1.4177, + "step": 6780 + }, + { + "epoch": 0.5210510141364475, + "grad_norm": 3.0492565631866455, + "learning_rate": 1.791579594345421e-05, + "loss": 1.4582, + "step": 6782 + }, + { + "epoch": 0.5212046711739398, + "grad_norm": 2.8004837036132812, + "learning_rate": 1.7915181315304242e-05, + "loss": 1.4414, + "step": 6784 + }, + { + "epoch": 0.5213583282114321, + "grad_norm": 2.4796876907348633, + "learning_rate": 1.7914566687154272e-05, + "loss": 1.388, + "step": 6786 + }, + { + "epoch": 0.5215119852489244, + "grad_norm": 2.892904281616211, + "learning_rate": 1.7913952059004305e-05, + "loss": 1.4414, + "step": 6788 + }, + { + "epoch": 0.5216656422864168, + "grad_norm": 3.042837381362915, + "learning_rate": 1.7913337430854335e-05, + "loss": 1.4302, + "step": 6790 + }, + { + "epoch": 0.521819299323909, + "grad_norm": 3.3126227855682373, + "learning_rate": 1.7912722802704365e-05, + "loss": 1.4458, + "step": 6792 + }, + { + "epoch": 0.5219729563614014, + "grad_norm": 2.8981101512908936, + "learning_rate": 1.7912108174554398e-05, + "loss": 1.4403, + "step": 6794 + }, + { + "epoch": 0.5221266133988937, + "grad_norm": 3.016014337539673, + "learning_rate": 1.7911493546404427e-05, + "loss": 1.235, + "step": 6796 + }, + { + "epoch": 0.522280270436386, + "grad_norm": 2.693057060241699, + "learning_rate": 1.7910878918254457e-05, + "loss": 1.3, + "step": 6798 + }, + { + "epoch": 0.5224339274738783, + "grad_norm": 2.8503472805023193, + "learning_rate": 1.791026429010449e-05, + "loss": 1.2262, + "step": 6800 + }, + { + "epoch": 0.5225875845113707, + "grad_norm": 3.0016205310821533, + "learning_rate": 1.7909649661954516e-05, + "loss": 1.328, + "step": 6802 + }, + { + "epoch": 0.5227412415488629, + "grad_norm": 2.7652969360351562, + "learning_rate": 1.790903503380455e-05, + "loss": 1.1953, + "step": 6804 + }, + { + "epoch": 0.5228948985863553, + "grad_norm": 3.02223539352417, + "learning_rate": 1.790842040565458e-05, + "loss": 1.1621, + "step": 6806 + }, + { + "epoch": 0.5230485556238476, + "grad_norm": 3.1197988986968994, + "learning_rate": 1.7907805777504612e-05, + "loss": 1.4014, + "step": 6808 + }, + { + "epoch": 0.5232022126613399, + "grad_norm": 2.959120273590088, + "learning_rate": 1.7907191149354642e-05, + "loss": 1.4844, + "step": 6810 + }, + { + "epoch": 0.5233558696988322, + "grad_norm": 3.0649282932281494, + "learning_rate": 1.790657652120467e-05, + "loss": 1.4225, + "step": 6812 + }, + { + "epoch": 0.5235095267363246, + "grad_norm": 2.9492135047912598, + "learning_rate": 1.7905961893054705e-05, + "loss": 1.4206, + "step": 6814 + }, + { + "epoch": 0.5236631837738168, + "grad_norm": 3.1265509128570557, + "learning_rate": 1.7905347264904734e-05, + "loss": 1.317, + "step": 6816 + }, + { + "epoch": 0.5238168408113092, + "grad_norm": 2.831228256225586, + "learning_rate": 1.7904732636754764e-05, + "loss": 1.4686, + "step": 6818 + }, + { + "epoch": 0.5239704978488015, + "grad_norm": 3.228123903274536, + "learning_rate": 1.7904118008604797e-05, + "loss": 1.4721, + "step": 6820 + }, + { + "epoch": 0.5241241548862938, + "grad_norm": 3.5068886280059814, + "learning_rate": 1.7903503380454827e-05, + "loss": 1.4167, + "step": 6822 + }, + { + "epoch": 0.5242778119237861, + "grad_norm": 2.8935916423797607, + "learning_rate": 1.7902888752304856e-05, + "loss": 1.362, + "step": 6824 + }, + { + "epoch": 0.5244314689612785, + "grad_norm": 3.246994972229004, + "learning_rate": 1.790227412415489e-05, + "loss": 1.3156, + "step": 6826 + }, + { + "epoch": 0.5245851259987707, + "grad_norm": 3.070075035095215, + "learning_rate": 1.790165949600492e-05, + "loss": 1.3155, + "step": 6828 + }, + { + "epoch": 0.5247387830362631, + "grad_norm": 3.2165653705596924, + "learning_rate": 1.790104486785495e-05, + "loss": 1.4158, + "step": 6830 + }, + { + "epoch": 0.5248924400737554, + "grad_norm": 3.122731924057007, + "learning_rate": 1.790043023970498e-05, + "loss": 1.312, + "step": 6832 + }, + { + "epoch": 0.5250460971112477, + "grad_norm": 2.7937536239624023, + "learning_rate": 1.789981561155501e-05, + "loss": 1.4255, + "step": 6834 + }, + { + "epoch": 0.52519975414874, + "grad_norm": 2.8714072704315186, + "learning_rate": 1.789920098340504e-05, + "loss": 1.3561, + "step": 6836 + }, + { + "epoch": 0.5253534111862324, + "grad_norm": 3.3876631259918213, + "learning_rate": 1.789858635525507e-05, + "loss": 1.2653, + "step": 6838 + }, + { + "epoch": 0.5255070682237246, + "grad_norm": 3.067852258682251, + "learning_rate": 1.7897971727105104e-05, + "loss": 1.462, + "step": 6840 + }, + { + "epoch": 0.525660725261217, + "grad_norm": 2.623149871826172, + "learning_rate": 1.7897357098955134e-05, + "loss": 1.358, + "step": 6842 + }, + { + "epoch": 0.5258143822987092, + "grad_norm": 3.001631021499634, + "learning_rate": 1.7896742470805163e-05, + "loss": 1.4515, + "step": 6844 + }, + { + "epoch": 0.5259680393362016, + "grad_norm": 2.7457780838012695, + "learning_rate": 1.7896127842655197e-05, + "loss": 1.2601, + "step": 6846 + }, + { + "epoch": 0.5261216963736939, + "grad_norm": 3.05543851852417, + "learning_rate": 1.7895513214505226e-05, + "loss": 1.3837, + "step": 6848 + }, + { + "epoch": 0.5262753534111863, + "grad_norm": 2.7177581787109375, + "learning_rate": 1.7894898586355256e-05, + "loss": 1.2661, + "step": 6850 + }, + { + "epoch": 0.5264290104486785, + "grad_norm": 2.9022562503814697, + "learning_rate": 1.789428395820529e-05, + "loss": 1.3782, + "step": 6852 + }, + { + "epoch": 0.5265826674861709, + "grad_norm": 3.01700758934021, + "learning_rate": 1.789366933005532e-05, + "loss": 1.3188, + "step": 6854 + }, + { + "epoch": 0.5267363245236631, + "grad_norm": 2.9770376682281494, + "learning_rate": 1.789305470190535e-05, + "loss": 1.4046, + "step": 6856 + }, + { + "epoch": 0.5268899815611555, + "grad_norm": 3.635504961013794, + "learning_rate": 1.7892440073755378e-05, + "loss": 1.3997, + "step": 6858 + }, + { + "epoch": 0.5270436385986478, + "grad_norm": 3.3790907859802246, + "learning_rate": 1.789182544560541e-05, + "loss": 1.3248, + "step": 6860 + }, + { + "epoch": 0.5271972956361402, + "grad_norm": 3.0795631408691406, + "learning_rate": 1.789121081745544e-05, + "loss": 1.2888, + "step": 6862 + }, + { + "epoch": 0.5273509526736324, + "grad_norm": 2.7538647651672363, + "learning_rate": 1.789059618930547e-05, + "loss": 1.5236, + "step": 6864 + }, + { + "epoch": 0.5275046097111248, + "grad_norm": 3.4936375617980957, + "learning_rate": 1.7889981561155504e-05, + "loss": 1.4455, + "step": 6866 + }, + { + "epoch": 0.527658266748617, + "grad_norm": 3.7572572231292725, + "learning_rate": 1.7889366933005533e-05, + "loss": 1.4181, + "step": 6868 + }, + { + "epoch": 0.5278119237861094, + "grad_norm": 3.407961130142212, + "learning_rate": 1.7888752304855563e-05, + "loss": 1.3646, + "step": 6870 + }, + { + "epoch": 0.5279655808236017, + "grad_norm": 2.7877607345581055, + "learning_rate": 1.7888137676705596e-05, + "loss": 1.259, + "step": 6872 + }, + { + "epoch": 0.528119237861094, + "grad_norm": 2.999338150024414, + "learning_rate": 1.7887523048555626e-05, + "loss": 1.3156, + "step": 6874 + }, + { + "epoch": 0.5282728948985863, + "grad_norm": 3.449953079223633, + "learning_rate": 1.7886908420405655e-05, + "loss": 1.4194, + "step": 6876 + }, + { + "epoch": 0.5284265519360787, + "grad_norm": 3.0166075229644775, + "learning_rate": 1.788629379225569e-05, + "loss": 1.3544, + "step": 6878 + }, + { + "epoch": 0.5285802089735709, + "grad_norm": 2.9438271522521973, + "learning_rate": 1.7885679164105718e-05, + "loss": 1.4669, + "step": 6880 + }, + { + "epoch": 0.5287338660110633, + "grad_norm": 2.558922529220581, + "learning_rate": 1.7885064535955748e-05, + "loss": 1.2715, + "step": 6882 + }, + { + "epoch": 0.5288875230485556, + "grad_norm": 2.997262954711914, + "learning_rate": 1.7884449907805777e-05, + "loss": 1.3229, + "step": 6884 + }, + { + "epoch": 0.529041180086048, + "grad_norm": 2.7784371376037598, + "learning_rate": 1.788383527965581e-05, + "loss": 1.2801, + "step": 6886 + }, + { + "epoch": 0.5291948371235402, + "grad_norm": 3.280125379562378, + "learning_rate": 1.788322065150584e-05, + "loss": 1.3505, + "step": 6888 + }, + { + "epoch": 0.5293484941610326, + "grad_norm": 3.9201979637145996, + "learning_rate": 1.788260602335587e-05, + "loss": 1.3235, + "step": 6890 + }, + { + "epoch": 0.5295021511985248, + "grad_norm": 3.0474581718444824, + "learning_rate": 1.7881991395205903e-05, + "loss": 1.3269, + "step": 6892 + }, + { + "epoch": 0.5296558082360172, + "grad_norm": 2.956028938293457, + "learning_rate": 1.7881376767055933e-05, + "loss": 1.2897, + "step": 6894 + }, + { + "epoch": 0.5298094652735095, + "grad_norm": 3.143683433532715, + "learning_rate": 1.7880762138905962e-05, + "loss": 1.3913, + "step": 6896 + }, + { + "epoch": 0.5299631223110018, + "grad_norm": 2.942600727081299, + "learning_rate": 1.7880147510755995e-05, + "loss": 1.2993, + "step": 6898 + }, + { + "epoch": 0.5301167793484942, + "grad_norm": 2.7133402824401855, + "learning_rate": 1.7879532882606025e-05, + "loss": 1.2878, + "step": 6900 + }, + { + "epoch": 0.5302704363859865, + "grad_norm": 2.7319624423980713, + "learning_rate": 1.7878918254456055e-05, + "loss": 1.2711, + "step": 6902 + }, + { + "epoch": 0.5304240934234788, + "grad_norm": 3.0787789821624756, + "learning_rate": 1.7878303626306084e-05, + "loss": 1.4048, + "step": 6904 + }, + { + "epoch": 0.5305777504609711, + "grad_norm": 3.218059539794922, + "learning_rate": 1.7877688998156118e-05, + "loss": 1.346, + "step": 6906 + }, + { + "epoch": 0.5307314074984635, + "grad_norm": 3.2375054359436035, + "learning_rate": 1.7877074370006147e-05, + "loss": 1.3984, + "step": 6908 + }, + { + "epoch": 0.5308850645359557, + "grad_norm": 2.9513885974884033, + "learning_rate": 1.7876459741856177e-05, + "loss": 1.4062, + "step": 6910 + }, + { + "epoch": 0.5310387215734481, + "grad_norm": 3.1400296688079834, + "learning_rate": 1.787584511370621e-05, + "loss": 1.4292, + "step": 6912 + }, + { + "epoch": 0.5311923786109404, + "grad_norm": 3.114333152770996, + "learning_rate": 1.787523048555624e-05, + "loss": 1.5008, + "step": 6914 + }, + { + "epoch": 0.5313460356484327, + "grad_norm": 2.8262038230895996, + "learning_rate": 1.787461585740627e-05, + "loss": 1.4744, + "step": 6916 + }, + { + "epoch": 0.531499692685925, + "grad_norm": 3.2163949012756348, + "learning_rate": 1.7874001229256302e-05, + "loss": 1.3959, + "step": 6918 + }, + { + "epoch": 0.5316533497234174, + "grad_norm": 2.7985939979553223, + "learning_rate": 1.7873386601106332e-05, + "loss": 1.4022, + "step": 6920 + }, + { + "epoch": 0.5318070067609096, + "grad_norm": 2.731464147567749, + "learning_rate": 1.7872771972956362e-05, + "loss": 1.3438, + "step": 6922 + }, + { + "epoch": 0.531960663798402, + "grad_norm": 3.187049627304077, + "learning_rate": 1.7872157344806395e-05, + "loss": 1.4009, + "step": 6924 + }, + { + "epoch": 0.5321143208358943, + "grad_norm": 2.9336483478546143, + "learning_rate": 1.7871542716656425e-05, + "loss": 1.412, + "step": 6926 + }, + { + "epoch": 0.5322679778733866, + "grad_norm": 3.132073402404785, + "learning_rate": 1.7870928088506454e-05, + "loss": 1.3688, + "step": 6928 + }, + { + "epoch": 0.5324216349108789, + "grad_norm": 2.981800079345703, + "learning_rate": 1.7870313460356484e-05, + "loss": 1.3054, + "step": 6930 + }, + { + "epoch": 0.5325752919483713, + "grad_norm": 3.3410542011260986, + "learning_rate": 1.7869698832206517e-05, + "loss": 1.3939, + "step": 6932 + }, + { + "epoch": 0.5327289489858635, + "grad_norm": 2.8164236545562744, + "learning_rate": 1.7869084204056547e-05, + "loss": 1.3269, + "step": 6934 + }, + { + "epoch": 0.5328826060233559, + "grad_norm": 3.8438332080841064, + "learning_rate": 1.7868469575906576e-05, + "loss": 1.3612, + "step": 6936 + }, + { + "epoch": 0.5330362630608482, + "grad_norm": 2.7539196014404297, + "learning_rate": 1.786785494775661e-05, + "loss": 1.351, + "step": 6938 + }, + { + "epoch": 0.5331899200983405, + "grad_norm": 3.0499660968780518, + "learning_rate": 1.786724031960664e-05, + "loss": 1.4173, + "step": 6940 + }, + { + "epoch": 0.5333435771358328, + "grad_norm": 3.459529399871826, + "learning_rate": 1.786662569145667e-05, + "loss": 1.4837, + "step": 6942 + }, + { + "epoch": 0.5334972341733252, + "grad_norm": 3.2329859733581543, + "learning_rate": 1.7866011063306702e-05, + "loss": 1.4251, + "step": 6944 + }, + { + "epoch": 0.5336508912108174, + "grad_norm": 3.210688352584839, + "learning_rate": 1.786539643515673e-05, + "loss": 1.3004, + "step": 6946 + }, + { + "epoch": 0.5338045482483098, + "grad_norm": 3.564739227294922, + "learning_rate": 1.786478180700676e-05, + "loss": 1.443, + "step": 6948 + }, + { + "epoch": 0.5339582052858021, + "grad_norm": 2.8052964210510254, + "learning_rate": 1.7864167178856794e-05, + "loss": 1.2934, + "step": 6950 + }, + { + "epoch": 0.5341118623232944, + "grad_norm": 3.189671039581299, + "learning_rate": 1.7863552550706824e-05, + "loss": 1.3459, + "step": 6952 + }, + { + "epoch": 0.5342655193607867, + "grad_norm": 3.1696484088897705, + "learning_rate": 1.7862937922556857e-05, + "loss": 1.4234, + "step": 6954 + }, + { + "epoch": 0.5344191763982791, + "grad_norm": 2.973815679550171, + "learning_rate": 1.7862323294406883e-05, + "loss": 1.2077, + "step": 6956 + }, + { + "epoch": 0.5345728334357713, + "grad_norm": 3.508512258529663, + "learning_rate": 1.7861708666256916e-05, + "loss": 1.3832, + "step": 6958 + }, + { + "epoch": 0.5347264904732637, + "grad_norm": 2.8029847145080566, + "learning_rate": 1.7861094038106946e-05, + "loss": 1.3648, + "step": 6960 + }, + { + "epoch": 0.534880147510756, + "grad_norm": 3.183760404586792, + "learning_rate": 1.7860479409956976e-05, + "loss": 1.4012, + "step": 6962 + }, + { + "epoch": 0.5350338045482483, + "grad_norm": 3.181417942047119, + "learning_rate": 1.785986478180701e-05, + "loss": 1.3642, + "step": 6964 + }, + { + "epoch": 0.5351874615857406, + "grad_norm": 3.075376033782959, + "learning_rate": 1.785925015365704e-05, + "loss": 1.4268, + "step": 6966 + }, + { + "epoch": 0.535341118623233, + "grad_norm": 3.069875478744507, + "learning_rate": 1.7858635525507068e-05, + "loss": 1.4777, + "step": 6968 + }, + { + "epoch": 0.5354947756607252, + "grad_norm": 3.1056337356567383, + "learning_rate": 1.78580208973571e-05, + "loss": 1.4871, + "step": 6970 + }, + { + "epoch": 0.5356484326982176, + "grad_norm": 3.0267107486724854, + "learning_rate": 1.785740626920713e-05, + "loss": 1.4342, + "step": 6972 + }, + { + "epoch": 0.5358020897357099, + "grad_norm": 3.082707643508911, + "learning_rate": 1.7856791641057164e-05, + "loss": 1.4585, + "step": 6974 + }, + { + "epoch": 0.5359557467732022, + "grad_norm": 3.154895544052124, + "learning_rate": 1.7856177012907194e-05, + "loss": 1.4232, + "step": 6976 + }, + { + "epoch": 0.5361094038106945, + "grad_norm": 2.978407621383667, + "learning_rate": 1.7855562384757223e-05, + "loss": 1.3511, + "step": 6978 + }, + { + "epoch": 0.5362630608481869, + "grad_norm": 2.940761089324951, + "learning_rate": 1.7854947756607257e-05, + "loss": 1.372, + "step": 6980 + }, + { + "epoch": 0.5364167178856791, + "grad_norm": 2.9799039363861084, + "learning_rate": 1.7854333128457283e-05, + "loss": 1.5043, + "step": 6982 + }, + { + "epoch": 0.5365703749231715, + "grad_norm": 2.7917842864990234, + "learning_rate": 1.7853718500307316e-05, + "loss": 1.2237, + "step": 6984 + }, + { + "epoch": 0.5367240319606638, + "grad_norm": 2.8482396602630615, + "learning_rate": 1.7853103872157346e-05, + "loss": 1.4475, + "step": 6986 + }, + { + "epoch": 0.5368776889981561, + "grad_norm": 3.9226222038269043, + "learning_rate": 1.7852489244007375e-05, + "loss": 1.3577, + "step": 6988 + }, + { + "epoch": 0.5370313460356484, + "grad_norm": 2.828793525695801, + "learning_rate": 1.785187461585741e-05, + "loss": 1.3318, + "step": 6990 + }, + { + "epoch": 0.5371850030731408, + "grad_norm": 2.9199106693267822, + "learning_rate": 1.7851259987707438e-05, + "loss": 1.3467, + "step": 6992 + }, + { + "epoch": 0.537338660110633, + "grad_norm": 2.835475444793701, + "learning_rate": 1.785064535955747e-05, + "loss": 1.3634, + "step": 6994 + }, + { + "epoch": 0.5374923171481254, + "grad_norm": 3.4660840034484863, + "learning_rate": 1.78500307314075e-05, + "loss": 1.2317, + "step": 6996 + }, + { + "epoch": 0.5376459741856177, + "grad_norm": 4.105753421783447, + "learning_rate": 1.784941610325753e-05, + "loss": 1.3137, + "step": 6998 + }, + { + "epoch": 0.53779963122311, + "grad_norm": 2.8999927043914795, + "learning_rate": 1.7848801475107564e-05, + "loss": 1.2568, + "step": 7000 } ], "logging_steps": 2, @@ -19976,7 +24526,7 @@ "attributes": {} } }, - "total_flos": 3.638521767175127e+19, + "total_flos": 4.46836006495191e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null