{ "best_metric": 1.3142019510269165, "best_model_checkpoint": "/export/data/salmasia/tradutor/checkpoints/hf_phi3_lora/checkpoint-19500", "epoch": 6.254770193041568, "eval_steps": 500, "global_step": 21000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002978461996686461, "grad_norm": 0.48318326473236084, "learning_rate": 2e-08, "loss": 1.845, "step": 1 }, { "epoch": 0.0005956923993372922, "grad_norm": 0.48029589653015137, "learning_rate": 4e-08, "loss": 1.8713, "step": 2 }, { "epoch": 0.0008935385990059383, "grad_norm": 0.4785495400428772, "learning_rate": 6.000000000000001e-08, "loss": 1.8476, "step": 3 }, { "epoch": 0.0011913847986745843, "grad_norm": 0.47286421060562134, "learning_rate": 8e-08, "loss": 1.8556, "step": 4 }, { "epoch": 0.0014892309983432306, "grad_norm": 0.4699583649635315, "learning_rate": 1.0000000000000001e-07, "loss": 1.8643, "step": 5 }, { "epoch": 0.0017870771980118766, "grad_norm": 0.48511743545532227, "learning_rate": 1.2000000000000002e-07, "loss": 1.8712, "step": 6 }, { "epoch": 0.002084923397680523, "grad_norm": 0.46384984254837036, "learning_rate": 1.4e-07, "loss": 1.8585, "step": 7 }, { "epoch": 0.0023827695973491687, "grad_norm": 0.4602062404155731, "learning_rate": 1.6e-07, "loss": 1.8288, "step": 8 }, { "epoch": 0.002680615797017815, "grad_norm": 0.47385460138320923, "learning_rate": 1.8e-07, "loss": 1.8433, "step": 9 }, { "epoch": 0.002978461996686461, "grad_norm": 0.4577183425426483, "learning_rate": 2.0000000000000002e-07, "loss": 1.8512, "step": 10 }, { "epoch": 0.003276308196355107, "grad_norm": 0.48740354180336, "learning_rate": 2.2e-07, "loss": 1.8633, "step": 11 }, { "epoch": 0.0035741543960237532, "grad_norm": 0.47859111428260803, "learning_rate": 2.4000000000000003e-07, "loss": 1.8782, "step": 12 }, { "epoch": 0.0038720005956923995, "grad_norm": 0.4740448594093323, "learning_rate": 2.6e-07, "loss": 1.8873, "step": 13 }, { "epoch": 0.004169846795361046, "grad_norm": 0.4691798985004425, "learning_rate": 2.8e-07, "loss": 1.8568, "step": 14 }, { "epoch": 0.004467692995029691, "grad_norm": 0.4607478678226471, "learning_rate": 3.0000000000000004e-07, "loss": 1.8595, "step": 15 }, { "epoch": 0.004765539194698337, "grad_norm": 0.4853513836860657, "learning_rate": 3.2e-07, "loss": 1.8717, "step": 16 }, { "epoch": 0.005063385394366984, "grad_norm": 0.4743955433368683, "learning_rate": 3.4000000000000003e-07, "loss": 1.8515, "step": 17 }, { "epoch": 0.00536123159403563, "grad_norm": 0.4832974076271057, "learning_rate": 3.6e-07, "loss": 1.8849, "step": 18 }, { "epoch": 0.005659077793704276, "grad_norm": 0.4619203507900238, "learning_rate": 3.8e-07, "loss": 1.8496, "step": 19 }, { "epoch": 0.005956923993372922, "grad_norm": 0.4837261736392975, "learning_rate": 4.0000000000000003e-07, "loss": 1.8788, "step": 20 }, { "epoch": 0.006254770193041568, "grad_norm": 0.46011224389076233, "learning_rate": 4.2000000000000006e-07, "loss": 1.8711, "step": 21 }, { "epoch": 0.006552616392710214, "grad_norm": 0.46401581168174744, "learning_rate": 4.4e-07, "loss": 1.8299, "step": 22 }, { "epoch": 0.00685046259237886, "grad_norm": 0.47924378514289856, "learning_rate": 4.6000000000000004e-07, "loss": 1.8648, "step": 23 }, { "epoch": 0.0071483087920475065, "grad_norm": 0.46203556656837463, "learning_rate": 4.800000000000001e-07, "loss": 1.8662, "step": 24 }, { "epoch": 0.007446154991716153, "grad_norm": 0.44733723998069763, "learning_rate": 5.000000000000001e-07, "loss": 1.8475, "step": 25 }, { "epoch": 0.007744001191384799, "grad_norm": 0.45633989572525024, "learning_rate": 5.2e-07, "loss": 1.8526, "step": 26 }, { "epoch": 0.008041847391053444, "grad_norm": 0.4663243591785431, "learning_rate": 5.4e-07, "loss": 1.8574, "step": 27 }, { "epoch": 0.008339693590722091, "grad_norm": 0.4683181941509247, "learning_rate": 5.6e-07, "loss": 1.8392, "step": 28 }, { "epoch": 0.008637539790390737, "grad_norm": 0.46668142080307007, "learning_rate": 5.800000000000001e-07, "loss": 1.8743, "step": 29 }, { "epoch": 0.008935385990059382, "grad_norm": 0.48022735118865967, "learning_rate": 6.000000000000001e-07, "loss": 1.8605, "step": 30 }, { "epoch": 0.00923323218972803, "grad_norm": 0.4465586543083191, "learning_rate": 6.200000000000001e-07, "loss": 1.8268, "step": 31 }, { "epoch": 0.009531078389396675, "grad_norm": 0.46732452511787415, "learning_rate": 6.4e-07, "loss": 1.8707, "step": 32 }, { "epoch": 0.009828924589065322, "grad_norm": 0.4528926908969879, "learning_rate": 6.6e-07, "loss": 1.8549, "step": 33 }, { "epoch": 0.010126770788733967, "grad_norm": 0.4607648551464081, "learning_rate": 6.800000000000001e-07, "loss": 1.8582, "step": 34 }, { "epoch": 0.010424616988402614, "grad_norm": 0.4756196141242981, "learning_rate": 7.000000000000001e-07, "loss": 1.8732, "step": 35 }, { "epoch": 0.01072246318807126, "grad_norm": 0.4642297923564911, "learning_rate": 7.2e-07, "loss": 1.8596, "step": 36 }, { "epoch": 0.011020309387739905, "grad_norm": 0.45597583055496216, "learning_rate": 7.4e-07, "loss": 1.84, "step": 37 }, { "epoch": 0.011318155587408552, "grad_norm": 0.4625261127948761, "learning_rate": 7.6e-07, "loss": 1.8599, "step": 38 }, { "epoch": 0.011616001787077198, "grad_norm": 0.4891977608203888, "learning_rate": 7.8e-07, "loss": 1.8729, "step": 39 }, { "epoch": 0.011913847986745845, "grad_norm": 0.47429201006889343, "learning_rate": 8.000000000000001e-07, "loss": 1.844, "step": 40 }, { "epoch": 0.01221169418641449, "grad_norm": 0.4835318922996521, "learning_rate": 8.200000000000001e-07, "loss": 1.8688, "step": 41 }, { "epoch": 0.012509540386083135, "grad_norm": 0.49138543009757996, "learning_rate": 8.400000000000001e-07, "loss": 1.8678, "step": 42 }, { "epoch": 0.012807386585751783, "grad_norm": 0.4650191068649292, "learning_rate": 8.6e-07, "loss": 1.853, "step": 43 }, { "epoch": 0.013105232785420428, "grad_norm": 0.47995880246162415, "learning_rate": 8.8e-07, "loss": 1.8733, "step": 44 }, { "epoch": 0.013403078985089075, "grad_norm": 0.46710073947906494, "learning_rate": 9.000000000000001e-07, "loss": 1.8289, "step": 45 }, { "epoch": 0.01370092518475772, "grad_norm": 0.48132428526878357, "learning_rate": 9.200000000000001e-07, "loss": 1.8678, "step": 46 }, { "epoch": 0.013998771384426368, "grad_norm": 0.4732246994972229, "learning_rate": 9.400000000000001e-07, "loss": 1.8511, "step": 47 }, { "epoch": 0.014296617584095013, "grad_norm": 0.48072347044944763, "learning_rate": 9.600000000000001e-07, "loss": 1.861, "step": 48 }, { "epoch": 0.014594463783763658, "grad_norm": 0.4825969934463501, "learning_rate": 9.800000000000001e-07, "loss": 1.8701, "step": 49 }, { "epoch": 0.014892309983432305, "grad_norm": 0.4636313319206238, "learning_rate": 1.0000000000000002e-06, "loss": 1.8455, "step": 50 }, { "epoch": 0.01519015618310095, "grad_norm": 0.47920089960098267, "learning_rate": 1.02e-06, "loss": 1.8628, "step": 51 }, { "epoch": 0.015488002382769598, "grad_norm": 0.4644438922405243, "learning_rate": 1.04e-06, "loss": 1.8411, "step": 52 }, { "epoch": 0.015785848582438245, "grad_norm": 0.4573518633842468, "learning_rate": 1.06e-06, "loss": 1.8294, "step": 53 }, { "epoch": 0.01608369478210689, "grad_norm": 0.4642627239227295, "learning_rate": 1.08e-06, "loss": 1.8371, "step": 54 }, { "epoch": 0.016381540981775536, "grad_norm": 0.4713428020477295, "learning_rate": 1.1e-06, "loss": 1.8422, "step": 55 }, { "epoch": 0.016679387181444183, "grad_norm": 0.4599405527114868, "learning_rate": 1.12e-06, "loss": 1.8188, "step": 56 }, { "epoch": 0.016977233381112827, "grad_norm": 0.47443103790283203, "learning_rate": 1.14e-06, "loss": 1.8586, "step": 57 }, { "epoch": 0.017275079580781474, "grad_norm": 0.46728289127349854, "learning_rate": 1.1600000000000001e-06, "loss": 1.8384, "step": 58 }, { "epoch": 0.01757292578045012, "grad_norm": 0.4742635190486908, "learning_rate": 1.1800000000000001e-06, "loss": 1.8487, "step": 59 }, { "epoch": 0.017870771980118764, "grad_norm": 0.5065075755119324, "learning_rate": 1.2000000000000002e-06, "loss": 1.8695, "step": 60 }, { "epoch": 0.01816861817978741, "grad_norm": 0.4815598726272583, "learning_rate": 1.2200000000000002e-06, "loss": 1.847, "step": 61 }, { "epoch": 0.01846646437945606, "grad_norm": 0.5095518827438354, "learning_rate": 1.2400000000000002e-06, "loss": 1.8614, "step": 62 }, { "epoch": 0.018764310579124706, "grad_norm": 0.484244704246521, "learning_rate": 1.26e-06, "loss": 1.8396, "step": 63 }, { "epoch": 0.01906215677879335, "grad_norm": 0.5211488604545593, "learning_rate": 1.28e-06, "loss": 1.8555, "step": 64 }, { "epoch": 0.019360002978461997, "grad_norm": 0.5129069685935974, "learning_rate": 1.3e-06, "loss": 1.8668, "step": 65 }, { "epoch": 0.019657849178130644, "grad_norm": 0.5392414927482605, "learning_rate": 1.32e-06, "loss": 1.9, "step": 66 }, { "epoch": 0.019955695377799287, "grad_norm": 0.5067933201789856, "learning_rate": 1.34e-06, "loss": 1.857, "step": 67 }, { "epoch": 0.020253541577467934, "grad_norm": 0.5195866227149963, "learning_rate": 1.3600000000000001e-06, "loss": 1.8268, "step": 68 }, { "epoch": 0.02055138777713658, "grad_norm": 0.5253258943557739, "learning_rate": 1.3800000000000001e-06, "loss": 1.8535, "step": 69 }, { "epoch": 0.02084923397680523, "grad_norm": 0.5204430222511292, "learning_rate": 1.4000000000000001e-06, "loss": 1.8414, "step": 70 }, { "epoch": 0.021147080176473872, "grad_norm": 0.5310043096542358, "learning_rate": 1.42e-06, "loss": 1.8639, "step": 71 }, { "epoch": 0.02144492637614252, "grad_norm": 0.5121457576751709, "learning_rate": 1.44e-06, "loss": 1.8546, "step": 72 }, { "epoch": 0.021742772575811167, "grad_norm": 0.5088541507720947, "learning_rate": 1.46e-06, "loss": 1.8422, "step": 73 }, { "epoch": 0.02204061877547981, "grad_norm": 0.5058099627494812, "learning_rate": 1.48e-06, "loss": 1.8439, "step": 74 }, { "epoch": 0.022338464975148457, "grad_norm": 0.4940571188926697, "learning_rate": 1.5e-06, "loss": 1.8213, "step": 75 }, { "epoch": 0.022636311174817104, "grad_norm": 0.5225253701210022, "learning_rate": 1.52e-06, "loss": 1.8536, "step": 76 }, { "epoch": 0.02293415737448575, "grad_norm": 0.5128735303878784, "learning_rate": 1.54e-06, "loss": 1.8145, "step": 77 }, { "epoch": 0.023232003574154395, "grad_norm": 0.5496757626533508, "learning_rate": 1.56e-06, "loss": 1.8661, "step": 78 }, { "epoch": 0.023529849773823042, "grad_norm": 0.5278938412666321, "learning_rate": 1.5800000000000001e-06, "loss": 1.8255, "step": 79 }, { "epoch": 0.02382769597349169, "grad_norm": 0.5448312163352966, "learning_rate": 1.6000000000000001e-06, "loss": 1.8722, "step": 80 }, { "epoch": 0.024125542173160333, "grad_norm": 0.5144855976104736, "learning_rate": 1.6200000000000002e-06, "loss": 1.8249, "step": 81 }, { "epoch": 0.02442338837282898, "grad_norm": 0.5338228940963745, "learning_rate": 1.6400000000000002e-06, "loss": 1.8331, "step": 82 }, { "epoch": 0.024721234572497627, "grad_norm": 0.5393568873405457, "learning_rate": 1.6600000000000002e-06, "loss": 1.8334, "step": 83 }, { "epoch": 0.02501908077216627, "grad_norm": 0.5475087761878967, "learning_rate": 1.6800000000000002e-06, "loss": 1.8572, "step": 84 }, { "epoch": 0.025316926971834918, "grad_norm": 0.5298764705657959, "learning_rate": 1.7000000000000002e-06, "loss": 1.8323, "step": 85 }, { "epoch": 0.025614773171503565, "grad_norm": 0.5606328248977661, "learning_rate": 1.72e-06, "loss": 1.8654, "step": 86 }, { "epoch": 0.025912619371172212, "grad_norm": 0.5514033436775208, "learning_rate": 1.74e-06, "loss": 1.8227, "step": 87 }, { "epoch": 0.026210465570840856, "grad_norm": 0.559169352054596, "learning_rate": 1.76e-06, "loss": 1.8398, "step": 88 }, { "epoch": 0.026508311770509503, "grad_norm": 0.5687103867530823, "learning_rate": 1.7800000000000001e-06, "loss": 1.8377, "step": 89 }, { "epoch": 0.02680615797017815, "grad_norm": 0.5481163859367371, "learning_rate": 1.8000000000000001e-06, "loss": 1.8384, "step": 90 }, { "epoch": 0.027104004169846794, "grad_norm": 0.5697974562644958, "learning_rate": 1.8200000000000002e-06, "loss": 1.8511, "step": 91 }, { "epoch": 0.02740185036951544, "grad_norm": 0.5484298467636108, "learning_rate": 1.8400000000000002e-06, "loss": 1.8351, "step": 92 }, { "epoch": 0.027699696569184088, "grad_norm": 0.5311096906661987, "learning_rate": 1.8600000000000002e-06, "loss": 1.8234, "step": 93 }, { "epoch": 0.027997542768852735, "grad_norm": 0.5723997950553894, "learning_rate": 1.8800000000000002e-06, "loss": 1.8366, "step": 94 }, { "epoch": 0.02829538896852138, "grad_norm": 0.5615350604057312, "learning_rate": 1.9000000000000002e-06, "loss": 1.8531, "step": 95 }, { "epoch": 0.028593235168190026, "grad_norm": 0.5616511702537537, "learning_rate": 1.9200000000000003e-06, "loss": 1.8165, "step": 96 }, { "epoch": 0.028891081367858673, "grad_norm": 0.5790432095527649, "learning_rate": 1.94e-06, "loss": 1.856, "step": 97 }, { "epoch": 0.029188927567527317, "grad_norm": 0.5516716837882996, "learning_rate": 1.9600000000000003e-06, "loss": 1.8194, "step": 98 }, { "epoch": 0.029486773767195964, "grad_norm": 0.57439124584198, "learning_rate": 1.98e-06, "loss": 1.8388, "step": 99 }, { "epoch": 0.02978461996686461, "grad_norm": 0.5567128658294678, "learning_rate": 2.0000000000000003e-06, "loss": 1.8182, "step": 100 }, { "epoch": 0.030082466166533258, "grad_norm": 0.5853058099746704, "learning_rate": 2.02e-06, "loss": 1.8468, "step": 101 }, { "epoch": 0.0303803123662019, "grad_norm": 0.5795766115188599, "learning_rate": 2.04e-06, "loss": 1.8274, "step": 102 }, { "epoch": 0.03067815856587055, "grad_norm": 0.6096683740615845, "learning_rate": 2.06e-06, "loss": 1.8367, "step": 103 }, { "epoch": 0.030976004765539196, "grad_norm": 0.5890591740608215, "learning_rate": 2.08e-06, "loss": 1.8098, "step": 104 }, { "epoch": 0.03127385096520784, "grad_norm": 0.6070247888565063, "learning_rate": 2.1000000000000002e-06, "loss": 1.8306, "step": 105 }, { "epoch": 0.03157169716487649, "grad_norm": 0.6029655337333679, "learning_rate": 2.12e-06, "loss": 1.8243, "step": 106 }, { "epoch": 0.03186954336454513, "grad_norm": 0.5994901657104492, "learning_rate": 2.1400000000000003e-06, "loss": 1.8273, "step": 107 }, { "epoch": 0.03216738956421378, "grad_norm": 0.5960265398025513, "learning_rate": 2.16e-06, "loss": 1.8279, "step": 108 }, { "epoch": 0.032465235763882425, "grad_norm": 0.6401825547218323, "learning_rate": 2.1800000000000003e-06, "loss": 1.8318, "step": 109 }, { "epoch": 0.03276308196355107, "grad_norm": 0.6169389486312866, "learning_rate": 2.2e-06, "loss": 1.8093, "step": 110 }, { "epoch": 0.03306092816321972, "grad_norm": 0.6193079352378845, "learning_rate": 2.2200000000000003e-06, "loss": 1.8122, "step": 111 }, { "epoch": 0.033358774362888366, "grad_norm": 0.657038152217865, "learning_rate": 2.24e-06, "loss": 1.8412, "step": 112 }, { "epoch": 0.03365662056255701, "grad_norm": 0.6150979399681091, "learning_rate": 2.2600000000000004e-06, "loss": 1.812, "step": 113 }, { "epoch": 0.03395446676222565, "grad_norm": 0.6213580369949341, "learning_rate": 2.28e-06, "loss": 1.8112, "step": 114 }, { "epoch": 0.0342523129618943, "grad_norm": 0.6286153793334961, "learning_rate": 2.3000000000000004e-06, "loss": 1.7994, "step": 115 }, { "epoch": 0.03455015916156295, "grad_norm": 0.6432107090950012, "learning_rate": 2.3200000000000002e-06, "loss": 1.8105, "step": 116 }, { "epoch": 0.034848005361231595, "grad_norm": 0.629039466381073, "learning_rate": 2.3400000000000005e-06, "loss": 1.7944, "step": 117 }, { "epoch": 0.03514585156090024, "grad_norm": 0.6385027170181274, "learning_rate": 2.3600000000000003e-06, "loss": 1.8105, "step": 118 }, { "epoch": 0.03544369776056889, "grad_norm": 0.6555307507514954, "learning_rate": 2.38e-06, "loss": 1.824, "step": 119 }, { "epoch": 0.03574154396023753, "grad_norm": 0.6707586646080017, "learning_rate": 2.4000000000000003e-06, "loss": 1.7936, "step": 120 }, { "epoch": 0.036039390159906176, "grad_norm": 0.6686285138130188, "learning_rate": 2.42e-06, "loss": 1.8116, "step": 121 }, { "epoch": 0.03633723635957482, "grad_norm": 0.6620265245437622, "learning_rate": 2.4400000000000004e-06, "loss": 1.8123, "step": 122 }, { "epoch": 0.03663508255924347, "grad_norm": 0.6774595975875854, "learning_rate": 2.46e-06, "loss": 1.7954, "step": 123 }, { "epoch": 0.03693292875891212, "grad_norm": 0.6555871963500977, "learning_rate": 2.4800000000000004e-06, "loss": 1.8002, "step": 124 }, { "epoch": 0.037230774958580765, "grad_norm": 0.6729901432991028, "learning_rate": 2.5e-06, "loss": 1.8022, "step": 125 }, { "epoch": 0.03752862115824941, "grad_norm": 0.6566896438598633, "learning_rate": 2.52e-06, "loss": 1.7948, "step": 126 }, { "epoch": 0.03782646735791805, "grad_norm": 0.6590833067893982, "learning_rate": 2.5400000000000002e-06, "loss": 1.7972, "step": 127 }, { "epoch": 0.0381243135575867, "grad_norm": 0.6688756942749023, "learning_rate": 2.56e-06, "loss": 1.804, "step": 128 }, { "epoch": 0.038422159757255346, "grad_norm": 0.6758476495742798, "learning_rate": 2.5800000000000003e-06, "loss": 1.7914, "step": 129 }, { "epoch": 0.03872000595692399, "grad_norm": 0.6805055737495422, "learning_rate": 2.6e-06, "loss": 1.7951, "step": 130 }, { "epoch": 0.03901785215659264, "grad_norm": 0.6505172252655029, "learning_rate": 2.6200000000000003e-06, "loss": 1.7979, "step": 131 }, { "epoch": 0.03931569835626129, "grad_norm": 0.6635544896125793, "learning_rate": 2.64e-06, "loss": 1.7859, "step": 132 }, { "epoch": 0.039613544555929935, "grad_norm": 0.6838406920433044, "learning_rate": 2.6600000000000004e-06, "loss": 1.7956, "step": 133 }, { "epoch": 0.039911390755598575, "grad_norm": 0.6474420428276062, "learning_rate": 2.68e-06, "loss": 1.7752, "step": 134 }, { "epoch": 0.04020923695526722, "grad_norm": 0.6364726424217224, "learning_rate": 2.7000000000000004e-06, "loss": 1.7841, "step": 135 }, { "epoch": 0.04050708315493587, "grad_norm": 0.6838220357894897, "learning_rate": 2.7200000000000002e-06, "loss": 1.7843, "step": 136 }, { "epoch": 0.040804929354604516, "grad_norm": 0.660972535610199, "learning_rate": 2.7400000000000004e-06, "loss": 1.7815, "step": 137 }, { "epoch": 0.04110277555427316, "grad_norm": 0.7104812860488892, "learning_rate": 2.7600000000000003e-06, "loss": 1.8164, "step": 138 }, { "epoch": 0.04140062175394181, "grad_norm": 0.6517634987831116, "learning_rate": 2.7800000000000005e-06, "loss": 1.7552, "step": 139 }, { "epoch": 0.04169846795361046, "grad_norm": 0.6587279438972473, "learning_rate": 2.8000000000000003e-06, "loss": 1.7679, "step": 140 }, { "epoch": 0.0419963141532791, "grad_norm": 0.6743229627609253, "learning_rate": 2.82e-06, "loss": 1.7719, "step": 141 }, { "epoch": 0.042294160352947745, "grad_norm": 0.6563546061515808, "learning_rate": 2.84e-06, "loss": 1.7677, "step": 142 }, { "epoch": 0.04259200655261639, "grad_norm": 0.6561146378517151, "learning_rate": 2.86e-06, "loss": 1.7399, "step": 143 }, { "epoch": 0.04288985275228504, "grad_norm": 0.6793851256370544, "learning_rate": 2.88e-06, "loss": 1.7773, "step": 144 }, { "epoch": 0.043187698951953686, "grad_norm": 0.7065466642379761, "learning_rate": 2.9e-06, "loss": 1.7771, "step": 145 }, { "epoch": 0.04348554515162233, "grad_norm": 0.6534376740455627, "learning_rate": 2.92e-06, "loss": 1.7494, "step": 146 }, { "epoch": 0.04378339135129098, "grad_norm": 0.6629258990287781, "learning_rate": 2.9400000000000002e-06, "loss": 1.7584, "step": 147 }, { "epoch": 0.04408123755095962, "grad_norm": 0.6547302007675171, "learning_rate": 2.96e-06, "loss": 1.7452, "step": 148 }, { "epoch": 0.04437908375062827, "grad_norm": 0.6431775093078613, "learning_rate": 2.9800000000000003e-06, "loss": 1.7595, "step": 149 }, { "epoch": 0.044676929950296915, "grad_norm": 0.6590375900268555, "learning_rate": 3e-06, "loss": 1.7349, "step": 150 }, { "epoch": 0.04497477614996556, "grad_norm": 0.6398279070854187, "learning_rate": 3.0200000000000003e-06, "loss": 1.7334, "step": 151 }, { "epoch": 0.04527262234963421, "grad_norm": 0.6175107955932617, "learning_rate": 3.04e-06, "loss": 1.7406, "step": 152 }, { "epoch": 0.045570468549302856, "grad_norm": 0.6381570100784302, "learning_rate": 3.0600000000000003e-06, "loss": 1.7192, "step": 153 }, { "epoch": 0.0458683147489715, "grad_norm": 0.5893192291259766, "learning_rate": 3.08e-06, "loss": 1.7181, "step": 154 }, { "epoch": 0.04616616094864014, "grad_norm": 0.5449604392051697, "learning_rate": 3.1000000000000004e-06, "loss": 1.7141, "step": 155 }, { "epoch": 0.04646400714830879, "grad_norm": 0.5816596746444702, "learning_rate": 3.12e-06, "loss": 1.7301, "step": 156 }, { "epoch": 0.04676185334797744, "grad_norm": 0.5603417754173279, "learning_rate": 3.1400000000000004e-06, "loss": 1.7215, "step": 157 }, { "epoch": 0.047059699547646085, "grad_norm": 0.557874858379364, "learning_rate": 3.1600000000000002e-06, "loss": 1.7011, "step": 158 }, { "epoch": 0.04735754574731473, "grad_norm": 0.5643253922462463, "learning_rate": 3.1800000000000005e-06, "loss": 1.6999, "step": 159 }, { "epoch": 0.04765539194698338, "grad_norm": 0.5932505130767822, "learning_rate": 3.2000000000000003e-06, "loss": 1.7408, "step": 160 }, { "epoch": 0.04795323814665202, "grad_norm": 0.5699681043624878, "learning_rate": 3.2200000000000005e-06, "loss": 1.7111, "step": 161 }, { "epoch": 0.048251084346320666, "grad_norm": 0.5691033005714417, "learning_rate": 3.2400000000000003e-06, "loss": 1.6914, "step": 162 }, { "epoch": 0.04854893054598931, "grad_norm": 0.579626739025116, "learning_rate": 3.2600000000000006e-06, "loss": 1.7124, "step": 163 }, { "epoch": 0.04884677674565796, "grad_norm": 0.580288290977478, "learning_rate": 3.2800000000000004e-06, "loss": 1.7117, "step": 164 }, { "epoch": 0.04914462294532661, "grad_norm": 0.5958548188209534, "learning_rate": 3.3000000000000006e-06, "loss": 1.7044, "step": 165 }, { "epoch": 0.049442469144995255, "grad_norm": 0.5895776748657227, "learning_rate": 3.3200000000000004e-06, "loss": 1.6968, "step": 166 }, { "epoch": 0.0497403153446639, "grad_norm": 0.5660363435745239, "learning_rate": 3.3400000000000006e-06, "loss": 1.6819, "step": 167 }, { "epoch": 0.05003816154433254, "grad_norm": 0.5647677183151245, "learning_rate": 3.3600000000000004e-06, "loss": 1.6939, "step": 168 }, { "epoch": 0.05033600774400119, "grad_norm": 0.518638014793396, "learning_rate": 3.3800000000000007e-06, "loss": 1.7027, "step": 169 }, { "epoch": 0.050633853943669836, "grad_norm": 0.43530601263046265, "learning_rate": 3.4000000000000005e-06, "loss": 1.686, "step": 170 }, { "epoch": 0.05093170014333848, "grad_norm": 0.4004081189632416, "learning_rate": 3.4200000000000007e-06, "loss": 1.6593, "step": 171 }, { "epoch": 0.05122954634300713, "grad_norm": 0.4063718020915985, "learning_rate": 3.44e-06, "loss": 1.6799, "step": 172 }, { "epoch": 0.05152739254267578, "grad_norm": 0.3832969069480896, "learning_rate": 3.46e-06, "loss": 1.688, "step": 173 }, { "epoch": 0.051825238742344425, "grad_norm": 0.39484286308288574, "learning_rate": 3.48e-06, "loss": 1.6716, "step": 174 }, { "epoch": 0.052123084942013065, "grad_norm": 0.3989197611808777, "learning_rate": 3.5e-06, "loss": 1.6764, "step": 175 }, { "epoch": 0.05242093114168171, "grad_norm": 0.38792771100997925, "learning_rate": 3.52e-06, "loss": 1.6629, "step": 176 }, { "epoch": 0.05271877734135036, "grad_norm": 0.4144127368927002, "learning_rate": 3.54e-06, "loss": 1.6654, "step": 177 }, { "epoch": 0.053016623541019006, "grad_norm": 0.399394690990448, "learning_rate": 3.5600000000000002e-06, "loss": 1.6423, "step": 178 }, { "epoch": 0.05331446974068765, "grad_norm": 0.40582484006881714, "learning_rate": 3.58e-06, "loss": 1.6579, "step": 179 }, { "epoch": 0.0536123159403563, "grad_norm": 0.38763391971588135, "learning_rate": 3.6000000000000003e-06, "loss": 1.6463, "step": 180 }, { "epoch": 0.05391016214002495, "grad_norm": 0.39296337962150574, "learning_rate": 3.62e-06, "loss": 1.6406, "step": 181 }, { "epoch": 0.05420800833969359, "grad_norm": 0.3867253065109253, "learning_rate": 3.6400000000000003e-06, "loss": 1.6463, "step": 182 }, { "epoch": 0.054505854539362235, "grad_norm": 0.36434227228164673, "learning_rate": 3.66e-06, "loss": 1.6242, "step": 183 }, { "epoch": 0.05480370073903088, "grad_norm": 0.38888224959373474, "learning_rate": 3.6800000000000003e-06, "loss": 1.6614, "step": 184 }, { "epoch": 0.05510154693869953, "grad_norm": 0.37744617462158203, "learning_rate": 3.7e-06, "loss": 1.6095, "step": 185 }, { "epoch": 0.055399393138368176, "grad_norm": 0.38917142152786255, "learning_rate": 3.7200000000000004e-06, "loss": 1.6236, "step": 186 }, { "epoch": 0.05569723933803682, "grad_norm": 0.3825514018535614, "learning_rate": 3.74e-06, "loss": 1.6188, "step": 187 }, { "epoch": 0.05599508553770547, "grad_norm": 0.3907104432582855, "learning_rate": 3.7600000000000004e-06, "loss": 1.6368, "step": 188 }, { "epoch": 0.05629293173737411, "grad_norm": 0.37548741698265076, "learning_rate": 3.7800000000000002e-06, "loss": 1.6048, "step": 189 }, { "epoch": 0.05659077793704276, "grad_norm": 0.38833755254745483, "learning_rate": 3.8000000000000005e-06, "loss": 1.6154, "step": 190 }, { "epoch": 0.056888624136711405, "grad_norm": 0.3748374879360199, "learning_rate": 3.820000000000001e-06, "loss": 1.6299, "step": 191 }, { "epoch": 0.05718647033638005, "grad_norm": 0.4172203838825226, "learning_rate": 3.8400000000000005e-06, "loss": 1.6216, "step": 192 }, { "epoch": 0.0574843165360487, "grad_norm": 0.4116345942020416, "learning_rate": 3.86e-06, "loss": 1.6112, "step": 193 }, { "epoch": 0.057782162735717346, "grad_norm": 0.4274803698062897, "learning_rate": 3.88e-06, "loss": 1.6245, "step": 194 }, { "epoch": 0.05808000893538599, "grad_norm": 0.3828435242176056, "learning_rate": 3.900000000000001e-06, "loss": 1.6066, "step": 195 }, { "epoch": 0.058377855135054633, "grad_norm": 0.35347607731819153, "learning_rate": 3.920000000000001e-06, "loss": 1.5996, "step": 196 }, { "epoch": 0.05867570133472328, "grad_norm": 0.3211362063884735, "learning_rate": 3.94e-06, "loss": 1.5987, "step": 197 }, { "epoch": 0.05897354753439193, "grad_norm": 0.3006207346916199, "learning_rate": 3.96e-06, "loss": 1.59, "step": 198 }, { "epoch": 0.059271393734060575, "grad_norm": 0.29288944602012634, "learning_rate": 3.980000000000001e-06, "loss": 1.6042, "step": 199 }, { "epoch": 0.05956923993372922, "grad_norm": 0.27601832151412964, "learning_rate": 4.000000000000001e-06, "loss": 1.5809, "step": 200 }, { "epoch": 0.05986708613339787, "grad_norm": 0.2815020680427551, "learning_rate": 4.0200000000000005e-06, "loss": 1.5892, "step": 201 }, { "epoch": 0.060164932333066516, "grad_norm": 0.28545090556144714, "learning_rate": 4.04e-06, "loss": 1.5847, "step": 202 }, { "epoch": 0.060462778532735156, "grad_norm": 0.275897741317749, "learning_rate": 4.060000000000001e-06, "loss": 1.5937, "step": 203 }, { "epoch": 0.0607606247324038, "grad_norm": 0.27737608551979065, "learning_rate": 4.08e-06, "loss": 1.5782, "step": 204 }, { "epoch": 0.06105847093207245, "grad_norm": 0.2871304750442505, "learning_rate": 4.1e-06, "loss": 1.5841, "step": 205 }, { "epoch": 0.0613563171317411, "grad_norm": 0.29327964782714844, "learning_rate": 4.12e-06, "loss": 1.5759, "step": 206 }, { "epoch": 0.061654163331409745, "grad_norm": 0.2835024893283844, "learning_rate": 4.14e-06, "loss": 1.5834, "step": 207 }, { "epoch": 0.06195200953107839, "grad_norm": 0.2886221706867218, "learning_rate": 4.16e-06, "loss": 1.5973, "step": 208 }, { "epoch": 0.06224985573074703, "grad_norm": 0.2900846600532532, "learning_rate": 4.18e-06, "loss": 1.6061, "step": 209 }, { "epoch": 0.06254770193041569, "grad_norm": 0.28939002752304077, "learning_rate": 4.2000000000000004e-06, "loss": 1.5918, "step": 210 }, { "epoch": 0.06284554813008433, "grad_norm": 0.316026896238327, "learning_rate": 4.22e-06, "loss": 1.5984, "step": 211 }, { "epoch": 0.06314339432975298, "grad_norm": 0.30959439277648926, "learning_rate": 4.24e-06, "loss": 1.5915, "step": 212 }, { "epoch": 0.06344124052942161, "grad_norm": 0.30867230892181396, "learning_rate": 4.26e-06, "loss": 1.5838, "step": 213 }, { "epoch": 0.06373908672909026, "grad_norm": 0.3157883882522583, "learning_rate": 4.2800000000000005e-06, "loss": 1.5959, "step": 214 }, { "epoch": 0.06403693292875891, "grad_norm": 0.3085717558860779, "learning_rate": 4.3e-06, "loss": 1.5681, "step": 215 }, { "epoch": 0.06433477912842755, "grad_norm": 0.30693379044532776, "learning_rate": 4.32e-06, "loss": 1.5813, "step": 216 }, { "epoch": 0.0646326253280962, "grad_norm": 0.33237066864967346, "learning_rate": 4.34e-06, "loss": 1.5841, "step": 217 }, { "epoch": 0.06493047152776485, "grad_norm": 0.33629241585731506, "learning_rate": 4.360000000000001e-06, "loss": 1.6016, "step": 218 }, { "epoch": 0.0652283177274335, "grad_norm": 0.3244820237159729, "learning_rate": 4.38e-06, "loss": 1.5914, "step": 219 }, { "epoch": 0.06552616392710214, "grad_norm": 0.33084553480148315, "learning_rate": 4.4e-06, "loss": 1.5835, "step": 220 }, { "epoch": 0.06582401012677079, "grad_norm": 0.34093067049980164, "learning_rate": 4.42e-06, "loss": 1.5894, "step": 221 }, { "epoch": 0.06612185632643944, "grad_norm": 0.35316869616508484, "learning_rate": 4.440000000000001e-06, "loss": 1.5778, "step": 222 }, { "epoch": 0.06641970252610808, "grad_norm": 0.3374737799167633, "learning_rate": 4.4600000000000005e-06, "loss": 1.5691, "step": 223 }, { "epoch": 0.06671754872577673, "grad_norm": 0.34237730503082275, "learning_rate": 4.48e-06, "loss": 1.5646, "step": 224 }, { "epoch": 0.06701539492544538, "grad_norm": 0.34547626972198486, "learning_rate": 4.5e-06, "loss": 1.5657, "step": 225 }, { "epoch": 0.06731324112511403, "grad_norm": 0.3584066927433014, "learning_rate": 4.520000000000001e-06, "loss": 1.5571, "step": 226 }, { "epoch": 0.06761108732478266, "grad_norm": 0.36502501368522644, "learning_rate": 4.540000000000001e-06, "loss": 1.5843, "step": 227 }, { "epoch": 0.0679089335244513, "grad_norm": 0.35336628556251526, "learning_rate": 4.56e-06, "loss": 1.559, "step": 228 }, { "epoch": 0.06820677972411995, "grad_norm": 0.38950905203819275, "learning_rate": 4.58e-06, "loss": 1.6019, "step": 229 }, { "epoch": 0.0685046259237886, "grad_norm": 0.3913187086582184, "learning_rate": 4.600000000000001e-06, "loss": 1.5615, "step": 230 }, { "epoch": 0.06880247212345725, "grad_norm": 0.3854864835739136, "learning_rate": 4.620000000000001e-06, "loss": 1.5799, "step": 231 }, { "epoch": 0.0691003183231259, "grad_norm": 0.39519399404525757, "learning_rate": 4.6400000000000005e-06, "loss": 1.5983, "step": 232 }, { "epoch": 0.06939816452279454, "grad_norm": 0.3765176832675934, "learning_rate": 4.66e-06, "loss": 1.5673, "step": 233 }, { "epoch": 0.06969601072246319, "grad_norm": 0.37804102897644043, "learning_rate": 4.680000000000001e-06, "loss": 1.5628, "step": 234 }, { "epoch": 0.06999385692213184, "grad_norm": 0.3661312460899353, "learning_rate": 4.7e-06, "loss": 1.5743, "step": 235 }, { "epoch": 0.07029170312180048, "grad_norm": 0.4114713966846466, "learning_rate": 4.7200000000000005e-06, "loss": 1.5789, "step": 236 }, { "epoch": 0.07058954932146913, "grad_norm": 0.4079197645187378, "learning_rate": 4.74e-06, "loss": 1.561, "step": 237 }, { "epoch": 0.07088739552113778, "grad_norm": 0.4288586378097534, "learning_rate": 4.76e-06, "loss": 1.5744, "step": 238 }, { "epoch": 0.07118524172080642, "grad_norm": 0.44189968705177307, "learning_rate": 4.78e-06, "loss": 1.5642, "step": 239 }, { "epoch": 0.07148308792047506, "grad_norm": 0.46231845021247864, "learning_rate": 4.800000000000001e-06, "loss": 1.5813, "step": 240 }, { "epoch": 0.0717809341201437, "grad_norm": 0.44197338819503784, "learning_rate": 4.8200000000000004e-06, "loss": 1.5579, "step": 241 }, { "epoch": 0.07207878031981235, "grad_norm": 0.44414064288139343, "learning_rate": 4.84e-06, "loss": 1.5589, "step": 242 }, { "epoch": 0.072376626519481, "grad_norm": 0.41845178604125977, "learning_rate": 4.86e-06, "loss": 1.5465, "step": 243 }, { "epoch": 0.07267447271914965, "grad_norm": 0.4510466158390045, "learning_rate": 4.880000000000001e-06, "loss": 1.5466, "step": 244 }, { "epoch": 0.0729723189188183, "grad_norm": 0.442803293466568, "learning_rate": 4.9000000000000005e-06, "loss": 1.5472, "step": 245 }, { "epoch": 0.07327016511848694, "grad_norm": 0.45264631509780884, "learning_rate": 4.92e-06, "loss": 1.5581, "step": 246 }, { "epoch": 0.07356801131815559, "grad_norm": 0.4818227291107178, "learning_rate": 4.94e-06, "loss": 1.5708, "step": 247 }, { "epoch": 0.07386585751782423, "grad_norm": 0.4921914041042328, "learning_rate": 4.960000000000001e-06, "loss": 1.5533, "step": 248 }, { "epoch": 0.07416370371749288, "grad_norm": 0.5005349516868591, "learning_rate": 4.980000000000001e-06, "loss": 1.5569, "step": 249 }, { "epoch": 0.07446154991716153, "grad_norm": 0.5374754667282104, "learning_rate": 5e-06, "loss": 1.565, "step": 250 }, { "epoch": 0.07475939611683018, "grad_norm": 0.5428454279899597, "learning_rate": 5.02e-06, "loss": 1.5404, "step": 251 }, { "epoch": 0.07505724231649882, "grad_norm": 0.5481797456741333, "learning_rate": 5.04e-06, "loss": 1.5472, "step": 252 }, { "epoch": 0.07535508851616747, "grad_norm": 0.5704034566879272, "learning_rate": 5.060000000000001e-06, "loss": 1.5569, "step": 253 }, { "epoch": 0.0756529347158361, "grad_norm": 0.5555924773216248, "learning_rate": 5.0800000000000005e-06, "loss": 1.5427, "step": 254 }, { "epoch": 0.07595078091550475, "grad_norm": 0.5683029294013977, "learning_rate": 5.1e-06, "loss": 1.5584, "step": 255 }, { "epoch": 0.0762486271151734, "grad_norm": 0.5711975693702698, "learning_rate": 5.12e-06, "loss": 1.5481, "step": 256 }, { "epoch": 0.07654647331484205, "grad_norm": 0.5935271382331848, "learning_rate": 5.140000000000001e-06, "loss": 1.5775, "step": 257 }, { "epoch": 0.07684431951451069, "grad_norm": 0.5552716851234436, "learning_rate": 5.1600000000000006e-06, "loss": 1.541, "step": 258 }, { "epoch": 0.07714216571417934, "grad_norm": 0.5461673736572266, "learning_rate": 5.18e-06, "loss": 1.5504, "step": 259 }, { "epoch": 0.07744001191384799, "grad_norm": 0.5375115871429443, "learning_rate": 5.2e-06, "loss": 1.544, "step": 260 }, { "epoch": 0.07773785811351663, "grad_norm": 0.5090441107749939, "learning_rate": 5.220000000000001e-06, "loss": 1.5501, "step": 261 }, { "epoch": 0.07803570431318528, "grad_norm": 0.4780338704586029, "learning_rate": 5.240000000000001e-06, "loss": 1.527, "step": 262 }, { "epoch": 0.07833355051285393, "grad_norm": 0.4490078389644623, "learning_rate": 5.2600000000000005e-06, "loss": 1.5554, "step": 263 }, { "epoch": 0.07863139671252257, "grad_norm": 0.39460065960884094, "learning_rate": 5.28e-06, "loss": 1.5536, "step": 264 }, { "epoch": 0.07892924291219122, "grad_norm": 0.3691290318965912, "learning_rate": 5.300000000000001e-06, "loss": 1.5547, "step": 265 }, { "epoch": 0.07922708911185987, "grad_norm": 0.3380715847015381, "learning_rate": 5.320000000000001e-06, "loss": 1.5509, "step": 266 }, { "epoch": 0.07952493531152852, "grad_norm": 0.32584908604621887, "learning_rate": 5.3400000000000005e-06, "loss": 1.5476, "step": 267 }, { "epoch": 0.07982278151119715, "grad_norm": 0.2909034192562103, "learning_rate": 5.36e-06, "loss": 1.5489, "step": 268 }, { "epoch": 0.0801206277108658, "grad_norm": 0.2502373456954956, "learning_rate": 5.380000000000001e-06, "loss": 1.5511, "step": 269 }, { "epoch": 0.08041847391053444, "grad_norm": 0.20297959446907043, "learning_rate": 5.400000000000001e-06, "loss": 1.5694, "step": 270 }, { "epoch": 0.08071632011020309, "grad_norm": 0.14874856173992157, "learning_rate": 5.420000000000001e-06, "loss": 1.5543, "step": 271 }, { "epoch": 0.08101416630987174, "grad_norm": 0.13056589663028717, "learning_rate": 5.4400000000000004e-06, "loss": 1.5251, "step": 272 }, { "epoch": 0.08131201250954039, "grad_norm": 0.12990182638168335, "learning_rate": 5.460000000000001e-06, "loss": 1.5272, "step": 273 }, { "epoch": 0.08160985870920903, "grad_norm": 0.11918067187070847, "learning_rate": 5.480000000000001e-06, "loss": 1.5343, "step": 274 }, { "epoch": 0.08190770490887768, "grad_norm": 0.10813926160335541, "learning_rate": 5.500000000000001e-06, "loss": 1.532, "step": 275 }, { "epoch": 0.08220555110854633, "grad_norm": 0.10963642597198486, "learning_rate": 5.5200000000000005e-06, "loss": 1.5594, "step": 276 }, { "epoch": 0.08250339730821497, "grad_norm": 0.10151322931051254, "learning_rate": 5.540000000000001e-06, "loss": 1.5263, "step": 277 }, { "epoch": 0.08280124350788362, "grad_norm": 0.09857906401157379, "learning_rate": 5.560000000000001e-06, "loss": 1.5248, "step": 278 }, { "epoch": 0.08309908970755227, "grad_norm": 0.09040740132331848, "learning_rate": 5.580000000000001e-06, "loss": 1.5135, "step": 279 }, { "epoch": 0.08339693590722091, "grad_norm": 0.09191717952489853, "learning_rate": 5.600000000000001e-06, "loss": 1.5464, "step": 280 }, { "epoch": 0.08369478210688955, "grad_norm": 0.08776262402534485, "learning_rate": 5.620000000000001e-06, "loss": 1.5336, "step": 281 }, { "epoch": 0.0839926283065582, "grad_norm": 0.08440259844064713, "learning_rate": 5.64e-06, "loss": 1.5354, "step": 282 }, { "epoch": 0.08429047450622684, "grad_norm": 0.08505409210920334, "learning_rate": 5.66e-06, "loss": 1.5324, "step": 283 }, { "epoch": 0.08458832070589549, "grad_norm": 0.08075687289237976, "learning_rate": 5.68e-06, "loss": 1.5314, "step": 284 }, { "epoch": 0.08488616690556414, "grad_norm": 0.08375275880098343, "learning_rate": 5.7e-06, "loss": 1.5268, "step": 285 }, { "epoch": 0.08518401310523278, "grad_norm": 0.08402302861213684, "learning_rate": 5.72e-06, "loss": 1.5414, "step": 286 }, { "epoch": 0.08548185930490143, "grad_norm": 0.08408354222774506, "learning_rate": 5.74e-06, "loss": 1.5371, "step": 287 }, { "epoch": 0.08577970550457008, "grad_norm": 0.08600781112909317, "learning_rate": 5.76e-06, "loss": 1.5377, "step": 288 }, { "epoch": 0.08607755170423873, "grad_norm": 0.07958939671516418, "learning_rate": 5.78e-06, "loss": 1.5353, "step": 289 }, { "epoch": 0.08637539790390737, "grad_norm": 0.08092907816171646, "learning_rate": 5.8e-06, "loss": 1.5351, "step": 290 }, { "epoch": 0.08667324410357602, "grad_norm": 0.07932665199041367, "learning_rate": 5.82e-06, "loss": 1.519, "step": 291 }, { "epoch": 0.08697109030324467, "grad_norm": 0.0793927013874054, "learning_rate": 5.84e-06, "loss": 1.5481, "step": 292 }, { "epoch": 0.08726893650291331, "grad_norm": 0.07992880046367645, "learning_rate": 5.86e-06, "loss": 1.5377, "step": 293 }, { "epoch": 0.08756678270258196, "grad_norm": 0.07722952216863632, "learning_rate": 5.8800000000000005e-06, "loss": 1.5277, "step": 294 }, { "epoch": 0.0878646289022506, "grad_norm": 0.07810430228710175, "learning_rate": 5.9e-06, "loss": 1.5226, "step": 295 }, { "epoch": 0.08816247510191924, "grad_norm": 0.07677609473466873, "learning_rate": 5.92e-06, "loss": 1.5236, "step": 296 }, { "epoch": 0.08846032130158789, "grad_norm": 0.0761965662240982, "learning_rate": 5.94e-06, "loss": 1.5195, "step": 297 }, { "epoch": 0.08875816750125654, "grad_norm": 0.07856539636850357, "learning_rate": 5.9600000000000005e-06, "loss": 1.5277, "step": 298 }, { "epoch": 0.08905601370092518, "grad_norm": 0.0727543905377388, "learning_rate": 5.98e-06, "loss": 1.5017, "step": 299 }, { "epoch": 0.08935385990059383, "grad_norm": 0.07391712069511414, "learning_rate": 6e-06, "loss": 1.5229, "step": 300 }, { "epoch": 0.08965170610026248, "grad_norm": 0.08318979293107986, "learning_rate": 6.02e-06, "loss": 1.5263, "step": 301 }, { "epoch": 0.08994955229993112, "grad_norm": 0.06995917111635208, "learning_rate": 6.040000000000001e-06, "loss": 1.5201, "step": 302 }, { "epoch": 0.09024739849959977, "grad_norm": 0.07505074143409729, "learning_rate": 6.0600000000000004e-06, "loss": 1.5303, "step": 303 }, { "epoch": 0.09054524469926842, "grad_norm": 0.0731302946805954, "learning_rate": 6.08e-06, "loss": 1.5316, "step": 304 }, { "epoch": 0.09084309089893707, "grad_norm": 0.06981611996889114, "learning_rate": 6.1e-06, "loss": 1.5401, "step": 305 }, { "epoch": 0.09114093709860571, "grad_norm": 0.07196632772684097, "learning_rate": 6.120000000000001e-06, "loss": 1.5397, "step": 306 }, { "epoch": 0.09143878329827436, "grad_norm": 0.07196949422359467, "learning_rate": 6.1400000000000005e-06, "loss": 1.5239, "step": 307 }, { "epoch": 0.091736629497943, "grad_norm": 0.07190251350402832, "learning_rate": 6.16e-06, "loss": 1.532, "step": 308 }, { "epoch": 0.09203447569761164, "grad_norm": 0.06778890639543533, "learning_rate": 6.18e-06, "loss": 1.5185, "step": 309 }, { "epoch": 0.09233232189728029, "grad_norm": 0.07153601199388504, "learning_rate": 6.200000000000001e-06, "loss": 1.5356, "step": 310 }, { "epoch": 0.09263016809694893, "grad_norm": 0.07012605667114258, "learning_rate": 6.220000000000001e-06, "loss": 1.5128, "step": 311 }, { "epoch": 0.09292801429661758, "grad_norm": 0.0677528828382492, "learning_rate": 6.24e-06, "loss": 1.5458, "step": 312 }, { "epoch": 0.09322586049628623, "grad_norm": 0.06769208610057831, "learning_rate": 6.26e-06, "loss": 1.5305, "step": 313 }, { "epoch": 0.09352370669595488, "grad_norm": 0.0664294883608818, "learning_rate": 6.280000000000001e-06, "loss": 1.5381, "step": 314 }, { "epoch": 0.09382155289562352, "grad_norm": 0.06461451947689056, "learning_rate": 6.300000000000001e-06, "loss": 1.5049, "step": 315 }, { "epoch": 0.09411939909529217, "grad_norm": 0.06791339814662933, "learning_rate": 6.3200000000000005e-06, "loss": 1.5135, "step": 316 }, { "epoch": 0.09441724529496082, "grad_norm": 0.07396858930587769, "learning_rate": 6.34e-06, "loss": 1.5102, "step": 317 }, { "epoch": 0.09471509149462946, "grad_norm": 0.06848759949207306, "learning_rate": 6.360000000000001e-06, "loss": 1.533, "step": 318 }, { "epoch": 0.09501293769429811, "grad_norm": 0.06650793552398682, "learning_rate": 6.380000000000001e-06, "loss": 1.5063, "step": 319 }, { "epoch": 0.09531078389396676, "grad_norm": 0.06422768533229828, "learning_rate": 6.4000000000000006e-06, "loss": 1.5267, "step": 320 }, { "epoch": 0.0956086300936354, "grad_norm": 0.06654848158359528, "learning_rate": 6.42e-06, "loss": 1.5541, "step": 321 }, { "epoch": 0.09590647629330404, "grad_norm": 0.06139402836561203, "learning_rate": 6.440000000000001e-06, "loss": 1.5116, "step": 322 }, { "epoch": 0.09620432249297269, "grad_norm": 0.06376224011182785, "learning_rate": 6.460000000000001e-06, "loss": 1.5221, "step": 323 }, { "epoch": 0.09650216869264133, "grad_norm": 0.062332574278116226, "learning_rate": 6.480000000000001e-06, "loss": 1.5278, "step": 324 }, { "epoch": 0.09680001489230998, "grad_norm": 0.06041441112756729, "learning_rate": 6.5000000000000004e-06, "loss": 1.5367, "step": 325 }, { "epoch": 0.09709786109197863, "grad_norm": 0.06442257761955261, "learning_rate": 6.520000000000001e-06, "loss": 1.537, "step": 326 }, { "epoch": 0.09739570729164727, "grad_norm": 0.05909942835569382, "learning_rate": 6.540000000000001e-06, "loss": 1.5003, "step": 327 }, { "epoch": 0.09769355349131592, "grad_norm": 0.060887742787599564, "learning_rate": 6.560000000000001e-06, "loss": 1.5192, "step": 328 }, { "epoch": 0.09799139969098457, "grad_norm": 0.06028643622994423, "learning_rate": 6.5800000000000005e-06, "loss": 1.523, "step": 329 }, { "epoch": 0.09828924589065322, "grad_norm": 0.060260046273469925, "learning_rate": 6.600000000000001e-06, "loss": 1.5244, "step": 330 }, { "epoch": 0.09858709209032186, "grad_norm": 0.05909213423728943, "learning_rate": 6.620000000000001e-06, "loss": 1.5181, "step": 331 }, { "epoch": 0.09888493828999051, "grad_norm": 0.05903824791312218, "learning_rate": 6.640000000000001e-06, "loss": 1.5224, "step": 332 }, { "epoch": 0.09918278448965916, "grad_norm": 0.06103455275297165, "learning_rate": 6.660000000000001e-06, "loss": 1.5267, "step": 333 }, { "epoch": 0.0994806306893278, "grad_norm": 0.06424493342638016, "learning_rate": 6.680000000000001e-06, "loss": 1.5262, "step": 334 }, { "epoch": 0.09977847688899645, "grad_norm": 0.06069577485322952, "learning_rate": 6.700000000000001e-06, "loss": 1.5038, "step": 335 }, { "epoch": 0.10007632308866508, "grad_norm": 0.06028122827410698, "learning_rate": 6.720000000000001e-06, "loss": 1.5261, "step": 336 }, { "epoch": 0.10037416928833373, "grad_norm": 0.06266399472951889, "learning_rate": 6.740000000000001e-06, "loss": 1.4917, "step": 337 }, { "epoch": 0.10067201548800238, "grad_norm": 0.05822982266545296, "learning_rate": 6.760000000000001e-06, "loss": 1.5241, "step": 338 }, { "epoch": 0.10096986168767103, "grad_norm": 0.05943974480032921, "learning_rate": 6.780000000000001e-06, "loss": 1.5257, "step": 339 }, { "epoch": 0.10126770788733967, "grad_norm": 0.05773944407701492, "learning_rate": 6.800000000000001e-06, "loss": 1.52, "step": 340 }, { "epoch": 0.10156555408700832, "grad_norm": 0.06203080713748932, "learning_rate": 6.820000000000001e-06, "loss": 1.5058, "step": 341 }, { "epoch": 0.10186340028667697, "grad_norm": 0.10648725926876068, "learning_rate": 6.8400000000000014e-06, "loss": 1.5137, "step": 342 }, { "epoch": 0.10216124648634561, "grad_norm": 0.058104611933231354, "learning_rate": 6.860000000000001e-06, "loss": 1.5397, "step": 343 }, { "epoch": 0.10245909268601426, "grad_norm": 0.05618196353316307, "learning_rate": 6.88e-06, "loss": 1.5125, "step": 344 }, { "epoch": 0.10275693888568291, "grad_norm": 0.056050579994916916, "learning_rate": 6.9e-06, "loss": 1.5162, "step": 345 }, { "epoch": 0.10305478508535156, "grad_norm": 0.05842519551515579, "learning_rate": 6.92e-06, "loss": 1.5138, "step": 346 }, { "epoch": 0.1033526312850202, "grad_norm": 0.05745385214686394, "learning_rate": 6.9400000000000005e-06, "loss": 1.5162, "step": 347 }, { "epoch": 0.10365047748468885, "grad_norm": 0.05770609527826309, "learning_rate": 6.96e-06, "loss": 1.5224, "step": 348 }, { "epoch": 0.1039483236843575, "grad_norm": 0.06014389544725418, "learning_rate": 6.98e-06, "loss": 1.5132, "step": 349 }, { "epoch": 0.10424616988402613, "grad_norm": 0.058073949068784714, "learning_rate": 7e-06, "loss": 1.5185, "step": 350 }, { "epoch": 0.10454401608369478, "grad_norm": 0.05801470950245857, "learning_rate": 7.0200000000000006e-06, "loss": 1.5085, "step": 351 }, { "epoch": 0.10484186228336342, "grad_norm": 0.05637885257601738, "learning_rate": 7.04e-06, "loss": 1.5117, "step": 352 }, { "epoch": 0.10513970848303207, "grad_norm": 0.05973244085907936, "learning_rate": 7.06e-06, "loss": 1.5353, "step": 353 }, { "epoch": 0.10543755468270072, "grad_norm": 0.05698588117957115, "learning_rate": 7.08e-06, "loss": 1.5167, "step": 354 }, { "epoch": 0.10573540088236937, "grad_norm": 0.056220103055238724, "learning_rate": 7.100000000000001e-06, "loss": 1.5156, "step": 355 }, { "epoch": 0.10603324708203801, "grad_norm": 0.13667906820774078, "learning_rate": 7.1200000000000004e-06, "loss": 1.5007, "step": 356 }, { "epoch": 0.10633109328170666, "grad_norm": 0.05707328021526337, "learning_rate": 7.14e-06, "loss": 1.513, "step": 357 }, { "epoch": 0.1066289394813753, "grad_norm": 0.05900765210390091, "learning_rate": 7.16e-06, "loss": 1.501, "step": 358 }, { "epoch": 0.10692678568104395, "grad_norm": 0.054961275309324265, "learning_rate": 7.180000000000001e-06, "loss": 1.5153, "step": 359 }, { "epoch": 0.1072246318807126, "grad_norm": 0.055499982088804245, "learning_rate": 7.2000000000000005e-06, "loss": 1.5183, "step": 360 }, { "epoch": 0.10752247808038125, "grad_norm": 0.057524703443050385, "learning_rate": 7.22e-06, "loss": 1.5185, "step": 361 }, { "epoch": 0.1078203242800499, "grad_norm": 0.056625694036483765, "learning_rate": 7.24e-06, "loss": 1.5234, "step": 362 }, { "epoch": 0.10811817047971854, "grad_norm": 0.05682265758514404, "learning_rate": 7.260000000000001e-06, "loss": 1.5049, "step": 363 }, { "epoch": 0.10841601667938718, "grad_norm": 0.05868418887257576, "learning_rate": 7.280000000000001e-06, "loss": 1.5056, "step": 364 }, { "epoch": 0.10871386287905582, "grad_norm": 0.06089801341295242, "learning_rate": 7.3e-06, "loss": 1.5091, "step": 365 }, { "epoch": 0.10901170907872447, "grad_norm": 0.05808059498667717, "learning_rate": 7.32e-06, "loss": 1.5009, "step": 366 }, { "epoch": 0.10930955527839312, "grad_norm": 0.05970989167690277, "learning_rate": 7.340000000000001e-06, "loss": 1.5084, "step": 367 }, { "epoch": 0.10960740147806176, "grad_norm": 0.06059359386563301, "learning_rate": 7.360000000000001e-06, "loss": 1.5178, "step": 368 }, { "epoch": 0.10990524767773041, "grad_norm": 0.05733639374375343, "learning_rate": 7.3800000000000005e-06, "loss": 1.4836, "step": 369 }, { "epoch": 0.11020309387739906, "grad_norm": 0.0576791875064373, "learning_rate": 7.4e-06, "loss": 1.5061, "step": 370 }, { "epoch": 0.1105009400770677, "grad_norm": 0.059270892292261124, "learning_rate": 7.420000000000001e-06, "loss": 1.5009, "step": 371 }, { "epoch": 0.11079878627673635, "grad_norm": 0.05439314246177673, "learning_rate": 7.440000000000001e-06, "loss": 1.5037, "step": 372 }, { "epoch": 0.111096632476405, "grad_norm": 0.06574559211730957, "learning_rate": 7.4600000000000006e-06, "loss": 1.5103, "step": 373 }, { "epoch": 0.11139447867607365, "grad_norm": 0.05662696808576584, "learning_rate": 7.48e-06, "loss": 1.5117, "step": 374 }, { "epoch": 0.1116923248757423, "grad_norm": 0.05377237871289253, "learning_rate": 7.500000000000001e-06, "loss": 1.5202, "step": 375 }, { "epoch": 0.11199017107541094, "grad_norm": 0.06260445713996887, "learning_rate": 7.520000000000001e-06, "loss": 1.4895, "step": 376 }, { "epoch": 0.11228801727507957, "grad_norm": 0.05700628459453583, "learning_rate": 7.540000000000001e-06, "loss": 1.4807, "step": 377 }, { "epoch": 0.11258586347474822, "grad_norm": 0.05728604272007942, "learning_rate": 7.5600000000000005e-06, "loss": 1.4928, "step": 378 }, { "epoch": 0.11288370967441687, "grad_norm": 0.0564795583486557, "learning_rate": 7.58e-06, "loss": 1.5235, "step": 379 }, { "epoch": 0.11318155587408552, "grad_norm": 0.05639738589525223, "learning_rate": 7.600000000000001e-06, "loss": 1.5046, "step": 380 }, { "epoch": 0.11347940207375416, "grad_norm": 0.06486669927835464, "learning_rate": 7.620000000000001e-06, "loss": 1.5143, "step": 381 }, { "epoch": 0.11377724827342281, "grad_norm": 0.05547960475087166, "learning_rate": 7.640000000000001e-06, "loss": 1.5133, "step": 382 }, { "epoch": 0.11407509447309146, "grad_norm": 0.05486346781253815, "learning_rate": 7.660000000000001e-06, "loss": 1.5082, "step": 383 }, { "epoch": 0.1143729406727601, "grad_norm": 0.05648099631071091, "learning_rate": 7.680000000000001e-06, "loss": 1.4997, "step": 384 }, { "epoch": 0.11467078687242875, "grad_norm": 0.06804478168487549, "learning_rate": 7.7e-06, "loss": 1.5092, "step": 385 }, { "epoch": 0.1149686330720974, "grad_norm": 0.05766845867037773, "learning_rate": 7.72e-06, "loss": 1.4809, "step": 386 }, { "epoch": 0.11526647927176605, "grad_norm": 0.056981757283210754, "learning_rate": 7.74e-06, "loss": 1.5188, "step": 387 }, { "epoch": 0.11556432547143469, "grad_norm": 0.057405441999435425, "learning_rate": 7.76e-06, "loss": 1.5039, "step": 388 }, { "epoch": 0.11586217167110334, "grad_norm": 0.05664507672190666, "learning_rate": 7.78e-06, "loss": 1.4966, "step": 389 }, { "epoch": 0.11616001787077199, "grad_norm": 0.054925765842199326, "learning_rate": 7.800000000000002e-06, "loss": 1.4999, "step": 390 }, { "epoch": 0.11645786407044062, "grad_norm": 0.0627293661236763, "learning_rate": 7.820000000000001e-06, "loss": 1.5253, "step": 391 }, { "epoch": 0.11675571027010927, "grad_norm": 0.0569860115647316, "learning_rate": 7.840000000000001e-06, "loss": 1.501, "step": 392 }, { "epoch": 0.11705355646977791, "grad_norm": 0.053533781319856644, "learning_rate": 7.860000000000001e-06, "loss": 1.4995, "step": 393 }, { "epoch": 0.11735140266944656, "grad_norm": 0.055959541350603104, "learning_rate": 7.88e-06, "loss": 1.4732, "step": 394 }, { "epoch": 0.11764924886911521, "grad_norm": 0.0623549222946167, "learning_rate": 7.9e-06, "loss": 1.4995, "step": 395 }, { "epoch": 0.11794709506878386, "grad_norm": 0.055066898465156555, "learning_rate": 7.92e-06, "loss": 1.481, "step": 396 }, { "epoch": 0.1182449412684525, "grad_norm": 0.053721833974123, "learning_rate": 7.94e-06, "loss": 1.5033, "step": 397 }, { "epoch": 0.11854278746812115, "grad_norm": 0.05427899211645126, "learning_rate": 7.960000000000002e-06, "loss": 1.4925, "step": 398 }, { "epoch": 0.1188406336677898, "grad_norm": 0.05966542661190033, "learning_rate": 7.980000000000002e-06, "loss": 1.4939, "step": 399 }, { "epoch": 0.11913847986745844, "grad_norm": 0.05510552227497101, "learning_rate": 8.000000000000001e-06, "loss": 1.4894, "step": 400 }, { "epoch": 0.11943632606712709, "grad_norm": 0.05756401643157005, "learning_rate": 8.020000000000001e-06, "loss": 1.5015, "step": 401 }, { "epoch": 0.11973417226679574, "grad_norm": 0.06447158753871918, "learning_rate": 8.040000000000001e-06, "loss": 1.4989, "step": 402 }, { "epoch": 0.12003201846646439, "grad_norm": 0.053807564079761505, "learning_rate": 8.06e-06, "loss": 1.4997, "step": 403 }, { "epoch": 0.12032986466613303, "grad_norm": 0.05765663832426071, "learning_rate": 8.08e-06, "loss": 1.5109, "step": 404 }, { "epoch": 0.12062771086580167, "grad_norm": 0.05815264210104942, "learning_rate": 8.1e-06, "loss": 1.5204, "step": 405 }, { "epoch": 0.12092555706547031, "grad_norm": 0.05554782971739769, "learning_rate": 8.120000000000002e-06, "loss": 1.5061, "step": 406 }, { "epoch": 0.12122340326513896, "grad_norm": 0.05318152904510498, "learning_rate": 8.14e-06, "loss": 1.4944, "step": 407 }, { "epoch": 0.1215212494648076, "grad_norm": 0.061731383204460144, "learning_rate": 8.16e-06, "loss": 1.5216, "step": 408 }, { "epoch": 0.12181909566447625, "grad_norm": 0.060279667377471924, "learning_rate": 8.18e-06, "loss": 1.5058, "step": 409 }, { "epoch": 0.1221169418641449, "grad_norm": 0.05318167433142662, "learning_rate": 8.2e-06, "loss": 1.5058, "step": 410 }, { "epoch": 0.12241478806381355, "grad_norm": 0.060518402606248856, "learning_rate": 8.220000000000001e-06, "loss": 1.5043, "step": 411 }, { "epoch": 0.1227126342634822, "grad_norm": 0.0560225285589695, "learning_rate": 8.24e-06, "loss": 1.5003, "step": 412 }, { "epoch": 0.12301048046315084, "grad_norm": 0.0653400719165802, "learning_rate": 8.26e-06, "loss": 1.504, "step": 413 }, { "epoch": 0.12330832666281949, "grad_norm": 0.05497625097632408, "learning_rate": 8.28e-06, "loss": 1.5043, "step": 414 }, { "epoch": 0.12360617286248814, "grad_norm": 0.058193057775497437, "learning_rate": 8.3e-06, "loss": 1.5248, "step": 415 }, { "epoch": 0.12390401906215678, "grad_norm": 0.06820514798164368, "learning_rate": 8.32e-06, "loss": 1.5004, "step": 416 }, { "epoch": 0.12420186526182543, "grad_norm": 0.0610477477312088, "learning_rate": 8.34e-06, "loss": 1.5011, "step": 417 }, { "epoch": 0.12449971146149406, "grad_norm": 0.09810482710599899, "learning_rate": 8.36e-06, "loss": 1.5168, "step": 418 }, { "epoch": 0.12479755766116271, "grad_norm": 0.05371012166142464, "learning_rate": 8.380000000000001e-06, "loss": 1.5134, "step": 419 }, { "epoch": 0.12509540386083137, "grad_norm": 0.0640043094754219, "learning_rate": 8.400000000000001e-06, "loss": 1.4888, "step": 420 }, { "epoch": 0.1253932500605, "grad_norm": 0.08393670618534088, "learning_rate": 8.42e-06, "loss": 1.5245, "step": 421 }, { "epoch": 0.12569109626016867, "grad_norm": 0.05550488829612732, "learning_rate": 8.44e-06, "loss": 1.482, "step": 422 }, { "epoch": 0.1259889424598373, "grad_norm": 0.05311019718647003, "learning_rate": 8.46e-06, "loss": 1.5023, "step": 423 }, { "epoch": 0.12628678865950596, "grad_norm": 0.05454510822892189, "learning_rate": 8.48e-06, "loss": 1.5106, "step": 424 }, { "epoch": 0.1265846348591746, "grad_norm": 0.056061387062072754, "learning_rate": 8.5e-06, "loss": 1.4945, "step": 425 }, { "epoch": 0.12688248105884323, "grad_norm": 0.07110252976417542, "learning_rate": 8.52e-06, "loss": 1.4938, "step": 426 }, { "epoch": 0.1271803272585119, "grad_norm": 0.05501431226730347, "learning_rate": 8.540000000000001e-06, "loss": 1.5179, "step": 427 }, { "epoch": 0.12747817345818052, "grad_norm": 0.07934402674436569, "learning_rate": 8.560000000000001e-06, "loss": 1.4848, "step": 428 }, { "epoch": 0.12777601965784918, "grad_norm": 0.054356809705495834, "learning_rate": 8.580000000000001e-06, "loss": 1.5047, "step": 429 }, { "epoch": 0.12807386585751782, "grad_norm": 0.054320644587278366, "learning_rate": 8.6e-06, "loss": 1.4871, "step": 430 }, { "epoch": 0.12837171205718648, "grad_norm": 0.057648032903671265, "learning_rate": 8.62e-06, "loss": 1.4814, "step": 431 }, { "epoch": 0.1286695582568551, "grad_norm": 0.06109480559825897, "learning_rate": 8.64e-06, "loss": 1.4859, "step": 432 }, { "epoch": 0.12896740445652377, "grad_norm": 0.06258855015039444, "learning_rate": 8.66e-06, "loss": 1.4824, "step": 433 }, { "epoch": 0.1292652506561924, "grad_norm": 0.0565275102853775, "learning_rate": 8.68e-06, "loss": 1.4883, "step": 434 }, { "epoch": 0.12956309685586107, "grad_norm": 0.058759719133377075, "learning_rate": 8.700000000000001e-06, "loss": 1.4909, "step": 435 }, { "epoch": 0.1298609430555297, "grad_norm": 0.06828753650188446, "learning_rate": 8.720000000000001e-06, "loss": 1.5012, "step": 436 }, { "epoch": 0.13015878925519836, "grad_norm": 0.05682528391480446, "learning_rate": 8.740000000000001e-06, "loss": 1.4848, "step": 437 }, { "epoch": 0.130456635454867, "grad_norm": 0.06481580436229706, "learning_rate": 8.76e-06, "loss": 1.5015, "step": 438 }, { "epoch": 0.13075448165453563, "grad_norm": 0.0758729875087738, "learning_rate": 8.78e-06, "loss": 1.491, "step": 439 }, { "epoch": 0.1310523278542043, "grad_norm": 0.058766938745975494, "learning_rate": 8.8e-06, "loss": 1.4808, "step": 440 }, { "epoch": 0.13135017405387292, "grad_norm": 0.07881702482700348, "learning_rate": 8.82e-06, "loss": 1.4945, "step": 441 }, { "epoch": 0.13164802025354158, "grad_norm": 0.06895948201417923, "learning_rate": 8.84e-06, "loss": 1.4932, "step": 442 }, { "epoch": 0.13194586645321021, "grad_norm": 0.05118221789598465, "learning_rate": 8.860000000000002e-06, "loss": 1.4728, "step": 443 }, { "epoch": 0.13224371265287888, "grad_norm": 0.07200183719396591, "learning_rate": 8.880000000000001e-06, "loss": 1.4938, "step": 444 }, { "epoch": 0.1325415588525475, "grad_norm": 0.07630757242441177, "learning_rate": 8.900000000000001e-06, "loss": 1.4919, "step": 445 }, { "epoch": 0.13283940505221617, "grad_norm": 0.057770371437072754, "learning_rate": 8.920000000000001e-06, "loss": 1.4768, "step": 446 }, { "epoch": 0.1331372512518848, "grad_norm": 0.0767252966761589, "learning_rate": 8.94e-06, "loss": 1.4739, "step": 447 }, { "epoch": 0.13343509745155346, "grad_norm": 0.069420225918293, "learning_rate": 8.96e-06, "loss": 1.4904, "step": 448 }, { "epoch": 0.1337329436512221, "grad_norm": 0.05710853636264801, "learning_rate": 8.98e-06, "loss": 1.5044, "step": 449 }, { "epoch": 0.13403078985089076, "grad_norm": 0.10228551924228668, "learning_rate": 9e-06, "loss": 1.4748, "step": 450 }, { "epoch": 0.1343286360505594, "grad_norm": 0.06879130750894547, "learning_rate": 9.020000000000002e-06, "loss": 1.4723, "step": 451 }, { "epoch": 0.13462648225022805, "grad_norm": 0.06307007372379303, "learning_rate": 9.040000000000002e-06, "loss": 1.4817, "step": 452 }, { "epoch": 0.13492432844989669, "grad_norm": 0.08950717747211456, "learning_rate": 9.060000000000001e-06, "loss": 1.4978, "step": 453 }, { "epoch": 0.13522217464956532, "grad_norm": 0.05961303785443306, "learning_rate": 9.080000000000001e-06, "loss": 1.4799, "step": 454 }, { "epoch": 0.13552002084923398, "grad_norm": 0.053586605936288834, "learning_rate": 9.100000000000001e-06, "loss": 1.4979, "step": 455 }, { "epoch": 0.1358178670489026, "grad_norm": 0.08085786551237106, "learning_rate": 9.12e-06, "loss": 1.498, "step": 456 }, { "epoch": 0.13611571324857127, "grad_norm": 0.08234117180109024, "learning_rate": 9.14e-06, "loss": 1.4999, "step": 457 }, { "epoch": 0.1364135594482399, "grad_norm": 0.06344082951545715, "learning_rate": 9.16e-06, "loss": 1.4977, "step": 458 }, { "epoch": 0.13671140564790857, "grad_norm": 0.08588889241218567, "learning_rate": 9.180000000000002e-06, "loss": 1.5045, "step": 459 }, { "epoch": 0.1370092518475772, "grad_norm": 0.09717432409524918, "learning_rate": 9.200000000000002e-06, "loss": 1.5038, "step": 460 }, { "epoch": 0.13730709804724586, "grad_norm": 0.06904091686010361, "learning_rate": 9.220000000000002e-06, "loss": 1.4827, "step": 461 }, { "epoch": 0.1376049442469145, "grad_norm": 0.08623939752578735, "learning_rate": 9.240000000000001e-06, "loss": 1.473, "step": 462 }, { "epoch": 0.13790279044658316, "grad_norm": 0.0879717618227005, "learning_rate": 9.260000000000001e-06, "loss": 1.5091, "step": 463 }, { "epoch": 0.1382006366462518, "grad_norm": 0.06129393354058266, "learning_rate": 9.280000000000001e-06, "loss": 1.4717, "step": 464 }, { "epoch": 0.13849848284592045, "grad_norm": 0.06214470416307449, "learning_rate": 9.3e-06, "loss": 1.5008, "step": 465 }, { "epoch": 0.13879632904558908, "grad_norm": 0.08542024344205856, "learning_rate": 9.32e-06, "loss": 1.5034, "step": 466 }, { "epoch": 0.13909417524525772, "grad_norm": 0.06799095869064331, "learning_rate": 9.340000000000002e-06, "loss": 1.4733, "step": 467 }, { "epoch": 0.13939202144492638, "grad_norm": 0.05802611634135246, "learning_rate": 9.360000000000002e-06, "loss": 1.4819, "step": 468 }, { "epoch": 0.139689867644595, "grad_norm": 0.06662992388010025, "learning_rate": 9.38e-06, "loss": 1.492, "step": 469 }, { "epoch": 0.13998771384426367, "grad_norm": 0.05898972228169441, "learning_rate": 9.4e-06, "loss": 1.493, "step": 470 }, { "epoch": 0.1402855600439323, "grad_norm": 0.05767171084880829, "learning_rate": 9.42e-06, "loss": 1.466, "step": 471 }, { "epoch": 0.14058340624360097, "grad_norm": 0.062400639057159424, "learning_rate": 9.440000000000001e-06, "loss": 1.48, "step": 472 }, { "epoch": 0.1408812524432696, "grad_norm": 0.0686592310667038, "learning_rate": 9.460000000000001e-06, "loss": 1.4856, "step": 473 }, { "epoch": 0.14117909864293826, "grad_norm": 0.06079595908522606, "learning_rate": 9.48e-06, "loss": 1.4846, "step": 474 }, { "epoch": 0.1414769448426069, "grad_norm": 0.06447052955627441, "learning_rate": 9.5e-06, "loss": 1.4845, "step": 475 }, { "epoch": 0.14177479104227556, "grad_norm": 0.06018751487135887, "learning_rate": 9.52e-06, "loss": 1.4913, "step": 476 }, { "epoch": 0.1420726372419442, "grad_norm": 0.06974750012159348, "learning_rate": 9.54e-06, "loss": 1.4878, "step": 477 }, { "epoch": 0.14237048344161285, "grad_norm": 0.06062848120927811, "learning_rate": 9.56e-06, "loss": 1.4897, "step": 478 }, { "epoch": 0.14266832964128148, "grad_norm": 0.06055650860071182, "learning_rate": 9.58e-06, "loss": 1.4746, "step": 479 }, { "epoch": 0.14296617584095012, "grad_norm": 0.06422478705644608, "learning_rate": 9.600000000000001e-06, "loss": 1.4749, "step": 480 }, { "epoch": 0.14326402204061878, "grad_norm": 0.05979803204536438, "learning_rate": 9.620000000000001e-06, "loss": 1.4844, "step": 481 }, { "epoch": 0.1435618682402874, "grad_norm": 0.0778728798031807, "learning_rate": 9.640000000000001e-06, "loss": 1.4919, "step": 482 }, { "epoch": 0.14385971443995607, "grad_norm": 0.06429169327020645, "learning_rate": 9.66e-06, "loss": 1.4987, "step": 483 }, { "epoch": 0.1441575606396247, "grad_norm": 0.05542841926217079, "learning_rate": 9.68e-06, "loss": 1.4737, "step": 484 }, { "epoch": 0.14445540683929337, "grad_norm": 0.07081244140863419, "learning_rate": 9.7e-06, "loss": 1.4816, "step": 485 }, { "epoch": 0.144753253038962, "grad_norm": 0.07182405889034271, "learning_rate": 9.72e-06, "loss": 1.4958, "step": 486 }, { "epoch": 0.14505109923863066, "grad_norm": 0.0578744113445282, "learning_rate": 9.74e-06, "loss": 1.4902, "step": 487 }, { "epoch": 0.1453489454382993, "grad_norm": 0.07073856890201569, "learning_rate": 9.760000000000001e-06, "loss": 1.4954, "step": 488 }, { "epoch": 0.14564679163796795, "grad_norm": 0.0563889779150486, "learning_rate": 9.780000000000001e-06, "loss": 1.4759, "step": 489 }, { "epoch": 0.1459446378376366, "grad_norm": 0.06201820820569992, "learning_rate": 9.800000000000001e-06, "loss": 1.4978, "step": 490 }, { "epoch": 0.14624248403730525, "grad_norm": 0.058224428445100784, "learning_rate": 9.820000000000001e-06, "loss": 1.4554, "step": 491 }, { "epoch": 0.14654033023697388, "grad_norm": 0.05769447609782219, "learning_rate": 9.84e-06, "loss": 1.4685, "step": 492 }, { "epoch": 0.14683817643664254, "grad_norm": 0.06263457983732224, "learning_rate": 9.86e-06, "loss": 1.475, "step": 493 }, { "epoch": 0.14713602263631118, "grad_norm": 0.05658208206295967, "learning_rate": 9.88e-06, "loss": 1.4806, "step": 494 }, { "epoch": 0.1474338688359798, "grad_norm": 0.07487098127603531, "learning_rate": 9.9e-06, "loss": 1.4962, "step": 495 }, { "epoch": 0.14773171503564847, "grad_norm": 0.05984261631965637, "learning_rate": 9.920000000000002e-06, "loss": 1.471, "step": 496 }, { "epoch": 0.1480295612353171, "grad_norm": 0.061114851385354996, "learning_rate": 9.940000000000001e-06, "loss": 1.4734, "step": 497 }, { "epoch": 0.14832740743498576, "grad_norm": 0.06528756022453308, "learning_rate": 9.960000000000001e-06, "loss": 1.4599, "step": 498 }, { "epoch": 0.1486252536346544, "grad_norm": 0.08878826349973679, "learning_rate": 9.980000000000001e-06, "loss": 1.4971, "step": 499 }, { "epoch": 0.14892309983432306, "grad_norm": 0.09886343032121658, "learning_rate": 1e-05, "loss": 1.4828, "step": 500 }, { "epoch": 0.14892309983432306, "eval_loss": 1.4305658340454102, "eval_runtime": 18.358, "eval_samples_per_second": 94.455, "eval_steps_per_second": 5.937, "step": 500 }, { "epoch": 0.1492209460339917, "grad_norm": 0.06291390210390091, "learning_rate": 1.002e-05, "loss": 1.4785, "step": 501 }, { "epoch": 0.14951879223366035, "grad_norm": 0.0753321647644043, "learning_rate": 1.004e-05, "loss": 1.4701, "step": 502 }, { "epoch": 0.14981663843332899, "grad_norm": 0.06809154152870178, "learning_rate": 1.006e-05, "loss": 1.4695, "step": 503 }, { "epoch": 0.15011448463299765, "grad_norm": 0.08590058237314224, "learning_rate": 1.008e-05, "loss": 1.4905, "step": 504 }, { "epoch": 0.15041233083266628, "grad_norm": 0.0781983733177185, "learning_rate": 1.0100000000000002e-05, "loss": 1.4806, "step": 505 }, { "epoch": 0.15071017703233494, "grad_norm": 0.0789913758635521, "learning_rate": 1.0120000000000001e-05, "loss": 1.4838, "step": 506 }, { "epoch": 0.15100802323200357, "grad_norm": 0.07115836441516876, "learning_rate": 1.0140000000000001e-05, "loss": 1.4888, "step": 507 }, { "epoch": 0.1513058694316722, "grad_norm": 0.06764430552721024, "learning_rate": 1.0160000000000001e-05, "loss": 1.4657, "step": 508 }, { "epoch": 0.15160371563134087, "grad_norm": 0.0823146253824234, "learning_rate": 1.018e-05, "loss": 1.4973, "step": 509 }, { "epoch": 0.1519015618310095, "grad_norm": 0.11073504388332367, "learning_rate": 1.02e-05, "loss": 1.4692, "step": 510 }, { "epoch": 0.15219940803067816, "grad_norm": 0.07361211627721786, "learning_rate": 1.022e-05, "loss": 1.4951, "step": 511 }, { "epoch": 0.1524972542303468, "grad_norm": 0.11719920486211777, "learning_rate": 1.024e-05, "loss": 1.4888, "step": 512 }, { "epoch": 0.15279510043001546, "grad_norm": 0.06927847117185593, "learning_rate": 1.0260000000000002e-05, "loss": 1.4728, "step": 513 }, { "epoch": 0.1530929466296841, "grad_norm": 0.0775856226682663, "learning_rate": 1.0280000000000002e-05, "loss": 1.4768, "step": 514 }, { "epoch": 0.15339079282935275, "grad_norm": 0.10022097826004028, "learning_rate": 1.0300000000000001e-05, "loss": 1.4891, "step": 515 }, { "epoch": 0.15368863902902138, "grad_norm": 0.06402795761823654, "learning_rate": 1.0320000000000001e-05, "loss": 1.4642, "step": 516 }, { "epoch": 0.15398648522869005, "grad_norm": 0.07679101824760437, "learning_rate": 1.0340000000000001e-05, "loss": 1.4729, "step": 517 }, { "epoch": 0.15428433142835868, "grad_norm": 0.11688075214624405, "learning_rate": 1.036e-05, "loss": 1.488, "step": 518 }, { "epoch": 0.15458217762802734, "grad_norm": 0.06289079040288925, "learning_rate": 1.038e-05, "loss": 1.4791, "step": 519 }, { "epoch": 0.15488002382769597, "grad_norm": 0.07349809259176254, "learning_rate": 1.04e-05, "loss": 1.4794, "step": 520 }, { "epoch": 0.1551778700273646, "grad_norm": 0.0793452039361, "learning_rate": 1.0420000000000002e-05, "loss": 1.5016, "step": 521 }, { "epoch": 0.15547571622703327, "grad_norm": 0.0695367380976677, "learning_rate": 1.0440000000000002e-05, "loss": 1.4687, "step": 522 }, { "epoch": 0.1557735624267019, "grad_norm": 0.0566876195371151, "learning_rate": 1.0460000000000001e-05, "loss": 1.4522, "step": 523 }, { "epoch": 0.15607140862637056, "grad_norm": 0.06410634517669678, "learning_rate": 1.0480000000000001e-05, "loss": 1.5194, "step": 524 }, { "epoch": 0.1563692548260392, "grad_norm": 0.09583567082881927, "learning_rate": 1.0500000000000001e-05, "loss": 1.4795, "step": 525 }, { "epoch": 0.15666710102570786, "grad_norm": 0.0846511498093605, "learning_rate": 1.0520000000000001e-05, "loss": 1.4784, "step": 526 }, { "epoch": 0.1569649472253765, "grad_norm": 0.07598602026700974, "learning_rate": 1.054e-05, "loss": 1.4767, "step": 527 }, { "epoch": 0.15726279342504515, "grad_norm": 0.07100868970155716, "learning_rate": 1.056e-05, "loss": 1.4854, "step": 528 }, { "epoch": 0.15756063962471378, "grad_norm": 0.06840056926012039, "learning_rate": 1.0580000000000002e-05, "loss": 1.4487, "step": 529 }, { "epoch": 0.15785848582438244, "grad_norm": 0.072611004114151, "learning_rate": 1.0600000000000002e-05, "loss": 1.4675, "step": 530 }, { "epoch": 0.15815633202405108, "grad_norm": 0.08857429027557373, "learning_rate": 1.0620000000000002e-05, "loss": 1.4517, "step": 531 }, { "epoch": 0.15845417822371974, "grad_norm": 0.0728512555360794, "learning_rate": 1.0640000000000001e-05, "loss": 1.4671, "step": 532 }, { "epoch": 0.15875202442338837, "grad_norm": 0.08091485500335693, "learning_rate": 1.0660000000000001e-05, "loss": 1.4882, "step": 533 }, { "epoch": 0.15904987062305703, "grad_norm": 0.06646449863910675, "learning_rate": 1.0680000000000001e-05, "loss": 1.4777, "step": 534 }, { "epoch": 0.15934771682272567, "grad_norm": 0.06292653828859329, "learning_rate": 1.0700000000000001e-05, "loss": 1.4853, "step": 535 }, { "epoch": 0.1596455630223943, "grad_norm": 0.07076392322778702, "learning_rate": 1.072e-05, "loss": 1.4711, "step": 536 }, { "epoch": 0.15994340922206296, "grad_norm": 0.07309917360544205, "learning_rate": 1.0740000000000002e-05, "loss": 1.4629, "step": 537 }, { "epoch": 0.1602412554217316, "grad_norm": 0.06675314903259277, "learning_rate": 1.0760000000000002e-05, "loss": 1.4567, "step": 538 }, { "epoch": 0.16053910162140025, "grad_norm": 0.06333121657371521, "learning_rate": 1.0780000000000002e-05, "loss": 1.4799, "step": 539 }, { "epoch": 0.1608369478210689, "grad_norm": 0.07681736350059509, "learning_rate": 1.0800000000000002e-05, "loss": 1.4699, "step": 540 }, { "epoch": 0.16113479402073755, "grad_norm": 0.058470066636800766, "learning_rate": 1.0820000000000001e-05, "loss": 1.4713, "step": 541 }, { "epoch": 0.16143264022040618, "grad_norm": 0.06542841345071793, "learning_rate": 1.0840000000000001e-05, "loss": 1.4826, "step": 542 }, { "epoch": 0.16173048642007484, "grad_norm": 0.06384658813476562, "learning_rate": 1.0860000000000001e-05, "loss": 1.4763, "step": 543 }, { "epoch": 0.16202833261974348, "grad_norm": 0.06615178287029266, "learning_rate": 1.0880000000000001e-05, "loss": 1.4606, "step": 544 }, { "epoch": 0.16232617881941214, "grad_norm": 0.06471758335828781, "learning_rate": 1.0900000000000002e-05, "loss": 1.4896, "step": 545 }, { "epoch": 0.16262402501908077, "grad_norm": 0.06672003865242004, "learning_rate": 1.0920000000000002e-05, "loss": 1.4744, "step": 546 }, { "epoch": 0.16292187121874943, "grad_norm": 0.07130942493677139, "learning_rate": 1.0940000000000002e-05, "loss": 1.493, "step": 547 }, { "epoch": 0.16321971741841806, "grad_norm": 0.07708992809057236, "learning_rate": 1.0960000000000002e-05, "loss": 1.4663, "step": 548 }, { "epoch": 0.1635175636180867, "grad_norm": 0.06569571793079376, "learning_rate": 1.0980000000000002e-05, "loss": 1.461, "step": 549 }, { "epoch": 0.16381540981775536, "grad_norm": 0.06068449467420578, "learning_rate": 1.1000000000000001e-05, "loss": 1.462, "step": 550 }, { "epoch": 0.164113256017424, "grad_norm": 0.06866385042667389, "learning_rate": 1.1020000000000001e-05, "loss": 1.4795, "step": 551 }, { "epoch": 0.16441110221709265, "grad_norm": 0.07375936955213547, "learning_rate": 1.1040000000000001e-05, "loss": 1.4568, "step": 552 }, { "epoch": 0.16470894841676129, "grad_norm": 0.07791747152805328, "learning_rate": 1.1060000000000003e-05, "loss": 1.4699, "step": 553 }, { "epoch": 0.16500679461642995, "grad_norm": 0.06984464079141617, "learning_rate": 1.1080000000000002e-05, "loss": 1.4797, "step": 554 }, { "epoch": 0.16530464081609858, "grad_norm": 0.06589505821466446, "learning_rate": 1.1100000000000002e-05, "loss": 1.4671, "step": 555 }, { "epoch": 0.16560248701576724, "grad_norm": 0.08439430594444275, "learning_rate": 1.1120000000000002e-05, "loss": 1.4732, "step": 556 }, { "epoch": 0.16590033321543587, "grad_norm": 0.07184866070747375, "learning_rate": 1.1140000000000002e-05, "loss": 1.4744, "step": 557 }, { "epoch": 0.16619817941510454, "grad_norm": 0.07689396291971207, "learning_rate": 1.1160000000000002e-05, "loss": 1.4657, "step": 558 }, { "epoch": 0.16649602561477317, "grad_norm": 0.06957592815160751, "learning_rate": 1.1180000000000001e-05, "loss": 1.4686, "step": 559 }, { "epoch": 0.16679387181444183, "grad_norm": 0.0719662606716156, "learning_rate": 1.1200000000000001e-05, "loss": 1.476, "step": 560 }, { "epoch": 0.16709171801411046, "grad_norm": 0.08123235404491425, "learning_rate": 1.1220000000000003e-05, "loss": 1.4563, "step": 561 }, { "epoch": 0.1673895642137791, "grad_norm": 0.08388621360063553, "learning_rate": 1.1240000000000002e-05, "loss": 1.469, "step": 562 }, { "epoch": 0.16768741041344776, "grad_norm": 0.06538707762956619, "learning_rate": 1.126e-05, "loss": 1.4779, "step": 563 }, { "epoch": 0.1679852566131164, "grad_norm": 0.0906519740819931, "learning_rate": 1.128e-05, "loss": 1.4712, "step": 564 }, { "epoch": 0.16828310281278505, "grad_norm": 0.08886244148015976, "learning_rate": 1.13e-05, "loss": 1.4566, "step": 565 }, { "epoch": 0.16858094901245368, "grad_norm": 0.06940344721078873, "learning_rate": 1.132e-05, "loss": 1.4622, "step": 566 }, { "epoch": 0.16887879521212235, "grad_norm": 0.07784947752952576, "learning_rate": 1.134e-05, "loss": 1.4923, "step": 567 }, { "epoch": 0.16917664141179098, "grad_norm": 0.08280647546052933, "learning_rate": 1.136e-05, "loss": 1.4729, "step": 568 }, { "epoch": 0.16947448761145964, "grad_norm": 0.0870596393942833, "learning_rate": 1.138e-05, "loss": 1.471, "step": 569 }, { "epoch": 0.16977233381112827, "grad_norm": 0.07927907258272171, "learning_rate": 1.14e-05, "loss": 1.444, "step": 570 }, { "epoch": 0.17007018001079693, "grad_norm": 0.08484699577093124, "learning_rate": 1.142e-05, "loss": 1.4811, "step": 571 }, { "epoch": 0.17036802621046557, "grad_norm": 0.07751280069351196, "learning_rate": 1.144e-05, "loss": 1.4679, "step": 572 }, { "epoch": 0.17066587241013423, "grad_norm": 0.06520108878612518, "learning_rate": 1.146e-05, "loss": 1.471, "step": 573 }, { "epoch": 0.17096371860980286, "grad_norm": 0.09336327761411667, "learning_rate": 1.148e-05, "loss": 1.4566, "step": 574 }, { "epoch": 0.17126156480947152, "grad_norm": 0.0706481784582138, "learning_rate": 1.15e-05, "loss": 1.4694, "step": 575 }, { "epoch": 0.17155941100914016, "grad_norm": 0.11398935317993164, "learning_rate": 1.152e-05, "loss": 1.4729, "step": 576 }, { "epoch": 0.1718572572088088, "grad_norm": 0.07338732481002808, "learning_rate": 1.154e-05, "loss": 1.4707, "step": 577 }, { "epoch": 0.17215510340847745, "grad_norm": 0.07571332901716232, "learning_rate": 1.156e-05, "loss": 1.4594, "step": 578 }, { "epoch": 0.17245294960814608, "grad_norm": 0.07202799618244171, "learning_rate": 1.1580000000000001e-05, "loss": 1.4657, "step": 579 }, { "epoch": 0.17275079580781474, "grad_norm": 0.07837383449077606, "learning_rate": 1.16e-05, "loss": 1.4493, "step": 580 }, { "epoch": 0.17304864200748338, "grad_norm": 0.07715293765068054, "learning_rate": 1.162e-05, "loss": 1.4824, "step": 581 }, { "epoch": 0.17334648820715204, "grad_norm": 0.07711914926767349, "learning_rate": 1.164e-05, "loss": 1.4495, "step": 582 }, { "epoch": 0.17364433440682067, "grad_norm": 0.07181393355131149, "learning_rate": 1.166e-05, "loss": 1.4903, "step": 583 }, { "epoch": 0.17394218060648933, "grad_norm": 0.07789280265569687, "learning_rate": 1.168e-05, "loss": 1.4734, "step": 584 }, { "epoch": 0.17424002680615797, "grad_norm": 0.07323314249515533, "learning_rate": 1.17e-05, "loss": 1.4702, "step": 585 }, { "epoch": 0.17453787300582663, "grad_norm": 0.06602338701486588, "learning_rate": 1.172e-05, "loss": 1.4532, "step": 586 }, { "epoch": 0.17483571920549526, "grad_norm": 0.07128585875034332, "learning_rate": 1.1740000000000001e-05, "loss": 1.456, "step": 587 }, { "epoch": 0.17513356540516392, "grad_norm": 0.06492584943771362, "learning_rate": 1.1760000000000001e-05, "loss": 1.4524, "step": 588 }, { "epoch": 0.17543141160483255, "grad_norm": 0.06519263237714767, "learning_rate": 1.178e-05, "loss": 1.4638, "step": 589 }, { "epoch": 0.1757292578045012, "grad_norm": 0.0794047936797142, "learning_rate": 1.18e-05, "loss": 1.4688, "step": 590 }, { "epoch": 0.17602710400416985, "grad_norm": 0.07813981920480728, "learning_rate": 1.182e-05, "loss": 1.4661, "step": 591 }, { "epoch": 0.17632495020383848, "grad_norm": 0.06432987749576569, "learning_rate": 1.184e-05, "loss": 1.4653, "step": 592 }, { "epoch": 0.17662279640350714, "grad_norm": 0.06692880392074585, "learning_rate": 1.186e-05, "loss": 1.4577, "step": 593 }, { "epoch": 0.17692064260317578, "grad_norm": 0.07275351136922836, "learning_rate": 1.188e-05, "loss": 1.4617, "step": 594 }, { "epoch": 0.17721848880284444, "grad_norm": 0.07653673738241196, "learning_rate": 1.1900000000000001e-05, "loss": 1.4733, "step": 595 }, { "epoch": 0.17751633500251307, "grad_norm": 0.07326718419790268, "learning_rate": 1.1920000000000001e-05, "loss": 1.4585, "step": 596 }, { "epoch": 0.17781418120218173, "grad_norm": 0.07613977789878845, "learning_rate": 1.1940000000000001e-05, "loss": 1.4776, "step": 597 }, { "epoch": 0.17811202740185036, "grad_norm": 0.07090508937835693, "learning_rate": 1.196e-05, "loss": 1.4704, "step": 598 }, { "epoch": 0.17840987360151903, "grad_norm": 0.07558804005384445, "learning_rate": 1.198e-05, "loss": 1.4618, "step": 599 }, { "epoch": 0.17870771980118766, "grad_norm": 0.07647745311260223, "learning_rate": 1.2e-05, "loss": 1.4842, "step": 600 }, { "epoch": 0.17900556600085632, "grad_norm": 0.08317311108112335, "learning_rate": 1.202e-05, "loss": 1.4756, "step": 601 }, { "epoch": 0.17930341220052495, "grad_norm": 0.08071677386760712, "learning_rate": 1.204e-05, "loss": 1.4592, "step": 602 }, { "epoch": 0.1796012584001936, "grad_norm": 0.06859596073627472, "learning_rate": 1.2060000000000001e-05, "loss": 1.4583, "step": 603 }, { "epoch": 0.17989910459986225, "grad_norm": 0.07272063195705414, "learning_rate": 1.2080000000000001e-05, "loss": 1.4582, "step": 604 }, { "epoch": 0.18019695079953088, "grad_norm": 0.2326221913099289, "learning_rate": 1.2100000000000001e-05, "loss": 1.464, "step": 605 }, { "epoch": 0.18049479699919954, "grad_norm": 0.08455421030521393, "learning_rate": 1.2120000000000001e-05, "loss": 1.4555, "step": 606 }, { "epoch": 0.18079264319886817, "grad_norm": 0.08261154592037201, "learning_rate": 1.214e-05, "loss": 1.4779, "step": 607 }, { "epoch": 0.18109048939853684, "grad_norm": 0.08473054319620132, "learning_rate": 1.216e-05, "loss": 1.4533, "step": 608 }, { "epoch": 0.18138833559820547, "grad_norm": 0.10028796643018723, "learning_rate": 1.218e-05, "loss": 1.4532, "step": 609 }, { "epoch": 0.18168618179787413, "grad_norm": 0.07884956896305084, "learning_rate": 1.22e-05, "loss": 1.4874, "step": 610 }, { "epoch": 0.18198402799754276, "grad_norm": 0.0739910900592804, "learning_rate": 1.2220000000000002e-05, "loss": 1.4523, "step": 611 }, { "epoch": 0.18228187419721142, "grad_norm": 0.07816793024539948, "learning_rate": 1.2240000000000001e-05, "loss": 1.4814, "step": 612 }, { "epoch": 0.18257972039688006, "grad_norm": 0.08463511615991592, "learning_rate": 1.2260000000000001e-05, "loss": 1.4639, "step": 613 }, { "epoch": 0.18287756659654872, "grad_norm": 0.0768217220902443, "learning_rate": 1.2280000000000001e-05, "loss": 1.4461, "step": 614 }, { "epoch": 0.18317541279621735, "grad_norm": 0.06891295313835144, "learning_rate": 1.23e-05, "loss": 1.4489, "step": 615 }, { "epoch": 0.183473258995886, "grad_norm": 0.08129339665174484, "learning_rate": 1.232e-05, "loss": 1.4646, "step": 616 }, { "epoch": 0.18377110519555465, "grad_norm": 0.09381826967000961, "learning_rate": 1.234e-05, "loss": 1.4772, "step": 617 }, { "epoch": 0.18406895139522328, "grad_norm": 0.07440833002328873, "learning_rate": 1.236e-05, "loss": 1.453, "step": 618 }, { "epoch": 0.18436679759489194, "grad_norm": 0.07224272936582565, "learning_rate": 1.2380000000000002e-05, "loss": 1.4581, "step": 619 }, { "epoch": 0.18466464379456057, "grad_norm": 0.09223474562168121, "learning_rate": 1.2400000000000002e-05, "loss": 1.4644, "step": 620 }, { "epoch": 0.18496248999422923, "grad_norm": 0.072278693318367, "learning_rate": 1.2420000000000001e-05, "loss": 1.4756, "step": 621 }, { "epoch": 0.18526033619389787, "grad_norm": 0.07935364544391632, "learning_rate": 1.2440000000000001e-05, "loss": 1.4405, "step": 622 }, { "epoch": 0.18555818239356653, "grad_norm": 0.08382735401391983, "learning_rate": 1.2460000000000001e-05, "loss": 1.4551, "step": 623 }, { "epoch": 0.18585602859323516, "grad_norm": 0.09439096599817276, "learning_rate": 1.248e-05, "loss": 1.4753, "step": 624 }, { "epoch": 0.18615387479290382, "grad_norm": 0.08867479115724564, "learning_rate": 1.25e-05, "loss": 1.453, "step": 625 }, { "epoch": 0.18645172099257246, "grad_norm": 0.07792381942272186, "learning_rate": 1.252e-05, "loss": 1.4379, "step": 626 }, { "epoch": 0.18674956719224112, "grad_norm": 0.09601400792598724, "learning_rate": 1.254e-05, "loss": 1.47, "step": 627 }, { "epoch": 0.18704741339190975, "grad_norm": 0.07906622439622879, "learning_rate": 1.2560000000000002e-05, "loss": 1.4668, "step": 628 }, { "epoch": 0.1873452595915784, "grad_norm": 0.0777350515127182, "learning_rate": 1.2580000000000002e-05, "loss": 1.4527, "step": 629 }, { "epoch": 0.18764310579124704, "grad_norm": 0.06617771834135056, "learning_rate": 1.2600000000000001e-05, "loss": 1.4496, "step": 630 }, { "epoch": 0.18794095199091568, "grad_norm": 0.07273231446743011, "learning_rate": 1.2620000000000001e-05, "loss": 1.4607, "step": 631 }, { "epoch": 0.18823879819058434, "grad_norm": 0.08392848074436188, "learning_rate": 1.2640000000000001e-05, "loss": 1.4503, "step": 632 }, { "epoch": 0.18853664439025297, "grad_norm": 0.08314831554889679, "learning_rate": 1.266e-05, "loss": 1.4564, "step": 633 }, { "epoch": 0.18883449058992163, "grad_norm": 0.07674255967140198, "learning_rate": 1.268e-05, "loss": 1.4436, "step": 634 }, { "epoch": 0.18913233678959027, "grad_norm": 0.08189515769481659, "learning_rate": 1.27e-05, "loss": 1.4528, "step": 635 }, { "epoch": 0.18943018298925893, "grad_norm": 0.07610049098730087, "learning_rate": 1.2720000000000002e-05, "loss": 1.4644, "step": 636 }, { "epoch": 0.18972802918892756, "grad_norm": 0.08168883621692657, "learning_rate": 1.2740000000000002e-05, "loss": 1.43, "step": 637 }, { "epoch": 0.19002587538859622, "grad_norm": 0.09663128852844238, "learning_rate": 1.2760000000000001e-05, "loss": 1.4638, "step": 638 }, { "epoch": 0.19032372158826485, "grad_norm": 0.09020671248435974, "learning_rate": 1.2780000000000001e-05, "loss": 1.4427, "step": 639 }, { "epoch": 0.19062156778793352, "grad_norm": 0.08151830732822418, "learning_rate": 1.2800000000000001e-05, "loss": 1.4469, "step": 640 }, { "epoch": 0.19091941398760215, "grad_norm": 0.074073925614357, "learning_rate": 1.2820000000000001e-05, "loss": 1.4487, "step": 641 }, { "epoch": 0.1912172601872708, "grad_norm": 0.08020228892564774, "learning_rate": 1.284e-05, "loss": 1.4499, "step": 642 }, { "epoch": 0.19151510638693944, "grad_norm": 0.0663752481341362, "learning_rate": 1.286e-05, "loss": 1.4403, "step": 643 }, { "epoch": 0.19181295258660808, "grad_norm": 0.07641787081956863, "learning_rate": 1.2880000000000002e-05, "loss": 1.4646, "step": 644 }, { "epoch": 0.19211079878627674, "grad_norm": 0.09084177017211914, "learning_rate": 1.2900000000000002e-05, "loss": 1.4533, "step": 645 }, { "epoch": 0.19240864498594537, "grad_norm": 0.06982532143592834, "learning_rate": 1.2920000000000002e-05, "loss": 1.4484, "step": 646 }, { "epoch": 0.19270649118561403, "grad_norm": 0.07324111461639404, "learning_rate": 1.2940000000000001e-05, "loss": 1.4629, "step": 647 }, { "epoch": 0.19300433738528266, "grad_norm": 0.08842533081769943, "learning_rate": 1.2960000000000001e-05, "loss": 1.4495, "step": 648 }, { "epoch": 0.19330218358495133, "grad_norm": 0.07014186680316925, "learning_rate": 1.2980000000000001e-05, "loss": 1.4647, "step": 649 }, { "epoch": 0.19360002978461996, "grad_norm": 0.06982603669166565, "learning_rate": 1.3000000000000001e-05, "loss": 1.44, "step": 650 }, { "epoch": 0.19389787598428862, "grad_norm": 0.06965212523937225, "learning_rate": 1.302e-05, "loss": 1.4537, "step": 651 }, { "epoch": 0.19419572218395725, "grad_norm": 0.07888025045394897, "learning_rate": 1.3040000000000002e-05, "loss": 1.4553, "step": 652 }, { "epoch": 0.19449356838362591, "grad_norm": 0.0851881206035614, "learning_rate": 1.3060000000000002e-05, "loss": 1.4534, "step": 653 }, { "epoch": 0.19479141458329455, "grad_norm": 0.06838874518871307, "learning_rate": 1.3080000000000002e-05, "loss": 1.4632, "step": 654 }, { "epoch": 0.1950892607829632, "grad_norm": 0.07981374114751816, "learning_rate": 1.3100000000000002e-05, "loss": 1.4357, "step": 655 }, { "epoch": 0.19538710698263184, "grad_norm": 0.0882086381316185, "learning_rate": 1.3120000000000001e-05, "loss": 1.4601, "step": 656 }, { "epoch": 0.1956849531823005, "grad_norm": 0.08774475008249283, "learning_rate": 1.3140000000000001e-05, "loss": 1.4629, "step": 657 }, { "epoch": 0.19598279938196914, "grad_norm": 0.08365315198898315, "learning_rate": 1.3160000000000001e-05, "loss": 1.4605, "step": 658 }, { "epoch": 0.19628064558163777, "grad_norm": 0.09570679068565369, "learning_rate": 1.3180000000000001e-05, "loss": 1.4503, "step": 659 }, { "epoch": 0.19657849178130643, "grad_norm": 0.07877921313047409, "learning_rate": 1.3200000000000002e-05, "loss": 1.4424, "step": 660 }, { "epoch": 0.19687633798097506, "grad_norm": 0.07286708801984787, "learning_rate": 1.3220000000000002e-05, "loss": 1.4396, "step": 661 }, { "epoch": 0.19717418418064372, "grad_norm": 0.07777304947376251, "learning_rate": 1.3240000000000002e-05, "loss": 1.4584, "step": 662 }, { "epoch": 0.19747203038031236, "grad_norm": 0.07005325704813004, "learning_rate": 1.3260000000000002e-05, "loss": 1.4481, "step": 663 }, { "epoch": 0.19776987657998102, "grad_norm": 0.08193511515855789, "learning_rate": 1.3280000000000002e-05, "loss": 1.4664, "step": 664 }, { "epoch": 0.19806772277964965, "grad_norm": 0.08347135782241821, "learning_rate": 1.3300000000000001e-05, "loss": 1.4421, "step": 665 }, { "epoch": 0.1983655689793183, "grad_norm": 0.07640582323074341, "learning_rate": 1.3320000000000001e-05, "loss": 1.4498, "step": 666 }, { "epoch": 0.19866341517898695, "grad_norm": 0.0817837044596672, "learning_rate": 1.3340000000000001e-05, "loss": 1.4395, "step": 667 }, { "epoch": 0.1989612613786556, "grad_norm": 0.07509731501340866, "learning_rate": 1.3360000000000003e-05, "loss": 1.4529, "step": 668 }, { "epoch": 0.19925910757832424, "grad_norm": 0.07815979421138763, "learning_rate": 1.3380000000000002e-05, "loss": 1.4491, "step": 669 }, { "epoch": 0.1995569537779929, "grad_norm": 0.07622452825307846, "learning_rate": 1.3400000000000002e-05, "loss": 1.4553, "step": 670 }, { "epoch": 0.19985479997766153, "grad_norm": 0.07803203910589218, "learning_rate": 1.3420000000000002e-05, "loss": 1.4638, "step": 671 }, { "epoch": 0.20015264617733017, "grad_norm": 0.0738442987203598, "learning_rate": 1.3440000000000002e-05, "loss": 1.4448, "step": 672 }, { "epoch": 0.20045049237699883, "grad_norm": 0.07957897335290909, "learning_rate": 1.3460000000000002e-05, "loss": 1.4426, "step": 673 }, { "epoch": 0.20074833857666746, "grad_norm": 0.07637246698141098, "learning_rate": 1.3480000000000001e-05, "loss": 1.4368, "step": 674 }, { "epoch": 0.20104618477633612, "grad_norm": 0.07760189473628998, "learning_rate": 1.3500000000000001e-05, "loss": 1.4445, "step": 675 }, { "epoch": 0.20134403097600476, "grad_norm": 0.0710621029138565, "learning_rate": 1.3520000000000003e-05, "loss": 1.4443, "step": 676 }, { "epoch": 0.20164187717567342, "grad_norm": 0.07569440454244614, "learning_rate": 1.3540000000000003e-05, "loss": 1.4498, "step": 677 }, { "epoch": 0.20193972337534205, "grad_norm": 0.0767282173037529, "learning_rate": 1.3560000000000002e-05, "loss": 1.4422, "step": 678 }, { "epoch": 0.2022375695750107, "grad_norm": 0.08722124248743057, "learning_rate": 1.3580000000000002e-05, "loss": 1.4596, "step": 679 }, { "epoch": 0.20253541577467934, "grad_norm": 0.09099457412958145, "learning_rate": 1.3600000000000002e-05, "loss": 1.4469, "step": 680 }, { "epoch": 0.202833261974348, "grad_norm": 0.06960950046777725, "learning_rate": 1.3620000000000002e-05, "loss": 1.4701, "step": 681 }, { "epoch": 0.20313110817401664, "grad_norm": 0.07177898287773132, "learning_rate": 1.3640000000000002e-05, "loss": 1.4425, "step": 682 }, { "epoch": 0.2034289543736853, "grad_norm": 0.07588344812393188, "learning_rate": 1.3660000000000001e-05, "loss": 1.4415, "step": 683 }, { "epoch": 0.20372680057335393, "grad_norm": 0.07126078009605408, "learning_rate": 1.3680000000000003e-05, "loss": 1.4592, "step": 684 }, { "epoch": 0.2040246467730226, "grad_norm": 0.0929664671421051, "learning_rate": 1.3700000000000003e-05, "loss": 1.4479, "step": 685 }, { "epoch": 0.20432249297269123, "grad_norm": 0.07565732300281525, "learning_rate": 1.3720000000000002e-05, "loss": 1.4404, "step": 686 }, { "epoch": 0.20462033917235986, "grad_norm": 0.09555564820766449, "learning_rate": 1.3740000000000002e-05, "loss": 1.4419, "step": 687 }, { "epoch": 0.20491818537202852, "grad_norm": 0.0677831768989563, "learning_rate": 1.376e-05, "loss": 1.4457, "step": 688 }, { "epoch": 0.20521603157169716, "grad_norm": 0.07221392542123795, "learning_rate": 1.378e-05, "loss": 1.4456, "step": 689 }, { "epoch": 0.20551387777136582, "grad_norm": 0.07777795195579529, "learning_rate": 1.38e-05, "loss": 1.4432, "step": 690 }, { "epoch": 0.20581172397103445, "grad_norm": 0.07490269839763641, "learning_rate": 1.382e-05, "loss": 1.4417, "step": 691 }, { "epoch": 0.2061095701707031, "grad_norm": 0.079764723777771, "learning_rate": 1.384e-05, "loss": 1.4455, "step": 692 }, { "epoch": 0.20640741637037174, "grad_norm": 0.09166201949119568, "learning_rate": 1.386e-05, "loss": 1.446, "step": 693 }, { "epoch": 0.2067052625700404, "grad_norm": 0.07424476742744446, "learning_rate": 1.3880000000000001e-05, "loss": 1.4458, "step": 694 }, { "epoch": 0.20700310876970904, "grad_norm": 0.07070266455411911, "learning_rate": 1.39e-05, "loss": 1.4336, "step": 695 }, { "epoch": 0.2073009549693777, "grad_norm": 0.07643958926200867, "learning_rate": 1.392e-05, "loss": 1.4536, "step": 696 }, { "epoch": 0.20759880116904633, "grad_norm": 0.07516030222177505, "learning_rate": 1.394e-05, "loss": 1.4408, "step": 697 }, { "epoch": 0.207896647368715, "grad_norm": 0.0756191536784172, "learning_rate": 1.396e-05, "loss": 1.4337, "step": 698 }, { "epoch": 0.20819449356838363, "grad_norm": 0.07343825697898865, "learning_rate": 1.398e-05, "loss": 1.4437, "step": 699 }, { "epoch": 0.20849233976805226, "grad_norm": 0.10146047174930573, "learning_rate": 1.4e-05, "loss": 1.4374, "step": 700 }, { "epoch": 0.20879018596772092, "grad_norm": 0.0799114927649498, "learning_rate": 1.402e-05, "loss": 1.4517, "step": 701 }, { "epoch": 0.20908803216738955, "grad_norm": 0.07850436866283417, "learning_rate": 1.4040000000000001e-05, "loss": 1.445, "step": 702 }, { "epoch": 0.20938587836705821, "grad_norm": 0.07773782312870026, "learning_rate": 1.4060000000000001e-05, "loss": 1.4392, "step": 703 }, { "epoch": 0.20968372456672685, "grad_norm": 0.08914069831371307, "learning_rate": 1.408e-05, "loss": 1.4579, "step": 704 }, { "epoch": 0.2099815707663955, "grad_norm": 0.07327356189489365, "learning_rate": 1.41e-05, "loss": 1.4472, "step": 705 }, { "epoch": 0.21027941696606414, "grad_norm": 0.0727405697107315, "learning_rate": 1.412e-05, "loss": 1.4548, "step": 706 }, { "epoch": 0.2105772631657328, "grad_norm": 0.08400865644216537, "learning_rate": 1.414e-05, "loss": 1.4488, "step": 707 }, { "epoch": 0.21087510936540144, "grad_norm": 0.07869898527860641, "learning_rate": 1.416e-05, "loss": 1.4435, "step": 708 }, { "epoch": 0.2111729555650701, "grad_norm": 0.07953284680843353, "learning_rate": 1.418e-05, "loss": 1.4513, "step": 709 }, { "epoch": 0.21147080176473873, "grad_norm": 0.07338771224021912, "learning_rate": 1.4200000000000001e-05, "loss": 1.4415, "step": 710 }, { "epoch": 0.2117686479644074, "grad_norm": 0.08386941999197006, "learning_rate": 1.4220000000000001e-05, "loss": 1.441, "step": 711 }, { "epoch": 0.21206649416407602, "grad_norm": 0.08398545533418655, "learning_rate": 1.4240000000000001e-05, "loss": 1.4567, "step": 712 }, { "epoch": 0.21236434036374466, "grad_norm": 0.0810646340250969, "learning_rate": 1.426e-05, "loss": 1.4614, "step": 713 }, { "epoch": 0.21266218656341332, "grad_norm": 0.07760893553495407, "learning_rate": 1.428e-05, "loss": 1.434, "step": 714 }, { "epoch": 0.21296003276308195, "grad_norm": 0.07823329418897629, "learning_rate": 1.43e-05, "loss": 1.4318, "step": 715 }, { "epoch": 0.2132578789627506, "grad_norm": 0.08283335715532303, "learning_rate": 1.432e-05, "loss": 1.4331, "step": 716 }, { "epoch": 0.21355572516241925, "grad_norm": 0.07786435633897781, "learning_rate": 1.434e-05, "loss": 1.436, "step": 717 }, { "epoch": 0.2138535713620879, "grad_norm": 0.0875328779220581, "learning_rate": 1.4360000000000001e-05, "loss": 1.4267, "step": 718 }, { "epoch": 0.21415141756175654, "grad_norm": 0.08489986509084702, "learning_rate": 1.4380000000000001e-05, "loss": 1.442, "step": 719 }, { "epoch": 0.2144492637614252, "grad_norm": 0.09320718050003052, "learning_rate": 1.4400000000000001e-05, "loss": 1.4436, "step": 720 }, { "epoch": 0.21474710996109383, "grad_norm": 0.08342552185058594, "learning_rate": 1.4420000000000001e-05, "loss": 1.4347, "step": 721 }, { "epoch": 0.2150449561607625, "grad_norm": 0.08096481114625931, "learning_rate": 1.444e-05, "loss": 1.4307, "step": 722 }, { "epoch": 0.21534280236043113, "grad_norm": 0.0804436206817627, "learning_rate": 1.446e-05, "loss": 1.4415, "step": 723 }, { "epoch": 0.2156406485600998, "grad_norm": 0.0846792608499527, "learning_rate": 1.448e-05, "loss": 1.4508, "step": 724 }, { "epoch": 0.21593849475976842, "grad_norm": 0.07710380107164383, "learning_rate": 1.45e-05, "loss": 1.441, "step": 725 }, { "epoch": 0.21623634095943708, "grad_norm": 0.08490527421236038, "learning_rate": 1.4520000000000002e-05, "loss": 1.4427, "step": 726 }, { "epoch": 0.21653418715910572, "grad_norm": 0.08044250309467316, "learning_rate": 1.4540000000000001e-05, "loss": 1.4409, "step": 727 }, { "epoch": 0.21683203335877435, "grad_norm": 0.08590737730264664, "learning_rate": 1.4560000000000001e-05, "loss": 1.4366, "step": 728 }, { "epoch": 0.217129879558443, "grad_norm": 0.07575635612010956, "learning_rate": 1.4580000000000001e-05, "loss": 1.437, "step": 729 }, { "epoch": 0.21742772575811165, "grad_norm": 0.07428482174873352, "learning_rate": 1.46e-05, "loss": 1.4243, "step": 730 }, { "epoch": 0.2177255719577803, "grad_norm": 0.08661162853240967, "learning_rate": 1.462e-05, "loss": 1.4346, "step": 731 }, { "epoch": 0.21802341815744894, "grad_norm": 0.0743163526058197, "learning_rate": 1.464e-05, "loss": 1.4184, "step": 732 }, { "epoch": 0.2183212643571176, "grad_norm": 0.15677453577518463, "learning_rate": 1.466e-05, "loss": 1.4413, "step": 733 }, { "epoch": 0.21861911055678623, "grad_norm": 0.09177551418542862, "learning_rate": 1.4680000000000002e-05, "loss": 1.4463, "step": 734 }, { "epoch": 0.2189169567564549, "grad_norm": 0.08581575006246567, "learning_rate": 1.4700000000000002e-05, "loss": 1.4566, "step": 735 }, { "epoch": 0.21921480295612353, "grad_norm": 0.07593221962451935, "learning_rate": 1.4720000000000001e-05, "loss": 1.4337, "step": 736 }, { "epoch": 0.2195126491557922, "grad_norm": 0.07464537024497986, "learning_rate": 1.4740000000000001e-05, "loss": 1.4351, "step": 737 }, { "epoch": 0.21981049535546082, "grad_norm": 0.08558041602373123, "learning_rate": 1.4760000000000001e-05, "loss": 1.4554, "step": 738 }, { "epoch": 0.22010834155512948, "grad_norm": 0.0854952484369278, "learning_rate": 1.478e-05, "loss": 1.4363, "step": 739 }, { "epoch": 0.22040618775479812, "grad_norm": 0.08115620911121368, "learning_rate": 1.48e-05, "loss": 1.4449, "step": 740 }, { "epoch": 0.22070403395446675, "grad_norm": 0.09755454212427139, "learning_rate": 1.482e-05, "loss": 1.4229, "step": 741 }, { "epoch": 0.2210018801541354, "grad_norm": 0.0890473797917366, "learning_rate": 1.4840000000000002e-05, "loss": 1.4298, "step": 742 }, { "epoch": 0.22129972635380404, "grad_norm": 0.09477613121271133, "learning_rate": 1.4860000000000002e-05, "loss": 1.4341, "step": 743 }, { "epoch": 0.2215975725534727, "grad_norm": 0.08408848196268082, "learning_rate": 1.4880000000000002e-05, "loss": 1.449, "step": 744 }, { "epoch": 0.22189541875314134, "grad_norm": 0.09292439371347427, "learning_rate": 1.4900000000000001e-05, "loss": 1.4544, "step": 745 }, { "epoch": 0.22219326495281, "grad_norm": 0.0929374247789383, "learning_rate": 1.4920000000000001e-05, "loss": 1.4257, "step": 746 }, { "epoch": 0.22249111115247863, "grad_norm": 0.07992805540561676, "learning_rate": 1.4940000000000001e-05, "loss": 1.4216, "step": 747 }, { "epoch": 0.2227889573521473, "grad_norm": 0.08281844854354858, "learning_rate": 1.496e-05, "loss": 1.4437, "step": 748 }, { "epoch": 0.22308680355181593, "grad_norm": 0.08912403136491776, "learning_rate": 1.498e-05, "loss": 1.4343, "step": 749 }, { "epoch": 0.2233846497514846, "grad_norm": 0.08185693621635437, "learning_rate": 1.5000000000000002e-05, "loss": 1.4339, "step": 750 }, { "epoch": 0.22368249595115322, "grad_norm": 0.09299539029598236, "learning_rate": 1.5020000000000002e-05, "loss": 1.4288, "step": 751 }, { "epoch": 0.22398034215082188, "grad_norm": 0.08689764887094498, "learning_rate": 1.5040000000000002e-05, "loss": 1.4256, "step": 752 }, { "epoch": 0.22427818835049051, "grad_norm": 0.0877898633480072, "learning_rate": 1.5060000000000001e-05, "loss": 1.4254, "step": 753 }, { "epoch": 0.22457603455015915, "grad_norm": 0.08317238837480545, "learning_rate": 1.5080000000000001e-05, "loss": 1.4254, "step": 754 }, { "epoch": 0.2248738807498278, "grad_norm": 0.07793273776769638, "learning_rate": 1.5100000000000001e-05, "loss": 1.4406, "step": 755 }, { "epoch": 0.22517172694949644, "grad_norm": 0.0886562243103981, "learning_rate": 1.5120000000000001e-05, "loss": 1.4486, "step": 756 }, { "epoch": 0.2254695731491651, "grad_norm": 0.08472087979316711, "learning_rate": 1.514e-05, "loss": 1.4171, "step": 757 }, { "epoch": 0.22576741934883374, "grad_norm": 0.09509363025426865, "learning_rate": 1.516e-05, "loss": 1.4207, "step": 758 }, { "epoch": 0.2260652655485024, "grad_norm": 0.09654158353805542, "learning_rate": 1.5180000000000002e-05, "loss": 1.424, "step": 759 }, { "epoch": 0.22636311174817103, "grad_norm": 0.07955440133810043, "learning_rate": 1.5200000000000002e-05, "loss": 1.4335, "step": 760 }, { "epoch": 0.2266609579478397, "grad_norm": 0.09881535917520523, "learning_rate": 1.5220000000000002e-05, "loss": 1.42, "step": 761 }, { "epoch": 0.22695880414750833, "grad_norm": 0.09733070433139801, "learning_rate": 1.5240000000000001e-05, "loss": 1.4325, "step": 762 }, { "epoch": 0.22725665034717699, "grad_norm": 0.08393367379903793, "learning_rate": 1.5260000000000003e-05, "loss": 1.4338, "step": 763 }, { "epoch": 0.22755449654684562, "grad_norm": 0.08710719645023346, "learning_rate": 1.5280000000000003e-05, "loss": 1.4436, "step": 764 }, { "epoch": 0.22785234274651428, "grad_norm": 0.0892837718129158, "learning_rate": 1.5300000000000003e-05, "loss": 1.4317, "step": 765 }, { "epoch": 0.2281501889461829, "grad_norm": 0.09063079208135605, "learning_rate": 1.5320000000000002e-05, "loss": 1.439, "step": 766 }, { "epoch": 0.22844803514585157, "grad_norm": 0.0837310254573822, "learning_rate": 1.5340000000000002e-05, "loss": 1.4154, "step": 767 }, { "epoch": 0.2287458813455202, "grad_norm": 0.0990089401602745, "learning_rate": 1.5360000000000002e-05, "loss": 1.4355, "step": 768 }, { "epoch": 0.22904372754518884, "grad_norm": 0.08116359263658524, "learning_rate": 1.5380000000000002e-05, "loss": 1.4406, "step": 769 }, { "epoch": 0.2293415737448575, "grad_norm": 0.09696918725967407, "learning_rate": 1.54e-05, "loss": 1.4249, "step": 770 }, { "epoch": 0.22963941994452614, "grad_norm": 0.0966629609465599, "learning_rate": 1.542e-05, "loss": 1.4278, "step": 771 }, { "epoch": 0.2299372661441948, "grad_norm": 0.08971633017063141, "learning_rate": 1.544e-05, "loss": 1.4296, "step": 772 }, { "epoch": 0.23023511234386343, "grad_norm": 0.0809473842382431, "learning_rate": 1.546e-05, "loss": 1.4143, "step": 773 }, { "epoch": 0.2305329585435321, "grad_norm": 0.13298514485359192, "learning_rate": 1.548e-05, "loss": 1.4372, "step": 774 }, { "epoch": 0.23083080474320072, "grad_norm": 0.08953984081745148, "learning_rate": 1.55e-05, "loss": 1.4328, "step": 775 }, { "epoch": 0.23112865094286938, "grad_norm": 0.08975512534379959, "learning_rate": 1.552e-05, "loss": 1.4188, "step": 776 }, { "epoch": 0.23142649714253802, "grad_norm": 0.08501532673835754, "learning_rate": 1.554e-05, "loss": 1.435, "step": 777 }, { "epoch": 0.23172434334220668, "grad_norm": 0.08300330489873886, "learning_rate": 1.556e-05, "loss": 1.4346, "step": 778 }, { "epoch": 0.2320221895418753, "grad_norm": 0.08758524805307388, "learning_rate": 1.5580000000000003e-05, "loss": 1.4267, "step": 779 }, { "epoch": 0.23232003574154397, "grad_norm": 0.08660390228033066, "learning_rate": 1.5600000000000003e-05, "loss": 1.4428, "step": 780 }, { "epoch": 0.2326178819412126, "grad_norm": 0.08623852580785751, "learning_rate": 1.5620000000000003e-05, "loss": 1.4178, "step": 781 }, { "epoch": 0.23291572814088124, "grad_norm": 0.09258664399385452, "learning_rate": 1.5640000000000003e-05, "loss": 1.4408, "step": 782 }, { "epoch": 0.2332135743405499, "grad_norm": 0.08016887307167053, "learning_rate": 1.5660000000000003e-05, "loss": 1.4232, "step": 783 }, { "epoch": 0.23351142054021853, "grad_norm": 0.08588265627622604, "learning_rate": 1.5680000000000002e-05, "loss": 1.431, "step": 784 }, { "epoch": 0.2338092667398872, "grad_norm": 0.08261680603027344, "learning_rate": 1.5700000000000002e-05, "loss": 1.4283, "step": 785 }, { "epoch": 0.23410711293955583, "grad_norm": 0.0949244499206543, "learning_rate": 1.5720000000000002e-05, "loss": 1.4268, "step": 786 }, { "epoch": 0.2344049591392245, "grad_norm": 0.10626151412725449, "learning_rate": 1.5740000000000002e-05, "loss": 1.4331, "step": 787 }, { "epoch": 0.23470280533889312, "grad_norm": 0.08633959293365479, "learning_rate": 1.576e-05, "loss": 1.4458, "step": 788 }, { "epoch": 0.23500065153856178, "grad_norm": 0.08385222405195236, "learning_rate": 1.578e-05, "loss": 1.4275, "step": 789 }, { "epoch": 0.23529849773823042, "grad_norm": 0.0932660847902298, "learning_rate": 1.58e-05, "loss": 1.4371, "step": 790 }, { "epoch": 0.23559634393789908, "grad_norm": 0.09389449656009674, "learning_rate": 1.582e-05, "loss": 1.4452, "step": 791 }, { "epoch": 0.2358941901375677, "grad_norm": 0.08671513944864273, "learning_rate": 1.584e-05, "loss": 1.4261, "step": 792 }, { "epoch": 0.23619203633723637, "grad_norm": 0.10053084790706635, "learning_rate": 1.586e-05, "loss": 1.4061, "step": 793 }, { "epoch": 0.236489882536905, "grad_norm": 0.10518268495798111, "learning_rate": 1.588e-05, "loss": 1.4232, "step": 794 }, { "epoch": 0.23678772873657364, "grad_norm": 0.09803315252065659, "learning_rate": 1.5900000000000004e-05, "loss": 1.4485, "step": 795 }, { "epoch": 0.2370855749362423, "grad_norm": 0.09714682400226593, "learning_rate": 1.5920000000000003e-05, "loss": 1.4359, "step": 796 }, { "epoch": 0.23738342113591093, "grad_norm": 0.09825300425291061, "learning_rate": 1.5940000000000003e-05, "loss": 1.4334, "step": 797 }, { "epoch": 0.2376812673355796, "grad_norm": 0.0919112116098404, "learning_rate": 1.5960000000000003e-05, "loss": 1.4379, "step": 798 }, { "epoch": 0.23797911353524823, "grad_norm": 0.08471221476793289, "learning_rate": 1.5980000000000003e-05, "loss": 1.4105, "step": 799 }, { "epoch": 0.2382769597349169, "grad_norm": 0.08819548040628433, "learning_rate": 1.6000000000000003e-05, "loss": 1.444, "step": 800 }, { "epoch": 0.23857480593458552, "grad_norm": 0.09059222787618637, "learning_rate": 1.6020000000000002e-05, "loss": 1.4186, "step": 801 }, { "epoch": 0.23887265213425418, "grad_norm": 0.09318530559539795, "learning_rate": 1.6040000000000002e-05, "loss": 1.4273, "step": 802 }, { "epoch": 0.23917049833392282, "grad_norm": 0.0928410068154335, "learning_rate": 1.6060000000000002e-05, "loss": 1.4338, "step": 803 }, { "epoch": 0.23946834453359148, "grad_norm": 0.09981449693441391, "learning_rate": 1.6080000000000002e-05, "loss": 1.4294, "step": 804 }, { "epoch": 0.2397661907332601, "grad_norm": 0.08160090446472168, "learning_rate": 1.6100000000000002e-05, "loss": 1.4104, "step": 805 }, { "epoch": 0.24006403693292877, "grad_norm": 0.09856461733579636, "learning_rate": 1.612e-05, "loss": 1.4432, "step": 806 }, { "epoch": 0.2403618831325974, "grad_norm": 0.08490412682294846, "learning_rate": 1.614e-05, "loss": 1.439, "step": 807 }, { "epoch": 0.24065972933226606, "grad_norm": 0.10598944127559662, "learning_rate": 1.616e-05, "loss": 1.4408, "step": 808 }, { "epoch": 0.2409575755319347, "grad_norm": 0.09258910268545151, "learning_rate": 1.618e-05, "loss": 1.4108, "step": 809 }, { "epoch": 0.24125542173160333, "grad_norm": 0.08649790287017822, "learning_rate": 1.62e-05, "loss": 1.4151, "step": 810 }, { "epoch": 0.241553267931272, "grad_norm": 0.08332608640193939, "learning_rate": 1.6220000000000004e-05, "loss": 1.4288, "step": 811 }, { "epoch": 0.24185111413094063, "grad_norm": 0.09572703391313553, "learning_rate": 1.6240000000000004e-05, "loss": 1.42, "step": 812 }, { "epoch": 0.2421489603306093, "grad_norm": 0.10475686937570572, "learning_rate": 1.626e-05, "loss": 1.4474, "step": 813 }, { "epoch": 0.24244680653027792, "grad_norm": 0.11400950700044632, "learning_rate": 1.628e-05, "loss": 1.435, "step": 814 }, { "epoch": 0.24274465272994658, "grad_norm": 0.08923758566379547, "learning_rate": 1.63e-05, "loss": 1.4216, "step": 815 }, { "epoch": 0.2430424989296152, "grad_norm": 0.09222014993429184, "learning_rate": 1.632e-05, "loss": 1.4204, "step": 816 }, { "epoch": 0.24334034512928387, "grad_norm": 0.10609360784292221, "learning_rate": 1.634e-05, "loss": 1.4313, "step": 817 }, { "epoch": 0.2436381913289525, "grad_norm": 0.09008601307868958, "learning_rate": 1.636e-05, "loss": 1.4127, "step": 818 }, { "epoch": 0.24393603752862117, "grad_norm": 0.09481731057167053, "learning_rate": 1.638e-05, "loss": 1.4234, "step": 819 }, { "epoch": 0.2442338837282898, "grad_norm": 0.09240791201591492, "learning_rate": 1.64e-05, "loss": 1.4503, "step": 820 }, { "epoch": 0.24453172992795846, "grad_norm": 0.09623836725950241, "learning_rate": 1.6420000000000002e-05, "loss": 1.4357, "step": 821 }, { "epoch": 0.2448295761276271, "grad_norm": 0.09477917104959488, "learning_rate": 1.6440000000000002e-05, "loss": 1.3946, "step": 822 }, { "epoch": 0.24512742232729573, "grad_norm": 0.08368118852376938, "learning_rate": 1.646e-05, "loss": 1.4181, "step": 823 }, { "epoch": 0.2454252685269644, "grad_norm": 0.08611281961202621, "learning_rate": 1.648e-05, "loss": 1.4196, "step": 824 }, { "epoch": 0.24572311472663302, "grad_norm": 0.09466510266065598, "learning_rate": 1.65e-05, "loss": 1.409, "step": 825 }, { "epoch": 0.24602096092630169, "grad_norm": 0.09006678313016891, "learning_rate": 1.652e-05, "loss": 1.4372, "step": 826 }, { "epoch": 0.24631880712597032, "grad_norm": 0.0944555401802063, "learning_rate": 1.654e-05, "loss": 1.4294, "step": 827 }, { "epoch": 0.24661665332563898, "grad_norm": 0.08576387912034988, "learning_rate": 1.656e-05, "loss": 1.4145, "step": 828 }, { "epoch": 0.2469144995253076, "grad_norm": 0.09978868067264557, "learning_rate": 1.658e-05, "loss": 1.4233, "step": 829 }, { "epoch": 0.24721234572497627, "grad_norm": 0.0882377177476883, "learning_rate": 1.66e-05, "loss": 1.4413, "step": 830 }, { "epoch": 0.2475101919246449, "grad_norm": 0.09680736809968948, "learning_rate": 1.662e-05, "loss": 1.4082, "step": 831 }, { "epoch": 0.24780803812431357, "grad_norm": 0.0893857553601265, "learning_rate": 1.664e-05, "loss": 1.4185, "step": 832 }, { "epoch": 0.2481058843239822, "grad_norm": 0.09749665856361389, "learning_rate": 1.666e-05, "loss": 1.395, "step": 833 }, { "epoch": 0.24840373052365086, "grad_norm": 0.08752049505710602, "learning_rate": 1.668e-05, "loss": 1.429, "step": 834 }, { "epoch": 0.2487015767233195, "grad_norm": 0.10418985784053802, "learning_rate": 1.67e-05, "loss": 1.4238, "step": 835 }, { "epoch": 0.24899942292298813, "grad_norm": 0.09133674949407578, "learning_rate": 1.672e-05, "loss": 1.4041, "step": 836 }, { "epoch": 0.2492972691226568, "grad_norm": 0.08586476743221283, "learning_rate": 1.6740000000000002e-05, "loss": 1.4163, "step": 837 }, { "epoch": 0.24959511532232542, "grad_norm": 0.09824176877737045, "learning_rate": 1.6760000000000002e-05, "loss": 1.441, "step": 838 }, { "epoch": 0.24989296152199408, "grad_norm": 0.08725030720233917, "learning_rate": 1.6780000000000002e-05, "loss": 1.4267, "step": 839 }, { "epoch": 0.25019080772166274, "grad_norm": 0.11314549297094345, "learning_rate": 1.6800000000000002e-05, "loss": 1.4128, "step": 840 }, { "epoch": 0.25048865392133135, "grad_norm": 0.09061966836452484, "learning_rate": 1.682e-05, "loss": 1.4415, "step": 841 }, { "epoch": 0.250786500121, "grad_norm": 0.11773449182510376, "learning_rate": 1.684e-05, "loss": 1.4315, "step": 842 }, { "epoch": 0.25108434632066867, "grad_norm": 0.09932917356491089, "learning_rate": 1.686e-05, "loss": 1.4214, "step": 843 }, { "epoch": 0.25138219252033733, "grad_norm": 0.09133890271186829, "learning_rate": 1.688e-05, "loss": 1.4078, "step": 844 }, { "epoch": 0.25168003872000594, "grad_norm": 0.12656791508197784, "learning_rate": 1.69e-05, "loss": 1.4246, "step": 845 }, { "epoch": 0.2519778849196746, "grad_norm": 0.08804042637348175, "learning_rate": 1.692e-05, "loss": 1.4184, "step": 846 }, { "epoch": 0.25227573111934326, "grad_norm": 0.09051616489887238, "learning_rate": 1.694e-05, "loss": 1.4024, "step": 847 }, { "epoch": 0.2525735773190119, "grad_norm": 0.11514545232057571, "learning_rate": 1.696e-05, "loss": 1.4167, "step": 848 }, { "epoch": 0.2528714235186805, "grad_norm": 0.08954603224992752, "learning_rate": 1.698e-05, "loss": 1.4114, "step": 849 }, { "epoch": 0.2531692697183492, "grad_norm": 0.10602930933237076, "learning_rate": 1.7e-05, "loss": 1.4161, "step": 850 }, { "epoch": 0.25346711591801785, "grad_norm": 0.09775227308273315, "learning_rate": 1.702e-05, "loss": 1.4334, "step": 851 }, { "epoch": 0.25376496211768645, "grad_norm": 0.0997774749994278, "learning_rate": 1.704e-05, "loss": 1.412, "step": 852 }, { "epoch": 0.2540628083173551, "grad_norm": 0.09448961913585663, "learning_rate": 1.7060000000000003e-05, "loss": 1.4162, "step": 853 }, { "epoch": 0.2543606545170238, "grad_norm": 0.10423950850963593, "learning_rate": 1.7080000000000002e-05, "loss": 1.4313, "step": 854 }, { "epoch": 0.25465850071669244, "grad_norm": 0.10520417243242264, "learning_rate": 1.7100000000000002e-05, "loss": 1.4229, "step": 855 }, { "epoch": 0.25495634691636104, "grad_norm": 0.09463100880384445, "learning_rate": 1.7120000000000002e-05, "loss": 1.4128, "step": 856 }, { "epoch": 0.2552541931160297, "grad_norm": 0.09853941947221756, "learning_rate": 1.7140000000000002e-05, "loss": 1.426, "step": 857 }, { "epoch": 0.25555203931569836, "grad_norm": 0.09377727657556534, "learning_rate": 1.7160000000000002e-05, "loss": 1.4198, "step": 858 }, { "epoch": 0.255849885515367, "grad_norm": 0.08863189816474915, "learning_rate": 1.718e-05, "loss": 1.4079, "step": 859 }, { "epoch": 0.25614773171503563, "grad_norm": 0.08952966332435608, "learning_rate": 1.72e-05, "loss": 1.4312, "step": 860 }, { "epoch": 0.2564455779147043, "grad_norm": 0.10895556956529617, "learning_rate": 1.722e-05, "loss": 1.3928, "step": 861 }, { "epoch": 0.25674342411437295, "grad_norm": 0.09976348280906677, "learning_rate": 1.724e-05, "loss": 1.4131, "step": 862 }, { "epoch": 0.2570412703140416, "grad_norm": 0.10217374563217163, "learning_rate": 1.726e-05, "loss": 1.4301, "step": 863 }, { "epoch": 0.2573391165137102, "grad_norm": 0.0923185646533966, "learning_rate": 1.728e-05, "loss": 1.4379, "step": 864 }, { "epoch": 0.2576369627133789, "grad_norm": 0.09548322856426239, "learning_rate": 1.73e-05, "loss": 1.4158, "step": 865 }, { "epoch": 0.25793480891304754, "grad_norm": 0.09015744179487228, "learning_rate": 1.732e-05, "loss": 1.4119, "step": 866 }, { "epoch": 0.25823265511271615, "grad_norm": 0.09577369689941406, "learning_rate": 1.734e-05, "loss": 1.4046, "step": 867 }, { "epoch": 0.2585305013123848, "grad_norm": 0.09768911451101303, "learning_rate": 1.736e-05, "loss": 1.4031, "step": 868 }, { "epoch": 0.25882834751205347, "grad_norm": 0.10957697778940201, "learning_rate": 1.7380000000000003e-05, "loss": 1.4117, "step": 869 }, { "epoch": 0.25912619371172213, "grad_norm": 0.0891515463590622, "learning_rate": 1.7400000000000003e-05, "loss": 1.4045, "step": 870 }, { "epoch": 0.25942403991139074, "grad_norm": 0.09458895772695541, "learning_rate": 1.7420000000000003e-05, "loss": 1.4109, "step": 871 }, { "epoch": 0.2597218861110594, "grad_norm": 0.0916818156838417, "learning_rate": 1.7440000000000002e-05, "loss": 1.4028, "step": 872 }, { "epoch": 0.26001973231072806, "grad_norm": 0.10320673137903214, "learning_rate": 1.7460000000000002e-05, "loss": 1.4187, "step": 873 }, { "epoch": 0.2603175785103967, "grad_norm": 0.099599190056324, "learning_rate": 1.7480000000000002e-05, "loss": 1.4265, "step": 874 }, { "epoch": 0.2606154247100653, "grad_norm": 0.09808894246816635, "learning_rate": 1.7500000000000002e-05, "loss": 1.4232, "step": 875 }, { "epoch": 0.260913270909734, "grad_norm": 0.09341083467006683, "learning_rate": 1.752e-05, "loss": 1.4233, "step": 876 }, { "epoch": 0.26121111710940265, "grad_norm": 0.10195144265890121, "learning_rate": 1.754e-05, "loss": 1.4139, "step": 877 }, { "epoch": 0.26150896330907125, "grad_norm": 0.09913980960845947, "learning_rate": 1.756e-05, "loss": 1.4196, "step": 878 }, { "epoch": 0.2618068095087399, "grad_norm": 0.09989370405673981, "learning_rate": 1.758e-05, "loss": 1.4288, "step": 879 }, { "epoch": 0.2621046557084086, "grad_norm": 0.10181991010904312, "learning_rate": 1.76e-05, "loss": 1.4175, "step": 880 }, { "epoch": 0.26240250190807723, "grad_norm": 0.09663712233304977, "learning_rate": 1.762e-05, "loss": 1.4406, "step": 881 }, { "epoch": 0.26270034810774584, "grad_norm": 0.09889239817857742, "learning_rate": 1.764e-05, "loss": 1.4101, "step": 882 }, { "epoch": 0.2629981943074145, "grad_norm": 0.0971718281507492, "learning_rate": 1.766e-05, "loss": 1.4217, "step": 883 }, { "epoch": 0.26329604050708316, "grad_norm": 0.095889151096344, "learning_rate": 1.768e-05, "loss": 1.4181, "step": 884 }, { "epoch": 0.2635938867067518, "grad_norm": 0.09541714191436768, "learning_rate": 1.77e-05, "loss": 1.4212, "step": 885 }, { "epoch": 0.26389173290642043, "grad_norm": 0.09994743764400482, "learning_rate": 1.7720000000000003e-05, "loss": 1.4147, "step": 886 }, { "epoch": 0.2641895791060891, "grad_norm": 0.09512241184711456, "learning_rate": 1.7740000000000003e-05, "loss": 1.4326, "step": 887 }, { "epoch": 0.26448742530575775, "grad_norm": 0.09152122586965561, "learning_rate": 1.7760000000000003e-05, "loss": 1.4007, "step": 888 }, { "epoch": 0.2647852715054264, "grad_norm": 0.10552559047937393, "learning_rate": 1.7780000000000003e-05, "loss": 1.4241, "step": 889 }, { "epoch": 0.265083117705095, "grad_norm": 0.0983191430568695, "learning_rate": 1.7800000000000002e-05, "loss": 1.3978, "step": 890 }, { "epoch": 0.2653809639047637, "grad_norm": 0.09858065098524094, "learning_rate": 1.7820000000000002e-05, "loss": 1.4227, "step": 891 }, { "epoch": 0.26567881010443234, "grad_norm": 0.0938938781619072, "learning_rate": 1.7840000000000002e-05, "loss": 1.3974, "step": 892 }, { "epoch": 0.26597665630410094, "grad_norm": 0.09322497993707657, "learning_rate": 1.7860000000000002e-05, "loss": 1.408, "step": 893 }, { "epoch": 0.2662745025037696, "grad_norm": 0.10351376980543137, "learning_rate": 1.788e-05, "loss": 1.4184, "step": 894 }, { "epoch": 0.26657234870343827, "grad_norm": 0.1024598702788353, "learning_rate": 1.79e-05, "loss": 1.4024, "step": 895 }, { "epoch": 0.2668701949031069, "grad_norm": 0.0989329069852829, "learning_rate": 1.792e-05, "loss": 1.4221, "step": 896 }, { "epoch": 0.26716804110277553, "grad_norm": 0.1000707745552063, "learning_rate": 1.794e-05, "loss": 1.4036, "step": 897 }, { "epoch": 0.2674658873024442, "grad_norm": 0.0954870656132698, "learning_rate": 1.796e-05, "loss": 1.4084, "step": 898 }, { "epoch": 0.26776373350211286, "grad_norm": 0.09943860024213791, "learning_rate": 1.798e-05, "loss": 1.4081, "step": 899 }, { "epoch": 0.2680615797017815, "grad_norm": 0.09641280025243759, "learning_rate": 1.8e-05, "loss": 1.4056, "step": 900 }, { "epoch": 0.2683594259014501, "grad_norm": 0.09933046251535416, "learning_rate": 1.802e-05, "loss": 1.4015, "step": 901 }, { "epoch": 0.2686572721011188, "grad_norm": 0.09325650334358215, "learning_rate": 1.8040000000000003e-05, "loss": 1.4166, "step": 902 }, { "epoch": 0.26895511830078744, "grad_norm": 0.09692834317684174, "learning_rate": 1.8060000000000003e-05, "loss": 1.3878, "step": 903 }, { "epoch": 0.2692529645004561, "grad_norm": 0.0937558114528656, "learning_rate": 1.8080000000000003e-05, "loss": 1.4024, "step": 904 }, { "epoch": 0.2695508107001247, "grad_norm": 0.10609590262174606, "learning_rate": 1.8100000000000003e-05, "loss": 1.394, "step": 905 }, { "epoch": 0.26984865689979337, "grad_norm": 0.10428649187088013, "learning_rate": 1.8120000000000003e-05, "loss": 1.409, "step": 906 }, { "epoch": 0.27014650309946203, "grad_norm": 0.10072480142116547, "learning_rate": 1.8140000000000003e-05, "loss": 1.4273, "step": 907 }, { "epoch": 0.27044434929913064, "grad_norm": 0.10124707221984863, "learning_rate": 1.8160000000000002e-05, "loss": 1.4101, "step": 908 }, { "epoch": 0.2707421954987993, "grad_norm": 0.11809855699539185, "learning_rate": 1.8180000000000002e-05, "loss": 1.4181, "step": 909 }, { "epoch": 0.27104004169846796, "grad_norm": 0.10658524185419083, "learning_rate": 1.8200000000000002e-05, "loss": 1.4127, "step": 910 }, { "epoch": 0.2713378878981366, "grad_norm": 0.09508049488067627, "learning_rate": 1.8220000000000002e-05, "loss": 1.4078, "step": 911 }, { "epoch": 0.2716357340978052, "grad_norm": 0.09699340164661407, "learning_rate": 1.824e-05, "loss": 1.4023, "step": 912 }, { "epoch": 0.2719335802974739, "grad_norm": 0.10190922766923904, "learning_rate": 1.826e-05, "loss": 1.3913, "step": 913 }, { "epoch": 0.27223142649714255, "grad_norm": 0.09622354805469513, "learning_rate": 1.828e-05, "loss": 1.3964, "step": 914 }, { "epoch": 0.2725292726968112, "grad_norm": 0.10256228595972061, "learning_rate": 1.83e-05, "loss": 1.4014, "step": 915 }, { "epoch": 0.2728271188964798, "grad_norm": 0.10841874033212662, "learning_rate": 1.832e-05, "loss": 1.3959, "step": 916 }, { "epoch": 0.2731249650961485, "grad_norm": 0.10006251186132431, "learning_rate": 1.834e-05, "loss": 1.3897, "step": 917 }, { "epoch": 0.27342281129581714, "grad_norm": 0.10497182607650757, "learning_rate": 1.8360000000000004e-05, "loss": 1.4088, "step": 918 }, { "epoch": 0.27372065749548574, "grad_norm": 0.09609287977218628, "learning_rate": 1.8380000000000004e-05, "loss": 1.4052, "step": 919 }, { "epoch": 0.2740185036951544, "grad_norm": 0.09894520044326782, "learning_rate": 1.8400000000000003e-05, "loss": 1.4145, "step": 920 }, { "epoch": 0.27431634989482306, "grad_norm": 0.11151731759309769, "learning_rate": 1.8420000000000003e-05, "loss": 1.4094, "step": 921 }, { "epoch": 0.2746141960944917, "grad_norm": 0.11131482571363449, "learning_rate": 1.8440000000000003e-05, "loss": 1.389, "step": 922 }, { "epoch": 0.27491204229416033, "grad_norm": 0.1114390641450882, "learning_rate": 1.8460000000000003e-05, "loss": 1.4065, "step": 923 }, { "epoch": 0.275209888493829, "grad_norm": 0.10770322382450104, "learning_rate": 1.8480000000000003e-05, "loss": 1.4018, "step": 924 }, { "epoch": 0.27550773469349765, "grad_norm": 0.11392100900411606, "learning_rate": 1.8500000000000002e-05, "loss": 1.4076, "step": 925 }, { "epoch": 0.2758055808931663, "grad_norm": 0.11138052493333817, "learning_rate": 1.8520000000000002e-05, "loss": 1.4204, "step": 926 }, { "epoch": 0.2761034270928349, "grad_norm": 0.09689308702945709, "learning_rate": 1.8540000000000002e-05, "loss": 1.4126, "step": 927 }, { "epoch": 0.2764012732925036, "grad_norm": 0.10757928341627121, "learning_rate": 1.8560000000000002e-05, "loss": 1.4057, "step": 928 }, { "epoch": 0.27669911949217224, "grad_norm": 0.10798566043376923, "learning_rate": 1.858e-05, "loss": 1.4037, "step": 929 }, { "epoch": 0.2769969656918409, "grad_norm": 0.09981115907430649, "learning_rate": 1.86e-05, "loss": 1.4028, "step": 930 }, { "epoch": 0.2772948118915095, "grad_norm": 0.10636036098003387, "learning_rate": 1.862e-05, "loss": 1.4002, "step": 931 }, { "epoch": 0.27759265809117817, "grad_norm": 0.10867718607187271, "learning_rate": 1.864e-05, "loss": 1.3935, "step": 932 }, { "epoch": 0.27789050429084683, "grad_norm": 0.0998225212097168, "learning_rate": 1.866e-05, "loss": 1.4056, "step": 933 }, { "epoch": 0.27818835049051543, "grad_norm": 0.09779660403728485, "learning_rate": 1.8680000000000004e-05, "loss": 1.4096, "step": 934 }, { "epoch": 0.2784861966901841, "grad_norm": 0.10754135251045227, "learning_rate": 1.8700000000000004e-05, "loss": 1.4028, "step": 935 }, { "epoch": 0.27878404288985276, "grad_norm": 0.09656241536140442, "learning_rate": 1.8720000000000004e-05, "loss": 1.3919, "step": 936 }, { "epoch": 0.2790818890895214, "grad_norm": 0.1022043228149414, "learning_rate": 1.8740000000000004e-05, "loss": 1.4003, "step": 937 }, { "epoch": 0.27937973528919, "grad_norm": 0.10005227476358414, "learning_rate": 1.876e-05, "loss": 1.397, "step": 938 }, { "epoch": 0.2796775814888587, "grad_norm": 0.10625877231359482, "learning_rate": 1.878e-05, "loss": 1.389, "step": 939 }, { "epoch": 0.27997542768852735, "grad_norm": 0.09828570485115051, "learning_rate": 1.88e-05, "loss": 1.3926, "step": 940 }, { "epoch": 0.280273273888196, "grad_norm": 0.09796032309532166, "learning_rate": 1.882e-05, "loss": 1.4054, "step": 941 }, { "epoch": 0.2805711200878646, "grad_norm": 0.10021141171455383, "learning_rate": 1.884e-05, "loss": 1.3962, "step": 942 }, { "epoch": 0.2808689662875333, "grad_norm": 0.10457919538021088, "learning_rate": 1.886e-05, "loss": 1.3957, "step": 943 }, { "epoch": 0.28116681248720193, "grad_norm": 0.10513138771057129, "learning_rate": 1.8880000000000002e-05, "loss": 1.4013, "step": 944 }, { "epoch": 0.2814646586868706, "grad_norm": 0.10234611481428146, "learning_rate": 1.8900000000000002e-05, "loss": 1.4118, "step": 945 }, { "epoch": 0.2817625048865392, "grad_norm": 0.09986097365617752, "learning_rate": 1.8920000000000002e-05, "loss": 1.3906, "step": 946 }, { "epoch": 0.28206035108620786, "grad_norm": 0.10012020915746689, "learning_rate": 1.894e-05, "loss": 1.4008, "step": 947 }, { "epoch": 0.2823581972858765, "grad_norm": 0.10664036870002747, "learning_rate": 1.896e-05, "loss": 1.4044, "step": 948 }, { "epoch": 0.2826560434855451, "grad_norm": 0.10882434248924255, "learning_rate": 1.898e-05, "loss": 1.3903, "step": 949 }, { "epoch": 0.2829538896852138, "grad_norm": 0.1019265428185463, "learning_rate": 1.9e-05, "loss": 1.3986, "step": 950 }, { "epoch": 0.28325173588488245, "grad_norm": 0.09696544706821442, "learning_rate": 1.902e-05, "loss": 1.3848, "step": 951 }, { "epoch": 0.2835495820845511, "grad_norm": 0.10864205658435822, "learning_rate": 1.904e-05, "loss": 1.4086, "step": 952 }, { "epoch": 0.2838474282842197, "grad_norm": 0.1046675369143486, "learning_rate": 1.906e-05, "loss": 1.4066, "step": 953 }, { "epoch": 0.2841452744838884, "grad_norm": 0.10185607522726059, "learning_rate": 1.908e-05, "loss": 1.4088, "step": 954 }, { "epoch": 0.28444312068355704, "grad_norm": 0.10509466379880905, "learning_rate": 1.91e-05, "loss": 1.4144, "step": 955 }, { "epoch": 0.2847409668832257, "grad_norm": 0.11369527131319046, "learning_rate": 1.912e-05, "loss": 1.42, "step": 956 }, { "epoch": 0.2850388130828943, "grad_norm": 0.1027270033955574, "learning_rate": 1.914e-05, "loss": 1.3971, "step": 957 }, { "epoch": 0.28533665928256297, "grad_norm": 0.11005938798189163, "learning_rate": 1.916e-05, "loss": 1.3954, "step": 958 }, { "epoch": 0.2856345054822316, "grad_norm": 0.10584299266338348, "learning_rate": 1.918e-05, "loss": 1.4025, "step": 959 }, { "epoch": 0.28593235168190023, "grad_norm": 0.09938495606184006, "learning_rate": 1.9200000000000003e-05, "loss": 1.404, "step": 960 }, { "epoch": 0.2862301978815689, "grad_norm": 0.10662488639354706, "learning_rate": 1.9220000000000002e-05, "loss": 1.3897, "step": 961 }, { "epoch": 0.28652804408123755, "grad_norm": 0.11186008900403976, "learning_rate": 1.9240000000000002e-05, "loss": 1.3969, "step": 962 }, { "epoch": 0.2868258902809062, "grad_norm": 0.11389771103858948, "learning_rate": 1.9260000000000002e-05, "loss": 1.4106, "step": 963 }, { "epoch": 0.2871237364805748, "grad_norm": 0.10690823197364807, "learning_rate": 1.9280000000000002e-05, "loss": 1.4152, "step": 964 }, { "epoch": 0.2874215826802435, "grad_norm": 0.10708753764629364, "learning_rate": 1.93e-05, "loss": 1.3998, "step": 965 }, { "epoch": 0.28771942887991214, "grad_norm": 0.11222704499959946, "learning_rate": 1.932e-05, "loss": 1.3986, "step": 966 }, { "epoch": 0.2880172750795808, "grad_norm": 0.10319394618272781, "learning_rate": 1.934e-05, "loss": 1.3831, "step": 967 }, { "epoch": 0.2883151212792494, "grad_norm": 0.10459738224744797, "learning_rate": 1.936e-05, "loss": 1.4113, "step": 968 }, { "epoch": 0.28861296747891807, "grad_norm": 0.10582360625267029, "learning_rate": 1.938e-05, "loss": 1.4131, "step": 969 }, { "epoch": 0.28891081367858673, "grad_norm": 0.10391835868358612, "learning_rate": 1.94e-05, "loss": 1.4068, "step": 970 }, { "epoch": 0.2892086598782554, "grad_norm": 0.1056375801563263, "learning_rate": 1.942e-05, "loss": 1.3891, "step": 971 }, { "epoch": 0.289506506077924, "grad_norm": 0.10760103911161423, "learning_rate": 1.944e-05, "loss": 1.4115, "step": 972 }, { "epoch": 0.28980435227759266, "grad_norm": 0.10337383300065994, "learning_rate": 1.946e-05, "loss": 1.3781, "step": 973 }, { "epoch": 0.2901021984772613, "grad_norm": 0.1072370707988739, "learning_rate": 1.948e-05, "loss": 1.3953, "step": 974 }, { "epoch": 0.2904000446769299, "grad_norm": 0.10368131101131439, "learning_rate": 1.95e-05, "loss": 1.4046, "step": 975 }, { "epoch": 0.2906978908765986, "grad_norm": 0.10708404332399368, "learning_rate": 1.9520000000000003e-05, "loss": 1.3919, "step": 976 }, { "epoch": 0.29099573707626725, "grad_norm": 0.1009281799197197, "learning_rate": 1.9540000000000003e-05, "loss": 1.4213, "step": 977 }, { "epoch": 0.2912935832759359, "grad_norm": 0.10308690369129181, "learning_rate": 1.9560000000000002e-05, "loss": 1.3902, "step": 978 }, { "epoch": 0.2915914294756045, "grad_norm": 0.11442205309867859, "learning_rate": 1.9580000000000002e-05, "loss": 1.3841, "step": 979 }, { "epoch": 0.2918892756752732, "grad_norm": 0.11296455562114716, "learning_rate": 1.9600000000000002e-05, "loss": 1.3842, "step": 980 }, { "epoch": 0.29218712187494184, "grad_norm": 0.10912510752677917, "learning_rate": 1.9620000000000002e-05, "loss": 1.3975, "step": 981 }, { "epoch": 0.2924849680746105, "grad_norm": 0.11549566686153412, "learning_rate": 1.9640000000000002e-05, "loss": 1.3844, "step": 982 }, { "epoch": 0.2927828142742791, "grad_norm": 0.10916072130203247, "learning_rate": 1.966e-05, "loss": 1.399, "step": 983 }, { "epoch": 0.29308066047394776, "grad_norm": 0.12139784544706345, "learning_rate": 1.968e-05, "loss": 1.3983, "step": 984 }, { "epoch": 0.2933785066736164, "grad_norm": 0.11861048638820648, "learning_rate": 1.97e-05, "loss": 1.4091, "step": 985 }, { "epoch": 0.2936763528732851, "grad_norm": 0.11576896905899048, "learning_rate": 1.972e-05, "loss": 1.4114, "step": 986 }, { "epoch": 0.2939741990729537, "grad_norm": 0.11256488412618637, "learning_rate": 1.974e-05, "loss": 1.4046, "step": 987 }, { "epoch": 0.29427204527262235, "grad_norm": 0.11060027033090591, "learning_rate": 1.976e-05, "loss": 1.3987, "step": 988 }, { "epoch": 0.294569891472291, "grad_norm": 0.10744709521532059, "learning_rate": 1.978e-05, "loss": 1.3953, "step": 989 }, { "epoch": 0.2948677376719596, "grad_norm": 0.109938845038414, "learning_rate": 1.98e-05, "loss": 1.3942, "step": 990 }, { "epoch": 0.2951655838716283, "grad_norm": 0.12279019504785538, "learning_rate": 1.982e-05, "loss": 1.3926, "step": 991 }, { "epoch": 0.29546343007129694, "grad_norm": 0.1063704863190651, "learning_rate": 1.9840000000000003e-05, "loss": 1.401, "step": 992 }, { "epoch": 0.2957612762709656, "grad_norm": 0.11839362978935242, "learning_rate": 1.9860000000000003e-05, "loss": 1.401, "step": 993 }, { "epoch": 0.2960591224706342, "grad_norm": 0.11602285504341125, "learning_rate": 1.9880000000000003e-05, "loss": 1.3978, "step": 994 }, { "epoch": 0.29635696867030287, "grad_norm": 0.11820555478334427, "learning_rate": 1.9900000000000003e-05, "loss": 1.3963, "step": 995 }, { "epoch": 0.29665481486997153, "grad_norm": 0.10548858344554901, "learning_rate": 1.9920000000000002e-05, "loss": 1.3883, "step": 996 }, { "epoch": 0.2969526610696402, "grad_norm": 0.9518315196037292, "learning_rate": 1.9940000000000002e-05, "loss": 1.4016, "step": 997 }, { "epoch": 0.2972505072693088, "grad_norm": 0.11482294648885727, "learning_rate": 1.9960000000000002e-05, "loss": 1.3948, "step": 998 }, { "epoch": 0.29754835346897746, "grad_norm": 0.12062684446573257, "learning_rate": 1.9980000000000002e-05, "loss": 1.4016, "step": 999 }, { "epoch": 0.2978461996686461, "grad_norm": 0.12081863731145859, "learning_rate": 2e-05, "loss": 1.4047, "step": 1000 }, { "epoch": 0.2978461996686461, "eval_loss": 1.397951602935791, "eval_runtime": 18.6121, "eval_samples_per_second": 93.165, "eval_steps_per_second": 5.856, "step": 1000 }, { "epoch": 0.2981440458683147, "grad_norm": 0.11395775526762009, "learning_rate": 1.9999999953480586e-05, "loss": 1.3953, "step": 1001 }, { "epoch": 0.2984418920679834, "grad_norm": 0.12259924411773682, "learning_rate": 1.9999999813922347e-05, "loss": 1.4078, "step": 1002 }, { "epoch": 0.29873973826765204, "grad_norm": 0.12151668965816498, "learning_rate": 1.999999958132528e-05, "loss": 1.3956, "step": 1003 }, { "epoch": 0.2990375844673207, "grad_norm": 0.11236704140901566, "learning_rate": 1.9999999255689393e-05, "loss": 1.3904, "step": 1004 }, { "epoch": 0.2993354306669893, "grad_norm": 0.11873602867126465, "learning_rate": 1.999999883701468e-05, "loss": 1.384, "step": 1005 }, { "epoch": 0.29963327686665797, "grad_norm": 0.12156610190868378, "learning_rate": 1.9999998325301156e-05, "loss": 1.3934, "step": 1006 }, { "epoch": 0.29993112306632663, "grad_norm": 0.11415664851665497, "learning_rate": 1.9999997720548817e-05, "loss": 1.3992, "step": 1007 }, { "epoch": 0.3002289692659953, "grad_norm": 0.11592312157154083, "learning_rate": 1.9999997022757675e-05, "loss": 1.4118, "step": 1008 }, { "epoch": 0.3005268154656639, "grad_norm": 0.12188813090324402, "learning_rate": 1.999999623192773e-05, "loss": 1.3994, "step": 1009 }, { "epoch": 0.30082466166533256, "grad_norm": 0.11910311132669449, "learning_rate": 1.9999995348058992e-05, "loss": 1.4028, "step": 1010 }, { "epoch": 0.3011225078650012, "grad_norm": 0.10839038342237473, "learning_rate": 1.9999994371151472e-05, "loss": 1.3934, "step": 1011 }, { "epoch": 0.3014203540646699, "grad_norm": 0.12016301602125168, "learning_rate": 1.9999993301205176e-05, "loss": 1.3969, "step": 1012 }, { "epoch": 0.3017182002643385, "grad_norm": 0.1121194139122963, "learning_rate": 1.9999992138220117e-05, "loss": 1.4116, "step": 1013 }, { "epoch": 0.30201604646400715, "grad_norm": 0.11905040591955185, "learning_rate": 1.99999908821963e-05, "loss": 1.4097, "step": 1014 }, { "epoch": 0.3023138926636758, "grad_norm": 0.11118368059396744, "learning_rate": 1.9999989533133743e-05, "loss": 1.3893, "step": 1015 }, { "epoch": 0.3026117388633444, "grad_norm": 0.11388043314218521, "learning_rate": 1.9999988091032456e-05, "loss": 1.4, "step": 1016 }, { "epoch": 0.3029095850630131, "grad_norm": 0.11436357349157333, "learning_rate": 1.9999986555892453e-05, "loss": 1.4029, "step": 1017 }, { "epoch": 0.30320743126268174, "grad_norm": 0.10592655092477798, "learning_rate": 1.9999984927713748e-05, "loss": 1.4006, "step": 1018 }, { "epoch": 0.3035052774623504, "grad_norm": 0.1133023053407669, "learning_rate": 1.9999983206496355e-05, "loss": 1.3881, "step": 1019 }, { "epoch": 0.303803123662019, "grad_norm": 0.1287492960691452, "learning_rate": 1.999998139224029e-05, "loss": 1.3987, "step": 1020 }, { "epoch": 0.30410096986168766, "grad_norm": 0.11255930364131927, "learning_rate": 1.999997948494557e-05, "loss": 1.3882, "step": 1021 }, { "epoch": 0.3043988160613563, "grad_norm": 0.11940372735261917, "learning_rate": 1.9999977484612217e-05, "loss": 1.3873, "step": 1022 }, { "epoch": 0.304696662261025, "grad_norm": 0.1144757941365242, "learning_rate": 1.9999975391240242e-05, "loss": 1.3942, "step": 1023 }, { "epoch": 0.3049945084606936, "grad_norm": 0.1100514829158783, "learning_rate": 1.999997320482967e-05, "loss": 1.4043, "step": 1024 }, { "epoch": 0.30529235466036225, "grad_norm": 0.11846914142370224, "learning_rate": 1.9999970925380526e-05, "loss": 1.3971, "step": 1025 }, { "epoch": 0.3055902008600309, "grad_norm": 0.12528426945209503, "learning_rate": 1.9999968552892815e-05, "loss": 1.3991, "step": 1026 }, { "epoch": 0.3058880470596996, "grad_norm": 0.1267407387495041, "learning_rate": 1.9999966087366575e-05, "loss": 1.3861, "step": 1027 }, { "epoch": 0.3061858932593682, "grad_norm": 0.10988324135541916, "learning_rate": 1.9999963528801826e-05, "loss": 1.4037, "step": 1028 }, { "epoch": 0.30648373945903684, "grad_norm": 0.12478654831647873, "learning_rate": 1.9999960877198585e-05, "loss": 1.3927, "step": 1029 }, { "epoch": 0.3067815856587055, "grad_norm": 0.11915039271116257, "learning_rate": 1.9999958132556882e-05, "loss": 1.3863, "step": 1030 }, { "epoch": 0.3070794318583741, "grad_norm": 0.11628096550703049, "learning_rate": 1.9999955294876738e-05, "loss": 1.3958, "step": 1031 }, { "epoch": 0.30737727805804277, "grad_norm": 0.11177193373441696, "learning_rate": 1.9999952364158184e-05, "loss": 1.3972, "step": 1032 }, { "epoch": 0.30767512425771143, "grad_norm": 0.11739183962345123, "learning_rate": 1.999994934040125e-05, "loss": 1.3969, "step": 1033 }, { "epoch": 0.3079729704573801, "grad_norm": 0.10952276736497879, "learning_rate": 1.9999946223605955e-05, "loss": 1.3966, "step": 1034 }, { "epoch": 0.3082708166570487, "grad_norm": 0.11301985383033752, "learning_rate": 1.9999943013772335e-05, "loss": 1.4064, "step": 1035 }, { "epoch": 0.30856866285671736, "grad_norm": 0.12167651951313019, "learning_rate": 1.999993971090042e-05, "loss": 1.4039, "step": 1036 }, { "epoch": 0.308866509056386, "grad_norm": 0.1168079599738121, "learning_rate": 1.9999936314990235e-05, "loss": 1.4044, "step": 1037 }, { "epoch": 0.3091643552560547, "grad_norm": 0.11724202334880829, "learning_rate": 1.9999932826041813e-05, "loss": 1.3941, "step": 1038 }, { "epoch": 0.3094622014557233, "grad_norm": 0.117185078561306, "learning_rate": 1.9999929244055194e-05, "loss": 1.394, "step": 1039 }, { "epoch": 0.30976004765539195, "grad_norm": 0.1150064691901207, "learning_rate": 1.9999925569030405e-05, "loss": 1.3928, "step": 1040 }, { "epoch": 0.3100578938550606, "grad_norm": 0.11486277729272842, "learning_rate": 1.999992180096748e-05, "loss": 1.3843, "step": 1041 }, { "epoch": 0.3103557400547292, "grad_norm": 0.1181897297501564, "learning_rate": 1.9999917939866455e-05, "loss": 1.3904, "step": 1042 }, { "epoch": 0.3106535862543979, "grad_norm": 0.11689642071723938, "learning_rate": 1.9999913985727362e-05, "loss": 1.366, "step": 1043 }, { "epoch": 0.31095143245406653, "grad_norm": 0.10985434800386429, "learning_rate": 1.9999909938550252e-05, "loss": 1.4004, "step": 1044 }, { "epoch": 0.3112492786537352, "grad_norm": 0.11144163459539413, "learning_rate": 1.9999905798335148e-05, "loss": 1.3762, "step": 1045 }, { "epoch": 0.3115471248534038, "grad_norm": 0.11040697246789932, "learning_rate": 1.9999901565082087e-05, "loss": 1.391, "step": 1046 }, { "epoch": 0.31184497105307246, "grad_norm": 0.11224711686372757, "learning_rate": 1.999989723879112e-05, "loss": 1.4161, "step": 1047 }, { "epoch": 0.3121428172527411, "grad_norm": 0.1203780397772789, "learning_rate": 1.9999892819462282e-05, "loss": 1.3899, "step": 1048 }, { "epoch": 0.3124406634524098, "grad_norm": 0.11567550152540207, "learning_rate": 1.9999888307095615e-05, "loss": 1.384, "step": 1049 }, { "epoch": 0.3127385096520784, "grad_norm": 0.12175623327493668, "learning_rate": 1.9999883701691155e-05, "loss": 1.3764, "step": 1050 }, { "epoch": 0.31303635585174705, "grad_norm": 0.11729805171489716, "learning_rate": 1.9999879003248955e-05, "loss": 1.3944, "step": 1051 }, { "epoch": 0.3133342020514157, "grad_norm": 0.11439957469701767, "learning_rate": 1.999987421176905e-05, "loss": 1.3871, "step": 1052 }, { "epoch": 0.31363204825108437, "grad_norm": 0.11447083204984665, "learning_rate": 1.9999869327251487e-05, "loss": 1.3765, "step": 1053 }, { "epoch": 0.313929894450753, "grad_norm": 0.11776264011859894, "learning_rate": 1.9999864349696315e-05, "loss": 1.3945, "step": 1054 }, { "epoch": 0.31422774065042164, "grad_norm": 0.1123151183128357, "learning_rate": 1.9999859279103576e-05, "loss": 1.3948, "step": 1055 }, { "epoch": 0.3145255868500903, "grad_norm": 0.11624394357204437, "learning_rate": 1.9999854115473324e-05, "loss": 1.4, "step": 1056 }, { "epoch": 0.3148234330497589, "grad_norm": 0.11331585049629211, "learning_rate": 1.9999848858805596e-05, "loss": 1.4026, "step": 1057 }, { "epoch": 0.31512127924942757, "grad_norm": 0.11557549983263016, "learning_rate": 1.999984350910045e-05, "loss": 1.3642, "step": 1058 }, { "epoch": 0.3154191254490962, "grad_norm": 0.1166551485657692, "learning_rate": 1.9999838066357932e-05, "loss": 1.3858, "step": 1059 }, { "epoch": 0.3157169716487649, "grad_norm": 0.11316041648387909, "learning_rate": 1.9999832530578093e-05, "loss": 1.392, "step": 1060 }, { "epoch": 0.3160148178484335, "grad_norm": 0.1141393855214119, "learning_rate": 1.9999826901760985e-05, "loss": 1.3801, "step": 1061 }, { "epoch": 0.31631266404810215, "grad_norm": 0.11633685976266861, "learning_rate": 1.999982117990666e-05, "loss": 1.3999, "step": 1062 }, { "epoch": 0.3166105102477708, "grad_norm": 0.11223947256803513, "learning_rate": 1.999981536501517e-05, "loss": 1.4016, "step": 1063 }, { "epoch": 0.3169083564474395, "grad_norm": 0.11991715431213379, "learning_rate": 1.999980945708657e-05, "loss": 1.3807, "step": 1064 }, { "epoch": 0.3172062026471081, "grad_norm": 0.11510065943002701, "learning_rate": 1.999980345612092e-05, "loss": 1.3888, "step": 1065 }, { "epoch": 0.31750404884677674, "grad_norm": 0.11234598606824875, "learning_rate": 1.9999797362118263e-05, "loss": 1.3844, "step": 1066 }, { "epoch": 0.3178018950464454, "grad_norm": 0.12312228232622147, "learning_rate": 1.9999791175078674e-05, "loss": 1.3841, "step": 1067 }, { "epoch": 0.31809974124611406, "grad_norm": 0.11295381933450699, "learning_rate": 1.9999784895002196e-05, "loss": 1.3809, "step": 1068 }, { "epoch": 0.31839758744578267, "grad_norm": 0.11573398858308792, "learning_rate": 1.9999778521888892e-05, "loss": 1.3875, "step": 1069 }, { "epoch": 0.31869543364545133, "grad_norm": 0.11296920478343964, "learning_rate": 1.999977205573882e-05, "loss": 1.3746, "step": 1070 }, { "epoch": 0.31899327984512, "grad_norm": 0.11616770923137665, "learning_rate": 1.999976549655204e-05, "loss": 1.3809, "step": 1071 }, { "epoch": 0.3192911260447886, "grad_norm": 0.10907561331987381, "learning_rate": 1.9999758844328618e-05, "loss": 1.3877, "step": 1072 }, { "epoch": 0.31958897224445726, "grad_norm": 0.13153153657913208, "learning_rate": 1.999975209906861e-05, "loss": 1.3811, "step": 1073 }, { "epoch": 0.3198868184441259, "grad_norm": 0.11982592195272446, "learning_rate": 1.9999745260772087e-05, "loss": 1.3885, "step": 1074 }, { "epoch": 0.3201846646437946, "grad_norm": 0.12647011876106262, "learning_rate": 1.9999738329439097e-05, "loss": 1.3951, "step": 1075 }, { "epoch": 0.3204825108434632, "grad_norm": 0.11812597513198853, "learning_rate": 1.9999731305069723e-05, "loss": 1.3965, "step": 1076 }, { "epoch": 0.32078035704313185, "grad_norm": 0.11641401052474976, "learning_rate": 1.999972418766402e-05, "loss": 1.4019, "step": 1077 }, { "epoch": 0.3210782032428005, "grad_norm": 0.12187743186950684, "learning_rate": 1.999971697722205e-05, "loss": 1.4009, "step": 1078 }, { "epoch": 0.32137604944246917, "grad_norm": 0.12092571705579758, "learning_rate": 1.999970967374389e-05, "loss": 1.3813, "step": 1079 }, { "epoch": 0.3216738956421378, "grad_norm": 0.12640085816383362, "learning_rate": 1.9999702277229604e-05, "loss": 1.4036, "step": 1080 }, { "epoch": 0.32197174184180644, "grad_norm": 0.13022305071353912, "learning_rate": 1.999969478767926e-05, "loss": 1.4151, "step": 1081 }, { "epoch": 0.3222695880414751, "grad_norm": 0.11454661935567856, "learning_rate": 1.9999687205092926e-05, "loss": 1.3837, "step": 1082 }, { "epoch": 0.3225674342411437, "grad_norm": 0.12185961753129959, "learning_rate": 1.999967952947068e-05, "loss": 1.3805, "step": 1083 }, { "epoch": 0.32286528044081236, "grad_norm": 0.12060558050870895, "learning_rate": 1.9999671760812584e-05, "loss": 1.3811, "step": 1084 }, { "epoch": 0.323163126640481, "grad_norm": 0.12417693436145782, "learning_rate": 1.9999663899118714e-05, "loss": 1.3885, "step": 1085 }, { "epoch": 0.3234609728401497, "grad_norm": 0.1290057897567749, "learning_rate": 1.9999655944389147e-05, "loss": 1.399, "step": 1086 }, { "epoch": 0.3237588190398183, "grad_norm": 0.12008559703826904, "learning_rate": 1.9999647896623954e-05, "loss": 1.3769, "step": 1087 }, { "epoch": 0.32405666523948695, "grad_norm": 0.11745686084032059, "learning_rate": 1.9999639755823207e-05, "loss": 1.3899, "step": 1088 }, { "epoch": 0.3243545114391556, "grad_norm": 0.1250544935464859, "learning_rate": 1.9999631521986983e-05, "loss": 1.3873, "step": 1089 }, { "epoch": 0.3246523576388243, "grad_norm": 0.11275584250688553, "learning_rate": 1.9999623195115364e-05, "loss": 1.3632, "step": 1090 }, { "epoch": 0.3249502038384929, "grad_norm": 0.1159360483288765, "learning_rate": 1.999961477520842e-05, "loss": 1.3642, "step": 1091 }, { "epoch": 0.32524805003816154, "grad_norm": 0.11819259077310562, "learning_rate": 1.999960626226624e-05, "loss": 1.3717, "step": 1092 }, { "epoch": 0.3255458962378302, "grad_norm": 0.124155193567276, "learning_rate": 1.9999597656288886e-05, "loss": 1.4006, "step": 1093 }, { "epoch": 0.32584374243749886, "grad_norm": 0.1236296072602272, "learning_rate": 1.9999588957276455e-05, "loss": 1.3903, "step": 1094 }, { "epoch": 0.32614158863716747, "grad_norm": 0.11855696886777878, "learning_rate": 1.999958016522902e-05, "loss": 1.3944, "step": 1095 }, { "epoch": 0.32643943483683613, "grad_norm": 0.11899484694004059, "learning_rate": 1.999957128014666e-05, "loss": 1.385, "step": 1096 }, { "epoch": 0.3267372810365048, "grad_norm": 0.11923281103372574, "learning_rate": 1.9999562302029462e-05, "loss": 1.4006, "step": 1097 }, { "epoch": 0.3270351272361734, "grad_norm": 0.1309370994567871, "learning_rate": 1.999955323087751e-05, "loss": 1.4025, "step": 1098 }, { "epoch": 0.32733297343584206, "grad_norm": 0.11568192392587662, "learning_rate": 1.999954406669089e-05, "loss": 1.3911, "step": 1099 }, { "epoch": 0.3276308196355107, "grad_norm": 0.12005109339952469, "learning_rate": 1.999953480946968e-05, "loss": 1.3857, "step": 1100 }, { "epoch": 0.3279286658351794, "grad_norm": 0.12727560102939606, "learning_rate": 1.9999525459213975e-05, "loss": 1.3974, "step": 1101 }, { "epoch": 0.328226512034848, "grad_norm": 0.12770545482635498, "learning_rate": 1.9999516015923854e-05, "loss": 1.3924, "step": 1102 }, { "epoch": 0.32852435823451664, "grad_norm": 0.11987710744142532, "learning_rate": 1.999950647959941e-05, "loss": 1.3799, "step": 1103 }, { "epoch": 0.3288222044341853, "grad_norm": 0.11894366145133972, "learning_rate": 1.9999496850240732e-05, "loss": 1.3796, "step": 1104 }, { "epoch": 0.32912005063385397, "grad_norm": 0.1279422640800476, "learning_rate": 1.999948712784791e-05, "loss": 1.3883, "step": 1105 }, { "epoch": 0.32941789683352257, "grad_norm": 0.11908257007598877, "learning_rate": 1.9999477312421026e-05, "loss": 1.3911, "step": 1106 }, { "epoch": 0.32971574303319123, "grad_norm": 0.1226000040769577, "learning_rate": 1.999946740396018e-05, "loss": 1.385, "step": 1107 }, { "epoch": 0.3300135892328599, "grad_norm": 0.1315961480140686, "learning_rate": 1.999945740246546e-05, "loss": 1.3763, "step": 1108 }, { "epoch": 0.33031143543252856, "grad_norm": 0.12968845665454865, "learning_rate": 1.9999447307936967e-05, "loss": 1.3814, "step": 1109 }, { "epoch": 0.33060928163219716, "grad_norm": 0.14246723055839539, "learning_rate": 1.9999437120374784e-05, "loss": 1.3984, "step": 1110 }, { "epoch": 0.3309071278318658, "grad_norm": 0.12243639677762985, "learning_rate": 1.9999426839779013e-05, "loss": 1.3754, "step": 1111 }, { "epoch": 0.3312049740315345, "grad_norm": 0.13210417330265045, "learning_rate": 1.9999416466149747e-05, "loss": 1.3841, "step": 1112 }, { "epoch": 0.3315028202312031, "grad_norm": 0.12163086235523224, "learning_rate": 1.999940599948708e-05, "loss": 1.3716, "step": 1113 }, { "epoch": 0.33180066643087175, "grad_norm": 0.12960121035575867, "learning_rate": 1.9999395439791117e-05, "loss": 1.3929, "step": 1114 }, { "epoch": 0.3320985126305404, "grad_norm": 0.13061104714870453, "learning_rate": 1.9999384787061947e-05, "loss": 1.3651, "step": 1115 }, { "epoch": 0.33239635883020907, "grad_norm": 0.12969915568828583, "learning_rate": 1.9999374041299676e-05, "loss": 1.3643, "step": 1116 }, { "epoch": 0.3326942050298777, "grad_norm": 0.12098529189825058, "learning_rate": 1.99993632025044e-05, "loss": 1.3715, "step": 1117 }, { "epoch": 0.33299205122954634, "grad_norm": 0.1331017166376114, "learning_rate": 1.999935227067622e-05, "loss": 1.4062, "step": 1118 }, { "epoch": 0.333289897429215, "grad_norm": 0.1369689255952835, "learning_rate": 1.9999341245815242e-05, "loss": 1.3818, "step": 1119 }, { "epoch": 0.33358774362888366, "grad_norm": 0.12456752359867096, "learning_rate": 1.9999330127921564e-05, "loss": 1.3734, "step": 1120 }, { "epoch": 0.33388558982855226, "grad_norm": 0.11823807656764984, "learning_rate": 1.9999318916995293e-05, "loss": 1.3964, "step": 1121 }, { "epoch": 0.3341834360282209, "grad_norm": 0.12144974619150162, "learning_rate": 1.999930761303653e-05, "loss": 1.376, "step": 1122 }, { "epoch": 0.3344812822278896, "grad_norm": 0.13049054145812988, "learning_rate": 1.999929621604538e-05, "loss": 1.3864, "step": 1123 }, { "epoch": 0.3347791284275582, "grad_norm": 0.12313073873519897, "learning_rate": 1.9999284726021954e-05, "loss": 1.388, "step": 1124 }, { "epoch": 0.33507697462722685, "grad_norm": 0.12566570937633514, "learning_rate": 1.9999273142966354e-05, "loss": 1.3871, "step": 1125 }, { "epoch": 0.3353748208268955, "grad_norm": 0.11997831612825394, "learning_rate": 1.999926146687869e-05, "loss": 1.3915, "step": 1126 }, { "epoch": 0.3356726670265642, "grad_norm": 0.12298216670751572, "learning_rate": 1.9999249697759068e-05, "loss": 1.3803, "step": 1127 }, { "epoch": 0.3359705132262328, "grad_norm": 0.1251022070646286, "learning_rate": 1.99992378356076e-05, "loss": 1.3808, "step": 1128 }, { "epoch": 0.33626835942590144, "grad_norm": 0.12131714075803757, "learning_rate": 1.9999225880424397e-05, "loss": 1.3875, "step": 1129 }, { "epoch": 0.3365662056255701, "grad_norm": 0.1261511743068695, "learning_rate": 1.9999213832209568e-05, "loss": 1.3805, "step": 1130 }, { "epoch": 0.33686405182523876, "grad_norm": 0.11724023520946503, "learning_rate": 1.9999201690963223e-05, "loss": 1.3816, "step": 1131 }, { "epoch": 0.33716189802490737, "grad_norm": 0.12367585301399231, "learning_rate": 1.999918945668548e-05, "loss": 1.4037, "step": 1132 }, { "epoch": 0.33745974422457603, "grad_norm": 0.12494815140962601, "learning_rate": 1.9999177129376456e-05, "loss": 1.3864, "step": 1133 }, { "epoch": 0.3377575904242447, "grad_norm": 0.122878797352314, "learning_rate": 1.9999164709036255e-05, "loss": 1.3728, "step": 1134 }, { "epoch": 0.33805543662391335, "grad_norm": 0.12429952621459961, "learning_rate": 1.9999152195665e-05, "loss": 1.362, "step": 1135 }, { "epoch": 0.33835328282358196, "grad_norm": 0.12699133157730103, "learning_rate": 1.9999139589262806e-05, "loss": 1.3543, "step": 1136 }, { "epoch": 0.3386511290232506, "grad_norm": 0.12095256894826889, "learning_rate": 1.999912688982979e-05, "loss": 1.3691, "step": 1137 }, { "epoch": 0.3389489752229193, "grad_norm": 0.12273938208818436, "learning_rate": 1.9999114097366066e-05, "loss": 1.3842, "step": 1138 }, { "epoch": 0.3392468214225879, "grad_norm": 0.12688860297203064, "learning_rate": 1.9999101211871762e-05, "loss": 1.3776, "step": 1139 }, { "epoch": 0.33954466762225655, "grad_norm": 0.11873604357242584, "learning_rate": 1.999908823334699e-05, "loss": 1.3854, "step": 1140 }, { "epoch": 0.3398425138219252, "grad_norm": 0.12411191314458847, "learning_rate": 1.9999075161791873e-05, "loss": 1.3716, "step": 1141 }, { "epoch": 0.34014036002159387, "grad_norm": 0.11912590265274048, "learning_rate": 1.999906199720654e-05, "loss": 1.3823, "step": 1142 }, { "epoch": 0.3404382062212625, "grad_norm": 0.11869439482688904, "learning_rate": 1.9999048739591094e-05, "loss": 1.3851, "step": 1143 }, { "epoch": 0.34073605242093113, "grad_norm": 0.12020917981863022, "learning_rate": 1.999903538894568e-05, "loss": 1.3835, "step": 1144 }, { "epoch": 0.3410338986205998, "grad_norm": 0.12506596744060516, "learning_rate": 1.9999021945270412e-05, "loss": 1.3816, "step": 1145 }, { "epoch": 0.34133174482026846, "grad_norm": 0.13336603343486786, "learning_rate": 1.9999008408565415e-05, "loss": 1.3954, "step": 1146 }, { "epoch": 0.34162959101993706, "grad_norm": 0.12509402632713318, "learning_rate": 1.999899477883082e-05, "loss": 1.3796, "step": 1147 }, { "epoch": 0.3419274372196057, "grad_norm": 0.12254216521978378, "learning_rate": 1.9998981056066745e-05, "loss": 1.3821, "step": 1148 }, { "epoch": 0.3422252834192744, "grad_norm": 0.1316823810338974, "learning_rate": 1.999896724027333e-05, "loss": 1.3809, "step": 1149 }, { "epoch": 0.34252312961894305, "grad_norm": 0.12617230415344238, "learning_rate": 1.999895333145069e-05, "loss": 1.3716, "step": 1150 }, { "epoch": 0.34282097581861165, "grad_norm": 0.12473230063915253, "learning_rate": 1.9998939329598964e-05, "loss": 1.3745, "step": 1151 }, { "epoch": 0.3431188220182803, "grad_norm": 0.12697988748550415, "learning_rate": 1.9998925234718275e-05, "loss": 1.3707, "step": 1152 }, { "epoch": 0.343416668217949, "grad_norm": 0.12683434784412384, "learning_rate": 1.999891104680876e-05, "loss": 1.3791, "step": 1153 }, { "epoch": 0.3437145144176176, "grad_norm": 0.13295918703079224, "learning_rate": 1.999889676587055e-05, "loss": 1.3924, "step": 1154 }, { "epoch": 0.34401236061728624, "grad_norm": 0.12189807742834091, "learning_rate": 1.9998882391903778e-05, "loss": 1.3818, "step": 1155 }, { "epoch": 0.3443102068169549, "grad_norm": 0.12889567017555237, "learning_rate": 1.9998867924908576e-05, "loss": 1.3945, "step": 1156 }, { "epoch": 0.34460805301662356, "grad_norm": 0.14350058138370514, "learning_rate": 1.9998853364885077e-05, "loss": 1.3862, "step": 1157 }, { "epoch": 0.34490589921629217, "grad_norm": 0.13133090734481812, "learning_rate": 1.999883871183342e-05, "loss": 1.3731, "step": 1158 }, { "epoch": 0.3452037454159608, "grad_norm": 0.12137048691511154, "learning_rate": 1.999882396575374e-05, "loss": 1.3666, "step": 1159 }, { "epoch": 0.3455015916156295, "grad_norm": 0.12902837991714478, "learning_rate": 1.9998809126646178e-05, "loss": 1.3913, "step": 1160 }, { "epoch": 0.34579943781529815, "grad_norm": 0.1265643686056137, "learning_rate": 1.9998794194510863e-05, "loss": 1.412, "step": 1161 }, { "epoch": 0.34609728401496676, "grad_norm": 0.11998944729566574, "learning_rate": 1.9998779169347942e-05, "loss": 1.3625, "step": 1162 }, { "epoch": 0.3463951302146354, "grad_norm": 0.12442991882562637, "learning_rate": 1.999876405115755e-05, "loss": 1.3628, "step": 1163 }, { "epoch": 0.3466929764143041, "grad_norm": 0.12390732765197754, "learning_rate": 1.9998748839939836e-05, "loss": 1.3678, "step": 1164 }, { "epoch": 0.3469908226139727, "grad_norm": 0.1266006976366043, "learning_rate": 1.999873353569493e-05, "loss": 1.3724, "step": 1165 }, { "epoch": 0.34728866881364134, "grad_norm": 0.12780870497226715, "learning_rate": 1.9998718138422983e-05, "loss": 1.3719, "step": 1166 }, { "epoch": 0.34758651501331, "grad_norm": 0.12996380031108856, "learning_rate": 1.999870264812413e-05, "loss": 1.387, "step": 1167 }, { "epoch": 0.34788436121297867, "grad_norm": 0.12336032092571259, "learning_rate": 1.9998687064798522e-05, "loss": 1.3861, "step": 1168 }, { "epoch": 0.34818220741264727, "grad_norm": 0.13241474330425262, "learning_rate": 1.9998671388446304e-05, "loss": 1.3868, "step": 1169 }, { "epoch": 0.34848005361231593, "grad_norm": 0.12734943628311157, "learning_rate": 1.999865561906762e-05, "loss": 1.3959, "step": 1170 }, { "epoch": 0.3487778998119846, "grad_norm": 0.12355831265449524, "learning_rate": 1.9998639756662614e-05, "loss": 1.3676, "step": 1171 }, { "epoch": 0.34907574601165325, "grad_norm": 0.12056966125965118, "learning_rate": 1.9998623801231438e-05, "loss": 1.3849, "step": 1172 }, { "epoch": 0.34937359221132186, "grad_norm": 0.1257609874010086, "learning_rate": 1.9998607752774238e-05, "loss": 1.3746, "step": 1173 }, { "epoch": 0.3496714384109905, "grad_norm": 0.12630680203437805, "learning_rate": 1.9998591611291166e-05, "loss": 1.3744, "step": 1174 }, { "epoch": 0.3499692846106592, "grad_norm": 0.13128453493118286, "learning_rate": 1.9998575376782366e-05, "loss": 1.3711, "step": 1175 }, { "epoch": 0.35026713081032784, "grad_norm": 0.12793263792991638, "learning_rate": 1.9998559049247996e-05, "loss": 1.366, "step": 1176 }, { "epoch": 0.35056497700999645, "grad_norm": 0.12537628412246704, "learning_rate": 1.9998542628688206e-05, "loss": 1.3642, "step": 1177 }, { "epoch": 0.3508628232096651, "grad_norm": 0.1319340020418167, "learning_rate": 1.9998526115103144e-05, "loss": 1.3805, "step": 1178 }, { "epoch": 0.35116066940933377, "grad_norm": 0.13178007304668427, "learning_rate": 1.999850950849297e-05, "loss": 1.3759, "step": 1179 }, { "epoch": 0.3514585156090024, "grad_norm": 0.13513357937335968, "learning_rate": 1.999849280885784e-05, "loss": 1.3761, "step": 1180 }, { "epoch": 0.35175636180867104, "grad_norm": 0.134556844830513, "learning_rate": 1.99984760161979e-05, "loss": 1.3881, "step": 1181 }, { "epoch": 0.3520542080083397, "grad_norm": 0.13081508874893188, "learning_rate": 1.9998459130513313e-05, "loss": 1.376, "step": 1182 }, { "epoch": 0.35235205420800836, "grad_norm": 0.1282830685377121, "learning_rate": 1.9998442151804235e-05, "loss": 1.3728, "step": 1183 }, { "epoch": 0.35264990040767696, "grad_norm": 0.13131962716579437, "learning_rate": 1.9998425080070824e-05, "loss": 1.3728, "step": 1184 }, { "epoch": 0.3529477466073456, "grad_norm": 0.12443128228187561, "learning_rate": 1.9998407915313236e-05, "loss": 1.3756, "step": 1185 }, { "epoch": 0.3532455928070143, "grad_norm": 0.12644220888614655, "learning_rate": 1.9998390657531637e-05, "loss": 1.3736, "step": 1186 }, { "epoch": 0.35354343900668295, "grad_norm": 0.13326533138751984, "learning_rate": 1.999837330672618e-05, "loss": 1.37, "step": 1187 }, { "epoch": 0.35384128520635155, "grad_norm": 0.11835305392742157, "learning_rate": 1.9998355862897032e-05, "loss": 1.3899, "step": 1188 }, { "epoch": 0.3541391314060202, "grad_norm": 0.12689045071601868, "learning_rate": 1.9998338326044356e-05, "loss": 1.3734, "step": 1189 }, { "epoch": 0.3544369776056889, "grad_norm": 0.12657564878463745, "learning_rate": 1.9998320696168308e-05, "loss": 1.3877, "step": 1190 }, { "epoch": 0.35473482380535754, "grad_norm": 0.12246771156787872, "learning_rate": 1.9998302973269063e-05, "loss": 1.3738, "step": 1191 }, { "epoch": 0.35503267000502614, "grad_norm": 0.1292853057384491, "learning_rate": 1.9998285157346772e-05, "loss": 1.3936, "step": 1192 }, { "epoch": 0.3553305162046948, "grad_norm": 0.13208384811878204, "learning_rate": 1.999826724840161e-05, "loss": 1.3598, "step": 1193 }, { "epoch": 0.35562836240436346, "grad_norm": 0.13662226498126984, "learning_rate": 1.9998249246433748e-05, "loss": 1.3704, "step": 1194 }, { "epoch": 0.35592620860403207, "grad_norm": 0.13780462741851807, "learning_rate": 1.9998231151443344e-05, "loss": 1.4053, "step": 1195 }, { "epoch": 0.35622405480370073, "grad_norm": 0.1379765272140503, "learning_rate": 1.999821296343057e-05, "loss": 1.3878, "step": 1196 }, { "epoch": 0.3565219010033694, "grad_norm": 0.13109084963798523, "learning_rate": 1.9998194682395592e-05, "loss": 1.3607, "step": 1197 }, { "epoch": 0.35681974720303805, "grad_norm": 0.13432908058166504, "learning_rate": 1.9998176308338582e-05, "loss": 1.3725, "step": 1198 }, { "epoch": 0.35711759340270666, "grad_norm": 0.13718818128108978, "learning_rate": 1.9998157841259716e-05, "loss": 1.3735, "step": 1199 }, { "epoch": 0.3574154396023753, "grad_norm": 0.1316278874874115, "learning_rate": 1.9998139281159164e-05, "loss": 1.3601, "step": 1200 }, { "epoch": 0.357713285802044, "grad_norm": 0.12941031157970428, "learning_rate": 1.9998120628037094e-05, "loss": 1.3809, "step": 1201 }, { "epoch": 0.35801113200171264, "grad_norm": 0.13530942797660828, "learning_rate": 1.9998101881893683e-05, "loss": 1.3818, "step": 1202 }, { "epoch": 0.35830897820138125, "grad_norm": 0.1344297230243683, "learning_rate": 1.9998083042729103e-05, "loss": 1.375, "step": 1203 }, { "epoch": 0.3586068244010499, "grad_norm": 0.1414794921875, "learning_rate": 1.999806411054353e-05, "loss": 1.3912, "step": 1204 }, { "epoch": 0.35890467060071857, "grad_norm": 0.14133290946483612, "learning_rate": 1.9998045085337147e-05, "loss": 1.3715, "step": 1205 }, { "epoch": 0.3592025168003872, "grad_norm": 0.13531997799873352, "learning_rate": 1.999802596711012e-05, "loss": 1.3691, "step": 1206 }, { "epoch": 0.35950036300005583, "grad_norm": 0.13153564929962158, "learning_rate": 1.9998006755862628e-05, "loss": 1.3804, "step": 1207 }, { "epoch": 0.3597982091997245, "grad_norm": 0.12766240537166595, "learning_rate": 1.999798745159486e-05, "loss": 1.4068, "step": 1208 }, { "epoch": 0.36009605539939316, "grad_norm": 0.1314178705215454, "learning_rate": 1.9997968054306985e-05, "loss": 1.3712, "step": 1209 }, { "epoch": 0.36039390159906176, "grad_norm": 0.14264385402202606, "learning_rate": 1.9997948563999194e-05, "loss": 1.3744, "step": 1210 }, { "epoch": 0.3606917477987304, "grad_norm": 0.13112768530845642, "learning_rate": 1.9997928980671653e-05, "loss": 1.3607, "step": 1211 }, { "epoch": 0.3609895939983991, "grad_norm": 0.12162243574857712, "learning_rate": 1.9997909304324557e-05, "loss": 1.3697, "step": 1212 }, { "epoch": 0.36128744019806774, "grad_norm": 0.1289089173078537, "learning_rate": 1.9997889534958088e-05, "loss": 1.3828, "step": 1213 }, { "epoch": 0.36158528639773635, "grad_norm": 0.1352340281009674, "learning_rate": 1.9997869672572426e-05, "loss": 1.3635, "step": 1214 }, { "epoch": 0.361883132597405, "grad_norm": 0.13754944503307343, "learning_rate": 1.9997849717167757e-05, "loss": 1.3714, "step": 1215 }, { "epoch": 0.36218097879707367, "grad_norm": 0.13485591113567352, "learning_rate": 1.9997829668744265e-05, "loss": 1.3694, "step": 1216 }, { "epoch": 0.36247882499674233, "grad_norm": 0.13831846415996552, "learning_rate": 1.9997809527302138e-05, "loss": 1.3891, "step": 1217 }, { "epoch": 0.36277667119641094, "grad_norm": 0.12861719727516174, "learning_rate": 1.9997789292841564e-05, "loss": 1.3803, "step": 1218 }, { "epoch": 0.3630745173960796, "grad_norm": 0.122126504778862, "learning_rate": 1.999776896536273e-05, "loss": 1.3648, "step": 1219 }, { "epoch": 0.36337236359574826, "grad_norm": 0.13395242393016815, "learning_rate": 1.999774854486583e-05, "loss": 1.3614, "step": 1220 }, { "epoch": 0.36367020979541687, "grad_norm": 0.13298609852790833, "learning_rate": 1.9997728031351044e-05, "loss": 1.3718, "step": 1221 }, { "epoch": 0.3639680559950855, "grad_norm": 0.13225732743740082, "learning_rate": 1.9997707424818572e-05, "loss": 1.3664, "step": 1222 }, { "epoch": 0.3642659021947542, "grad_norm": 0.13129866123199463, "learning_rate": 1.9997686725268605e-05, "loss": 1.382, "step": 1223 }, { "epoch": 0.36456374839442285, "grad_norm": 0.1358112394809723, "learning_rate": 1.999766593270133e-05, "loss": 1.3902, "step": 1224 }, { "epoch": 0.36486159459409145, "grad_norm": 0.13032831251621246, "learning_rate": 1.9997645047116942e-05, "loss": 1.3951, "step": 1225 }, { "epoch": 0.3651594407937601, "grad_norm": 0.1316758096218109, "learning_rate": 1.9997624068515643e-05, "loss": 1.3742, "step": 1226 }, { "epoch": 0.3654572869934288, "grad_norm": 0.12895487248897552, "learning_rate": 1.9997602996897614e-05, "loss": 1.3722, "step": 1227 }, { "epoch": 0.36575513319309744, "grad_norm": 0.13532277941703796, "learning_rate": 1.9997581832263065e-05, "loss": 1.3742, "step": 1228 }, { "epoch": 0.36605297939276604, "grad_norm": 0.12580768764019012, "learning_rate": 1.9997560574612186e-05, "loss": 1.3642, "step": 1229 }, { "epoch": 0.3663508255924347, "grad_norm": 0.12978699803352356, "learning_rate": 1.9997539223945174e-05, "loss": 1.3675, "step": 1230 }, { "epoch": 0.36664867179210336, "grad_norm": 0.1309354305267334, "learning_rate": 1.999751778026223e-05, "loss": 1.3815, "step": 1231 }, { "epoch": 0.366946517991772, "grad_norm": 0.12747173011302948, "learning_rate": 1.9997496243563553e-05, "loss": 1.3836, "step": 1232 }, { "epoch": 0.36724436419144063, "grad_norm": 0.13163839280605316, "learning_rate": 1.9997474613849346e-05, "loss": 1.368, "step": 1233 }, { "epoch": 0.3675422103911093, "grad_norm": 0.13420511782169342, "learning_rate": 1.9997452891119804e-05, "loss": 1.3477, "step": 1234 }, { "epoch": 0.36784005659077795, "grad_norm": 0.13195165991783142, "learning_rate": 1.9997431075375132e-05, "loss": 1.3848, "step": 1235 }, { "epoch": 0.36813790279044656, "grad_norm": 0.12061884999275208, "learning_rate": 1.9997409166615535e-05, "loss": 1.3767, "step": 1236 }, { "epoch": 0.3684357489901152, "grad_norm": 0.1303822249174118, "learning_rate": 1.9997387164841218e-05, "loss": 1.3754, "step": 1237 }, { "epoch": 0.3687335951897839, "grad_norm": 0.1285220980644226, "learning_rate": 1.9997365070052383e-05, "loss": 1.3746, "step": 1238 }, { "epoch": 0.36903144138945254, "grad_norm": 0.126163050532341, "learning_rate": 1.9997342882249234e-05, "loss": 1.3775, "step": 1239 }, { "epoch": 0.36932928758912115, "grad_norm": 0.1282956302165985, "learning_rate": 1.9997320601431974e-05, "loss": 1.3844, "step": 1240 }, { "epoch": 0.3696271337887898, "grad_norm": 0.12833084166049957, "learning_rate": 1.9997298227600823e-05, "loss": 1.3825, "step": 1241 }, { "epoch": 0.36992497998845847, "grad_norm": 0.1357254534959793, "learning_rate": 1.999727576075598e-05, "loss": 1.3499, "step": 1242 }, { "epoch": 0.37022282618812713, "grad_norm": 0.12870509922504425, "learning_rate": 1.9997253200897652e-05, "loss": 1.3799, "step": 1243 }, { "epoch": 0.37052067238779574, "grad_norm": 0.1316627711057663, "learning_rate": 1.9997230548026056e-05, "loss": 1.3887, "step": 1244 }, { "epoch": 0.3708185185874644, "grad_norm": 0.12772461771965027, "learning_rate": 1.9997207802141402e-05, "loss": 1.3866, "step": 1245 }, { "epoch": 0.37111636478713306, "grad_norm": 0.12759855389595032, "learning_rate": 1.9997184963243894e-05, "loss": 1.372, "step": 1246 }, { "epoch": 0.37141421098680166, "grad_norm": 0.13649018108844757, "learning_rate": 1.999716203133375e-05, "loss": 1.3644, "step": 1247 }, { "epoch": 0.3717120571864703, "grad_norm": 0.12223384529352188, "learning_rate": 1.9997139006411184e-05, "loss": 1.3669, "step": 1248 }, { "epoch": 0.372009903386139, "grad_norm": 0.1267387717962265, "learning_rate": 1.999711588847641e-05, "loss": 1.3695, "step": 1249 }, { "epoch": 0.37230774958580765, "grad_norm": 0.13214010000228882, "learning_rate": 1.999709267752964e-05, "loss": 1.3931, "step": 1250 }, { "epoch": 0.37260559578547625, "grad_norm": 0.13361282646656036, "learning_rate": 1.9997069373571092e-05, "loss": 1.3872, "step": 1251 }, { "epoch": 0.3729034419851449, "grad_norm": 0.13205532729625702, "learning_rate": 1.9997045976600983e-05, "loss": 1.3791, "step": 1252 }, { "epoch": 0.3732012881848136, "grad_norm": 0.1310456097126007, "learning_rate": 1.9997022486619532e-05, "loss": 1.3792, "step": 1253 }, { "epoch": 0.37349913438448223, "grad_norm": 0.12933965027332306, "learning_rate": 1.9996998903626955e-05, "loss": 1.3736, "step": 1254 }, { "epoch": 0.37379698058415084, "grad_norm": 0.12627552449703217, "learning_rate": 1.9996975227623476e-05, "loss": 1.3714, "step": 1255 }, { "epoch": 0.3740948267838195, "grad_norm": 0.13484114408493042, "learning_rate": 1.9996951458609307e-05, "loss": 1.382, "step": 1256 }, { "epoch": 0.37439267298348816, "grad_norm": 0.1362007111310959, "learning_rate": 1.9996927596584673e-05, "loss": 1.3668, "step": 1257 }, { "epoch": 0.3746905191831568, "grad_norm": 0.12914560735225677, "learning_rate": 1.99969036415498e-05, "loss": 1.3483, "step": 1258 }, { "epoch": 0.37498836538282543, "grad_norm": 0.12913164496421814, "learning_rate": 1.999687959350491e-05, "loss": 1.3622, "step": 1259 }, { "epoch": 0.3752862115824941, "grad_norm": 0.13875743746757507, "learning_rate": 1.999685545245022e-05, "loss": 1.3773, "step": 1260 }, { "epoch": 0.37558405778216275, "grad_norm": 0.14095179736614227, "learning_rate": 1.9996831218385964e-05, "loss": 1.3696, "step": 1261 }, { "epoch": 0.37588190398183136, "grad_norm": 0.1370166391134262, "learning_rate": 1.9996806891312358e-05, "loss": 1.3834, "step": 1262 }, { "epoch": 0.3761797501815, "grad_norm": 0.12707234919071198, "learning_rate": 1.9996782471229635e-05, "loss": 1.367, "step": 1263 }, { "epoch": 0.3764775963811687, "grad_norm": 0.13285866379737854, "learning_rate": 1.999675795813802e-05, "loss": 1.3775, "step": 1264 }, { "epoch": 0.37677544258083734, "grad_norm": 0.13654166460037231, "learning_rate": 1.9996733352037743e-05, "loss": 1.3714, "step": 1265 }, { "epoch": 0.37707328878050594, "grad_norm": 0.1282128542661667, "learning_rate": 1.9996708652929028e-05, "loss": 1.375, "step": 1266 }, { "epoch": 0.3773711349801746, "grad_norm": 0.13190089166164398, "learning_rate": 1.9996683860812108e-05, "loss": 1.3691, "step": 1267 }, { "epoch": 0.37766898117984327, "grad_norm": 0.1347484290599823, "learning_rate": 1.9996658975687216e-05, "loss": 1.3743, "step": 1268 }, { "epoch": 0.3779668273795119, "grad_norm": 0.13643623888492584, "learning_rate": 1.999663399755458e-05, "loss": 1.3857, "step": 1269 }, { "epoch": 0.37826467357918053, "grad_norm": 0.13558420538902283, "learning_rate": 1.9996608926414435e-05, "loss": 1.378, "step": 1270 }, { "epoch": 0.3785625197788492, "grad_norm": 0.13643789291381836, "learning_rate": 1.999658376226701e-05, "loss": 1.353, "step": 1271 }, { "epoch": 0.37886036597851785, "grad_norm": 0.14463697373867035, "learning_rate": 1.9996558505112543e-05, "loss": 1.3773, "step": 1272 }, { "epoch": 0.3791582121781865, "grad_norm": 0.1375194638967514, "learning_rate": 1.999653315495127e-05, "loss": 1.3807, "step": 1273 }, { "epoch": 0.3794560583778551, "grad_norm": 0.13336382806301117, "learning_rate": 1.9996507711783422e-05, "loss": 1.3595, "step": 1274 }, { "epoch": 0.3797539045775238, "grad_norm": 0.14004971086978912, "learning_rate": 1.9996482175609237e-05, "loss": 1.364, "step": 1275 }, { "epoch": 0.38005175077719244, "grad_norm": 0.1399676501750946, "learning_rate": 1.9996456546428957e-05, "loss": 1.3712, "step": 1276 }, { "epoch": 0.38034959697686105, "grad_norm": 0.1426711082458496, "learning_rate": 1.9996430824242817e-05, "loss": 1.3625, "step": 1277 }, { "epoch": 0.3806474431765297, "grad_norm": 0.1441764235496521, "learning_rate": 1.999640500905106e-05, "loss": 1.3775, "step": 1278 }, { "epoch": 0.38094528937619837, "grad_norm": 0.13327822089195251, "learning_rate": 1.999637910085392e-05, "loss": 1.3816, "step": 1279 }, { "epoch": 0.38124313557586703, "grad_norm": 0.13619780540466309, "learning_rate": 1.999635309965164e-05, "loss": 1.3675, "step": 1280 }, { "epoch": 0.38154098177553564, "grad_norm": 0.13389413058757782, "learning_rate": 1.999632700544446e-05, "loss": 1.3603, "step": 1281 }, { "epoch": 0.3818388279752043, "grad_norm": 0.1472572386264801, "learning_rate": 1.999630081823263e-05, "loss": 1.3683, "step": 1282 }, { "epoch": 0.38213667417487296, "grad_norm": 0.14208440482616425, "learning_rate": 1.9996274538016394e-05, "loss": 1.3722, "step": 1283 }, { "epoch": 0.3824345203745416, "grad_norm": 0.1297113448381424, "learning_rate": 1.9996248164795987e-05, "loss": 1.3658, "step": 1284 }, { "epoch": 0.3827323665742102, "grad_norm": 0.13482092320919037, "learning_rate": 1.9996221698571657e-05, "loss": 1.3787, "step": 1285 }, { "epoch": 0.3830302127738789, "grad_norm": 0.13355214893817902, "learning_rate": 1.999619513934366e-05, "loss": 1.3719, "step": 1286 }, { "epoch": 0.38332805897354755, "grad_norm": 0.1377803385257721, "learning_rate": 1.9996168487112228e-05, "loss": 1.3664, "step": 1287 }, { "epoch": 0.38362590517321615, "grad_norm": 0.13205859065055847, "learning_rate": 1.999614174187762e-05, "loss": 1.3554, "step": 1288 }, { "epoch": 0.3839237513728848, "grad_norm": 0.14215654134750366, "learning_rate": 1.999611490364008e-05, "loss": 1.3671, "step": 1289 }, { "epoch": 0.3842215975725535, "grad_norm": 0.13025464117527008, "learning_rate": 1.999608797239986e-05, "loss": 1.3813, "step": 1290 }, { "epoch": 0.38451944377222214, "grad_norm": 0.1379905492067337, "learning_rate": 1.9996060948157206e-05, "loss": 1.3803, "step": 1291 }, { "epoch": 0.38481728997189074, "grad_norm": 0.13925138115882874, "learning_rate": 1.999603383091238e-05, "loss": 1.3786, "step": 1292 }, { "epoch": 0.3851151361715594, "grad_norm": 0.14388136565685272, "learning_rate": 1.999600662066562e-05, "loss": 1.3471, "step": 1293 }, { "epoch": 0.38541298237122806, "grad_norm": 0.13373485207557678, "learning_rate": 1.9995979317417197e-05, "loss": 1.3675, "step": 1294 }, { "epoch": 0.3857108285708967, "grad_norm": 0.14883075654506683, "learning_rate": 1.9995951921167343e-05, "loss": 1.3864, "step": 1295 }, { "epoch": 0.38600867477056533, "grad_norm": 0.13818325102329254, "learning_rate": 1.9995924431916332e-05, "loss": 1.385, "step": 1296 }, { "epoch": 0.386306520970234, "grad_norm": 0.13905934989452362, "learning_rate": 1.999589684966441e-05, "loss": 1.3637, "step": 1297 }, { "epoch": 0.38660436716990265, "grad_norm": 0.15382356941699982, "learning_rate": 1.9995869174411834e-05, "loss": 1.3975, "step": 1298 }, { "epoch": 0.3869022133695713, "grad_norm": 0.13522179424762726, "learning_rate": 1.9995841406158866e-05, "loss": 1.3712, "step": 1299 }, { "epoch": 0.3872000595692399, "grad_norm": 0.1400614082813263, "learning_rate": 1.9995813544905763e-05, "loss": 1.357, "step": 1300 }, { "epoch": 0.3874979057689086, "grad_norm": 0.14229221642017365, "learning_rate": 1.9995785590652783e-05, "loss": 1.3843, "step": 1301 }, { "epoch": 0.38779575196857724, "grad_norm": 0.13116051256656647, "learning_rate": 1.9995757543400182e-05, "loss": 1.3606, "step": 1302 }, { "epoch": 0.38809359816824585, "grad_norm": 0.13689297437667847, "learning_rate": 1.9995729403148226e-05, "loss": 1.373, "step": 1303 }, { "epoch": 0.3883914443679145, "grad_norm": 0.16429245471954346, "learning_rate": 1.999570116989718e-05, "loss": 1.3704, "step": 1304 }, { "epoch": 0.38868929056758317, "grad_norm": 0.14279286563396454, "learning_rate": 1.9995672843647295e-05, "loss": 1.3736, "step": 1305 }, { "epoch": 0.38898713676725183, "grad_norm": 0.14285866916179657, "learning_rate": 1.9995644424398847e-05, "loss": 1.3741, "step": 1306 }, { "epoch": 0.38928498296692043, "grad_norm": 0.14350569248199463, "learning_rate": 1.9995615912152098e-05, "loss": 1.3896, "step": 1307 }, { "epoch": 0.3895828291665891, "grad_norm": 0.14331308007240295, "learning_rate": 1.9995587306907308e-05, "loss": 1.3507, "step": 1308 }, { "epoch": 0.38988067536625776, "grad_norm": 0.14781762659549713, "learning_rate": 1.9995558608664744e-05, "loss": 1.3649, "step": 1309 }, { "epoch": 0.3901785215659264, "grad_norm": 0.14205756783485413, "learning_rate": 1.9995529817424675e-05, "loss": 1.3748, "step": 1310 }, { "epoch": 0.390476367765595, "grad_norm": 0.13681936264038086, "learning_rate": 1.999550093318737e-05, "loss": 1.3711, "step": 1311 }, { "epoch": 0.3907742139652637, "grad_norm": 0.13977855443954468, "learning_rate": 1.9995471955953096e-05, "loss": 1.3681, "step": 1312 }, { "epoch": 0.39107206016493234, "grad_norm": 0.14186030626296997, "learning_rate": 1.9995442885722122e-05, "loss": 1.3758, "step": 1313 }, { "epoch": 0.391369906364601, "grad_norm": 0.14282235503196716, "learning_rate": 1.999541372249472e-05, "loss": 1.3824, "step": 1314 }, { "epoch": 0.3916677525642696, "grad_norm": 0.13587988913059235, "learning_rate": 1.9995384466271164e-05, "loss": 1.3542, "step": 1315 }, { "epoch": 0.39196559876393827, "grad_norm": 0.13733258843421936, "learning_rate": 1.999535511705172e-05, "loss": 1.3584, "step": 1316 }, { "epoch": 0.39226344496360693, "grad_norm": 0.14015381038188934, "learning_rate": 1.9995325674836665e-05, "loss": 1.3716, "step": 1317 }, { "epoch": 0.39256129116327554, "grad_norm": 0.13651269674301147, "learning_rate": 1.9995296139626274e-05, "loss": 1.365, "step": 1318 }, { "epoch": 0.3928591373629442, "grad_norm": 0.14151425659656525, "learning_rate": 1.9995266511420816e-05, "loss": 1.3638, "step": 1319 }, { "epoch": 0.39315698356261286, "grad_norm": 0.13102491199970245, "learning_rate": 1.9995236790220574e-05, "loss": 1.3545, "step": 1320 }, { "epoch": 0.3934548297622815, "grad_norm": 0.14258791506290436, "learning_rate": 1.9995206976025817e-05, "loss": 1.3717, "step": 1321 }, { "epoch": 0.3937526759619501, "grad_norm": 0.13457924127578735, "learning_rate": 1.9995177068836828e-05, "loss": 1.3766, "step": 1322 }, { "epoch": 0.3940505221616188, "grad_norm": 0.14607053995132446, "learning_rate": 1.9995147068653887e-05, "loss": 1.3647, "step": 1323 }, { "epoch": 0.39434836836128745, "grad_norm": 0.13513265550136566, "learning_rate": 1.9995116975477265e-05, "loss": 1.3589, "step": 1324 }, { "epoch": 0.3946462145609561, "grad_norm": 0.1353466659784317, "learning_rate": 1.9995086789307248e-05, "loss": 1.3713, "step": 1325 }, { "epoch": 0.3949440607606247, "grad_norm": 0.15018858015537262, "learning_rate": 1.9995056510144116e-05, "loss": 1.3583, "step": 1326 }, { "epoch": 0.3952419069602934, "grad_norm": 0.14529645442962646, "learning_rate": 1.999502613798815e-05, "loss": 1.3737, "step": 1327 }, { "epoch": 0.39553975315996204, "grad_norm": 0.14066046476364136, "learning_rate": 1.9994995672839636e-05, "loss": 1.3622, "step": 1328 }, { "epoch": 0.39583759935963064, "grad_norm": 0.1439758837223053, "learning_rate": 1.999496511469885e-05, "loss": 1.3764, "step": 1329 }, { "epoch": 0.3961354455592993, "grad_norm": 0.14791858196258545, "learning_rate": 1.9994934463566086e-05, "loss": 1.3917, "step": 1330 }, { "epoch": 0.39643329175896797, "grad_norm": 0.1415499597787857, "learning_rate": 1.999490371944162e-05, "loss": 1.356, "step": 1331 }, { "epoch": 0.3967311379586366, "grad_norm": 0.14524230360984802, "learning_rate": 1.999487288232574e-05, "loss": 1.3762, "step": 1332 }, { "epoch": 0.39702898415830523, "grad_norm": 0.13859130442142487, "learning_rate": 1.9994841952218738e-05, "loss": 1.3627, "step": 1333 }, { "epoch": 0.3973268303579739, "grad_norm": 0.13905595242977142, "learning_rate": 1.9994810929120897e-05, "loss": 1.372, "step": 1334 }, { "epoch": 0.39762467655764255, "grad_norm": 0.14038988947868347, "learning_rate": 1.999477981303251e-05, "loss": 1.3674, "step": 1335 }, { "epoch": 0.3979225227573112, "grad_norm": 0.1370796263217926, "learning_rate": 1.999474860395386e-05, "loss": 1.353, "step": 1336 }, { "epoch": 0.3982203689569798, "grad_norm": 0.14049774408340454, "learning_rate": 1.999471730188524e-05, "loss": 1.3644, "step": 1337 }, { "epoch": 0.3985182151566485, "grad_norm": 0.16187512874603271, "learning_rate": 1.9994685906826944e-05, "loss": 1.349, "step": 1338 }, { "epoch": 0.39881606135631714, "grad_norm": 0.144514262676239, "learning_rate": 1.9994654418779263e-05, "loss": 1.3598, "step": 1339 }, { "epoch": 0.3991139075559858, "grad_norm": 0.13683633506298065, "learning_rate": 1.999462283774249e-05, "loss": 1.3512, "step": 1340 }, { "epoch": 0.3994117537556544, "grad_norm": 0.13460426032543182, "learning_rate": 1.999459116371692e-05, "loss": 1.3454, "step": 1341 }, { "epoch": 0.39970959995532307, "grad_norm": 0.14671728014945984, "learning_rate": 1.999455939670284e-05, "loss": 1.3733, "step": 1342 }, { "epoch": 0.40000744615499173, "grad_norm": 0.14138005673885345, "learning_rate": 1.9994527536700557e-05, "loss": 1.3702, "step": 1343 }, { "epoch": 0.40030529235466034, "grad_norm": 0.1489688754081726, "learning_rate": 1.999449558371036e-05, "loss": 1.3739, "step": 1344 }, { "epoch": 0.400603138554329, "grad_norm": 0.1427619457244873, "learning_rate": 1.9994463537732546e-05, "loss": 1.3839, "step": 1345 }, { "epoch": 0.40090098475399766, "grad_norm": 0.1428166627883911, "learning_rate": 1.999443139876742e-05, "loss": 1.3648, "step": 1346 }, { "epoch": 0.4011988309536663, "grad_norm": 0.15491773188114166, "learning_rate": 1.9994399166815272e-05, "loss": 1.368, "step": 1347 }, { "epoch": 0.4014966771533349, "grad_norm": 0.14289771020412445, "learning_rate": 1.9994366841876404e-05, "loss": 1.3732, "step": 1348 }, { "epoch": 0.4017945233530036, "grad_norm": 0.14798198640346527, "learning_rate": 1.9994334423951122e-05, "loss": 1.3309, "step": 1349 }, { "epoch": 0.40209236955267225, "grad_norm": 0.14131715893745422, "learning_rate": 1.9994301913039726e-05, "loss": 1.3637, "step": 1350 }, { "epoch": 0.4023902157523409, "grad_norm": 0.13964703679084778, "learning_rate": 1.9994269309142517e-05, "loss": 1.3767, "step": 1351 }, { "epoch": 0.4026880619520095, "grad_norm": 0.15044206380844116, "learning_rate": 1.9994236612259796e-05, "loss": 1.3623, "step": 1352 }, { "epoch": 0.4029859081516782, "grad_norm": 0.14952361583709717, "learning_rate": 1.999420382239187e-05, "loss": 1.3717, "step": 1353 }, { "epoch": 0.40328375435134683, "grad_norm": 0.13733325898647308, "learning_rate": 1.9994170939539042e-05, "loss": 1.3471, "step": 1354 }, { "epoch": 0.4035816005510155, "grad_norm": 0.13589802384376526, "learning_rate": 1.9994137963701622e-05, "loss": 1.3678, "step": 1355 }, { "epoch": 0.4038794467506841, "grad_norm": 0.14110149443149567, "learning_rate": 1.9994104894879914e-05, "loss": 1.3529, "step": 1356 }, { "epoch": 0.40417729295035276, "grad_norm": 0.14588642120361328, "learning_rate": 1.999407173307423e-05, "loss": 1.3475, "step": 1357 }, { "epoch": 0.4044751391500214, "grad_norm": 0.17346397042274475, "learning_rate": 1.9994038478284868e-05, "loss": 1.3598, "step": 1358 }, { "epoch": 0.40477298534969003, "grad_norm": 0.13631947338581085, "learning_rate": 1.9994005130512147e-05, "loss": 1.3675, "step": 1359 }, { "epoch": 0.4050708315493587, "grad_norm": 0.13887470960617065, "learning_rate": 1.9993971689756374e-05, "loss": 1.3684, "step": 1360 }, { "epoch": 0.40536867774902735, "grad_norm": 0.1530754268169403, "learning_rate": 1.9993938156017857e-05, "loss": 1.3761, "step": 1361 }, { "epoch": 0.405666523948696, "grad_norm": 0.1420319378376007, "learning_rate": 1.9993904529296915e-05, "loss": 1.3866, "step": 1362 }, { "epoch": 0.4059643701483646, "grad_norm": 0.14963901042938232, "learning_rate": 1.9993870809593857e-05, "loss": 1.3713, "step": 1363 }, { "epoch": 0.4062622163480333, "grad_norm": 0.14358863234519958, "learning_rate": 1.9993836996909e-05, "loss": 1.3766, "step": 1364 }, { "epoch": 0.40656006254770194, "grad_norm": 0.14113590121269226, "learning_rate": 1.999380309124265e-05, "loss": 1.3747, "step": 1365 }, { "epoch": 0.4068579087473706, "grad_norm": 0.13802053034305573, "learning_rate": 1.999376909259513e-05, "loss": 1.3659, "step": 1366 }, { "epoch": 0.4071557549470392, "grad_norm": 0.14546042680740356, "learning_rate": 1.9993735000966756e-05, "loss": 1.3717, "step": 1367 }, { "epoch": 0.40745360114670787, "grad_norm": 0.15038831532001495, "learning_rate": 1.999370081635784e-05, "loss": 1.3597, "step": 1368 }, { "epoch": 0.4077514473463765, "grad_norm": 0.1376657336950302, "learning_rate": 1.9993666538768704e-05, "loss": 1.3722, "step": 1369 }, { "epoch": 0.4080492935460452, "grad_norm": 0.14589664340019226, "learning_rate": 1.9993632168199668e-05, "loss": 1.3547, "step": 1370 }, { "epoch": 0.4083471397457138, "grad_norm": 0.1463681012392044, "learning_rate": 1.9993597704651052e-05, "loss": 1.3646, "step": 1371 }, { "epoch": 0.40864498594538246, "grad_norm": 0.15309378504753113, "learning_rate": 1.9993563148123176e-05, "loss": 1.3631, "step": 1372 }, { "epoch": 0.4089428321450511, "grad_norm": 0.14134198427200317, "learning_rate": 1.9993528498616354e-05, "loss": 1.3672, "step": 1373 }, { "epoch": 0.4092406783447197, "grad_norm": 0.1537083387374878, "learning_rate": 1.9993493756130918e-05, "loss": 1.3858, "step": 1374 }, { "epoch": 0.4095385245443884, "grad_norm": 0.1451197862625122, "learning_rate": 1.9993458920667188e-05, "loss": 1.369, "step": 1375 }, { "epoch": 0.40983637074405704, "grad_norm": 0.13950785994529724, "learning_rate": 1.9993423992225487e-05, "loss": 1.3688, "step": 1376 }, { "epoch": 0.4101342169437257, "grad_norm": 0.14010006189346313, "learning_rate": 1.9993388970806143e-05, "loss": 1.3552, "step": 1377 }, { "epoch": 0.4104320631433943, "grad_norm": 0.1423448920249939, "learning_rate": 1.9993353856409482e-05, "loss": 1.355, "step": 1378 }, { "epoch": 0.41072990934306297, "grad_norm": 0.1458849012851715, "learning_rate": 1.9993318649035824e-05, "loss": 1.3719, "step": 1379 }, { "epoch": 0.41102775554273163, "grad_norm": 0.1420285850763321, "learning_rate": 1.99932833486855e-05, "loss": 1.3491, "step": 1380 }, { "epoch": 0.4113256017424003, "grad_norm": 0.1413401961326599, "learning_rate": 1.9993247955358845e-05, "loss": 1.366, "step": 1381 }, { "epoch": 0.4116234479420689, "grad_norm": 0.1456751674413681, "learning_rate": 1.999321246905618e-05, "loss": 1.3711, "step": 1382 }, { "epoch": 0.41192129414173756, "grad_norm": 0.14318130910396576, "learning_rate": 1.999317688977784e-05, "loss": 1.3637, "step": 1383 }, { "epoch": 0.4122191403414062, "grad_norm": 0.15225200355052948, "learning_rate": 1.999314121752415e-05, "loss": 1.3808, "step": 1384 }, { "epoch": 0.4125169865410748, "grad_norm": 0.15400700271129608, "learning_rate": 1.999310545229545e-05, "loss": 1.3733, "step": 1385 }, { "epoch": 0.4128148327407435, "grad_norm": 0.14465181529521942, "learning_rate": 1.999306959409207e-05, "loss": 1.3766, "step": 1386 }, { "epoch": 0.41311267894041215, "grad_norm": 0.14450886845588684, "learning_rate": 1.9993033642914334e-05, "loss": 1.3659, "step": 1387 }, { "epoch": 0.4134105251400808, "grad_norm": 0.14789682626724243, "learning_rate": 1.999299759876259e-05, "loss": 1.3402, "step": 1388 }, { "epoch": 0.4137083713397494, "grad_norm": 0.13714885711669922, "learning_rate": 1.999296146163717e-05, "loss": 1.3617, "step": 1389 }, { "epoch": 0.4140062175394181, "grad_norm": 0.14600655436515808, "learning_rate": 1.9992925231538406e-05, "loss": 1.3783, "step": 1390 }, { "epoch": 0.41430406373908674, "grad_norm": 0.14446376264095306, "learning_rate": 1.9992888908466638e-05, "loss": 1.3502, "step": 1391 }, { "epoch": 0.4146019099387554, "grad_norm": 0.15243427455425262, "learning_rate": 1.9992852492422206e-05, "loss": 1.3511, "step": 1392 }, { "epoch": 0.414899756138424, "grad_norm": 0.13800148665905, "learning_rate": 1.9992815983405442e-05, "loss": 1.3725, "step": 1393 }, { "epoch": 0.41519760233809266, "grad_norm": 0.14049609005451202, "learning_rate": 1.9992779381416692e-05, "loss": 1.3671, "step": 1394 }, { "epoch": 0.4154954485377613, "grad_norm": 0.14720329642295837, "learning_rate": 1.9992742686456298e-05, "loss": 1.3734, "step": 1395 }, { "epoch": 0.41579329473743, "grad_norm": 0.14476299285888672, "learning_rate": 1.999270589852459e-05, "loss": 1.3795, "step": 1396 }, { "epoch": 0.4160911409370986, "grad_norm": 0.14092835783958435, "learning_rate": 1.9992669017621925e-05, "loss": 1.3593, "step": 1397 }, { "epoch": 0.41638898713676725, "grad_norm": 0.14597156643867493, "learning_rate": 1.9992632043748635e-05, "loss": 1.3908, "step": 1398 }, { "epoch": 0.4166868333364359, "grad_norm": 0.14053498208522797, "learning_rate": 1.999259497690507e-05, "loss": 1.3773, "step": 1399 }, { "epoch": 0.4169846795361045, "grad_norm": 0.14102129638195038, "learning_rate": 1.9992557817091574e-05, "loss": 1.36, "step": 1400 }, { "epoch": 0.4172825257357732, "grad_norm": 0.13804572820663452, "learning_rate": 1.9992520564308488e-05, "loss": 1.3773, "step": 1401 }, { "epoch": 0.41758037193544184, "grad_norm": 0.14213070273399353, "learning_rate": 1.9992483218556166e-05, "loss": 1.3754, "step": 1402 }, { "epoch": 0.4178782181351105, "grad_norm": 0.14632445573806763, "learning_rate": 1.9992445779834952e-05, "loss": 1.3648, "step": 1403 }, { "epoch": 0.4181760643347791, "grad_norm": 0.14518555998802185, "learning_rate": 1.9992408248145194e-05, "loss": 1.3487, "step": 1404 }, { "epoch": 0.41847391053444777, "grad_norm": 0.1442975252866745, "learning_rate": 1.999237062348724e-05, "loss": 1.3634, "step": 1405 }, { "epoch": 0.41877175673411643, "grad_norm": 0.15438003838062286, "learning_rate": 1.9992332905861445e-05, "loss": 1.363, "step": 1406 }, { "epoch": 0.4190696029337851, "grad_norm": 0.138583242893219, "learning_rate": 1.999229509526815e-05, "loss": 1.3379, "step": 1407 }, { "epoch": 0.4193674491334537, "grad_norm": 0.14776061475276947, "learning_rate": 1.999225719170772e-05, "loss": 1.3701, "step": 1408 }, { "epoch": 0.41966529533312236, "grad_norm": 0.14421144127845764, "learning_rate": 1.9992219195180498e-05, "loss": 1.3762, "step": 1409 }, { "epoch": 0.419963141532791, "grad_norm": 0.14404408633708954, "learning_rate": 1.9992181105686838e-05, "loss": 1.3715, "step": 1410 }, { "epoch": 0.4202609877324597, "grad_norm": 0.14231495559215546, "learning_rate": 1.99921429232271e-05, "loss": 1.355, "step": 1411 }, { "epoch": 0.4205588339321283, "grad_norm": 0.1412181854248047, "learning_rate": 1.9992104647801635e-05, "loss": 1.3506, "step": 1412 }, { "epoch": 0.42085668013179695, "grad_norm": 0.1508765071630478, "learning_rate": 1.9992066279410797e-05, "loss": 1.3588, "step": 1413 }, { "epoch": 0.4211545263314656, "grad_norm": 0.1456536203622818, "learning_rate": 1.9992027818054952e-05, "loss": 1.3644, "step": 1414 }, { "epoch": 0.4214523725311342, "grad_norm": 0.1506153792142868, "learning_rate": 1.999198926373445e-05, "loss": 1.3741, "step": 1415 }, { "epoch": 0.4217502187308029, "grad_norm": 0.14759069681167603, "learning_rate": 1.9991950616449648e-05, "loss": 1.3582, "step": 1416 }, { "epoch": 0.42204806493047153, "grad_norm": 0.13830821216106415, "learning_rate": 1.9991911876200914e-05, "loss": 1.3678, "step": 1417 }, { "epoch": 0.4223459111301402, "grad_norm": 0.13808178901672363, "learning_rate": 1.99918730429886e-05, "loss": 1.3427, "step": 1418 }, { "epoch": 0.4226437573298088, "grad_norm": 0.14943626523017883, "learning_rate": 1.999183411681307e-05, "loss": 1.3738, "step": 1419 }, { "epoch": 0.42294160352947746, "grad_norm": 0.15086951851844788, "learning_rate": 1.999179509767469e-05, "loss": 1.3674, "step": 1420 }, { "epoch": 0.4232394497291461, "grad_norm": 0.13922718167304993, "learning_rate": 1.9991755985573823e-05, "loss": 1.3561, "step": 1421 }, { "epoch": 0.4235372959288148, "grad_norm": 0.1464361548423767, "learning_rate": 1.9991716780510823e-05, "loss": 1.3572, "step": 1422 }, { "epoch": 0.4238351421284834, "grad_norm": 0.1514001041650772, "learning_rate": 1.9991677482486068e-05, "loss": 1.3714, "step": 1423 }, { "epoch": 0.42413298832815205, "grad_norm": 0.15556089580059052, "learning_rate": 1.999163809149991e-05, "loss": 1.3637, "step": 1424 }, { "epoch": 0.4244308345278207, "grad_norm": 0.1481410712003708, "learning_rate": 1.9991598607552733e-05, "loss": 1.3791, "step": 1425 }, { "epoch": 0.4247286807274893, "grad_norm": 0.14641015231609344, "learning_rate": 1.9991559030644888e-05, "loss": 1.3479, "step": 1426 }, { "epoch": 0.425026526927158, "grad_norm": 0.146641343832016, "learning_rate": 1.999151936077675e-05, "loss": 1.3615, "step": 1427 }, { "epoch": 0.42532437312682664, "grad_norm": 0.1396789401769638, "learning_rate": 1.9991479597948685e-05, "loss": 1.3633, "step": 1428 }, { "epoch": 0.4256222193264953, "grad_norm": 0.13987213373184204, "learning_rate": 1.9991439742161066e-05, "loss": 1.3631, "step": 1429 }, { "epoch": 0.4259200655261639, "grad_norm": 0.14193253219127655, "learning_rate": 1.9991399793414268e-05, "loss": 1.3627, "step": 1430 }, { "epoch": 0.42621791172583257, "grad_norm": 0.15473538637161255, "learning_rate": 1.9991359751708653e-05, "loss": 1.3726, "step": 1431 }, { "epoch": 0.4265157579255012, "grad_norm": 0.14363671839237213, "learning_rate": 1.9991319617044605e-05, "loss": 1.3517, "step": 1432 }, { "epoch": 0.4268136041251699, "grad_norm": 0.13485422730445862, "learning_rate": 1.9991279389422485e-05, "loss": 1.3599, "step": 1433 }, { "epoch": 0.4271114503248385, "grad_norm": 0.14515508711338043, "learning_rate": 1.9991239068842673e-05, "loss": 1.3784, "step": 1434 }, { "epoch": 0.42740929652450715, "grad_norm": 0.153084397315979, "learning_rate": 1.999119865530555e-05, "loss": 1.361, "step": 1435 }, { "epoch": 0.4277071427241758, "grad_norm": 0.15337277948856354, "learning_rate": 1.999115814881148e-05, "loss": 1.3643, "step": 1436 }, { "epoch": 0.4280049889238445, "grad_norm": 0.15344147384166718, "learning_rate": 1.999111754936085e-05, "loss": 1.3746, "step": 1437 }, { "epoch": 0.4283028351235131, "grad_norm": 0.1456160843372345, "learning_rate": 1.9991076856954034e-05, "loss": 1.3631, "step": 1438 }, { "epoch": 0.42860068132318174, "grad_norm": 0.1523975133895874, "learning_rate": 1.999103607159141e-05, "loss": 1.3638, "step": 1439 }, { "epoch": 0.4288985275228504, "grad_norm": 0.15147021412849426, "learning_rate": 1.9990995193273354e-05, "loss": 1.3524, "step": 1440 }, { "epoch": 0.429196373722519, "grad_norm": 0.14829808473587036, "learning_rate": 1.9990954222000254e-05, "loss": 1.371, "step": 1441 }, { "epoch": 0.42949421992218767, "grad_norm": 0.14790543913841248, "learning_rate": 1.9990913157772488e-05, "loss": 1.343, "step": 1442 }, { "epoch": 0.42979206612185633, "grad_norm": 0.1457212120294571, "learning_rate": 1.9990872000590435e-05, "loss": 1.3723, "step": 1443 }, { "epoch": 0.430089912321525, "grad_norm": 0.14873450994491577, "learning_rate": 1.9990830750454485e-05, "loss": 1.3668, "step": 1444 }, { "epoch": 0.4303877585211936, "grad_norm": 0.15360642969608307, "learning_rate": 1.999078940736501e-05, "loss": 1.3746, "step": 1445 }, { "epoch": 0.43068560472086226, "grad_norm": 0.14420081675052643, "learning_rate": 1.999074797132241e-05, "loss": 1.358, "step": 1446 }, { "epoch": 0.4309834509205309, "grad_norm": 0.1481042504310608, "learning_rate": 1.999070644232706e-05, "loss": 1.3346, "step": 1447 }, { "epoch": 0.4312812971201996, "grad_norm": 0.15088361501693726, "learning_rate": 1.9990664820379348e-05, "loss": 1.3669, "step": 1448 }, { "epoch": 0.4315791433198682, "grad_norm": 0.14640074968338013, "learning_rate": 1.9990623105479662e-05, "loss": 1.3495, "step": 1449 }, { "epoch": 0.43187698951953685, "grad_norm": 0.1395910680294037, "learning_rate": 1.999058129762839e-05, "loss": 1.3567, "step": 1450 }, { "epoch": 0.4321748357192055, "grad_norm": 0.14191977679729462, "learning_rate": 1.9990539396825918e-05, "loss": 1.3974, "step": 1451 }, { "epoch": 0.43247268191887417, "grad_norm": 0.14816097915172577, "learning_rate": 1.9990497403072645e-05, "loss": 1.3662, "step": 1452 }, { "epoch": 0.4327705281185428, "grad_norm": 0.15005965530872345, "learning_rate": 1.9990455316368952e-05, "loss": 1.3592, "step": 1453 }, { "epoch": 0.43306837431821144, "grad_norm": 0.14851853251457214, "learning_rate": 1.9990413136715234e-05, "loss": 1.3552, "step": 1454 }, { "epoch": 0.4333662205178801, "grad_norm": 0.14743919670581818, "learning_rate": 1.9990370864111885e-05, "loss": 1.3633, "step": 1455 }, { "epoch": 0.4336640667175487, "grad_norm": 0.14935488998889923, "learning_rate": 1.9990328498559297e-05, "loss": 1.3852, "step": 1456 }, { "epoch": 0.43396191291721736, "grad_norm": 0.14826509356498718, "learning_rate": 1.9990286040057864e-05, "loss": 1.3796, "step": 1457 }, { "epoch": 0.434259759116886, "grad_norm": 0.14386005699634552, "learning_rate": 1.9990243488607982e-05, "loss": 1.3523, "step": 1458 }, { "epoch": 0.4345576053165547, "grad_norm": 0.13946856558322906, "learning_rate": 1.9990200844210044e-05, "loss": 1.37, "step": 1459 }, { "epoch": 0.4348554515162233, "grad_norm": 0.1450900286436081, "learning_rate": 1.9990158106864454e-05, "loss": 1.3526, "step": 1460 }, { "epoch": 0.43515329771589195, "grad_norm": 0.1451714038848877, "learning_rate": 1.9990115276571597e-05, "loss": 1.3489, "step": 1461 }, { "epoch": 0.4354511439155606, "grad_norm": 0.13972912728786469, "learning_rate": 1.9990072353331883e-05, "loss": 1.3458, "step": 1462 }, { "epoch": 0.4357489901152293, "grad_norm": 0.13991695642471313, "learning_rate": 1.9990029337145706e-05, "loss": 1.3642, "step": 1463 }, { "epoch": 0.4360468363148979, "grad_norm": 0.14446556568145752, "learning_rate": 1.9989986228013468e-05, "loss": 1.3601, "step": 1464 }, { "epoch": 0.43634468251456654, "grad_norm": 0.14560119807720184, "learning_rate": 1.9989943025935574e-05, "loss": 1.3618, "step": 1465 }, { "epoch": 0.4366425287142352, "grad_norm": 0.1507396101951599, "learning_rate": 1.9989899730912415e-05, "loss": 1.3559, "step": 1466 }, { "epoch": 0.4369403749139038, "grad_norm": 0.1439831554889679, "learning_rate": 1.9989856342944405e-05, "loss": 1.3437, "step": 1467 }, { "epoch": 0.43723822111357247, "grad_norm": 0.14343243837356567, "learning_rate": 1.9989812862031938e-05, "loss": 1.3613, "step": 1468 }, { "epoch": 0.43753606731324113, "grad_norm": 0.1516478955745697, "learning_rate": 1.998976928817543e-05, "loss": 1.374, "step": 1469 }, { "epoch": 0.4378339135129098, "grad_norm": 0.13883419334888458, "learning_rate": 1.9989725621375277e-05, "loss": 1.3497, "step": 1470 }, { "epoch": 0.4381317597125784, "grad_norm": 0.15203416347503662, "learning_rate": 1.9989681861631886e-05, "loss": 1.3604, "step": 1471 }, { "epoch": 0.43842960591224706, "grad_norm": 0.14967718720436096, "learning_rate": 1.9989638008945667e-05, "loss": 1.3672, "step": 1472 }, { "epoch": 0.4387274521119157, "grad_norm": 0.1440047323703766, "learning_rate": 1.998959406331703e-05, "loss": 1.3554, "step": 1473 }, { "epoch": 0.4390252983115844, "grad_norm": 0.15080441534519196, "learning_rate": 1.9989550024746382e-05, "loss": 1.3466, "step": 1474 }, { "epoch": 0.439323144511253, "grad_norm": 0.15265288949012756, "learning_rate": 1.9989505893234128e-05, "loss": 1.3499, "step": 1475 }, { "epoch": 0.43962099071092164, "grad_norm": 0.15390194952487946, "learning_rate": 1.9989461668780687e-05, "loss": 1.3669, "step": 1476 }, { "epoch": 0.4399188369105903, "grad_norm": 0.15198729932308197, "learning_rate": 1.9989417351386463e-05, "loss": 1.376, "step": 1477 }, { "epoch": 0.44021668311025897, "grad_norm": 0.14844822883605957, "learning_rate": 1.9989372941051872e-05, "loss": 1.3474, "step": 1478 }, { "epoch": 0.44051452930992757, "grad_norm": 0.15690770745277405, "learning_rate": 1.9989328437777325e-05, "loss": 1.336, "step": 1479 }, { "epoch": 0.44081237550959623, "grad_norm": 0.15483498573303223, "learning_rate": 1.998928384156324e-05, "loss": 1.3439, "step": 1480 }, { "epoch": 0.4411102217092649, "grad_norm": 0.15072625875473022, "learning_rate": 1.9989239152410028e-05, "loss": 1.369, "step": 1481 }, { "epoch": 0.4414080679089335, "grad_norm": 0.14917880296707153, "learning_rate": 1.9989194370318107e-05, "loss": 1.3597, "step": 1482 }, { "epoch": 0.44170591410860216, "grad_norm": 0.15088708698749542, "learning_rate": 1.9989149495287895e-05, "loss": 1.3654, "step": 1483 }, { "epoch": 0.4420037603082708, "grad_norm": 0.1527431458234787, "learning_rate": 1.9989104527319805e-05, "loss": 1.3437, "step": 1484 }, { "epoch": 0.4423016065079395, "grad_norm": 0.1469215601682663, "learning_rate": 1.9989059466414257e-05, "loss": 1.3417, "step": 1485 }, { "epoch": 0.4425994527076081, "grad_norm": 0.14421643316745758, "learning_rate": 1.9989014312571674e-05, "loss": 1.3647, "step": 1486 }, { "epoch": 0.44289729890727675, "grad_norm": 0.1480475813150406, "learning_rate": 1.998896906579247e-05, "loss": 1.3574, "step": 1487 }, { "epoch": 0.4431951451069454, "grad_norm": 0.16325557231903076, "learning_rate": 1.9988923726077073e-05, "loss": 1.3474, "step": 1488 }, { "epoch": 0.44349299130661407, "grad_norm": 0.15464672446250916, "learning_rate": 1.99888782934259e-05, "loss": 1.3545, "step": 1489 }, { "epoch": 0.4437908375062827, "grad_norm": 0.1462455838918686, "learning_rate": 1.998883276783937e-05, "loss": 1.3742, "step": 1490 }, { "epoch": 0.44408868370595134, "grad_norm": 0.1596900075674057, "learning_rate": 1.9988787149317918e-05, "loss": 1.3426, "step": 1491 }, { "epoch": 0.44438652990562, "grad_norm": 0.14827901124954224, "learning_rate": 1.998874143786196e-05, "loss": 1.3593, "step": 1492 }, { "epoch": 0.44468437610528866, "grad_norm": 0.16149313747882843, "learning_rate": 1.9988695633471916e-05, "loss": 1.3695, "step": 1493 }, { "epoch": 0.44498222230495726, "grad_norm": 0.14973697066307068, "learning_rate": 1.9988649736148228e-05, "loss": 1.374, "step": 1494 }, { "epoch": 0.4452800685046259, "grad_norm": 0.15943573415279388, "learning_rate": 1.998860374589131e-05, "loss": 1.3639, "step": 1495 }, { "epoch": 0.4455779147042946, "grad_norm": 0.16365402936935425, "learning_rate": 1.9988557662701596e-05, "loss": 1.3777, "step": 1496 }, { "epoch": 0.4458757609039632, "grad_norm": 0.14562122523784637, "learning_rate": 1.998851148657951e-05, "loss": 1.3708, "step": 1497 }, { "epoch": 0.44617360710363185, "grad_norm": 0.15102730691432953, "learning_rate": 1.998846521752549e-05, "loss": 1.3398, "step": 1498 }, { "epoch": 0.4464714533033005, "grad_norm": 0.15463490784168243, "learning_rate": 1.9988418855539956e-05, "loss": 1.3496, "step": 1499 }, { "epoch": 0.4467692995029692, "grad_norm": 0.1538606882095337, "learning_rate": 1.9988372400623345e-05, "loss": 1.3611, "step": 1500 }, { "epoch": 0.4467692995029692, "eval_loss": 1.3834556341171265, "eval_runtime": 19.0173, "eval_samples_per_second": 91.18, "eval_steps_per_second": 5.732, "step": 1500 }, { "epoch": 0.4470671457026378, "grad_norm": 0.16407231986522675, "learning_rate": 1.998832585277609e-05, "loss": 1.3598, "step": 1501 }, { "epoch": 0.44736499190230644, "grad_norm": 0.15346083045005798, "learning_rate": 1.998827921199862e-05, "loss": 1.3405, "step": 1502 }, { "epoch": 0.4476628381019751, "grad_norm": 0.15303270518779755, "learning_rate": 1.998823247829137e-05, "loss": 1.3554, "step": 1503 }, { "epoch": 0.44796068430164376, "grad_norm": 0.14479559659957886, "learning_rate": 1.998818565165478e-05, "loss": 1.3571, "step": 1504 }, { "epoch": 0.44825853050131237, "grad_norm": 0.15231306850910187, "learning_rate": 1.9988138732089285e-05, "loss": 1.3483, "step": 1505 }, { "epoch": 0.44855637670098103, "grad_norm": 0.14821556210517883, "learning_rate": 1.9988091719595314e-05, "loss": 1.3757, "step": 1506 }, { "epoch": 0.4488542229006497, "grad_norm": 0.15764357149600983, "learning_rate": 1.998804461417331e-05, "loss": 1.3378, "step": 1507 }, { "epoch": 0.4491520691003183, "grad_norm": 0.1531432718038559, "learning_rate": 1.9987997415823708e-05, "loss": 1.346, "step": 1508 }, { "epoch": 0.44944991529998696, "grad_norm": 0.15966816246509552, "learning_rate": 1.998795012454695e-05, "loss": 1.3664, "step": 1509 }, { "epoch": 0.4497477614996556, "grad_norm": 0.16142988204956055, "learning_rate": 1.998790274034348e-05, "loss": 1.3496, "step": 1510 }, { "epoch": 0.4500456076993243, "grad_norm": 0.14158722758293152, "learning_rate": 1.998785526321373e-05, "loss": 1.3501, "step": 1511 }, { "epoch": 0.4503434538989929, "grad_norm": 0.14258314669132233, "learning_rate": 1.9987807693158145e-05, "loss": 1.3612, "step": 1512 }, { "epoch": 0.45064130009866155, "grad_norm": 0.15757381916046143, "learning_rate": 1.998776003017717e-05, "loss": 1.3424, "step": 1513 }, { "epoch": 0.4509391462983302, "grad_norm": 0.14767710864543915, "learning_rate": 1.9987712274271248e-05, "loss": 1.3634, "step": 1514 }, { "epoch": 0.45123699249799887, "grad_norm": 0.15256808698177338, "learning_rate": 1.998766442544082e-05, "loss": 1.3755, "step": 1515 }, { "epoch": 0.4515348386976675, "grad_norm": 0.15053477883338928, "learning_rate": 1.9987616483686335e-05, "loss": 1.3552, "step": 1516 }, { "epoch": 0.45183268489733613, "grad_norm": 0.1493692547082901, "learning_rate": 1.9987568449008236e-05, "loss": 1.3474, "step": 1517 }, { "epoch": 0.4521305310970048, "grad_norm": 0.1480308622121811, "learning_rate": 1.9987520321406973e-05, "loss": 1.359, "step": 1518 }, { "epoch": 0.45242837729667346, "grad_norm": 0.16877669095993042, "learning_rate": 1.998747210088299e-05, "loss": 1.349, "step": 1519 }, { "epoch": 0.45272622349634206, "grad_norm": 0.1567610800266266, "learning_rate": 1.998742378743674e-05, "loss": 1.3587, "step": 1520 }, { "epoch": 0.4530240696960107, "grad_norm": 0.15030761063098907, "learning_rate": 1.998737538106867e-05, "loss": 1.357, "step": 1521 }, { "epoch": 0.4533219158956794, "grad_norm": 0.15131089091300964, "learning_rate": 1.998732688177923e-05, "loss": 1.3751, "step": 1522 }, { "epoch": 0.453619762095348, "grad_norm": 0.15185332298278809, "learning_rate": 1.9987278289568873e-05, "loss": 1.3489, "step": 1523 }, { "epoch": 0.45391760829501665, "grad_norm": 0.15380892157554626, "learning_rate": 1.998722960443805e-05, "loss": 1.3646, "step": 1524 }, { "epoch": 0.4542154544946853, "grad_norm": 0.15530753135681152, "learning_rate": 1.9987180826387213e-05, "loss": 1.366, "step": 1525 }, { "epoch": 0.45451330069435397, "grad_norm": 0.15113243460655212, "learning_rate": 1.9987131955416815e-05, "loss": 1.359, "step": 1526 }, { "epoch": 0.4548111468940226, "grad_norm": 0.14856329560279846, "learning_rate": 1.9987082991527313e-05, "loss": 1.3648, "step": 1527 }, { "epoch": 0.45510899309369124, "grad_norm": 0.15362197160720825, "learning_rate": 1.9987033934719164e-05, "loss": 1.3724, "step": 1528 }, { "epoch": 0.4554068392933599, "grad_norm": 0.1537696123123169, "learning_rate": 1.998698478499282e-05, "loss": 1.3298, "step": 1529 }, { "epoch": 0.45570468549302856, "grad_norm": 0.15299934148788452, "learning_rate": 1.998693554234874e-05, "loss": 1.3636, "step": 1530 }, { "epoch": 0.45600253169269717, "grad_norm": 0.1576281040906906, "learning_rate": 1.998688620678739e-05, "loss": 1.3609, "step": 1531 }, { "epoch": 0.4563003778923658, "grad_norm": 0.1615464836359024, "learning_rate": 1.9986836778309215e-05, "loss": 1.3523, "step": 1532 }, { "epoch": 0.4565982240920345, "grad_norm": 0.1629544198513031, "learning_rate": 1.9986787256914682e-05, "loss": 1.3559, "step": 1533 }, { "epoch": 0.45689607029170315, "grad_norm": 0.1634722799062729, "learning_rate": 1.9986737642604253e-05, "loss": 1.3517, "step": 1534 }, { "epoch": 0.45719391649137175, "grad_norm": 0.15739208459854126, "learning_rate": 1.998668793537839e-05, "loss": 1.3381, "step": 1535 }, { "epoch": 0.4574917626910404, "grad_norm": 0.16441817581653595, "learning_rate": 1.9986638135237548e-05, "loss": 1.3547, "step": 1536 }, { "epoch": 0.4577896088907091, "grad_norm": 0.15827372670173645, "learning_rate": 1.99865882421822e-05, "loss": 1.3589, "step": 1537 }, { "epoch": 0.4580874550903777, "grad_norm": 0.14872604608535767, "learning_rate": 1.9986538256212806e-05, "loss": 1.3533, "step": 1538 }, { "epoch": 0.45838530129004634, "grad_norm": 0.16894614696502686, "learning_rate": 1.998648817732983e-05, "loss": 1.3485, "step": 1539 }, { "epoch": 0.458683147489715, "grad_norm": 0.16135190427303314, "learning_rate": 1.998643800553374e-05, "loss": 1.3518, "step": 1540 }, { "epoch": 0.45898099368938367, "grad_norm": 0.1732345074415207, "learning_rate": 1.9986387740825e-05, "loss": 1.3657, "step": 1541 }, { "epoch": 0.45927883988905227, "grad_norm": 0.16166165471076965, "learning_rate": 1.9986337383204085e-05, "loss": 1.3607, "step": 1542 }, { "epoch": 0.45957668608872093, "grad_norm": 0.1520800143480301, "learning_rate": 1.998628693267145e-05, "loss": 1.3508, "step": 1543 }, { "epoch": 0.4598745322883896, "grad_norm": 0.15748022496700287, "learning_rate": 1.9986236389227577e-05, "loss": 1.3645, "step": 1544 }, { "epoch": 0.46017237848805825, "grad_norm": 0.1648527979850769, "learning_rate": 1.9986185752872934e-05, "loss": 1.3708, "step": 1545 }, { "epoch": 0.46047022468772686, "grad_norm": 0.16140109300613403, "learning_rate": 1.9986135023607988e-05, "loss": 1.3443, "step": 1546 }, { "epoch": 0.4607680708873955, "grad_norm": 0.1539575457572937, "learning_rate": 1.998608420143321e-05, "loss": 1.3521, "step": 1547 }, { "epoch": 0.4610659170870642, "grad_norm": 0.1520809382200241, "learning_rate": 1.9986033286349078e-05, "loss": 1.3566, "step": 1548 }, { "epoch": 0.4613637632867328, "grad_norm": 0.16655495762825012, "learning_rate": 1.9985982278356066e-05, "loss": 1.3389, "step": 1549 }, { "epoch": 0.46166160948640145, "grad_norm": 0.15567821264266968, "learning_rate": 1.998593117745464e-05, "loss": 1.3493, "step": 1550 }, { "epoch": 0.4619594556860701, "grad_norm": 0.15215620398521423, "learning_rate": 1.998587998364528e-05, "loss": 1.3566, "step": 1551 }, { "epoch": 0.46225730188573877, "grad_norm": 0.15496288239955902, "learning_rate": 1.998582869692847e-05, "loss": 1.356, "step": 1552 }, { "epoch": 0.4625551480854074, "grad_norm": 0.15681160986423492, "learning_rate": 1.998577731730468e-05, "loss": 1.3512, "step": 1553 }, { "epoch": 0.46285299428507604, "grad_norm": 0.16336871683597565, "learning_rate": 1.9985725844774387e-05, "loss": 1.3429, "step": 1554 }, { "epoch": 0.4631508404847447, "grad_norm": 0.15553857386112213, "learning_rate": 1.9985674279338072e-05, "loss": 1.3623, "step": 1555 }, { "epoch": 0.46344868668441336, "grad_norm": 0.15605011582374573, "learning_rate": 1.9985622620996216e-05, "loss": 1.3604, "step": 1556 }, { "epoch": 0.46374653288408196, "grad_norm": 0.15912525355815887, "learning_rate": 1.9985570869749295e-05, "loss": 1.3574, "step": 1557 }, { "epoch": 0.4640443790837506, "grad_norm": 0.1530112624168396, "learning_rate": 1.9985519025597795e-05, "loss": 1.3608, "step": 1558 }, { "epoch": 0.4643422252834193, "grad_norm": 0.14379249513149261, "learning_rate": 1.9985467088542197e-05, "loss": 1.3519, "step": 1559 }, { "epoch": 0.46464007148308795, "grad_norm": 0.17771276831626892, "learning_rate": 1.9985415058582985e-05, "loss": 1.3595, "step": 1560 }, { "epoch": 0.46493791768275655, "grad_norm": 0.15996088087558746, "learning_rate": 1.9985362935720644e-05, "loss": 1.3653, "step": 1561 }, { "epoch": 0.4652357638824252, "grad_norm": 0.1668889820575714, "learning_rate": 1.998531071995565e-05, "loss": 1.3688, "step": 1562 }, { "epoch": 0.4655336100820939, "grad_norm": 0.1608288586139679, "learning_rate": 1.99852584112885e-05, "loss": 1.3708, "step": 1563 }, { "epoch": 0.4658314562817625, "grad_norm": 0.15631313621997833, "learning_rate": 1.998520600971968e-05, "loss": 1.3625, "step": 1564 }, { "epoch": 0.46612930248143114, "grad_norm": 0.15062110126018524, "learning_rate": 1.9985153515249672e-05, "loss": 1.3699, "step": 1565 }, { "epoch": 0.4664271486810998, "grad_norm": 0.16072940826416016, "learning_rate": 1.9985100927878965e-05, "loss": 1.3485, "step": 1566 }, { "epoch": 0.46672499488076846, "grad_norm": 0.15695033967494965, "learning_rate": 1.9985048247608053e-05, "loss": 1.3491, "step": 1567 }, { "epoch": 0.46702284108043707, "grad_norm": 0.15807615220546722, "learning_rate": 1.998499547443742e-05, "loss": 1.3585, "step": 1568 }, { "epoch": 0.46732068728010573, "grad_norm": 0.15113434195518494, "learning_rate": 1.998494260836756e-05, "loss": 1.3623, "step": 1569 }, { "epoch": 0.4676185334797744, "grad_norm": 0.14953960478305817, "learning_rate": 1.9984889649398967e-05, "loss": 1.361, "step": 1570 }, { "epoch": 0.46791637967944305, "grad_norm": 0.1541774868965149, "learning_rate": 1.998483659753213e-05, "loss": 1.3371, "step": 1571 }, { "epoch": 0.46821422587911166, "grad_norm": 0.153075709939003, "learning_rate": 1.9984783452767546e-05, "loss": 1.3617, "step": 1572 }, { "epoch": 0.4685120720787803, "grad_norm": 0.15750396251678467, "learning_rate": 1.9984730215105705e-05, "loss": 1.3467, "step": 1573 }, { "epoch": 0.468809918278449, "grad_norm": 0.15796124935150146, "learning_rate": 1.9984676884547104e-05, "loss": 1.3348, "step": 1574 }, { "epoch": 0.46910776447811764, "grad_norm": 0.15773655474185944, "learning_rate": 1.9984623461092246e-05, "loss": 1.3519, "step": 1575 }, { "epoch": 0.46940561067778624, "grad_norm": 0.15742747485637665, "learning_rate": 1.998456994474162e-05, "loss": 1.3791, "step": 1576 }, { "epoch": 0.4697034568774549, "grad_norm": 0.15541964769363403, "learning_rate": 1.9984516335495722e-05, "loss": 1.3568, "step": 1577 }, { "epoch": 0.47000130307712357, "grad_norm": 0.15354672074317932, "learning_rate": 1.998446263335506e-05, "loss": 1.3552, "step": 1578 }, { "epoch": 0.47029914927679217, "grad_norm": 0.15648815035820007, "learning_rate": 1.9984408838320126e-05, "loss": 1.3757, "step": 1579 }, { "epoch": 0.47059699547646083, "grad_norm": 0.16400554776191711, "learning_rate": 1.998435495039142e-05, "loss": 1.3417, "step": 1580 }, { "epoch": 0.4708948416761295, "grad_norm": 0.16394999623298645, "learning_rate": 1.998430096956945e-05, "loss": 1.3446, "step": 1581 }, { "epoch": 0.47119268787579816, "grad_norm": 0.1599092334508896, "learning_rate": 1.9984246895854717e-05, "loss": 1.3448, "step": 1582 }, { "epoch": 0.47149053407546676, "grad_norm": 0.16573455929756165, "learning_rate": 1.9984192729247716e-05, "loss": 1.3461, "step": 1583 }, { "epoch": 0.4717883802751354, "grad_norm": 0.16031022369861603, "learning_rate": 1.998413846974896e-05, "loss": 1.3618, "step": 1584 }, { "epoch": 0.4720862264748041, "grad_norm": 0.16845066845417023, "learning_rate": 1.998408411735895e-05, "loss": 1.3644, "step": 1585 }, { "epoch": 0.47238407267447274, "grad_norm": 0.16577422618865967, "learning_rate": 1.998402967207819e-05, "loss": 1.3449, "step": 1586 }, { "epoch": 0.47268191887414135, "grad_norm": 0.14588119089603424, "learning_rate": 1.9983975133907193e-05, "loss": 1.3308, "step": 1587 }, { "epoch": 0.47297976507381, "grad_norm": 0.16787150502204895, "learning_rate": 1.9983920502846457e-05, "loss": 1.3691, "step": 1588 }, { "epoch": 0.47327761127347867, "grad_norm": 0.1683722883462906, "learning_rate": 1.99838657788965e-05, "loss": 1.3627, "step": 1589 }, { "epoch": 0.4735754574731473, "grad_norm": 0.16757291555404663, "learning_rate": 1.9983810962057823e-05, "loss": 1.3325, "step": 1590 }, { "epoch": 0.47387330367281594, "grad_norm": 0.15515659749507904, "learning_rate": 1.9983756052330942e-05, "loss": 1.3419, "step": 1591 }, { "epoch": 0.4741711498724846, "grad_norm": 0.14991584420204163, "learning_rate": 1.9983701049716368e-05, "loss": 1.3557, "step": 1592 }, { "epoch": 0.47446899607215326, "grad_norm": 0.15937559306621552, "learning_rate": 1.9983645954214602e-05, "loss": 1.3488, "step": 1593 }, { "epoch": 0.47476684227182187, "grad_norm": 0.15821953117847443, "learning_rate": 1.9983590765826173e-05, "loss": 1.358, "step": 1594 }, { "epoch": 0.4750646884714905, "grad_norm": 0.15610694885253906, "learning_rate": 1.9983535484551583e-05, "loss": 1.35, "step": 1595 }, { "epoch": 0.4753625346711592, "grad_norm": 0.15480051934719086, "learning_rate": 1.998348011039135e-05, "loss": 1.3296, "step": 1596 }, { "epoch": 0.47566038087082785, "grad_norm": 0.1662609577178955, "learning_rate": 1.9983424643345987e-05, "loss": 1.3611, "step": 1597 }, { "epoch": 0.47595822707049645, "grad_norm": 0.17669756710529327, "learning_rate": 1.9983369083416012e-05, "loss": 1.3781, "step": 1598 }, { "epoch": 0.4762560732701651, "grad_norm": 0.15191276371479034, "learning_rate": 1.9983313430601946e-05, "loss": 1.3572, "step": 1599 }, { "epoch": 0.4765539194698338, "grad_norm": 0.15627487003803253, "learning_rate": 1.9983257684904297e-05, "loss": 1.3702, "step": 1600 }, { "epoch": 0.47685176566950244, "grad_norm": 0.16343528032302856, "learning_rate": 1.9983201846323596e-05, "loss": 1.3497, "step": 1601 }, { "epoch": 0.47714961186917104, "grad_norm": 0.16408991813659668, "learning_rate": 1.9983145914860345e-05, "loss": 1.3576, "step": 1602 }, { "epoch": 0.4774474580688397, "grad_norm": 0.15406526625156403, "learning_rate": 1.9983089890515087e-05, "loss": 1.3517, "step": 1603 }, { "epoch": 0.47774530426850836, "grad_norm": 0.1557612121105194, "learning_rate": 1.9983033773288323e-05, "loss": 1.3496, "step": 1604 }, { "epoch": 0.47804315046817697, "grad_norm": 0.16335482895374298, "learning_rate": 1.9982977563180584e-05, "loss": 1.3506, "step": 1605 }, { "epoch": 0.47834099666784563, "grad_norm": 0.16537539660930634, "learning_rate": 1.99829212601924e-05, "loss": 1.3678, "step": 1606 }, { "epoch": 0.4786388428675143, "grad_norm": 0.16301892697811127, "learning_rate": 1.998286486432428e-05, "loss": 1.348, "step": 1607 }, { "epoch": 0.47893668906718295, "grad_norm": 0.16790518164634705, "learning_rate": 1.9982808375576757e-05, "loss": 1.3533, "step": 1608 }, { "epoch": 0.47923453526685156, "grad_norm": 0.16025929152965546, "learning_rate": 1.9982751793950355e-05, "loss": 1.3579, "step": 1609 }, { "epoch": 0.4795323814665202, "grad_norm": 0.1598828136920929, "learning_rate": 1.9982695119445602e-05, "loss": 1.3269, "step": 1610 }, { "epoch": 0.4798302276661889, "grad_norm": 0.1701895296573639, "learning_rate": 1.9982638352063025e-05, "loss": 1.3498, "step": 1611 }, { "epoch": 0.48012807386585754, "grad_norm": 0.15481679141521454, "learning_rate": 1.9982581491803154e-05, "loss": 1.345, "step": 1612 }, { "epoch": 0.48042592006552615, "grad_norm": 0.1521386206150055, "learning_rate": 1.998252453866651e-05, "loss": 1.3385, "step": 1613 }, { "epoch": 0.4807237662651948, "grad_norm": 0.15778110921382904, "learning_rate": 1.998246749265363e-05, "loss": 1.3452, "step": 1614 }, { "epoch": 0.48102161246486347, "grad_norm": 0.16336236894130707, "learning_rate": 1.9982410353765046e-05, "loss": 1.3647, "step": 1615 }, { "epoch": 0.48131945866453213, "grad_norm": 0.17148883640766144, "learning_rate": 1.9982353122001284e-05, "loss": 1.3661, "step": 1616 }, { "epoch": 0.48161730486420073, "grad_norm": 0.1563539057970047, "learning_rate": 1.9982295797362877e-05, "loss": 1.338, "step": 1617 }, { "epoch": 0.4819151510638694, "grad_norm": 0.1565932333469391, "learning_rate": 1.9982238379850364e-05, "loss": 1.3648, "step": 1618 }, { "epoch": 0.48221299726353806, "grad_norm": 0.16150546073913574, "learning_rate": 1.9982180869464276e-05, "loss": 1.3603, "step": 1619 }, { "epoch": 0.48251084346320666, "grad_norm": 0.16295665502548218, "learning_rate": 1.9982123266205148e-05, "loss": 1.3571, "step": 1620 }, { "epoch": 0.4828086896628753, "grad_norm": 0.16042910516262054, "learning_rate": 1.9982065570073513e-05, "loss": 1.3641, "step": 1621 }, { "epoch": 0.483106535862544, "grad_norm": 0.15912720561027527, "learning_rate": 1.9982007781069913e-05, "loss": 1.354, "step": 1622 }, { "epoch": 0.48340438206221265, "grad_norm": 0.16279810667037964, "learning_rate": 1.9981949899194883e-05, "loss": 1.3634, "step": 1623 }, { "epoch": 0.48370222826188125, "grad_norm": 0.15667667984962463, "learning_rate": 1.9981891924448958e-05, "loss": 1.3501, "step": 1624 }, { "epoch": 0.4840000744615499, "grad_norm": 0.16223295032978058, "learning_rate": 1.9981833856832684e-05, "loss": 1.3638, "step": 1625 }, { "epoch": 0.4842979206612186, "grad_norm": 0.16240368783473969, "learning_rate": 1.9981775696346596e-05, "loss": 1.3476, "step": 1626 }, { "epoch": 0.48459576686088723, "grad_norm": 0.15941420197486877, "learning_rate": 1.998171744299124e-05, "loss": 1.3294, "step": 1627 }, { "epoch": 0.48489361306055584, "grad_norm": 0.1619134098291397, "learning_rate": 1.9981659096767158e-05, "loss": 1.3429, "step": 1628 }, { "epoch": 0.4851914592602245, "grad_norm": 0.14640536904335022, "learning_rate": 1.9981600657674885e-05, "loss": 1.3552, "step": 1629 }, { "epoch": 0.48548930545989316, "grad_norm": 0.16520731151103973, "learning_rate": 1.9981542125714973e-05, "loss": 1.3441, "step": 1630 }, { "epoch": 0.48578715165956177, "grad_norm": 0.1630891114473343, "learning_rate": 1.998148350088796e-05, "loss": 1.3558, "step": 1631 }, { "epoch": 0.4860849978592304, "grad_norm": 0.15566229820251465, "learning_rate": 1.99814247831944e-05, "loss": 1.3575, "step": 1632 }, { "epoch": 0.4863828440588991, "grad_norm": 0.15086857974529266, "learning_rate": 1.998136597263483e-05, "loss": 1.3428, "step": 1633 }, { "epoch": 0.48668069025856775, "grad_norm": 0.1621299684047699, "learning_rate": 1.9981307069209802e-05, "loss": 1.359, "step": 1634 }, { "epoch": 0.48697853645823636, "grad_norm": 0.15317213535308838, "learning_rate": 1.9981248072919866e-05, "loss": 1.3695, "step": 1635 }, { "epoch": 0.487276382657905, "grad_norm": 0.1480894833803177, "learning_rate": 1.9981188983765568e-05, "loss": 1.3489, "step": 1636 }, { "epoch": 0.4875742288575737, "grad_norm": 0.15267395973205566, "learning_rate": 1.998112980174746e-05, "loss": 1.3418, "step": 1637 }, { "epoch": 0.48787207505724234, "grad_norm": 0.16101063787937164, "learning_rate": 1.9981070526866086e-05, "loss": 1.3553, "step": 1638 }, { "epoch": 0.48816992125691094, "grad_norm": 0.16287663578987122, "learning_rate": 1.9981011159122004e-05, "loss": 1.3473, "step": 1639 }, { "epoch": 0.4884677674565796, "grad_norm": 0.15994007885456085, "learning_rate": 1.9980951698515766e-05, "loss": 1.3457, "step": 1640 }, { "epoch": 0.48876561365624827, "grad_norm": 0.15128056704998016, "learning_rate": 1.998089214504792e-05, "loss": 1.3421, "step": 1641 }, { "epoch": 0.4890634598559169, "grad_norm": 0.15700973570346832, "learning_rate": 1.9980832498719028e-05, "loss": 1.3512, "step": 1642 }, { "epoch": 0.48936130605558553, "grad_norm": 0.1566929966211319, "learning_rate": 1.998077275952964e-05, "loss": 1.3372, "step": 1643 }, { "epoch": 0.4896591522552542, "grad_norm": 0.1569521278142929, "learning_rate": 1.998071292748031e-05, "loss": 1.3453, "step": 1644 }, { "epoch": 0.48995699845492285, "grad_norm": 0.15177418291568756, "learning_rate": 1.99806530025716e-05, "loss": 1.3414, "step": 1645 }, { "epoch": 0.49025484465459146, "grad_norm": 0.14923198521137238, "learning_rate": 1.9980592984804067e-05, "loss": 1.3467, "step": 1646 }, { "epoch": 0.4905526908542601, "grad_norm": 0.1549576073884964, "learning_rate": 1.9980532874178263e-05, "loss": 1.3529, "step": 1647 }, { "epoch": 0.4908505370539288, "grad_norm": 0.14973151683807373, "learning_rate": 1.998047267069475e-05, "loss": 1.3483, "step": 1648 }, { "epoch": 0.49114838325359744, "grad_norm": 0.14889411628246307, "learning_rate": 1.9980412374354097e-05, "loss": 1.3513, "step": 1649 }, { "epoch": 0.49144622945326605, "grad_norm": 0.15207715332508087, "learning_rate": 1.9980351985156854e-05, "loss": 1.3647, "step": 1650 }, { "epoch": 0.4917440756529347, "grad_norm": 0.1659306436777115, "learning_rate": 1.9980291503103582e-05, "loss": 1.3519, "step": 1651 }, { "epoch": 0.49204192185260337, "grad_norm": 0.15857510268688202, "learning_rate": 1.9980230928194856e-05, "loss": 1.352, "step": 1652 }, { "epoch": 0.49233976805227203, "grad_norm": 0.15699228644371033, "learning_rate": 1.998017026043123e-05, "loss": 1.3537, "step": 1653 }, { "epoch": 0.49263761425194064, "grad_norm": 0.16018475592136383, "learning_rate": 1.9980109499813264e-05, "loss": 1.3615, "step": 1654 }, { "epoch": 0.4929354604516093, "grad_norm": 0.14704987406730652, "learning_rate": 1.9980048646341538e-05, "loss": 1.3527, "step": 1655 }, { "epoch": 0.49323330665127796, "grad_norm": 0.15027841925621033, "learning_rate": 1.9979987700016603e-05, "loss": 1.354, "step": 1656 }, { "epoch": 0.4935311528509466, "grad_norm": 0.16247746348381042, "learning_rate": 1.997992666083903e-05, "loss": 1.3538, "step": 1657 }, { "epoch": 0.4938289990506152, "grad_norm": 0.15666894614696503, "learning_rate": 1.99798655288094e-05, "loss": 1.3289, "step": 1658 }, { "epoch": 0.4941268452502839, "grad_norm": 0.16850797832012177, "learning_rate": 1.9979804303928265e-05, "loss": 1.368, "step": 1659 }, { "epoch": 0.49442469144995255, "grad_norm": 0.16110515594482422, "learning_rate": 1.9979742986196204e-05, "loss": 1.3601, "step": 1660 }, { "epoch": 0.49472253764962115, "grad_norm": 0.15504297614097595, "learning_rate": 1.997968157561378e-05, "loss": 1.369, "step": 1661 }, { "epoch": 0.4950203838492898, "grad_norm": 0.16364294290542603, "learning_rate": 1.9979620072181575e-05, "loss": 1.3712, "step": 1662 }, { "epoch": 0.4953182300489585, "grad_norm": 0.1554390788078308, "learning_rate": 1.9979558475900148e-05, "loss": 1.3637, "step": 1663 }, { "epoch": 0.49561607624862714, "grad_norm": 0.16341114044189453, "learning_rate": 1.9979496786770087e-05, "loss": 1.3444, "step": 1664 }, { "epoch": 0.49591392244829574, "grad_norm": 0.1561581939458847, "learning_rate": 1.9979435004791953e-05, "loss": 1.3548, "step": 1665 }, { "epoch": 0.4962117686479644, "grad_norm": 0.15423549711704254, "learning_rate": 1.9979373129966326e-05, "loss": 1.3446, "step": 1666 }, { "epoch": 0.49650961484763306, "grad_norm": 0.15586353838443756, "learning_rate": 1.9979311162293783e-05, "loss": 1.3403, "step": 1667 }, { "epoch": 0.4968074610473017, "grad_norm": 0.16967377066612244, "learning_rate": 1.99792491017749e-05, "loss": 1.3516, "step": 1668 }, { "epoch": 0.49710530724697033, "grad_norm": 0.16152755916118622, "learning_rate": 1.9979186948410253e-05, "loss": 1.3511, "step": 1669 }, { "epoch": 0.497403153446639, "grad_norm": 0.17606371641159058, "learning_rate": 1.997912470220042e-05, "loss": 1.3558, "step": 1670 }, { "epoch": 0.49770099964630765, "grad_norm": 0.15710416436195374, "learning_rate": 1.9979062363145982e-05, "loss": 1.3468, "step": 1671 }, { "epoch": 0.49799884584597626, "grad_norm": 0.15560346841812134, "learning_rate": 1.9978999931247514e-05, "loss": 1.3348, "step": 1672 }, { "epoch": 0.4982966920456449, "grad_norm": 0.16255123913288116, "learning_rate": 1.9978937406505606e-05, "loss": 1.3595, "step": 1673 }, { "epoch": 0.4985945382453136, "grad_norm": 0.15652534365653992, "learning_rate": 1.9978874788920834e-05, "loss": 1.3554, "step": 1674 }, { "epoch": 0.49889238444498224, "grad_norm": 0.16340097784996033, "learning_rate": 1.997881207849378e-05, "loss": 1.3469, "step": 1675 }, { "epoch": 0.49919023064465085, "grad_norm": 0.1603468805551529, "learning_rate": 1.9978749275225026e-05, "loss": 1.3467, "step": 1676 }, { "epoch": 0.4994880768443195, "grad_norm": 0.1565481275320053, "learning_rate": 1.9978686379115163e-05, "loss": 1.3547, "step": 1677 }, { "epoch": 0.49978592304398817, "grad_norm": 0.15879780054092407, "learning_rate": 1.997862339016477e-05, "loss": 1.323, "step": 1678 }, { "epoch": 0.5000837692436568, "grad_norm": 0.1717914640903473, "learning_rate": 1.9978560308374436e-05, "loss": 1.3532, "step": 1679 }, { "epoch": 0.5003816154433255, "grad_norm": 0.15800116956233978, "learning_rate": 1.9978497133744744e-05, "loss": 1.3479, "step": 1680 }, { "epoch": 0.5006794616429942, "grad_norm": 0.16885773837566376, "learning_rate": 1.9978433866276288e-05, "loss": 1.3539, "step": 1681 }, { "epoch": 0.5009773078426627, "grad_norm": 0.16035017371177673, "learning_rate": 1.9978370505969655e-05, "loss": 1.3442, "step": 1682 }, { "epoch": 0.5012751540423314, "grad_norm": 0.1676723062992096, "learning_rate": 1.997830705282543e-05, "loss": 1.3344, "step": 1683 }, { "epoch": 0.501573000242, "grad_norm": 0.16242334246635437, "learning_rate": 1.997824350684421e-05, "loss": 1.3613, "step": 1684 }, { "epoch": 0.5018708464416687, "grad_norm": 0.1588880866765976, "learning_rate": 1.9978179868026575e-05, "loss": 1.3472, "step": 1685 }, { "epoch": 0.5021686926413373, "grad_norm": 0.1659652143716812, "learning_rate": 1.9978116136373133e-05, "loss": 1.3659, "step": 1686 }, { "epoch": 0.502466538841006, "grad_norm": 0.17029404640197754, "learning_rate": 1.9978052311884463e-05, "loss": 1.3457, "step": 1687 }, { "epoch": 0.5027643850406747, "grad_norm": 0.16028033196926117, "learning_rate": 1.9977988394561165e-05, "loss": 1.375, "step": 1688 }, { "epoch": 0.5030622312403432, "grad_norm": 0.15996789932250977, "learning_rate": 1.9977924384403836e-05, "loss": 1.3547, "step": 1689 }, { "epoch": 0.5033600774400119, "grad_norm": 0.15632674098014832, "learning_rate": 1.9977860281413064e-05, "loss": 1.3592, "step": 1690 }, { "epoch": 0.5036579236396805, "grad_norm": 0.15774105489253998, "learning_rate": 1.9977796085589453e-05, "loss": 1.3592, "step": 1691 }, { "epoch": 0.5039557698393492, "grad_norm": 0.16142427921295166, "learning_rate": 1.99777317969336e-05, "loss": 1.3548, "step": 1692 }, { "epoch": 0.5042536160390179, "grad_norm": 0.15816092491149902, "learning_rate": 1.9977667415446096e-05, "loss": 1.3401, "step": 1693 }, { "epoch": 0.5045514622386865, "grad_norm": 0.15842948853969574, "learning_rate": 1.9977602941127546e-05, "loss": 1.3418, "step": 1694 }, { "epoch": 0.5048493084383552, "grad_norm": 0.16587843000888824, "learning_rate": 1.9977538373978548e-05, "loss": 1.3317, "step": 1695 }, { "epoch": 0.5051471546380238, "grad_norm": 0.15935342013835907, "learning_rate": 1.9977473713999703e-05, "loss": 1.341, "step": 1696 }, { "epoch": 0.5054450008376924, "grad_norm": 0.16304226219654083, "learning_rate": 1.997740896119161e-05, "loss": 1.3679, "step": 1697 }, { "epoch": 0.505742847037361, "grad_norm": 0.1632426381111145, "learning_rate": 1.997734411555488e-05, "loss": 1.3499, "step": 1698 }, { "epoch": 0.5060406932370297, "grad_norm": 0.1575668901205063, "learning_rate": 1.9977279177090105e-05, "loss": 1.3439, "step": 1699 }, { "epoch": 0.5063385394366984, "grad_norm": 0.1599178910255432, "learning_rate": 1.9977214145797898e-05, "loss": 1.3397, "step": 1700 }, { "epoch": 0.506636385636367, "grad_norm": 0.16731704771518707, "learning_rate": 1.997714902167886e-05, "loss": 1.3654, "step": 1701 }, { "epoch": 0.5069342318360357, "grad_norm": 0.16202615201473236, "learning_rate": 1.9977083804733595e-05, "loss": 1.3339, "step": 1702 }, { "epoch": 0.5072320780357044, "grad_norm": 0.16238966584205627, "learning_rate": 1.9977018494962715e-05, "loss": 1.3503, "step": 1703 }, { "epoch": 0.5075299242353729, "grad_norm": 0.16439203917980194, "learning_rate": 1.9976953092366825e-05, "loss": 1.3343, "step": 1704 }, { "epoch": 0.5078277704350416, "grad_norm": 0.15659132599830627, "learning_rate": 1.9976887596946533e-05, "loss": 1.3342, "step": 1705 }, { "epoch": 0.5081256166347102, "grad_norm": 0.15232855081558228, "learning_rate": 1.9976822008702445e-05, "loss": 1.3394, "step": 1706 }, { "epoch": 0.5084234628343789, "grad_norm": 0.16024251282215118, "learning_rate": 1.9976756327635178e-05, "loss": 1.3604, "step": 1707 }, { "epoch": 0.5087213090340476, "grad_norm": 0.16114826500415802, "learning_rate": 1.997669055374534e-05, "loss": 1.3595, "step": 1708 }, { "epoch": 0.5090191552337162, "grad_norm": 0.15960678458213806, "learning_rate": 1.9976624687033543e-05, "loss": 1.3651, "step": 1709 }, { "epoch": 0.5093170014333849, "grad_norm": 0.16880513727664948, "learning_rate": 1.99765587275004e-05, "loss": 1.3491, "step": 1710 }, { "epoch": 0.5096148476330535, "grad_norm": 0.16581284999847412, "learning_rate": 1.9976492675146522e-05, "loss": 1.3592, "step": 1711 }, { "epoch": 0.5099126938327221, "grad_norm": 0.16696377098560333, "learning_rate": 1.9976426529972532e-05, "loss": 1.3508, "step": 1712 }, { "epoch": 0.5102105400323907, "grad_norm": 0.16760198771953583, "learning_rate": 1.9976360291979034e-05, "loss": 1.3533, "step": 1713 }, { "epoch": 0.5105083862320594, "grad_norm": 0.16133247315883636, "learning_rate": 1.9976293961166654e-05, "loss": 1.3476, "step": 1714 }, { "epoch": 0.5108062324317281, "grad_norm": 0.1651097983121872, "learning_rate": 1.9976227537536003e-05, "loss": 1.3574, "step": 1715 }, { "epoch": 0.5111040786313967, "grad_norm": 0.16226840019226074, "learning_rate": 1.99761610210877e-05, "loss": 1.3441, "step": 1716 }, { "epoch": 0.5114019248310654, "grad_norm": 0.1548374593257904, "learning_rate": 1.9976094411822364e-05, "loss": 1.3421, "step": 1717 }, { "epoch": 0.511699771030734, "grad_norm": 0.1616935282945633, "learning_rate": 1.9976027709740616e-05, "loss": 1.353, "step": 1718 }, { "epoch": 0.5119976172304026, "grad_norm": 0.1775294840335846, "learning_rate": 1.9975960914843075e-05, "loss": 1.3583, "step": 1719 }, { "epoch": 0.5122954634300713, "grad_norm": 0.17687584459781647, "learning_rate": 1.9975894027130367e-05, "loss": 1.3575, "step": 1720 }, { "epoch": 0.5125933096297399, "grad_norm": 0.1619470715522766, "learning_rate": 1.997582704660311e-05, "loss": 1.3461, "step": 1721 }, { "epoch": 0.5128911558294086, "grad_norm": 0.1800096482038498, "learning_rate": 1.9975759973261924e-05, "loss": 1.3674, "step": 1722 }, { "epoch": 0.5131890020290772, "grad_norm": 0.178078293800354, "learning_rate": 1.997569280710744e-05, "loss": 1.3309, "step": 1723 }, { "epoch": 0.5134868482287459, "grad_norm": 0.16881874203681946, "learning_rate": 1.997562554814028e-05, "loss": 1.3411, "step": 1724 }, { "epoch": 0.5137846944284146, "grad_norm": 0.1666990965604782, "learning_rate": 1.997555819636107e-05, "loss": 1.3465, "step": 1725 }, { "epoch": 0.5140825406280832, "grad_norm": 0.17623895406723022, "learning_rate": 1.9975490751770436e-05, "loss": 1.3122, "step": 1726 }, { "epoch": 0.5143803868277518, "grad_norm": 0.17145301401615143, "learning_rate": 1.9975423214369004e-05, "loss": 1.3348, "step": 1727 }, { "epoch": 0.5146782330274204, "grad_norm": 0.15384571254253387, "learning_rate": 1.997535558415741e-05, "loss": 1.3451, "step": 1728 }, { "epoch": 0.5149760792270891, "grad_norm": 0.17020882666110992, "learning_rate": 1.9975287861136272e-05, "loss": 1.3414, "step": 1729 }, { "epoch": 0.5152739254267578, "grad_norm": 0.17712000012397766, "learning_rate": 1.9975220045306227e-05, "loss": 1.3489, "step": 1730 }, { "epoch": 0.5155717716264264, "grad_norm": 0.16047564148902893, "learning_rate": 1.9975152136667902e-05, "loss": 1.348, "step": 1731 }, { "epoch": 0.5158696178260951, "grad_norm": 0.15981508791446686, "learning_rate": 1.9975084135221933e-05, "loss": 1.3523, "step": 1732 }, { "epoch": 0.5161674640257637, "grad_norm": 0.1657506823539734, "learning_rate": 1.9975016040968952e-05, "loss": 1.3432, "step": 1733 }, { "epoch": 0.5164653102254323, "grad_norm": 0.17202956974506378, "learning_rate": 1.997494785390959e-05, "loss": 1.3542, "step": 1734 }, { "epoch": 0.516763156425101, "grad_norm": 0.16407586634159088, "learning_rate": 1.9974879574044484e-05, "loss": 1.3394, "step": 1735 }, { "epoch": 0.5170610026247696, "grad_norm": 0.16045711934566498, "learning_rate": 1.9974811201374267e-05, "loss": 1.3498, "step": 1736 }, { "epoch": 0.5173588488244383, "grad_norm": 0.17223873734474182, "learning_rate": 1.997474273589958e-05, "loss": 1.3409, "step": 1737 }, { "epoch": 0.5176566950241069, "grad_norm": 0.15774434804916382, "learning_rate": 1.9974674177621053e-05, "loss": 1.3427, "step": 1738 }, { "epoch": 0.5179545412237756, "grad_norm": 0.16195274889469147, "learning_rate": 1.9974605526539326e-05, "loss": 1.347, "step": 1739 }, { "epoch": 0.5182523874234443, "grad_norm": 0.16589990258216858, "learning_rate": 1.997453678265504e-05, "loss": 1.3386, "step": 1740 }, { "epoch": 0.5185502336231128, "grad_norm": 0.15620534121990204, "learning_rate": 1.9974467945968835e-05, "loss": 1.3648, "step": 1741 }, { "epoch": 0.5188480798227815, "grad_norm": 0.16260677576065063, "learning_rate": 1.997439901648135e-05, "loss": 1.3383, "step": 1742 }, { "epoch": 0.5191459260224501, "grad_norm": 0.16320344805717468, "learning_rate": 1.9974329994193225e-05, "loss": 1.3355, "step": 1743 }, { "epoch": 0.5194437722221188, "grad_norm": 0.15730075538158417, "learning_rate": 1.9974260879105104e-05, "loss": 1.3454, "step": 1744 }, { "epoch": 0.5197416184217875, "grad_norm": 0.17475290596485138, "learning_rate": 1.9974191671217627e-05, "loss": 1.3376, "step": 1745 }, { "epoch": 0.5200394646214561, "grad_norm": 0.16563285887241364, "learning_rate": 1.997412237053144e-05, "loss": 1.3473, "step": 1746 }, { "epoch": 0.5203373108211248, "grad_norm": 0.16842181980609894, "learning_rate": 1.9974052977047193e-05, "loss": 1.3772, "step": 1747 }, { "epoch": 0.5206351570207934, "grad_norm": 0.17073404788970947, "learning_rate": 1.9973983490765522e-05, "loss": 1.3426, "step": 1748 }, { "epoch": 0.520933003220462, "grad_norm": 0.1694171130657196, "learning_rate": 1.997391391168708e-05, "loss": 1.3568, "step": 1749 }, { "epoch": 0.5212308494201306, "grad_norm": 0.16218890249729156, "learning_rate": 1.9973844239812516e-05, "loss": 1.3251, "step": 1750 }, { "epoch": 0.5215286956197993, "grad_norm": 0.1597883403301239, "learning_rate": 1.997377447514247e-05, "loss": 1.3427, "step": 1751 }, { "epoch": 0.521826541819468, "grad_norm": 0.16062845289707184, "learning_rate": 1.9973704617677597e-05, "loss": 1.3403, "step": 1752 }, { "epoch": 0.5221243880191366, "grad_norm": 0.1634770780801773, "learning_rate": 1.9973634667418548e-05, "loss": 1.3456, "step": 1753 }, { "epoch": 0.5224222342188053, "grad_norm": 0.16171061992645264, "learning_rate": 1.997356462436597e-05, "loss": 1.3569, "step": 1754 }, { "epoch": 0.522720080418474, "grad_norm": 0.16133779287338257, "learning_rate": 1.9973494488520514e-05, "loss": 1.3392, "step": 1755 }, { "epoch": 0.5230179266181425, "grad_norm": 0.15806788206100464, "learning_rate": 1.9973424259882837e-05, "loss": 1.3471, "step": 1756 }, { "epoch": 0.5233157728178112, "grad_norm": 0.16078951954841614, "learning_rate": 1.9973353938453592e-05, "loss": 1.3389, "step": 1757 }, { "epoch": 0.5236136190174798, "grad_norm": 0.16487789154052734, "learning_rate": 1.9973283524233425e-05, "loss": 1.3473, "step": 1758 }, { "epoch": 0.5239114652171485, "grad_norm": 0.16977691650390625, "learning_rate": 1.9973213017223005e-05, "loss": 1.3508, "step": 1759 }, { "epoch": 0.5242093114168171, "grad_norm": 0.15756413340568542, "learning_rate": 1.9973142417422978e-05, "loss": 1.3423, "step": 1760 }, { "epoch": 0.5245071576164858, "grad_norm": 0.1605219691991806, "learning_rate": 1.9973071724834002e-05, "loss": 1.3485, "step": 1761 }, { "epoch": 0.5248050038161545, "grad_norm": 0.15755310654640198, "learning_rate": 1.997300093945674e-05, "loss": 1.351, "step": 1762 }, { "epoch": 0.5251028500158231, "grad_norm": 0.15491753816604614, "learning_rate": 1.9972930061291845e-05, "loss": 1.3274, "step": 1763 }, { "epoch": 0.5254006962154917, "grad_norm": 0.16542217135429382, "learning_rate": 1.9972859090339975e-05, "loss": 1.3576, "step": 1764 }, { "epoch": 0.5256985424151603, "grad_norm": 0.15959101915359497, "learning_rate": 1.9972788026601798e-05, "loss": 1.3443, "step": 1765 }, { "epoch": 0.525996388614829, "grad_norm": 0.16338451206684113, "learning_rate": 1.9972716870077966e-05, "loss": 1.3387, "step": 1766 }, { "epoch": 0.5262942348144977, "grad_norm": 0.15877531468868256, "learning_rate": 1.9972645620769148e-05, "loss": 1.3399, "step": 1767 }, { "epoch": 0.5265920810141663, "grad_norm": 0.1649436354637146, "learning_rate": 1.9972574278676006e-05, "loss": 1.3414, "step": 1768 }, { "epoch": 0.526889927213835, "grad_norm": 0.15953277051448822, "learning_rate": 1.9972502843799204e-05, "loss": 1.3607, "step": 1769 }, { "epoch": 0.5271877734135036, "grad_norm": 0.16934038698673248, "learning_rate": 1.99724313161394e-05, "loss": 1.3285, "step": 1770 }, { "epoch": 0.5274856196131722, "grad_norm": 0.16570153832435608, "learning_rate": 1.9972359695697267e-05, "loss": 1.3334, "step": 1771 }, { "epoch": 0.5277834658128409, "grad_norm": 0.16746912896633148, "learning_rate": 1.9972287982473468e-05, "loss": 1.3472, "step": 1772 }, { "epoch": 0.5280813120125095, "grad_norm": 0.15530602633953094, "learning_rate": 1.997221617646867e-05, "loss": 1.3438, "step": 1773 }, { "epoch": 0.5283791582121782, "grad_norm": 0.15870961546897888, "learning_rate": 1.9972144277683545e-05, "loss": 1.3346, "step": 1774 }, { "epoch": 0.5286770044118468, "grad_norm": 0.1621575653553009, "learning_rate": 1.997207228611876e-05, "loss": 1.3416, "step": 1775 }, { "epoch": 0.5289748506115155, "grad_norm": 0.16383720934391022, "learning_rate": 1.997200020177498e-05, "loss": 1.3502, "step": 1776 }, { "epoch": 0.5292726968111842, "grad_norm": 0.17416946589946747, "learning_rate": 1.997192802465288e-05, "loss": 1.3438, "step": 1777 }, { "epoch": 0.5295705430108528, "grad_norm": 0.16202545166015625, "learning_rate": 1.9971855754753134e-05, "loss": 1.37, "step": 1778 }, { "epoch": 0.5298683892105214, "grad_norm": 0.17150531709194183, "learning_rate": 1.9971783392076407e-05, "loss": 1.3343, "step": 1779 }, { "epoch": 0.53016623541019, "grad_norm": 0.16482296586036682, "learning_rate": 1.997171093662338e-05, "loss": 1.3461, "step": 1780 }, { "epoch": 0.5304640816098587, "grad_norm": 0.1628810316324234, "learning_rate": 1.9971638388394724e-05, "loss": 1.3454, "step": 1781 }, { "epoch": 0.5307619278095274, "grad_norm": 0.17356710135936737, "learning_rate": 1.997156574739111e-05, "loss": 1.3525, "step": 1782 }, { "epoch": 0.531059774009196, "grad_norm": 0.17725849151611328, "learning_rate": 1.997149301361322e-05, "loss": 1.3641, "step": 1783 }, { "epoch": 0.5313576202088647, "grad_norm": 0.15778744220733643, "learning_rate": 1.997142018706173e-05, "loss": 1.3506, "step": 1784 }, { "epoch": 0.5316554664085333, "grad_norm": 0.17386850714683533, "learning_rate": 1.9971347267737314e-05, "loss": 1.3454, "step": 1785 }, { "epoch": 0.5319533126082019, "grad_norm": 0.1833541989326477, "learning_rate": 1.9971274255640654e-05, "loss": 1.3333, "step": 1786 }, { "epoch": 0.5322511588078706, "grad_norm": 0.16736625134944916, "learning_rate": 1.9971201150772426e-05, "loss": 1.345, "step": 1787 }, { "epoch": 0.5325490050075392, "grad_norm": 0.16242630779743195, "learning_rate": 1.9971127953133314e-05, "loss": 1.3425, "step": 1788 }, { "epoch": 0.5328468512072079, "grad_norm": 0.16493670642375946, "learning_rate": 1.9971054662723996e-05, "loss": 1.3522, "step": 1789 }, { "epoch": 0.5331446974068765, "grad_norm": 0.17540118098258972, "learning_rate": 1.997098127954515e-05, "loss": 1.3214, "step": 1790 }, { "epoch": 0.5334425436065452, "grad_norm": 0.19233962893486023, "learning_rate": 1.997090780359747e-05, "loss": 1.3451, "step": 1791 }, { "epoch": 0.5337403898062139, "grad_norm": 0.16943323612213135, "learning_rate": 1.9970834234881628e-05, "loss": 1.3303, "step": 1792 }, { "epoch": 0.5340382360058825, "grad_norm": 0.15818580985069275, "learning_rate": 1.9970760573398316e-05, "loss": 1.3337, "step": 1793 }, { "epoch": 0.5343360822055511, "grad_norm": 0.17102812230587006, "learning_rate": 1.9970686819148216e-05, "loss": 1.3577, "step": 1794 }, { "epoch": 0.5346339284052197, "grad_norm": 0.17434453964233398, "learning_rate": 1.9970612972132017e-05, "loss": 1.3541, "step": 1795 }, { "epoch": 0.5349317746048884, "grad_norm": 0.16895638406276703, "learning_rate": 1.99705390323504e-05, "loss": 1.3363, "step": 1796 }, { "epoch": 0.535229620804557, "grad_norm": 0.1675790697336197, "learning_rate": 1.9970464999804063e-05, "loss": 1.3452, "step": 1797 }, { "epoch": 0.5355274670042257, "grad_norm": 0.1745259016752243, "learning_rate": 1.9970390874493685e-05, "loss": 1.3344, "step": 1798 }, { "epoch": 0.5358253132038944, "grad_norm": 0.16959160566329956, "learning_rate": 1.9970316656419958e-05, "loss": 1.3404, "step": 1799 }, { "epoch": 0.536123159403563, "grad_norm": 0.1652490347623825, "learning_rate": 1.9970242345583573e-05, "loss": 1.3326, "step": 1800 }, { "epoch": 0.5364210056032316, "grad_norm": 0.15833565592765808, "learning_rate": 1.997016794198523e-05, "loss": 1.3373, "step": 1801 }, { "epoch": 0.5367188518029002, "grad_norm": 0.1646917462348938, "learning_rate": 1.9970093445625607e-05, "loss": 1.3328, "step": 1802 }, { "epoch": 0.5370166980025689, "grad_norm": 0.17969022691249847, "learning_rate": 1.9970018856505408e-05, "loss": 1.3521, "step": 1803 }, { "epoch": 0.5373145442022376, "grad_norm": 0.16079260408878326, "learning_rate": 1.996994417462532e-05, "loss": 1.3534, "step": 1804 }, { "epoch": 0.5376123904019062, "grad_norm": 0.16265247762203217, "learning_rate": 1.9969869399986043e-05, "loss": 1.3398, "step": 1805 }, { "epoch": 0.5379102366015749, "grad_norm": 0.1635185331106186, "learning_rate": 1.9969794532588266e-05, "loss": 1.3457, "step": 1806 }, { "epoch": 0.5382080828012435, "grad_norm": 0.16827929019927979, "learning_rate": 1.996971957243269e-05, "loss": 1.3486, "step": 1807 }, { "epoch": 0.5385059290009122, "grad_norm": 0.15943558514118195, "learning_rate": 1.9969644519520014e-05, "loss": 1.3529, "step": 1808 }, { "epoch": 0.5388037752005808, "grad_norm": 0.15358304977416992, "learning_rate": 1.9969569373850935e-05, "loss": 1.3428, "step": 1809 }, { "epoch": 0.5391016214002494, "grad_norm": 0.1591339260339737, "learning_rate": 1.9969494135426155e-05, "loss": 1.3291, "step": 1810 }, { "epoch": 0.5393994675999181, "grad_norm": 0.16745908558368683, "learning_rate": 1.9969418804246367e-05, "loss": 1.3446, "step": 1811 }, { "epoch": 0.5396973137995867, "grad_norm": 0.17378199100494385, "learning_rate": 1.996934338031227e-05, "loss": 1.3346, "step": 1812 }, { "epoch": 0.5399951599992554, "grad_norm": 0.16124950349330902, "learning_rate": 1.996926786362458e-05, "loss": 1.3564, "step": 1813 }, { "epoch": 0.5402930061989241, "grad_norm": 0.17586320638656616, "learning_rate": 1.996919225418399e-05, "loss": 1.3647, "step": 1814 }, { "epoch": 0.5405908523985927, "grad_norm": 0.1691247671842575, "learning_rate": 1.9969116551991197e-05, "loss": 1.3437, "step": 1815 }, { "epoch": 0.5408886985982613, "grad_norm": 0.1737504005432129, "learning_rate": 1.996904075704692e-05, "loss": 1.3238, "step": 1816 }, { "epoch": 0.5411865447979299, "grad_norm": 0.15709450840950012, "learning_rate": 1.9968964869351855e-05, "loss": 1.3419, "step": 1817 }, { "epoch": 0.5414843909975986, "grad_norm": 0.16622452437877655, "learning_rate": 1.9968888888906707e-05, "loss": 1.3457, "step": 1818 }, { "epoch": 0.5417822371972673, "grad_norm": 0.16880926489830017, "learning_rate": 1.996881281571219e-05, "loss": 1.3461, "step": 1819 }, { "epoch": 0.5420800833969359, "grad_norm": 0.17418015003204346, "learning_rate": 1.9968736649769005e-05, "loss": 1.3623, "step": 1820 }, { "epoch": 0.5423779295966046, "grad_norm": 0.17431408166885376, "learning_rate": 1.9968660391077864e-05, "loss": 1.3654, "step": 1821 }, { "epoch": 0.5426757757962732, "grad_norm": 0.16442149877548218, "learning_rate": 1.9968584039639475e-05, "loss": 1.3441, "step": 1822 }, { "epoch": 0.5429736219959418, "grad_norm": 0.17375442385673523, "learning_rate": 1.996850759545455e-05, "loss": 1.3502, "step": 1823 }, { "epoch": 0.5432714681956105, "grad_norm": 0.1641933023929596, "learning_rate": 1.99684310585238e-05, "loss": 1.3461, "step": 1824 }, { "epoch": 0.5435693143952791, "grad_norm": 0.1642332673072815, "learning_rate": 1.9968354428847934e-05, "loss": 1.3395, "step": 1825 }, { "epoch": 0.5438671605949478, "grad_norm": 0.179438978433609, "learning_rate": 1.9968277706427667e-05, "loss": 1.3522, "step": 1826 }, { "epoch": 0.5441650067946164, "grad_norm": 0.16452936828136444, "learning_rate": 1.9968200891263717e-05, "loss": 1.3381, "step": 1827 }, { "epoch": 0.5444628529942851, "grad_norm": 0.16768696904182434, "learning_rate": 1.9968123983356794e-05, "loss": 1.3439, "step": 1828 }, { "epoch": 0.5447606991939538, "grad_norm": 0.17807155847549438, "learning_rate": 1.996804698270761e-05, "loss": 1.3341, "step": 1829 }, { "epoch": 0.5450585453936224, "grad_norm": 0.17662352323532104, "learning_rate": 1.996796988931689e-05, "loss": 1.3335, "step": 1830 }, { "epoch": 0.545356391593291, "grad_norm": 0.16839559376239777, "learning_rate": 1.9967892703185344e-05, "loss": 1.3476, "step": 1831 }, { "epoch": 0.5456542377929596, "grad_norm": 0.1740187257528305, "learning_rate": 1.9967815424313697e-05, "loss": 1.3451, "step": 1832 }, { "epoch": 0.5459520839926283, "grad_norm": 0.1738300770521164, "learning_rate": 1.9967738052702664e-05, "loss": 1.3345, "step": 1833 }, { "epoch": 0.546249930192297, "grad_norm": 0.17655032873153687, "learning_rate": 1.996766058835296e-05, "loss": 1.3256, "step": 1834 }, { "epoch": 0.5465477763919656, "grad_norm": 0.16478820145130157, "learning_rate": 1.9967583031265313e-05, "loss": 1.348, "step": 1835 }, { "epoch": 0.5468456225916343, "grad_norm": 0.16392803192138672, "learning_rate": 1.9967505381440446e-05, "loss": 1.3548, "step": 1836 }, { "epoch": 0.5471434687913029, "grad_norm": 0.17150482535362244, "learning_rate": 1.9967427638879075e-05, "loss": 1.3404, "step": 1837 }, { "epoch": 0.5474413149909715, "grad_norm": 0.17070432007312775, "learning_rate": 1.9967349803581924e-05, "loss": 1.3376, "step": 1838 }, { "epoch": 0.5477391611906401, "grad_norm": 0.16423381865024567, "learning_rate": 1.9967271875549726e-05, "loss": 1.3388, "step": 1839 }, { "epoch": 0.5480370073903088, "grad_norm": 0.16895762085914612, "learning_rate": 1.9967193854783192e-05, "loss": 1.3328, "step": 1840 }, { "epoch": 0.5483348535899775, "grad_norm": 0.16752618551254272, "learning_rate": 1.9967115741283057e-05, "loss": 1.347, "step": 1841 }, { "epoch": 0.5486326997896461, "grad_norm": 0.16780757904052734, "learning_rate": 1.996703753505005e-05, "loss": 1.329, "step": 1842 }, { "epoch": 0.5489305459893148, "grad_norm": 0.16428565979003906, "learning_rate": 1.996695923608489e-05, "loss": 1.346, "step": 1843 }, { "epoch": 0.5492283921889834, "grad_norm": 0.172488272190094, "learning_rate": 1.996688084438831e-05, "loss": 1.3474, "step": 1844 }, { "epoch": 0.5495262383886521, "grad_norm": 0.16258852183818817, "learning_rate": 1.9966802359961042e-05, "loss": 1.3384, "step": 1845 }, { "epoch": 0.5498240845883207, "grad_norm": 0.16376498341560364, "learning_rate": 1.996672378280381e-05, "loss": 1.3475, "step": 1846 }, { "epoch": 0.5501219307879893, "grad_norm": 0.1541081815958023, "learning_rate": 1.996664511291735e-05, "loss": 1.35, "step": 1847 }, { "epoch": 0.550419776987658, "grad_norm": 0.16525718569755554, "learning_rate": 1.9966566350302398e-05, "loss": 1.3377, "step": 1848 }, { "epoch": 0.5507176231873266, "grad_norm": 0.16358032822608948, "learning_rate": 1.9966487494959678e-05, "loss": 1.3246, "step": 1849 }, { "epoch": 0.5510154693869953, "grad_norm": 0.15843936800956726, "learning_rate": 1.9966408546889924e-05, "loss": 1.3415, "step": 1850 }, { "epoch": 0.551313315586664, "grad_norm": 0.1719946563243866, "learning_rate": 1.9966329506093876e-05, "loss": 1.3264, "step": 1851 }, { "epoch": 0.5516111617863326, "grad_norm": 0.1743927001953125, "learning_rate": 1.9966250372572265e-05, "loss": 1.3387, "step": 1852 }, { "epoch": 0.5519090079860012, "grad_norm": 0.17628605663776398, "learning_rate": 1.9966171146325832e-05, "loss": 1.3414, "step": 1853 }, { "epoch": 0.5522068541856698, "grad_norm": 0.16940726339817047, "learning_rate": 1.9966091827355312e-05, "loss": 1.3334, "step": 1854 }, { "epoch": 0.5525047003853385, "grad_norm": 0.16338200867176056, "learning_rate": 1.9966012415661444e-05, "loss": 1.3202, "step": 1855 }, { "epoch": 0.5528025465850072, "grad_norm": 0.15881620347499847, "learning_rate": 1.996593291124496e-05, "loss": 1.342, "step": 1856 }, { "epoch": 0.5531003927846758, "grad_norm": 0.16228406131267548, "learning_rate": 1.9965853314106606e-05, "loss": 1.3443, "step": 1857 }, { "epoch": 0.5533982389843445, "grad_norm": 0.16766801476478577, "learning_rate": 1.996577362424712e-05, "loss": 1.3513, "step": 1858 }, { "epoch": 0.5536960851840131, "grad_norm": 0.16810661554336548, "learning_rate": 1.996569384166725e-05, "loss": 1.3309, "step": 1859 }, { "epoch": 0.5539939313836818, "grad_norm": 0.1693679839372635, "learning_rate": 1.9965613966367726e-05, "loss": 1.3379, "step": 1860 }, { "epoch": 0.5542917775833504, "grad_norm": 0.15733188390731812, "learning_rate": 1.9965533998349306e-05, "loss": 1.3275, "step": 1861 }, { "epoch": 0.554589623783019, "grad_norm": 0.16854235529899597, "learning_rate": 1.996545393761272e-05, "loss": 1.351, "step": 1862 }, { "epoch": 0.5548874699826877, "grad_norm": 0.16890855133533478, "learning_rate": 1.9965373784158725e-05, "loss": 1.3444, "step": 1863 }, { "epoch": 0.5551853161823563, "grad_norm": 0.17377708852291107, "learning_rate": 1.9965293537988058e-05, "loss": 1.3455, "step": 1864 }, { "epoch": 0.555483162382025, "grad_norm": 0.17160733044147491, "learning_rate": 1.9965213199101466e-05, "loss": 1.3326, "step": 1865 }, { "epoch": 0.5557810085816937, "grad_norm": 0.17008385062217712, "learning_rate": 1.99651327674997e-05, "loss": 1.3565, "step": 1866 }, { "epoch": 0.5560788547813623, "grad_norm": 0.16323962807655334, "learning_rate": 1.996505224318351e-05, "loss": 1.3564, "step": 1867 }, { "epoch": 0.5563767009810309, "grad_norm": 0.16453659534454346, "learning_rate": 1.9964971626153643e-05, "loss": 1.3408, "step": 1868 }, { "epoch": 0.5566745471806995, "grad_norm": 0.17312097549438477, "learning_rate": 1.9964890916410846e-05, "loss": 1.3462, "step": 1869 }, { "epoch": 0.5569723933803682, "grad_norm": 0.17335082590579987, "learning_rate": 1.9964810113955872e-05, "loss": 1.3358, "step": 1870 }, { "epoch": 0.5572702395800369, "grad_norm": 0.17735369503498077, "learning_rate": 1.9964729218789472e-05, "loss": 1.3436, "step": 1871 }, { "epoch": 0.5575680857797055, "grad_norm": 0.17080743610858917, "learning_rate": 1.9964648230912406e-05, "loss": 1.3452, "step": 1872 }, { "epoch": 0.5578659319793742, "grad_norm": 0.17747481167316437, "learning_rate": 1.9964567150325416e-05, "loss": 1.3336, "step": 1873 }, { "epoch": 0.5581637781790428, "grad_norm": 0.1720564216375351, "learning_rate": 1.9964485977029263e-05, "loss": 1.33, "step": 1874 }, { "epoch": 0.5584616243787115, "grad_norm": 0.16504305601119995, "learning_rate": 1.9964404711024703e-05, "loss": 1.348, "step": 1875 }, { "epoch": 0.55875947057838, "grad_norm": 0.1772458851337433, "learning_rate": 1.9964323352312486e-05, "loss": 1.3474, "step": 1876 }, { "epoch": 0.5590573167780487, "grad_norm": 0.1816224902868271, "learning_rate": 1.9964241900893377e-05, "loss": 1.3217, "step": 1877 }, { "epoch": 0.5593551629777174, "grad_norm": 0.1705823540687561, "learning_rate": 1.9964160356768128e-05, "loss": 1.3346, "step": 1878 }, { "epoch": 0.559653009177386, "grad_norm": 0.1707306206226349, "learning_rate": 1.9964078719937497e-05, "loss": 1.3369, "step": 1879 }, { "epoch": 0.5599508553770547, "grad_norm": 0.17467527091503143, "learning_rate": 1.9963996990402252e-05, "loss": 1.3468, "step": 1880 }, { "epoch": 0.5602487015767234, "grad_norm": 0.17778559029102325, "learning_rate": 1.9963915168163143e-05, "loss": 1.3525, "step": 1881 }, { "epoch": 0.560546547776392, "grad_norm": 0.17405925691127777, "learning_rate": 1.9963833253220937e-05, "loss": 1.3235, "step": 1882 }, { "epoch": 0.5608443939760606, "grad_norm": 0.1665666401386261, "learning_rate": 1.9963751245576396e-05, "loss": 1.3342, "step": 1883 }, { "epoch": 0.5611422401757292, "grad_norm": 0.16782276332378387, "learning_rate": 1.996366914523028e-05, "loss": 1.3436, "step": 1884 }, { "epoch": 0.5614400863753979, "grad_norm": 0.16133250296115875, "learning_rate": 1.9963586952183355e-05, "loss": 1.3572, "step": 1885 }, { "epoch": 0.5617379325750665, "grad_norm": 0.17694924771785736, "learning_rate": 1.9963504666436386e-05, "loss": 1.3491, "step": 1886 }, { "epoch": 0.5620357787747352, "grad_norm": 0.17527422308921814, "learning_rate": 1.9963422287990134e-05, "loss": 1.3376, "step": 1887 }, { "epoch": 0.5623336249744039, "grad_norm": 0.17014721035957336, "learning_rate": 1.9963339816845377e-05, "loss": 1.3289, "step": 1888 }, { "epoch": 0.5626314711740725, "grad_norm": 0.164009228348732, "learning_rate": 1.9963257253002868e-05, "loss": 1.3403, "step": 1889 }, { "epoch": 0.5629293173737412, "grad_norm": 0.2767203450202942, "learning_rate": 1.9963174596463387e-05, "loss": 1.3266, "step": 1890 }, { "epoch": 0.5632271635734097, "grad_norm": 0.17292539775371552, "learning_rate": 1.9963091847227694e-05, "loss": 1.3648, "step": 1891 }, { "epoch": 0.5635250097730784, "grad_norm": 0.17129047214984894, "learning_rate": 1.9963009005296565e-05, "loss": 1.335, "step": 1892 }, { "epoch": 0.5638228559727471, "grad_norm": 0.16882653534412384, "learning_rate": 1.9962926070670767e-05, "loss": 1.3381, "step": 1893 }, { "epoch": 0.5641207021724157, "grad_norm": 0.17744049429893494, "learning_rate": 1.996284304335107e-05, "loss": 1.3374, "step": 1894 }, { "epoch": 0.5644185483720844, "grad_norm": 0.17141692340373993, "learning_rate": 1.996275992333826e-05, "loss": 1.3348, "step": 1895 }, { "epoch": 0.564716394571753, "grad_norm": 0.16697026789188385, "learning_rate": 1.9962676710633093e-05, "loss": 1.332, "step": 1896 }, { "epoch": 0.5650142407714217, "grad_norm": 0.1682598888874054, "learning_rate": 1.996259340523635e-05, "loss": 1.3302, "step": 1897 }, { "epoch": 0.5653120869710903, "grad_norm": 0.1695362627506256, "learning_rate": 1.9962510007148807e-05, "loss": 1.3294, "step": 1898 }, { "epoch": 0.5656099331707589, "grad_norm": 0.16629473865032196, "learning_rate": 1.9962426516371236e-05, "loss": 1.3422, "step": 1899 }, { "epoch": 0.5659077793704276, "grad_norm": 0.16506071388721466, "learning_rate": 1.996234293290442e-05, "loss": 1.3388, "step": 1900 }, { "epoch": 0.5662056255700962, "grad_norm": 0.17205072939395905, "learning_rate": 1.9962259256749135e-05, "loss": 1.3316, "step": 1901 }, { "epoch": 0.5665034717697649, "grad_norm": 0.16647745668888092, "learning_rate": 1.9962175487906155e-05, "loss": 1.3259, "step": 1902 }, { "epoch": 0.5668013179694336, "grad_norm": 0.16938139498233795, "learning_rate": 1.9962091626376265e-05, "loss": 1.3318, "step": 1903 }, { "epoch": 0.5670991641691022, "grad_norm": 0.17617148160934448, "learning_rate": 1.9962007672160243e-05, "loss": 1.3444, "step": 1904 }, { "epoch": 0.5673970103687708, "grad_norm": 0.1727476865053177, "learning_rate": 1.9961923625258867e-05, "loss": 1.35, "step": 1905 }, { "epoch": 0.5676948565684394, "grad_norm": 0.16883544623851776, "learning_rate": 1.9961839485672923e-05, "loss": 1.342, "step": 1906 }, { "epoch": 0.5679927027681081, "grad_norm": 0.17220336198806763, "learning_rate": 1.9961755253403194e-05, "loss": 1.334, "step": 1907 }, { "epoch": 0.5682905489677768, "grad_norm": 0.1599874645471573, "learning_rate": 1.996167092845046e-05, "loss": 1.339, "step": 1908 }, { "epoch": 0.5685883951674454, "grad_norm": 0.16740910708904266, "learning_rate": 1.9961586510815508e-05, "loss": 1.3516, "step": 1909 }, { "epoch": 0.5688862413671141, "grad_norm": 0.181361585855484, "learning_rate": 1.9961502000499127e-05, "loss": 1.3496, "step": 1910 }, { "epoch": 0.5691840875667827, "grad_norm": 0.17191778123378754, "learning_rate": 1.9961417397502098e-05, "loss": 1.3273, "step": 1911 }, { "epoch": 0.5694819337664514, "grad_norm": 0.1787191778421402, "learning_rate": 1.9961332701825207e-05, "loss": 1.3398, "step": 1912 }, { "epoch": 0.56977977996612, "grad_norm": 0.1790206879377365, "learning_rate": 1.9961247913469244e-05, "loss": 1.343, "step": 1913 }, { "epoch": 0.5700776261657886, "grad_norm": 0.17674620449543, "learning_rate": 1.9961163032435006e-05, "loss": 1.331, "step": 1914 }, { "epoch": 0.5703754723654573, "grad_norm": 0.17626053094863892, "learning_rate": 1.9961078058723267e-05, "loss": 1.336, "step": 1915 }, { "epoch": 0.5706733185651259, "grad_norm": 0.16902464628219604, "learning_rate": 1.9960992992334828e-05, "loss": 1.3333, "step": 1916 }, { "epoch": 0.5709711647647946, "grad_norm": 0.16353672742843628, "learning_rate": 1.996090783327048e-05, "loss": 1.3385, "step": 1917 }, { "epoch": 0.5712690109644633, "grad_norm": 0.16979794204235077, "learning_rate": 1.9960822581531013e-05, "loss": 1.3418, "step": 1918 }, { "epoch": 0.5715668571641319, "grad_norm": 0.17273689806461334, "learning_rate": 1.996073723711722e-05, "loss": 1.34, "step": 1919 }, { "epoch": 0.5718647033638005, "grad_norm": 0.17751668393611908, "learning_rate": 1.99606518000299e-05, "loss": 1.3294, "step": 1920 }, { "epoch": 0.5721625495634691, "grad_norm": 0.1674806773662567, "learning_rate": 1.9960566270269837e-05, "loss": 1.3532, "step": 1921 }, { "epoch": 0.5724603957631378, "grad_norm": 0.17393875122070312, "learning_rate": 1.9960480647837837e-05, "loss": 1.3477, "step": 1922 }, { "epoch": 0.5727582419628064, "grad_norm": 0.17875970900058746, "learning_rate": 1.9960394932734694e-05, "loss": 1.3456, "step": 1923 }, { "epoch": 0.5730560881624751, "grad_norm": 0.17763720452785492, "learning_rate": 1.9960309124961203e-05, "loss": 1.3415, "step": 1924 }, { "epoch": 0.5733539343621438, "grad_norm": 0.17421069741249084, "learning_rate": 1.9960223224518163e-05, "loss": 1.3462, "step": 1925 }, { "epoch": 0.5736517805618124, "grad_norm": 0.17774200439453125, "learning_rate": 1.9960137231406372e-05, "loss": 1.3272, "step": 1926 }, { "epoch": 0.5739496267614811, "grad_norm": 0.1704302579164505, "learning_rate": 1.996005114562664e-05, "loss": 1.3304, "step": 1927 }, { "epoch": 0.5742474729611496, "grad_norm": 0.17656168341636658, "learning_rate": 1.9959964967179753e-05, "loss": 1.3452, "step": 1928 }, { "epoch": 0.5745453191608183, "grad_norm": 0.1732204705476761, "learning_rate": 1.995987869606652e-05, "loss": 1.3266, "step": 1929 }, { "epoch": 0.574843165360487, "grad_norm": 0.17743700742721558, "learning_rate": 1.995979233228775e-05, "loss": 1.3331, "step": 1930 }, { "epoch": 0.5751410115601556, "grad_norm": 0.18953180313110352, "learning_rate": 1.9959705875844233e-05, "loss": 1.3649, "step": 1931 }, { "epoch": 0.5754388577598243, "grad_norm": 0.161458820104599, "learning_rate": 1.9959619326736786e-05, "loss": 1.339, "step": 1932 }, { "epoch": 0.575736703959493, "grad_norm": 0.16367121040821075, "learning_rate": 1.9959532684966205e-05, "loss": 1.3535, "step": 1933 }, { "epoch": 0.5760345501591616, "grad_norm": 0.1716615855693817, "learning_rate": 1.99594459505333e-05, "loss": 1.3387, "step": 1934 }, { "epoch": 0.5763323963588302, "grad_norm": 0.16995373368263245, "learning_rate": 1.995935912343888e-05, "loss": 1.3326, "step": 1935 }, { "epoch": 0.5766302425584988, "grad_norm": 0.1661783903837204, "learning_rate": 1.9959272203683747e-05, "loss": 1.3244, "step": 1936 }, { "epoch": 0.5769280887581675, "grad_norm": 0.17309847474098206, "learning_rate": 1.995918519126872e-05, "loss": 1.3429, "step": 1937 }, { "epoch": 0.5772259349578361, "grad_norm": 0.18650607764720917, "learning_rate": 1.9959098086194596e-05, "loss": 1.349, "step": 1938 }, { "epoch": 0.5775237811575048, "grad_norm": 0.1782013326883316, "learning_rate": 1.9959010888462193e-05, "loss": 1.3412, "step": 1939 }, { "epoch": 0.5778216273571735, "grad_norm": 0.17299634218215942, "learning_rate": 1.9958923598072318e-05, "loss": 1.3142, "step": 1940 }, { "epoch": 0.5781194735568421, "grad_norm": 0.17703306674957275, "learning_rate": 1.995883621502579e-05, "loss": 1.3403, "step": 1941 }, { "epoch": 0.5784173197565108, "grad_norm": 0.16884054243564606, "learning_rate": 1.9958748739323415e-05, "loss": 1.3194, "step": 1942 }, { "epoch": 0.5787151659561793, "grad_norm": 0.18963994085788727, "learning_rate": 1.995866117096601e-05, "loss": 1.346, "step": 1943 }, { "epoch": 0.579013012155848, "grad_norm": 0.18116572499275208, "learning_rate": 1.9958573509954392e-05, "loss": 1.3631, "step": 1944 }, { "epoch": 0.5793108583555167, "grad_norm": 0.17018160223960876, "learning_rate": 1.995848575628937e-05, "loss": 1.342, "step": 1945 }, { "epoch": 0.5796087045551853, "grad_norm": 0.1706002652645111, "learning_rate": 1.9958397909971765e-05, "loss": 1.3306, "step": 1946 }, { "epoch": 0.579906550754854, "grad_norm": 0.16390784084796906, "learning_rate": 1.9958309971002395e-05, "loss": 1.3362, "step": 1947 }, { "epoch": 0.5802043969545226, "grad_norm": 0.16863805055618286, "learning_rate": 1.9958221939382075e-05, "loss": 1.3433, "step": 1948 }, { "epoch": 0.5805022431541913, "grad_norm": 0.16780760884284973, "learning_rate": 1.9958133815111628e-05, "loss": 1.3519, "step": 1949 }, { "epoch": 0.5808000893538598, "grad_norm": 0.180782750248909, "learning_rate": 1.995804559819187e-05, "loss": 1.3356, "step": 1950 }, { "epoch": 0.5810979355535285, "grad_norm": 0.1618662178516388, "learning_rate": 1.9957957288623624e-05, "loss": 1.3366, "step": 1951 }, { "epoch": 0.5813957817531972, "grad_norm": 0.16837403178215027, "learning_rate": 1.995786888640771e-05, "loss": 1.3397, "step": 1952 }, { "epoch": 0.5816936279528658, "grad_norm": 0.19062912464141846, "learning_rate": 1.9957780391544953e-05, "loss": 1.3337, "step": 1953 }, { "epoch": 0.5819914741525345, "grad_norm": 0.174716517329216, "learning_rate": 1.995769180403618e-05, "loss": 1.3388, "step": 1954 }, { "epoch": 0.5822893203522032, "grad_norm": 0.1641959249973297, "learning_rate": 1.9957603123882202e-05, "loss": 1.3308, "step": 1955 }, { "epoch": 0.5825871665518718, "grad_norm": 0.16572526097297668, "learning_rate": 1.9957514351083855e-05, "loss": 1.3372, "step": 1956 }, { "epoch": 0.5828850127515405, "grad_norm": 0.16768459975719452, "learning_rate": 1.9957425485641964e-05, "loss": 1.3277, "step": 1957 }, { "epoch": 0.583182858951209, "grad_norm": 0.17460903525352478, "learning_rate": 1.995733652755735e-05, "loss": 1.3393, "step": 1958 }, { "epoch": 0.5834807051508777, "grad_norm": 0.1763729304075241, "learning_rate": 1.9957247476830846e-05, "loss": 1.3447, "step": 1959 }, { "epoch": 0.5837785513505463, "grad_norm": 0.1680125743150711, "learning_rate": 1.9957158333463283e-05, "loss": 1.3445, "step": 1960 }, { "epoch": 0.584076397550215, "grad_norm": 0.1667768508195877, "learning_rate": 1.9957069097455482e-05, "loss": 1.3216, "step": 1961 }, { "epoch": 0.5843742437498837, "grad_norm": 0.17902809381484985, "learning_rate": 1.995697976880828e-05, "loss": 1.3481, "step": 1962 }, { "epoch": 0.5846720899495523, "grad_norm": 0.17711710929870605, "learning_rate": 1.9956890347522505e-05, "loss": 1.3173, "step": 1963 }, { "epoch": 0.584969936149221, "grad_norm": 0.17662787437438965, "learning_rate": 1.995680083359899e-05, "loss": 1.3389, "step": 1964 }, { "epoch": 0.5852677823488895, "grad_norm": 0.16423983871936798, "learning_rate": 1.9956711227038567e-05, "loss": 1.3275, "step": 1965 }, { "epoch": 0.5855656285485582, "grad_norm": 0.17329730093479156, "learning_rate": 1.995662152784207e-05, "loss": 1.3334, "step": 1966 }, { "epoch": 0.5858634747482269, "grad_norm": 0.18660931289196014, "learning_rate": 1.9956531736010336e-05, "loss": 1.3476, "step": 1967 }, { "epoch": 0.5861613209478955, "grad_norm": 0.18202678859233856, "learning_rate": 1.9956441851544197e-05, "loss": 1.3495, "step": 1968 }, { "epoch": 0.5864591671475642, "grad_norm": 0.17551197111606598, "learning_rate": 1.9956351874444492e-05, "loss": 1.3353, "step": 1969 }, { "epoch": 0.5867570133472328, "grad_norm": 0.1738758683204651, "learning_rate": 1.9956261804712055e-05, "loss": 1.3453, "step": 1970 }, { "epoch": 0.5870548595469015, "grad_norm": 0.1726599484682083, "learning_rate": 1.9956171642347725e-05, "loss": 1.325, "step": 1971 }, { "epoch": 0.5873527057465702, "grad_norm": 0.1872359812259674, "learning_rate": 1.9956081387352343e-05, "loss": 1.3345, "step": 1972 }, { "epoch": 0.5876505519462387, "grad_norm": 0.1731291115283966, "learning_rate": 1.9955991039726745e-05, "loss": 1.3277, "step": 1973 }, { "epoch": 0.5879483981459074, "grad_norm": 0.16684368252754211, "learning_rate": 1.9955900599471776e-05, "loss": 1.3357, "step": 1974 }, { "epoch": 0.588246244345576, "grad_norm": 0.17079827189445496, "learning_rate": 1.9955810066588276e-05, "loss": 1.3239, "step": 1975 }, { "epoch": 0.5885440905452447, "grad_norm": 0.16931898891925812, "learning_rate": 1.9955719441077088e-05, "loss": 1.316, "step": 1976 }, { "epoch": 0.5888419367449134, "grad_norm": 0.16645090281963348, "learning_rate": 1.995562872293905e-05, "loss": 1.3357, "step": 1977 }, { "epoch": 0.589139782944582, "grad_norm": 0.18683874607086182, "learning_rate": 1.9955537912175012e-05, "loss": 1.3299, "step": 1978 }, { "epoch": 0.5894376291442507, "grad_norm": 0.1716485619544983, "learning_rate": 1.9955447008785813e-05, "loss": 1.3384, "step": 1979 }, { "epoch": 0.5897354753439192, "grad_norm": 0.17715637385845184, "learning_rate": 1.9955356012772307e-05, "loss": 1.3377, "step": 1980 }, { "epoch": 0.5900333215435879, "grad_norm": 0.1782515048980713, "learning_rate": 1.9955264924135334e-05, "loss": 1.3405, "step": 1981 }, { "epoch": 0.5903311677432566, "grad_norm": 0.1820431649684906, "learning_rate": 1.9955173742875743e-05, "loss": 1.3211, "step": 1982 }, { "epoch": 0.5906290139429252, "grad_norm": 0.18172216415405273, "learning_rate": 1.9955082468994383e-05, "loss": 1.3298, "step": 1983 }, { "epoch": 0.5909268601425939, "grad_norm": 0.17631405591964722, "learning_rate": 1.9954991102492108e-05, "loss": 1.322, "step": 1984 }, { "epoch": 0.5912247063422625, "grad_norm": 0.17888231575489044, "learning_rate": 1.995489964336976e-05, "loss": 1.322, "step": 1985 }, { "epoch": 0.5915225525419312, "grad_norm": 0.16786961257457733, "learning_rate": 1.995480809162819e-05, "loss": 1.3271, "step": 1986 }, { "epoch": 0.5918203987415998, "grad_norm": 0.18634092807769775, "learning_rate": 1.9954716447268258e-05, "loss": 1.3532, "step": 1987 }, { "epoch": 0.5921182449412684, "grad_norm": 0.1904766708612442, "learning_rate": 1.9954624710290807e-05, "loss": 1.339, "step": 1988 }, { "epoch": 0.5924160911409371, "grad_norm": 0.17535746097564697, "learning_rate": 1.9954532880696694e-05, "loss": 1.3245, "step": 1989 }, { "epoch": 0.5927139373406057, "grad_norm": 0.17284177243709564, "learning_rate": 1.9954440958486782e-05, "loss": 1.3425, "step": 1990 }, { "epoch": 0.5930117835402744, "grad_norm": 0.1806136667728424, "learning_rate": 1.995434894366191e-05, "loss": 1.3235, "step": 1991 }, { "epoch": 0.5933096297399431, "grad_norm": 0.18574415147304535, "learning_rate": 1.9954256836222946e-05, "loss": 1.3452, "step": 1992 }, { "epoch": 0.5936074759396117, "grad_norm": 0.17696474492549896, "learning_rate": 1.9954164636170747e-05, "loss": 1.336, "step": 1993 }, { "epoch": 0.5939053221392804, "grad_norm": 0.18362903594970703, "learning_rate": 1.9954072343506164e-05, "loss": 1.349, "step": 1994 }, { "epoch": 0.5942031683389489, "grad_norm": 0.166317418217659, "learning_rate": 1.9953979958230062e-05, "loss": 1.3317, "step": 1995 }, { "epoch": 0.5945010145386176, "grad_norm": 0.17155015468597412, "learning_rate": 1.9953887480343294e-05, "loss": 1.3411, "step": 1996 }, { "epoch": 0.5947988607382863, "grad_norm": 0.1783314049243927, "learning_rate": 1.9953794909846724e-05, "loss": 1.3349, "step": 1997 }, { "epoch": 0.5950967069379549, "grad_norm": 0.17705361545085907, "learning_rate": 1.9953702246741216e-05, "loss": 1.3563, "step": 1998 }, { "epoch": 0.5953945531376236, "grad_norm": 0.16366109251976013, "learning_rate": 1.995360949102763e-05, "loss": 1.3435, "step": 1999 }, { "epoch": 0.5956923993372922, "grad_norm": 0.1686832308769226, "learning_rate": 1.9953516642706827e-05, "loss": 1.3653, "step": 2000 }, { "epoch": 0.5956923993372922, "eval_loss": 1.3709481954574585, "eval_runtime": 20.4305, "eval_samples_per_second": 84.873, "eval_steps_per_second": 5.335, "step": 2000 }, { "epoch": 0.5959902455369609, "grad_norm": 0.17259104549884796, "learning_rate": 1.995342370177967e-05, "loss": 1.3281, "step": 2001 }, { "epoch": 0.5962880917366294, "grad_norm": 0.17897668480873108, "learning_rate": 1.995333066824703e-05, "loss": 1.3326, "step": 2002 }, { "epoch": 0.5965859379362981, "grad_norm": 0.1779920607805252, "learning_rate": 1.995323754210977e-05, "loss": 1.3484, "step": 2003 }, { "epoch": 0.5968837841359668, "grad_norm": 0.16980735957622528, "learning_rate": 1.995314432336875e-05, "loss": 1.3208, "step": 2004 }, { "epoch": 0.5971816303356354, "grad_norm": 0.16974031925201416, "learning_rate": 1.9953051012024845e-05, "loss": 1.3133, "step": 2005 }, { "epoch": 0.5974794765353041, "grad_norm": 0.17339594662189484, "learning_rate": 1.995295760807892e-05, "loss": 1.3286, "step": 2006 }, { "epoch": 0.5977773227349727, "grad_norm": 0.17947392165660858, "learning_rate": 1.9952864111531845e-05, "loss": 1.3467, "step": 2007 }, { "epoch": 0.5980751689346414, "grad_norm": 0.1726466864347458, "learning_rate": 1.995277052238449e-05, "loss": 1.3257, "step": 2008 }, { "epoch": 0.5983730151343101, "grad_norm": 0.17698267102241516, "learning_rate": 1.9952676840637724e-05, "loss": 1.3587, "step": 2009 }, { "epoch": 0.5986708613339786, "grad_norm": 0.17208711802959442, "learning_rate": 1.995258306629242e-05, "loss": 1.3321, "step": 2010 }, { "epoch": 0.5989687075336473, "grad_norm": 0.16823822259902954, "learning_rate": 1.995248919934945e-05, "loss": 1.3332, "step": 2011 }, { "epoch": 0.5992665537333159, "grad_norm": 0.1744266003370285, "learning_rate": 1.9952395239809686e-05, "loss": 1.3384, "step": 2012 }, { "epoch": 0.5995643999329846, "grad_norm": 0.1792793571949005, "learning_rate": 1.995230118767401e-05, "loss": 1.34, "step": 2013 }, { "epoch": 0.5998622461326533, "grad_norm": 0.17216219007968903, "learning_rate": 1.9952207042943287e-05, "loss": 1.318, "step": 2014 }, { "epoch": 0.6001600923323219, "grad_norm": 0.1686355471611023, "learning_rate": 1.9952112805618394e-05, "loss": 1.3177, "step": 2015 }, { "epoch": 0.6004579385319906, "grad_norm": 0.17162348330020905, "learning_rate": 1.9952018475700212e-05, "loss": 1.3229, "step": 2016 }, { "epoch": 0.6007557847316591, "grad_norm": 0.17360536754131317, "learning_rate": 1.995192405318962e-05, "loss": 1.3429, "step": 2017 }, { "epoch": 0.6010536309313278, "grad_norm": 0.18095779418945312, "learning_rate": 1.9951829538087492e-05, "loss": 1.3338, "step": 2018 }, { "epoch": 0.6013514771309965, "grad_norm": 0.17286114394664764, "learning_rate": 1.9951734930394705e-05, "loss": 1.3404, "step": 2019 }, { "epoch": 0.6016493233306651, "grad_norm": 0.16724084317684174, "learning_rate": 1.9951640230112146e-05, "loss": 1.3318, "step": 2020 }, { "epoch": 0.6019471695303338, "grad_norm": 0.1806597113609314, "learning_rate": 1.9951545437240698e-05, "loss": 1.3367, "step": 2021 }, { "epoch": 0.6022450157300024, "grad_norm": 0.18451228737831116, "learning_rate": 1.9951450551781236e-05, "loss": 1.3536, "step": 2022 }, { "epoch": 0.6025428619296711, "grad_norm": 0.1788858026266098, "learning_rate": 1.9951355573734643e-05, "loss": 1.3365, "step": 2023 }, { "epoch": 0.6028407081293398, "grad_norm": 0.1803123652935028, "learning_rate": 1.9951260503101803e-05, "loss": 1.3414, "step": 2024 }, { "epoch": 0.6031385543290083, "grad_norm": 0.18506979942321777, "learning_rate": 1.9951165339883606e-05, "loss": 1.3226, "step": 2025 }, { "epoch": 0.603436400528677, "grad_norm": 0.17891237139701843, "learning_rate": 1.995107008408093e-05, "loss": 1.3415, "step": 2026 }, { "epoch": 0.6037342467283456, "grad_norm": 0.18189607560634613, "learning_rate": 1.995097473569467e-05, "loss": 1.3446, "step": 2027 }, { "epoch": 0.6040320929280143, "grad_norm": 0.1696876734495163, "learning_rate": 1.9950879294725702e-05, "loss": 1.3488, "step": 2028 }, { "epoch": 0.604329939127683, "grad_norm": 0.18890951573848724, "learning_rate": 1.9950783761174922e-05, "loss": 1.3356, "step": 2029 }, { "epoch": 0.6046277853273516, "grad_norm": 0.18418404459953308, "learning_rate": 1.9950688135043217e-05, "loss": 1.3331, "step": 2030 }, { "epoch": 0.6049256315270203, "grad_norm": 0.17907729744911194, "learning_rate": 1.995059241633148e-05, "loss": 1.3351, "step": 2031 }, { "epoch": 0.6052234777266888, "grad_norm": 0.18905779719352722, "learning_rate": 1.9950496605040595e-05, "loss": 1.3254, "step": 2032 }, { "epoch": 0.6055213239263575, "grad_norm": 0.17531952261924744, "learning_rate": 1.9950400701171456e-05, "loss": 1.3402, "step": 2033 }, { "epoch": 0.6058191701260262, "grad_norm": 0.1793741136789322, "learning_rate": 1.9950304704724956e-05, "loss": 1.3354, "step": 2034 }, { "epoch": 0.6061170163256948, "grad_norm": 0.18081824481487274, "learning_rate": 1.9950208615701987e-05, "loss": 1.335, "step": 2035 }, { "epoch": 0.6064148625253635, "grad_norm": 0.17112943530082703, "learning_rate": 1.9950112434103444e-05, "loss": 1.3324, "step": 2036 }, { "epoch": 0.6067127087250321, "grad_norm": 0.17233715951442719, "learning_rate": 1.9950016159930223e-05, "loss": 1.3292, "step": 2037 }, { "epoch": 0.6070105549247008, "grad_norm": 0.16468045115470886, "learning_rate": 1.994991979318322e-05, "loss": 1.3278, "step": 2038 }, { "epoch": 0.6073084011243695, "grad_norm": 0.1741722971200943, "learning_rate": 1.9949823333863328e-05, "loss": 1.3488, "step": 2039 }, { "epoch": 0.607606247324038, "grad_norm": 0.18325020372867584, "learning_rate": 1.9949726781971446e-05, "loss": 1.3446, "step": 2040 }, { "epoch": 0.6079040935237067, "grad_norm": 0.18267396092414856, "learning_rate": 1.9949630137508472e-05, "loss": 1.3421, "step": 2041 }, { "epoch": 0.6082019397233753, "grad_norm": 0.17630638182163239, "learning_rate": 1.9949533400475307e-05, "loss": 1.3342, "step": 2042 }, { "epoch": 0.608499785923044, "grad_norm": 0.167982280254364, "learning_rate": 1.994943657087285e-05, "loss": 1.3507, "step": 2043 }, { "epoch": 0.6087976321227127, "grad_norm": 0.1815871000289917, "learning_rate": 1.9949339648702004e-05, "loss": 1.3226, "step": 2044 }, { "epoch": 0.6090954783223813, "grad_norm": 0.17143020033836365, "learning_rate": 1.9949242633963666e-05, "loss": 1.335, "step": 2045 }, { "epoch": 0.60939332452205, "grad_norm": 0.17347553372383118, "learning_rate": 1.9949145526658742e-05, "loss": 1.3327, "step": 2046 }, { "epoch": 0.6096911707217185, "grad_norm": 0.17974741756916046, "learning_rate": 1.9949048326788133e-05, "loss": 1.3419, "step": 2047 }, { "epoch": 0.6099890169213872, "grad_norm": 0.18071913719177246, "learning_rate": 1.9948951034352747e-05, "loss": 1.345, "step": 2048 }, { "epoch": 0.6102868631210558, "grad_norm": 0.18039347231388092, "learning_rate": 1.994885364935349e-05, "loss": 1.3373, "step": 2049 }, { "epoch": 0.6105847093207245, "grad_norm": 0.18226660788059235, "learning_rate": 1.9948756171791262e-05, "loss": 1.3433, "step": 2050 }, { "epoch": 0.6108825555203932, "grad_norm": 0.18637697398662567, "learning_rate": 1.994865860166697e-05, "loss": 1.3352, "step": 2051 }, { "epoch": 0.6111804017200618, "grad_norm": 0.18535496294498444, "learning_rate": 1.994856093898153e-05, "loss": 1.316, "step": 2052 }, { "epoch": 0.6114782479197305, "grad_norm": 0.1768663376569748, "learning_rate": 1.9948463183735845e-05, "loss": 1.3341, "step": 2053 }, { "epoch": 0.6117760941193991, "grad_norm": 0.17285248637199402, "learning_rate": 1.9948365335930825e-05, "loss": 1.3266, "step": 2054 }, { "epoch": 0.6120739403190677, "grad_norm": 0.1809185892343521, "learning_rate": 1.9948267395567378e-05, "loss": 1.3343, "step": 2055 }, { "epoch": 0.6123717865187364, "grad_norm": 0.18210750818252563, "learning_rate": 1.994816936264642e-05, "loss": 1.3495, "step": 2056 }, { "epoch": 0.612669632718405, "grad_norm": 0.17945021390914917, "learning_rate": 1.994807123716886e-05, "loss": 1.3395, "step": 2057 }, { "epoch": 0.6129674789180737, "grad_norm": 0.16708803176879883, "learning_rate": 1.994797301913561e-05, "loss": 1.3273, "step": 2058 }, { "epoch": 0.6132653251177423, "grad_norm": 0.17374387383460999, "learning_rate": 1.994787470854759e-05, "loss": 1.3347, "step": 2059 }, { "epoch": 0.613563171317411, "grad_norm": 0.18136759102344513, "learning_rate": 1.9947776305405708e-05, "loss": 1.3313, "step": 2060 }, { "epoch": 0.6138610175170797, "grad_norm": 0.18151119351387024, "learning_rate": 1.9947677809710882e-05, "loss": 1.3469, "step": 2061 }, { "epoch": 0.6141588637167482, "grad_norm": 0.17824167013168335, "learning_rate": 1.994757922146403e-05, "loss": 1.3299, "step": 2062 }, { "epoch": 0.6144567099164169, "grad_norm": 0.17063666880130768, "learning_rate": 1.9947480540666064e-05, "loss": 1.3083, "step": 2063 }, { "epoch": 0.6147545561160855, "grad_norm": 0.17322927713394165, "learning_rate": 1.9947381767317907e-05, "loss": 1.3258, "step": 2064 }, { "epoch": 0.6150524023157542, "grad_norm": 0.18017977476119995, "learning_rate": 1.9947282901420477e-05, "loss": 1.3336, "step": 2065 }, { "epoch": 0.6153502485154229, "grad_norm": 0.17630837857723236, "learning_rate": 1.9947183942974693e-05, "loss": 1.3229, "step": 2066 }, { "epoch": 0.6156480947150915, "grad_norm": 0.1698862910270691, "learning_rate": 1.9947084891981476e-05, "loss": 1.3391, "step": 2067 }, { "epoch": 0.6159459409147602, "grad_norm": 0.16928812861442566, "learning_rate": 1.9946985748441747e-05, "loss": 1.3545, "step": 2068 }, { "epoch": 0.6162437871144287, "grad_norm": 0.17287898063659668, "learning_rate": 1.994688651235643e-05, "loss": 1.3445, "step": 2069 }, { "epoch": 0.6165416333140974, "grad_norm": 0.1712704598903656, "learning_rate": 1.9946787183726445e-05, "loss": 1.3156, "step": 2070 }, { "epoch": 0.616839479513766, "grad_norm": 0.18089932203292847, "learning_rate": 1.994668776255272e-05, "loss": 1.3223, "step": 2071 }, { "epoch": 0.6171373257134347, "grad_norm": 0.16370098292827606, "learning_rate": 1.994658824883618e-05, "loss": 1.3522, "step": 2072 }, { "epoch": 0.6174351719131034, "grad_norm": 0.18175539374351501, "learning_rate": 1.9946488642577747e-05, "loss": 1.3253, "step": 2073 }, { "epoch": 0.617733018112772, "grad_norm": 0.16670680046081543, "learning_rate": 1.9946388943778353e-05, "loss": 1.3271, "step": 2074 }, { "epoch": 0.6180308643124407, "grad_norm": 0.16849905252456665, "learning_rate": 1.994628915243892e-05, "loss": 1.3338, "step": 2075 }, { "epoch": 0.6183287105121094, "grad_norm": 0.1773916482925415, "learning_rate": 1.994618926856038e-05, "loss": 1.3301, "step": 2076 }, { "epoch": 0.6186265567117779, "grad_norm": 0.167177215218544, "learning_rate": 1.994608929214366e-05, "loss": 1.3293, "step": 2077 }, { "epoch": 0.6189244029114466, "grad_norm": 0.17059874534606934, "learning_rate": 1.9945989223189694e-05, "loss": 1.3206, "step": 2078 }, { "epoch": 0.6192222491111152, "grad_norm": 0.17021021246910095, "learning_rate": 1.994588906169941e-05, "loss": 1.3296, "step": 2079 }, { "epoch": 0.6195200953107839, "grad_norm": 0.17559467256069183, "learning_rate": 1.994578880767374e-05, "loss": 1.3289, "step": 2080 }, { "epoch": 0.6198179415104526, "grad_norm": 0.17063923180103302, "learning_rate": 1.994568846111362e-05, "loss": 1.3201, "step": 2081 }, { "epoch": 0.6201157877101212, "grad_norm": 0.17661602795124054, "learning_rate": 1.9945588022019975e-05, "loss": 1.326, "step": 2082 }, { "epoch": 0.6204136339097899, "grad_norm": 0.17022280395030975, "learning_rate": 1.994548749039375e-05, "loss": 1.3284, "step": 2083 }, { "epoch": 0.6207114801094584, "grad_norm": 0.18009376525878906, "learning_rate": 1.9945386866235874e-05, "loss": 1.3187, "step": 2084 }, { "epoch": 0.6210093263091271, "grad_norm": 0.167107954621315, "learning_rate": 1.9945286149547284e-05, "loss": 1.3351, "step": 2085 }, { "epoch": 0.6213071725087957, "grad_norm": 0.1685510128736496, "learning_rate": 1.994518534032892e-05, "loss": 1.3186, "step": 2086 }, { "epoch": 0.6216050187084644, "grad_norm": 0.179209366440773, "learning_rate": 1.9945084438581713e-05, "loss": 1.3394, "step": 2087 }, { "epoch": 0.6219028649081331, "grad_norm": 0.18145005404949188, "learning_rate": 1.9944983444306613e-05, "loss": 1.3336, "step": 2088 }, { "epoch": 0.6222007111078017, "grad_norm": 0.17497749626636505, "learning_rate": 1.994488235750455e-05, "loss": 1.3488, "step": 2089 }, { "epoch": 0.6224985573074704, "grad_norm": 0.1645064353942871, "learning_rate": 1.9944781178176468e-05, "loss": 1.3264, "step": 2090 }, { "epoch": 0.622796403507139, "grad_norm": 0.17959046363830566, "learning_rate": 1.9944679906323307e-05, "loss": 1.3192, "step": 2091 }, { "epoch": 0.6230942497068076, "grad_norm": 0.17209501564502716, "learning_rate": 1.994457854194601e-05, "loss": 1.3409, "step": 2092 }, { "epoch": 0.6233920959064763, "grad_norm": 0.16878920793533325, "learning_rate": 1.9944477085045525e-05, "loss": 1.3499, "step": 2093 }, { "epoch": 0.6236899421061449, "grad_norm": 0.17653900384902954, "learning_rate": 1.994437553562279e-05, "loss": 1.3289, "step": 2094 }, { "epoch": 0.6239877883058136, "grad_norm": 0.1805211454629898, "learning_rate": 1.9944273893678748e-05, "loss": 1.3096, "step": 2095 }, { "epoch": 0.6242856345054822, "grad_norm": 0.18547116219997406, "learning_rate": 1.994417215921435e-05, "loss": 1.3339, "step": 2096 }, { "epoch": 0.6245834807051509, "grad_norm": 0.17186765372753143, "learning_rate": 1.994407033223054e-05, "loss": 1.3263, "step": 2097 }, { "epoch": 0.6248813269048196, "grad_norm": 0.18666228652000427, "learning_rate": 1.9943968412728262e-05, "loss": 1.3403, "step": 2098 }, { "epoch": 0.6251791731044881, "grad_norm": 0.1880766749382019, "learning_rate": 1.9943866400708473e-05, "loss": 1.3348, "step": 2099 }, { "epoch": 0.6254770193041568, "grad_norm": 0.17504388093948364, "learning_rate": 1.9943764296172116e-05, "loss": 1.3265, "step": 2100 }, { "epoch": 0.6257748655038254, "grad_norm": 0.18056386709213257, "learning_rate": 1.994366209912014e-05, "loss": 1.3333, "step": 2101 }, { "epoch": 0.6260727117034941, "grad_norm": 0.18163588643074036, "learning_rate": 1.9943559809553502e-05, "loss": 1.327, "step": 2102 }, { "epoch": 0.6263705579031628, "grad_norm": 0.1701747626066208, "learning_rate": 1.9943457427473146e-05, "loss": 1.3363, "step": 2103 }, { "epoch": 0.6266684041028314, "grad_norm": 0.1721438467502594, "learning_rate": 1.9943354952880028e-05, "loss": 1.3382, "step": 2104 }, { "epoch": 0.6269662503025001, "grad_norm": 0.16902436316013336, "learning_rate": 1.9943252385775103e-05, "loss": 1.3363, "step": 2105 }, { "epoch": 0.6272640965021687, "grad_norm": 0.17550300061702728, "learning_rate": 1.9943149726159326e-05, "loss": 1.3274, "step": 2106 }, { "epoch": 0.6275619427018373, "grad_norm": 0.17495177686214447, "learning_rate": 1.9943046974033647e-05, "loss": 1.3417, "step": 2107 }, { "epoch": 0.627859788901506, "grad_norm": 0.17364050447940826, "learning_rate": 1.9942944129399023e-05, "loss": 1.3178, "step": 2108 }, { "epoch": 0.6281576351011746, "grad_norm": 0.16875362396240234, "learning_rate": 1.9942841192256415e-05, "loss": 1.3265, "step": 2109 }, { "epoch": 0.6284554813008433, "grad_norm": 0.17429284751415253, "learning_rate": 1.994273816260678e-05, "loss": 1.32, "step": 2110 }, { "epoch": 0.6287533275005119, "grad_norm": 0.17145192623138428, "learning_rate": 1.994263504045107e-05, "loss": 1.3319, "step": 2111 }, { "epoch": 0.6290511737001806, "grad_norm": 0.16923145949840546, "learning_rate": 1.9942531825790254e-05, "loss": 1.3371, "step": 2112 }, { "epoch": 0.6293490198998493, "grad_norm": 0.17293810844421387, "learning_rate": 1.994242851862529e-05, "loss": 1.3186, "step": 2113 }, { "epoch": 0.6296468660995178, "grad_norm": 0.1745597869157791, "learning_rate": 1.9942325118957133e-05, "loss": 1.3202, "step": 2114 }, { "epoch": 0.6299447122991865, "grad_norm": 0.17113369703292847, "learning_rate": 1.994222162678675e-05, "loss": 1.3371, "step": 2115 }, { "epoch": 0.6302425584988551, "grad_norm": 0.17446771264076233, "learning_rate": 1.9942118042115104e-05, "loss": 1.3452, "step": 2116 }, { "epoch": 0.6305404046985238, "grad_norm": 0.178619846701622, "learning_rate": 1.9942014364943154e-05, "loss": 1.3223, "step": 2117 }, { "epoch": 0.6308382508981925, "grad_norm": 0.18409490585327148, "learning_rate": 1.9941910595271872e-05, "loss": 1.3224, "step": 2118 }, { "epoch": 0.6311360970978611, "grad_norm": 0.1795678585767746, "learning_rate": 1.9941806733102217e-05, "loss": 1.3296, "step": 2119 }, { "epoch": 0.6314339432975298, "grad_norm": 0.1839311569929123, "learning_rate": 1.994170277843516e-05, "loss": 1.3274, "step": 2120 }, { "epoch": 0.6317317894971984, "grad_norm": 0.17433764040470123, "learning_rate": 1.9941598731271665e-05, "loss": 1.3118, "step": 2121 }, { "epoch": 0.632029635696867, "grad_norm": 0.18274016678333282, "learning_rate": 1.9941494591612702e-05, "loss": 1.3092, "step": 2122 }, { "epoch": 0.6323274818965356, "grad_norm": 0.16890452802181244, "learning_rate": 1.994139035945924e-05, "loss": 1.3396, "step": 2123 }, { "epoch": 0.6326253280962043, "grad_norm": 0.18805302679538727, "learning_rate": 1.9941286034812248e-05, "loss": 1.3313, "step": 2124 }, { "epoch": 0.632923174295873, "grad_norm": 0.17017240822315216, "learning_rate": 1.9941181617672694e-05, "loss": 1.328, "step": 2125 }, { "epoch": 0.6332210204955416, "grad_norm": 0.18405862152576447, "learning_rate": 1.9941077108041555e-05, "loss": 1.3443, "step": 2126 }, { "epoch": 0.6335188666952103, "grad_norm": 0.16606247425079346, "learning_rate": 1.9940972505919797e-05, "loss": 1.3268, "step": 2127 }, { "epoch": 0.633816712894879, "grad_norm": 0.17940346896648407, "learning_rate": 1.9940867811308398e-05, "loss": 1.334, "step": 2128 }, { "epoch": 0.6341145590945475, "grad_norm": 0.1704801768064499, "learning_rate": 1.994076302420833e-05, "loss": 1.329, "step": 2129 }, { "epoch": 0.6344124052942162, "grad_norm": 0.1802673041820526, "learning_rate": 1.994065814462057e-05, "loss": 1.3335, "step": 2130 }, { "epoch": 0.6347102514938848, "grad_norm": 0.17566320300102234, "learning_rate": 1.9940553172546088e-05, "loss": 1.3195, "step": 2131 }, { "epoch": 0.6350080976935535, "grad_norm": 0.177176833152771, "learning_rate": 1.9940448107985873e-05, "loss": 1.3275, "step": 2132 }, { "epoch": 0.6353059438932221, "grad_norm": 0.17318475246429443, "learning_rate": 1.994034295094089e-05, "loss": 1.3281, "step": 2133 }, { "epoch": 0.6356037900928908, "grad_norm": 0.17585478723049164, "learning_rate": 1.994023770141212e-05, "loss": 1.3368, "step": 2134 }, { "epoch": 0.6359016362925595, "grad_norm": 0.1724289357662201, "learning_rate": 1.9940132359400545e-05, "loss": 1.3273, "step": 2135 }, { "epoch": 0.6361994824922281, "grad_norm": 0.17516650259494781, "learning_rate": 1.9940026924907144e-05, "loss": 1.3307, "step": 2136 }, { "epoch": 0.6364973286918967, "grad_norm": 0.17116110026836395, "learning_rate": 1.99399213979329e-05, "loss": 1.338, "step": 2137 }, { "epoch": 0.6367951748915653, "grad_norm": 0.1668197512626648, "learning_rate": 1.9939815778478792e-05, "loss": 1.3455, "step": 2138 }, { "epoch": 0.637093021091234, "grad_norm": 0.17479157447814941, "learning_rate": 1.9939710066545804e-05, "loss": 1.3205, "step": 2139 }, { "epoch": 0.6373908672909027, "grad_norm": 0.17087820172309875, "learning_rate": 1.993960426213492e-05, "loss": 1.3041, "step": 2140 }, { "epoch": 0.6376887134905713, "grad_norm": 0.18054068088531494, "learning_rate": 1.993949836524712e-05, "loss": 1.3279, "step": 2141 }, { "epoch": 0.63798655969024, "grad_norm": 0.17079319059848785, "learning_rate": 1.99393923758834e-05, "loss": 1.3237, "step": 2142 }, { "epoch": 0.6382844058899086, "grad_norm": 0.171514630317688, "learning_rate": 1.9939286294044732e-05, "loss": 1.3254, "step": 2143 }, { "epoch": 0.6385822520895772, "grad_norm": 0.16941453516483307, "learning_rate": 1.993918011973211e-05, "loss": 1.3338, "step": 2144 }, { "epoch": 0.6388800982892459, "grad_norm": 0.16755352914333344, "learning_rate": 1.9939073852946524e-05, "loss": 1.3251, "step": 2145 }, { "epoch": 0.6391779444889145, "grad_norm": 0.17181557416915894, "learning_rate": 1.993896749368896e-05, "loss": 1.3394, "step": 2146 }, { "epoch": 0.6394757906885832, "grad_norm": 0.18224985897541046, "learning_rate": 1.9938861041960406e-05, "loss": 1.3363, "step": 2147 }, { "epoch": 0.6397736368882518, "grad_norm": 0.1785338670015335, "learning_rate": 1.993875449776186e-05, "loss": 1.3164, "step": 2148 }, { "epoch": 0.6400714830879205, "grad_norm": 0.17881037294864655, "learning_rate": 1.9938647861094302e-05, "loss": 1.3178, "step": 2149 }, { "epoch": 0.6403693292875892, "grad_norm": 0.17853541672229767, "learning_rate": 1.9938541131958732e-05, "loss": 1.3294, "step": 2150 }, { "epoch": 0.6406671754872577, "grad_norm": 0.16955041885375977, "learning_rate": 1.993843431035614e-05, "loss": 1.337, "step": 2151 }, { "epoch": 0.6409650216869264, "grad_norm": 0.5713686943054199, "learning_rate": 1.993832739628752e-05, "loss": 1.3335, "step": 2152 }, { "epoch": 0.641262867886595, "grad_norm": 0.18174128234386444, "learning_rate": 1.993822038975387e-05, "loss": 1.3162, "step": 2153 }, { "epoch": 0.6415607140862637, "grad_norm": 0.1834786981344223, "learning_rate": 1.993811329075618e-05, "loss": 1.3347, "step": 2154 }, { "epoch": 0.6418585602859324, "grad_norm": 0.18102896213531494, "learning_rate": 1.993800609929545e-05, "loss": 1.3424, "step": 2155 }, { "epoch": 0.642156406485601, "grad_norm": 0.17838533222675323, "learning_rate": 1.9937898815372677e-05, "loss": 1.3353, "step": 2156 }, { "epoch": 0.6424542526852697, "grad_norm": 0.18635553121566772, "learning_rate": 1.993779143898886e-05, "loss": 1.338, "step": 2157 }, { "epoch": 0.6427520988849383, "grad_norm": 0.19375599920749664, "learning_rate": 1.9937683970144996e-05, "loss": 1.357, "step": 2158 }, { "epoch": 0.6430499450846069, "grad_norm": 0.16988781094551086, "learning_rate": 1.9937576408842087e-05, "loss": 1.3301, "step": 2159 }, { "epoch": 0.6433477912842755, "grad_norm": 0.184706449508667, "learning_rate": 1.993746875508113e-05, "loss": 1.3288, "step": 2160 }, { "epoch": 0.6436456374839442, "grad_norm": 0.1785503625869751, "learning_rate": 1.993736100886313e-05, "loss": 1.3381, "step": 2161 }, { "epoch": 0.6439434836836129, "grad_norm": 0.1770181804895401, "learning_rate": 1.993725317018909e-05, "loss": 1.3259, "step": 2162 }, { "epoch": 0.6442413298832815, "grad_norm": 0.1877453774213791, "learning_rate": 1.993714523906001e-05, "loss": 1.337, "step": 2163 }, { "epoch": 0.6445391760829502, "grad_norm": 0.18047162890434265, "learning_rate": 1.9937037215476895e-05, "loss": 1.334, "step": 2164 }, { "epoch": 0.6448370222826189, "grad_norm": 0.21972842514514923, "learning_rate": 1.993692909944075e-05, "loss": 1.3383, "step": 2165 }, { "epoch": 0.6451348684822874, "grad_norm": 0.17394456267356873, "learning_rate": 1.9936820890952585e-05, "loss": 1.3304, "step": 2166 }, { "epoch": 0.6454327146819561, "grad_norm": 0.18222405016422272, "learning_rate": 1.9936712590013404e-05, "loss": 1.3265, "step": 2167 }, { "epoch": 0.6457305608816247, "grad_norm": 0.18246474862098694, "learning_rate": 1.9936604196624214e-05, "loss": 1.3251, "step": 2168 }, { "epoch": 0.6460284070812934, "grad_norm": 0.17890970408916473, "learning_rate": 1.9936495710786025e-05, "loss": 1.3271, "step": 2169 }, { "epoch": 0.646326253280962, "grad_norm": 0.17982813715934753, "learning_rate": 1.9936387132499838e-05, "loss": 1.3299, "step": 2170 }, { "epoch": 0.6466240994806307, "grad_norm": 0.18518179655075073, "learning_rate": 1.993627846176668e-05, "loss": 1.3258, "step": 2171 }, { "epoch": 0.6469219456802994, "grad_norm": 0.17826440930366516, "learning_rate": 1.9936169698587546e-05, "loss": 1.3174, "step": 2172 }, { "epoch": 0.647219791879968, "grad_norm": 0.17979653179645538, "learning_rate": 1.9936060842963456e-05, "loss": 1.3278, "step": 2173 }, { "epoch": 0.6475176380796366, "grad_norm": 0.17953205108642578, "learning_rate": 1.993595189489542e-05, "loss": 1.3514, "step": 2174 }, { "epoch": 0.6478154842793052, "grad_norm": 0.18435202538967133, "learning_rate": 1.9935842854384456e-05, "loss": 1.3157, "step": 2175 }, { "epoch": 0.6481133304789739, "grad_norm": 0.17402073740959167, "learning_rate": 1.993573372143157e-05, "loss": 1.3356, "step": 2176 }, { "epoch": 0.6484111766786426, "grad_norm": 0.17155030369758606, "learning_rate": 1.9935624496037786e-05, "loss": 1.317, "step": 2177 }, { "epoch": 0.6487090228783112, "grad_norm": 0.17052295804023743, "learning_rate": 1.9935515178204118e-05, "loss": 1.3164, "step": 2178 }, { "epoch": 0.6490068690779799, "grad_norm": 0.1768856942653656, "learning_rate": 1.9935405767931582e-05, "loss": 1.3451, "step": 2179 }, { "epoch": 0.6493047152776485, "grad_norm": 0.18772312998771667, "learning_rate": 1.9935296265221192e-05, "loss": 1.3215, "step": 2180 }, { "epoch": 0.6496025614773171, "grad_norm": 0.1783144325017929, "learning_rate": 1.9935186670073975e-05, "loss": 1.3476, "step": 2181 }, { "epoch": 0.6499004076769858, "grad_norm": 0.17431962490081787, "learning_rate": 1.9935076982490943e-05, "loss": 1.3408, "step": 2182 }, { "epoch": 0.6501982538766544, "grad_norm": 0.18071846663951874, "learning_rate": 1.993496720247312e-05, "loss": 1.337, "step": 2183 }, { "epoch": 0.6504961000763231, "grad_norm": 0.17916467785835266, "learning_rate": 1.993485733002153e-05, "loss": 1.344, "step": 2184 }, { "epoch": 0.6507939462759917, "grad_norm": 0.181587815284729, "learning_rate": 1.993474736513719e-05, "loss": 1.3243, "step": 2185 }, { "epoch": 0.6510917924756604, "grad_norm": 0.16496814787387848, "learning_rate": 1.9934637307821126e-05, "loss": 1.3287, "step": 2186 }, { "epoch": 0.6513896386753291, "grad_norm": 0.18435817956924438, "learning_rate": 1.993452715807436e-05, "loss": 1.3095, "step": 2187 }, { "epoch": 0.6516874848749977, "grad_norm": 0.17921589314937592, "learning_rate": 1.993441691589792e-05, "loss": 1.3267, "step": 2188 }, { "epoch": 0.6519853310746663, "grad_norm": 0.17948828637599945, "learning_rate": 1.993430658129283e-05, "loss": 1.3222, "step": 2189 }, { "epoch": 0.6522831772743349, "grad_norm": 0.18039970099925995, "learning_rate": 1.9934196154260114e-05, "loss": 1.3252, "step": 2190 }, { "epoch": 0.6525810234740036, "grad_norm": 0.16772134602069855, "learning_rate": 1.9934085634800807e-05, "loss": 1.3329, "step": 2191 }, { "epoch": 0.6528788696736723, "grad_norm": 0.17711152136325836, "learning_rate": 1.993397502291593e-05, "loss": 1.3336, "step": 2192 }, { "epoch": 0.6531767158733409, "grad_norm": 0.1912129521369934, "learning_rate": 1.9933864318606514e-05, "loss": 1.3189, "step": 2193 }, { "epoch": 0.6534745620730096, "grad_norm": 0.18480518460273743, "learning_rate": 1.9933753521873587e-05, "loss": 1.3287, "step": 2194 }, { "epoch": 0.6537724082726782, "grad_norm": 0.17623695731163025, "learning_rate": 1.9933642632718185e-05, "loss": 1.3193, "step": 2195 }, { "epoch": 0.6540702544723468, "grad_norm": 0.1776251345872879, "learning_rate": 1.9933531651141335e-05, "loss": 1.3404, "step": 2196 }, { "epoch": 0.6543681006720155, "grad_norm": 0.18505097925662994, "learning_rate": 1.9933420577144075e-05, "loss": 1.3214, "step": 2197 }, { "epoch": 0.6546659468716841, "grad_norm": 0.19477857649326324, "learning_rate": 1.9933309410727427e-05, "loss": 1.3391, "step": 2198 }, { "epoch": 0.6549637930713528, "grad_norm": 0.1819305121898651, "learning_rate": 1.993319815189244e-05, "loss": 1.3302, "step": 2199 }, { "epoch": 0.6552616392710214, "grad_norm": 0.18122519552707672, "learning_rate": 1.993308680064014e-05, "loss": 1.3501, "step": 2200 }, { "epoch": 0.6555594854706901, "grad_norm": 0.1732597053050995, "learning_rate": 1.993297535697157e-05, "loss": 1.3256, "step": 2201 }, { "epoch": 0.6558573316703588, "grad_norm": 0.18337799608707428, "learning_rate": 1.9932863820887753e-05, "loss": 1.3349, "step": 2202 }, { "epoch": 0.6561551778700274, "grad_norm": 0.18442195653915405, "learning_rate": 1.9932752192389743e-05, "loss": 1.3359, "step": 2203 }, { "epoch": 0.656453024069696, "grad_norm": 0.17236138880252838, "learning_rate": 1.9932640471478568e-05, "loss": 1.3318, "step": 2204 }, { "epoch": 0.6567508702693646, "grad_norm": 0.18082675337791443, "learning_rate": 1.993252865815527e-05, "loss": 1.3377, "step": 2205 }, { "epoch": 0.6570487164690333, "grad_norm": 0.18028658628463745, "learning_rate": 1.9932416752420895e-05, "loss": 1.3268, "step": 2206 }, { "epoch": 0.657346562668702, "grad_norm": 0.18233519792556763, "learning_rate": 1.9932304754276473e-05, "loss": 1.3501, "step": 2207 }, { "epoch": 0.6576444088683706, "grad_norm": 0.1773044466972351, "learning_rate": 1.9932192663723054e-05, "loss": 1.3387, "step": 2208 }, { "epoch": 0.6579422550680393, "grad_norm": 0.17857183516025543, "learning_rate": 1.993208048076168e-05, "loss": 1.327, "step": 2209 }, { "epoch": 0.6582401012677079, "grad_norm": 0.18105275928974152, "learning_rate": 1.9931968205393398e-05, "loss": 1.3268, "step": 2210 }, { "epoch": 0.6585379474673765, "grad_norm": 0.17181296646595, "learning_rate": 1.993185583761924e-05, "loss": 1.3236, "step": 2211 }, { "epoch": 0.6588357936670451, "grad_norm": 0.1748933494091034, "learning_rate": 1.993174337744027e-05, "loss": 1.3407, "step": 2212 }, { "epoch": 0.6591336398667138, "grad_norm": 0.18497586250305176, "learning_rate": 1.993163082485752e-05, "loss": 1.3093, "step": 2213 }, { "epoch": 0.6594314860663825, "grad_norm": 0.17400768399238586, "learning_rate": 1.993151817987204e-05, "loss": 1.3278, "step": 2214 }, { "epoch": 0.6597293322660511, "grad_norm": 0.18220269680023193, "learning_rate": 1.993140544248488e-05, "loss": 1.3406, "step": 2215 }, { "epoch": 0.6600271784657198, "grad_norm": 0.17420974373817444, "learning_rate": 1.993129261269709e-05, "loss": 1.336, "step": 2216 }, { "epoch": 0.6603250246653884, "grad_norm": 0.17776700854301453, "learning_rate": 1.9931179690509714e-05, "loss": 1.3396, "step": 2217 }, { "epoch": 0.6606228708650571, "grad_norm": 0.16965252161026, "learning_rate": 1.9931066675923808e-05, "loss": 1.3432, "step": 2218 }, { "epoch": 0.6609207170647257, "grad_norm": 0.19087284803390503, "learning_rate": 1.9930953568940424e-05, "loss": 1.3202, "step": 2219 }, { "epoch": 0.6612185632643943, "grad_norm": 0.18144509196281433, "learning_rate": 1.9930840369560612e-05, "loss": 1.3085, "step": 2220 }, { "epoch": 0.661516409464063, "grad_norm": 0.22314777970314026, "learning_rate": 1.9930727077785427e-05, "loss": 1.3226, "step": 2221 }, { "epoch": 0.6618142556637316, "grad_norm": 0.18070575594902039, "learning_rate": 1.993061369361592e-05, "loss": 1.3152, "step": 2222 }, { "epoch": 0.6621121018634003, "grad_norm": 0.18086224794387817, "learning_rate": 1.9930500217053147e-05, "loss": 1.3316, "step": 2223 }, { "epoch": 0.662409948063069, "grad_norm": 0.18728743493556976, "learning_rate": 1.993038664809817e-05, "loss": 1.3261, "step": 2224 }, { "epoch": 0.6627077942627376, "grad_norm": 0.17994599044322968, "learning_rate": 1.9930272986752036e-05, "loss": 1.3346, "step": 2225 }, { "epoch": 0.6630056404624062, "grad_norm": 0.1903691291809082, "learning_rate": 1.9930159233015805e-05, "loss": 1.3491, "step": 2226 }, { "epoch": 0.6633034866620748, "grad_norm": 0.18117789924144745, "learning_rate": 1.993004538689054e-05, "loss": 1.3057, "step": 2227 }, { "epoch": 0.6636013328617435, "grad_norm": 0.17975282669067383, "learning_rate": 1.9929931448377292e-05, "loss": 1.3228, "step": 2228 }, { "epoch": 0.6638991790614122, "grad_norm": 0.18567296862602234, "learning_rate": 1.9929817417477132e-05, "loss": 1.3328, "step": 2229 }, { "epoch": 0.6641970252610808, "grad_norm": 0.18861307203769684, "learning_rate": 1.9929703294191115e-05, "loss": 1.3223, "step": 2230 }, { "epoch": 0.6644948714607495, "grad_norm": 0.18120045959949493, "learning_rate": 1.99295890785203e-05, "loss": 1.346, "step": 2231 }, { "epoch": 0.6647927176604181, "grad_norm": 0.17772220075130463, "learning_rate": 1.9929474770465755e-05, "loss": 1.3255, "step": 2232 }, { "epoch": 0.6650905638600868, "grad_norm": 0.189261332154274, "learning_rate": 1.9929360370028535e-05, "loss": 1.3306, "step": 2233 }, { "epoch": 0.6653884100597554, "grad_norm": 0.19222469627857208, "learning_rate": 1.9929245877209718e-05, "loss": 1.3327, "step": 2234 }, { "epoch": 0.665686256259424, "grad_norm": 0.17099244892597198, "learning_rate": 1.992913129201036e-05, "loss": 1.3198, "step": 2235 }, { "epoch": 0.6659841024590927, "grad_norm": 0.1890147626399994, "learning_rate": 1.9929016614431526e-05, "loss": 1.3253, "step": 2236 }, { "epoch": 0.6662819486587613, "grad_norm": 0.18433354794979095, "learning_rate": 1.9928901844474285e-05, "loss": 1.315, "step": 2237 }, { "epoch": 0.66657979485843, "grad_norm": 0.18210284411907196, "learning_rate": 1.992878698213971e-05, "loss": 1.3342, "step": 2238 }, { "epoch": 0.6668776410580987, "grad_norm": 0.17864404618740082, "learning_rate": 1.9928672027428866e-05, "loss": 1.3399, "step": 2239 }, { "epoch": 0.6671754872577673, "grad_norm": 0.18134282529354095, "learning_rate": 1.9928556980342818e-05, "loss": 1.3254, "step": 2240 }, { "epoch": 0.6674733334574359, "grad_norm": 0.19404278695583344, "learning_rate": 1.992844184088264e-05, "loss": 1.3467, "step": 2241 }, { "epoch": 0.6677711796571045, "grad_norm": 0.18934600055217743, "learning_rate": 1.9928326609049406e-05, "loss": 1.3338, "step": 2242 }, { "epoch": 0.6680690258567732, "grad_norm": 0.17520056664943695, "learning_rate": 1.9928211284844183e-05, "loss": 1.3353, "step": 2243 }, { "epoch": 0.6683668720564419, "grad_norm": 0.18218885362148285, "learning_rate": 1.992809586826805e-05, "loss": 1.3143, "step": 2244 }, { "epoch": 0.6686647182561105, "grad_norm": 0.1865578293800354, "learning_rate": 1.992798035932207e-05, "loss": 1.3276, "step": 2245 }, { "epoch": 0.6689625644557792, "grad_norm": 0.18215002119541168, "learning_rate": 1.9927864758007332e-05, "loss": 1.3176, "step": 2246 }, { "epoch": 0.6692604106554478, "grad_norm": 0.18074733018875122, "learning_rate": 1.9927749064324905e-05, "loss": 1.3295, "step": 2247 }, { "epoch": 0.6695582568551164, "grad_norm": 0.18050310015678406, "learning_rate": 1.9927633278275862e-05, "loss": 1.312, "step": 2248 }, { "epoch": 0.669856103054785, "grad_norm": 0.17129181325435638, "learning_rate": 1.992751739986128e-05, "loss": 1.3518, "step": 2249 }, { "epoch": 0.6701539492544537, "grad_norm": 0.19418247044086456, "learning_rate": 1.9927401429082244e-05, "loss": 1.3273, "step": 2250 }, { "epoch": 0.6704517954541224, "grad_norm": 0.18332301080226898, "learning_rate": 1.992728536593983e-05, "loss": 1.3223, "step": 2251 }, { "epoch": 0.670749641653791, "grad_norm": 0.17829930782318115, "learning_rate": 1.9927169210435117e-05, "loss": 1.3381, "step": 2252 }, { "epoch": 0.6710474878534597, "grad_norm": 0.1867094486951828, "learning_rate": 1.9927052962569183e-05, "loss": 1.341, "step": 2253 }, { "epoch": 0.6713453340531284, "grad_norm": 0.18619227409362793, "learning_rate": 1.9926936622343115e-05, "loss": 1.3242, "step": 2254 }, { "epoch": 0.671643180252797, "grad_norm": 0.1722981333732605, "learning_rate": 1.992682018975799e-05, "loss": 1.3277, "step": 2255 }, { "epoch": 0.6719410264524656, "grad_norm": 0.17711478471755981, "learning_rate": 1.9926703664814898e-05, "loss": 1.3135, "step": 2256 }, { "epoch": 0.6722388726521342, "grad_norm": 0.18896667659282684, "learning_rate": 1.9926587047514917e-05, "loss": 1.3303, "step": 2257 }, { "epoch": 0.6725367188518029, "grad_norm": 0.17343689501285553, "learning_rate": 1.9926470337859133e-05, "loss": 1.3349, "step": 2258 }, { "epoch": 0.6728345650514715, "grad_norm": 0.1764567643404007, "learning_rate": 1.9926353535848636e-05, "loss": 1.333, "step": 2259 }, { "epoch": 0.6731324112511402, "grad_norm": 0.17717604339122772, "learning_rate": 1.9926236641484506e-05, "loss": 1.3227, "step": 2260 }, { "epoch": 0.6734302574508089, "grad_norm": 0.1780940145254135, "learning_rate": 1.9926119654767836e-05, "loss": 1.3304, "step": 2261 }, { "epoch": 0.6737281036504775, "grad_norm": 0.182095006108284, "learning_rate": 1.9926002575699713e-05, "loss": 1.3158, "step": 2262 }, { "epoch": 0.6740259498501461, "grad_norm": 0.17723587155342102, "learning_rate": 1.992588540428123e-05, "loss": 1.3241, "step": 2263 }, { "epoch": 0.6743237960498147, "grad_norm": 0.1783917248249054, "learning_rate": 1.9925768140513466e-05, "loss": 1.3216, "step": 2264 }, { "epoch": 0.6746216422494834, "grad_norm": 0.1807514876127243, "learning_rate": 1.9925650784397522e-05, "loss": 1.3329, "step": 2265 }, { "epoch": 0.6749194884491521, "grad_norm": 0.18023428320884705, "learning_rate": 1.9925533335934488e-05, "loss": 1.3192, "step": 2266 }, { "epoch": 0.6752173346488207, "grad_norm": 0.18253910541534424, "learning_rate": 1.9925415795125455e-05, "loss": 1.3182, "step": 2267 }, { "epoch": 0.6755151808484894, "grad_norm": 0.19268609583377838, "learning_rate": 1.992529816197152e-05, "loss": 1.3322, "step": 2268 }, { "epoch": 0.675813027048158, "grad_norm": 0.18364225327968597, "learning_rate": 1.992518043647377e-05, "loss": 1.3308, "step": 2269 }, { "epoch": 0.6761108732478267, "grad_norm": 0.17027676105499268, "learning_rate": 1.9925062618633308e-05, "loss": 1.3514, "step": 2270 }, { "epoch": 0.6764087194474953, "grad_norm": 0.18000012636184692, "learning_rate": 1.9924944708451225e-05, "loss": 1.3038, "step": 2271 }, { "epoch": 0.6767065656471639, "grad_norm": 0.19108423590660095, "learning_rate": 1.9924826705928623e-05, "loss": 1.3204, "step": 2272 }, { "epoch": 0.6770044118468326, "grad_norm": 0.1780695766210556, "learning_rate": 1.9924708611066595e-05, "loss": 1.3524, "step": 2273 }, { "epoch": 0.6773022580465012, "grad_norm": 0.1744093894958496, "learning_rate": 1.9924590423866242e-05, "loss": 1.3183, "step": 2274 }, { "epoch": 0.6776001042461699, "grad_norm": 0.1840371936559677, "learning_rate": 1.9924472144328667e-05, "loss": 1.3395, "step": 2275 }, { "epoch": 0.6778979504458386, "grad_norm": 0.18936192989349365, "learning_rate": 1.9924353772454962e-05, "loss": 1.3308, "step": 2276 }, { "epoch": 0.6781957966455072, "grad_norm": 0.1820136308670044, "learning_rate": 1.9924235308246237e-05, "loss": 1.3396, "step": 2277 }, { "epoch": 0.6784936428451758, "grad_norm": 0.18547116219997406, "learning_rate": 1.9924116751703592e-05, "loss": 1.3116, "step": 2278 }, { "epoch": 0.6787914890448444, "grad_norm": 0.18258768320083618, "learning_rate": 1.9923998102828125e-05, "loss": 1.3307, "step": 2279 }, { "epoch": 0.6790893352445131, "grad_norm": 0.1860014945268631, "learning_rate": 1.992387936162094e-05, "loss": 1.3164, "step": 2280 }, { "epoch": 0.6793871814441818, "grad_norm": 0.1784329116344452, "learning_rate": 1.9923760528083153e-05, "loss": 1.3217, "step": 2281 }, { "epoch": 0.6796850276438504, "grad_norm": 0.18610075116157532, "learning_rate": 1.9923641602215857e-05, "loss": 1.3416, "step": 2282 }, { "epoch": 0.6799828738435191, "grad_norm": 0.1862253099679947, "learning_rate": 1.9923522584020164e-05, "loss": 1.3276, "step": 2283 }, { "epoch": 0.6802807200431877, "grad_norm": 0.18593938648700714, "learning_rate": 1.9923403473497182e-05, "loss": 1.3112, "step": 2284 }, { "epoch": 0.6805785662428564, "grad_norm": 0.1905030757188797, "learning_rate": 1.9923284270648015e-05, "loss": 1.3392, "step": 2285 }, { "epoch": 0.680876412442525, "grad_norm": 0.18495959043502808, "learning_rate": 1.992316497547378e-05, "loss": 1.3148, "step": 2286 }, { "epoch": 0.6811742586421936, "grad_norm": 0.18764932453632355, "learning_rate": 1.9923045587975576e-05, "loss": 1.3226, "step": 2287 }, { "epoch": 0.6814721048418623, "grad_norm": 0.1735365390777588, "learning_rate": 1.992292610815452e-05, "loss": 1.3311, "step": 2288 }, { "epoch": 0.6817699510415309, "grad_norm": 0.18366490304470062, "learning_rate": 1.9922806536011724e-05, "loss": 1.3356, "step": 2289 }, { "epoch": 0.6820677972411996, "grad_norm": 0.17725488543510437, "learning_rate": 1.9922686871548303e-05, "loss": 1.3312, "step": 2290 }, { "epoch": 0.6823656434408683, "grad_norm": 0.17958669364452362, "learning_rate": 1.9922567114765362e-05, "loss": 1.3238, "step": 2291 }, { "epoch": 0.6826634896405369, "grad_norm": 0.18054373562335968, "learning_rate": 1.9922447265664023e-05, "loss": 1.3357, "step": 2292 }, { "epoch": 0.6829613358402055, "grad_norm": 0.1845768392086029, "learning_rate": 1.99223273242454e-05, "loss": 1.336, "step": 2293 }, { "epoch": 0.6832591820398741, "grad_norm": 0.16891595721244812, "learning_rate": 1.9922207290510603e-05, "loss": 1.3212, "step": 2294 }, { "epoch": 0.6835570282395428, "grad_norm": 0.1755804568529129, "learning_rate": 1.9922087164460755e-05, "loss": 1.3164, "step": 2295 }, { "epoch": 0.6838548744392114, "grad_norm": 0.18519924581050873, "learning_rate": 1.992196694609697e-05, "loss": 1.3306, "step": 2296 }, { "epoch": 0.6841527206388801, "grad_norm": 0.18089953064918518, "learning_rate": 1.992184663542037e-05, "loss": 1.3381, "step": 2297 }, { "epoch": 0.6844505668385488, "grad_norm": 0.18031135201454163, "learning_rate": 1.9921726232432072e-05, "loss": 1.3244, "step": 2298 }, { "epoch": 0.6847484130382174, "grad_norm": 0.1838495433330536, "learning_rate": 1.9921605737133197e-05, "loss": 1.3282, "step": 2299 }, { "epoch": 0.6850462592378861, "grad_norm": 0.17975088953971863, "learning_rate": 1.9921485149524864e-05, "loss": 1.3302, "step": 2300 }, { "epoch": 0.6853441054375546, "grad_norm": 0.17442457377910614, "learning_rate": 1.99213644696082e-05, "loss": 1.3328, "step": 2301 }, { "epoch": 0.6856419516372233, "grad_norm": 0.17792700231075287, "learning_rate": 1.992124369738432e-05, "loss": 1.3198, "step": 2302 }, { "epoch": 0.685939797836892, "grad_norm": 0.17103061079978943, "learning_rate": 1.9921122832854353e-05, "loss": 1.3293, "step": 2303 }, { "epoch": 0.6862376440365606, "grad_norm": 0.18080931901931763, "learning_rate": 1.9921001876019425e-05, "loss": 1.3322, "step": 2304 }, { "epoch": 0.6865354902362293, "grad_norm": 0.17673060297966003, "learning_rate": 1.9920880826880657e-05, "loss": 1.3155, "step": 2305 }, { "epoch": 0.686833336435898, "grad_norm": 0.18672189116477966, "learning_rate": 1.9920759685439178e-05, "loss": 1.3056, "step": 2306 }, { "epoch": 0.6871311826355666, "grad_norm": 0.17934873700141907, "learning_rate": 1.9920638451696112e-05, "loss": 1.3379, "step": 2307 }, { "epoch": 0.6874290288352352, "grad_norm": 0.1873992532491684, "learning_rate": 1.9920517125652594e-05, "loss": 1.337, "step": 2308 }, { "epoch": 0.6877268750349038, "grad_norm": 0.18944714963436127, "learning_rate": 1.9920395707309743e-05, "loss": 1.3169, "step": 2309 }, { "epoch": 0.6880247212345725, "grad_norm": 0.18874067068099976, "learning_rate": 1.9920274196668696e-05, "loss": 1.327, "step": 2310 }, { "epoch": 0.6883225674342411, "grad_norm": 0.17803321778774261, "learning_rate": 1.9920152593730582e-05, "loss": 1.3236, "step": 2311 }, { "epoch": 0.6886204136339098, "grad_norm": 0.17799226939678192, "learning_rate": 1.9920030898496532e-05, "loss": 1.323, "step": 2312 }, { "epoch": 0.6889182598335785, "grad_norm": 0.18605384230613708, "learning_rate": 1.9919909110967676e-05, "loss": 1.3214, "step": 2313 }, { "epoch": 0.6892161060332471, "grad_norm": 0.1801423281431198, "learning_rate": 1.9919787231145147e-05, "loss": 1.3154, "step": 2314 }, { "epoch": 0.6895139522329158, "grad_norm": 0.1903238594532013, "learning_rate": 1.9919665259030084e-05, "loss": 1.3088, "step": 2315 }, { "epoch": 0.6898117984325843, "grad_norm": 0.18387939035892487, "learning_rate": 1.991954319462362e-05, "loss": 1.3234, "step": 2316 }, { "epoch": 0.690109644632253, "grad_norm": 0.18068253993988037, "learning_rate": 1.9919421037926885e-05, "loss": 1.3444, "step": 2317 }, { "epoch": 0.6904074908319217, "grad_norm": 0.18537044525146484, "learning_rate": 1.991929878894102e-05, "loss": 1.3246, "step": 2318 }, { "epoch": 0.6907053370315903, "grad_norm": 0.19145667552947998, "learning_rate": 1.9919176447667167e-05, "loss": 1.3397, "step": 2319 }, { "epoch": 0.691003183231259, "grad_norm": 0.18210239708423615, "learning_rate": 1.9919054014106457e-05, "loss": 1.3469, "step": 2320 }, { "epoch": 0.6913010294309276, "grad_norm": 0.1801193505525589, "learning_rate": 1.9918931488260028e-05, "loss": 1.3235, "step": 2321 }, { "epoch": 0.6915988756305963, "grad_norm": 0.18036174774169922, "learning_rate": 1.991880887012903e-05, "loss": 1.3377, "step": 2322 }, { "epoch": 0.6918967218302648, "grad_norm": 0.1823994517326355, "learning_rate": 1.9918686159714596e-05, "loss": 1.3342, "step": 2323 }, { "epoch": 0.6921945680299335, "grad_norm": 0.1893562525510788, "learning_rate": 1.9918563357017863e-05, "loss": 1.3241, "step": 2324 }, { "epoch": 0.6924924142296022, "grad_norm": 0.17952671647071838, "learning_rate": 1.9918440462039984e-05, "loss": 1.3393, "step": 2325 }, { "epoch": 0.6927902604292708, "grad_norm": 0.1820048838853836, "learning_rate": 1.99183174747821e-05, "loss": 1.3284, "step": 2326 }, { "epoch": 0.6930881066289395, "grad_norm": 0.18531034886837006, "learning_rate": 1.9918194395245352e-05, "loss": 1.3036, "step": 2327 }, { "epoch": 0.6933859528286082, "grad_norm": 0.1763806790113449, "learning_rate": 1.9918071223430885e-05, "loss": 1.3334, "step": 2328 }, { "epoch": 0.6936837990282768, "grad_norm": 0.17299970984458923, "learning_rate": 1.9917947959339846e-05, "loss": 1.3233, "step": 2329 }, { "epoch": 0.6939816452279454, "grad_norm": 0.18826264142990112, "learning_rate": 1.9917824602973387e-05, "loss": 1.3555, "step": 2330 }, { "epoch": 0.694279491427614, "grad_norm": 0.17757001519203186, "learning_rate": 1.9917701154332646e-05, "loss": 1.3157, "step": 2331 }, { "epoch": 0.6945773376272827, "grad_norm": 0.18596325814723969, "learning_rate": 1.9917577613418778e-05, "loss": 1.3406, "step": 2332 }, { "epoch": 0.6948751838269513, "grad_norm": 0.18018612265586853, "learning_rate": 1.991745398023293e-05, "loss": 1.3295, "step": 2333 }, { "epoch": 0.69517303002662, "grad_norm": 0.1862306296825409, "learning_rate": 1.9917330254776254e-05, "loss": 1.3308, "step": 2334 }, { "epoch": 0.6954708762262887, "grad_norm": 0.18988408148288727, "learning_rate": 1.99172064370499e-05, "loss": 1.3511, "step": 2335 }, { "epoch": 0.6957687224259573, "grad_norm": 0.18056082725524902, "learning_rate": 1.991708252705502e-05, "loss": 1.3219, "step": 2336 }, { "epoch": 0.696066568625626, "grad_norm": 0.19533643126487732, "learning_rate": 1.991695852479277e-05, "loss": 1.325, "step": 2337 }, { "epoch": 0.6963644148252945, "grad_norm": 0.1862207055091858, "learning_rate": 1.9916834430264296e-05, "loss": 1.3157, "step": 2338 }, { "epoch": 0.6966622610249632, "grad_norm": 0.18456688523292542, "learning_rate": 1.991671024347076e-05, "loss": 1.3158, "step": 2339 }, { "epoch": 0.6969601072246319, "grad_norm": 0.18564829230308533, "learning_rate": 1.9916585964413317e-05, "loss": 1.3379, "step": 2340 }, { "epoch": 0.6972579534243005, "grad_norm": 0.18239466845989227, "learning_rate": 1.9916461593093115e-05, "loss": 1.3352, "step": 2341 }, { "epoch": 0.6975557996239692, "grad_norm": 0.18152117729187012, "learning_rate": 1.9916337129511323e-05, "loss": 1.3309, "step": 2342 }, { "epoch": 0.6978536458236378, "grad_norm": 0.17583444714546204, "learning_rate": 1.9916212573669093e-05, "loss": 1.321, "step": 2343 }, { "epoch": 0.6981514920233065, "grad_norm": 0.19174323976039886, "learning_rate": 1.991608792556758e-05, "loss": 1.3311, "step": 2344 }, { "epoch": 0.6984493382229751, "grad_norm": 0.20540690422058105, "learning_rate": 1.9915963185207948e-05, "loss": 1.3446, "step": 2345 }, { "epoch": 0.6987471844226437, "grad_norm": 0.18162190914154053, "learning_rate": 1.991583835259136e-05, "loss": 1.3302, "step": 2346 }, { "epoch": 0.6990450306223124, "grad_norm": 0.18038925528526306, "learning_rate": 1.9915713427718972e-05, "loss": 1.319, "step": 2347 }, { "epoch": 0.699342876821981, "grad_norm": 0.18839231133460999, "learning_rate": 1.9915588410591954e-05, "loss": 1.3175, "step": 2348 }, { "epoch": 0.6996407230216497, "grad_norm": 0.19137445092201233, "learning_rate": 1.991546330121146e-05, "loss": 1.3184, "step": 2349 }, { "epoch": 0.6999385692213184, "grad_norm": 0.19656787812709808, "learning_rate": 1.991533809957866e-05, "loss": 1.3182, "step": 2350 }, { "epoch": 0.700236415420987, "grad_norm": 0.1719386726617813, "learning_rate": 1.9915212805694715e-05, "loss": 1.3273, "step": 2351 }, { "epoch": 0.7005342616206557, "grad_norm": 0.18107326328754425, "learning_rate": 1.9915087419560795e-05, "loss": 1.326, "step": 2352 }, { "epoch": 0.7008321078203242, "grad_norm": 0.18605652451515198, "learning_rate": 1.9914961941178062e-05, "loss": 1.3163, "step": 2353 }, { "epoch": 0.7011299540199929, "grad_norm": 0.18013206124305725, "learning_rate": 1.9914836370547688e-05, "loss": 1.3263, "step": 2354 }, { "epoch": 0.7014278002196616, "grad_norm": 0.18930692970752716, "learning_rate": 1.991471070767084e-05, "loss": 1.3471, "step": 2355 }, { "epoch": 0.7017256464193302, "grad_norm": 0.17976365983486176, "learning_rate": 1.9914584952548682e-05, "loss": 1.3229, "step": 2356 }, { "epoch": 0.7020234926189989, "grad_norm": 0.18191921710968018, "learning_rate": 1.991445910518239e-05, "loss": 1.3168, "step": 2357 }, { "epoch": 0.7023213388186675, "grad_norm": 0.19520500302314758, "learning_rate": 1.9914333165573136e-05, "loss": 1.3367, "step": 2358 }, { "epoch": 0.7026191850183362, "grad_norm": 0.1778630167245865, "learning_rate": 1.9914207133722086e-05, "loss": 1.3095, "step": 2359 }, { "epoch": 0.7029170312180048, "grad_norm": 0.1767389327287674, "learning_rate": 1.9914081009630413e-05, "loss": 1.307, "step": 2360 }, { "epoch": 0.7032148774176734, "grad_norm": 0.17485411465168, "learning_rate": 1.99139547932993e-05, "loss": 1.3238, "step": 2361 }, { "epoch": 0.7035127236173421, "grad_norm": 0.17325006425380707, "learning_rate": 1.991382848472991e-05, "loss": 1.3139, "step": 2362 }, { "epoch": 0.7038105698170107, "grad_norm": 0.18308599293231964, "learning_rate": 1.9913702083923422e-05, "loss": 1.3335, "step": 2363 }, { "epoch": 0.7041084160166794, "grad_norm": 0.1821424812078476, "learning_rate": 1.9913575590881013e-05, "loss": 1.3053, "step": 2364 }, { "epoch": 0.7044062622163481, "grad_norm": 0.18681998550891876, "learning_rate": 1.9913449005603857e-05, "loss": 1.3132, "step": 2365 }, { "epoch": 0.7047041084160167, "grad_norm": 0.18637306988239288, "learning_rate": 1.9913322328093138e-05, "loss": 1.2985, "step": 2366 }, { "epoch": 0.7050019546156854, "grad_norm": 0.19341403245925903, "learning_rate": 1.9913195558350028e-05, "loss": 1.3215, "step": 2367 }, { "epoch": 0.7052998008153539, "grad_norm": 0.1809716671705246, "learning_rate": 1.9913068696375706e-05, "loss": 1.3319, "step": 2368 }, { "epoch": 0.7055976470150226, "grad_norm": 0.18493227660655975, "learning_rate": 1.9912941742171362e-05, "loss": 1.3293, "step": 2369 }, { "epoch": 0.7058954932146912, "grad_norm": 0.195916086435318, "learning_rate": 1.9912814695738167e-05, "loss": 1.3296, "step": 2370 }, { "epoch": 0.7061933394143599, "grad_norm": 0.1875908225774765, "learning_rate": 1.9912687557077307e-05, "loss": 1.321, "step": 2371 }, { "epoch": 0.7064911856140286, "grad_norm": 0.20055951178073883, "learning_rate": 1.9912560326189966e-05, "loss": 1.318, "step": 2372 }, { "epoch": 0.7067890318136972, "grad_norm": 0.1841721534729004, "learning_rate": 1.9912433003077324e-05, "loss": 1.3118, "step": 2373 }, { "epoch": 0.7070868780133659, "grad_norm": 0.18234992027282715, "learning_rate": 1.991230558774057e-05, "loss": 1.3244, "step": 2374 }, { "epoch": 0.7073847242130344, "grad_norm": 0.1869121789932251, "learning_rate": 1.9912178080180883e-05, "loss": 1.313, "step": 2375 }, { "epoch": 0.7076825704127031, "grad_norm": 0.19375371932983398, "learning_rate": 1.9912050480399458e-05, "loss": 1.3361, "step": 2376 }, { "epoch": 0.7079804166123718, "grad_norm": 0.18816982209682465, "learning_rate": 1.9911922788397473e-05, "loss": 1.343, "step": 2377 }, { "epoch": 0.7082782628120404, "grad_norm": 0.18643729388713837, "learning_rate": 1.991179500417612e-05, "loss": 1.3338, "step": 2378 }, { "epoch": 0.7085761090117091, "grad_norm": 0.18068727850914001, "learning_rate": 1.9911667127736594e-05, "loss": 1.3202, "step": 2379 }, { "epoch": 0.7088739552113777, "grad_norm": 0.1867077499628067, "learning_rate": 1.991153915908008e-05, "loss": 1.3307, "step": 2380 }, { "epoch": 0.7091718014110464, "grad_norm": 0.1902526617050171, "learning_rate": 1.9911411098207765e-05, "loss": 1.3224, "step": 2381 }, { "epoch": 0.7094696476107151, "grad_norm": 0.18281622231006622, "learning_rate": 1.991128294512084e-05, "loss": 1.3156, "step": 2382 }, { "epoch": 0.7097674938103836, "grad_norm": 0.17584648728370667, "learning_rate": 1.9911154699820506e-05, "loss": 1.3338, "step": 2383 }, { "epoch": 0.7100653400100523, "grad_norm": 0.17479106783866882, "learning_rate": 1.991102636230795e-05, "loss": 1.3211, "step": 2384 }, { "epoch": 0.7103631862097209, "grad_norm": 0.18454335629940033, "learning_rate": 1.9910897932584367e-05, "loss": 1.3318, "step": 2385 }, { "epoch": 0.7106610324093896, "grad_norm": 0.18156906962394714, "learning_rate": 1.991076941065095e-05, "loss": 1.3362, "step": 2386 }, { "epoch": 0.7109588786090583, "grad_norm": 0.18315784633159637, "learning_rate": 1.9910640796508897e-05, "loss": 1.3266, "step": 2387 }, { "epoch": 0.7112567248087269, "grad_norm": 0.17966997623443604, "learning_rate": 1.9910512090159405e-05, "loss": 1.3111, "step": 2388 }, { "epoch": 0.7115545710083956, "grad_norm": 0.19016844034194946, "learning_rate": 1.991038329160367e-05, "loss": 1.3338, "step": 2389 }, { "epoch": 0.7118524172080641, "grad_norm": 0.17686402797698975, "learning_rate": 1.991025440084289e-05, "loss": 1.3293, "step": 2390 }, { "epoch": 0.7121502634077328, "grad_norm": 0.17823560535907745, "learning_rate": 1.991012541787827e-05, "loss": 1.319, "step": 2391 }, { "epoch": 0.7124481096074015, "grad_norm": 0.18323731422424316, "learning_rate": 1.9909996342711e-05, "loss": 1.3212, "step": 2392 }, { "epoch": 0.7127459558070701, "grad_norm": 0.19072403013706207, "learning_rate": 1.990986717534229e-05, "loss": 1.3262, "step": 2393 }, { "epoch": 0.7130438020067388, "grad_norm": 0.18610836565494537, "learning_rate": 1.9909737915773335e-05, "loss": 1.3136, "step": 2394 }, { "epoch": 0.7133416482064074, "grad_norm": 0.18933738768100739, "learning_rate": 1.990960856400534e-05, "loss": 1.3163, "step": 2395 }, { "epoch": 0.7136394944060761, "grad_norm": 0.1870865523815155, "learning_rate": 1.990947912003951e-05, "loss": 1.2943, "step": 2396 }, { "epoch": 0.7139373406057448, "grad_norm": 0.18337379395961761, "learning_rate": 1.9909349583877053e-05, "loss": 1.3244, "step": 2397 }, { "epoch": 0.7142351868054133, "grad_norm": 0.19369252026081085, "learning_rate": 1.990921995551916e-05, "loss": 1.3224, "step": 2398 }, { "epoch": 0.714533033005082, "grad_norm": 0.19757018983364105, "learning_rate": 1.9909090234967053e-05, "loss": 1.3196, "step": 2399 }, { "epoch": 0.7148308792047506, "grad_norm": 0.17689107358455658, "learning_rate": 1.9908960422221932e-05, "loss": 1.3209, "step": 2400 }, { "epoch": 0.7151287254044193, "grad_norm": 0.18062490224838257, "learning_rate": 1.9908830517285007e-05, "loss": 1.317, "step": 2401 }, { "epoch": 0.715426571604088, "grad_norm": 0.1801805943250656, "learning_rate": 1.990870052015748e-05, "loss": 1.3143, "step": 2402 }, { "epoch": 0.7157244178037566, "grad_norm": 0.18121488392353058, "learning_rate": 1.9908570430840567e-05, "loss": 1.3135, "step": 2403 }, { "epoch": 0.7160222640034253, "grad_norm": 0.19454193115234375, "learning_rate": 1.9908440249335478e-05, "loss": 1.3282, "step": 2404 }, { "epoch": 0.7163201102030938, "grad_norm": 0.1717558354139328, "learning_rate": 1.990830997564342e-05, "loss": 1.318, "step": 2405 }, { "epoch": 0.7166179564027625, "grad_norm": 0.18550734221935272, "learning_rate": 1.990817960976561e-05, "loss": 1.3045, "step": 2406 }, { "epoch": 0.7169158026024312, "grad_norm": 0.17859478294849396, "learning_rate": 1.990804915170326e-05, "loss": 1.324, "step": 2407 }, { "epoch": 0.7172136488020998, "grad_norm": 0.1804829239845276, "learning_rate": 1.990791860145758e-05, "loss": 1.3131, "step": 2408 }, { "epoch": 0.7175114950017685, "grad_norm": 0.19894477725028992, "learning_rate": 1.990778795902979e-05, "loss": 1.3088, "step": 2409 }, { "epoch": 0.7178093412014371, "grad_norm": 0.1904131919145584, "learning_rate": 1.99076572244211e-05, "loss": 1.3081, "step": 2410 }, { "epoch": 0.7181071874011058, "grad_norm": 0.18772949278354645, "learning_rate": 1.990752639763273e-05, "loss": 1.3228, "step": 2411 }, { "epoch": 0.7184050336007743, "grad_norm": 0.1856239140033722, "learning_rate": 1.9907395478665896e-05, "loss": 1.3304, "step": 2412 }, { "epoch": 0.718702879800443, "grad_norm": 0.1815764456987381, "learning_rate": 1.9907264467521817e-05, "loss": 1.3251, "step": 2413 }, { "epoch": 0.7190007260001117, "grad_norm": 0.18634894490242004, "learning_rate": 1.9907133364201712e-05, "loss": 1.302, "step": 2414 }, { "epoch": 0.7192985721997803, "grad_norm": 0.18666182458400726, "learning_rate": 1.9907002168706798e-05, "loss": 1.3212, "step": 2415 }, { "epoch": 0.719596418399449, "grad_norm": 0.18581783771514893, "learning_rate": 1.9906870881038297e-05, "loss": 1.3433, "step": 2416 }, { "epoch": 0.7198942645991177, "grad_norm": 0.19481654465198517, "learning_rate": 1.9906739501197435e-05, "loss": 1.311, "step": 2417 }, { "epoch": 0.7201921107987863, "grad_norm": 0.1938018798828125, "learning_rate": 1.9906608029185427e-05, "loss": 1.32, "step": 2418 }, { "epoch": 0.720489956998455, "grad_norm": 0.18101400136947632, "learning_rate": 1.9906476465003498e-05, "loss": 1.3321, "step": 2419 }, { "epoch": 0.7207878031981235, "grad_norm": 0.17936542630195618, "learning_rate": 1.9906344808652878e-05, "loss": 1.3215, "step": 2420 }, { "epoch": 0.7210856493977922, "grad_norm": 0.20500177145004272, "learning_rate": 1.9906213060134787e-05, "loss": 1.3109, "step": 2421 }, { "epoch": 0.7213834955974608, "grad_norm": 0.21439610421657562, "learning_rate": 1.990608121945045e-05, "loss": 1.3226, "step": 2422 }, { "epoch": 0.7216813417971295, "grad_norm": 0.18862423300743103, "learning_rate": 1.9905949286601095e-05, "loss": 1.3144, "step": 2423 }, { "epoch": 0.7219791879967982, "grad_norm": 0.19281642138957977, "learning_rate": 1.9905817261587947e-05, "loss": 1.3357, "step": 2424 }, { "epoch": 0.7222770341964668, "grad_norm": 0.19567199051380157, "learning_rate": 1.990568514441224e-05, "loss": 1.3178, "step": 2425 }, { "epoch": 0.7225748803961355, "grad_norm": 0.19655363261699677, "learning_rate": 1.99055529350752e-05, "loss": 1.3181, "step": 2426 }, { "epoch": 0.722872726595804, "grad_norm": 0.19821792840957642, "learning_rate": 1.9905420633578055e-05, "loss": 1.3031, "step": 2427 }, { "epoch": 0.7231705727954727, "grad_norm": 0.1845242828130722, "learning_rate": 1.990528823992204e-05, "loss": 1.3362, "step": 2428 }, { "epoch": 0.7234684189951414, "grad_norm": 0.20102176070213318, "learning_rate": 1.9905155754108385e-05, "loss": 1.3204, "step": 2429 }, { "epoch": 0.72376626519481, "grad_norm": 0.19908630847930908, "learning_rate": 1.990502317613832e-05, "loss": 1.3232, "step": 2430 }, { "epoch": 0.7240641113944787, "grad_norm": 0.19704653322696686, "learning_rate": 1.9904890506013084e-05, "loss": 1.3116, "step": 2431 }, { "epoch": 0.7243619575941473, "grad_norm": 0.1876095086336136, "learning_rate": 1.9904757743733903e-05, "loss": 1.3309, "step": 2432 }, { "epoch": 0.724659803793816, "grad_norm": 0.20279152691364288, "learning_rate": 1.9904624889302024e-05, "loss": 1.329, "step": 2433 }, { "epoch": 0.7249576499934847, "grad_norm": 0.18088439106941223, "learning_rate": 1.9904491942718672e-05, "loss": 1.3312, "step": 2434 }, { "epoch": 0.7252554961931532, "grad_norm": 0.18198047578334808, "learning_rate": 1.9904358903985092e-05, "loss": 1.3251, "step": 2435 }, { "epoch": 0.7255533423928219, "grad_norm": 0.18865373730659485, "learning_rate": 1.9904225773102516e-05, "loss": 1.3314, "step": 2436 }, { "epoch": 0.7258511885924905, "grad_norm": 0.19915615022182465, "learning_rate": 1.9904092550072184e-05, "loss": 1.3215, "step": 2437 }, { "epoch": 0.7261490347921592, "grad_norm": 0.18113912642002106, "learning_rate": 1.9903959234895337e-05, "loss": 1.3152, "step": 2438 }, { "epoch": 0.7264468809918279, "grad_norm": 0.19505468010902405, "learning_rate": 1.9903825827573215e-05, "loss": 1.3097, "step": 2439 }, { "epoch": 0.7267447271914965, "grad_norm": 0.18470172584056854, "learning_rate": 1.990369232810706e-05, "loss": 1.3206, "step": 2440 }, { "epoch": 0.7270425733911652, "grad_norm": 0.19797997176647186, "learning_rate": 1.990355873649811e-05, "loss": 1.3269, "step": 2441 }, { "epoch": 0.7273404195908337, "grad_norm": 0.18900243937969208, "learning_rate": 1.9903425052747613e-05, "loss": 1.3422, "step": 2442 }, { "epoch": 0.7276382657905024, "grad_norm": 0.1780720353126526, "learning_rate": 1.9903291276856813e-05, "loss": 1.3131, "step": 2443 }, { "epoch": 0.727936111990171, "grad_norm": 0.18734505772590637, "learning_rate": 1.990315740882695e-05, "loss": 1.3211, "step": 2444 }, { "epoch": 0.7282339581898397, "grad_norm": 0.19354982674121857, "learning_rate": 1.990302344865927e-05, "loss": 1.3187, "step": 2445 }, { "epoch": 0.7285318043895084, "grad_norm": 0.18596869707107544, "learning_rate": 1.9902889396355023e-05, "loss": 1.3262, "step": 2446 }, { "epoch": 0.728829650589177, "grad_norm": 0.18094055354595184, "learning_rate": 1.9902755251915454e-05, "loss": 1.3255, "step": 2447 }, { "epoch": 0.7291274967888457, "grad_norm": 0.1835775524377823, "learning_rate": 1.9902621015341812e-05, "loss": 1.3135, "step": 2448 }, { "epoch": 0.7294253429885144, "grad_norm": 0.18673677742481232, "learning_rate": 1.9902486686635342e-05, "loss": 1.3205, "step": 2449 }, { "epoch": 0.7297231891881829, "grad_norm": 0.18784306943416595, "learning_rate": 1.99023522657973e-05, "loss": 1.3132, "step": 2450 }, { "epoch": 0.7300210353878516, "grad_norm": 0.19159583747386932, "learning_rate": 1.9902217752828935e-05, "loss": 1.3469, "step": 2451 }, { "epoch": 0.7303188815875202, "grad_norm": 0.1930502951145172, "learning_rate": 1.9902083147731495e-05, "loss": 1.3247, "step": 2452 }, { "epoch": 0.7306167277871889, "grad_norm": 0.18336597084999084, "learning_rate": 1.9901948450506238e-05, "loss": 1.3292, "step": 2453 }, { "epoch": 0.7309145739868576, "grad_norm": 0.20361313223838806, "learning_rate": 1.9901813661154408e-05, "loss": 1.3202, "step": 2454 }, { "epoch": 0.7312124201865262, "grad_norm": 0.20657707750797272, "learning_rate": 1.9901678779677266e-05, "loss": 1.3271, "step": 2455 }, { "epoch": 0.7315102663861949, "grad_norm": 0.19663913547992706, "learning_rate": 1.990154380607607e-05, "loss": 1.3254, "step": 2456 }, { "epoch": 0.7318081125858634, "grad_norm": 0.19350913166999817, "learning_rate": 1.990140874035207e-05, "loss": 1.317, "step": 2457 }, { "epoch": 0.7321059587855321, "grad_norm": 0.1959448605775833, "learning_rate": 1.990127358250652e-05, "loss": 1.3388, "step": 2458 }, { "epoch": 0.7324038049852007, "grad_norm": 0.1914556324481964, "learning_rate": 1.9901138332540685e-05, "loss": 1.3193, "step": 2459 }, { "epoch": 0.7327016511848694, "grad_norm": 0.19220644235610962, "learning_rate": 1.9901002990455822e-05, "loss": 1.3096, "step": 2460 }, { "epoch": 0.7329994973845381, "grad_norm": 0.17991133034229279, "learning_rate": 1.9900867556253188e-05, "loss": 1.3168, "step": 2461 }, { "epoch": 0.7332973435842067, "grad_norm": 0.18685564398765564, "learning_rate": 1.9900732029934038e-05, "loss": 1.3407, "step": 2462 }, { "epoch": 0.7335951897838754, "grad_norm": 0.1979990303516388, "learning_rate": 1.9900596411499644e-05, "loss": 1.3124, "step": 2463 }, { "epoch": 0.733893035983544, "grad_norm": 0.22801139950752258, "learning_rate": 1.9900460700951257e-05, "loss": 1.3229, "step": 2464 }, { "epoch": 0.7341908821832126, "grad_norm": 0.19433832168579102, "learning_rate": 1.990032489829015e-05, "loss": 1.331, "step": 2465 }, { "epoch": 0.7344887283828813, "grad_norm": 0.18098841607570648, "learning_rate": 1.9900189003517578e-05, "loss": 1.3282, "step": 2466 }, { "epoch": 0.7347865745825499, "grad_norm": 0.18603920936584473, "learning_rate": 1.990005301663481e-05, "loss": 1.3258, "step": 2467 }, { "epoch": 0.7350844207822186, "grad_norm": 0.19751660525798798, "learning_rate": 1.9899916937643107e-05, "loss": 1.3203, "step": 2468 }, { "epoch": 0.7353822669818872, "grad_norm": 0.1913176327943802, "learning_rate": 1.989978076654374e-05, "loss": 1.3297, "step": 2469 }, { "epoch": 0.7356801131815559, "grad_norm": 0.19215697050094604, "learning_rate": 1.989964450333797e-05, "loss": 1.3196, "step": 2470 }, { "epoch": 0.7359779593812246, "grad_norm": 0.19319573044776917, "learning_rate": 1.9899508148027075e-05, "loss": 1.3402, "step": 2471 }, { "epoch": 0.7362758055808931, "grad_norm": 0.19183918833732605, "learning_rate": 1.9899371700612313e-05, "loss": 1.3425, "step": 2472 }, { "epoch": 0.7365736517805618, "grad_norm": 0.17503371834754944, "learning_rate": 1.989923516109496e-05, "loss": 1.3207, "step": 2473 }, { "epoch": 0.7368714979802304, "grad_norm": 0.1884661465883255, "learning_rate": 1.989909852947628e-05, "loss": 1.329, "step": 2474 }, { "epoch": 0.7371693441798991, "grad_norm": 0.19076471030712128, "learning_rate": 1.9898961805757547e-05, "loss": 1.3061, "step": 2475 }, { "epoch": 0.7374671903795678, "grad_norm": 0.1796417236328125, "learning_rate": 1.989882498994004e-05, "loss": 1.3201, "step": 2476 }, { "epoch": 0.7377650365792364, "grad_norm": 0.18640348315238953, "learning_rate": 1.9898688082025024e-05, "loss": 1.3346, "step": 2477 }, { "epoch": 0.7380628827789051, "grad_norm": 0.20385372638702393, "learning_rate": 1.9898551082013774e-05, "loss": 1.3182, "step": 2478 }, { "epoch": 0.7383607289785737, "grad_norm": 0.17771016061306, "learning_rate": 1.9898413989907563e-05, "loss": 1.3293, "step": 2479 }, { "epoch": 0.7386585751782423, "grad_norm": 0.17758183181285858, "learning_rate": 1.9898276805707673e-05, "loss": 1.3371, "step": 2480 }, { "epoch": 0.738956421377911, "grad_norm": 0.18944403529167175, "learning_rate": 1.9898139529415374e-05, "loss": 1.3006, "step": 2481 }, { "epoch": 0.7392542675775796, "grad_norm": 0.18202091753482819, "learning_rate": 1.9898002161031948e-05, "loss": 1.3255, "step": 2482 }, { "epoch": 0.7395521137772483, "grad_norm": 0.1849079728126526, "learning_rate": 1.989786470055867e-05, "loss": 1.3168, "step": 2483 }, { "epoch": 0.7398499599769169, "grad_norm": 0.19304360449314117, "learning_rate": 1.9897727147996817e-05, "loss": 1.3296, "step": 2484 }, { "epoch": 0.7401478061765856, "grad_norm": 0.18227414786815643, "learning_rate": 1.9897589503347673e-05, "loss": 1.3093, "step": 2485 }, { "epoch": 0.7404456523762543, "grad_norm": 0.2001018077135086, "learning_rate": 1.9897451766612515e-05, "loss": 1.3346, "step": 2486 }, { "epoch": 0.7407434985759228, "grad_norm": 0.18903778493404388, "learning_rate": 1.989731393779263e-05, "loss": 1.3431, "step": 2487 }, { "epoch": 0.7410413447755915, "grad_norm": 0.19867846369743347, "learning_rate": 1.9897176016889296e-05, "loss": 1.3341, "step": 2488 }, { "epoch": 0.7413391909752601, "grad_norm": 0.19320452213287354, "learning_rate": 1.9897038003903795e-05, "loss": 1.3235, "step": 2489 }, { "epoch": 0.7416370371749288, "grad_norm": 0.18547265231609344, "learning_rate": 1.989689989883741e-05, "loss": 1.3359, "step": 2490 }, { "epoch": 0.7419348833745975, "grad_norm": 0.18041522800922394, "learning_rate": 1.9896761701691437e-05, "loss": 1.303, "step": 2491 }, { "epoch": 0.7422327295742661, "grad_norm": 0.1961670070886612, "learning_rate": 1.9896623412467147e-05, "loss": 1.3224, "step": 2492 }, { "epoch": 0.7425305757739348, "grad_norm": 0.19458021223545074, "learning_rate": 1.9896485031165836e-05, "loss": 1.3336, "step": 2493 }, { "epoch": 0.7428284219736033, "grad_norm": 0.1804085075855255, "learning_rate": 1.989634655778879e-05, "loss": 1.3176, "step": 2494 }, { "epoch": 0.743126268173272, "grad_norm": 0.20877377688884735, "learning_rate": 1.9896207992337296e-05, "loss": 1.3213, "step": 2495 }, { "epoch": 0.7434241143729406, "grad_norm": 0.20411789417266846, "learning_rate": 1.9896069334812638e-05, "loss": 1.3265, "step": 2496 }, { "epoch": 0.7437219605726093, "grad_norm": 0.18133004009723663, "learning_rate": 1.9895930585216116e-05, "loss": 1.3338, "step": 2497 }, { "epoch": 0.744019806772278, "grad_norm": 0.18506641685962677, "learning_rate": 1.9895791743549017e-05, "loss": 1.3357, "step": 2498 }, { "epoch": 0.7443176529719466, "grad_norm": 0.18748866021633148, "learning_rate": 1.989565280981263e-05, "loss": 1.3157, "step": 2499 }, { "epoch": 0.7446154991716153, "grad_norm": 0.19544926285743713, "learning_rate": 1.989551378400825e-05, "loss": 1.3171, "step": 2500 }, { "epoch": 0.7446154991716153, "eval_loss": 1.3665268421173096, "eval_runtime": 19.5702, "eval_samples_per_second": 88.604, "eval_steps_per_second": 5.57, "step": 2500 }, { "epoch": 0.744913345371284, "grad_norm": 0.18737247586250305, "learning_rate": 1.989537466613717e-05, "loss": 1.3311, "step": 2501 }, { "epoch": 0.7452111915709525, "grad_norm": 0.183830127120018, "learning_rate": 1.9895235456200685e-05, "loss": 1.3178, "step": 2502 }, { "epoch": 0.7455090377706212, "grad_norm": 0.18429724872112274, "learning_rate": 1.989509615420009e-05, "loss": 1.3013, "step": 2503 }, { "epoch": 0.7458068839702898, "grad_norm": 0.1926550567150116, "learning_rate": 1.9894956760136682e-05, "loss": 1.3067, "step": 2504 }, { "epoch": 0.7461047301699585, "grad_norm": 0.18947188556194305, "learning_rate": 1.9894817274011755e-05, "loss": 1.3108, "step": 2505 }, { "epoch": 0.7464025763696271, "grad_norm": 0.1820349395275116, "learning_rate": 1.9894677695826607e-05, "loss": 1.3147, "step": 2506 }, { "epoch": 0.7467004225692958, "grad_norm": 0.19136416912078857, "learning_rate": 1.989453802558254e-05, "loss": 1.3381, "step": 2507 }, { "epoch": 0.7469982687689645, "grad_norm": 0.1774381846189499, "learning_rate": 1.989439826328085e-05, "loss": 1.313, "step": 2508 }, { "epoch": 0.747296114968633, "grad_norm": 0.1879688948392868, "learning_rate": 1.9894258408922843e-05, "loss": 1.3352, "step": 2509 }, { "epoch": 0.7475939611683017, "grad_norm": 0.18918554484844208, "learning_rate": 1.989411846250981e-05, "loss": 1.317, "step": 2510 }, { "epoch": 0.7478918073679703, "grad_norm": 0.1828010529279709, "learning_rate": 1.9893978424043063e-05, "loss": 1.3183, "step": 2511 }, { "epoch": 0.748189653567639, "grad_norm": 0.18194228410720825, "learning_rate": 1.9893838293523902e-05, "loss": 1.3129, "step": 2512 }, { "epoch": 0.7484874997673077, "grad_norm": 0.18059831857681274, "learning_rate": 1.9893698070953626e-05, "loss": 1.3143, "step": 2513 }, { "epoch": 0.7487853459669763, "grad_norm": 0.19225727021694183, "learning_rate": 1.9893557756333545e-05, "loss": 1.3316, "step": 2514 }, { "epoch": 0.749083192166645, "grad_norm": 0.18320530652999878, "learning_rate": 1.9893417349664963e-05, "loss": 1.3095, "step": 2515 }, { "epoch": 0.7493810383663136, "grad_norm": 0.1820412576198578, "learning_rate": 1.9893276850949186e-05, "loss": 1.3282, "step": 2516 }, { "epoch": 0.7496788845659822, "grad_norm": 0.19816304743289948, "learning_rate": 1.989313626018752e-05, "loss": 1.3158, "step": 2517 }, { "epoch": 0.7499767307656509, "grad_norm": 0.18489590287208557, "learning_rate": 1.9892995577381276e-05, "loss": 1.3158, "step": 2518 }, { "epoch": 0.7502745769653195, "grad_norm": 0.18483327329158783, "learning_rate": 1.9892854802531762e-05, "loss": 1.3257, "step": 2519 }, { "epoch": 0.7505724231649882, "grad_norm": 0.18682824075222015, "learning_rate": 1.9892713935640287e-05, "loss": 1.3091, "step": 2520 }, { "epoch": 0.7508702693646568, "grad_norm": 0.19897626340389252, "learning_rate": 1.989257297670816e-05, "loss": 1.3275, "step": 2521 }, { "epoch": 0.7511681155643255, "grad_norm": 0.1884569376707077, "learning_rate": 1.989243192573669e-05, "loss": 1.3043, "step": 2522 }, { "epoch": 0.7514659617639942, "grad_norm": 0.18143467605113983, "learning_rate": 1.98922907827272e-05, "loss": 1.3071, "step": 2523 }, { "epoch": 0.7517638079636627, "grad_norm": 0.18571607768535614, "learning_rate": 1.9892149547680993e-05, "loss": 1.3112, "step": 2524 }, { "epoch": 0.7520616541633314, "grad_norm": 0.19137336313724518, "learning_rate": 1.989200822059939e-05, "loss": 1.3139, "step": 2525 }, { "epoch": 0.752359500363, "grad_norm": 0.19988438487052917, "learning_rate": 1.9891866801483702e-05, "loss": 1.3171, "step": 2526 }, { "epoch": 0.7526573465626687, "grad_norm": 0.18807357549667358, "learning_rate": 1.9891725290335243e-05, "loss": 1.338, "step": 2527 }, { "epoch": 0.7529551927623374, "grad_norm": 0.1797240525484085, "learning_rate": 1.9891583687155334e-05, "loss": 1.3159, "step": 2528 }, { "epoch": 0.753253038962006, "grad_norm": 0.18882443010807037, "learning_rate": 1.9891441991945288e-05, "loss": 1.3189, "step": 2529 }, { "epoch": 0.7535508851616747, "grad_norm": 0.18117474019527435, "learning_rate": 1.9891300204706427e-05, "loss": 1.3311, "step": 2530 }, { "epoch": 0.7538487313613433, "grad_norm": 0.18642142415046692, "learning_rate": 1.989115832544007e-05, "loss": 1.3146, "step": 2531 }, { "epoch": 0.7541465775610119, "grad_norm": 0.19051580131053925, "learning_rate": 1.989101635414753e-05, "loss": 1.3134, "step": 2532 }, { "epoch": 0.7544444237606805, "grad_norm": 0.19228899478912354, "learning_rate": 1.9890874290830143e-05, "loss": 1.3095, "step": 2533 }, { "epoch": 0.7547422699603492, "grad_norm": 0.17889706790447235, "learning_rate": 1.9890732135489215e-05, "loss": 1.2917, "step": 2534 }, { "epoch": 0.7550401161600179, "grad_norm": 0.18818412721157074, "learning_rate": 1.9890589888126075e-05, "loss": 1.3177, "step": 2535 }, { "epoch": 0.7553379623596865, "grad_norm": 0.18684431910514832, "learning_rate": 1.989044754874205e-05, "loss": 1.3361, "step": 2536 }, { "epoch": 0.7556358085593552, "grad_norm": 0.18594640493392944, "learning_rate": 1.9890305117338456e-05, "loss": 1.3065, "step": 2537 }, { "epoch": 0.7559336547590239, "grad_norm": 0.18711869418621063, "learning_rate": 1.989016259391663e-05, "loss": 1.3345, "step": 2538 }, { "epoch": 0.7562315009586924, "grad_norm": 0.19249029457569122, "learning_rate": 1.9890019978477885e-05, "loss": 1.3336, "step": 2539 }, { "epoch": 0.7565293471583611, "grad_norm": 0.1920575052499771, "learning_rate": 1.9889877271023558e-05, "loss": 1.3376, "step": 2540 }, { "epoch": 0.7568271933580297, "grad_norm": 0.187772735953331, "learning_rate": 1.9889734471554968e-05, "loss": 1.3191, "step": 2541 }, { "epoch": 0.7571250395576984, "grad_norm": 0.1912943422794342, "learning_rate": 1.988959158007345e-05, "loss": 1.3289, "step": 2542 }, { "epoch": 0.757422885757367, "grad_norm": 0.20400895178318024, "learning_rate": 1.9889448596580336e-05, "loss": 1.3235, "step": 2543 }, { "epoch": 0.7577207319570357, "grad_norm": 0.18285918235778809, "learning_rate": 1.9889305521076946e-05, "loss": 1.325, "step": 2544 }, { "epoch": 0.7580185781567044, "grad_norm": 0.18557345867156982, "learning_rate": 1.988916235356462e-05, "loss": 1.3148, "step": 2545 }, { "epoch": 0.758316424356373, "grad_norm": 0.18313251435756683, "learning_rate": 1.9889019094044687e-05, "loss": 1.334, "step": 2546 }, { "epoch": 0.7586142705560416, "grad_norm": 0.18324284255504608, "learning_rate": 1.9888875742518477e-05, "loss": 1.3135, "step": 2547 }, { "epoch": 0.7589121167557102, "grad_norm": 0.20431822538375854, "learning_rate": 1.988873229898733e-05, "loss": 1.3366, "step": 2548 }, { "epoch": 0.7592099629553789, "grad_norm": 0.20846553146839142, "learning_rate": 1.988858876345258e-05, "loss": 1.3369, "step": 2549 }, { "epoch": 0.7595078091550476, "grad_norm": 0.19721843302249908, "learning_rate": 1.9888445135915554e-05, "loss": 1.3243, "step": 2550 }, { "epoch": 0.7598056553547162, "grad_norm": 0.18610846996307373, "learning_rate": 1.9888301416377595e-05, "loss": 1.3156, "step": 2551 }, { "epoch": 0.7601035015543849, "grad_norm": 0.18554642796516418, "learning_rate": 1.988815760484004e-05, "loss": 1.3479, "step": 2552 }, { "epoch": 0.7604013477540535, "grad_norm": 0.19714638590812683, "learning_rate": 1.988801370130423e-05, "loss": 1.3221, "step": 2553 }, { "epoch": 0.7606991939537221, "grad_norm": 0.20119118690490723, "learning_rate": 1.9887869705771496e-05, "loss": 1.3248, "step": 2554 }, { "epoch": 0.7609970401533908, "grad_norm": 0.18889166414737701, "learning_rate": 1.988772561824318e-05, "loss": 1.3217, "step": 2555 }, { "epoch": 0.7612948863530594, "grad_norm": 0.18209540843963623, "learning_rate": 1.9887581438720627e-05, "loss": 1.3183, "step": 2556 }, { "epoch": 0.7615927325527281, "grad_norm": 0.2016037553548813, "learning_rate": 1.9887437167205178e-05, "loss": 1.3337, "step": 2557 }, { "epoch": 0.7618905787523967, "grad_norm": 0.1964045912027359, "learning_rate": 1.988729280369817e-05, "loss": 1.3191, "step": 2558 }, { "epoch": 0.7621884249520654, "grad_norm": 0.20772108435630798, "learning_rate": 1.988714834820095e-05, "loss": 1.3273, "step": 2559 }, { "epoch": 0.7624862711517341, "grad_norm": 0.18894939124584198, "learning_rate": 1.988700380071486e-05, "loss": 1.3096, "step": 2560 }, { "epoch": 0.7627841173514027, "grad_norm": 0.18265798687934875, "learning_rate": 1.9886859161241248e-05, "loss": 1.3145, "step": 2561 }, { "epoch": 0.7630819635510713, "grad_norm": 0.19556808471679688, "learning_rate": 1.9886714429781457e-05, "loss": 1.3332, "step": 2562 }, { "epoch": 0.7633798097507399, "grad_norm": 0.20578935742378235, "learning_rate": 1.988656960633683e-05, "loss": 1.3277, "step": 2563 }, { "epoch": 0.7636776559504086, "grad_norm": 0.19405904412269592, "learning_rate": 1.9886424690908724e-05, "loss": 1.3194, "step": 2564 }, { "epoch": 0.7639755021500773, "grad_norm": 0.19134913384914398, "learning_rate": 1.988627968349848e-05, "loss": 1.3268, "step": 2565 }, { "epoch": 0.7642733483497459, "grad_norm": 0.1856839954853058, "learning_rate": 1.988613458410745e-05, "loss": 1.3263, "step": 2566 }, { "epoch": 0.7645711945494146, "grad_norm": 0.20191329717636108, "learning_rate": 1.9885989392736986e-05, "loss": 1.3044, "step": 2567 }, { "epoch": 0.7648690407490832, "grad_norm": 0.19773228466510773, "learning_rate": 1.9885844109388436e-05, "loss": 1.3145, "step": 2568 }, { "epoch": 0.7651668869487518, "grad_norm": 0.18773703277111053, "learning_rate": 1.9885698734063146e-05, "loss": 1.3246, "step": 2569 }, { "epoch": 0.7654647331484205, "grad_norm": 0.1798093169927597, "learning_rate": 1.988555326676248e-05, "loss": 1.3189, "step": 2570 }, { "epoch": 0.7657625793480891, "grad_norm": 0.20348991453647614, "learning_rate": 1.988540770748778e-05, "loss": 1.323, "step": 2571 }, { "epoch": 0.7660604255477578, "grad_norm": 0.19539345800876617, "learning_rate": 1.988526205624041e-05, "loss": 1.3106, "step": 2572 }, { "epoch": 0.7663582717474264, "grad_norm": 0.18025873601436615, "learning_rate": 1.988511631302172e-05, "loss": 1.3169, "step": 2573 }, { "epoch": 0.7666561179470951, "grad_norm": 0.18473367393016815, "learning_rate": 1.9884970477833066e-05, "loss": 1.3124, "step": 2574 }, { "epoch": 0.7669539641467638, "grad_norm": 0.19651813805103302, "learning_rate": 1.9884824550675806e-05, "loss": 1.3288, "step": 2575 }, { "epoch": 0.7672518103464323, "grad_norm": 0.18388725817203522, "learning_rate": 1.9884678531551297e-05, "loss": 1.3271, "step": 2576 }, { "epoch": 0.767549656546101, "grad_norm": 0.17565305531024933, "learning_rate": 1.98845324204609e-05, "loss": 1.3173, "step": 2577 }, { "epoch": 0.7678475027457696, "grad_norm": 0.1815018653869629, "learning_rate": 1.9884386217405972e-05, "loss": 1.3181, "step": 2578 }, { "epoch": 0.7681453489454383, "grad_norm": 0.19392378628253937, "learning_rate": 1.9884239922387873e-05, "loss": 1.3379, "step": 2579 }, { "epoch": 0.768443195145107, "grad_norm": 0.18461914360523224, "learning_rate": 1.9884093535407963e-05, "loss": 1.3227, "step": 2580 }, { "epoch": 0.7687410413447756, "grad_norm": 0.3589487671852112, "learning_rate": 1.988394705646761e-05, "loss": 1.3266, "step": 2581 }, { "epoch": 0.7690388875444443, "grad_norm": 0.1992337703704834, "learning_rate": 1.9883800485568174e-05, "loss": 1.3151, "step": 2582 }, { "epoch": 0.7693367337441129, "grad_norm": 0.18829941749572754, "learning_rate": 1.988365382271101e-05, "loss": 1.3257, "step": 2583 }, { "epoch": 0.7696345799437815, "grad_norm": 0.1826237291097641, "learning_rate": 1.9883507067897493e-05, "loss": 1.3114, "step": 2584 }, { "epoch": 0.7699324261434501, "grad_norm": 0.1839599758386612, "learning_rate": 1.9883360221128987e-05, "loss": 1.3176, "step": 2585 }, { "epoch": 0.7702302723431188, "grad_norm": 0.17493994534015656, "learning_rate": 1.9883213282406855e-05, "loss": 1.3207, "step": 2586 }, { "epoch": 0.7705281185427875, "grad_norm": 0.17947088181972504, "learning_rate": 1.988306625173247e-05, "loss": 1.3061, "step": 2587 }, { "epoch": 0.7708259647424561, "grad_norm": 0.1852508932352066, "learning_rate": 1.988291912910719e-05, "loss": 1.3236, "step": 2588 }, { "epoch": 0.7711238109421248, "grad_norm": 0.18166646361351013, "learning_rate": 1.988277191453239e-05, "loss": 1.3069, "step": 2589 }, { "epoch": 0.7714216571417934, "grad_norm": 0.1864398717880249, "learning_rate": 1.988262460800944e-05, "loss": 1.3407, "step": 2590 }, { "epoch": 0.771719503341462, "grad_norm": 0.1851319670677185, "learning_rate": 1.9882477209539707e-05, "loss": 1.3223, "step": 2591 }, { "epoch": 0.7720173495411307, "grad_norm": 0.19353334605693817, "learning_rate": 1.988232971912457e-05, "loss": 1.3085, "step": 2592 }, { "epoch": 0.7723151957407993, "grad_norm": 0.19049708545207977, "learning_rate": 1.9882182136765394e-05, "loss": 1.3389, "step": 2593 }, { "epoch": 0.772613041940468, "grad_norm": 0.1989065259695053, "learning_rate": 1.9882034462463553e-05, "loss": 1.3245, "step": 2594 }, { "epoch": 0.7729108881401366, "grad_norm": 0.1943540871143341, "learning_rate": 1.9881886696220424e-05, "loss": 1.31, "step": 2595 }, { "epoch": 0.7732087343398053, "grad_norm": 0.18225571513175964, "learning_rate": 1.9881738838037376e-05, "loss": 1.3101, "step": 2596 }, { "epoch": 0.773506580539474, "grad_norm": 0.18928158283233643, "learning_rate": 1.9881590887915794e-05, "loss": 1.3065, "step": 2597 }, { "epoch": 0.7738044267391426, "grad_norm": 0.19873449206352234, "learning_rate": 1.9881442845857046e-05, "loss": 1.3214, "step": 2598 }, { "epoch": 0.7741022729388112, "grad_norm": 0.18662762641906738, "learning_rate": 1.9881294711862514e-05, "loss": 1.3086, "step": 2599 }, { "epoch": 0.7744001191384798, "grad_norm": 0.18702490627765656, "learning_rate": 1.9881146485933574e-05, "loss": 1.3127, "step": 2600 }, { "epoch": 0.7746979653381485, "grad_norm": 0.19949471950531006, "learning_rate": 1.9880998168071607e-05, "loss": 1.3108, "step": 2601 }, { "epoch": 0.7749958115378172, "grad_norm": 0.19867639243602753, "learning_rate": 1.9880849758277987e-05, "loss": 1.315, "step": 2602 }, { "epoch": 0.7752936577374858, "grad_norm": 0.18987229466438293, "learning_rate": 1.9880701256554106e-05, "loss": 1.3311, "step": 2603 }, { "epoch": 0.7755915039371545, "grad_norm": 0.19967243075370789, "learning_rate": 1.9880552662901337e-05, "loss": 1.3342, "step": 2604 }, { "epoch": 0.7758893501368231, "grad_norm": 0.1925693154335022, "learning_rate": 1.9880403977321063e-05, "loss": 1.3222, "step": 2605 }, { "epoch": 0.7761871963364917, "grad_norm": 0.19117453694343567, "learning_rate": 1.988025519981467e-05, "loss": 1.3056, "step": 2606 }, { "epoch": 0.7764850425361604, "grad_norm": 0.19443592429161072, "learning_rate": 1.988010633038354e-05, "loss": 1.3319, "step": 2607 }, { "epoch": 0.776782888735829, "grad_norm": 0.19276925921440125, "learning_rate": 1.9879957369029062e-05, "loss": 1.2891, "step": 2608 }, { "epoch": 0.7770807349354977, "grad_norm": 0.1844119131565094, "learning_rate": 1.9879808315752616e-05, "loss": 1.3221, "step": 2609 }, { "epoch": 0.7773785811351663, "grad_norm": 0.1958608776330948, "learning_rate": 1.9879659170555596e-05, "loss": 1.3192, "step": 2610 }, { "epoch": 0.777676427334835, "grad_norm": 0.18317051231861115, "learning_rate": 1.9879509933439384e-05, "loss": 1.3238, "step": 2611 }, { "epoch": 0.7779742735345037, "grad_norm": 0.20573605597019196, "learning_rate": 1.987936060440537e-05, "loss": 1.2924, "step": 2612 }, { "epoch": 0.7782721197341723, "grad_norm": 0.19641467928886414, "learning_rate": 1.987921118345494e-05, "loss": 1.319, "step": 2613 }, { "epoch": 0.7785699659338409, "grad_norm": 0.18709130585193634, "learning_rate": 1.9879061670589493e-05, "loss": 1.3208, "step": 2614 }, { "epoch": 0.7788678121335095, "grad_norm": 0.2033838927745819, "learning_rate": 1.987891206581041e-05, "loss": 1.3148, "step": 2615 }, { "epoch": 0.7791656583331782, "grad_norm": 0.1878487765789032, "learning_rate": 1.9878762369119092e-05, "loss": 1.3306, "step": 2616 }, { "epoch": 0.7794635045328469, "grad_norm": 0.18077583611011505, "learning_rate": 1.9878612580516926e-05, "loss": 1.3171, "step": 2617 }, { "epoch": 0.7797613507325155, "grad_norm": 0.18070384860038757, "learning_rate": 1.9878462700005306e-05, "loss": 1.3016, "step": 2618 }, { "epoch": 0.7800591969321842, "grad_norm": 0.1827581375837326, "learning_rate": 1.9878312727585627e-05, "loss": 1.327, "step": 2619 }, { "epoch": 0.7803570431318528, "grad_norm": 0.19255481660366058, "learning_rate": 1.9878162663259285e-05, "loss": 1.3174, "step": 2620 }, { "epoch": 0.7806548893315214, "grad_norm": 0.18368345499038696, "learning_rate": 1.987801250702768e-05, "loss": 1.3263, "step": 2621 }, { "epoch": 0.78095273553119, "grad_norm": 0.18737348914146423, "learning_rate": 1.98778622588922e-05, "loss": 1.3132, "step": 2622 }, { "epoch": 0.7812505817308587, "grad_norm": 0.1827528327703476, "learning_rate": 1.9877711918854248e-05, "loss": 1.3205, "step": 2623 }, { "epoch": 0.7815484279305274, "grad_norm": 0.18387190997600555, "learning_rate": 1.9877561486915224e-05, "loss": 1.3216, "step": 2624 }, { "epoch": 0.781846274130196, "grad_norm": 0.18497104942798615, "learning_rate": 1.9877410963076523e-05, "loss": 1.3116, "step": 2625 }, { "epoch": 0.7821441203298647, "grad_norm": 0.18254296481609344, "learning_rate": 1.9877260347339552e-05, "loss": 1.317, "step": 2626 }, { "epoch": 0.7824419665295334, "grad_norm": 0.1875661015510559, "learning_rate": 1.987710963970571e-05, "loss": 1.3146, "step": 2627 }, { "epoch": 0.782739812729202, "grad_norm": 0.19719748198986053, "learning_rate": 1.9876958840176397e-05, "loss": 1.3283, "step": 2628 }, { "epoch": 0.7830376589288706, "grad_norm": 0.18464429676532745, "learning_rate": 1.9876807948753017e-05, "loss": 1.3177, "step": 2629 }, { "epoch": 0.7833355051285392, "grad_norm": 0.18928779661655426, "learning_rate": 1.9876656965436974e-05, "loss": 1.3387, "step": 2630 }, { "epoch": 0.7836333513282079, "grad_norm": 0.18596485257148743, "learning_rate": 1.9876505890229675e-05, "loss": 1.3074, "step": 2631 }, { "epoch": 0.7839311975278765, "grad_norm": 0.18939976394176483, "learning_rate": 1.987635472313252e-05, "loss": 1.3088, "step": 2632 }, { "epoch": 0.7842290437275452, "grad_norm": 0.18326114118099213, "learning_rate": 1.9876203464146922e-05, "loss": 1.3119, "step": 2633 }, { "epoch": 0.7845268899272139, "grad_norm": 0.19060494005680084, "learning_rate": 1.987605211327428e-05, "loss": 1.3008, "step": 2634 }, { "epoch": 0.7848247361268825, "grad_norm": 0.19197452068328857, "learning_rate": 1.987590067051601e-05, "loss": 1.325, "step": 2635 }, { "epoch": 0.7851225823265511, "grad_norm": 0.18107475340366364, "learning_rate": 1.987574913587352e-05, "loss": 1.3189, "step": 2636 }, { "epoch": 0.7854204285262197, "grad_norm": 0.18983641266822815, "learning_rate": 1.9875597509348218e-05, "loss": 1.3172, "step": 2637 }, { "epoch": 0.7857182747258884, "grad_norm": 0.18232080340385437, "learning_rate": 1.9875445790941513e-05, "loss": 1.3015, "step": 2638 }, { "epoch": 0.7860161209255571, "grad_norm": 0.18595845997333527, "learning_rate": 1.987529398065482e-05, "loss": 1.319, "step": 2639 }, { "epoch": 0.7863139671252257, "grad_norm": 0.1903022974729538, "learning_rate": 1.987514207848955e-05, "loss": 1.3259, "step": 2640 }, { "epoch": 0.7866118133248944, "grad_norm": 0.19145415723323822, "learning_rate": 1.9874990084447116e-05, "loss": 1.3201, "step": 2641 }, { "epoch": 0.786909659524563, "grad_norm": 0.19813406467437744, "learning_rate": 1.987483799852893e-05, "loss": 1.321, "step": 2642 }, { "epoch": 0.7872075057242317, "grad_norm": 0.18630708754062653, "learning_rate": 1.9874685820736413e-05, "loss": 1.3232, "step": 2643 }, { "epoch": 0.7875053519239003, "grad_norm": 0.20730362832546234, "learning_rate": 1.987453355107097e-05, "loss": 1.3476, "step": 2644 }, { "epoch": 0.7878031981235689, "grad_norm": 0.19203825294971466, "learning_rate": 1.9874381189534032e-05, "loss": 1.3195, "step": 2645 }, { "epoch": 0.7881010443232376, "grad_norm": 0.18337178230285645, "learning_rate": 1.9874228736127007e-05, "loss": 1.3294, "step": 2646 }, { "epoch": 0.7883988905229062, "grad_norm": 0.19438254833221436, "learning_rate": 1.9874076190851313e-05, "loss": 1.3328, "step": 2647 }, { "epoch": 0.7886967367225749, "grad_norm": 0.1868901252746582, "learning_rate": 1.9873923553708378e-05, "loss": 1.3236, "step": 2648 }, { "epoch": 0.7889945829222436, "grad_norm": 0.19647769629955292, "learning_rate": 1.9873770824699613e-05, "loss": 1.3171, "step": 2649 }, { "epoch": 0.7892924291219122, "grad_norm": 0.19201479852199554, "learning_rate": 1.9873618003826442e-05, "loss": 1.3166, "step": 2650 }, { "epoch": 0.7895902753215808, "grad_norm": 0.1863786280155182, "learning_rate": 1.9873465091090285e-05, "loss": 1.3071, "step": 2651 }, { "epoch": 0.7898881215212494, "grad_norm": 0.21051806211471558, "learning_rate": 1.9873312086492568e-05, "loss": 1.3264, "step": 2652 }, { "epoch": 0.7901859677209181, "grad_norm": 0.22071000933647156, "learning_rate": 1.9873158990034714e-05, "loss": 1.3259, "step": 2653 }, { "epoch": 0.7904838139205868, "grad_norm": 0.18861712515354156, "learning_rate": 1.9873005801718146e-05, "loss": 1.3192, "step": 2654 }, { "epoch": 0.7907816601202554, "grad_norm": 0.19382211565971375, "learning_rate": 1.987285252154429e-05, "loss": 1.3157, "step": 2655 }, { "epoch": 0.7910795063199241, "grad_norm": 0.20809771120548248, "learning_rate": 1.9872699149514574e-05, "loss": 1.3077, "step": 2656 }, { "epoch": 0.7913773525195927, "grad_norm": 0.19337840378284454, "learning_rate": 1.987254568563042e-05, "loss": 1.331, "step": 2657 }, { "epoch": 0.7916751987192613, "grad_norm": 0.19075921177864075, "learning_rate": 1.987239212989326e-05, "loss": 1.3267, "step": 2658 }, { "epoch": 0.79197304491893, "grad_norm": 0.1955624371767044, "learning_rate": 1.987223848230452e-05, "loss": 1.3068, "step": 2659 }, { "epoch": 0.7922708911185986, "grad_norm": 0.19948722422122955, "learning_rate": 1.987208474286563e-05, "loss": 1.3018, "step": 2660 }, { "epoch": 0.7925687373182673, "grad_norm": 0.19672907888889313, "learning_rate": 1.9871930911578024e-05, "loss": 1.3015, "step": 2661 }, { "epoch": 0.7928665835179359, "grad_norm": 0.19176773726940155, "learning_rate": 1.987177698844313e-05, "loss": 1.3108, "step": 2662 }, { "epoch": 0.7931644297176046, "grad_norm": 0.1909971386194229, "learning_rate": 1.9871622973462377e-05, "loss": 1.3083, "step": 2663 }, { "epoch": 0.7934622759172733, "grad_norm": 0.20255598425865173, "learning_rate": 1.9871468866637203e-05, "loss": 1.3245, "step": 2664 }, { "epoch": 0.7937601221169419, "grad_norm": 0.1945829540491104, "learning_rate": 1.9871314667969043e-05, "loss": 1.3201, "step": 2665 }, { "epoch": 0.7940579683166105, "grad_norm": 0.1856970489025116, "learning_rate": 1.9871160377459326e-05, "loss": 1.3255, "step": 2666 }, { "epoch": 0.7943558145162791, "grad_norm": 0.19815580546855927, "learning_rate": 1.9871005995109492e-05, "loss": 1.3197, "step": 2667 }, { "epoch": 0.7946536607159478, "grad_norm": 0.19439074397087097, "learning_rate": 1.987085152092098e-05, "loss": 1.3085, "step": 2668 }, { "epoch": 0.7949515069156164, "grad_norm": 0.19563210010528564, "learning_rate": 1.9870696954895216e-05, "loss": 1.3176, "step": 2669 }, { "epoch": 0.7952493531152851, "grad_norm": 0.1940533071756363, "learning_rate": 1.9870542297033645e-05, "loss": 1.3157, "step": 2670 }, { "epoch": 0.7955471993149538, "grad_norm": 0.20374523103237152, "learning_rate": 1.9870387547337708e-05, "loss": 1.3346, "step": 2671 }, { "epoch": 0.7958450455146224, "grad_norm": 0.19792142510414124, "learning_rate": 1.9870232705808844e-05, "loss": 1.3141, "step": 2672 }, { "epoch": 0.796142891714291, "grad_norm": 0.1836538463830948, "learning_rate": 1.9870077772448493e-05, "loss": 1.3147, "step": 2673 }, { "epoch": 0.7964407379139596, "grad_norm": 0.19210876524448395, "learning_rate": 1.986992274725809e-05, "loss": 1.3252, "step": 2674 }, { "epoch": 0.7967385841136283, "grad_norm": 0.2051551789045334, "learning_rate": 1.986976763023909e-05, "loss": 1.3092, "step": 2675 }, { "epoch": 0.797036430313297, "grad_norm": 0.19050179421901703, "learning_rate": 1.9869612421392928e-05, "loss": 1.3185, "step": 2676 }, { "epoch": 0.7973342765129656, "grad_norm": 0.18543951213359833, "learning_rate": 1.986945712072105e-05, "loss": 1.3068, "step": 2677 }, { "epoch": 0.7976321227126343, "grad_norm": 0.18150204420089722, "learning_rate": 1.9869301728224893e-05, "loss": 1.3045, "step": 2678 }, { "epoch": 0.797929968912303, "grad_norm": 0.18430055677890778, "learning_rate": 1.986914624390592e-05, "loss": 1.3197, "step": 2679 }, { "epoch": 0.7982278151119716, "grad_norm": 0.19682268798351288, "learning_rate": 1.986899066776556e-05, "loss": 1.3075, "step": 2680 }, { "epoch": 0.7985256613116402, "grad_norm": 0.1920955628156662, "learning_rate": 1.986883499980527e-05, "loss": 1.3059, "step": 2681 }, { "epoch": 0.7988235075113088, "grad_norm": 0.18305283784866333, "learning_rate": 1.9868679240026502e-05, "loss": 1.297, "step": 2682 }, { "epoch": 0.7991213537109775, "grad_norm": 0.19209906458854675, "learning_rate": 1.9868523388430693e-05, "loss": 1.3048, "step": 2683 }, { "epoch": 0.7994191999106461, "grad_norm": 0.19421148300170898, "learning_rate": 1.9868367445019304e-05, "loss": 1.3234, "step": 2684 }, { "epoch": 0.7997170461103148, "grad_norm": 0.19669722020626068, "learning_rate": 1.9868211409793778e-05, "loss": 1.3239, "step": 2685 }, { "epoch": 0.8000148923099835, "grad_norm": 0.19060830771923065, "learning_rate": 1.986805528275557e-05, "loss": 1.3148, "step": 2686 }, { "epoch": 0.8003127385096521, "grad_norm": 0.19015301764011383, "learning_rate": 1.9867899063906136e-05, "loss": 1.3139, "step": 2687 }, { "epoch": 0.8006105847093207, "grad_norm": 0.18294614553451538, "learning_rate": 1.9867742753246926e-05, "loss": 1.325, "step": 2688 }, { "epoch": 0.8009084309089893, "grad_norm": 0.18956173956394196, "learning_rate": 1.9867586350779394e-05, "loss": 1.3257, "step": 2689 }, { "epoch": 0.801206277108658, "grad_norm": 0.19346562027931213, "learning_rate": 1.9867429856504993e-05, "loss": 1.3058, "step": 2690 }, { "epoch": 0.8015041233083267, "grad_norm": 0.18454086780548096, "learning_rate": 1.9867273270425184e-05, "loss": 1.3087, "step": 2691 }, { "epoch": 0.8018019695079953, "grad_norm": 0.19533635675907135, "learning_rate": 1.9867116592541423e-05, "loss": 1.3062, "step": 2692 }, { "epoch": 0.802099815707664, "grad_norm": 0.2025240808725357, "learning_rate": 1.9866959822855163e-05, "loss": 1.3281, "step": 2693 }, { "epoch": 0.8023976619073326, "grad_norm": 0.19497162103652954, "learning_rate": 1.9866802961367867e-05, "loss": 1.3307, "step": 2694 }, { "epoch": 0.8026955081070013, "grad_norm": 0.18541958928108215, "learning_rate": 1.9866646008080996e-05, "loss": 1.3159, "step": 2695 }, { "epoch": 0.8029933543066698, "grad_norm": 0.18936875462532043, "learning_rate": 1.9866488962996004e-05, "loss": 1.3178, "step": 2696 }, { "epoch": 0.8032912005063385, "grad_norm": 0.21259909868240356, "learning_rate": 1.986633182611436e-05, "loss": 1.3445, "step": 2697 }, { "epoch": 0.8035890467060072, "grad_norm": 0.18928909301757812, "learning_rate": 1.9866174597437517e-05, "loss": 1.3102, "step": 2698 }, { "epoch": 0.8038868929056758, "grad_norm": 0.18981429934501648, "learning_rate": 1.9866017276966945e-05, "loss": 1.3098, "step": 2699 }, { "epoch": 0.8041847391053445, "grad_norm": 0.19316232204437256, "learning_rate": 1.98658598647041e-05, "loss": 1.3146, "step": 2700 }, { "epoch": 0.8044825853050132, "grad_norm": 0.2001708298921585, "learning_rate": 1.9865702360650458e-05, "loss": 1.3193, "step": 2701 }, { "epoch": 0.8047804315046818, "grad_norm": 0.19563239812850952, "learning_rate": 1.986554476480748e-05, "loss": 1.3055, "step": 2702 }, { "epoch": 0.8050782777043504, "grad_norm": 0.20106148719787598, "learning_rate": 1.9865387077176623e-05, "loss": 1.3182, "step": 2703 }, { "epoch": 0.805376123904019, "grad_norm": 0.18671630322933197, "learning_rate": 1.986522929775937e-05, "loss": 1.2984, "step": 2704 }, { "epoch": 0.8056739701036877, "grad_norm": 0.18762362003326416, "learning_rate": 1.986507142655717e-05, "loss": 1.319, "step": 2705 }, { "epoch": 0.8059718163033563, "grad_norm": 0.20593728125095367, "learning_rate": 1.9864913463571512e-05, "loss": 1.2955, "step": 2706 }, { "epoch": 0.806269662503025, "grad_norm": 0.18869160115718842, "learning_rate": 1.986475540880385e-05, "loss": 1.3055, "step": 2707 }, { "epoch": 0.8065675087026937, "grad_norm": 0.20767731964588165, "learning_rate": 1.9864597262255666e-05, "loss": 1.3087, "step": 2708 }, { "epoch": 0.8068653549023623, "grad_norm": 0.18076202273368835, "learning_rate": 1.986443902392842e-05, "loss": 1.305, "step": 2709 }, { "epoch": 0.807163201102031, "grad_norm": 0.19493411481380463, "learning_rate": 1.9864280693823594e-05, "loss": 1.2989, "step": 2710 }, { "epoch": 0.8074610473016995, "grad_norm": 0.21097835898399353, "learning_rate": 1.9864122271942654e-05, "loss": 1.3312, "step": 2711 }, { "epoch": 0.8077588935013682, "grad_norm": 0.20255888998508453, "learning_rate": 1.986396375828708e-05, "loss": 1.3153, "step": 2712 }, { "epoch": 0.8080567397010369, "grad_norm": 0.18912790715694427, "learning_rate": 1.9863805152858342e-05, "loss": 1.3159, "step": 2713 }, { "epoch": 0.8083545859007055, "grad_norm": 0.18855468928813934, "learning_rate": 1.9863646455657918e-05, "loss": 1.3227, "step": 2714 }, { "epoch": 0.8086524321003742, "grad_norm": 0.20826329290866852, "learning_rate": 1.986348766668728e-05, "loss": 1.3123, "step": 2715 }, { "epoch": 0.8089502783000428, "grad_norm": 0.19729100167751312, "learning_rate": 1.9863328785947916e-05, "loss": 1.3168, "step": 2716 }, { "epoch": 0.8092481244997115, "grad_norm": 0.19067421555519104, "learning_rate": 1.9863169813441296e-05, "loss": 1.3018, "step": 2717 }, { "epoch": 0.8095459706993801, "grad_norm": 0.19457784295082092, "learning_rate": 1.98630107491689e-05, "loss": 1.3201, "step": 2718 }, { "epoch": 0.8098438168990487, "grad_norm": 0.1942594051361084, "learning_rate": 1.9862851593132208e-05, "loss": 1.3234, "step": 2719 }, { "epoch": 0.8101416630987174, "grad_norm": 0.20054592192173004, "learning_rate": 1.9862692345332704e-05, "loss": 1.3224, "step": 2720 }, { "epoch": 0.810439509298386, "grad_norm": 0.20668689906597137, "learning_rate": 1.9862533005771864e-05, "loss": 1.32, "step": 2721 }, { "epoch": 0.8107373554980547, "grad_norm": 0.1965094655752182, "learning_rate": 1.9862373574451173e-05, "loss": 1.3066, "step": 2722 }, { "epoch": 0.8110352016977234, "grad_norm": 0.1899128556251526, "learning_rate": 1.9862214051372114e-05, "loss": 1.3015, "step": 2723 }, { "epoch": 0.811333047897392, "grad_norm": 0.20184451341629028, "learning_rate": 1.9862054436536175e-05, "loss": 1.309, "step": 2724 }, { "epoch": 0.8116308940970607, "grad_norm": 0.19189730286598206, "learning_rate": 1.9861894729944836e-05, "loss": 1.3255, "step": 2725 }, { "epoch": 0.8119287402967292, "grad_norm": 0.19494622945785522, "learning_rate": 1.9861734931599588e-05, "loss": 1.2981, "step": 2726 }, { "epoch": 0.8122265864963979, "grad_norm": 0.21060998737812042, "learning_rate": 1.9861575041501912e-05, "loss": 1.3195, "step": 2727 }, { "epoch": 0.8125244326960666, "grad_norm": 0.19490574300289154, "learning_rate": 1.98614150596533e-05, "loss": 1.3309, "step": 2728 }, { "epoch": 0.8128222788957352, "grad_norm": 0.1967184990644455, "learning_rate": 1.9861254986055235e-05, "loss": 1.3219, "step": 2729 }, { "epoch": 0.8131201250954039, "grad_norm": 0.2028256356716156, "learning_rate": 1.9861094820709215e-05, "loss": 1.3335, "step": 2730 }, { "epoch": 0.8134179712950725, "grad_norm": 0.21276399493217468, "learning_rate": 1.986093456361672e-05, "loss": 1.3126, "step": 2731 }, { "epoch": 0.8137158174947412, "grad_norm": 0.20708300173282623, "learning_rate": 1.986077421477925e-05, "loss": 1.3266, "step": 2732 }, { "epoch": 0.8140136636944098, "grad_norm": 0.20489421486854553, "learning_rate": 1.986061377419829e-05, "loss": 1.3226, "step": 2733 }, { "epoch": 0.8143115098940784, "grad_norm": 0.19015038013458252, "learning_rate": 1.9860453241875342e-05, "loss": 1.3064, "step": 2734 }, { "epoch": 0.8146093560937471, "grad_norm": 0.18883351981639862, "learning_rate": 1.9860292617811888e-05, "loss": 1.2963, "step": 2735 }, { "epoch": 0.8149072022934157, "grad_norm": 0.18830153346061707, "learning_rate": 1.986013190200943e-05, "loss": 1.3319, "step": 2736 }, { "epoch": 0.8152050484930844, "grad_norm": 0.1849372833967209, "learning_rate": 1.9859971094469456e-05, "loss": 1.3048, "step": 2737 }, { "epoch": 0.815502894692753, "grad_norm": 0.20077911019325256, "learning_rate": 1.985981019519347e-05, "loss": 1.3198, "step": 2738 }, { "epoch": 0.8158007408924217, "grad_norm": 0.19239689409732819, "learning_rate": 1.985964920418297e-05, "loss": 1.3265, "step": 2739 }, { "epoch": 0.8160985870920904, "grad_norm": 0.19686836004257202, "learning_rate": 1.9859488121439448e-05, "loss": 1.305, "step": 2740 }, { "epoch": 0.8163964332917589, "grad_norm": 0.20230987668037415, "learning_rate": 1.9859326946964403e-05, "loss": 1.3075, "step": 2741 }, { "epoch": 0.8166942794914276, "grad_norm": 0.1813904494047165, "learning_rate": 1.9859165680759335e-05, "loss": 1.3003, "step": 2742 }, { "epoch": 0.8169921256910962, "grad_norm": 0.18585096299648285, "learning_rate": 1.985900432282575e-05, "loss": 1.3285, "step": 2743 }, { "epoch": 0.8172899718907649, "grad_norm": 0.19540949165821075, "learning_rate": 1.9858842873165142e-05, "loss": 1.3283, "step": 2744 }, { "epoch": 0.8175878180904336, "grad_norm": 0.1910727322101593, "learning_rate": 1.9858681331779016e-05, "loss": 1.3188, "step": 2745 }, { "epoch": 0.8178856642901022, "grad_norm": 0.17849615216255188, "learning_rate": 1.9858519698668877e-05, "loss": 1.312, "step": 2746 }, { "epoch": 0.8181835104897709, "grad_norm": 0.1925978809595108, "learning_rate": 1.985835797383622e-05, "loss": 1.2988, "step": 2747 }, { "epoch": 0.8184813566894394, "grad_norm": 0.18562570214271545, "learning_rate": 1.9858196157282564e-05, "loss": 1.3114, "step": 2748 }, { "epoch": 0.8187792028891081, "grad_norm": 0.1920994520187378, "learning_rate": 1.9858034249009406e-05, "loss": 1.3191, "step": 2749 }, { "epoch": 0.8190770490887768, "grad_norm": 0.18559814989566803, "learning_rate": 1.9857872249018252e-05, "loss": 1.3246, "step": 2750 }, { "epoch": 0.8193748952884454, "grad_norm": 0.19468210637569427, "learning_rate": 1.9857710157310612e-05, "loss": 1.3098, "step": 2751 }, { "epoch": 0.8196727414881141, "grad_norm": 0.19151291251182556, "learning_rate": 1.985754797388799e-05, "loss": 1.2972, "step": 2752 }, { "epoch": 0.8199705876877827, "grad_norm": 0.19110466539859772, "learning_rate": 1.9857385698751898e-05, "loss": 1.3182, "step": 2753 }, { "epoch": 0.8202684338874514, "grad_norm": 0.194067120552063, "learning_rate": 1.9857223331903846e-05, "loss": 1.3141, "step": 2754 }, { "epoch": 0.82056628008712, "grad_norm": 0.19519226253032684, "learning_rate": 1.9857060873345345e-05, "loss": 1.2999, "step": 2755 }, { "epoch": 0.8208641262867886, "grad_norm": 0.1942768692970276, "learning_rate": 1.9856898323077906e-05, "loss": 1.3266, "step": 2756 }, { "epoch": 0.8211619724864573, "grad_norm": 0.1909717470407486, "learning_rate": 1.985673568110304e-05, "loss": 1.2988, "step": 2757 }, { "epoch": 0.8214598186861259, "grad_norm": 0.19233834743499756, "learning_rate": 1.985657294742226e-05, "loss": 1.3027, "step": 2758 }, { "epoch": 0.8217576648857946, "grad_norm": 0.18716426193714142, "learning_rate": 1.985641012203708e-05, "loss": 1.3098, "step": 2759 }, { "epoch": 0.8220555110854633, "grad_norm": 0.20037902891635895, "learning_rate": 1.985624720494902e-05, "loss": 1.3118, "step": 2760 }, { "epoch": 0.8223533572851319, "grad_norm": 0.20361924171447754, "learning_rate": 1.985608419615959e-05, "loss": 1.3013, "step": 2761 }, { "epoch": 0.8226512034848006, "grad_norm": 0.19471381604671478, "learning_rate": 1.9855921095670306e-05, "loss": 1.3139, "step": 2762 }, { "epoch": 0.8229490496844691, "grad_norm": 0.19743210077285767, "learning_rate": 1.9855757903482692e-05, "loss": 1.309, "step": 2763 }, { "epoch": 0.8232468958841378, "grad_norm": 0.19530196487903595, "learning_rate": 1.9855594619598262e-05, "loss": 1.3305, "step": 2764 }, { "epoch": 0.8235447420838065, "grad_norm": 0.19224487245082855, "learning_rate": 1.985543124401853e-05, "loss": 1.309, "step": 2765 }, { "epoch": 0.8238425882834751, "grad_norm": 0.3356058895587921, "learning_rate": 1.9855267776745028e-05, "loss": 1.3086, "step": 2766 }, { "epoch": 0.8241404344831438, "grad_norm": 0.19616787135601044, "learning_rate": 1.9855104217779265e-05, "loss": 1.3214, "step": 2767 }, { "epoch": 0.8244382806828124, "grad_norm": 0.19699224829673767, "learning_rate": 1.9854940567122773e-05, "loss": 1.3096, "step": 2768 }, { "epoch": 0.8247361268824811, "grad_norm": 0.1945800483226776, "learning_rate": 1.9854776824777068e-05, "loss": 1.3194, "step": 2769 }, { "epoch": 0.8250339730821497, "grad_norm": 0.18592716753482819, "learning_rate": 1.9854612990743675e-05, "loss": 1.3042, "step": 2770 }, { "epoch": 0.8253318192818183, "grad_norm": 0.1943686157464981, "learning_rate": 1.985444906502412e-05, "loss": 1.3264, "step": 2771 }, { "epoch": 0.825629665481487, "grad_norm": 0.2716086506843567, "learning_rate": 1.985428504761992e-05, "loss": 1.3085, "step": 2772 }, { "epoch": 0.8259275116811556, "grad_norm": 0.20023038983345032, "learning_rate": 1.985412093853261e-05, "loss": 1.311, "step": 2773 }, { "epoch": 0.8262253578808243, "grad_norm": 0.1959606558084488, "learning_rate": 1.9853956737763718e-05, "loss": 1.3389, "step": 2774 }, { "epoch": 0.826523204080493, "grad_norm": 0.19522209465503693, "learning_rate": 1.9853792445314766e-05, "loss": 1.3306, "step": 2775 }, { "epoch": 0.8268210502801616, "grad_norm": 0.19641757011413574, "learning_rate": 1.985362806118728e-05, "loss": 1.3073, "step": 2776 }, { "epoch": 0.8271188964798303, "grad_norm": 0.19511444866657257, "learning_rate": 1.9853463585382804e-05, "loss": 1.3078, "step": 2777 }, { "epoch": 0.8274167426794988, "grad_norm": 0.186085045337677, "learning_rate": 1.985329901790285e-05, "loss": 1.3221, "step": 2778 }, { "epoch": 0.8277145888791675, "grad_norm": 0.1986667364835739, "learning_rate": 1.985313435874896e-05, "loss": 1.3094, "step": 2779 }, { "epoch": 0.8280124350788362, "grad_norm": 0.1856551170349121, "learning_rate": 1.9852969607922664e-05, "loss": 1.2961, "step": 2780 }, { "epoch": 0.8283102812785048, "grad_norm": 0.1947954148054123, "learning_rate": 1.9852804765425495e-05, "loss": 1.3067, "step": 2781 }, { "epoch": 0.8286081274781735, "grad_norm": 0.19135957956314087, "learning_rate": 1.9852639831258985e-05, "loss": 1.3059, "step": 2782 }, { "epoch": 0.8289059736778421, "grad_norm": 0.19226491451263428, "learning_rate": 1.985247480542467e-05, "loss": 1.2999, "step": 2783 }, { "epoch": 0.8292038198775108, "grad_norm": 0.19346089661121368, "learning_rate": 1.9852309687924084e-05, "loss": 1.3104, "step": 2784 }, { "epoch": 0.8295016660771793, "grad_norm": 0.19168633222579956, "learning_rate": 1.9852144478758763e-05, "loss": 1.3205, "step": 2785 }, { "epoch": 0.829799512276848, "grad_norm": 0.19923147559165955, "learning_rate": 1.9851979177930243e-05, "loss": 1.3308, "step": 2786 }, { "epoch": 0.8300973584765167, "grad_norm": 0.1852472871541977, "learning_rate": 1.985181378544007e-05, "loss": 1.2994, "step": 2787 }, { "epoch": 0.8303952046761853, "grad_norm": 0.20116518437862396, "learning_rate": 1.985164830128977e-05, "loss": 1.3182, "step": 2788 }, { "epoch": 0.830693050875854, "grad_norm": 0.1894659548997879, "learning_rate": 1.9851482725480896e-05, "loss": 1.3044, "step": 2789 }, { "epoch": 0.8309908970755226, "grad_norm": 0.19924505054950714, "learning_rate": 1.9851317058014978e-05, "loss": 1.3123, "step": 2790 }, { "epoch": 0.8312887432751913, "grad_norm": 0.19592399895191193, "learning_rate": 1.9851151298893563e-05, "loss": 1.3243, "step": 2791 }, { "epoch": 0.83158658947486, "grad_norm": 0.19199055433273315, "learning_rate": 1.9850985448118192e-05, "loss": 1.3114, "step": 2792 }, { "epoch": 0.8318844356745285, "grad_norm": 0.1868075728416443, "learning_rate": 1.9850819505690408e-05, "loss": 1.3117, "step": 2793 }, { "epoch": 0.8321822818741972, "grad_norm": 0.19179585576057434, "learning_rate": 1.985065347161175e-05, "loss": 1.3254, "step": 2794 }, { "epoch": 0.8324801280738658, "grad_norm": 0.2259640246629715, "learning_rate": 1.985048734588377e-05, "loss": 1.3308, "step": 2795 }, { "epoch": 0.8327779742735345, "grad_norm": 0.19590279459953308, "learning_rate": 1.9850321128508013e-05, "loss": 1.3217, "step": 2796 }, { "epoch": 0.8330758204732032, "grad_norm": 0.19112949073314667, "learning_rate": 1.985015481948602e-05, "loss": 1.3131, "step": 2797 }, { "epoch": 0.8333736666728718, "grad_norm": 0.20110125839710236, "learning_rate": 1.9849988418819342e-05, "loss": 1.3052, "step": 2798 }, { "epoch": 0.8336715128725405, "grad_norm": 0.1952708512544632, "learning_rate": 1.984982192650953e-05, "loss": 1.2947, "step": 2799 }, { "epoch": 0.833969359072209, "grad_norm": 0.18995912373065948, "learning_rate": 1.9849655342558126e-05, "loss": 1.3191, "step": 2800 }, { "epoch": 0.8342672052718777, "grad_norm": 0.1994224488735199, "learning_rate": 1.9849488666966686e-05, "loss": 1.3229, "step": 2801 }, { "epoch": 0.8345650514715464, "grad_norm": 0.19360418617725372, "learning_rate": 1.9849321899736757e-05, "loss": 1.3131, "step": 2802 }, { "epoch": 0.834862897671215, "grad_norm": 0.20493118464946747, "learning_rate": 1.9849155040869895e-05, "loss": 1.3249, "step": 2803 }, { "epoch": 0.8351607438708837, "grad_norm": 0.1936962753534317, "learning_rate": 1.9848988090367648e-05, "loss": 1.3174, "step": 2804 }, { "epoch": 0.8354585900705523, "grad_norm": 0.18214362859725952, "learning_rate": 1.9848821048231567e-05, "loss": 1.3082, "step": 2805 }, { "epoch": 0.835756436270221, "grad_norm": 0.21772930026054382, "learning_rate": 1.9848653914463214e-05, "loss": 1.3053, "step": 2806 }, { "epoch": 0.8360542824698897, "grad_norm": 0.19386501610279083, "learning_rate": 1.9848486689064138e-05, "loss": 1.3072, "step": 2807 }, { "epoch": 0.8363521286695582, "grad_norm": 0.183438241481781, "learning_rate": 1.9848319372035898e-05, "loss": 1.3009, "step": 2808 }, { "epoch": 0.8366499748692269, "grad_norm": 0.21069328486919403, "learning_rate": 1.9848151963380048e-05, "loss": 1.3227, "step": 2809 }, { "epoch": 0.8369478210688955, "grad_norm": 0.2096351981163025, "learning_rate": 1.984798446309815e-05, "loss": 1.313, "step": 2810 }, { "epoch": 0.8372456672685642, "grad_norm": 0.20137789845466614, "learning_rate": 1.9847816871191757e-05, "loss": 1.3208, "step": 2811 }, { "epoch": 0.8375435134682329, "grad_norm": 0.19148589670658112, "learning_rate": 1.9847649187662433e-05, "loss": 1.3121, "step": 2812 }, { "epoch": 0.8378413596679015, "grad_norm": 0.19229096174240112, "learning_rate": 1.9847481412511734e-05, "loss": 1.2878, "step": 2813 }, { "epoch": 0.8381392058675702, "grad_norm": 0.1903059333562851, "learning_rate": 1.984731354574122e-05, "loss": 1.3072, "step": 2814 }, { "epoch": 0.8384370520672387, "grad_norm": 0.19770605862140656, "learning_rate": 1.9847145587352458e-05, "loss": 1.3059, "step": 2815 }, { "epoch": 0.8387348982669074, "grad_norm": 0.20117489993572235, "learning_rate": 1.984697753734701e-05, "loss": 1.3155, "step": 2816 }, { "epoch": 0.839032744466576, "grad_norm": 0.20878930389881134, "learning_rate": 1.9846809395726433e-05, "loss": 1.3177, "step": 2817 }, { "epoch": 0.8393305906662447, "grad_norm": 0.19036711752414703, "learning_rate": 1.9846641162492303e-05, "loss": 1.3054, "step": 2818 }, { "epoch": 0.8396284368659134, "grad_norm": 0.19927635788917542, "learning_rate": 1.9846472837646173e-05, "loss": 1.3266, "step": 2819 }, { "epoch": 0.839926283065582, "grad_norm": 0.19310376048088074, "learning_rate": 1.9846304421189618e-05, "loss": 1.3124, "step": 2820 }, { "epoch": 0.8402241292652507, "grad_norm": 0.19466964900493622, "learning_rate": 1.9846135913124196e-05, "loss": 1.3077, "step": 2821 }, { "epoch": 0.8405219754649194, "grad_norm": 0.19182047247886658, "learning_rate": 1.9845967313451484e-05, "loss": 1.3104, "step": 2822 }, { "epoch": 0.8408198216645879, "grad_norm": 0.18639199435710907, "learning_rate": 1.9845798622173046e-05, "loss": 1.3178, "step": 2823 }, { "epoch": 0.8411176678642566, "grad_norm": 0.19101166725158691, "learning_rate": 1.984562983929045e-05, "loss": 1.3171, "step": 2824 }, { "epoch": 0.8414155140639252, "grad_norm": 0.18609404563903809, "learning_rate": 1.984546096480527e-05, "loss": 1.3144, "step": 2825 }, { "epoch": 0.8417133602635939, "grad_norm": 0.1885635405778885, "learning_rate": 1.984529199871908e-05, "loss": 1.3198, "step": 2826 }, { "epoch": 0.8420112064632626, "grad_norm": 0.1921912133693695, "learning_rate": 1.9845122941033443e-05, "loss": 1.3247, "step": 2827 }, { "epoch": 0.8423090526629312, "grad_norm": 0.21644912660121918, "learning_rate": 1.9844953791749938e-05, "loss": 1.3158, "step": 2828 }, { "epoch": 0.8426068988625999, "grad_norm": 0.1851980835199356, "learning_rate": 1.9844784550870137e-05, "loss": 1.2949, "step": 2829 }, { "epoch": 0.8429047450622684, "grad_norm": 0.19381019473075867, "learning_rate": 1.9844615218395615e-05, "loss": 1.3182, "step": 2830 }, { "epoch": 0.8432025912619371, "grad_norm": 0.1965644806623459, "learning_rate": 1.9844445794327947e-05, "loss": 1.3122, "step": 2831 }, { "epoch": 0.8435004374616057, "grad_norm": 0.20047470927238464, "learning_rate": 1.984427627866871e-05, "loss": 1.3124, "step": 2832 }, { "epoch": 0.8437982836612744, "grad_norm": 0.1878950148820877, "learning_rate": 1.984410667141948e-05, "loss": 1.3033, "step": 2833 }, { "epoch": 0.8440961298609431, "grad_norm": 0.18842440843582153, "learning_rate": 1.9843936972581838e-05, "loss": 1.3162, "step": 2834 }, { "epoch": 0.8443939760606117, "grad_norm": 0.19343741238117218, "learning_rate": 1.984376718215736e-05, "loss": 1.3271, "step": 2835 }, { "epoch": 0.8446918222602804, "grad_norm": 0.19063957035541534, "learning_rate": 1.9843597300147625e-05, "loss": 1.3083, "step": 2836 }, { "epoch": 0.8449896684599489, "grad_norm": 0.20658956468105316, "learning_rate": 1.9843427326554218e-05, "loss": 1.3055, "step": 2837 }, { "epoch": 0.8452875146596176, "grad_norm": 0.19133248925209045, "learning_rate": 1.9843257261378717e-05, "loss": 1.3177, "step": 2838 }, { "epoch": 0.8455853608592863, "grad_norm": 0.1955590844154358, "learning_rate": 1.9843087104622702e-05, "loss": 1.2879, "step": 2839 }, { "epoch": 0.8458832070589549, "grad_norm": 0.1960470825433731, "learning_rate": 1.984291685628776e-05, "loss": 1.3118, "step": 2840 }, { "epoch": 0.8461810532586236, "grad_norm": 0.2028670758008957, "learning_rate": 1.9842746516375474e-05, "loss": 1.318, "step": 2841 }, { "epoch": 0.8464788994582922, "grad_norm": 0.18661056458950043, "learning_rate": 1.984257608488743e-05, "loss": 1.2969, "step": 2842 }, { "epoch": 0.8467767456579609, "grad_norm": 0.18581277132034302, "learning_rate": 1.984240556182521e-05, "loss": 1.3079, "step": 2843 }, { "epoch": 0.8470745918576296, "grad_norm": 0.19332443177700043, "learning_rate": 1.9842234947190406e-05, "loss": 1.3074, "step": 2844 }, { "epoch": 0.8473724380572981, "grad_norm": 0.19430364668369293, "learning_rate": 1.98420642409846e-05, "loss": 1.3131, "step": 2845 }, { "epoch": 0.8476702842569668, "grad_norm": 0.1899929642677307, "learning_rate": 1.9841893443209385e-05, "loss": 1.3215, "step": 2846 }, { "epoch": 0.8479681304566354, "grad_norm": 0.18724815547466278, "learning_rate": 1.9841722553866344e-05, "loss": 1.3109, "step": 2847 }, { "epoch": 0.8482659766563041, "grad_norm": 0.19492894411087036, "learning_rate": 1.984155157295707e-05, "loss": 1.3271, "step": 2848 }, { "epoch": 0.8485638228559728, "grad_norm": 0.18969008326530457, "learning_rate": 1.9841380500483157e-05, "loss": 1.3087, "step": 2849 }, { "epoch": 0.8488616690556414, "grad_norm": 0.19285663962364197, "learning_rate": 1.984120933644619e-05, "loss": 1.2828, "step": 2850 }, { "epoch": 0.8491595152553101, "grad_norm": 0.19459478557109833, "learning_rate": 1.984103808084777e-05, "loss": 1.3242, "step": 2851 }, { "epoch": 0.8494573614549786, "grad_norm": 0.1840110868215561, "learning_rate": 1.9840866733689482e-05, "loss": 1.3112, "step": 2852 }, { "epoch": 0.8497552076546473, "grad_norm": 0.19513243436813354, "learning_rate": 1.9840695294972926e-05, "loss": 1.3048, "step": 2853 }, { "epoch": 0.850053053854316, "grad_norm": 0.1993199735879898, "learning_rate": 1.98405237646997e-05, "loss": 1.3422, "step": 2854 }, { "epoch": 0.8503509000539846, "grad_norm": 0.1872604489326477, "learning_rate": 1.9840352142871384e-05, "loss": 1.3077, "step": 2855 }, { "epoch": 0.8506487462536533, "grad_norm": 0.18576233088970184, "learning_rate": 1.9840180429489593e-05, "loss": 1.3156, "step": 2856 }, { "epoch": 0.8509465924533219, "grad_norm": 0.18810655176639557, "learning_rate": 1.9840008624555917e-05, "loss": 1.3133, "step": 2857 }, { "epoch": 0.8512444386529906, "grad_norm": 0.18760626018047333, "learning_rate": 1.983983672807195e-05, "loss": 1.2985, "step": 2858 }, { "epoch": 0.8515422848526593, "grad_norm": 0.18436740338802338, "learning_rate": 1.98396647400393e-05, "loss": 1.3197, "step": 2859 }, { "epoch": 0.8518401310523278, "grad_norm": 0.18469035625457764, "learning_rate": 1.983949266045956e-05, "loss": 1.3238, "step": 2860 }, { "epoch": 0.8521379772519965, "grad_norm": 0.18595953285694122, "learning_rate": 1.983932048933434e-05, "loss": 1.3141, "step": 2861 }, { "epoch": 0.8524358234516651, "grad_norm": 0.19043129682540894, "learning_rate": 1.983914822666523e-05, "loss": 1.3231, "step": 2862 }, { "epoch": 0.8527336696513338, "grad_norm": 0.19436709582805634, "learning_rate": 1.9838975872453842e-05, "loss": 1.331, "step": 2863 }, { "epoch": 0.8530315158510025, "grad_norm": 0.19808824360370636, "learning_rate": 1.9838803426701774e-05, "loss": 1.3049, "step": 2864 }, { "epoch": 0.8533293620506711, "grad_norm": 0.18367069959640503, "learning_rate": 1.9838630889410635e-05, "loss": 1.3097, "step": 2865 }, { "epoch": 0.8536272082503398, "grad_norm": 0.20176853239536285, "learning_rate": 1.9838458260582024e-05, "loss": 1.3348, "step": 2866 }, { "epoch": 0.8539250544500083, "grad_norm": 0.1957586705684662, "learning_rate": 1.9838285540217554e-05, "loss": 1.3194, "step": 2867 }, { "epoch": 0.854222900649677, "grad_norm": 0.19776873290538788, "learning_rate": 1.983811272831883e-05, "loss": 1.3224, "step": 2868 }, { "epoch": 0.8545207468493456, "grad_norm": 0.1966826468706131, "learning_rate": 1.983793982488746e-05, "loss": 1.3134, "step": 2869 }, { "epoch": 0.8548185930490143, "grad_norm": 0.19042354822158813, "learning_rate": 1.9837766829925045e-05, "loss": 1.2999, "step": 2870 }, { "epoch": 0.855116439248683, "grad_norm": 0.20047244429588318, "learning_rate": 1.9837593743433207e-05, "loss": 1.3303, "step": 2871 }, { "epoch": 0.8554142854483516, "grad_norm": 0.20083603262901306, "learning_rate": 1.9837420565413546e-05, "loss": 1.3063, "step": 2872 }, { "epoch": 0.8557121316480203, "grad_norm": 0.2025173008441925, "learning_rate": 1.9837247295867682e-05, "loss": 1.3232, "step": 2873 }, { "epoch": 0.856009977847689, "grad_norm": 0.20584246516227722, "learning_rate": 1.9837073934797222e-05, "loss": 1.322, "step": 2874 }, { "epoch": 0.8563078240473575, "grad_norm": 0.1971740424633026, "learning_rate": 1.9836900482203778e-05, "loss": 1.3105, "step": 2875 }, { "epoch": 0.8566056702470262, "grad_norm": 0.18648609519004822, "learning_rate": 1.9836726938088966e-05, "loss": 1.3218, "step": 2876 }, { "epoch": 0.8569035164466948, "grad_norm": 0.2044125497341156, "learning_rate": 1.9836553302454402e-05, "loss": 1.3245, "step": 2877 }, { "epoch": 0.8572013626463635, "grad_norm": 0.20012997090816498, "learning_rate": 1.9836379575301696e-05, "loss": 1.3213, "step": 2878 }, { "epoch": 0.8574992088460321, "grad_norm": 0.1945681869983673, "learning_rate": 1.983620575663247e-05, "loss": 1.3169, "step": 2879 }, { "epoch": 0.8577970550457008, "grad_norm": 0.19905398786067963, "learning_rate": 1.9836031846448338e-05, "loss": 1.3251, "step": 2880 }, { "epoch": 0.8580949012453695, "grad_norm": 0.2026236206293106, "learning_rate": 1.9835857844750922e-05, "loss": 1.3394, "step": 2881 }, { "epoch": 0.858392747445038, "grad_norm": 0.19498775899410248, "learning_rate": 1.9835683751541835e-05, "loss": 1.2935, "step": 2882 }, { "epoch": 0.8586905936447067, "grad_norm": 0.20352014899253845, "learning_rate": 1.9835509566822703e-05, "loss": 1.3141, "step": 2883 }, { "epoch": 0.8589884398443753, "grad_norm": 0.19510705769062042, "learning_rate": 1.983533529059514e-05, "loss": 1.3114, "step": 2884 }, { "epoch": 0.859286286044044, "grad_norm": 0.19283436238765717, "learning_rate": 1.983516092286077e-05, "loss": 1.3209, "step": 2885 }, { "epoch": 0.8595841322437127, "grad_norm": 0.1914401352405548, "learning_rate": 1.9834986463621218e-05, "loss": 1.3057, "step": 2886 }, { "epoch": 0.8598819784433813, "grad_norm": 0.19460298120975494, "learning_rate": 1.9834811912878106e-05, "loss": 1.2959, "step": 2887 }, { "epoch": 0.86017982464305, "grad_norm": 0.19352369010448456, "learning_rate": 1.9834637270633057e-05, "loss": 1.295, "step": 2888 }, { "epoch": 0.8604776708427186, "grad_norm": 0.19900284707546234, "learning_rate": 1.9834462536887694e-05, "loss": 1.3332, "step": 2889 }, { "epoch": 0.8607755170423872, "grad_norm": 0.19056642055511475, "learning_rate": 1.9834287711643647e-05, "loss": 1.297, "step": 2890 }, { "epoch": 0.8610733632420559, "grad_norm": 0.19439257681369781, "learning_rate": 1.983411279490254e-05, "loss": 1.3127, "step": 2891 }, { "epoch": 0.8613712094417245, "grad_norm": 0.18397583067417145, "learning_rate": 1.9833937786666e-05, "loss": 1.3181, "step": 2892 }, { "epoch": 0.8616690556413932, "grad_norm": 0.19947071373462677, "learning_rate": 1.9833762686935656e-05, "loss": 1.3075, "step": 2893 }, { "epoch": 0.8619669018410618, "grad_norm": 0.18260259926319122, "learning_rate": 1.9833587495713138e-05, "loss": 1.3129, "step": 2894 }, { "epoch": 0.8622647480407305, "grad_norm": 0.1963987797498703, "learning_rate": 1.9833412213000077e-05, "loss": 1.3141, "step": 2895 }, { "epoch": 0.8625625942403992, "grad_norm": 0.2164333015680313, "learning_rate": 1.9833236838798097e-05, "loss": 1.3123, "step": 2896 }, { "epoch": 0.8628604404400677, "grad_norm": 0.19688157737255096, "learning_rate": 1.9833061373108836e-05, "loss": 1.3178, "step": 2897 }, { "epoch": 0.8631582866397364, "grad_norm": 0.1961808204650879, "learning_rate": 1.9832885815933926e-05, "loss": 1.313, "step": 2898 }, { "epoch": 0.863456132839405, "grad_norm": 0.1982816457748413, "learning_rate": 1.9832710167275002e-05, "loss": 1.3211, "step": 2899 }, { "epoch": 0.8637539790390737, "grad_norm": 0.20556198060512543, "learning_rate": 1.983253442713369e-05, "loss": 1.3202, "step": 2900 }, { "epoch": 0.8640518252387424, "grad_norm": 0.2019490897655487, "learning_rate": 1.9832358595511638e-05, "loss": 1.3018, "step": 2901 }, { "epoch": 0.864349671438411, "grad_norm": 0.19240225851535797, "learning_rate": 1.9832182672410468e-05, "loss": 1.3138, "step": 2902 }, { "epoch": 0.8646475176380797, "grad_norm": 0.19900253415107727, "learning_rate": 1.9832006657831827e-05, "loss": 1.3014, "step": 2903 }, { "epoch": 0.8649453638377483, "grad_norm": 0.19329264760017395, "learning_rate": 1.9831830551777348e-05, "loss": 1.3122, "step": 2904 }, { "epoch": 0.8652432100374169, "grad_norm": 0.20352724194526672, "learning_rate": 1.983165435424867e-05, "loss": 1.3035, "step": 2905 }, { "epoch": 0.8655410562370855, "grad_norm": 0.22703319787979126, "learning_rate": 1.9831478065247434e-05, "loss": 1.2987, "step": 2906 }, { "epoch": 0.8658389024367542, "grad_norm": 0.19756467640399933, "learning_rate": 1.9831301684775276e-05, "loss": 1.306, "step": 2907 }, { "epoch": 0.8661367486364229, "grad_norm": 0.21882663667201996, "learning_rate": 1.9831125212833842e-05, "loss": 1.3342, "step": 2908 }, { "epoch": 0.8664345948360915, "grad_norm": 0.20317678153514862, "learning_rate": 1.9830948649424773e-05, "loss": 1.3162, "step": 2909 }, { "epoch": 0.8667324410357602, "grad_norm": 0.20027881860733032, "learning_rate": 1.9830771994549712e-05, "loss": 1.3061, "step": 2910 }, { "epoch": 0.8670302872354289, "grad_norm": 0.19904261827468872, "learning_rate": 1.98305952482103e-05, "loss": 1.3042, "step": 2911 }, { "epoch": 0.8673281334350974, "grad_norm": 0.2004762887954712, "learning_rate": 1.983041841040818e-05, "loss": 1.2973, "step": 2912 }, { "epoch": 0.8676259796347661, "grad_norm": 0.20205415785312653, "learning_rate": 1.9830241481145e-05, "loss": 1.3091, "step": 2913 }, { "epoch": 0.8679238258344347, "grad_norm": 0.2154819518327713, "learning_rate": 1.983006446042241e-05, "loss": 1.3296, "step": 2914 }, { "epoch": 0.8682216720341034, "grad_norm": 0.19992339611053467, "learning_rate": 1.982988734824205e-05, "loss": 1.3025, "step": 2915 }, { "epoch": 0.868519518233772, "grad_norm": 0.19117744266986847, "learning_rate": 1.9829710144605568e-05, "loss": 1.313, "step": 2916 }, { "epoch": 0.8688173644334407, "grad_norm": 0.20051947236061096, "learning_rate": 1.9829532849514623e-05, "loss": 1.3172, "step": 2917 }, { "epoch": 0.8691152106331094, "grad_norm": 0.18870866298675537, "learning_rate": 1.9829355462970852e-05, "loss": 1.308, "step": 2918 }, { "epoch": 0.8694130568327779, "grad_norm": 0.18585895001888275, "learning_rate": 1.9829177984975912e-05, "loss": 1.3361, "step": 2919 }, { "epoch": 0.8697109030324466, "grad_norm": 0.19265083968639374, "learning_rate": 1.982900041553145e-05, "loss": 1.2989, "step": 2920 }, { "epoch": 0.8700087492321152, "grad_norm": 0.20708735287189484, "learning_rate": 1.9828822754639124e-05, "loss": 1.3295, "step": 2921 }, { "epoch": 0.8703065954317839, "grad_norm": 0.19332322478294373, "learning_rate": 1.982864500230058e-05, "loss": 1.3164, "step": 2922 }, { "epoch": 0.8706044416314526, "grad_norm": 0.19034935534000397, "learning_rate": 1.982846715851748e-05, "loss": 1.2992, "step": 2923 }, { "epoch": 0.8709022878311212, "grad_norm": 0.21060289442539215, "learning_rate": 1.982828922329147e-05, "loss": 1.3204, "step": 2924 }, { "epoch": 0.8712001340307899, "grad_norm": 0.20401045680046082, "learning_rate": 1.9828111196624212e-05, "loss": 1.2966, "step": 2925 }, { "epoch": 0.8714979802304585, "grad_norm": 0.1838827133178711, "learning_rate": 1.982793307851736e-05, "loss": 1.3039, "step": 2926 }, { "epoch": 0.8717958264301271, "grad_norm": 0.20430974662303925, "learning_rate": 1.9827754868972572e-05, "loss": 1.3115, "step": 2927 }, { "epoch": 0.8720936726297958, "grad_norm": 0.19398505985736847, "learning_rate": 1.9827576567991503e-05, "loss": 1.3243, "step": 2928 }, { "epoch": 0.8723915188294644, "grad_norm": 0.1987292468547821, "learning_rate": 1.9827398175575815e-05, "loss": 1.3095, "step": 2929 }, { "epoch": 0.8726893650291331, "grad_norm": 0.19124366343021393, "learning_rate": 1.9827219691727167e-05, "loss": 1.3105, "step": 2930 }, { "epoch": 0.8729872112288017, "grad_norm": 0.19534295797348022, "learning_rate": 1.9827041116447217e-05, "loss": 1.3082, "step": 2931 }, { "epoch": 0.8732850574284704, "grad_norm": 0.20367717742919922, "learning_rate": 1.9826862449737633e-05, "loss": 1.3146, "step": 2932 }, { "epoch": 0.8735829036281391, "grad_norm": 0.19920623302459717, "learning_rate": 1.982668369160007e-05, "loss": 1.3078, "step": 2933 }, { "epoch": 0.8738807498278076, "grad_norm": 0.19633528590202332, "learning_rate": 1.9826504842036193e-05, "loss": 1.3069, "step": 2934 }, { "epoch": 0.8741785960274763, "grad_norm": 0.20765548944473267, "learning_rate": 1.982632590104767e-05, "loss": 1.3167, "step": 2935 }, { "epoch": 0.8744764422271449, "grad_norm": 0.20531393587589264, "learning_rate": 1.9826146868636162e-05, "loss": 1.3116, "step": 2936 }, { "epoch": 0.8747742884268136, "grad_norm": 0.2838412821292877, "learning_rate": 1.9825967744803337e-05, "loss": 1.3075, "step": 2937 }, { "epoch": 0.8750721346264823, "grad_norm": 0.2089710533618927, "learning_rate": 1.982578852955086e-05, "loss": 1.3105, "step": 2938 }, { "epoch": 0.8753699808261509, "grad_norm": 0.2029941827058792, "learning_rate": 1.9825609222880396e-05, "loss": 1.2992, "step": 2939 }, { "epoch": 0.8756678270258196, "grad_norm": 0.19510619342327118, "learning_rate": 1.982542982479362e-05, "loss": 1.3293, "step": 2940 }, { "epoch": 0.8759656732254882, "grad_norm": 0.19722811877727509, "learning_rate": 1.9825250335292196e-05, "loss": 1.3185, "step": 2941 }, { "epoch": 0.8762635194251568, "grad_norm": 0.1850764900445938, "learning_rate": 1.9825070754377794e-05, "loss": 1.3113, "step": 2942 }, { "epoch": 0.8765613656248255, "grad_norm": 0.18542790412902832, "learning_rate": 1.9824891082052088e-05, "loss": 1.2884, "step": 2943 }, { "epoch": 0.8768592118244941, "grad_norm": 0.2818898558616638, "learning_rate": 1.9824711318316744e-05, "loss": 1.3054, "step": 2944 }, { "epoch": 0.8771570580241628, "grad_norm": 0.19794979691505432, "learning_rate": 1.9824531463173443e-05, "loss": 1.3149, "step": 2945 }, { "epoch": 0.8774549042238314, "grad_norm": 0.19850584864616394, "learning_rate": 1.982435151662385e-05, "loss": 1.3059, "step": 2946 }, { "epoch": 0.8777527504235001, "grad_norm": 0.1981966346502304, "learning_rate": 1.982417147866964e-05, "loss": 1.3225, "step": 2947 }, { "epoch": 0.8780505966231688, "grad_norm": 0.19317342340946198, "learning_rate": 1.9823991349312495e-05, "loss": 1.2975, "step": 2948 }, { "epoch": 0.8783484428228373, "grad_norm": 0.19680213928222656, "learning_rate": 1.9823811128554084e-05, "loss": 1.3083, "step": 2949 }, { "epoch": 0.878646289022506, "grad_norm": 0.1831223964691162, "learning_rate": 1.9823630816396087e-05, "loss": 1.3169, "step": 2950 }, { "epoch": 0.8789441352221746, "grad_norm": 0.19879232347011566, "learning_rate": 1.9823450412840183e-05, "loss": 1.296, "step": 2951 }, { "epoch": 0.8792419814218433, "grad_norm": 0.20635774731636047, "learning_rate": 1.9823269917888045e-05, "loss": 1.3359, "step": 2952 }, { "epoch": 0.879539827621512, "grad_norm": 0.31576263904571533, "learning_rate": 1.9823089331541357e-05, "loss": 1.308, "step": 2953 }, { "epoch": 0.8798376738211806, "grad_norm": 0.19298005104064941, "learning_rate": 1.9822908653801796e-05, "loss": 1.3243, "step": 2954 }, { "epoch": 0.8801355200208493, "grad_norm": 0.19962824881076813, "learning_rate": 1.9822727884671046e-05, "loss": 1.3214, "step": 2955 }, { "epoch": 0.8804333662205179, "grad_norm": 0.19718356430530548, "learning_rate": 1.982254702415079e-05, "loss": 1.3124, "step": 2956 }, { "epoch": 0.8807312124201865, "grad_norm": 0.19522196054458618, "learning_rate": 1.9822366072242704e-05, "loss": 1.3185, "step": 2957 }, { "epoch": 0.8810290586198551, "grad_norm": 0.19573675096035004, "learning_rate": 1.9822185028948483e-05, "loss": 1.2923, "step": 2958 }, { "epoch": 0.8813269048195238, "grad_norm": 0.1976947784423828, "learning_rate": 1.9822003894269795e-05, "loss": 1.3237, "step": 2959 }, { "epoch": 0.8816247510191925, "grad_norm": 0.20155999064445496, "learning_rate": 1.982182266820834e-05, "loss": 1.2854, "step": 2960 }, { "epoch": 0.8819225972188611, "grad_norm": 0.1935465931892395, "learning_rate": 1.98216413507658e-05, "loss": 1.3018, "step": 2961 }, { "epoch": 0.8822204434185298, "grad_norm": 0.1943049430847168, "learning_rate": 1.9821459941943857e-05, "loss": 1.3169, "step": 2962 }, { "epoch": 0.8825182896181984, "grad_norm": 0.19589929282665253, "learning_rate": 1.9821278441744205e-05, "loss": 1.3065, "step": 2963 }, { "epoch": 0.882816135817867, "grad_norm": 0.18929025530815125, "learning_rate": 1.982109685016853e-05, "loss": 1.307, "step": 2964 }, { "epoch": 0.8831139820175357, "grad_norm": 0.19319988787174225, "learning_rate": 1.9820915167218517e-05, "loss": 1.3358, "step": 2965 }, { "epoch": 0.8834118282172043, "grad_norm": 0.19552074372768402, "learning_rate": 1.9820733392895865e-05, "loss": 1.3163, "step": 2966 }, { "epoch": 0.883709674416873, "grad_norm": 0.1948016732931137, "learning_rate": 1.9820551527202262e-05, "loss": 1.3193, "step": 2967 }, { "epoch": 0.8840075206165416, "grad_norm": 0.19571538269519806, "learning_rate": 1.9820369570139397e-05, "loss": 1.316, "step": 2968 }, { "epoch": 0.8843053668162103, "grad_norm": 0.1959875226020813, "learning_rate": 1.9820187521708966e-05, "loss": 1.3061, "step": 2969 }, { "epoch": 0.884603213015879, "grad_norm": 0.2011423259973526, "learning_rate": 1.9820005381912662e-05, "loss": 1.3186, "step": 2970 }, { "epoch": 0.8849010592155476, "grad_norm": 0.19470910727977753, "learning_rate": 1.981982315075218e-05, "loss": 1.3121, "step": 2971 }, { "epoch": 0.8851989054152162, "grad_norm": 0.20103910565376282, "learning_rate": 1.9819640828229214e-05, "loss": 1.3182, "step": 2972 }, { "epoch": 0.8854967516148848, "grad_norm": 0.19423173367977142, "learning_rate": 1.9819458414345462e-05, "loss": 1.3048, "step": 2973 }, { "epoch": 0.8857945978145535, "grad_norm": 0.19678495824337006, "learning_rate": 1.981927590910262e-05, "loss": 1.3117, "step": 2974 }, { "epoch": 0.8860924440142222, "grad_norm": 0.19346599280834198, "learning_rate": 1.9819093312502383e-05, "loss": 1.3251, "step": 2975 }, { "epoch": 0.8863902902138908, "grad_norm": 0.1933828890323639, "learning_rate": 1.981891062454646e-05, "loss": 1.312, "step": 2976 }, { "epoch": 0.8866881364135595, "grad_norm": 0.21723021566867828, "learning_rate": 1.9818727845236537e-05, "loss": 1.3115, "step": 2977 }, { "epoch": 0.8869859826132281, "grad_norm": 0.20165273547172546, "learning_rate": 1.9818544974574327e-05, "loss": 1.3037, "step": 2978 }, { "epoch": 0.8872838288128967, "grad_norm": 0.19502496719360352, "learning_rate": 1.9818362012561525e-05, "loss": 1.3188, "step": 2979 }, { "epoch": 0.8875816750125654, "grad_norm": 0.19082093238830566, "learning_rate": 1.981817895919983e-05, "loss": 1.3179, "step": 2980 }, { "epoch": 0.887879521212234, "grad_norm": 0.19495829939842224, "learning_rate": 1.9817995814490956e-05, "loss": 1.315, "step": 2981 }, { "epoch": 0.8881773674119027, "grad_norm": 0.19247324764728546, "learning_rate": 1.98178125784366e-05, "loss": 1.3182, "step": 2982 }, { "epoch": 0.8884752136115713, "grad_norm": 0.19553199410438538, "learning_rate": 1.981762925103846e-05, "loss": 1.2941, "step": 2983 }, { "epoch": 0.88877305981124, "grad_norm": 0.1885903775691986, "learning_rate": 1.981744583229825e-05, "loss": 1.2995, "step": 2984 }, { "epoch": 0.8890709060109087, "grad_norm": 0.19529998302459717, "learning_rate": 1.981726232221768e-05, "loss": 1.3087, "step": 2985 }, { "epoch": 0.8893687522105773, "grad_norm": 0.1879928857088089, "learning_rate": 1.981707872079845e-05, "loss": 1.3053, "step": 2986 }, { "epoch": 0.8896665984102459, "grad_norm": 0.21500670909881592, "learning_rate": 1.9816895028042272e-05, "loss": 1.3084, "step": 2987 }, { "epoch": 0.8899644446099145, "grad_norm": 0.19979077577590942, "learning_rate": 1.9816711243950852e-05, "loss": 1.2911, "step": 2988 }, { "epoch": 0.8902622908095832, "grad_norm": 0.18259525299072266, "learning_rate": 1.98165273685259e-05, "loss": 1.3061, "step": 2989 }, { "epoch": 0.8905601370092519, "grad_norm": 0.19319696724414825, "learning_rate": 1.9816343401769136e-05, "loss": 1.3174, "step": 2990 }, { "epoch": 0.8908579832089205, "grad_norm": 0.19616177678108215, "learning_rate": 1.9816159343682256e-05, "loss": 1.2965, "step": 2991 }, { "epoch": 0.8911558294085892, "grad_norm": 0.19472496211528778, "learning_rate": 1.9815975194266986e-05, "loss": 1.3183, "step": 2992 }, { "epoch": 0.8914536756082578, "grad_norm": 0.19293633103370667, "learning_rate": 1.9815790953525033e-05, "loss": 1.305, "step": 2993 }, { "epoch": 0.8917515218079264, "grad_norm": 0.19016510248184204, "learning_rate": 1.981560662145811e-05, "loss": 1.2958, "step": 2994 }, { "epoch": 0.892049368007595, "grad_norm": 0.19922995567321777, "learning_rate": 1.9815422198067935e-05, "loss": 1.3044, "step": 2995 }, { "epoch": 0.8923472142072637, "grad_norm": 0.19845548272132874, "learning_rate": 1.981523768335622e-05, "loss": 1.3211, "step": 2996 }, { "epoch": 0.8926450604069324, "grad_norm": 0.20516902208328247, "learning_rate": 1.981505307732469e-05, "loss": 1.3338, "step": 2997 }, { "epoch": 0.892942906606601, "grad_norm": 0.21636910736560822, "learning_rate": 1.981486837997505e-05, "loss": 1.3116, "step": 2998 }, { "epoch": 0.8932407528062697, "grad_norm": 0.20371657609939575, "learning_rate": 1.9814683591309034e-05, "loss": 1.3135, "step": 2999 }, { "epoch": 0.8935385990059383, "grad_norm": 0.19144035875797272, "learning_rate": 1.9814498711328348e-05, "loss": 1.3089, "step": 3000 }, { "epoch": 0.8935385990059383, "eval_loss": 1.3625597953796387, "eval_runtime": 21.0759, "eval_samples_per_second": 82.274, "eval_steps_per_second": 5.172, "step": 3000 }, { "epoch": 0.8938364452056069, "grad_norm": 0.19272580742835999, "learning_rate": 1.9814313740034715e-05, "loss": 1.301, "step": 3001 }, { "epoch": 0.8941342914052756, "grad_norm": 0.19311420619487762, "learning_rate": 1.9814128677429864e-05, "loss": 1.3022, "step": 3002 }, { "epoch": 0.8944321376049442, "grad_norm": 0.1974402368068695, "learning_rate": 1.9813943523515507e-05, "loss": 1.3036, "step": 3003 }, { "epoch": 0.8947299838046129, "grad_norm": 0.187892347574234, "learning_rate": 1.981375827829337e-05, "loss": 1.2897, "step": 3004 }, { "epoch": 0.8950278300042815, "grad_norm": 0.19705121219158173, "learning_rate": 1.9813572941765176e-05, "loss": 1.3024, "step": 3005 }, { "epoch": 0.8953256762039502, "grad_norm": 0.2087492048740387, "learning_rate": 1.981338751393265e-05, "loss": 1.3222, "step": 3006 }, { "epoch": 0.8956235224036189, "grad_norm": 0.19134962558746338, "learning_rate": 1.981320199479752e-05, "loss": 1.2942, "step": 3007 }, { "epoch": 0.8959213686032875, "grad_norm": 0.1930139660835266, "learning_rate": 1.981301638436151e-05, "loss": 1.329, "step": 3008 }, { "epoch": 0.8962192148029561, "grad_norm": 0.1902092695236206, "learning_rate": 1.9812830682626342e-05, "loss": 1.3007, "step": 3009 }, { "epoch": 0.8965170610026247, "grad_norm": 0.1995134800672531, "learning_rate": 1.9812644889593752e-05, "loss": 1.3157, "step": 3010 }, { "epoch": 0.8968149072022934, "grad_norm": 0.19240212440490723, "learning_rate": 1.9812459005265463e-05, "loss": 1.3122, "step": 3011 }, { "epoch": 0.8971127534019621, "grad_norm": 0.20270533859729767, "learning_rate": 1.9812273029643205e-05, "loss": 1.3035, "step": 3012 }, { "epoch": 0.8974105996016307, "grad_norm": 0.1889352947473526, "learning_rate": 1.981208696272871e-05, "loss": 1.3073, "step": 3013 }, { "epoch": 0.8977084458012994, "grad_norm": 0.1877407431602478, "learning_rate": 1.981190080452371e-05, "loss": 1.2942, "step": 3014 }, { "epoch": 0.898006292000968, "grad_norm": 0.192073792219162, "learning_rate": 1.9811714555029937e-05, "loss": 1.2944, "step": 3015 }, { "epoch": 0.8983041382006366, "grad_norm": 0.20263244211673737, "learning_rate": 1.9811528214249122e-05, "loss": 1.3089, "step": 3016 }, { "epoch": 0.8986019844003053, "grad_norm": 0.19734568893909454, "learning_rate": 1.9811341782182994e-05, "loss": 1.3108, "step": 3017 }, { "epoch": 0.8988998305999739, "grad_norm": 0.19960430264472961, "learning_rate": 1.9811155258833294e-05, "loss": 1.3119, "step": 3018 }, { "epoch": 0.8991976767996426, "grad_norm": 0.1904885470867157, "learning_rate": 1.981096864420176e-05, "loss": 1.291, "step": 3019 }, { "epoch": 0.8994955229993112, "grad_norm": 0.19666573405265808, "learning_rate": 1.9810781938290124e-05, "loss": 1.2999, "step": 3020 }, { "epoch": 0.8997933691989799, "grad_norm": 0.1923913210630417, "learning_rate": 1.9810595141100116e-05, "loss": 1.3184, "step": 3021 }, { "epoch": 0.9000912153986486, "grad_norm": 0.19742454588413239, "learning_rate": 1.981040825263349e-05, "loss": 1.3083, "step": 3022 }, { "epoch": 0.9003890615983172, "grad_norm": 0.19502176344394684, "learning_rate": 1.981022127289197e-05, "loss": 1.3085, "step": 3023 }, { "epoch": 0.9006869077979858, "grad_norm": 0.19793961942195892, "learning_rate": 1.9810034201877304e-05, "loss": 1.3189, "step": 3024 }, { "epoch": 0.9009847539976544, "grad_norm": 0.19931679964065552, "learning_rate": 1.980984703959123e-05, "loss": 1.3224, "step": 3025 }, { "epoch": 0.9012826001973231, "grad_norm": 0.198928564786911, "learning_rate": 1.980965978603549e-05, "loss": 1.3149, "step": 3026 }, { "epoch": 0.9015804463969918, "grad_norm": 0.19738368690013885, "learning_rate": 1.9809472441211826e-05, "loss": 1.3177, "step": 3027 }, { "epoch": 0.9018782925966604, "grad_norm": 0.1833115518093109, "learning_rate": 1.980928500512198e-05, "loss": 1.313, "step": 3028 }, { "epoch": 0.9021761387963291, "grad_norm": 0.19273918867111206, "learning_rate": 1.9809097477767695e-05, "loss": 1.3233, "step": 3029 }, { "epoch": 0.9024739849959977, "grad_norm": 0.19289207458496094, "learning_rate": 1.980890985915072e-05, "loss": 1.2983, "step": 3030 }, { "epoch": 0.9027718311956663, "grad_norm": 0.2008187621831894, "learning_rate": 1.98087221492728e-05, "loss": 1.3066, "step": 3031 }, { "epoch": 0.903069677395335, "grad_norm": 0.20150333642959595, "learning_rate": 1.9808534348135676e-05, "loss": 1.2904, "step": 3032 }, { "epoch": 0.9033675235950036, "grad_norm": 0.19519232213497162, "learning_rate": 1.9808346455741097e-05, "loss": 1.3192, "step": 3033 }, { "epoch": 0.9036653697946723, "grad_norm": 0.19195297360420227, "learning_rate": 1.9808158472090814e-05, "loss": 1.3096, "step": 3034 }, { "epoch": 0.9039632159943409, "grad_norm": 0.2011096477508545, "learning_rate": 1.980797039718658e-05, "loss": 1.3089, "step": 3035 }, { "epoch": 0.9042610621940096, "grad_norm": 0.18987266719341278, "learning_rate": 1.980778223103013e-05, "loss": 1.2981, "step": 3036 }, { "epoch": 0.9045589083936783, "grad_norm": 0.20272229611873627, "learning_rate": 1.980759397362323e-05, "loss": 1.3111, "step": 3037 }, { "epoch": 0.9048567545933469, "grad_norm": 0.20297445356845856, "learning_rate": 1.9807405624967627e-05, "loss": 1.3028, "step": 3038 }, { "epoch": 0.9051546007930155, "grad_norm": 0.1968143731355667, "learning_rate": 1.980721718506507e-05, "loss": 1.2874, "step": 3039 }, { "epoch": 0.9054524469926841, "grad_norm": 0.19670012593269348, "learning_rate": 1.9807028653917315e-05, "loss": 1.2864, "step": 3040 }, { "epoch": 0.9057502931923528, "grad_norm": 0.1936255693435669, "learning_rate": 1.9806840031526113e-05, "loss": 1.3033, "step": 3041 }, { "epoch": 0.9060481393920214, "grad_norm": 0.2041553556919098, "learning_rate": 1.9806651317893224e-05, "loss": 1.314, "step": 3042 }, { "epoch": 0.9063459855916901, "grad_norm": 0.20097097754478455, "learning_rate": 1.9806462513020402e-05, "loss": 1.3107, "step": 3043 }, { "epoch": 0.9066438317913588, "grad_norm": 0.20446543395519257, "learning_rate": 1.9806273616909403e-05, "loss": 1.3226, "step": 3044 }, { "epoch": 0.9069416779910274, "grad_norm": 0.19455336034297943, "learning_rate": 1.980608462956198e-05, "loss": 1.3077, "step": 3045 }, { "epoch": 0.907239524190696, "grad_norm": 0.20479156076908112, "learning_rate": 1.9805895550979898e-05, "loss": 1.3041, "step": 3046 }, { "epoch": 0.9075373703903646, "grad_norm": 0.1962958574295044, "learning_rate": 1.9805706381164917e-05, "loss": 1.3184, "step": 3047 }, { "epoch": 0.9078352165900333, "grad_norm": 0.2033453732728958, "learning_rate": 1.9805517120118788e-05, "loss": 1.3003, "step": 3048 }, { "epoch": 0.908133062789702, "grad_norm": 0.21646438539028168, "learning_rate": 1.980532776784328e-05, "loss": 1.3184, "step": 3049 }, { "epoch": 0.9084309089893706, "grad_norm": 0.20765748620033264, "learning_rate": 1.9805138324340152e-05, "loss": 1.3167, "step": 3050 }, { "epoch": 0.9087287551890393, "grad_norm": 0.2132749706506729, "learning_rate": 1.9804948789611166e-05, "loss": 1.3433, "step": 3051 }, { "epoch": 0.9090266013887079, "grad_norm": 0.20811358094215393, "learning_rate": 1.980475916365809e-05, "loss": 1.3159, "step": 3052 }, { "epoch": 0.9093244475883766, "grad_norm": 0.2119991034269333, "learning_rate": 1.980456944648268e-05, "loss": 1.3237, "step": 3053 }, { "epoch": 0.9096222937880452, "grad_norm": 0.20286352932453156, "learning_rate": 1.9804379638086706e-05, "loss": 1.3194, "step": 3054 }, { "epoch": 0.9099201399877138, "grad_norm": 0.20021235942840576, "learning_rate": 1.9804189738471935e-05, "loss": 1.2853, "step": 3055 }, { "epoch": 0.9102179861873825, "grad_norm": 0.19465458393096924, "learning_rate": 1.9803999747640135e-05, "loss": 1.3183, "step": 3056 }, { "epoch": 0.9105158323870511, "grad_norm": 0.21487313508987427, "learning_rate": 1.980380966559307e-05, "loss": 1.3205, "step": 3057 }, { "epoch": 0.9108136785867198, "grad_norm": 0.19248226284980774, "learning_rate": 1.9803619492332507e-05, "loss": 1.317, "step": 3058 }, { "epoch": 0.9111115247863885, "grad_norm": 0.20386140048503876, "learning_rate": 1.9803429227860218e-05, "loss": 1.3019, "step": 3059 }, { "epoch": 0.9114093709860571, "grad_norm": 0.18888108432292938, "learning_rate": 1.9803238872177972e-05, "loss": 1.3095, "step": 3060 }, { "epoch": 0.9117072171857257, "grad_norm": 0.20344072580337524, "learning_rate": 1.9803048425287543e-05, "loss": 1.3146, "step": 3061 }, { "epoch": 0.9120050633853943, "grad_norm": 0.20831163227558136, "learning_rate": 1.98028578871907e-05, "loss": 1.337, "step": 3062 }, { "epoch": 0.912302909585063, "grad_norm": 0.19399350881576538, "learning_rate": 1.980266725788922e-05, "loss": 1.3102, "step": 3063 }, { "epoch": 0.9126007557847317, "grad_norm": 0.2027069479227066, "learning_rate": 1.9802476537384868e-05, "loss": 1.3089, "step": 3064 }, { "epoch": 0.9128986019844003, "grad_norm": 0.20166629552841187, "learning_rate": 1.980228572567943e-05, "loss": 1.2962, "step": 3065 }, { "epoch": 0.913196448184069, "grad_norm": 0.2001592516899109, "learning_rate": 1.980209482277467e-05, "loss": 1.3127, "step": 3066 }, { "epoch": 0.9134942943837376, "grad_norm": 0.2128322273492813, "learning_rate": 1.9801903828672372e-05, "loss": 1.3051, "step": 3067 }, { "epoch": 0.9137921405834063, "grad_norm": 0.1976434886455536, "learning_rate": 1.9801712743374312e-05, "loss": 1.3123, "step": 3068 }, { "epoch": 0.9140899867830748, "grad_norm": 0.20201466977596283, "learning_rate": 1.980152156688226e-05, "loss": 1.2979, "step": 3069 }, { "epoch": 0.9143878329827435, "grad_norm": 0.20096318423748016, "learning_rate": 1.9801330299198005e-05, "loss": 1.2987, "step": 3070 }, { "epoch": 0.9146856791824122, "grad_norm": 0.19993111491203308, "learning_rate": 1.980113894032332e-05, "loss": 1.299, "step": 3071 }, { "epoch": 0.9149835253820808, "grad_norm": 0.20647186040878296, "learning_rate": 1.980094749025999e-05, "loss": 1.2875, "step": 3072 }, { "epoch": 0.9152813715817495, "grad_norm": 0.224049374461174, "learning_rate": 1.9800755949009794e-05, "loss": 1.3375, "step": 3073 }, { "epoch": 0.9155792177814182, "grad_norm": 0.2097865641117096, "learning_rate": 1.980056431657451e-05, "loss": 1.2911, "step": 3074 }, { "epoch": 0.9158770639810868, "grad_norm": 0.20605894923210144, "learning_rate": 1.980037259295593e-05, "loss": 1.3106, "step": 3075 }, { "epoch": 0.9161749101807554, "grad_norm": 0.19733017683029175, "learning_rate": 1.9800180778155832e-05, "loss": 1.3071, "step": 3076 }, { "epoch": 0.916472756380424, "grad_norm": 0.2055257260799408, "learning_rate": 1.9799988872175996e-05, "loss": 1.2896, "step": 3077 }, { "epoch": 0.9167706025800927, "grad_norm": 0.20163756608963013, "learning_rate": 1.979979687501822e-05, "loss": 1.32, "step": 3078 }, { "epoch": 0.9170684487797613, "grad_norm": 0.20674440264701843, "learning_rate": 1.979960478668428e-05, "loss": 1.302, "step": 3079 }, { "epoch": 0.91736629497943, "grad_norm": 0.19469445943832397, "learning_rate": 1.9799412607175963e-05, "loss": 1.3096, "step": 3080 }, { "epoch": 0.9176641411790987, "grad_norm": 0.1986764520406723, "learning_rate": 1.9799220336495063e-05, "loss": 1.2967, "step": 3081 }, { "epoch": 0.9179619873787673, "grad_norm": 0.20165130496025085, "learning_rate": 1.9799027974643365e-05, "loss": 1.3115, "step": 3082 }, { "epoch": 0.9182598335784359, "grad_norm": 0.1868019998073578, "learning_rate": 1.9798835521622662e-05, "loss": 1.3056, "step": 3083 }, { "epoch": 0.9185576797781045, "grad_norm": 0.19767910242080688, "learning_rate": 1.9798642977434742e-05, "loss": 1.313, "step": 3084 }, { "epoch": 0.9188555259777732, "grad_norm": 0.20172198116779327, "learning_rate": 1.9798450342081398e-05, "loss": 1.305, "step": 3085 }, { "epoch": 0.9191533721774419, "grad_norm": 0.2029125988483429, "learning_rate": 1.9798257615564415e-05, "loss": 1.3079, "step": 3086 }, { "epoch": 0.9194512183771105, "grad_norm": 0.19753821194171906, "learning_rate": 1.9798064797885596e-05, "loss": 1.3058, "step": 3087 }, { "epoch": 0.9197490645767792, "grad_norm": 0.1987670212984085, "learning_rate": 1.9797871889046733e-05, "loss": 1.3208, "step": 3088 }, { "epoch": 0.9200469107764478, "grad_norm": 0.1995089054107666, "learning_rate": 1.9797678889049615e-05, "loss": 1.323, "step": 3089 }, { "epoch": 0.9203447569761165, "grad_norm": 0.19714906811714172, "learning_rate": 1.9797485797896045e-05, "loss": 1.3018, "step": 3090 }, { "epoch": 0.9206426031757851, "grad_norm": 0.19496291875839233, "learning_rate": 1.9797292615587814e-05, "loss": 1.3045, "step": 3091 }, { "epoch": 0.9209404493754537, "grad_norm": 0.20265202224254608, "learning_rate": 1.9797099342126726e-05, "loss": 1.3135, "step": 3092 }, { "epoch": 0.9212382955751224, "grad_norm": 0.19452430307865143, "learning_rate": 1.979690597751457e-05, "loss": 1.3063, "step": 3093 }, { "epoch": 0.921536141774791, "grad_norm": 0.19185176491737366, "learning_rate": 1.979671252175315e-05, "loss": 1.3013, "step": 3094 }, { "epoch": 0.9218339879744597, "grad_norm": 0.21012099087238312, "learning_rate": 1.9796518974844265e-05, "loss": 1.3092, "step": 3095 }, { "epoch": 0.9221318341741284, "grad_norm": 0.22558675706386566, "learning_rate": 1.9796325336789716e-05, "loss": 1.3408, "step": 3096 }, { "epoch": 0.922429680373797, "grad_norm": 0.1874483972787857, "learning_rate": 1.9796131607591305e-05, "loss": 1.2966, "step": 3097 }, { "epoch": 0.9227275265734656, "grad_norm": 0.2448127418756485, "learning_rate": 1.9795937787250835e-05, "loss": 1.3082, "step": 3098 }, { "epoch": 0.9230253727731342, "grad_norm": 0.18637888133525848, "learning_rate": 1.979574387577011e-05, "loss": 1.3185, "step": 3099 }, { "epoch": 0.9233232189728029, "grad_norm": 0.2024553120136261, "learning_rate": 1.979554987315093e-05, "loss": 1.3131, "step": 3100 }, { "epoch": 0.9236210651724716, "grad_norm": 0.18883754312992096, "learning_rate": 1.9795355779395107e-05, "loss": 1.3215, "step": 3101 }, { "epoch": 0.9239189113721402, "grad_norm": 0.19369982182979584, "learning_rate": 1.979516159450444e-05, "loss": 1.3142, "step": 3102 }, { "epoch": 0.9242167575718089, "grad_norm": 0.20784643292427063, "learning_rate": 1.9794967318480735e-05, "loss": 1.2958, "step": 3103 }, { "epoch": 0.9245146037714775, "grad_norm": 0.1899401843547821, "learning_rate": 1.979477295132581e-05, "loss": 1.3032, "step": 3104 }, { "epoch": 0.9248124499711462, "grad_norm": 0.19492416083812714, "learning_rate": 1.9794578493041458e-05, "loss": 1.3201, "step": 3105 }, { "epoch": 0.9251102961708147, "grad_norm": 0.19163598120212555, "learning_rate": 1.97943839436295e-05, "loss": 1.3131, "step": 3106 }, { "epoch": 0.9254081423704834, "grad_norm": 0.1966073364019394, "learning_rate": 1.979418930309174e-05, "loss": 1.3071, "step": 3107 }, { "epoch": 0.9257059885701521, "grad_norm": 0.2052278071641922, "learning_rate": 1.9793994571429996e-05, "loss": 1.3031, "step": 3108 }, { "epoch": 0.9260038347698207, "grad_norm": 0.1949099451303482, "learning_rate": 1.9793799748646073e-05, "loss": 1.3307, "step": 3109 }, { "epoch": 0.9263016809694894, "grad_norm": 0.20263433456420898, "learning_rate": 1.9793604834741785e-05, "loss": 1.2873, "step": 3110 }, { "epoch": 0.926599527169158, "grad_norm": 0.19447313249111176, "learning_rate": 1.9793409829718947e-05, "loss": 1.322, "step": 3111 }, { "epoch": 0.9268973733688267, "grad_norm": 0.18885290622711182, "learning_rate": 1.9793214733579373e-05, "loss": 1.3018, "step": 3112 }, { "epoch": 0.9271952195684953, "grad_norm": 0.19127033650875092, "learning_rate": 1.9793019546324874e-05, "loss": 1.2934, "step": 3113 }, { "epoch": 0.9274930657681639, "grad_norm": 0.19292551279067993, "learning_rate": 1.979282426795727e-05, "loss": 1.302, "step": 3114 }, { "epoch": 0.9277909119678326, "grad_norm": 0.2079010307788849, "learning_rate": 1.9792628898478382e-05, "loss": 1.3152, "step": 3115 }, { "epoch": 0.9280887581675012, "grad_norm": 0.19239436089992523, "learning_rate": 1.9792433437890017e-05, "loss": 1.3109, "step": 3116 }, { "epoch": 0.9283866043671699, "grad_norm": 0.19920912384986877, "learning_rate": 1.9792237886194002e-05, "loss": 1.2922, "step": 3117 }, { "epoch": 0.9286844505668386, "grad_norm": 0.19173072278499603, "learning_rate": 1.9792042243392157e-05, "loss": 1.3057, "step": 3118 }, { "epoch": 0.9289822967665072, "grad_norm": 0.18758858740329742, "learning_rate": 1.9791846509486294e-05, "loss": 1.3072, "step": 3119 }, { "epoch": 0.9292801429661759, "grad_norm": 0.1958310753107071, "learning_rate": 1.9791650684478247e-05, "loss": 1.3054, "step": 3120 }, { "epoch": 0.9295779891658444, "grad_norm": 0.20528602600097656, "learning_rate": 1.9791454768369823e-05, "loss": 1.318, "step": 3121 }, { "epoch": 0.9298758353655131, "grad_norm": 0.1926400512456894, "learning_rate": 1.979125876116285e-05, "loss": 1.3155, "step": 3122 }, { "epoch": 0.9301736815651818, "grad_norm": 0.20159479975700378, "learning_rate": 1.9791062662859162e-05, "loss": 1.2954, "step": 3123 }, { "epoch": 0.9304715277648504, "grad_norm": 0.20442180335521698, "learning_rate": 1.979086647346057e-05, "loss": 1.3036, "step": 3124 }, { "epoch": 0.9307693739645191, "grad_norm": 0.21335050463676453, "learning_rate": 1.9790670192968906e-05, "loss": 1.3026, "step": 3125 }, { "epoch": 0.9310672201641877, "grad_norm": 0.19624581933021545, "learning_rate": 1.9790473821385995e-05, "loss": 1.3245, "step": 3126 }, { "epoch": 0.9313650663638564, "grad_norm": 0.20077237486839294, "learning_rate": 1.9790277358713662e-05, "loss": 1.3139, "step": 3127 }, { "epoch": 0.931662912563525, "grad_norm": 0.2491120547056198, "learning_rate": 1.979008080495374e-05, "loss": 1.3303, "step": 3128 }, { "epoch": 0.9319607587631936, "grad_norm": 0.2023068517446518, "learning_rate": 1.978988416010805e-05, "loss": 1.3117, "step": 3129 }, { "epoch": 0.9322586049628623, "grad_norm": 0.19905631244182587, "learning_rate": 1.978968742417843e-05, "loss": 1.3005, "step": 3130 }, { "epoch": 0.9325564511625309, "grad_norm": 0.20414213836193085, "learning_rate": 1.97894905971667e-05, "loss": 1.3076, "step": 3131 }, { "epoch": 0.9328542973621996, "grad_norm": 0.2147301733493805, "learning_rate": 1.9789293679074704e-05, "loss": 1.3346, "step": 3132 }, { "epoch": 0.9331521435618683, "grad_norm": 0.19705048203468323, "learning_rate": 1.9789096669904266e-05, "loss": 1.3014, "step": 3133 }, { "epoch": 0.9334499897615369, "grad_norm": 0.1927923560142517, "learning_rate": 1.9788899569657216e-05, "loss": 1.334, "step": 3134 }, { "epoch": 0.9337478359612056, "grad_norm": 0.20143796503543854, "learning_rate": 1.97887023783354e-05, "loss": 1.3158, "step": 3135 }, { "epoch": 0.9340456821608741, "grad_norm": 0.20049843192100525, "learning_rate": 1.9788505095940636e-05, "loss": 1.3028, "step": 3136 }, { "epoch": 0.9343435283605428, "grad_norm": 0.1882764995098114, "learning_rate": 1.9788307722474774e-05, "loss": 1.3072, "step": 3137 }, { "epoch": 0.9346413745602115, "grad_norm": 0.19787998497486115, "learning_rate": 1.9788110257939644e-05, "loss": 1.3258, "step": 3138 }, { "epoch": 0.9349392207598801, "grad_norm": 0.1986810564994812, "learning_rate": 1.978791270233708e-05, "loss": 1.3077, "step": 3139 }, { "epoch": 0.9352370669595488, "grad_norm": 0.2063145935535431, "learning_rate": 1.978771505566893e-05, "loss": 1.3167, "step": 3140 }, { "epoch": 0.9355349131592174, "grad_norm": 0.19711002707481384, "learning_rate": 1.9787517317937025e-05, "loss": 1.3141, "step": 3141 }, { "epoch": 0.9358327593588861, "grad_norm": 0.19143569469451904, "learning_rate": 1.978731948914321e-05, "loss": 1.309, "step": 3142 }, { "epoch": 0.9361306055585547, "grad_norm": 0.19706933200359344, "learning_rate": 1.9787121569289316e-05, "loss": 1.3104, "step": 3143 }, { "epoch": 0.9364284517582233, "grad_norm": 0.19449691474437714, "learning_rate": 1.9786923558377192e-05, "loss": 1.3102, "step": 3144 }, { "epoch": 0.936726297957892, "grad_norm": 0.2025444209575653, "learning_rate": 1.978672545640868e-05, "loss": 1.2949, "step": 3145 }, { "epoch": 0.9370241441575606, "grad_norm": 0.19922438263893127, "learning_rate": 1.978652726338562e-05, "loss": 1.3044, "step": 3146 }, { "epoch": 0.9373219903572293, "grad_norm": 0.19880349934101105, "learning_rate": 1.9786328979309865e-05, "loss": 1.3072, "step": 3147 }, { "epoch": 0.937619836556898, "grad_norm": 0.19347430765628815, "learning_rate": 1.9786130604183244e-05, "loss": 1.304, "step": 3148 }, { "epoch": 0.9379176827565666, "grad_norm": 0.1958763152360916, "learning_rate": 1.9785932138007617e-05, "loss": 1.3203, "step": 3149 }, { "epoch": 0.9382155289562353, "grad_norm": 0.1980467438697815, "learning_rate": 1.9785733580784823e-05, "loss": 1.3051, "step": 3150 }, { "epoch": 0.9385133751559038, "grad_norm": 0.19710774719715118, "learning_rate": 1.978553493251671e-05, "loss": 1.3101, "step": 3151 }, { "epoch": 0.9388112213555725, "grad_norm": 0.18674476444721222, "learning_rate": 1.978533619320513e-05, "loss": 1.303, "step": 3152 }, { "epoch": 0.9391090675552412, "grad_norm": 0.20070084929466248, "learning_rate": 1.978513736285193e-05, "loss": 1.3192, "step": 3153 }, { "epoch": 0.9394069137549098, "grad_norm": 0.1897594928741455, "learning_rate": 1.9784938441458957e-05, "loss": 1.2943, "step": 3154 }, { "epoch": 0.9397047599545785, "grad_norm": 0.206159770488739, "learning_rate": 1.9784739429028063e-05, "loss": 1.3178, "step": 3155 }, { "epoch": 0.9400026061542471, "grad_norm": 0.19705045223236084, "learning_rate": 1.9784540325561104e-05, "loss": 1.2959, "step": 3156 }, { "epoch": 0.9403004523539158, "grad_norm": 0.1905844360589981, "learning_rate": 1.978434113105993e-05, "loss": 1.2994, "step": 3157 }, { "epoch": 0.9405982985535843, "grad_norm": 0.19088006019592285, "learning_rate": 1.978414184552639e-05, "loss": 1.3053, "step": 3158 }, { "epoch": 0.940896144753253, "grad_norm": 0.19885419309139252, "learning_rate": 1.9783942468962343e-05, "loss": 1.3257, "step": 3159 }, { "epoch": 0.9411939909529217, "grad_norm": 0.20647475123405457, "learning_rate": 1.978374300136964e-05, "loss": 1.301, "step": 3160 }, { "epoch": 0.9414918371525903, "grad_norm": 0.20827622711658478, "learning_rate": 1.9783543442750144e-05, "loss": 1.2988, "step": 3161 }, { "epoch": 0.941789683352259, "grad_norm": 0.1971600353717804, "learning_rate": 1.9783343793105705e-05, "loss": 1.2978, "step": 3162 }, { "epoch": 0.9420875295519276, "grad_norm": 0.20790520310401917, "learning_rate": 1.9783144052438184e-05, "loss": 1.3042, "step": 3163 }, { "epoch": 0.9423853757515963, "grad_norm": 0.19935233891010284, "learning_rate": 1.9782944220749438e-05, "loss": 1.2978, "step": 3164 }, { "epoch": 0.9426832219512649, "grad_norm": 0.19943548738956451, "learning_rate": 1.9782744298041322e-05, "loss": 1.3118, "step": 3165 }, { "epoch": 0.9429810681509335, "grad_norm": 0.19326171278953552, "learning_rate": 1.9782544284315702e-05, "loss": 1.3204, "step": 3166 }, { "epoch": 0.9432789143506022, "grad_norm": 0.21011726558208466, "learning_rate": 1.978234417957444e-05, "loss": 1.3276, "step": 3167 }, { "epoch": 0.9435767605502708, "grad_norm": 0.20445969700813293, "learning_rate": 1.9782143983819392e-05, "loss": 1.3084, "step": 3168 }, { "epoch": 0.9438746067499395, "grad_norm": 0.19919198751449585, "learning_rate": 1.9781943697052427e-05, "loss": 1.2952, "step": 3169 }, { "epoch": 0.9441724529496082, "grad_norm": 0.2089526355266571, "learning_rate": 1.97817433192754e-05, "loss": 1.304, "step": 3170 }, { "epoch": 0.9444702991492768, "grad_norm": 0.19269444048404694, "learning_rate": 1.9781542850490182e-05, "loss": 1.2865, "step": 3171 }, { "epoch": 0.9447681453489455, "grad_norm": 0.20415502786636353, "learning_rate": 1.978134229069864e-05, "loss": 1.3236, "step": 3172 }, { "epoch": 0.945065991548614, "grad_norm": 0.19797283411026, "learning_rate": 1.9781141639902636e-05, "loss": 1.3045, "step": 3173 }, { "epoch": 0.9453638377482827, "grad_norm": 0.18118560314178467, "learning_rate": 1.978094089810403e-05, "loss": 1.2933, "step": 3174 }, { "epoch": 0.9456616839479514, "grad_norm": 0.19790440797805786, "learning_rate": 1.9780740065304703e-05, "loss": 1.2991, "step": 3175 }, { "epoch": 0.94595953014762, "grad_norm": 0.19850385189056396, "learning_rate": 1.9780539141506515e-05, "loss": 1.3124, "step": 3176 }, { "epoch": 0.9462573763472887, "grad_norm": 0.19748616218566895, "learning_rate": 1.978033812671134e-05, "loss": 1.3223, "step": 3177 }, { "epoch": 0.9465552225469573, "grad_norm": 0.19483156502246857, "learning_rate": 1.9780137020921045e-05, "loss": 1.3071, "step": 3178 }, { "epoch": 0.946853068746626, "grad_norm": 0.194817915558815, "learning_rate": 1.97799358241375e-05, "loss": 1.3002, "step": 3179 }, { "epoch": 0.9471509149462946, "grad_norm": 0.19546271860599518, "learning_rate": 1.977973453636258e-05, "loss": 1.3112, "step": 3180 }, { "epoch": 0.9474487611459632, "grad_norm": 0.1972402185201645, "learning_rate": 1.977953315759816e-05, "loss": 1.2967, "step": 3181 }, { "epoch": 0.9477466073456319, "grad_norm": 0.20926529169082642, "learning_rate": 1.9779331687846105e-05, "loss": 1.3099, "step": 3182 }, { "epoch": 0.9480444535453005, "grad_norm": 0.20544731616973877, "learning_rate": 1.97791301271083e-05, "loss": 1.3136, "step": 3183 }, { "epoch": 0.9483422997449692, "grad_norm": 0.20195014774799347, "learning_rate": 1.977892847538661e-05, "loss": 1.3066, "step": 3184 }, { "epoch": 0.9486401459446379, "grad_norm": 0.20106039941310883, "learning_rate": 1.977872673268292e-05, "loss": 1.3008, "step": 3185 }, { "epoch": 0.9489379921443065, "grad_norm": 0.20220544934272766, "learning_rate": 1.9778524898999102e-05, "loss": 1.3044, "step": 3186 }, { "epoch": 0.9492358383439752, "grad_norm": 0.19450978934764862, "learning_rate": 1.9778322974337036e-05, "loss": 1.3047, "step": 3187 }, { "epoch": 0.9495336845436437, "grad_norm": 0.20149749517440796, "learning_rate": 1.97781209586986e-05, "loss": 1.3135, "step": 3188 }, { "epoch": 0.9498315307433124, "grad_norm": 0.20093722641468048, "learning_rate": 1.977791885208567e-05, "loss": 1.3067, "step": 3189 }, { "epoch": 0.950129376942981, "grad_norm": 0.1976437121629715, "learning_rate": 1.977771665450013e-05, "loss": 1.3073, "step": 3190 }, { "epoch": 0.9504272231426497, "grad_norm": 0.19089743494987488, "learning_rate": 1.977751436594386e-05, "loss": 1.3196, "step": 3191 }, { "epoch": 0.9507250693423184, "grad_norm": 0.21099534630775452, "learning_rate": 1.977731198641875e-05, "loss": 1.3068, "step": 3192 }, { "epoch": 0.951022915541987, "grad_norm": 0.1997574418783188, "learning_rate": 1.977710951592667e-05, "loss": 1.3014, "step": 3193 }, { "epoch": 0.9513207617416557, "grad_norm": 0.20798201858997345, "learning_rate": 1.977690695446951e-05, "loss": 1.2956, "step": 3194 }, { "epoch": 0.9516186079413242, "grad_norm": 0.2052495777606964, "learning_rate": 1.9776704302049155e-05, "loss": 1.3071, "step": 3195 }, { "epoch": 0.9519164541409929, "grad_norm": 0.21590670943260193, "learning_rate": 1.9776501558667488e-05, "loss": 1.299, "step": 3196 }, { "epoch": 0.9522143003406616, "grad_norm": 0.19587722420692444, "learning_rate": 1.9776298724326398e-05, "loss": 1.3029, "step": 3197 }, { "epoch": 0.9525121465403302, "grad_norm": 0.21008187532424927, "learning_rate": 1.9776095799027773e-05, "loss": 1.3249, "step": 3198 }, { "epoch": 0.9528099927399989, "grad_norm": 0.19470874965190887, "learning_rate": 1.9775892782773497e-05, "loss": 1.2969, "step": 3199 }, { "epoch": 0.9531078389396676, "grad_norm": 0.19459150731563568, "learning_rate": 1.977568967556546e-05, "loss": 1.3153, "step": 3200 }, { "epoch": 0.9534056851393362, "grad_norm": 0.1974397748708725, "learning_rate": 1.9775486477405556e-05, "loss": 1.3192, "step": 3201 }, { "epoch": 0.9537035313390049, "grad_norm": 0.19983869791030884, "learning_rate": 1.977528318829567e-05, "loss": 1.3096, "step": 3202 }, { "epoch": 0.9540013775386734, "grad_norm": 0.20790590345859528, "learning_rate": 1.9775079808237695e-05, "loss": 1.3079, "step": 3203 }, { "epoch": 0.9542992237383421, "grad_norm": 0.20371964573860168, "learning_rate": 1.9774876337233527e-05, "loss": 1.3018, "step": 3204 }, { "epoch": 0.9545970699380107, "grad_norm": 0.19802193343639374, "learning_rate": 1.977467277528505e-05, "loss": 1.3104, "step": 3205 }, { "epoch": 0.9548949161376794, "grad_norm": 0.195772185921669, "learning_rate": 1.9774469122394167e-05, "loss": 1.3017, "step": 3206 }, { "epoch": 0.9551927623373481, "grad_norm": 0.19083097577095032, "learning_rate": 1.977426537856277e-05, "loss": 1.3069, "step": 3207 }, { "epoch": 0.9554906085370167, "grad_norm": 0.20081329345703125, "learning_rate": 1.9774061543792754e-05, "loss": 1.3071, "step": 3208 }, { "epoch": 0.9557884547366854, "grad_norm": 0.2067907452583313, "learning_rate": 1.9773857618086016e-05, "loss": 1.3019, "step": 3209 }, { "epoch": 0.9560863009363539, "grad_norm": 0.21616604924201965, "learning_rate": 1.9773653601444453e-05, "loss": 1.3047, "step": 3210 }, { "epoch": 0.9563841471360226, "grad_norm": 0.2052096724510193, "learning_rate": 1.9773449493869963e-05, "loss": 1.2714, "step": 3211 }, { "epoch": 0.9566819933356913, "grad_norm": 0.20484837889671326, "learning_rate": 1.9773245295364443e-05, "loss": 1.2986, "step": 3212 }, { "epoch": 0.9569798395353599, "grad_norm": 0.19925053417682648, "learning_rate": 1.9773041005929797e-05, "loss": 1.303, "step": 3213 }, { "epoch": 0.9572776857350286, "grad_norm": 0.19449517130851746, "learning_rate": 1.9772836625567923e-05, "loss": 1.3057, "step": 3214 }, { "epoch": 0.9575755319346972, "grad_norm": 0.20237061381340027, "learning_rate": 1.977263215428072e-05, "loss": 1.3025, "step": 3215 }, { "epoch": 0.9578733781343659, "grad_norm": 0.1875840276479721, "learning_rate": 1.9772427592070095e-05, "loss": 1.2857, "step": 3216 }, { "epoch": 0.9581712243340346, "grad_norm": 0.1839030534029007, "learning_rate": 1.977222293893795e-05, "loss": 1.3042, "step": 3217 }, { "epoch": 0.9584690705337031, "grad_norm": 0.19607189297676086, "learning_rate": 1.977201819488619e-05, "loss": 1.307, "step": 3218 }, { "epoch": 0.9587669167333718, "grad_norm": 0.2031727433204651, "learning_rate": 1.977181335991672e-05, "loss": 1.3069, "step": 3219 }, { "epoch": 0.9590647629330404, "grad_norm": 0.20891068875789642, "learning_rate": 1.9771608434031443e-05, "loss": 1.29, "step": 3220 }, { "epoch": 0.9593626091327091, "grad_norm": 0.2035285234451294, "learning_rate": 1.9771403417232265e-05, "loss": 1.3103, "step": 3221 }, { "epoch": 0.9596604553323778, "grad_norm": 0.1997441202402115, "learning_rate": 1.9771198309521095e-05, "loss": 1.3185, "step": 3222 }, { "epoch": 0.9599583015320464, "grad_norm": 0.21472379565238953, "learning_rate": 1.9770993110899847e-05, "loss": 1.305, "step": 3223 }, { "epoch": 0.9602561477317151, "grad_norm": 0.20216310024261475, "learning_rate": 1.9770787821370422e-05, "loss": 1.3031, "step": 3224 }, { "epoch": 0.9605539939313836, "grad_norm": 0.2030552327632904, "learning_rate": 1.977058244093473e-05, "loss": 1.3062, "step": 3225 }, { "epoch": 0.9608518401310523, "grad_norm": 0.19075995683670044, "learning_rate": 1.9770376969594685e-05, "loss": 1.3046, "step": 3226 }, { "epoch": 0.961149686330721, "grad_norm": 0.19615478813648224, "learning_rate": 1.97701714073522e-05, "loss": 1.3173, "step": 3227 }, { "epoch": 0.9614475325303896, "grad_norm": 0.21271677315235138, "learning_rate": 1.9769965754209188e-05, "loss": 1.3039, "step": 3228 }, { "epoch": 0.9617453787300583, "grad_norm": 0.2042091339826584, "learning_rate": 1.976976001016756e-05, "loss": 1.3038, "step": 3229 }, { "epoch": 0.9620432249297269, "grad_norm": 0.19975119829177856, "learning_rate": 1.9769554175229228e-05, "loss": 1.3012, "step": 3230 }, { "epoch": 0.9623410711293956, "grad_norm": 0.20455537736415863, "learning_rate": 1.976934824939611e-05, "loss": 1.2987, "step": 3231 }, { "epoch": 0.9626389173290643, "grad_norm": 0.19876553118228912, "learning_rate": 1.9769142232670123e-05, "loss": 1.3123, "step": 3232 }, { "epoch": 0.9629367635287328, "grad_norm": 0.19535748660564423, "learning_rate": 1.976893612505318e-05, "loss": 1.3003, "step": 3233 }, { "epoch": 0.9632346097284015, "grad_norm": 0.19817012548446655, "learning_rate": 1.9768729926547205e-05, "loss": 1.2986, "step": 3234 }, { "epoch": 0.9635324559280701, "grad_norm": 0.20017985999584198, "learning_rate": 1.976852363715411e-05, "loss": 1.3041, "step": 3235 }, { "epoch": 0.9638303021277388, "grad_norm": 0.19800353050231934, "learning_rate": 1.9768317256875814e-05, "loss": 1.3256, "step": 3236 }, { "epoch": 0.9641281483274075, "grad_norm": 0.208049014210701, "learning_rate": 1.9768110785714242e-05, "loss": 1.2828, "step": 3237 }, { "epoch": 0.9644259945270761, "grad_norm": 0.21552208065986633, "learning_rate": 1.9767904223671313e-05, "loss": 1.3083, "step": 3238 }, { "epoch": 0.9647238407267448, "grad_norm": 0.21490371227264404, "learning_rate": 1.976769757074895e-05, "loss": 1.3045, "step": 3239 }, { "epoch": 0.9650216869264133, "grad_norm": 0.19391708076000214, "learning_rate": 1.9767490826949074e-05, "loss": 1.3125, "step": 3240 }, { "epoch": 0.965319533126082, "grad_norm": 0.1888916939496994, "learning_rate": 1.976728399227361e-05, "loss": 1.2935, "step": 3241 }, { "epoch": 0.9656173793257506, "grad_norm": 0.20408473908901215, "learning_rate": 1.9767077066724475e-05, "loss": 1.3003, "step": 3242 }, { "epoch": 0.9659152255254193, "grad_norm": 0.20652951300144196, "learning_rate": 1.9766870050303603e-05, "loss": 1.3059, "step": 3243 }, { "epoch": 0.966213071725088, "grad_norm": 0.19031745195388794, "learning_rate": 1.9766662943012918e-05, "loss": 1.2848, "step": 3244 }, { "epoch": 0.9665109179247566, "grad_norm": 0.19920264184474945, "learning_rate": 1.9766455744854348e-05, "loss": 1.3031, "step": 3245 }, { "epoch": 0.9668087641244253, "grad_norm": 0.19462813436985016, "learning_rate": 1.9766248455829817e-05, "loss": 1.2851, "step": 3246 }, { "epoch": 0.9671066103240938, "grad_norm": 0.19669468700885773, "learning_rate": 1.9766041075941253e-05, "loss": 1.2877, "step": 3247 }, { "epoch": 0.9674044565237625, "grad_norm": 0.19745111465454102, "learning_rate": 1.9765833605190594e-05, "loss": 1.314, "step": 3248 }, { "epoch": 0.9677023027234312, "grad_norm": 0.19436196982860565, "learning_rate": 1.976562604357976e-05, "loss": 1.3028, "step": 3249 }, { "epoch": 0.9680001489230998, "grad_norm": 0.19147449731826782, "learning_rate": 1.976541839111069e-05, "loss": 1.2855, "step": 3250 }, { "epoch": 0.9682979951227685, "grad_norm": 0.20012982189655304, "learning_rate": 1.9765210647785308e-05, "loss": 1.3043, "step": 3251 }, { "epoch": 0.9685958413224371, "grad_norm": 0.20200534164905548, "learning_rate": 1.9765002813605554e-05, "loss": 1.3101, "step": 3252 }, { "epoch": 0.9688936875221058, "grad_norm": 0.20013867318630219, "learning_rate": 1.976479488857336e-05, "loss": 1.3185, "step": 3253 }, { "epoch": 0.9691915337217745, "grad_norm": 0.20642933249473572, "learning_rate": 1.9764586872690655e-05, "loss": 1.3037, "step": 3254 }, { "epoch": 0.969489379921443, "grad_norm": 0.196793794631958, "learning_rate": 1.9764378765959385e-05, "loss": 1.3181, "step": 3255 }, { "epoch": 0.9697872261211117, "grad_norm": 0.20247621834278107, "learning_rate": 1.9764170568381477e-05, "loss": 1.2888, "step": 3256 }, { "epoch": 0.9700850723207803, "grad_norm": 0.1943470686674118, "learning_rate": 1.9763962279958872e-05, "loss": 1.3143, "step": 3257 }, { "epoch": 0.970382918520449, "grad_norm": 0.20508818328380585, "learning_rate": 1.9763753900693504e-05, "loss": 1.3134, "step": 3258 }, { "epoch": 0.9706807647201177, "grad_norm": 0.21431271731853485, "learning_rate": 1.9763545430587313e-05, "loss": 1.3126, "step": 3259 }, { "epoch": 0.9709786109197863, "grad_norm": 0.22474008798599243, "learning_rate": 1.9763336869642248e-05, "loss": 1.319, "step": 3260 }, { "epoch": 0.971276457119455, "grad_norm": 0.20004312694072723, "learning_rate": 1.9763128217860236e-05, "loss": 1.3073, "step": 3261 }, { "epoch": 0.9715743033191235, "grad_norm": 0.20104417204856873, "learning_rate": 1.9762919475243226e-05, "loss": 1.3062, "step": 3262 }, { "epoch": 0.9718721495187922, "grad_norm": 0.20001773536205292, "learning_rate": 1.9762710641793155e-05, "loss": 1.2997, "step": 3263 }, { "epoch": 0.9721699957184609, "grad_norm": 0.20377035439014435, "learning_rate": 1.9762501717511972e-05, "loss": 1.3179, "step": 3264 }, { "epoch": 0.9724678419181295, "grad_norm": 0.19937177002429962, "learning_rate": 1.9762292702401614e-05, "loss": 1.2991, "step": 3265 }, { "epoch": 0.9727656881177982, "grad_norm": 0.19473156332969666, "learning_rate": 1.9762083596464035e-05, "loss": 1.2993, "step": 3266 }, { "epoch": 0.9730635343174668, "grad_norm": 0.20990370213985443, "learning_rate": 1.9761874399701173e-05, "loss": 1.3063, "step": 3267 }, { "epoch": 0.9733613805171355, "grad_norm": 0.20723816752433777, "learning_rate": 1.9761665112114973e-05, "loss": 1.2908, "step": 3268 }, { "epoch": 0.9736592267168042, "grad_norm": 0.20216530561447144, "learning_rate": 1.976145573370739e-05, "loss": 1.3003, "step": 3269 }, { "epoch": 0.9739570729164727, "grad_norm": 0.22414833307266235, "learning_rate": 1.976124626448037e-05, "loss": 1.2981, "step": 3270 }, { "epoch": 0.9742549191161414, "grad_norm": 0.20434461534023285, "learning_rate": 1.9761036704435853e-05, "loss": 1.2964, "step": 3271 }, { "epoch": 0.97455276531581, "grad_norm": 0.20486347377300262, "learning_rate": 1.9760827053575796e-05, "loss": 1.3062, "step": 3272 }, { "epoch": 0.9748506115154787, "grad_norm": 0.19999034702777863, "learning_rate": 1.976061731190215e-05, "loss": 1.3125, "step": 3273 }, { "epoch": 0.9751484577151474, "grad_norm": 0.20354914665222168, "learning_rate": 1.9760407479416864e-05, "loss": 1.3, "step": 3274 }, { "epoch": 0.975446303914816, "grad_norm": 0.1978183090686798, "learning_rate": 1.9760197556121893e-05, "loss": 1.2989, "step": 3275 }, { "epoch": 0.9757441501144847, "grad_norm": 0.20380771160125732, "learning_rate": 1.975998754201919e-05, "loss": 1.2998, "step": 3276 }, { "epoch": 0.9760419963141532, "grad_norm": 0.19634027779102325, "learning_rate": 1.9759777437110706e-05, "loss": 1.3153, "step": 3277 }, { "epoch": 0.9763398425138219, "grad_norm": 0.19065183401107788, "learning_rate": 1.9759567241398396e-05, "loss": 1.2888, "step": 3278 }, { "epoch": 0.9766376887134905, "grad_norm": 0.21245373785495758, "learning_rate": 1.9759356954884218e-05, "loss": 1.3291, "step": 3279 }, { "epoch": 0.9769355349131592, "grad_norm": 0.20729570090770721, "learning_rate": 1.9759146577570124e-05, "loss": 1.3185, "step": 3280 }, { "epoch": 0.9772333811128279, "grad_norm": 0.2136635184288025, "learning_rate": 1.975893610945808e-05, "loss": 1.3042, "step": 3281 }, { "epoch": 0.9775312273124965, "grad_norm": 0.20022720098495483, "learning_rate": 1.9758725550550036e-05, "loss": 1.3015, "step": 3282 }, { "epoch": 0.9778290735121652, "grad_norm": 0.20077665150165558, "learning_rate": 1.9758514900847955e-05, "loss": 1.3183, "step": 3283 }, { "epoch": 0.9781269197118339, "grad_norm": 0.21058335900306702, "learning_rate": 1.9758304160353794e-05, "loss": 1.3094, "step": 3284 }, { "epoch": 0.9784247659115024, "grad_norm": 0.19979368150234222, "learning_rate": 1.9758093329069516e-05, "loss": 1.2985, "step": 3285 }, { "epoch": 0.9787226121111711, "grad_norm": 0.20603543519973755, "learning_rate": 1.9757882406997085e-05, "loss": 1.2928, "step": 3286 }, { "epoch": 0.9790204583108397, "grad_norm": 0.21984848380088806, "learning_rate": 1.9757671394138457e-05, "loss": 1.3117, "step": 3287 }, { "epoch": 0.9793183045105084, "grad_norm": 0.20999737083911896, "learning_rate": 1.97574602904956e-05, "loss": 1.3065, "step": 3288 }, { "epoch": 0.979616150710177, "grad_norm": 0.19377809762954712, "learning_rate": 1.9757249096070475e-05, "loss": 1.2862, "step": 3289 }, { "epoch": 0.9799139969098457, "grad_norm": 0.2010391801595688, "learning_rate": 1.975703781086505e-05, "loss": 1.3056, "step": 3290 }, { "epoch": 0.9802118431095144, "grad_norm": 0.21323293447494507, "learning_rate": 1.9756826434881286e-05, "loss": 1.3036, "step": 3291 }, { "epoch": 0.9805096893091829, "grad_norm": 0.21092991530895233, "learning_rate": 1.9756614968121157e-05, "loss": 1.3103, "step": 3292 }, { "epoch": 0.9808075355088516, "grad_norm": 0.21400882303714752, "learning_rate": 1.9756403410586625e-05, "loss": 1.3083, "step": 3293 }, { "epoch": 0.9811053817085202, "grad_norm": 0.19310444593429565, "learning_rate": 1.9756191762279662e-05, "loss": 1.2848, "step": 3294 }, { "epoch": 0.9814032279081889, "grad_norm": 0.22360482811927795, "learning_rate": 1.9755980023202232e-05, "loss": 1.3122, "step": 3295 }, { "epoch": 0.9817010741078576, "grad_norm": 0.219071626663208, "learning_rate": 1.9755768193356308e-05, "loss": 1.3023, "step": 3296 }, { "epoch": 0.9819989203075262, "grad_norm": 0.22543108463287354, "learning_rate": 1.9755556272743864e-05, "loss": 1.3079, "step": 3297 }, { "epoch": 0.9822967665071949, "grad_norm": 0.20201154053211212, "learning_rate": 1.9755344261366863e-05, "loss": 1.3031, "step": 3298 }, { "epoch": 0.9825946127068635, "grad_norm": 0.21543379127979279, "learning_rate": 1.9755132159227287e-05, "loss": 1.3065, "step": 3299 }, { "epoch": 0.9828924589065321, "grad_norm": 0.2380930483341217, "learning_rate": 1.9754919966327107e-05, "loss": 1.2956, "step": 3300 }, { "epoch": 0.9831903051062008, "grad_norm": 0.2125539481639862, "learning_rate": 1.975470768266829e-05, "loss": 1.318, "step": 3301 }, { "epoch": 0.9834881513058694, "grad_norm": 0.20666386187076569, "learning_rate": 1.975449530825282e-05, "loss": 1.3041, "step": 3302 }, { "epoch": 0.9837859975055381, "grad_norm": 0.19961614906787872, "learning_rate": 1.975428284308267e-05, "loss": 1.2951, "step": 3303 }, { "epoch": 0.9840838437052067, "grad_norm": 0.2076793760061264, "learning_rate": 1.9754070287159815e-05, "loss": 1.2906, "step": 3304 }, { "epoch": 0.9843816899048754, "grad_norm": 0.21194185316562653, "learning_rate": 1.9753857640486233e-05, "loss": 1.3005, "step": 3305 }, { "epoch": 0.9846795361045441, "grad_norm": 0.22201673686504364, "learning_rate": 1.9753644903063906e-05, "loss": 1.3032, "step": 3306 }, { "epoch": 0.9849773823042126, "grad_norm": 0.20467990636825562, "learning_rate": 1.975343207489481e-05, "loss": 1.3336, "step": 3307 }, { "epoch": 0.9852752285038813, "grad_norm": 0.20192810893058777, "learning_rate": 1.9753219155980922e-05, "loss": 1.2886, "step": 3308 }, { "epoch": 0.9855730747035499, "grad_norm": 0.2171163409948349, "learning_rate": 1.9753006146324232e-05, "loss": 1.3066, "step": 3309 }, { "epoch": 0.9858709209032186, "grad_norm": 0.20057716965675354, "learning_rate": 1.9752793045926712e-05, "loss": 1.2986, "step": 3310 }, { "epoch": 0.9861687671028873, "grad_norm": 0.19825346767902374, "learning_rate": 1.9752579854790353e-05, "loss": 1.3079, "step": 3311 }, { "epoch": 0.9864666133025559, "grad_norm": 0.1995212435722351, "learning_rate": 1.9752366572917135e-05, "loss": 1.3016, "step": 3312 }, { "epoch": 0.9867644595022246, "grad_norm": 0.21994702517986298, "learning_rate": 1.9752153200309037e-05, "loss": 1.3127, "step": 3313 }, { "epoch": 0.9870623057018932, "grad_norm": 0.20895038545131683, "learning_rate": 1.9751939736968053e-05, "loss": 1.3024, "step": 3314 }, { "epoch": 0.9873601519015618, "grad_norm": 0.20712026953697205, "learning_rate": 1.9751726182896166e-05, "loss": 1.294, "step": 3315 }, { "epoch": 0.9876579981012304, "grad_norm": 0.21284706890583038, "learning_rate": 1.9751512538095357e-05, "loss": 1.2929, "step": 3316 }, { "epoch": 0.9879558443008991, "grad_norm": 0.1990649700164795, "learning_rate": 1.9751298802567624e-05, "loss": 1.313, "step": 3317 }, { "epoch": 0.9882536905005678, "grad_norm": 0.19659705460071564, "learning_rate": 1.975108497631495e-05, "loss": 1.3124, "step": 3318 }, { "epoch": 0.9885515367002364, "grad_norm": 0.19998084008693695, "learning_rate": 1.975087105933932e-05, "loss": 1.3047, "step": 3319 }, { "epoch": 0.9888493828999051, "grad_norm": 0.2667716145515442, "learning_rate": 1.9750657051642738e-05, "loss": 1.2841, "step": 3320 }, { "epoch": 0.9891472290995738, "grad_norm": 0.20697744190692902, "learning_rate": 1.975044295322718e-05, "loss": 1.2899, "step": 3321 }, { "epoch": 0.9894450752992423, "grad_norm": 0.20054593682289124, "learning_rate": 1.9750228764094645e-05, "loss": 1.2943, "step": 3322 }, { "epoch": 0.989742921498911, "grad_norm": 0.2037118375301361, "learning_rate": 1.9750014484247124e-05, "loss": 1.2944, "step": 3323 }, { "epoch": 0.9900407676985796, "grad_norm": 0.20013418793678284, "learning_rate": 1.9749800113686613e-05, "loss": 1.2788, "step": 3324 }, { "epoch": 0.9903386138982483, "grad_norm": 0.19793759286403656, "learning_rate": 1.9749585652415104e-05, "loss": 1.3095, "step": 3325 }, { "epoch": 0.990636460097917, "grad_norm": 0.20117361843585968, "learning_rate": 1.9749371100434596e-05, "loss": 1.2982, "step": 3326 }, { "epoch": 0.9909343062975856, "grad_norm": 0.2205909639596939, "learning_rate": 1.9749156457747078e-05, "loss": 1.2955, "step": 3327 }, { "epoch": 0.9912321524972543, "grad_norm": 0.21903647482395172, "learning_rate": 1.9748941724354555e-05, "loss": 1.283, "step": 3328 }, { "epoch": 0.9915299986969229, "grad_norm": 0.20330850780010223, "learning_rate": 1.974872690025902e-05, "loss": 1.3012, "step": 3329 }, { "epoch": 0.9918278448965915, "grad_norm": 0.297417014837265, "learning_rate": 1.9748511985462474e-05, "loss": 1.3106, "step": 3330 }, { "epoch": 0.9921256910962601, "grad_norm": 0.1906329095363617, "learning_rate": 1.9748296979966916e-05, "loss": 1.3036, "step": 3331 }, { "epoch": 0.9924235372959288, "grad_norm": 0.21068215370178223, "learning_rate": 1.9748081883774346e-05, "loss": 1.3204, "step": 3332 }, { "epoch": 0.9927213834955975, "grad_norm": 0.20962785184383392, "learning_rate": 1.9747866696886767e-05, "loss": 1.3056, "step": 3333 }, { "epoch": 0.9930192296952661, "grad_norm": 0.20342715084552765, "learning_rate": 1.9747651419306174e-05, "loss": 1.3013, "step": 3334 }, { "epoch": 0.9933170758949348, "grad_norm": 0.199946790933609, "learning_rate": 1.9747436051034577e-05, "loss": 1.3148, "step": 3335 }, { "epoch": 0.9936149220946034, "grad_norm": 0.2034381926059723, "learning_rate": 1.974722059207398e-05, "loss": 1.3077, "step": 3336 }, { "epoch": 0.993912768294272, "grad_norm": 0.20843319594860077, "learning_rate": 1.974700504242639e-05, "loss": 1.3088, "step": 3337 }, { "epoch": 0.9942106144939407, "grad_norm": 0.1890590339899063, "learning_rate": 1.97467894020938e-05, "loss": 1.2777, "step": 3338 }, { "epoch": 0.9945084606936093, "grad_norm": 0.20892484486103058, "learning_rate": 1.9746573671078225e-05, "loss": 1.3049, "step": 3339 }, { "epoch": 0.994806306893278, "grad_norm": 0.19718541204929352, "learning_rate": 1.9746357849381675e-05, "loss": 1.2859, "step": 3340 }, { "epoch": 0.9951041530929466, "grad_norm": 0.21576572954654694, "learning_rate": 1.9746141937006155e-05, "loss": 1.3173, "step": 3341 }, { "epoch": 0.9954019992926153, "grad_norm": 0.20363959670066833, "learning_rate": 1.974592593395367e-05, "loss": 1.2754, "step": 3342 }, { "epoch": 0.995699845492284, "grad_norm": 0.20009107887744904, "learning_rate": 1.9745709840226236e-05, "loss": 1.2979, "step": 3343 }, { "epoch": 0.9959976916919525, "grad_norm": 0.19112053513526917, "learning_rate": 1.9745493655825858e-05, "loss": 1.305, "step": 3344 }, { "epoch": 0.9962955378916212, "grad_norm": 0.20680101215839386, "learning_rate": 1.9745277380754553e-05, "loss": 1.3173, "step": 3345 }, { "epoch": 0.9965933840912898, "grad_norm": 0.19246166944503784, "learning_rate": 1.9745061015014325e-05, "loss": 1.3163, "step": 3346 }, { "epoch": 0.9968912302909585, "grad_norm": 0.1975969821214676, "learning_rate": 1.97448445586072e-05, "loss": 1.2984, "step": 3347 }, { "epoch": 0.9971890764906272, "grad_norm": 0.1960238516330719, "learning_rate": 1.9744628011535176e-05, "loss": 1.3003, "step": 3348 }, { "epoch": 0.9974869226902958, "grad_norm": 0.20400959253311157, "learning_rate": 1.974441137380028e-05, "loss": 1.2883, "step": 3349 }, { "epoch": 0.9977847688899645, "grad_norm": 0.1912810355424881, "learning_rate": 1.9744194645404523e-05, "loss": 1.2951, "step": 3350 }, { "epoch": 0.9980826150896331, "grad_norm": 0.18957996368408203, "learning_rate": 1.9743977826349917e-05, "loss": 1.2963, "step": 3351 }, { "epoch": 0.9983804612893017, "grad_norm": 0.2020920068025589, "learning_rate": 1.974376091663849e-05, "loss": 1.3111, "step": 3352 }, { "epoch": 0.9986783074889704, "grad_norm": 0.19816243648529053, "learning_rate": 1.9743543916272252e-05, "loss": 1.2878, "step": 3353 }, { "epoch": 0.998976153688639, "grad_norm": 0.19619014859199524, "learning_rate": 1.9743326825253225e-05, "loss": 1.3091, "step": 3354 }, { "epoch": 0.9992739998883077, "grad_norm": 0.202503502368927, "learning_rate": 1.9743109643583425e-05, "loss": 1.2974, "step": 3355 }, { "epoch": 0.9995718460879763, "grad_norm": 0.19494196772575378, "learning_rate": 1.9742892371264876e-05, "loss": 1.3062, "step": 3356 }, { "epoch": 0.999869692287645, "grad_norm": 0.2075500786304474, "learning_rate": 1.97426750082996e-05, "loss": 1.311, "step": 3357 }, { "epoch": 1.0001675384873137, "grad_norm": 0.20512253046035767, "learning_rate": 1.974245755468962e-05, "loss": 1.3058, "step": 3358 }, { "epoch": 1.0004653846869822, "grad_norm": 0.2019047886133194, "learning_rate": 1.9742240010436955e-05, "loss": 1.2981, "step": 3359 }, { "epoch": 1.000763230886651, "grad_norm": 0.20687521994113922, "learning_rate": 1.974202237554363e-05, "loss": 1.3051, "step": 3360 }, { "epoch": 1.0010610770863195, "grad_norm": 0.2059999257326126, "learning_rate": 1.9741804650011673e-05, "loss": 1.3112, "step": 3361 }, { "epoch": 1.0013589232859883, "grad_norm": 0.2035730481147766, "learning_rate": 1.974158683384311e-05, "loss": 1.3189, "step": 3362 }, { "epoch": 1.0016567694856569, "grad_norm": 0.21043939888477325, "learning_rate": 1.9741368927039962e-05, "loss": 1.296, "step": 3363 }, { "epoch": 1.0019546156853254, "grad_norm": 0.20265917479991913, "learning_rate": 1.974115092960426e-05, "loss": 1.3064, "step": 3364 }, { "epoch": 1.0022524618849942, "grad_norm": 0.1880495548248291, "learning_rate": 1.9740932841538035e-05, "loss": 1.3062, "step": 3365 }, { "epoch": 1.0025503080846627, "grad_norm": 0.19218531250953674, "learning_rate": 1.9740714662843313e-05, "loss": 1.3058, "step": 3366 }, { "epoch": 1.0028481542843315, "grad_norm": 0.19349585473537445, "learning_rate": 1.9740496393522123e-05, "loss": 1.3055, "step": 3367 }, { "epoch": 1.003146000484, "grad_norm": 0.23449759185314178, "learning_rate": 1.9740278033576496e-05, "loss": 1.3067, "step": 3368 }, { "epoch": 1.0034438466836688, "grad_norm": 0.1942831575870514, "learning_rate": 1.9740059583008463e-05, "loss": 1.2853, "step": 3369 }, { "epoch": 1.0037416928833374, "grad_norm": 0.19284681975841522, "learning_rate": 1.973984104182006e-05, "loss": 1.2824, "step": 3370 }, { "epoch": 1.004039539083006, "grad_norm": 0.2020491063594818, "learning_rate": 1.973962241001332e-05, "loss": 1.2929, "step": 3371 }, { "epoch": 1.0043373852826747, "grad_norm": 0.19560948014259338, "learning_rate": 1.9739403687590268e-05, "loss": 1.2821, "step": 3372 }, { "epoch": 1.0046352314823432, "grad_norm": 0.20239092409610748, "learning_rate": 1.973918487455295e-05, "loss": 1.297, "step": 3373 }, { "epoch": 1.004933077682012, "grad_norm": 0.19475553929805756, "learning_rate": 1.9738965970903397e-05, "loss": 1.3142, "step": 3374 }, { "epoch": 1.0052309238816806, "grad_norm": 0.2040357142686844, "learning_rate": 1.973874697664365e-05, "loss": 1.3028, "step": 3375 }, { "epoch": 1.0055287700813493, "grad_norm": 0.1978982836008072, "learning_rate": 1.9738527891775746e-05, "loss": 1.303, "step": 3376 }, { "epoch": 1.0058266162810179, "grad_norm": 0.20032471418380737, "learning_rate": 1.973830871630171e-05, "loss": 1.3013, "step": 3377 }, { "epoch": 1.0061244624806864, "grad_norm": 0.2011338174343109, "learning_rate": 1.9738089450223602e-05, "loss": 1.29, "step": 3378 }, { "epoch": 1.0064223086803552, "grad_norm": 0.19828371703624725, "learning_rate": 1.9737870093543446e-05, "loss": 1.3042, "step": 3379 }, { "epoch": 1.0067201548800238, "grad_norm": 0.20131751894950867, "learning_rate": 1.973765064626329e-05, "loss": 1.2847, "step": 3380 }, { "epoch": 1.0070180010796925, "grad_norm": 0.19987799227237701, "learning_rate": 1.9737431108385174e-05, "loss": 1.3098, "step": 3381 }, { "epoch": 1.007315847279361, "grad_norm": 0.20842686295509338, "learning_rate": 1.9737211479911143e-05, "loss": 1.3177, "step": 3382 }, { "epoch": 1.0076136934790298, "grad_norm": 0.19841377437114716, "learning_rate": 1.9736991760843235e-05, "loss": 1.2875, "step": 3383 }, { "epoch": 1.0079115396786984, "grad_norm": 0.1997886598110199, "learning_rate": 1.97367719511835e-05, "loss": 1.2997, "step": 3384 }, { "epoch": 1.008209385878367, "grad_norm": 0.20692330598831177, "learning_rate": 1.973655205093398e-05, "loss": 1.3109, "step": 3385 }, { "epoch": 1.0085072320780357, "grad_norm": 0.19390694797039032, "learning_rate": 1.973633206009672e-05, "loss": 1.327, "step": 3386 }, { "epoch": 1.0088050782777043, "grad_norm": 0.207501620054245, "learning_rate": 1.973611197867377e-05, "loss": 1.2863, "step": 3387 }, { "epoch": 1.009102924477373, "grad_norm": 0.2012711763381958, "learning_rate": 1.9735891806667173e-05, "loss": 1.2983, "step": 3388 }, { "epoch": 1.0094007706770416, "grad_norm": 0.20405852794647217, "learning_rate": 1.9735671544078985e-05, "loss": 1.3109, "step": 3389 }, { "epoch": 1.0096986168767104, "grad_norm": 0.20827427506446838, "learning_rate": 1.973545119091125e-05, "loss": 1.3079, "step": 3390 }, { "epoch": 1.009996463076379, "grad_norm": 0.21407508850097656, "learning_rate": 1.9735230747166014e-05, "loss": 1.2979, "step": 3391 }, { "epoch": 1.0102943092760477, "grad_norm": 0.20271380245685577, "learning_rate": 1.9735010212845336e-05, "loss": 1.3111, "step": 3392 }, { "epoch": 1.0105921554757162, "grad_norm": 0.19732552766799927, "learning_rate": 1.9734789587951264e-05, "loss": 1.3056, "step": 3393 }, { "epoch": 1.0108900016753848, "grad_norm": 0.20793870091438293, "learning_rate": 1.9734568872485852e-05, "loss": 1.3091, "step": 3394 }, { "epoch": 1.0111878478750536, "grad_norm": 0.21210235357284546, "learning_rate": 1.973434806645115e-05, "loss": 1.2873, "step": 3395 }, { "epoch": 1.011485694074722, "grad_norm": 0.19632075726985931, "learning_rate": 1.973412716984922e-05, "loss": 1.2919, "step": 3396 }, { "epoch": 1.0117835402743909, "grad_norm": 0.20312587916851044, "learning_rate": 1.9733906182682107e-05, "loss": 1.2898, "step": 3397 }, { "epoch": 1.0120813864740594, "grad_norm": 0.19858725368976593, "learning_rate": 1.9733685104951874e-05, "loss": 1.3011, "step": 3398 }, { "epoch": 1.0123792326737282, "grad_norm": 0.20250661671161652, "learning_rate": 1.973346393666058e-05, "loss": 1.3047, "step": 3399 }, { "epoch": 1.0126770788733968, "grad_norm": 0.2005874067544937, "learning_rate": 1.9733242677810277e-05, "loss": 1.3041, "step": 3400 }, { "epoch": 1.0129749250730653, "grad_norm": 0.2064727246761322, "learning_rate": 1.9733021328403022e-05, "loss": 1.2991, "step": 3401 }, { "epoch": 1.013272771272734, "grad_norm": 0.21274413168430328, "learning_rate": 1.973279988844088e-05, "loss": 1.2957, "step": 3402 }, { "epoch": 1.0135706174724026, "grad_norm": 0.2098504602909088, "learning_rate": 1.9732578357925913e-05, "loss": 1.3196, "step": 3403 }, { "epoch": 1.0138684636720714, "grad_norm": 0.20712429285049438, "learning_rate": 1.9732356736860172e-05, "loss": 1.3001, "step": 3404 }, { "epoch": 1.01416630987174, "grad_norm": 0.2014516294002533, "learning_rate": 1.973213502524573e-05, "loss": 1.2929, "step": 3405 }, { "epoch": 1.0144641560714087, "grad_norm": 0.20681215822696686, "learning_rate": 1.973191322308464e-05, "loss": 1.307, "step": 3406 }, { "epoch": 1.0147620022710773, "grad_norm": 0.21458099782466888, "learning_rate": 1.9731691330378972e-05, "loss": 1.3124, "step": 3407 }, { "epoch": 1.0150598484707458, "grad_norm": 0.20352648198604584, "learning_rate": 1.9731469347130793e-05, "loss": 1.29, "step": 3408 }, { "epoch": 1.0153576946704146, "grad_norm": 0.20370642840862274, "learning_rate": 1.973124727334216e-05, "loss": 1.2969, "step": 3409 }, { "epoch": 1.0156555408700831, "grad_norm": 0.21255552768707275, "learning_rate": 1.9731025109015146e-05, "loss": 1.3053, "step": 3410 }, { "epoch": 1.015953387069752, "grad_norm": 0.21125616133213043, "learning_rate": 1.9730802854151815e-05, "loss": 1.3053, "step": 3411 }, { "epoch": 1.0162512332694205, "grad_norm": 0.1962079554796219, "learning_rate": 1.9730580508754235e-05, "loss": 1.312, "step": 3412 }, { "epoch": 1.0165490794690892, "grad_norm": 0.19924552738666534, "learning_rate": 1.9730358072824476e-05, "loss": 1.2835, "step": 3413 }, { "epoch": 1.0168469256687578, "grad_norm": 0.2046997994184494, "learning_rate": 1.9730135546364605e-05, "loss": 1.2881, "step": 3414 }, { "epoch": 1.0171447718684263, "grad_norm": 0.21070091426372528, "learning_rate": 1.9729912929376695e-05, "loss": 1.3205, "step": 3415 }, { "epoch": 1.017442618068095, "grad_norm": 0.19929635524749756, "learning_rate": 1.9729690221862816e-05, "loss": 1.3032, "step": 3416 }, { "epoch": 1.0177404642677637, "grad_norm": 0.19725117087364197, "learning_rate": 1.972946742382504e-05, "loss": 1.312, "step": 3417 }, { "epoch": 1.0180383104674324, "grad_norm": 0.21880078315734863, "learning_rate": 1.972924453526544e-05, "loss": 1.2986, "step": 3418 }, { "epoch": 1.018336156667101, "grad_norm": 0.21586327254772186, "learning_rate": 1.972902155618609e-05, "loss": 1.3067, "step": 3419 }, { "epoch": 1.0186340028667697, "grad_norm": 0.21872232854366302, "learning_rate": 1.9728798486589065e-05, "loss": 1.3118, "step": 3420 }, { "epoch": 1.0189318490664383, "grad_norm": 0.2075866013765335, "learning_rate": 1.9728575326476437e-05, "loss": 1.3075, "step": 3421 }, { "epoch": 1.019229695266107, "grad_norm": 0.21058566868305206, "learning_rate": 1.9728352075850287e-05, "loss": 1.3022, "step": 3422 }, { "epoch": 1.0195275414657756, "grad_norm": 0.20899464190006256, "learning_rate": 1.972812873471269e-05, "loss": 1.2853, "step": 3423 }, { "epoch": 1.0198253876654442, "grad_norm": 0.22943811118602753, "learning_rate": 1.9727905303065724e-05, "loss": 1.3279, "step": 3424 }, { "epoch": 1.020123233865113, "grad_norm": 0.20464380085468292, "learning_rate": 1.9727681780911467e-05, "loss": 1.3055, "step": 3425 }, { "epoch": 1.0204210800647815, "grad_norm": 0.20870985090732574, "learning_rate": 1.9727458168252e-05, "loss": 1.3253, "step": 3426 }, { "epoch": 1.0207189262644503, "grad_norm": 0.2142518013715744, "learning_rate": 1.9727234465089406e-05, "loss": 1.3113, "step": 3427 }, { "epoch": 1.0210167724641188, "grad_norm": 0.20791509747505188, "learning_rate": 1.9727010671425758e-05, "loss": 1.2828, "step": 3428 }, { "epoch": 1.0213146186637876, "grad_norm": 0.20511488616466522, "learning_rate": 1.9726786787263145e-05, "loss": 1.2879, "step": 3429 }, { "epoch": 1.0216124648634561, "grad_norm": 0.20509251952171326, "learning_rate": 1.972656281260365e-05, "loss": 1.293, "step": 3430 }, { "epoch": 1.0219103110631247, "grad_norm": 0.20584897696971893, "learning_rate": 1.9726338747449356e-05, "loss": 1.2958, "step": 3431 }, { "epoch": 1.0222081572627935, "grad_norm": 0.2150140404701233, "learning_rate": 1.9726114591802343e-05, "loss": 1.3014, "step": 3432 }, { "epoch": 1.022506003462462, "grad_norm": 0.22393961250782013, "learning_rate": 1.9725890345664703e-05, "loss": 1.3094, "step": 3433 }, { "epoch": 1.0228038496621308, "grad_norm": 0.20525576174259186, "learning_rate": 1.972566600903852e-05, "loss": 1.3008, "step": 3434 }, { "epoch": 1.0231016958617993, "grad_norm": 0.20266354084014893, "learning_rate": 1.972544158192588e-05, "loss": 1.3082, "step": 3435 }, { "epoch": 1.023399542061468, "grad_norm": 0.21380510926246643, "learning_rate": 1.972521706432887e-05, "loss": 1.312, "step": 3436 }, { "epoch": 1.0236973882611367, "grad_norm": 0.20366153120994568, "learning_rate": 1.9724992456249584e-05, "loss": 1.3105, "step": 3437 }, { "epoch": 1.0239952344608052, "grad_norm": 0.21148470044136047, "learning_rate": 1.972476775769011e-05, "loss": 1.3014, "step": 3438 }, { "epoch": 1.024293080660474, "grad_norm": 0.21774984896183014, "learning_rate": 1.9724542968652532e-05, "loss": 1.2974, "step": 3439 }, { "epoch": 1.0245909268601425, "grad_norm": 0.20611019432544708, "learning_rate": 1.9724318089138948e-05, "loss": 1.2826, "step": 3440 }, { "epoch": 1.0248887730598113, "grad_norm": 0.20914271473884583, "learning_rate": 1.9724093119151454e-05, "loss": 1.3097, "step": 3441 }, { "epoch": 1.0251866192594798, "grad_norm": 0.21499302983283997, "learning_rate": 1.972386805869213e-05, "loss": 1.2998, "step": 3442 }, { "epoch": 1.0254844654591486, "grad_norm": 0.19540037214756012, "learning_rate": 1.9723642907763082e-05, "loss": 1.2874, "step": 3443 }, { "epoch": 1.0257823116588172, "grad_norm": 0.20741817355155945, "learning_rate": 1.97234176663664e-05, "loss": 1.3061, "step": 3444 }, { "epoch": 1.0260801578584857, "grad_norm": 0.2004505693912506, "learning_rate": 1.9723192334504183e-05, "loss": 1.3131, "step": 3445 }, { "epoch": 1.0263780040581545, "grad_norm": 0.19974607229232788, "learning_rate": 1.972296691217852e-05, "loss": 1.3027, "step": 3446 }, { "epoch": 1.026675850257823, "grad_norm": 0.2027173936367035, "learning_rate": 1.9722741399391517e-05, "loss": 1.3005, "step": 3447 }, { "epoch": 1.0269736964574918, "grad_norm": 0.20132912695407867, "learning_rate": 1.972251579614527e-05, "loss": 1.2963, "step": 3448 }, { "epoch": 1.0272715426571604, "grad_norm": 0.2098199427127838, "learning_rate": 1.972229010244187e-05, "loss": 1.3115, "step": 3449 }, { "epoch": 1.0275693888568291, "grad_norm": 0.20256924629211426, "learning_rate": 1.9722064318283425e-05, "loss": 1.3161, "step": 3450 }, { "epoch": 1.0278672350564977, "grad_norm": 0.20966893434524536, "learning_rate": 1.9721838443672035e-05, "loss": 1.2963, "step": 3451 }, { "epoch": 1.0281650812561662, "grad_norm": 0.2011045664548874, "learning_rate": 1.97216124786098e-05, "loss": 1.3011, "step": 3452 }, { "epoch": 1.028462927455835, "grad_norm": 0.19789999723434448, "learning_rate": 1.972138642309882e-05, "loss": 1.2844, "step": 3453 }, { "epoch": 1.0287607736555036, "grad_norm": 0.20889869332313538, "learning_rate": 1.9721160277141205e-05, "loss": 1.2916, "step": 3454 }, { "epoch": 1.0290586198551723, "grad_norm": 0.19314032793045044, "learning_rate": 1.972093404073905e-05, "loss": 1.3052, "step": 3455 }, { "epoch": 1.0293564660548409, "grad_norm": 0.1976383626461029, "learning_rate": 1.972070771389447e-05, "loss": 1.2919, "step": 3456 }, { "epoch": 1.0296543122545097, "grad_norm": 0.20616964995861053, "learning_rate": 1.9720481296609562e-05, "loss": 1.3037, "step": 3457 }, { "epoch": 1.0299521584541782, "grad_norm": 0.19698336720466614, "learning_rate": 1.9720254788886435e-05, "loss": 1.3013, "step": 3458 }, { "epoch": 1.030250004653847, "grad_norm": 0.2089948058128357, "learning_rate": 1.97200281907272e-05, "loss": 1.2874, "step": 3459 }, { "epoch": 1.0305478508535155, "grad_norm": 0.20039913058280945, "learning_rate": 1.971980150213396e-05, "loss": 1.2924, "step": 3460 }, { "epoch": 1.030845697053184, "grad_norm": 0.192903533577919, "learning_rate": 1.9719574723108828e-05, "loss": 1.2985, "step": 3461 }, { "epoch": 1.0311435432528528, "grad_norm": 0.19958768784999847, "learning_rate": 1.971934785365391e-05, "loss": 1.3123, "step": 3462 }, { "epoch": 1.0314413894525214, "grad_norm": 0.20145928859710693, "learning_rate": 1.9719120893771328e-05, "loss": 1.3063, "step": 3463 }, { "epoch": 1.0317392356521902, "grad_norm": 0.20627877116203308, "learning_rate": 1.9718893843463177e-05, "loss": 1.3132, "step": 3464 }, { "epoch": 1.0320370818518587, "grad_norm": 0.20193171501159668, "learning_rate": 1.971866670273158e-05, "loss": 1.3049, "step": 3465 }, { "epoch": 1.0323349280515275, "grad_norm": 0.21276478469371796, "learning_rate": 1.9718439471578645e-05, "loss": 1.3172, "step": 3466 }, { "epoch": 1.032632774251196, "grad_norm": 0.21268029510974884, "learning_rate": 1.971821215000649e-05, "loss": 1.3027, "step": 3467 }, { "epoch": 1.0329306204508646, "grad_norm": 0.20449820160865784, "learning_rate": 1.971798473801723e-05, "loss": 1.3016, "step": 3468 }, { "epoch": 1.0332284666505334, "grad_norm": 0.2022082656621933, "learning_rate": 1.9717757235612977e-05, "loss": 1.2869, "step": 3469 }, { "epoch": 1.033526312850202, "grad_norm": 0.22149288654327393, "learning_rate": 1.9717529642795853e-05, "loss": 1.2966, "step": 3470 }, { "epoch": 1.0338241590498707, "grad_norm": 0.20105499029159546, "learning_rate": 1.971730195956797e-05, "loss": 1.2822, "step": 3471 }, { "epoch": 1.0341220052495392, "grad_norm": 0.21253730356693268, "learning_rate": 1.9717074185931454e-05, "loss": 1.2882, "step": 3472 }, { "epoch": 1.034419851449208, "grad_norm": 0.22005505859851837, "learning_rate": 1.9716846321888417e-05, "loss": 1.3278, "step": 3473 }, { "epoch": 1.0347176976488766, "grad_norm": 0.19879037141799927, "learning_rate": 1.9716618367440978e-05, "loss": 1.3033, "step": 3474 }, { "epoch": 1.035015543848545, "grad_norm": 0.2200528383255005, "learning_rate": 1.9716390322591264e-05, "loss": 1.3149, "step": 3475 }, { "epoch": 1.0353133900482139, "grad_norm": 0.21222293376922607, "learning_rate": 1.9716162187341393e-05, "loss": 1.3334, "step": 3476 }, { "epoch": 1.0356112362478824, "grad_norm": 0.22255544364452362, "learning_rate": 1.971593396169349e-05, "loss": 1.2955, "step": 3477 }, { "epoch": 1.0359090824475512, "grad_norm": 0.2219313234090805, "learning_rate": 1.9715705645649677e-05, "loss": 1.3059, "step": 3478 }, { "epoch": 1.0362069286472197, "grad_norm": 0.19292521476745605, "learning_rate": 1.9715477239212074e-05, "loss": 1.2983, "step": 3479 }, { "epoch": 1.0365047748468885, "grad_norm": 0.21676228940486908, "learning_rate": 1.9715248742382815e-05, "loss": 1.2815, "step": 3480 }, { "epoch": 1.036802621046557, "grad_norm": 0.21063315868377686, "learning_rate": 1.9715020155164017e-05, "loss": 1.3115, "step": 3481 }, { "epoch": 1.0371004672462258, "grad_norm": 0.2131437510251999, "learning_rate": 1.971479147755781e-05, "loss": 1.2765, "step": 3482 }, { "epoch": 1.0373983134458944, "grad_norm": 0.22631488740444183, "learning_rate": 1.9714562709566326e-05, "loss": 1.3272, "step": 3483 }, { "epoch": 1.037696159645563, "grad_norm": 0.20695947110652924, "learning_rate": 1.9714333851191688e-05, "loss": 1.2974, "step": 3484 }, { "epoch": 1.0379940058452317, "grad_norm": 0.22460728883743286, "learning_rate": 1.971410490243603e-05, "loss": 1.308, "step": 3485 }, { "epoch": 1.0382918520449003, "grad_norm": 0.21392583847045898, "learning_rate": 1.9713875863301475e-05, "loss": 1.2898, "step": 3486 }, { "epoch": 1.038589698244569, "grad_norm": 0.21392081677913666, "learning_rate": 1.9713646733790158e-05, "loss": 1.2902, "step": 3487 }, { "epoch": 1.0388875444442376, "grad_norm": 0.21687346696853638, "learning_rate": 1.9713417513904213e-05, "loss": 1.304, "step": 3488 }, { "epoch": 1.0391853906439064, "grad_norm": 0.2224649339914322, "learning_rate": 1.971318820364577e-05, "loss": 1.3146, "step": 3489 }, { "epoch": 1.039483236843575, "grad_norm": 0.2142845094203949, "learning_rate": 1.9712958803016962e-05, "loss": 1.2824, "step": 3490 }, { "epoch": 1.0397810830432435, "grad_norm": 0.19883057475090027, "learning_rate": 1.9712729312019925e-05, "loss": 1.2832, "step": 3491 }, { "epoch": 1.0400789292429122, "grad_norm": 0.20167869329452515, "learning_rate": 1.9712499730656796e-05, "loss": 1.3068, "step": 3492 }, { "epoch": 1.0403767754425808, "grad_norm": 0.21614550054073334, "learning_rate": 1.9712270058929704e-05, "loss": 1.3258, "step": 3493 }, { "epoch": 1.0406746216422496, "grad_norm": 0.22374406456947327, "learning_rate": 1.9712040296840795e-05, "loss": 1.3295, "step": 3494 }, { "epoch": 1.040972467841918, "grad_norm": 0.2011774331331253, "learning_rate": 1.9711810444392198e-05, "loss": 1.3044, "step": 3495 }, { "epoch": 1.0412703140415869, "grad_norm": 0.2045108824968338, "learning_rate": 1.971158050158606e-05, "loss": 1.3218, "step": 3496 }, { "epoch": 1.0415681602412554, "grad_norm": 0.22346441447734833, "learning_rate": 1.971135046842451e-05, "loss": 1.3086, "step": 3497 }, { "epoch": 1.041866006440924, "grad_norm": 0.22070644795894623, "learning_rate": 1.9711120344909695e-05, "loss": 1.3092, "step": 3498 }, { "epoch": 1.0421638526405927, "grad_norm": 0.2148740440607071, "learning_rate": 1.971089013104376e-05, "loss": 1.3033, "step": 3499 }, { "epoch": 1.0424616988402613, "grad_norm": 0.20926442742347717, "learning_rate": 1.971065982682884e-05, "loss": 1.312, "step": 3500 }, { "epoch": 1.0424616988402613, "eval_loss": 1.3607652187347412, "eval_runtime": 21.2984, "eval_samples_per_second": 81.415, "eval_steps_per_second": 5.118, "step": 3500 }, { "epoch": 1.04275954503993, "grad_norm": 0.21613319218158722, "learning_rate": 1.9710429432267076e-05, "loss": 1.3031, "step": 3501 }, { "epoch": 1.0430573912395986, "grad_norm": 0.21077358722686768, "learning_rate": 1.9710198947360616e-05, "loss": 1.2972, "step": 3502 }, { "epoch": 1.0433552374392674, "grad_norm": 0.2114875614643097, "learning_rate": 1.9709968372111604e-05, "loss": 1.3196, "step": 3503 }, { "epoch": 1.043653083638936, "grad_norm": 0.19951535761356354, "learning_rate": 1.970973770652219e-05, "loss": 1.2752, "step": 3504 }, { "epoch": 1.0439509298386045, "grad_norm": 0.22120660543441772, "learning_rate": 1.970950695059451e-05, "loss": 1.3142, "step": 3505 }, { "epoch": 1.0442487760382733, "grad_norm": 0.2088823914527893, "learning_rate": 1.970927610433072e-05, "loss": 1.3014, "step": 3506 }, { "epoch": 1.0445466222379418, "grad_norm": 0.2029641717672348, "learning_rate": 1.9709045167732958e-05, "loss": 1.3043, "step": 3507 }, { "epoch": 1.0448444684376106, "grad_norm": 0.20846408605575562, "learning_rate": 1.970881414080338e-05, "loss": 1.3097, "step": 3508 }, { "epoch": 1.0451423146372791, "grad_norm": 0.208656907081604, "learning_rate": 1.9708583023544138e-05, "loss": 1.3067, "step": 3509 }, { "epoch": 1.045440160836948, "grad_norm": 0.22487233579158783, "learning_rate": 1.9708351815957374e-05, "loss": 1.2978, "step": 3510 }, { "epoch": 1.0457380070366165, "grad_norm": 0.22238117456436157, "learning_rate": 1.9708120518045245e-05, "loss": 1.3077, "step": 3511 }, { "epoch": 1.046035853236285, "grad_norm": 0.22144672274589539, "learning_rate": 1.9707889129809898e-05, "loss": 1.2787, "step": 3512 }, { "epoch": 1.0463336994359538, "grad_norm": 0.2036600261926651, "learning_rate": 1.9707657651253492e-05, "loss": 1.2807, "step": 3513 }, { "epoch": 1.0466315456356223, "grad_norm": 0.2069157212972641, "learning_rate": 1.9707426082378177e-05, "loss": 1.2925, "step": 3514 }, { "epoch": 1.046929391835291, "grad_norm": 0.2219168096780777, "learning_rate": 1.970719442318611e-05, "loss": 1.3058, "step": 3515 }, { "epoch": 1.0472272380349597, "grad_norm": 0.2090456485748291, "learning_rate": 1.970696267367944e-05, "loss": 1.3094, "step": 3516 }, { "epoch": 1.0475250842346284, "grad_norm": 0.21614153683185577, "learning_rate": 1.970673083386033e-05, "loss": 1.2913, "step": 3517 }, { "epoch": 1.047822930434297, "grad_norm": 0.20335333049297333, "learning_rate": 1.9706498903730934e-05, "loss": 1.293, "step": 3518 }, { "epoch": 1.0481207766339655, "grad_norm": 0.21833182871341705, "learning_rate": 1.9706266883293413e-05, "loss": 1.3076, "step": 3519 }, { "epoch": 1.0484186228336343, "grad_norm": 0.2128637731075287, "learning_rate": 1.970603477254992e-05, "loss": 1.2974, "step": 3520 }, { "epoch": 1.0487164690333028, "grad_norm": 0.21557752788066864, "learning_rate": 1.970580257150262e-05, "loss": 1.3052, "step": 3521 }, { "epoch": 1.0490143152329716, "grad_norm": 0.21013541519641876, "learning_rate": 1.970557028015367e-05, "loss": 1.3026, "step": 3522 }, { "epoch": 1.0493121614326402, "grad_norm": 0.2258487045764923, "learning_rate": 1.970533789850523e-05, "loss": 1.3012, "step": 3523 }, { "epoch": 1.049610007632309, "grad_norm": 0.2073233425617218, "learning_rate": 1.970510542655947e-05, "loss": 1.3225, "step": 3524 }, { "epoch": 1.0499078538319775, "grad_norm": 0.2119024246931076, "learning_rate": 1.970487286431854e-05, "loss": 1.3007, "step": 3525 }, { "epoch": 1.0502057000316463, "grad_norm": 0.20058347284793854, "learning_rate": 1.9704640211784617e-05, "loss": 1.2962, "step": 3526 }, { "epoch": 1.0505035462313148, "grad_norm": 0.20473431050777435, "learning_rate": 1.9704407468959855e-05, "loss": 1.2948, "step": 3527 }, { "epoch": 1.0508013924309834, "grad_norm": 0.20855119824409485, "learning_rate": 1.9704174635846426e-05, "loss": 1.3028, "step": 3528 }, { "epoch": 1.0510992386306521, "grad_norm": 0.20362649857997894, "learning_rate": 1.9703941712446495e-05, "loss": 1.3033, "step": 3529 }, { "epoch": 1.0513970848303207, "grad_norm": 0.20610179007053375, "learning_rate": 1.9703708698762226e-05, "loss": 1.2986, "step": 3530 }, { "epoch": 1.0516949310299895, "grad_norm": 0.20430563390254974, "learning_rate": 1.9703475594795792e-05, "loss": 1.311, "step": 3531 }, { "epoch": 1.051992777229658, "grad_norm": 0.20857155323028564, "learning_rate": 1.9703242400549358e-05, "loss": 1.3088, "step": 3532 }, { "epoch": 1.0522906234293268, "grad_norm": 0.19734951853752136, "learning_rate": 1.970300911602509e-05, "loss": 1.3023, "step": 3533 }, { "epoch": 1.0525884696289953, "grad_norm": 0.20390652120113373, "learning_rate": 1.970277574122517e-05, "loss": 1.2888, "step": 3534 }, { "epoch": 1.0528863158286639, "grad_norm": 0.20076511800289154, "learning_rate": 1.970254227615176e-05, "loss": 1.3049, "step": 3535 }, { "epoch": 1.0531841620283326, "grad_norm": 0.19973789155483246, "learning_rate": 1.970230872080703e-05, "loss": 1.3008, "step": 3536 }, { "epoch": 1.0534820082280012, "grad_norm": 0.20469968020915985, "learning_rate": 1.9702075075193162e-05, "loss": 1.3116, "step": 3537 }, { "epoch": 1.05377985442767, "grad_norm": 0.19541417062282562, "learning_rate": 1.9701841339312326e-05, "loss": 1.3026, "step": 3538 }, { "epoch": 1.0540777006273385, "grad_norm": 0.2089960277080536, "learning_rate": 1.9701607513166695e-05, "loss": 1.3073, "step": 3539 }, { "epoch": 1.0543755468270073, "grad_norm": 0.22149527072906494, "learning_rate": 1.9701373596758442e-05, "loss": 1.2924, "step": 3540 }, { "epoch": 1.0546733930266758, "grad_norm": 0.20074641704559326, "learning_rate": 1.970113959008975e-05, "loss": 1.3005, "step": 3541 }, { "epoch": 1.0549712392263444, "grad_norm": 0.2029009610414505, "learning_rate": 1.9700905493162792e-05, "loss": 1.303, "step": 3542 }, { "epoch": 1.0552690854260132, "grad_norm": 0.20230336487293243, "learning_rate": 1.9700671305979746e-05, "loss": 1.2961, "step": 3543 }, { "epoch": 1.0555669316256817, "grad_norm": 0.1958988606929779, "learning_rate": 1.9700437028542794e-05, "loss": 1.2986, "step": 3544 }, { "epoch": 1.0558647778253505, "grad_norm": 0.20778551697731018, "learning_rate": 1.9700202660854113e-05, "loss": 1.2911, "step": 3545 }, { "epoch": 1.056162624025019, "grad_norm": 0.20250651240348816, "learning_rate": 1.9699968202915884e-05, "loss": 1.3175, "step": 3546 }, { "epoch": 1.0564604702246878, "grad_norm": 0.2022535353899002, "learning_rate": 1.9699733654730285e-05, "loss": 1.2876, "step": 3547 }, { "epoch": 1.0567583164243564, "grad_norm": 0.19783522188663483, "learning_rate": 1.9699499016299506e-05, "loss": 1.2855, "step": 3548 }, { "epoch": 1.0570561626240251, "grad_norm": 0.20000971853733063, "learning_rate": 1.9699264287625722e-05, "loss": 1.312, "step": 3549 }, { "epoch": 1.0573540088236937, "grad_norm": 0.199110746383667, "learning_rate": 1.969902946871112e-05, "loss": 1.3082, "step": 3550 }, { "epoch": 1.0576518550233622, "grad_norm": 0.19644653797149658, "learning_rate": 1.969879455955789e-05, "loss": 1.3039, "step": 3551 }, { "epoch": 1.057949701223031, "grad_norm": 0.2020893692970276, "learning_rate": 1.969855956016821e-05, "loss": 1.2969, "step": 3552 }, { "epoch": 1.0582475474226996, "grad_norm": 0.20060493052005768, "learning_rate": 1.9698324470544268e-05, "loss": 1.2959, "step": 3553 }, { "epoch": 1.0585453936223683, "grad_norm": 0.1976126879453659, "learning_rate": 1.9698089290688253e-05, "loss": 1.2967, "step": 3554 }, { "epoch": 1.0588432398220369, "grad_norm": 0.2165234386920929, "learning_rate": 1.9697854020602353e-05, "loss": 1.2919, "step": 3555 }, { "epoch": 1.0591410860217056, "grad_norm": 0.202513188123703, "learning_rate": 1.9697618660288757e-05, "loss": 1.2934, "step": 3556 }, { "epoch": 1.0594389322213742, "grad_norm": 0.20844443142414093, "learning_rate": 1.9697383209749653e-05, "loss": 1.3099, "step": 3557 }, { "epoch": 1.0597367784210427, "grad_norm": 0.1990274339914322, "learning_rate": 1.9697147668987235e-05, "loss": 1.3027, "step": 3558 }, { "epoch": 1.0600346246207115, "grad_norm": 0.19725582003593445, "learning_rate": 1.969691203800369e-05, "loss": 1.2981, "step": 3559 }, { "epoch": 1.06033247082038, "grad_norm": 0.2047271728515625, "learning_rate": 1.9696676316801213e-05, "loss": 1.2986, "step": 3560 }, { "epoch": 1.0606303170200488, "grad_norm": 0.21688127517700195, "learning_rate": 1.9696440505381997e-05, "loss": 1.3113, "step": 3561 }, { "epoch": 1.0609281632197174, "grad_norm": 0.2020910382270813, "learning_rate": 1.9696204603748236e-05, "loss": 1.3008, "step": 3562 }, { "epoch": 1.0612260094193862, "grad_norm": 0.19921623170375824, "learning_rate": 1.9695968611902122e-05, "loss": 1.3083, "step": 3563 }, { "epoch": 1.0615238556190547, "grad_norm": 0.21965418756008148, "learning_rate": 1.9695732529845854e-05, "loss": 1.3077, "step": 3564 }, { "epoch": 1.0618217018187233, "grad_norm": 0.2067146599292755, "learning_rate": 1.969549635758163e-05, "loss": 1.3171, "step": 3565 }, { "epoch": 1.062119548018392, "grad_norm": 0.1980430632829666, "learning_rate": 1.9695260095111644e-05, "loss": 1.2916, "step": 3566 }, { "epoch": 1.0624173942180606, "grad_norm": 0.20229408144950867, "learning_rate": 1.9695023742438093e-05, "loss": 1.281, "step": 3567 }, { "epoch": 1.0627152404177294, "grad_norm": 0.20833523571491241, "learning_rate": 1.9694787299563177e-05, "loss": 1.3001, "step": 3568 }, { "epoch": 1.063013086617398, "grad_norm": 0.1953403651714325, "learning_rate": 1.96945507664891e-05, "loss": 1.2966, "step": 3569 }, { "epoch": 1.0633109328170667, "grad_norm": 0.20932403206825256, "learning_rate": 1.969431414321806e-05, "loss": 1.3006, "step": 3570 }, { "epoch": 1.0636087790167352, "grad_norm": 0.19709676504135132, "learning_rate": 1.9694077429752258e-05, "loss": 1.2995, "step": 3571 }, { "epoch": 1.0639066252164038, "grad_norm": 0.20356795191764832, "learning_rate": 1.969384062609389e-05, "loss": 1.2836, "step": 3572 }, { "epoch": 1.0642044714160726, "grad_norm": 0.19794374704360962, "learning_rate": 1.9693603732245176e-05, "loss": 1.2843, "step": 3573 }, { "epoch": 1.064502317615741, "grad_norm": 0.19218191504478455, "learning_rate": 1.9693366748208303e-05, "loss": 1.3023, "step": 3574 }, { "epoch": 1.0648001638154099, "grad_norm": 0.2103659212589264, "learning_rate": 1.9693129673985484e-05, "loss": 1.3042, "step": 3575 }, { "epoch": 1.0650980100150784, "grad_norm": 0.18991057574748993, "learning_rate": 1.969289250957892e-05, "loss": 1.2992, "step": 3576 }, { "epoch": 1.0653958562147472, "grad_norm": 0.2115504890680313, "learning_rate": 1.9692655254990824e-05, "loss": 1.2892, "step": 3577 }, { "epoch": 1.0656937024144157, "grad_norm": 0.200431689620018, "learning_rate": 1.96924179102234e-05, "loss": 1.2873, "step": 3578 }, { "epoch": 1.0659915486140843, "grad_norm": 0.20939333736896515, "learning_rate": 1.9692180475278853e-05, "loss": 1.3031, "step": 3579 }, { "epoch": 1.066289394813753, "grad_norm": 0.20650987327098846, "learning_rate": 1.96919429501594e-05, "loss": 1.3008, "step": 3580 }, { "epoch": 1.0665872410134216, "grad_norm": 0.20073242485523224, "learning_rate": 1.9691705334867246e-05, "loss": 1.3007, "step": 3581 }, { "epoch": 1.0668850872130904, "grad_norm": 0.19522999227046967, "learning_rate": 1.9691467629404602e-05, "loss": 1.2886, "step": 3582 }, { "epoch": 1.067182933412759, "grad_norm": 0.2064772993326187, "learning_rate": 1.9691229833773678e-05, "loss": 1.3139, "step": 3583 }, { "epoch": 1.0674807796124277, "grad_norm": 0.2075064331293106, "learning_rate": 1.9690991947976686e-05, "loss": 1.2917, "step": 3584 }, { "epoch": 1.0677786258120963, "grad_norm": 0.2000180035829544, "learning_rate": 1.9690753972015846e-05, "loss": 1.3011, "step": 3585 }, { "epoch": 1.0680764720117648, "grad_norm": 0.20665602385997772, "learning_rate": 1.9690515905893367e-05, "loss": 1.2927, "step": 3586 }, { "epoch": 1.0683743182114336, "grad_norm": 0.20087812840938568, "learning_rate": 1.969027774961146e-05, "loss": 1.2936, "step": 3587 }, { "epoch": 1.0686721644111021, "grad_norm": 0.19530273973941803, "learning_rate": 1.9690039503172346e-05, "loss": 1.2681, "step": 3588 }, { "epoch": 1.068970010610771, "grad_norm": 0.2068195939064026, "learning_rate": 1.9689801166578247e-05, "loss": 1.2885, "step": 3589 }, { "epoch": 1.0692678568104395, "grad_norm": 0.2021002173423767, "learning_rate": 1.9689562739831368e-05, "loss": 1.2993, "step": 3590 }, { "epoch": 1.0695657030101082, "grad_norm": 0.20480947196483612, "learning_rate": 1.9689324222933936e-05, "loss": 1.3172, "step": 3591 }, { "epoch": 1.0698635492097768, "grad_norm": 0.20652180910110474, "learning_rate": 1.9689085615888166e-05, "loss": 1.2853, "step": 3592 }, { "epoch": 1.0701613954094455, "grad_norm": 0.20586484670639038, "learning_rate": 1.968884691869628e-05, "loss": 1.3097, "step": 3593 }, { "epoch": 1.070459241609114, "grad_norm": 0.1960112303495407, "learning_rate": 1.96886081313605e-05, "loss": 1.3006, "step": 3594 }, { "epoch": 1.0707570878087826, "grad_norm": 0.22065342962741852, "learning_rate": 1.9688369253883043e-05, "loss": 1.3018, "step": 3595 }, { "epoch": 1.0710549340084514, "grad_norm": 0.21142372488975525, "learning_rate": 1.968813028626614e-05, "loss": 1.3154, "step": 3596 }, { "epoch": 1.07135278020812, "grad_norm": 0.2077939659357071, "learning_rate": 1.9687891228512003e-05, "loss": 1.2903, "step": 3597 }, { "epoch": 1.0716506264077887, "grad_norm": 0.19702668488025665, "learning_rate": 1.9687652080622866e-05, "loss": 1.311, "step": 3598 }, { "epoch": 1.0719484726074573, "grad_norm": 0.19933748245239258, "learning_rate": 1.968741284260095e-05, "loss": 1.2832, "step": 3599 }, { "epoch": 1.072246318807126, "grad_norm": 0.2061723917722702, "learning_rate": 1.968717351444848e-05, "loss": 1.2934, "step": 3600 }, { "epoch": 1.0725441650067946, "grad_norm": 0.2108973264694214, "learning_rate": 1.968693409616768e-05, "loss": 1.2809, "step": 3601 }, { "epoch": 1.0728420112064632, "grad_norm": 0.2182292491197586, "learning_rate": 1.9686694587760786e-05, "loss": 1.304, "step": 3602 }, { "epoch": 1.073139857406132, "grad_norm": 0.21988658607006073, "learning_rate": 1.968645498923002e-05, "loss": 1.3043, "step": 3603 }, { "epoch": 1.0734377036058005, "grad_norm": 0.21043458580970764, "learning_rate": 1.9686215300577613e-05, "loss": 1.3036, "step": 3604 }, { "epoch": 1.0737355498054693, "grad_norm": 0.1996757984161377, "learning_rate": 1.9685975521805793e-05, "loss": 1.3026, "step": 3605 }, { "epoch": 1.0740333960051378, "grad_norm": 0.20943333208560944, "learning_rate": 1.9685735652916797e-05, "loss": 1.2773, "step": 3606 }, { "epoch": 1.0743312422048066, "grad_norm": 0.20317253470420837, "learning_rate": 1.9685495693912846e-05, "loss": 1.3079, "step": 3607 }, { "epoch": 1.0746290884044751, "grad_norm": 0.20812860131263733, "learning_rate": 1.9685255644796184e-05, "loss": 1.3113, "step": 3608 }, { "epoch": 1.074926934604144, "grad_norm": 0.2107335329055786, "learning_rate": 1.9685015505569036e-05, "loss": 1.3022, "step": 3609 }, { "epoch": 1.0752247808038125, "grad_norm": 0.2116023153066635, "learning_rate": 1.968477527623364e-05, "loss": 1.2959, "step": 3610 }, { "epoch": 1.075522627003481, "grad_norm": 0.21139049530029297, "learning_rate": 1.9684534956792233e-05, "loss": 1.2899, "step": 3611 }, { "epoch": 1.0758204732031498, "grad_norm": 0.21621239185333252, "learning_rate": 1.9684294547247046e-05, "loss": 1.3221, "step": 3612 }, { "epoch": 1.0761183194028183, "grad_norm": 0.19819071888923645, "learning_rate": 1.9684054047600315e-05, "loss": 1.2978, "step": 3613 }, { "epoch": 1.076416165602487, "grad_norm": 0.2018895149230957, "learning_rate": 1.968381345785429e-05, "loss": 1.292, "step": 3614 }, { "epoch": 1.0767140118021556, "grad_norm": 0.2259438931941986, "learning_rate": 1.9683572778011193e-05, "loss": 1.2992, "step": 3615 }, { "epoch": 1.0770118580018244, "grad_norm": 0.20852015912532806, "learning_rate": 1.968333200807327e-05, "loss": 1.3029, "step": 3616 }, { "epoch": 1.077309704201493, "grad_norm": 0.21100470423698425, "learning_rate": 1.9683091148042765e-05, "loss": 1.2844, "step": 3617 }, { "epoch": 1.0776075504011615, "grad_norm": 0.1936296671628952, "learning_rate": 1.9682850197921914e-05, "loss": 1.2876, "step": 3618 }, { "epoch": 1.0779053966008303, "grad_norm": 0.2020305097103119, "learning_rate": 1.968260915771296e-05, "loss": 1.2876, "step": 3619 }, { "epoch": 1.0782032428004988, "grad_norm": 0.20538529753684998, "learning_rate": 1.9682368027418147e-05, "loss": 1.2761, "step": 3620 }, { "epoch": 1.0785010890001676, "grad_norm": 0.20383626222610474, "learning_rate": 1.9682126807039714e-05, "loss": 1.293, "step": 3621 }, { "epoch": 1.0787989351998362, "grad_norm": 0.19912081956863403, "learning_rate": 1.9681885496579914e-05, "loss": 1.2898, "step": 3622 }, { "epoch": 1.079096781399505, "grad_norm": 0.2121354043483734, "learning_rate": 1.968164409604098e-05, "loss": 1.3029, "step": 3623 }, { "epoch": 1.0793946275991735, "grad_norm": 0.2046370953321457, "learning_rate": 1.968140260542517e-05, "loss": 1.3002, "step": 3624 }, { "epoch": 1.079692473798842, "grad_norm": 0.2005256861448288, "learning_rate": 1.968116102473472e-05, "loss": 1.2891, "step": 3625 }, { "epoch": 1.0799903199985108, "grad_norm": 0.2049214243888855, "learning_rate": 1.968091935397189e-05, "loss": 1.3096, "step": 3626 }, { "epoch": 1.0802881661981794, "grad_norm": 0.21116481721401215, "learning_rate": 1.968067759313892e-05, "loss": 1.2935, "step": 3627 }, { "epoch": 1.0805860123978481, "grad_norm": 0.2080266922712326, "learning_rate": 1.9680435742238056e-05, "loss": 1.2879, "step": 3628 }, { "epoch": 1.0808838585975167, "grad_norm": 0.19524142146110535, "learning_rate": 1.9680193801271558e-05, "loss": 1.304, "step": 3629 }, { "epoch": 1.0811817047971854, "grad_norm": 0.20208951830863953, "learning_rate": 1.9679951770241673e-05, "loss": 1.2854, "step": 3630 }, { "epoch": 1.081479550996854, "grad_norm": 0.2034522444009781, "learning_rate": 1.9679709649150647e-05, "loss": 1.3, "step": 3631 }, { "epoch": 1.0817773971965225, "grad_norm": 0.2104310393333435, "learning_rate": 1.967946743800074e-05, "loss": 1.2851, "step": 3632 }, { "epoch": 1.0820752433961913, "grad_norm": 0.2176777422428131, "learning_rate": 1.9679225136794203e-05, "loss": 1.3078, "step": 3633 }, { "epoch": 1.0823730895958599, "grad_norm": 0.20740337669849396, "learning_rate": 1.967898274553329e-05, "loss": 1.2937, "step": 3634 }, { "epoch": 1.0826709357955286, "grad_norm": 0.21155376732349396, "learning_rate": 1.9678740264220257e-05, "loss": 1.3129, "step": 3635 }, { "epoch": 1.0829687819951972, "grad_norm": 0.21064534783363342, "learning_rate": 1.967849769285736e-05, "loss": 1.3073, "step": 3636 }, { "epoch": 1.083266628194866, "grad_norm": 0.21301575005054474, "learning_rate": 1.9678255031446855e-05, "loss": 1.2812, "step": 3637 }, { "epoch": 1.0835644743945345, "grad_norm": 0.21204115450382233, "learning_rate": 1.9678012279991e-05, "loss": 1.3044, "step": 3638 }, { "epoch": 1.083862320594203, "grad_norm": 0.2081066370010376, "learning_rate": 1.9677769438492055e-05, "loss": 1.2933, "step": 3639 }, { "epoch": 1.0841601667938718, "grad_norm": 0.20554843544960022, "learning_rate": 1.9677526506952275e-05, "loss": 1.2992, "step": 3640 }, { "epoch": 1.0844580129935404, "grad_norm": 0.2064088135957718, "learning_rate": 1.9677283485373923e-05, "loss": 1.307, "step": 3641 }, { "epoch": 1.0847558591932092, "grad_norm": 0.20249561965465546, "learning_rate": 1.9677040373759266e-05, "loss": 1.2989, "step": 3642 }, { "epoch": 1.0850537053928777, "grad_norm": 0.20630252361297607, "learning_rate": 1.9676797172110552e-05, "loss": 1.3039, "step": 3643 }, { "epoch": 1.0853515515925465, "grad_norm": 0.20152369141578674, "learning_rate": 1.9676553880430056e-05, "loss": 1.2981, "step": 3644 }, { "epoch": 1.085649397792215, "grad_norm": 0.2110234946012497, "learning_rate": 1.967631049872004e-05, "loss": 1.3199, "step": 3645 }, { "epoch": 1.0859472439918836, "grad_norm": 0.19750213623046875, "learning_rate": 1.9676067026982762e-05, "loss": 1.2809, "step": 3646 }, { "epoch": 1.0862450901915524, "grad_norm": 0.2041853368282318, "learning_rate": 1.967582346522049e-05, "loss": 1.3162, "step": 3647 }, { "epoch": 1.086542936391221, "grad_norm": 0.20011192560195923, "learning_rate": 1.9675579813435495e-05, "loss": 1.2837, "step": 3648 }, { "epoch": 1.0868407825908897, "grad_norm": 0.20544438064098358, "learning_rate": 1.967533607163004e-05, "loss": 1.2828, "step": 3649 }, { "epoch": 1.0871386287905582, "grad_norm": 0.20484983921051025, "learning_rate": 1.967509223980639e-05, "loss": 1.2918, "step": 3650 }, { "epoch": 1.087436474990227, "grad_norm": 0.21132327616214752, "learning_rate": 1.967484831796682e-05, "loss": 1.2938, "step": 3651 }, { "epoch": 1.0877343211898955, "grad_norm": 0.1959044486284256, "learning_rate": 1.967460430611359e-05, "loss": 1.3045, "step": 3652 }, { "epoch": 1.088032167389564, "grad_norm": 0.20495103299617767, "learning_rate": 1.967436020424898e-05, "loss": 1.276, "step": 3653 }, { "epoch": 1.0883300135892329, "grad_norm": 0.203419491648674, "learning_rate": 1.967411601237526e-05, "loss": 1.3149, "step": 3654 }, { "epoch": 1.0886278597889014, "grad_norm": 0.19674086570739746, "learning_rate": 1.967387173049469e-05, "loss": 1.288, "step": 3655 }, { "epoch": 1.0889257059885702, "grad_norm": 0.21166275441646576, "learning_rate": 1.967362735860956e-05, "loss": 1.3057, "step": 3656 }, { "epoch": 1.0892235521882387, "grad_norm": 0.2079380601644516, "learning_rate": 1.9673382896722134e-05, "loss": 1.2847, "step": 3657 }, { "epoch": 1.0895213983879075, "grad_norm": 0.21181327104568481, "learning_rate": 1.9673138344834686e-05, "loss": 1.3122, "step": 3658 }, { "epoch": 1.089819244587576, "grad_norm": 0.20894701778888702, "learning_rate": 1.9672893702949492e-05, "loss": 1.2882, "step": 3659 }, { "epoch": 1.0901170907872448, "grad_norm": 0.2010115683078766, "learning_rate": 1.9672648971068833e-05, "loss": 1.2841, "step": 3660 }, { "epoch": 1.0904149369869134, "grad_norm": 0.24069705605506897, "learning_rate": 1.967240414919498e-05, "loss": 1.3047, "step": 3661 }, { "epoch": 1.090712783186582, "grad_norm": 0.2139674872159958, "learning_rate": 1.967215923733021e-05, "loss": 1.2796, "step": 3662 }, { "epoch": 1.0910106293862507, "grad_norm": 0.22698570787906647, "learning_rate": 1.967191423547681e-05, "loss": 1.285, "step": 3663 }, { "epoch": 1.0913084755859193, "grad_norm": 0.21402917802333832, "learning_rate": 1.9671669143637054e-05, "loss": 1.3145, "step": 3664 }, { "epoch": 1.091606321785588, "grad_norm": 0.2062504142522812, "learning_rate": 1.9671423961813222e-05, "loss": 1.3042, "step": 3665 }, { "epoch": 1.0919041679852566, "grad_norm": 0.22338880598545074, "learning_rate": 1.9671178690007596e-05, "loss": 1.2936, "step": 3666 }, { "epoch": 1.0922020141849254, "grad_norm": 0.19754505157470703, "learning_rate": 1.9670933328222453e-05, "loss": 1.2981, "step": 3667 }, { "epoch": 1.092499860384594, "grad_norm": 0.21521669626235962, "learning_rate": 1.9670687876460084e-05, "loss": 1.2916, "step": 3668 }, { "epoch": 1.0927977065842625, "grad_norm": 0.22141075134277344, "learning_rate": 1.9670442334722767e-05, "loss": 1.2899, "step": 3669 }, { "epoch": 1.0930955527839312, "grad_norm": 0.20699742436408997, "learning_rate": 1.967019670301279e-05, "loss": 1.2974, "step": 3670 }, { "epoch": 1.0933933989835998, "grad_norm": 0.21716786921024323, "learning_rate": 1.9669950981332436e-05, "loss": 1.293, "step": 3671 }, { "epoch": 1.0936912451832685, "grad_norm": 0.21116264164447784, "learning_rate": 1.9669705169683996e-05, "loss": 1.2923, "step": 3672 }, { "epoch": 1.093989091382937, "grad_norm": 0.2112829089164734, "learning_rate": 1.966945926806975e-05, "loss": 1.3111, "step": 3673 }, { "epoch": 1.0942869375826059, "grad_norm": 0.20704688131809235, "learning_rate": 1.9669213276491988e-05, "loss": 1.2943, "step": 3674 }, { "epoch": 1.0945847837822744, "grad_norm": 0.19894598424434662, "learning_rate": 1.9668967194953e-05, "loss": 1.3108, "step": 3675 }, { "epoch": 1.0948826299819432, "grad_norm": 0.20967112481594086, "learning_rate": 1.9668721023455072e-05, "loss": 1.2952, "step": 3676 }, { "epoch": 1.0951804761816117, "grad_norm": 0.21888650953769684, "learning_rate": 1.96684747620005e-05, "loss": 1.2752, "step": 3677 }, { "epoch": 1.0954783223812803, "grad_norm": 0.21551842987537384, "learning_rate": 1.9668228410591577e-05, "loss": 1.2913, "step": 3678 }, { "epoch": 1.095776168580949, "grad_norm": 0.2061617374420166, "learning_rate": 1.9667981969230586e-05, "loss": 1.2855, "step": 3679 }, { "epoch": 1.0960740147806176, "grad_norm": 0.2146829217672348, "learning_rate": 1.9667735437919826e-05, "loss": 1.2983, "step": 3680 }, { "epoch": 1.0963718609802864, "grad_norm": 0.21890433132648468, "learning_rate": 1.966748881666159e-05, "loss": 1.2913, "step": 3681 }, { "epoch": 1.096669707179955, "grad_norm": 0.217780202627182, "learning_rate": 1.966724210545817e-05, "loss": 1.2984, "step": 3682 }, { "epoch": 1.0969675533796237, "grad_norm": 0.20973043143749237, "learning_rate": 1.966699530431186e-05, "loss": 1.2921, "step": 3683 }, { "epoch": 1.0972653995792923, "grad_norm": 0.2042854130268097, "learning_rate": 1.9666748413224966e-05, "loss": 1.3071, "step": 3684 }, { "epoch": 1.0975632457789608, "grad_norm": 0.21178506314754486, "learning_rate": 1.9666501432199772e-05, "loss": 1.2983, "step": 3685 }, { "epoch": 1.0978610919786296, "grad_norm": 0.21143445372581482, "learning_rate": 1.9666254361238585e-05, "loss": 1.2918, "step": 3686 }, { "epoch": 1.0981589381782981, "grad_norm": 0.21675217151641846, "learning_rate": 1.9666007200343702e-05, "loss": 1.2936, "step": 3687 }, { "epoch": 1.098456784377967, "grad_norm": 0.22217164933681488, "learning_rate": 1.9665759949517424e-05, "loss": 1.2981, "step": 3688 }, { "epoch": 1.0987546305776354, "grad_norm": 0.20536118745803833, "learning_rate": 1.9665512608762043e-05, "loss": 1.2848, "step": 3689 }, { "epoch": 1.0990524767773042, "grad_norm": 0.2172800451517105, "learning_rate": 1.966526517807987e-05, "loss": 1.3013, "step": 3690 }, { "epoch": 1.0993503229769728, "grad_norm": 0.20742656290531158, "learning_rate": 1.96650176574732e-05, "loss": 1.2808, "step": 3691 }, { "epoch": 1.0996481691766413, "grad_norm": 0.22240039706230164, "learning_rate": 1.966477004694434e-05, "loss": 1.3126, "step": 3692 }, { "epoch": 1.09994601537631, "grad_norm": 0.20464514195919037, "learning_rate": 1.9664522346495593e-05, "loss": 1.2789, "step": 3693 }, { "epoch": 1.1002438615759786, "grad_norm": 0.21234962344169617, "learning_rate": 1.9664274556129266e-05, "loss": 1.3126, "step": 3694 }, { "epoch": 1.1005417077756474, "grad_norm": 0.20635056495666504, "learning_rate": 1.966402667584766e-05, "loss": 1.2924, "step": 3695 }, { "epoch": 1.100839553975316, "grad_norm": 0.19965322315692902, "learning_rate": 1.9663778705653082e-05, "loss": 1.3111, "step": 3696 }, { "epoch": 1.1011374001749847, "grad_norm": 0.2301105260848999, "learning_rate": 1.9663530645547842e-05, "loss": 1.3003, "step": 3697 }, { "epoch": 1.1014352463746533, "grad_norm": 0.2165602147579193, "learning_rate": 1.9663282495534247e-05, "loss": 1.3017, "step": 3698 }, { "epoch": 1.1017330925743218, "grad_norm": 0.19816917181015015, "learning_rate": 1.9663034255614602e-05, "loss": 1.3024, "step": 3699 }, { "epoch": 1.1020309387739906, "grad_norm": 0.21041910350322723, "learning_rate": 1.966278592579122e-05, "loss": 1.302, "step": 3700 }, { "epoch": 1.1023287849736592, "grad_norm": 0.20530866086483002, "learning_rate": 1.966253750606641e-05, "loss": 1.2768, "step": 3701 }, { "epoch": 1.102626631173328, "grad_norm": 0.20916764438152313, "learning_rate": 1.966228899644249e-05, "loss": 1.2983, "step": 3702 }, { "epoch": 1.1029244773729965, "grad_norm": 0.206373929977417, "learning_rate": 1.966204039692176e-05, "loss": 1.2997, "step": 3703 }, { "epoch": 1.1032223235726653, "grad_norm": 0.1992659568786621, "learning_rate": 1.966179170750654e-05, "loss": 1.3076, "step": 3704 }, { "epoch": 1.1035201697723338, "grad_norm": 0.20194512605667114, "learning_rate": 1.9661542928199144e-05, "loss": 1.2932, "step": 3705 }, { "epoch": 1.1038180159720024, "grad_norm": 0.2105811983346939, "learning_rate": 1.9661294059001884e-05, "loss": 1.2847, "step": 3706 }, { "epoch": 1.1041158621716711, "grad_norm": 0.21456459164619446, "learning_rate": 1.966104509991708e-05, "loss": 1.3056, "step": 3707 }, { "epoch": 1.1044137083713397, "grad_norm": 0.2006983458995819, "learning_rate": 1.9660796050947045e-05, "loss": 1.2968, "step": 3708 }, { "epoch": 1.1047115545710084, "grad_norm": 0.19508807361125946, "learning_rate": 1.9660546912094095e-05, "loss": 1.2826, "step": 3709 }, { "epoch": 1.105009400770677, "grad_norm": 0.19879397749900818, "learning_rate": 1.9660297683360548e-05, "loss": 1.3001, "step": 3710 }, { "epoch": 1.1053072469703458, "grad_norm": 0.21688780188560486, "learning_rate": 1.9660048364748724e-05, "loss": 1.3096, "step": 3711 }, { "epoch": 1.1056050931700143, "grad_norm": 0.20888207852840424, "learning_rate": 1.9659798956260948e-05, "loss": 1.3002, "step": 3712 }, { "epoch": 1.1059029393696829, "grad_norm": 0.20637017488479614, "learning_rate": 1.965954945789953e-05, "loss": 1.3093, "step": 3713 }, { "epoch": 1.1062007855693516, "grad_norm": 0.20435267686843872, "learning_rate": 1.96592998696668e-05, "loss": 1.2938, "step": 3714 }, { "epoch": 1.1064986317690202, "grad_norm": 0.21587617695331573, "learning_rate": 1.9659050191565076e-05, "loss": 1.2859, "step": 3715 }, { "epoch": 1.106796477968689, "grad_norm": 0.20713822543621063, "learning_rate": 1.9658800423596676e-05, "loss": 1.2925, "step": 3716 }, { "epoch": 1.1070943241683575, "grad_norm": 0.20413658022880554, "learning_rate": 1.965855056576394e-05, "loss": 1.2846, "step": 3717 }, { "epoch": 1.1073921703680263, "grad_norm": 0.2183109074831009, "learning_rate": 1.9658300618069175e-05, "loss": 1.3003, "step": 3718 }, { "epoch": 1.1076900165676948, "grad_norm": 0.2023807168006897, "learning_rate": 1.9658050580514712e-05, "loss": 1.303, "step": 3719 }, { "epoch": 1.1079878627673634, "grad_norm": 0.20855534076690674, "learning_rate": 1.9657800453102884e-05, "loss": 1.3078, "step": 3720 }, { "epoch": 1.1082857089670322, "grad_norm": 0.2180613875389099, "learning_rate": 1.9657550235836012e-05, "loss": 1.2914, "step": 3721 }, { "epoch": 1.1085835551667007, "grad_norm": 0.20898592472076416, "learning_rate": 1.9657299928716423e-05, "loss": 1.2891, "step": 3722 }, { "epoch": 1.1088814013663695, "grad_norm": 0.2195393294095993, "learning_rate": 1.965704953174645e-05, "loss": 1.2995, "step": 3723 }, { "epoch": 1.109179247566038, "grad_norm": 0.2105409801006317, "learning_rate": 1.965679904492842e-05, "loss": 1.2744, "step": 3724 }, { "epoch": 1.1094770937657068, "grad_norm": 0.20014241337776184, "learning_rate": 1.9656548468264664e-05, "loss": 1.3026, "step": 3725 }, { "epoch": 1.1097749399653754, "grad_norm": 0.21112357079982758, "learning_rate": 1.9656297801757514e-05, "loss": 1.2834, "step": 3726 }, { "epoch": 1.1100727861650441, "grad_norm": 0.2157319337129593, "learning_rate": 1.9656047045409302e-05, "loss": 1.287, "step": 3727 }, { "epoch": 1.1103706323647127, "grad_norm": 0.20597907900810242, "learning_rate": 1.9655796199222357e-05, "loss": 1.3017, "step": 3728 }, { "epoch": 1.1106684785643812, "grad_norm": 0.20393458008766174, "learning_rate": 1.965554526319902e-05, "loss": 1.2901, "step": 3729 }, { "epoch": 1.11096632476405, "grad_norm": 0.2154955267906189, "learning_rate": 1.9655294237341622e-05, "loss": 1.2982, "step": 3730 }, { "epoch": 1.1112641709637185, "grad_norm": 0.20532436668872833, "learning_rate": 1.9655043121652496e-05, "loss": 1.2765, "step": 3731 }, { "epoch": 1.1115620171633873, "grad_norm": 0.20799566805362701, "learning_rate": 1.9654791916133986e-05, "loss": 1.2956, "step": 3732 }, { "epoch": 1.1118598633630559, "grad_norm": 0.22477100789546967, "learning_rate": 1.965454062078842e-05, "loss": 1.2966, "step": 3733 }, { "epoch": 1.1121577095627246, "grad_norm": 0.20537179708480835, "learning_rate": 1.965428923561814e-05, "loss": 1.3057, "step": 3734 }, { "epoch": 1.1124555557623932, "grad_norm": 0.20808099210262299, "learning_rate": 1.9654037760625486e-05, "loss": 1.3127, "step": 3735 }, { "epoch": 1.1127534019620617, "grad_norm": 0.20625001192092896, "learning_rate": 1.9653786195812797e-05, "loss": 1.2887, "step": 3736 }, { "epoch": 1.1130512481617305, "grad_norm": 0.22057880461215973, "learning_rate": 1.9653534541182412e-05, "loss": 1.3152, "step": 3737 }, { "epoch": 1.113349094361399, "grad_norm": 0.21256573498249054, "learning_rate": 1.9653282796736677e-05, "loss": 1.3075, "step": 3738 }, { "epoch": 1.1136469405610678, "grad_norm": 0.21121853590011597, "learning_rate": 1.965303096247793e-05, "loss": 1.2952, "step": 3739 }, { "epoch": 1.1139447867607364, "grad_norm": 0.20226754248142242, "learning_rate": 1.965277903840851e-05, "loss": 1.287, "step": 3740 }, { "epoch": 1.1142426329604052, "grad_norm": 0.21369782090187073, "learning_rate": 1.965252702453077e-05, "loss": 1.3082, "step": 3741 }, { "epoch": 1.1145404791600737, "grad_norm": 0.1950555294752121, "learning_rate": 1.965227492084705e-05, "loss": 1.2965, "step": 3742 }, { "epoch": 1.1148383253597425, "grad_norm": 0.2038709968328476, "learning_rate": 1.96520227273597e-05, "loss": 1.2898, "step": 3743 }, { "epoch": 1.115136171559411, "grad_norm": 0.2121773660182953, "learning_rate": 1.9651770444071058e-05, "loss": 1.3005, "step": 3744 }, { "epoch": 1.1154340177590796, "grad_norm": 0.21432389318943024, "learning_rate": 1.9651518070983474e-05, "loss": 1.2952, "step": 3745 }, { "epoch": 1.1157318639587483, "grad_norm": 0.20774976909160614, "learning_rate": 1.96512656080993e-05, "loss": 1.2983, "step": 3746 }, { "epoch": 1.116029710158417, "grad_norm": 0.1962873339653015, "learning_rate": 1.9651013055420882e-05, "loss": 1.2897, "step": 3747 }, { "epoch": 1.1163275563580857, "grad_norm": 0.1967422515153885, "learning_rate": 1.965076041295057e-05, "loss": 1.2972, "step": 3748 }, { "epoch": 1.1166254025577542, "grad_norm": 0.20895633101463318, "learning_rate": 1.9650507680690716e-05, "loss": 1.2871, "step": 3749 }, { "epoch": 1.116923248757423, "grad_norm": 0.24089834094047546, "learning_rate": 1.965025485864367e-05, "loss": 1.2985, "step": 3750 }, { "epoch": 1.1172210949570915, "grad_norm": 0.21199411153793335, "learning_rate": 1.9650001946811784e-05, "loss": 1.3011, "step": 3751 }, { "epoch": 1.11751894115676, "grad_norm": 0.21411864459514618, "learning_rate": 1.9649748945197412e-05, "loss": 1.2979, "step": 3752 }, { "epoch": 1.1178167873564289, "grad_norm": 0.20356041193008423, "learning_rate": 1.9649495853802907e-05, "loss": 1.2948, "step": 3753 }, { "epoch": 1.1181146335560974, "grad_norm": 0.20256462693214417, "learning_rate": 1.9649242672630625e-05, "loss": 1.2915, "step": 3754 }, { "epoch": 1.1184124797557662, "grad_norm": 0.20421351492404938, "learning_rate": 1.964898940168292e-05, "loss": 1.2914, "step": 3755 }, { "epoch": 1.1187103259554347, "grad_norm": 0.20901718735694885, "learning_rate": 1.964873604096215e-05, "loss": 1.2886, "step": 3756 }, { "epoch": 1.1190081721551035, "grad_norm": 0.19340622425079346, "learning_rate": 1.9648482590470666e-05, "loss": 1.2966, "step": 3757 }, { "epoch": 1.119306018354772, "grad_norm": 0.206883043050766, "learning_rate": 1.9648229050210838e-05, "loss": 1.308, "step": 3758 }, { "epoch": 1.1196038645544406, "grad_norm": 0.20688460767269135, "learning_rate": 1.9647975420185016e-05, "loss": 1.3051, "step": 3759 }, { "epoch": 1.1199017107541094, "grad_norm": 0.20681655406951904, "learning_rate": 1.964772170039556e-05, "loss": 1.3133, "step": 3760 }, { "epoch": 1.120199556953778, "grad_norm": 0.20772404968738556, "learning_rate": 1.9647467890844836e-05, "loss": 1.301, "step": 3761 }, { "epoch": 1.1204974031534467, "grad_norm": 0.20942550897598267, "learning_rate": 1.9647213991535202e-05, "loss": 1.2888, "step": 3762 }, { "epoch": 1.1207952493531153, "grad_norm": 0.20836666226387024, "learning_rate": 1.9646960002469017e-05, "loss": 1.2976, "step": 3763 }, { "epoch": 1.121093095552784, "grad_norm": 0.2175648808479309, "learning_rate": 1.964670592364865e-05, "loss": 1.2924, "step": 3764 }, { "epoch": 1.1213909417524526, "grad_norm": 0.21817858517169952, "learning_rate": 1.9646451755076464e-05, "loss": 1.294, "step": 3765 }, { "epoch": 1.1216887879521211, "grad_norm": 0.21252726018428802, "learning_rate": 1.964619749675482e-05, "loss": 1.2988, "step": 3766 }, { "epoch": 1.12198663415179, "grad_norm": 0.23812422156333923, "learning_rate": 1.9645943148686082e-05, "loss": 1.2986, "step": 3767 }, { "epoch": 1.1222844803514584, "grad_norm": 0.22017017006874084, "learning_rate": 1.9645688710872622e-05, "loss": 1.2837, "step": 3768 }, { "epoch": 1.1225823265511272, "grad_norm": 0.21784719824790955, "learning_rate": 1.9645434183316808e-05, "loss": 1.3051, "step": 3769 }, { "epoch": 1.1228801727507958, "grad_norm": 0.2230665236711502, "learning_rate": 1.9645179566021007e-05, "loss": 1.304, "step": 3770 }, { "epoch": 1.1231780189504645, "grad_norm": 0.20696522295475006, "learning_rate": 1.9644924858987582e-05, "loss": 1.2892, "step": 3771 }, { "epoch": 1.123475865150133, "grad_norm": 0.23003578186035156, "learning_rate": 1.964467006221891e-05, "loss": 1.3022, "step": 3772 }, { "epoch": 1.1237737113498016, "grad_norm": 0.20720861852169037, "learning_rate": 1.9644415175717356e-05, "loss": 1.2933, "step": 3773 }, { "epoch": 1.1240715575494704, "grad_norm": 0.20392398536205292, "learning_rate": 1.9644160199485297e-05, "loss": 1.2834, "step": 3774 }, { "epoch": 1.124369403749139, "grad_norm": 0.20635534822940826, "learning_rate": 1.9643905133525102e-05, "loss": 1.2945, "step": 3775 }, { "epoch": 1.1246672499488077, "grad_norm": 0.20737838745117188, "learning_rate": 1.964364997783914e-05, "loss": 1.2704, "step": 3776 }, { "epoch": 1.1249650961484763, "grad_norm": 0.20279735326766968, "learning_rate": 1.9643394732429795e-05, "loss": 1.2932, "step": 3777 }, { "epoch": 1.125262942348145, "grad_norm": 0.21164770424365997, "learning_rate": 1.9643139397299437e-05, "loss": 1.3012, "step": 3778 }, { "epoch": 1.1255607885478136, "grad_norm": 0.2219165563583374, "learning_rate": 1.9642883972450434e-05, "loss": 1.295, "step": 3779 }, { "epoch": 1.1258586347474822, "grad_norm": 0.2005193829536438, "learning_rate": 1.9642628457885175e-05, "loss": 1.2954, "step": 3780 }, { "epoch": 1.126156480947151, "grad_norm": 0.2079988270998001, "learning_rate": 1.964237285360603e-05, "loss": 1.2879, "step": 3781 }, { "epoch": 1.1264543271468195, "grad_norm": 0.20791326463222504, "learning_rate": 1.964211715961538e-05, "loss": 1.2914, "step": 3782 }, { "epoch": 1.1267521733464883, "grad_norm": 0.20197267830371857, "learning_rate": 1.96418613759156e-05, "loss": 1.2985, "step": 3783 }, { "epoch": 1.1270500195461568, "grad_norm": 0.2005607932806015, "learning_rate": 1.9641605502509074e-05, "loss": 1.3072, "step": 3784 }, { "epoch": 1.1273478657458256, "grad_norm": 0.21449939906597137, "learning_rate": 1.9641349539398182e-05, "loss": 1.3081, "step": 3785 }, { "epoch": 1.1276457119454941, "grad_norm": 0.20632421970367432, "learning_rate": 1.96410934865853e-05, "loss": 1.3078, "step": 3786 }, { "epoch": 1.1279435581451627, "grad_norm": 0.21048493683338165, "learning_rate": 1.9640837344072825e-05, "loss": 1.28, "step": 3787 }, { "epoch": 1.1282414043448314, "grad_norm": 0.2121082842350006, "learning_rate": 1.964058111186312e-05, "loss": 1.301, "step": 3788 }, { "epoch": 1.1285392505445, "grad_norm": 0.19674839079380035, "learning_rate": 1.964032478995858e-05, "loss": 1.2883, "step": 3789 }, { "epoch": 1.1288370967441688, "grad_norm": 0.2155827283859253, "learning_rate": 1.9640068378361592e-05, "loss": 1.2841, "step": 3790 }, { "epoch": 1.1291349429438373, "grad_norm": 0.20801493525505066, "learning_rate": 1.9639811877074537e-05, "loss": 1.2914, "step": 3791 }, { "epoch": 1.129432789143506, "grad_norm": 0.20542113482952118, "learning_rate": 1.9639555286099802e-05, "loss": 1.3002, "step": 3792 }, { "epoch": 1.1297306353431746, "grad_norm": 0.201734721660614, "learning_rate": 1.9639298605439775e-05, "loss": 1.3046, "step": 3793 }, { "epoch": 1.1300284815428434, "grad_norm": 0.20444509387016296, "learning_rate": 1.9639041835096845e-05, "loss": 1.2954, "step": 3794 }, { "epoch": 1.130326327742512, "grad_norm": 0.20374253392219543, "learning_rate": 1.96387849750734e-05, "loss": 1.2746, "step": 3795 }, { "epoch": 1.1306241739421805, "grad_norm": 0.21780647337436676, "learning_rate": 1.963852802537183e-05, "loss": 1.3071, "step": 3796 }, { "epoch": 1.1309220201418493, "grad_norm": 0.20837095379829407, "learning_rate": 1.9638270985994526e-05, "loss": 1.2804, "step": 3797 }, { "epoch": 1.1312198663415178, "grad_norm": 0.20445789396762848, "learning_rate": 1.963801385694388e-05, "loss": 1.3028, "step": 3798 }, { "epoch": 1.1315177125411866, "grad_norm": 0.2184823602437973, "learning_rate": 1.963775663822228e-05, "loss": 1.3017, "step": 3799 }, { "epoch": 1.1318155587408552, "grad_norm": 0.21296125650405884, "learning_rate": 1.9637499329832123e-05, "loss": 1.2998, "step": 3800 }, { "epoch": 1.132113404940524, "grad_norm": 0.20897722244262695, "learning_rate": 1.9637241931775803e-05, "loss": 1.2891, "step": 3801 }, { "epoch": 1.1324112511401925, "grad_norm": 0.20662549138069153, "learning_rate": 1.9636984444055716e-05, "loss": 1.2835, "step": 3802 }, { "epoch": 1.1327090973398612, "grad_norm": 0.20672594010829926, "learning_rate": 1.963672686667425e-05, "loss": 1.2908, "step": 3803 }, { "epoch": 1.1330069435395298, "grad_norm": 0.21389494836330414, "learning_rate": 1.9636469199633813e-05, "loss": 1.3081, "step": 3804 }, { "epoch": 1.1333047897391983, "grad_norm": 0.20938587188720703, "learning_rate": 1.9636211442936798e-05, "loss": 1.2825, "step": 3805 }, { "epoch": 1.1336026359388671, "grad_norm": 0.21103520691394806, "learning_rate": 1.9635953596585597e-05, "loss": 1.287, "step": 3806 }, { "epoch": 1.1339004821385357, "grad_norm": 0.20821918547153473, "learning_rate": 1.9635695660582617e-05, "loss": 1.292, "step": 3807 }, { "epoch": 1.1341983283382044, "grad_norm": 0.20691712200641632, "learning_rate": 1.9635437634930252e-05, "loss": 1.305, "step": 3808 }, { "epoch": 1.134496174537873, "grad_norm": 0.19941256940364838, "learning_rate": 1.9635179519630905e-05, "loss": 1.2947, "step": 3809 }, { "epoch": 1.1347940207375418, "grad_norm": 0.20479938387870789, "learning_rate": 1.963492131468698e-05, "loss": 1.2878, "step": 3810 }, { "epoch": 1.1350918669372103, "grad_norm": 0.2046506404876709, "learning_rate": 1.9634663020100877e-05, "loss": 1.2918, "step": 3811 }, { "epoch": 1.1353897131368789, "grad_norm": 0.21905964612960815, "learning_rate": 1.9634404635874996e-05, "loss": 1.3204, "step": 3812 }, { "epoch": 1.1356875593365476, "grad_norm": 0.214660182595253, "learning_rate": 1.963414616201175e-05, "loss": 1.291, "step": 3813 }, { "epoch": 1.1359854055362162, "grad_norm": 0.20889678597450256, "learning_rate": 1.963388759851353e-05, "loss": 1.3095, "step": 3814 }, { "epoch": 1.136283251735885, "grad_norm": 0.21485598385334015, "learning_rate": 1.9633628945382754e-05, "loss": 1.3088, "step": 3815 }, { "epoch": 1.1365810979355535, "grad_norm": 0.2200709581375122, "learning_rate": 1.9633370202621823e-05, "loss": 1.2874, "step": 3816 }, { "epoch": 1.1368789441352223, "grad_norm": 0.22136938571929932, "learning_rate": 1.9633111370233148e-05, "loss": 1.288, "step": 3817 }, { "epoch": 1.1371767903348908, "grad_norm": 0.20491695404052734, "learning_rate": 1.963285244821913e-05, "loss": 1.2945, "step": 3818 }, { "epoch": 1.1374746365345594, "grad_norm": 0.20777413249015808, "learning_rate": 1.9632593436582187e-05, "loss": 1.2855, "step": 3819 }, { "epoch": 1.1377724827342282, "grad_norm": 0.21913692355155945, "learning_rate": 1.9632334335324723e-05, "loss": 1.2853, "step": 3820 }, { "epoch": 1.1380703289338967, "grad_norm": 0.19954827427864075, "learning_rate": 1.9632075144449146e-05, "loss": 1.2795, "step": 3821 }, { "epoch": 1.1383681751335655, "grad_norm": 0.2131366729736328, "learning_rate": 1.9631815863957873e-05, "loss": 1.2922, "step": 3822 }, { "epoch": 1.138666021333234, "grad_norm": 0.2245870679616928, "learning_rate": 1.9631556493853317e-05, "loss": 1.289, "step": 3823 }, { "epoch": 1.1389638675329028, "grad_norm": 0.21591255068778992, "learning_rate": 1.9631297034137886e-05, "loss": 1.2894, "step": 3824 }, { "epoch": 1.1392617137325713, "grad_norm": 0.20875537395477295, "learning_rate": 1.9631037484814e-05, "loss": 1.2887, "step": 3825 }, { "epoch": 1.13955955993224, "grad_norm": 0.2085467278957367, "learning_rate": 1.9630777845884068e-05, "loss": 1.2922, "step": 3826 }, { "epoch": 1.1398574061319087, "grad_norm": 0.20918689668178558, "learning_rate": 1.963051811735051e-05, "loss": 1.2929, "step": 3827 }, { "epoch": 1.1401552523315772, "grad_norm": 0.22871707379817963, "learning_rate": 1.963025829921574e-05, "loss": 1.2919, "step": 3828 }, { "epoch": 1.140453098531246, "grad_norm": 0.21244798600673676, "learning_rate": 1.9629998391482177e-05, "loss": 1.2905, "step": 3829 }, { "epoch": 1.1407509447309145, "grad_norm": 0.2155030071735382, "learning_rate": 1.9629738394152237e-05, "loss": 1.2915, "step": 3830 }, { "epoch": 1.1410487909305833, "grad_norm": 0.20495347678661346, "learning_rate": 1.962947830722834e-05, "loss": 1.283, "step": 3831 }, { "epoch": 1.1413466371302519, "grad_norm": 0.23412112891674042, "learning_rate": 1.9629218130712906e-05, "loss": 1.2975, "step": 3832 }, { "epoch": 1.1416444833299204, "grad_norm": 0.21597813069820404, "learning_rate": 1.962895786460836e-05, "loss": 1.2876, "step": 3833 }, { "epoch": 1.1419423295295892, "grad_norm": 0.23608894646167755, "learning_rate": 1.9628697508917113e-05, "loss": 1.2987, "step": 3834 }, { "epoch": 1.1422401757292577, "grad_norm": 0.20092350244522095, "learning_rate": 1.9628437063641595e-05, "loss": 1.2761, "step": 3835 }, { "epoch": 1.1425380219289265, "grad_norm": 0.2102275788784027, "learning_rate": 1.9628176528784228e-05, "loss": 1.3009, "step": 3836 }, { "epoch": 1.142835868128595, "grad_norm": 0.22043468058109283, "learning_rate": 1.9627915904347435e-05, "loss": 1.2758, "step": 3837 }, { "epoch": 1.1431337143282638, "grad_norm": 0.21429385244846344, "learning_rate": 1.9627655190333645e-05, "loss": 1.2837, "step": 3838 }, { "epoch": 1.1434315605279324, "grad_norm": 0.207056924700737, "learning_rate": 1.9627394386745274e-05, "loss": 1.2877, "step": 3839 }, { "epoch": 1.143729406727601, "grad_norm": 0.21268786489963531, "learning_rate": 1.962713349358476e-05, "loss": 1.2794, "step": 3840 }, { "epoch": 1.1440272529272697, "grad_norm": 0.20578525960445404, "learning_rate": 1.9626872510854525e-05, "loss": 1.3017, "step": 3841 }, { "epoch": 1.1443250991269382, "grad_norm": 0.21620962023735046, "learning_rate": 1.9626611438556997e-05, "loss": 1.2905, "step": 3842 }, { "epoch": 1.144622945326607, "grad_norm": 0.20550653338432312, "learning_rate": 1.96263502766946e-05, "loss": 1.3023, "step": 3843 }, { "epoch": 1.1449207915262756, "grad_norm": 0.20768411457538605, "learning_rate": 1.9626089025269773e-05, "loss": 1.2964, "step": 3844 }, { "epoch": 1.1452186377259443, "grad_norm": 0.2171637862920761, "learning_rate": 1.9625827684284943e-05, "loss": 1.3013, "step": 3845 }, { "epoch": 1.145516483925613, "grad_norm": 0.2186325192451477, "learning_rate": 1.962556625374254e-05, "loss": 1.2831, "step": 3846 }, { "epoch": 1.1458143301252814, "grad_norm": 0.2084307074546814, "learning_rate": 1.9625304733644998e-05, "loss": 1.2874, "step": 3847 }, { "epoch": 1.1461121763249502, "grad_norm": 0.217898890376091, "learning_rate": 1.9625043123994748e-05, "loss": 1.2932, "step": 3848 }, { "epoch": 1.1464100225246188, "grad_norm": 0.20947939157485962, "learning_rate": 1.9624781424794226e-05, "loss": 1.2984, "step": 3849 }, { "epoch": 1.1467078687242875, "grad_norm": 0.21262046694755554, "learning_rate": 1.9624519636045866e-05, "loss": 1.2939, "step": 3850 }, { "epoch": 1.147005714923956, "grad_norm": 0.22141948342323303, "learning_rate": 1.9624257757752104e-05, "loss": 1.2946, "step": 3851 }, { "epoch": 1.1473035611236249, "grad_norm": 0.21174995601177216, "learning_rate": 1.9623995789915374e-05, "loss": 1.3007, "step": 3852 }, { "epoch": 1.1476014073232934, "grad_norm": 0.21072399616241455, "learning_rate": 1.9623733732538118e-05, "loss": 1.2946, "step": 3853 }, { "epoch": 1.147899253522962, "grad_norm": 0.2091841995716095, "learning_rate": 1.9623471585622774e-05, "loss": 1.3003, "step": 3854 }, { "epoch": 1.1481970997226307, "grad_norm": 0.20846165716648102, "learning_rate": 1.9623209349171775e-05, "loss": 1.298, "step": 3855 }, { "epoch": 1.1484949459222993, "grad_norm": 0.21168039739131927, "learning_rate": 1.962294702318757e-05, "loss": 1.279, "step": 3856 }, { "epoch": 1.148792792121968, "grad_norm": 0.21227902173995972, "learning_rate": 1.962268460767259e-05, "loss": 1.3035, "step": 3857 }, { "epoch": 1.1490906383216366, "grad_norm": 0.4189324378967285, "learning_rate": 1.9622422102629284e-05, "loss": 1.2847, "step": 3858 }, { "epoch": 1.1493884845213054, "grad_norm": 0.20862701535224915, "learning_rate": 1.9622159508060087e-05, "loss": 1.2975, "step": 3859 }, { "epoch": 1.149686330720974, "grad_norm": 0.21685558557510376, "learning_rate": 1.962189682396745e-05, "loss": 1.2963, "step": 3860 }, { "epoch": 1.1499841769206427, "grad_norm": 0.20306533575057983, "learning_rate": 1.9621634050353813e-05, "loss": 1.2913, "step": 3861 }, { "epoch": 1.1502820231203112, "grad_norm": 0.20939095318317413, "learning_rate": 1.962137118722162e-05, "loss": 1.2862, "step": 3862 }, { "epoch": 1.15057986931998, "grad_norm": 0.20440231263637543, "learning_rate": 1.9621108234573316e-05, "loss": 1.2678, "step": 3863 }, { "epoch": 1.1508777155196486, "grad_norm": 0.21635262668132782, "learning_rate": 1.962084519241135e-05, "loss": 1.298, "step": 3864 }, { "epoch": 1.1511755617193171, "grad_norm": 0.21755145490169525, "learning_rate": 1.9620582060738172e-05, "loss": 1.3014, "step": 3865 }, { "epoch": 1.151473407918986, "grad_norm": 0.19997747242450714, "learning_rate": 1.9620318839556223e-05, "loss": 1.293, "step": 3866 }, { "epoch": 1.1517712541186544, "grad_norm": 0.20665189623832703, "learning_rate": 1.9620055528867957e-05, "loss": 1.2842, "step": 3867 }, { "epoch": 1.1520691003183232, "grad_norm": 0.20914733409881592, "learning_rate": 1.961979212867582e-05, "loss": 1.2974, "step": 3868 }, { "epoch": 1.1523669465179918, "grad_norm": 0.21082919836044312, "learning_rate": 1.961952863898227e-05, "loss": 1.2705, "step": 3869 }, { "epoch": 1.1526647927176605, "grad_norm": 0.22354228794574738, "learning_rate": 1.961926505978975e-05, "loss": 1.3124, "step": 3870 }, { "epoch": 1.152962638917329, "grad_norm": 0.21259479224681854, "learning_rate": 1.9619001391100715e-05, "loss": 1.2724, "step": 3871 }, { "epoch": 1.1532604851169976, "grad_norm": 0.20475426316261292, "learning_rate": 1.961873763291762e-05, "loss": 1.2936, "step": 3872 }, { "epoch": 1.1535583313166664, "grad_norm": 0.2088763266801834, "learning_rate": 1.961847378524292e-05, "loss": 1.301, "step": 3873 }, { "epoch": 1.153856177516335, "grad_norm": 0.2091834992170334, "learning_rate": 1.9618209848079066e-05, "loss": 1.3026, "step": 3874 }, { "epoch": 1.1541540237160037, "grad_norm": 0.21396473050117493, "learning_rate": 1.9617945821428517e-05, "loss": 1.2975, "step": 3875 }, { "epoch": 1.1544518699156723, "grad_norm": 0.19916091859340668, "learning_rate": 1.961768170529373e-05, "loss": 1.281, "step": 3876 }, { "epoch": 1.154749716115341, "grad_norm": 0.20929083228111267, "learning_rate": 1.9617417499677152e-05, "loss": 1.298, "step": 3877 }, { "epoch": 1.1550475623150096, "grad_norm": 0.21480098366737366, "learning_rate": 1.9617153204581256e-05, "loss": 1.2833, "step": 3878 }, { "epoch": 1.1553454085146782, "grad_norm": 0.2138931155204773, "learning_rate": 1.9616888820008492e-05, "loss": 1.2994, "step": 3879 }, { "epoch": 1.155643254714347, "grad_norm": 0.21063470840454102, "learning_rate": 1.9616624345961324e-05, "loss": 1.292, "step": 3880 }, { "epoch": 1.1559411009140155, "grad_norm": 0.21075375378131866, "learning_rate": 1.961635978244221e-05, "loss": 1.2809, "step": 3881 }, { "epoch": 1.1562389471136842, "grad_norm": 0.2145451158285141, "learning_rate": 1.961609512945361e-05, "loss": 1.2942, "step": 3882 }, { "epoch": 1.1565367933133528, "grad_norm": 0.21413937211036682, "learning_rate": 1.961583038699799e-05, "loss": 1.2944, "step": 3883 }, { "epoch": 1.1568346395130216, "grad_norm": 0.21136625111103058, "learning_rate": 1.9615565555077817e-05, "loss": 1.2964, "step": 3884 }, { "epoch": 1.1571324857126901, "grad_norm": 0.22657860815525055, "learning_rate": 1.9615300633695545e-05, "loss": 1.2982, "step": 3885 }, { "epoch": 1.1574303319123587, "grad_norm": 0.21359318494796753, "learning_rate": 1.9615035622853643e-05, "loss": 1.3067, "step": 3886 }, { "epoch": 1.1577281781120274, "grad_norm": 0.19199714064598083, "learning_rate": 1.9614770522554576e-05, "loss": 1.3028, "step": 3887 }, { "epoch": 1.158026024311696, "grad_norm": 0.23190546035766602, "learning_rate": 1.9614505332800814e-05, "loss": 1.2931, "step": 3888 }, { "epoch": 1.1583238705113648, "grad_norm": 0.2225271463394165, "learning_rate": 1.961424005359482e-05, "loss": 1.307, "step": 3889 }, { "epoch": 1.1586217167110333, "grad_norm": 0.22097532451152802, "learning_rate": 1.9613974684939062e-05, "loss": 1.2974, "step": 3890 }, { "epoch": 1.158919562910702, "grad_norm": 0.21925872564315796, "learning_rate": 1.9613709226836016e-05, "loss": 1.2807, "step": 3891 }, { "epoch": 1.1592174091103706, "grad_norm": 0.20450474321842194, "learning_rate": 1.9613443679288144e-05, "loss": 1.3048, "step": 3892 }, { "epoch": 1.1595152553100392, "grad_norm": 0.2073204219341278, "learning_rate": 1.961317804229792e-05, "loss": 1.2886, "step": 3893 }, { "epoch": 1.159813101509708, "grad_norm": 0.19831877946853638, "learning_rate": 1.9612912315867815e-05, "loss": 1.3044, "step": 3894 }, { "epoch": 1.1601109477093765, "grad_norm": 0.21263408660888672, "learning_rate": 1.96126465000003e-05, "loss": 1.2783, "step": 3895 }, { "epoch": 1.1604087939090453, "grad_norm": 0.21071314811706543, "learning_rate": 1.9612380594697852e-05, "loss": 1.2862, "step": 3896 }, { "epoch": 1.1607066401087138, "grad_norm": 0.2105439007282257, "learning_rate": 1.961211459996294e-05, "loss": 1.2792, "step": 3897 }, { "epoch": 1.1610044863083826, "grad_norm": 0.2100057452917099, "learning_rate": 1.961184851579804e-05, "loss": 1.2865, "step": 3898 }, { "epoch": 1.1613023325080511, "grad_norm": 0.20325833559036255, "learning_rate": 1.961158234220563e-05, "loss": 1.2874, "step": 3899 }, { "epoch": 1.1616001787077197, "grad_norm": 0.19613565504550934, "learning_rate": 1.9611316079188185e-05, "loss": 1.2889, "step": 3900 }, { "epoch": 1.1618980249073885, "grad_norm": 0.2009575217962265, "learning_rate": 1.961104972674818e-05, "loss": 1.3017, "step": 3901 }, { "epoch": 1.162195871107057, "grad_norm": 0.19757536053657532, "learning_rate": 1.96107832848881e-05, "loss": 1.3061, "step": 3902 }, { "epoch": 1.1624937173067258, "grad_norm": 0.2167089879512787, "learning_rate": 1.9610516753610412e-05, "loss": 1.2811, "step": 3903 }, { "epoch": 1.1627915635063943, "grad_norm": 0.20637845993041992, "learning_rate": 1.961025013291761e-05, "loss": 1.2984, "step": 3904 }, { "epoch": 1.1630894097060631, "grad_norm": 0.203271746635437, "learning_rate": 1.9609983422812163e-05, "loss": 1.2832, "step": 3905 }, { "epoch": 1.1633872559057317, "grad_norm": 0.20609252154827118, "learning_rate": 1.9609716623296563e-05, "loss": 1.2721, "step": 3906 }, { "epoch": 1.1636851021054002, "grad_norm": 0.2045542150735855, "learning_rate": 1.9609449734373282e-05, "loss": 1.2894, "step": 3907 }, { "epoch": 1.163982948305069, "grad_norm": 0.20650260150432587, "learning_rate": 1.960918275604481e-05, "loss": 1.2896, "step": 3908 }, { "epoch": 1.1642807945047375, "grad_norm": 0.19946768879890442, "learning_rate": 1.9608915688313626e-05, "loss": 1.303, "step": 3909 }, { "epoch": 1.1645786407044063, "grad_norm": 0.2126103639602661, "learning_rate": 1.9608648531182217e-05, "loss": 1.2986, "step": 3910 }, { "epoch": 1.1648764869040749, "grad_norm": 0.2057417929172516, "learning_rate": 1.960838128465307e-05, "loss": 1.2979, "step": 3911 }, { "epoch": 1.1651743331037436, "grad_norm": 0.22044256329536438, "learning_rate": 1.960811394872867e-05, "loss": 1.3012, "step": 3912 }, { "epoch": 1.1654721793034122, "grad_norm": 0.19745013117790222, "learning_rate": 1.9607846523411506e-05, "loss": 1.307, "step": 3913 }, { "epoch": 1.1657700255030807, "grad_norm": 0.2026630938053131, "learning_rate": 1.9607579008704064e-05, "loss": 1.299, "step": 3914 }, { "epoch": 1.1660678717027495, "grad_norm": 0.21625575423240662, "learning_rate": 1.960731140460883e-05, "loss": 1.2911, "step": 3915 }, { "epoch": 1.166365717902418, "grad_norm": 0.2037813812494278, "learning_rate": 1.96070437111283e-05, "loss": 1.3124, "step": 3916 }, { "epoch": 1.1666635641020868, "grad_norm": 0.22128376364707947, "learning_rate": 1.9606775928264964e-05, "loss": 1.2894, "step": 3917 }, { "epoch": 1.1669614103017554, "grad_norm": 0.21458415687084198, "learning_rate": 1.960650805602131e-05, "loss": 1.2915, "step": 3918 }, { "epoch": 1.1672592565014241, "grad_norm": 0.19575026631355286, "learning_rate": 1.960624009439983e-05, "loss": 1.2744, "step": 3919 }, { "epoch": 1.1675571027010927, "grad_norm": 0.22207599878311157, "learning_rate": 1.960597204340302e-05, "loss": 1.2786, "step": 3920 }, { "epoch": 1.1678549489007612, "grad_norm": 0.21496036648750305, "learning_rate": 1.9605703903033374e-05, "loss": 1.3091, "step": 3921 }, { "epoch": 1.16815279510043, "grad_norm": 0.2081436961889267, "learning_rate": 1.9605435673293384e-05, "loss": 1.2839, "step": 3922 }, { "epoch": 1.1684506413000986, "grad_norm": 0.20992983877658844, "learning_rate": 1.9605167354185542e-05, "loss": 1.3063, "step": 3923 }, { "epoch": 1.1687484874997673, "grad_norm": 0.21274542808532715, "learning_rate": 1.9604898945712357e-05, "loss": 1.2795, "step": 3924 }, { "epoch": 1.169046333699436, "grad_norm": 0.21605077385902405, "learning_rate": 1.9604630447876315e-05, "loss": 1.293, "step": 3925 }, { "epoch": 1.1693441798991047, "grad_norm": 0.20542150735855103, "learning_rate": 1.960436186067992e-05, "loss": 1.2992, "step": 3926 }, { "epoch": 1.1696420260987732, "grad_norm": 0.20835378766059875, "learning_rate": 1.9604093184125666e-05, "loss": 1.3003, "step": 3927 }, { "epoch": 1.169939872298442, "grad_norm": 0.21384067833423615, "learning_rate": 1.9603824418216052e-05, "loss": 1.2728, "step": 3928 }, { "epoch": 1.1702377184981105, "grad_norm": 0.2059001922607422, "learning_rate": 1.9603555562953587e-05, "loss": 1.289, "step": 3929 }, { "epoch": 1.1705355646977793, "grad_norm": 0.20917369425296783, "learning_rate": 1.9603286618340768e-05, "loss": 1.2876, "step": 3930 }, { "epoch": 1.1708334108974479, "grad_norm": 0.21927359700202942, "learning_rate": 1.9603017584380094e-05, "loss": 1.2995, "step": 3931 }, { "epoch": 1.1711312570971164, "grad_norm": 0.20671169459819794, "learning_rate": 1.960274846107407e-05, "loss": 1.2934, "step": 3932 }, { "epoch": 1.1714291032967852, "grad_norm": 0.21301668882369995, "learning_rate": 1.96024792484252e-05, "loss": 1.2877, "step": 3933 }, { "epoch": 1.1717269494964537, "grad_norm": 0.2051594853401184, "learning_rate": 1.960220994643599e-05, "loss": 1.2832, "step": 3934 }, { "epoch": 1.1720247956961225, "grad_norm": 0.21233688294887543, "learning_rate": 1.9601940555108943e-05, "loss": 1.2885, "step": 3935 }, { "epoch": 1.172322641895791, "grad_norm": 0.21571063995361328, "learning_rate": 1.9601671074446572e-05, "loss": 1.3206, "step": 3936 }, { "epoch": 1.1726204880954598, "grad_norm": 0.20769110321998596, "learning_rate": 1.9601401504451374e-05, "loss": 1.2917, "step": 3937 }, { "epoch": 1.1729183342951284, "grad_norm": 0.21504393219947815, "learning_rate": 1.9601131845125865e-05, "loss": 1.3085, "step": 3938 }, { "epoch": 1.173216180494797, "grad_norm": 0.2108887881040573, "learning_rate": 1.960086209647255e-05, "loss": 1.2993, "step": 3939 }, { "epoch": 1.1735140266944657, "grad_norm": 0.2132851630449295, "learning_rate": 1.960059225849394e-05, "loss": 1.3069, "step": 3940 }, { "epoch": 1.1738118728941342, "grad_norm": 0.20661744475364685, "learning_rate": 1.960032233119255e-05, "loss": 1.2708, "step": 3941 }, { "epoch": 1.174109719093803, "grad_norm": 0.19309358298778534, "learning_rate": 1.960005231457088e-05, "loss": 1.2986, "step": 3942 }, { "epoch": 1.1744075652934716, "grad_norm": 0.2165200263261795, "learning_rate": 1.959978220863145e-05, "loss": 1.2891, "step": 3943 }, { "epoch": 1.1747054114931403, "grad_norm": 0.22003872692584991, "learning_rate": 1.9599512013376775e-05, "loss": 1.2956, "step": 3944 }, { "epoch": 1.175003257692809, "grad_norm": 0.21869656443595886, "learning_rate": 1.9599241728809363e-05, "loss": 1.2977, "step": 3945 }, { "epoch": 1.1753011038924774, "grad_norm": 0.20122909545898438, "learning_rate": 1.9598971354931735e-05, "loss": 1.3045, "step": 3946 }, { "epoch": 1.1755989500921462, "grad_norm": 0.19450651109218597, "learning_rate": 1.9598700891746403e-05, "loss": 1.2806, "step": 3947 }, { "epoch": 1.1758967962918148, "grad_norm": 0.23108182847499847, "learning_rate": 1.959843033925588e-05, "loss": 1.3044, "step": 3948 }, { "epoch": 1.1761946424914835, "grad_norm": 0.21403539180755615, "learning_rate": 1.959815969746269e-05, "loss": 1.2963, "step": 3949 }, { "epoch": 1.176492488691152, "grad_norm": 0.21135477721691132, "learning_rate": 1.9597888966369347e-05, "loss": 1.2954, "step": 3950 }, { "epoch": 1.1767903348908209, "grad_norm": 0.21040703356266022, "learning_rate": 1.959761814597837e-05, "loss": 1.2879, "step": 3951 }, { "epoch": 1.1770881810904894, "grad_norm": 0.21072392165660858, "learning_rate": 1.959734723629228e-05, "loss": 1.2762, "step": 3952 }, { "epoch": 1.177386027290158, "grad_norm": 0.22450478374958038, "learning_rate": 1.9597076237313594e-05, "loss": 1.3059, "step": 3953 }, { "epoch": 1.1776838734898267, "grad_norm": 0.23020806908607483, "learning_rate": 1.9596805149044838e-05, "loss": 1.2826, "step": 3954 }, { "epoch": 1.1779817196894953, "grad_norm": 0.21168041229248047, "learning_rate": 1.9596533971488533e-05, "loss": 1.3038, "step": 3955 }, { "epoch": 1.178279565889164, "grad_norm": 0.2160586565732956, "learning_rate": 1.95962627046472e-05, "loss": 1.2917, "step": 3956 }, { "epoch": 1.1785774120888326, "grad_norm": 0.2052982598543167, "learning_rate": 1.9595991348523363e-05, "loss": 1.2921, "step": 3957 }, { "epoch": 1.1788752582885014, "grad_norm": 0.21587428450584412, "learning_rate": 1.9595719903119554e-05, "loss": 1.3024, "step": 3958 }, { "epoch": 1.17917310448817, "grad_norm": 0.22295457124710083, "learning_rate": 1.9595448368438285e-05, "loss": 1.293, "step": 3959 }, { "epoch": 1.1794709506878385, "grad_norm": 0.20518071949481964, "learning_rate": 1.9595176744482095e-05, "loss": 1.2778, "step": 3960 }, { "epoch": 1.1797687968875072, "grad_norm": 0.21127352118492126, "learning_rate": 1.95949050312535e-05, "loss": 1.2959, "step": 3961 }, { "epoch": 1.1800666430871758, "grad_norm": 0.21096312999725342, "learning_rate": 1.9594633228755038e-05, "loss": 1.2829, "step": 3962 }, { "epoch": 1.1803644892868446, "grad_norm": 0.21224980056285858, "learning_rate": 1.9594361336989232e-05, "loss": 1.3133, "step": 3963 }, { "epoch": 1.1806623354865131, "grad_norm": 0.22840841114521027, "learning_rate": 1.9594089355958612e-05, "loss": 1.3053, "step": 3964 }, { "epoch": 1.1809601816861819, "grad_norm": 0.21935367584228516, "learning_rate": 1.9593817285665712e-05, "loss": 1.2869, "step": 3965 }, { "epoch": 1.1812580278858504, "grad_norm": 0.21192151308059692, "learning_rate": 1.9593545126113063e-05, "loss": 1.2963, "step": 3966 }, { "epoch": 1.181555874085519, "grad_norm": 0.2130948156118393, "learning_rate": 1.9593272877303192e-05, "loss": 1.3048, "step": 3967 }, { "epoch": 1.1818537202851878, "grad_norm": 0.21849285066127777, "learning_rate": 1.9593000539238637e-05, "loss": 1.299, "step": 3968 }, { "epoch": 1.1821515664848563, "grad_norm": 0.22445747256278992, "learning_rate": 1.9592728111921926e-05, "loss": 1.2823, "step": 3969 }, { "epoch": 1.182449412684525, "grad_norm": 0.21361754834651947, "learning_rate": 1.95924555953556e-05, "loss": 1.2737, "step": 3970 }, { "epoch": 1.1827472588841936, "grad_norm": 0.21532507240772247, "learning_rate": 1.959218298954219e-05, "loss": 1.3136, "step": 3971 }, { "epoch": 1.1830451050838624, "grad_norm": 0.2088194340467453, "learning_rate": 1.959191029448424e-05, "loss": 1.2899, "step": 3972 }, { "epoch": 1.183342951283531, "grad_norm": 0.2056037336587906, "learning_rate": 1.9591637510184277e-05, "loss": 1.2808, "step": 3973 }, { "epoch": 1.1836407974831995, "grad_norm": 0.20478075742721558, "learning_rate": 1.9591364636644842e-05, "loss": 1.2664, "step": 3974 }, { "epoch": 1.1839386436828683, "grad_norm": 0.2041953206062317, "learning_rate": 1.9591091673868477e-05, "loss": 1.2755, "step": 3975 }, { "epoch": 1.1842364898825368, "grad_norm": 0.2047312706708908, "learning_rate": 1.959081862185772e-05, "loss": 1.2782, "step": 3976 }, { "epoch": 1.1845343360822056, "grad_norm": 0.21848949790000916, "learning_rate": 1.959054548061511e-05, "loss": 1.2986, "step": 3977 }, { "epoch": 1.1848321822818741, "grad_norm": 0.21306787431240082, "learning_rate": 1.9590272250143193e-05, "loss": 1.2924, "step": 3978 }, { "epoch": 1.185130028481543, "grad_norm": 0.21525615453720093, "learning_rate": 1.9589998930444508e-05, "loss": 1.2936, "step": 3979 }, { "epoch": 1.1854278746812115, "grad_norm": 0.21527034044265747, "learning_rate": 1.9589725521521596e-05, "loss": 1.2891, "step": 3980 }, { "epoch": 1.18572572088088, "grad_norm": 0.2184000164270401, "learning_rate": 1.9589452023377e-05, "loss": 1.2918, "step": 3981 }, { "epoch": 1.1860235670805488, "grad_norm": 0.22402919828891754, "learning_rate": 1.9589178436013268e-05, "loss": 1.2926, "step": 3982 }, { "epoch": 1.1863214132802173, "grad_norm": 0.21852335333824158, "learning_rate": 1.9588904759432945e-05, "loss": 1.3042, "step": 3983 }, { "epoch": 1.1866192594798861, "grad_norm": 0.21109206974506378, "learning_rate": 1.9588630993638575e-05, "loss": 1.2826, "step": 3984 }, { "epoch": 1.1869171056795547, "grad_norm": 0.20981113612651825, "learning_rate": 1.9588357138632706e-05, "loss": 1.3027, "step": 3985 }, { "epoch": 1.1872149518792234, "grad_norm": 0.22667832672595978, "learning_rate": 1.958808319441789e-05, "loss": 1.3037, "step": 3986 }, { "epoch": 1.187512798078892, "grad_norm": 0.2064162641763687, "learning_rate": 1.958780916099667e-05, "loss": 1.2846, "step": 3987 }, { "epoch": 1.1878106442785605, "grad_norm": 0.2067207247018814, "learning_rate": 1.95875350383716e-05, "loss": 1.2758, "step": 3988 }, { "epoch": 1.1881084904782293, "grad_norm": 0.21313981711864471, "learning_rate": 1.9587260826545225e-05, "loss": 1.2912, "step": 3989 }, { "epoch": 1.1884063366778979, "grad_norm": 0.21533231437206268, "learning_rate": 1.9586986525520104e-05, "loss": 1.3031, "step": 3990 }, { "epoch": 1.1887041828775666, "grad_norm": 0.2270992547273636, "learning_rate": 1.9586712135298778e-05, "loss": 1.2797, "step": 3991 }, { "epoch": 1.1890020290772352, "grad_norm": 0.21351057291030884, "learning_rate": 1.9586437655883812e-05, "loss": 1.2974, "step": 3992 }, { "epoch": 1.189299875276904, "grad_norm": 0.20550043880939484, "learning_rate": 1.958616308727775e-05, "loss": 1.3004, "step": 3993 }, { "epoch": 1.1895977214765725, "grad_norm": 0.20054568350315094, "learning_rate": 1.9585888429483155e-05, "loss": 1.2796, "step": 3994 }, { "epoch": 1.1898955676762413, "grad_norm": 0.2166435420513153, "learning_rate": 1.9585613682502574e-05, "loss": 1.2906, "step": 3995 }, { "epoch": 1.1901934138759098, "grad_norm": 0.20004740357398987, "learning_rate": 1.9585338846338574e-05, "loss": 1.2846, "step": 3996 }, { "epoch": 1.1904912600755786, "grad_norm": 0.20989029109477997, "learning_rate": 1.9585063920993698e-05, "loss": 1.2887, "step": 3997 }, { "epoch": 1.1907891062752471, "grad_norm": 0.21801543235778809, "learning_rate": 1.9584788906470516e-05, "loss": 1.2885, "step": 3998 }, { "epoch": 1.1910869524749157, "grad_norm": 0.20614711940288544, "learning_rate": 1.958451380277158e-05, "loss": 1.2954, "step": 3999 }, { "epoch": 1.1913847986745845, "grad_norm": 0.2140260934829712, "learning_rate": 1.9584238609899452e-05, "loss": 1.2771, "step": 4000 }, { "epoch": 1.1913847986745845, "eval_loss": 1.355595588684082, "eval_runtime": 20.1193, "eval_samples_per_second": 86.186, "eval_steps_per_second": 5.418, "step": 4000 }, { "epoch": 1.191682644874253, "grad_norm": 0.19806234538555145, "learning_rate": 1.9583963327856696e-05, "loss": 1.2977, "step": 4001 }, { "epoch": 1.1919804910739218, "grad_norm": 0.21819539368152618, "learning_rate": 1.958368795664586e-05, "loss": 1.3065, "step": 4002 }, { "epoch": 1.1922783372735903, "grad_norm": 0.2082003653049469, "learning_rate": 1.958341249626952e-05, "loss": 1.3048, "step": 4003 }, { "epoch": 1.192576183473259, "grad_norm": 0.22938930988311768, "learning_rate": 1.9583136946730237e-05, "loss": 1.3025, "step": 4004 }, { "epoch": 1.1928740296729277, "grad_norm": 0.2137821465730667, "learning_rate": 1.9582861308030567e-05, "loss": 1.293, "step": 4005 }, { "epoch": 1.1931718758725962, "grad_norm": 0.21068082749843597, "learning_rate": 1.9582585580173083e-05, "loss": 1.2854, "step": 4006 }, { "epoch": 1.193469722072265, "grad_norm": 0.21020939946174622, "learning_rate": 1.9582309763160343e-05, "loss": 1.2937, "step": 4007 }, { "epoch": 1.1937675682719335, "grad_norm": 0.21396276354789734, "learning_rate": 1.958203385699492e-05, "loss": 1.2914, "step": 4008 }, { "epoch": 1.1940654144716023, "grad_norm": 0.22837236523628235, "learning_rate": 1.9581757861679372e-05, "loss": 1.287, "step": 4009 }, { "epoch": 1.1943632606712709, "grad_norm": 0.21628418564796448, "learning_rate": 1.9581481777216277e-05, "loss": 1.3058, "step": 4010 }, { "epoch": 1.1946611068709396, "grad_norm": 0.22252708673477173, "learning_rate": 1.95812056036082e-05, "loss": 1.2798, "step": 4011 }, { "epoch": 1.1949589530706082, "grad_norm": 0.2062803953886032, "learning_rate": 1.9580929340857707e-05, "loss": 1.3059, "step": 4012 }, { "epoch": 1.1952567992702767, "grad_norm": 0.2138001173734665, "learning_rate": 1.958065298896737e-05, "loss": 1.2957, "step": 4013 }, { "epoch": 1.1955546454699455, "grad_norm": 0.21340374648571014, "learning_rate": 1.9580376547939763e-05, "loss": 1.3015, "step": 4014 }, { "epoch": 1.195852491669614, "grad_norm": 0.20899172127246857, "learning_rate": 1.9580100017777455e-05, "loss": 1.2774, "step": 4015 }, { "epoch": 1.1961503378692828, "grad_norm": 0.21481002867221832, "learning_rate": 1.957982339848302e-05, "loss": 1.2859, "step": 4016 }, { "epoch": 1.1964481840689514, "grad_norm": 0.20328159630298615, "learning_rate": 1.9579546690059033e-05, "loss": 1.2875, "step": 4017 }, { "epoch": 1.1967460302686201, "grad_norm": 0.22197499871253967, "learning_rate": 1.9579269892508065e-05, "loss": 1.2953, "step": 4018 }, { "epoch": 1.1970438764682887, "grad_norm": 0.2199348658323288, "learning_rate": 1.9578993005832697e-05, "loss": 1.301, "step": 4019 }, { "epoch": 1.1973417226679572, "grad_norm": 0.22500820457935333, "learning_rate": 1.9578716030035497e-05, "loss": 1.2943, "step": 4020 }, { "epoch": 1.197639568867626, "grad_norm": 0.2298382967710495, "learning_rate": 1.957843896511905e-05, "loss": 1.2977, "step": 4021 }, { "epoch": 1.1979374150672946, "grad_norm": 0.22839154303073883, "learning_rate": 1.9578161811085924e-05, "loss": 1.2899, "step": 4022 }, { "epoch": 1.1982352612669633, "grad_norm": 0.2266552597284317, "learning_rate": 1.957788456793871e-05, "loss": 1.283, "step": 4023 }, { "epoch": 1.1985331074666319, "grad_norm": 0.21820247173309326, "learning_rate": 1.957760723567998e-05, "loss": 1.2815, "step": 4024 }, { "epoch": 1.1988309536663007, "grad_norm": 0.2251671552658081, "learning_rate": 1.9577329814312314e-05, "loss": 1.2832, "step": 4025 }, { "epoch": 1.1991287998659692, "grad_norm": 0.22719109058380127, "learning_rate": 1.9577052303838292e-05, "loss": 1.2841, "step": 4026 }, { "epoch": 1.1994266460656378, "grad_norm": 0.2208300679922104, "learning_rate": 1.9576774704260503e-05, "loss": 1.2923, "step": 4027 }, { "epoch": 1.1997244922653065, "grad_norm": 0.23036691546440125, "learning_rate": 1.9576497015581523e-05, "loss": 1.2983, "step": 4028 }, { "epoch": 1.200022338464975, "grad_norm": 0.20784254372119904, "learning_rate": 1.957621923780394e-05, "loss": 1.2892, "step": 4029 }, { "epoch": 1.2003201846646439, "grad_norm": 0.22049713134765625, "learning_rate": 1.9575941370930333e-05, "loss": 1.3079, "step": 4030 }, { "epoch": 1.2006180308643124, "grad_norm": 0.22592094540596008, "learning_rate": 1.957566341496329e-05, "loss": 1.2937, "step": 4031 }, { "epoch": 1.2009158770639812, "grad_norm": 0.22759899497032166, "learning_rate": 1.95753853699054e-05, "loss": 1.2873, "step": 4032 }, { "epoch": 1.2012137232636497, "grad_norm": 0.21229848265647888, "learning_rate": 1.9575107235759248e-05, "loss": 1.2747, "step": 4033 }, { "epoch": 1.2015115694633183, "grad_norm": 0.1977112889289856, "learning_rate": 1.957482901252742e-05, "loss": 1.2851, "step": 4034 }, { "epoch": 1.201809415662987, "grad_norm": 0.2177518755197525, "learning_rate": 1.9574550700212505e-05, "loss": 1.2916, "step": 4035 }, { "epoch": 1.2021072618626556, "grad_norm": 0.2394571155309677, "learning_rate": 1.9574272298817093e-05, "loss": 1.2904, "step": 4036 }, { "epoch": 1.2024051080623244, "grad_norm": 0.21085403859615326, "learning_rate": 1.9573993808343776e-05, "loss": 1.3214, "step": 4037 }, { "epoch": 1.202702954261993, "grad_norm": 0.20579291880130768, "learning_rate": 1.9573715228795142e-05, "loss": 1.2918, "step": 4038 }, { "epoch": 1.2030008004616617, "grad_norm": 0.22272194921970367, "learning_rate": 1.9573436560173784e-05, "loss": 1.307, "step": 4039 }, { "epoch": 1.2032986466613302, "grad_norm": 0.22960400581359863, "learning_rate": 1.9573157802482294e-05, "loss": 1.2914, "step": 4040 }, { "epoch": 1.2035964928609988, "grad_norm": 0.21425718069076538, "learning_rate": 1.9572878955723266e-05, "loss": 1.2935, "step": 4041 }, { "epoch": 1.2038943390606676, "grad_norm": 0.22439956665039062, "learning_rate": 1.95726000198993e-05, "loss": 1.2716, "step": 4042 }, { "epoch": 1.2041921852603361, "grad_norm": 0.21399863064289093, "learning_rate": 1.957232099501298e-05, "loss": 1.2917, "step": 4043 }, { "epoch": 1.2044900314600049, "grad_norm": 0.22308999300003052, "learning_rate": 1.957204188106691e-05, "loss": 1.301, "step": 4044 }, { "epoch": 1.2047878776596734, "grad_norm": 0.23780715465545654, "learning_rate": 1.9571762678063685e-05, "loss": 1.2773, "step": 4045 }, { "epoch": 1.2050857238593422, "grad_norm": 0.2037344127893448, "learning_rate": 1.9571483386005905e-05, "loss": 1.3037, "step": 4046 }, { "epoch": 1.2053835700590108, "grad_norm": 0.20831087231636047, "learning_rate": 1.957120400489616e-05, "loss": 1.2815, "step": 4047 }, { "epoch": 1.2056814162586793, "grad_norm": 0.2218935489654541, "learning_rate": 1.957092453473706e-05, "loss": 1.2883, "step": 4048 }, { "epoch": 1.205979262458348, "grad_norm": 0.22106003761291504, "learning_rate": 1.9570644975531202e-05, "loss": 1.312, "step": 4049 }, { "epoch": 1.2062771086580166, "grad_norm": 0.20549699664115906, "learning_rate": 1.9570365327281184e-05, "loss": 1.2767, "step": 4050 }, { "epoch": 1.2065749548576854, "grad_norm": 0.20258985459804535, "learning_rate": 1.9570085589989605e-05, "loss": 1.2831, "step": 4051 }, { "epoch": 1.206872801057354, "grad_norm": 0.21634162962436676, "learning_rate": 1.9569805763659075e-05, "loss": 1.2911, "step": 4052 }, { "epoch": 1.2071706472570227, "grad_norm": 0.22728785872459412, "learning_rate": 1.9569525848292192e-05, "loss": 1.2984, "step": 4053 }, { "epoch": 1.2074684934566913, "grad_norm": 0.23223990201950073, "learning_rate": 1.9569245843891565e-05, "loss": 1.2949, "step": 4054 }, { "epoch": 1.20776633965636, "grad_norm": 0.22025689482688904, "learning_rate": 1.95689657504598e-05, "loss": 1.293, "step": 4055 }, { "epoch": 1.2080641858560286, "grad_norm": 0.22677235305309296, "learning_rate": 1.9568685567999495e-05, "loss": 1.2875, "step": 4056 }, { "epoch": 1.2083620320556971, "grad_norm": 0.22293587028980255, "learning_rate": 1.9568405296513262e-05, "loss": 1.2921, "step": 4057 }, { "epoch": 1.208659878255366, "grad_norm": 0.22843754291534424, "learning_rate": 1.956812493600371e-05, "loss": 1.2974, "step": 4058 }, { "epoch": 1.2089577244550345, "grad_norm": 0.2277127504348755, "learning_rate": 1.9567844486473447e-05, "loss": 1.2793, "step": 4059 }, { "epoch": 1.2092555706547032, "grad_norm": 0.20787858963012695, "learning_rate": 1.9567563947925075e-05, "loss": 1.2745, "step": 4060 }, { "epoch": 1.2095534168543718, "grad_norm": 0.20440879464149475, "learning_rate": 1.956728332036122e-05, "loss": 1.2835, "step": 4061 }, { "epoch": 1.2098512630540406, "grad_norm": 0.21900875866413116, "learning_rate": 1.956700260378447e-05, "loss": 1.3026, "step": 4062 }, { "epoch": 1.210149109253709, "grad_norm": 0.22136928141117096, "learning_rate": 1.956672179819746e-05, "loss": 1.2818, "step": 4063 }, { "epoch": 1.2104469554533779, "grad_norm": 0.2110716998577118, "learning_rate": 1.9566440903602787e-05, "loss": 1.3016, "step": 4064 }, { "epoch": 1.2107448016530464, "grad_norm": 0.21238602697849274, "learning_rate": 1.956615992000307e-05, "loss": 1.2836, "step": 4065 }, { "epoch": 1.211042647852715, "grad_norm": 0.21743468940258026, "learning_rate": 1.9565878847400924e-05, "loss": 1.2932, "step": 4066 }, { "epoch": 1.2113404940523838, "grad_norm": 0.22191287577152252, "learning_rate": 1.9565597685798965e-05, "loss": 1.287, "step": 4067 }, { "epoch": 1.2116383402520523, "grad_norm": 0.23670993745326996, "learning_rate": 1.9565316435199807e-05, "loss": 1.2826, "step": 4068 }, { "epoch": 1.211936186451721, "grad_norm": 0.21914072334766388, "learning_rate": 1.956503509560606e-05, "loss": 1.276, "step": 4069 }, { "epoch": 1.2122340326513896, "grad_norm": 0.22580796480178833, "learning_rate": 1.9564753667020354e-05, "loss": 1.2866, "step": 4070 }, { "epoch": 1.2125318788510584, "grad_norm": 0.24082769453525543, "learning_rate": 1.95644721494453e-05, "loss": 1.2804, "step": 4071 }, { "epoch": 1.212829725050727, "grad_norm": 0.21872304379940033, "learning_rate": 1.9564190542883522e-05, "loss": 1.3067, "step": 4072 }, { "epoch": 1.2131275712503955, "grad_norm": 0.2314034253358841, "learning_rate": 1.956390884733763e-05, "loss": 1.2861, "step": 4073 }, { "epoch": 1.2134254174500643, "grad_norm": 0.212004154920578, "learning_rate": 1.956362706281026e-05, "loss": 1.2795, "step": 4074 }, { "epoch": 1.2137232636497328, "grad_norm": 0.2215016484260559, "learning_rate": 1.9563345189304016e-05, "loss": 1.289, "step": 4075 }, { "epoch": 1.2140211098494016, "grad_norm": 0.21931003034114838, "learning_rate": 1.9563063226821533e-05, "loss": 1.3087, "step": 4076 }, { "epoch": 1.2143189560490701, "grad_norm": 0.22194616496562958, "learning_rate": 1.9562781175365436e-05, "loss": 1.3011, "step": 4077 }, { "epoch": 1.214616802248739, "grad_norm": 0.2346036732196808, "learning_rate": 1.956249903493834e-05, "loss": 1.2898, "step": 4078 }, { "epoch": 1.2149146484484075, "grad_norm": 0.20830166339874268, "learning_rate": 1.9562216805542876e-05, "loss": 1.2936, "step": 4079 }, { "epoch": 1.215212494648076, "grad_norm": 0.21001702547073364, "learning_rate": 1.9561934487181665e-05, "loss": 1.2792, "step": 4080 }, { "epoch": 1.2155103408477448, "grad_norm": 0.22858309745788574, "learning_rate": 1.956165207985734e-05, "loss": 1.3071, "step": 4081 }, { "epoch": 1.2158081870474133, "grad_norm": 0.237175852060318, "learning_rate": 1.956136958357252e-05, "loss": 1.2896, "step": 4082 }, { "epoch": 1.216106033247082, "grad_norm": 0.21885618567466736, "learning_rate": 1.9561086998329847e-05, "loss": 1.2969, "step": 4083 }, { "epoch": 1.2164038794467507, "grad_norm": 0.21815459430217743, "learning_rate": 1.9560804324131938e-05, "loss": 1.3052, "step": 4084 }, { "epoch": 1.2167017256464194, "grad_norm": 0.22342005372047424, "learning_rate": 1.9560521560981428e-05, "loss": 1.288, "step": 4085 }, { "epoch": 1.216999571846088, "grad_norm": 0.25384339690208435, "learning_rate": 1.9560238708880942e-05, "loss": 1.2987, "step": 4086 }, { "epoch": 1.2172974180457565, "grad_norm": 0.2196073979139328, "learning_rate": 1.955995576783312e-05, "loss": 1.2933, "step": 4087 }, { "epoch": 1.2175952642454253, "grad_norm": 0.22214414179325104, "learning_rate": 1.955967273784059e-05, "loss": 1.298, "step": 4088 }, { "epoch": 1.2178931104450939, "grad_norm": 0.21233679354190826, "learning_rate": 1.9559389618905986e-05, "loss": 1.2773, "step": 4089 }, { "epoch": 1.2181909566447626, "grad_norm": 0.20780721306800842, "learning_rate": 1.955910641103194e-05, "loss": 1.2952, "step": 4090 }, { "epoch": 1.2184888028444312, "grad_norm": 0.20027214288711548, "learning_rate": 1.955882311422109e-05, "loss": 1.2931, "step": 4091 }, { "epoch": 1.2187866490441, "grad_norm": 0.2059791088104248, "learning_rate": 1.9558539728476072e-05, "loss": 1.2821, "step": 4092 }, { "epoch": 1.2190844952437685, "grad_norm": 0.20750513672828674, "learning_rate": 1.9558256253799523e-05, "loss": 1.2948, "step": 4093 }, { "epoch": 1.219382341443437, "grad_norm": 0.20728327333927155, "learning_rate": 1.9557972690194075e-05, "loss": 1.2817, "step": 4094 }, { "epoch": 1.2196801876431058, "grad_norm": 0.21938657760620117, "learning_rate": 1.9557689037662372e-05, "loss": 1.2947, "step": 4095 }, { "epoch": 1.2199780338427744, "grad_norm": 0.21456745266914368, "learning_rate": 1.955740529620705e-05, "loss": 1.2688, "step": 4096 }, { "epoch": 1.2202758800424431, "grad_norm": 0.20828519761562347, "learning_rate": 1.9557121465830747e-05, "loss": 1.2797, "step": 4097 }, { "epoch": 1.2205737262421117, "grad_norm": 0.20779909193515778, "learning_rate": 1.955683754653611e-05, "loss": 1.3035, "step": 4098 }, { "epoch": 1.2208715724417805, "grad_norm": 0.2051311582326889, "learning_rate": 1.955655353832578e-05, "loss": 1.2778, "step": 4099 }, { "epoch": 1.221169418641449, "grad_norm": 0.2165333777666092, "learning_rate": 1.9556269441202392e-05, "loss": 1.3092, "step": 4100 }, { "epoch": 1.2214672648411176, "grad_norm": 0.22863154113292694, "learning_rate": 1.9555985255168593e-05, "loss": 1.2889, "step": 4101 }, { "epoch": 1.2217651110407863, "grad_norm": 0.213213250041008, "learning_rate": 1.9555700980227028e-05, "loss": 1.2863, "step": 4102 }, { "epoch": 1.2220629572404549, "grad_norm": 0.20332065224647522, "learning_rate": 1.9555416616380346e-05, "loss": 1.2952, "step": 4103 }, { "epoch": 1.2223608034401237, "grad_norm": 0.2252652943134308, "learning_rate": 1.9555132163631186e-05, "loss": 1.3054, "step": 4104 }, { "epoch": 1.2226586496397922, "grad_norm": 0.23016788065433502, "learning_rate": 1.9554847621982195e-05, "loss": 1.3008, "step": 4105 }, { "epoch": 1.222956495839461, "grad_norm": 0.21542948484420776, "learning_rate": 1.9554562991436022e-05, "loss": 1.2791, "step": 4106 }, { "epoch": 1.2232543420391295, "grad_norm": 0.23817753791809082, "learning_rate": 1.955427827199532e-05, "loss": 1.2995, "step": 4107 }, { "epoch": 1.223552188238798, "grad_norm": 0.22850258648395538, "learning_rate": 1.955399346366273e-05, "loss": 1.294, "step": 4108 }, { "epoch": 1.2238500344384668, "grad_norm": 0.2405814826488495, "learning_rate": 1.9553708566440904e-05, "loss": 1.2986, "step": 4109 }, { "epoch": 1.2241478806381354, "grad_norm": 0.2581832706928253, "learning_rate": 1.9553423580332495e-05, "loss": 1.2792, "step": 4110 }, { "epoch": 1.2244457268378042, "grad_norm": 0.20295637845993042, "learning_rate": 1.9553138505340156e-05, "loss": 1.2843, "step": 4111 }, { "epoch": 1.2247435730374727, "grad_norm": 0.22549006342887878, "learning_rate": 1.9552853341466532e-05, "loss": 1.3055, "step": 4112 }, { "epoch": 1.2250414192371415, "grad_norm": 0.2101021260023117, "learning_rate": 1.9552568088714283e-05, "loss": 1.29, "step": 4113 }, { "epoch": 1.22533926543681, "grad_norm": 0.2183251976966858, "learning_rate": 1.955228274708606e-05, "loss": 1.2791, "step": 4114 }, { "epoch": 1.2256371116364786, "grad_norm": 0.2154933512210846, "learning_rate": 1.955199731658452e-05, "loss": 1.2951, "step": 4115 }, { "epoch": 1.2259349578361474, "grad_norm": 0.22604544460773468, "learning_rate": 1.9551711797212317e-05, "loss": 1.2906, "step": 4116 }, { "epoch": 1.226232804035816, "grad_norm": 0.21755193173885345, "learning_rate": 1.9551426188972104e-05, "loss": 1.3036, "step": 4117 }, { "epoch": 1.2265306502354847, "grad_norm": 0.21554438769817352, "learning_rate": 1.9551140491866546e-05, "loss": 1.2921, "step": 4118 }, { "epoch": 1.2268284964351532, "grad_norm": 0.22313088178634644, "learning_rate": 1.9550854705898295e-05, "loss": 1.2953, "step": 4119 }, { "epoch": 1.227126342634822, "grad_norm": 0.20665155351161957, "learning_rate": 1.9550568831070013e-05, "loss": 1.2845, "step": 4120 }, { "epoch": 1.2274241888344906, "grad_norm": 0.20775890350341797, "learning_rate": 1.955028286738436e-05, "loss": 1.2737, "step": 4121 }, { "epoch": 1.2277220350341593, "grad_norm": 0.2068718671798706, "learning_rate": 1.954999681484399e-05, "loss": 1.2931, "step": 4122 }, { "epoch": 1.2280198812338279, "grad_norm": 0.20639167726039886, "learning_rate": 1.9549710673451574e-05, "loss": 1.31, "step": 4123 }, { "epoch": 1.2283177274334964, "grad_norm": 0.21572212874889374, "learning_rate": 1.9549424443209768e-05, "loss": 1.2864, "step": 4124 }, { "epoch": 1.2286155736331652, "grad_norm": 0.21003632247447968, "learning_rate": 1.9549138124121236e-05, "loss": 1.2768, "step": 4125 }, { "epoch": 1.2289134198328338, "grad_norm": 0.21323645114898682, "learning_rate": 1.9548851716188645e-05, "loss": 1.2853, "step": 4126 }, { "epoch": 1.2292112660325025, "grad_norm": 0.2155407965183258, "learning_rate": 1.9548565219414658e-05, "loss": 1.2947, "step": 4127 }, { "epoch": 1.229509112232171, "grad_norm": 0.20775429904460907, "learning_rate": 1.9548278633801937e-05, "loss": 1.2916, "step": 4128 }, { "epoch": 1.2298069584318398, "grad_norm": 0.21007290482521057, "learning_rate": 1.9547991959353153e-05, "loss": 1.3014, "step": 4129 }, { "epoch": 1.2301048046315084, "grad_norm": 0.2189212143421173, "learning_rate": 1.954770519607097e-05, "loss": 1.2872, "step": 4130 }, { "epoch": 1.2304026508311772, "grad_norm": 0.21716181933879852, "learning_rate": 1.9547418343958058e-05, "loss": 1.2849, "step": 4131 }, { "epoch": 1.2307004970308457, "grad_norm": 0.20262238383293152, "learning_rate": 1.9547131403017086e-05, "loss": 1.3047, "step": 4132 }, { "epoch": 1.2309983432305143, "grad_norm": 0.20321613550186157, "learning_rate": 1.954684437325072e-05, "loss": 1.2857, "step": 4133 }, { "epoch": 1.231296189430183, "grad_norm": 0.21657630801200867, "learning_rate": 1.9546557254661634e-05, "loss": 1.2935, "step": 4134 }, { "epoch": 1.2315940356298516, "grad_norm": 0.24504989385604858, "learning_rate": 1.9546270047252506e-05, "loss": 1.2841, "step": 4135 }, { "epoch": 1.2318918818295204, "grad_norm": 0.19349680840969086, "learning_rate": 1.9545982751025994e-05, "loss": 1.2751, "step": 4136 }, { "epoch": 1.232189728029189, "grad_norm": 0.30486607551574707, "learning_rate": 1.9545695365984777e-05, "loss": 1.2776, "step": 4137 }, { "epoch": 1.2324875742288577, "grad_norm": 0.22059954702854156, "learning_rate": 1.9545407892131533e-05, "loss": 1.2872, "step": 4138 }, { "epoch": 1.2327854204285262, "grad_norm": 0.22120457887649536, "learning_rate": 1.9545120329468933e-05, "loss": 1.2954, "step": 4139 }, { "epoch": 1.2330832666281948, "grad_norm": 0.21268272399902344, "learning_rate": 1.9544832677999653e-05, "loss": 1.2859, "step": 4140 }, { "epoch": 1.2333811128278636, "grad_norm": 0.21759754419326782, "learning_rate": 1.9544544937726367e-05, "loss": 1.3013, "step": 4141 }, { "epoch": 1.233678959027532, "grad_norm": 0.21478748321533203, "learning_rate": 1.954425710865176e-05, "loss": 1.2927, "step": 4142 }, { "epoch": 1.2339768052272009, "grad_norm": 0.19693058729171753, "learning_rate": 1.9543969190778494e-05, "loss": 1.2988, "step": 4143 }, { "epoch": 1.2342746514268694, "grad_norm": 0.2068811058998108, "learning_rate": 1.9543681184109267e-05, "loss": 1.2864, "step": 4144 }, { "epoch": 1.2345724976265382, "grad_norm": 0.21200990676879883, "learning_rate": 1.9543393088646746e-05, "loss": 1.2916, "step": 4145 }, { "epoch": 1.2348703438262068, "grad_norm": 0.20587991178035736, "learning_rate": 1.9543104904393612e-05, "loss": 1.2882, "step": 4146 }, { "epoch": 1.2351681900258753, "grad_norm": 0.20354631543159485, "learning_rate": 1.9542816631352554e-05, "loss": 1.2864, "step": 4147 }, { "epoch": 1.235466036225544, "grad_norm": 0.21679729223251343, "learning_rate": 1.9542528269526248e-05, "loss": 1.3085, "step": 4148 }, { "epoch": 1.2357638824252126, "grad_norm": 0.24312865734100342, "learning_rate": 1.9542239818917373e-05, "loss": 1.3036, "step": 4149 }, { "epoch": 1.2360617286248814, "grad_norm": 0.2099834382534027, "learning_rate": 1.9541951279528624e-05, "loss": 1.2929, "step": 4150 }, { "epoch": 1.23635957482455, "grad_norm": 0.22744804620742798, "learning_rate": 1.9541662651362677e-05, "loss": 1.3021, "step": 4151 }, { "epoch": 1.2366574210242187, "grad_norm": 0.20686295628547668, "learning_rate": 1.954137393442222e-05, "loss": 1.2835, "step": 4152 }, { "epoch": 1.2369552672238873, "grad_norm": 0.20510515570640564, "learning_rate": 1.9541085128709937e-05, "loss": 1.296, "step": 4153 }, { "epoch": 1.2372531134235558, "grad_norm": 0.2252977192401886, "learning_rate": 1.954079623422852e-05, "loss": 1.29, "step": 4154 }, { "epoch": 1.2375509596232246, "grad_norm": 0.21374380588531494, "learning_rate": 1.954050725098065e-05, "loss": 1.3085, "step": 4155 }, { "epoch": 1.2378488058228931, "grad_norm": 0.2219097912311554, "learning_rate": 1.9540218178969018e-05, "loss": 1.2814, "step": 4156 }, { "epoch": 1.238146652022562, "grad_norm": 0.22387543320655823, "learning_rate": 1.953992901819632e-05, "loss": 1.3054, "step": 4157 }, { "epoch": 1.2384444982222305, "grad_norm": 0.21464106440544128, "learning_rate": 1.953963976866524e-05, "loss": 1.3065, "step": 4158 }, { "epoch": 1.2387423444218992, "grad_norm": 0.21472743153572083, "learning_rate": 1.953935043037847e-05, "loss": 1.302, "step": 4159 }, { "epoch": 1.2390401906215678, "grad_norm": 0.20701003074645996, "learning_rate": 1.95390610033387e-05, "loss": 1.2993, "step": 4160 }, { "epoch": 1.2393380368212363, "grad_norm": 0.21344922482967377, "learning_rate": 1.9538771487548628e-05, "loss": 1.2914, "step": 4161 }, { "epoch": 1.239635883020905, "grad_norm": 0.21337179839611053, "learning_rate": 1.9538481883010943e-05, "loss": 1.3071, "step": 4162 }, { "epoch": 1.2399337292205737, "grad_norm": 0.22430700063705444, "learning_rate": 1.953819218972834e-05, "loss": 1.2926, "step": 4163 }, { "epoch": 1.2402315754202424, "grad_norm": 0.2090141922235489, "learning_rate": 1.9537902407703514e-05, "loss": 1.2822, "step": 4164 }, { "epoch": 1.240529421619911, "grad_norm": 0.21654996275901794, "learning_rate": 1.953761253693917e-05, "loss": 1.2977, "step": 4165 }, { "epoch": 1.2408272678195797, "grad_norm": 0.2089291512966156, "learning_rate": 1.953732257743799e-05, "loss": 1.3051, "step": 4166 }, { "epoch": 1.2411251140192483, "grad_norm": 0.21138688921928406, "learning_rate": 1.953703252920268e-05, "loss": 1.2856, "step": 4167 }, { "epoch": 1.2414229602189168, "grad_norm": 0.23540814220905304, "learning_rate": 1.953674239223594e-05, "loss": 1.2965, "step": 4168 }, { "epoch": 1.2417208064185856, "grad_norm": 0.20688270032405853, "learning_rate": 1.9536452166540468e-05, "loss": 1.2927, "step": 4169 }, { "epoch": 1.2420186526182542, "grad_norm": 0.20564410090446472, "learning_rate": 1.953616185211896e-05, "loss": 1.2848, "step": 4170 }, { "epoch": 1.242316498817923, "grad_norm": 0.2275800108909607, "learning_rate": 1.953587144897412e-05, "loss": 1.2762, "step": 4171 }, { "epoch": 1.2426143450175915, "grad_norm": 0.2123938351869583, "learning_rate": 1.953558095710865e-05, "loss": 1.2932, "step": 4172 }, { "epoch": 1.2429121912172603, "grad_norm": 0.24131610989570618, "learning_rate": 1.953529037652526e-05, "loss": 1.2655, "step": 4173 }, { "epoch": 1.2432100374169288, "grad_norm": 0.2212681919336319, "learning_rate": 1.953499970722664e-05, "loss": 1.2736, "step": 4174 }, { "epoch": 1.2435078836165974, "grad_norm": 0.21585984528064728, "learning_rate": 1.9534708949215505e-05, "loss": 1.2828, "step": 4175 }, { "epoch": 1.2438057298162661, "grad_norm": 0.20756615698337555, "learning_rate": 1.9534418102494554e-05, "loss": 1.2779, "step": 4176 }, { "epoch": 1.2441035760159347, "grad_norm": 0.20586632192134857, "learning_rate": 1.9534127167066495e-05, "loss": 1.2708, "step": 4177 }, { "epoch": 1.2444014222156035, "grad_norm": 0.21256816387176514, "learning_rate": 1.9533836142934033e-05, "loss": 1.2897, "step": 4178 }, { "epoch": 1.244699268415272, "grad_norm": 0.21810956299304962, "learning_rate": 1.953354503009988e-05, "loss": 1.2873, "step": 4179 }, { "epoch": 1.2449971146149408, "grad_norm": 0.20932936668395996, "learning_rate": 1.953325382856674e-05, "loss": 1.2842, "step": 4180 }, { "epoch": 1.2452949608146093, "grad_norm": 0.20388562977313995, "learning_rate": 1.9532962538337326e-05, "loss": 1.2945, "step": 4181 }, { "epoch": 1.2455928070142779, "grad_norm": 0.2170279175043106, "learning_rate": 1.9532671159414346e-05, "loss": 1.2986, "step": 4182 }, { "epoch": 1.2458906532139467, "grad_norm": 0.20950999855995178, "learning_rate": 1.953237969180051e-05, "loss": 1.2841, "step": 4183 }, { "epoch": 1.2461884994136152, "grad_norm": 0.20044785737991333, "learning_rate": 1.9532088135498535e-05, "loss": 1.2891, "step": 4184 }, { "epoch": 1.246486345613284, "grad_norm": 0.22087042033672333, "learning_rate": 1.9531796490511126e-05, "loss": 1.2682, "step": 4185 }, { "epoch": 1.2467841918129525, "grad_norm": 0.2074468731880188, "learning_rate": 1.9531504756841003e-05, "loss": 1.2948, "step": 4186 }, { "epoch": 1.2470820380126213, "grad_norm": 0.21442781388759613, "learning_rate": 1.9531212934490874e-05, "loss": 1.2814, "step": 4187 }, { "epoch": 1.2473798842122898, "grad_norm": 0.21842724084854126, "learning_rate": 1.953092102346346e-05, "loss": 1.2786, "step": 4188 }, { "epoch": 1.2476777304119586, "grad_norm": 0.22215518355369568, "learning_rate": 1.9530629023761475e-05, "loss": 1.2873, "step": 4189 }, { "epoch": 1.2479755766116272, "grad_norm": 0.2289196401834488, "learning_rate": 1.9530336935387632e-05, "loss": 1.3195, "step": 4190 }, { "epoch": 1.248273422811296, "grad_norm": 0.20055343210697174, "learning_rate": 1.9530044758344652e-05, "loss": 1.283, "step": 4191 }, { "epoch": 1.2485712690109645, "grad_norm": 0.2111535668373108, "learning_rate": 1.952975249263526e-05, "loss": 1.2922, "step": 4192 }, { "epoch": 1.248869115210633, "grad_norm": 0.21124912798404694, "learning_rate": 1.952946013826216e-05, "loss": 1.3003, "step": 4193 }, { "epoch": 1.2491669614103018, "grad_norm": 0.20609290897846222, "learning_rate": 1.9529167695228084e-05, "loss": 1.3008, "step": 4194 }, { "epoch": 1.2494648076099704, "grad_norm": 0.2232304811477661, "learning_rate": 1.9528875163535747e-05, "loss": 1.267, "step": 4195 }, { "epoch": 1.2497626538096391, "grad_norm": 0.22072570025920868, "learning_rate": 1.9528582543187876e-05, "loss": 1.2904, "step": 4196 }, { "epoch": 1.2500605000093077, "grad_norm": 0.21208128333091736, "learning_rate": 1.952828983418719e-05, "loss": 1.296, "step": 4197 }, { "epoch": 1.2503583462089765, "grad_norm": 0.2197960913181305, "learning_rate": 1.952799703653641e-05, "loss": 1.2944, "step": 4198 }, { "epoch": 1.250656192408645, "grad_norm": 0.21740934252738953, "learning_rate": 1.9527704150238268e-05, "loss": 1.2844, "step": 4199 }, { "epoch": 1.2509540386083136, "grad_norm": 0.21597795188426971, "learning_rate": 1.952741117529548e-05, "loss": 1.2693, "step": 4200 }, { "epoch": 1.2512518848079823, "grad_norm": 0.2201114147901535, "learning_rate": 1.9527118111710775e-05, "loss": 1.3139, "step": 4201 }, { "epoch": 1.2515497310076509, "grad_norm": 0.22156377136707306, "learning_rate": 1.9526824959486884e-05, "loss": 1.2858, "step": 4202 }, { "epoch": 1.2518475772073197, "grad_norm": 0.22167210280895233, "learning_rate": 1.9526531718626525e-05, "loss": 1.2826, "step": 4203 }, { "epoch": 1.2521454234069882, "grad_norm": 0.2108449786901474, "learning_rate": 1.9526238389132435e-05, "loss": 1.284, "step": 4204 }, { "epoch": 1.252443269606657, "grad_norm": 0.2104608416557312, "learning_rate": 1.952594497100734e-05, "loss": 1.2766, "step": 4205 }, { "epoch": 1.2527411158063255, "grad_norm": 0.20697778463363647, "learning_rate": 1.9525651464253972e-05, "loss": 1.2892, "step": 4206 }, { "epoch": 1.253038962005994, "grad_norm": 0.22090132534503937, "learning_rate": 1.9525357868875057e-05, "loss": 1.2807, "step": 4207 }, { "epoch": 1.2533368082056628, "grad_norm": 0.2159069925546646, "learning_rate": 1.9525064184873332e-05, "loss": 1.2731, "step": 4208 }, { "epoch": 1.2536346544053314, "grad_norm": 0.22110089659690857, "learning_rate": 1.9524770412251523e-05, "loss": 1.2902, "step": 4209 }, { "epoch": 1.2539325006050002, "grad_norm": 0.21377326548099518, "learning_rate": 1.952447655101237e-05, "loss": 1.2809, "step": 4210 }, { "epoch": 1.2542303468046687, "grad_norm": 0.22198982536792755, "learning_rate": 1.9524182601158604e-05, "loss": 1.2934, "step": 4211 }, { "epoch": 1.2545281930043375, "grad_norm": 0.22517961263656616, "learning_rate": 1.952388856269296e-05, "loss": 1.2922, "step": 4212 }, { "epoch": 1.254826039204006, "grad_norm": 0.21857579052448273, "learning_rate": 1.9523594435618173e-05, "loss": 1.2959, "step": 4213 }, { "epoch": 1.2551238854036746, "grad_norm": 0.2065434604883194, "learning_rate": 1.952330021993698e-05, "loss": 1.2927, "step": 4214 }, { "epoch": 1.2554217316033434, "grad_norm": 0.21812330186367035, "learning_rate": 1.952300591565212e-05, "loss": 1.2897, "step": 4215 }, { "epoch": 1.255719577803012, "grad_norm": 0.2338680475950241, "learning_rate": 1.9522711522766328e-05, "loss": 1.2997, "step": 4216 }, { "epoch": 1.2560174240026807, "grad_norm": 0.3979673683643341, "learning_rate": 1.9522417041282344e-05, "loss": 1.2781, "step": 4217 }, { "epoch": 1.2563152702023492, "grad_norm": 0.25537681579589844, "learning_rate": 1.952212247120291e-05, "loss": 1.305, "step": 4218 }, { "epoch": 1.256613116402018, "grad_norm": 0.24418216943740845, "learning_rate": 1.952182781253077e-05, "loss": 1.2891, "step": 4219 }, { "epoch": 1.2569109626016866, "grad_norm": 0.24024534225463867, "learning_rate": 1.9521533065268652e-05, "loss": 1.2926, "step": 4220 }, { "epoch": 1.257208808801355, "grad_norm": 0.23503734171390533, "learning_rate": 1.952123822941931e-05, "loss": 1.2916, "step": 4221 }, { "epoch": 1.2575066550010239, "grad_norm": 0.21015004813671112, "learning_rate": 1.9520943304985484e-05, "loss": 1.2885, "step": 4222 }, { "epoch": 1.2578045012006924, "grad_norm": 0.22911418974399567, "learning_rate": 1.9520648291969918e-05, "loss": 1.2957, "step": 4223 }, { "epoch": 1.2581023474003612, "grad_norm": 0.22934724390506744, "learning_rate": 1.9520353190375356e-05, "loss": 1.2866, "step": 4224 }, { "epoch": 1.2584001936000297, "grad_norm": 0.2139049470424652, "learning_rate": 1.9520058000204546e-05, "loss": 1.2728, "step": 4225 }, { "epoch": 1.2586980397996985, "grad_norm": 0.23983533680438995, "learning_rate": 1.9519762721460233e-05, "loss": 1.3024, "step": 4226 }, { "epoch": 1.258995885999367, "grad_norm": 0.21902920305728912, "learning_rate": 1.9519467354145165e-05, "loss": 1.2855, "step": 4227 }, { "epoch": 1.2592937321990356, "grad_norm": 0.2280544638633728, "learning_rate": 1.9519171898262084e-05, "loss": 1.2984, "step": 4228 }, { "epoch": 1.2595915783987044, "grad_norm": 0.2220582515001297, "learning_rate": 1.951887635381375e-05, "loss": 1.2788, "step": 4229 }, { "epoch": 1.259889424598373, "grad_norm": 0.22434242069721222, "learning_rate": 1.9518580720802902e-05, "loss": 1.2805, "step": 4230 }, { "epoch": 1.2601872707980417, "grad_norm": 0.22028429806232452, "learning_rate": 1.9518284999232298e-05, "loss": 1.3068, "step": 4231 }, { "epoch": 1.2604851169977103, "grad_norm": 0.21655960381031036, "learning_rate": 1.9517989189104685e-05, "loss": 1.2782, "step": 4232 }, { "epoch": 1.260782963197379, "grad_norm": 0.22345635294914246, "learning_rate": 1.9517693290422815e-05, "loss": 1.288, "step": 4233 }, { "epoch": 1.2610808093970476, "grad_norm": 0.24837669730186462, "learning_rate": 1.9517397303189445e-05, "loss": 1.277, "step": 4234 }, { "epoch": 1.2613786555967161, "grad_norm": 0.21125711500644684, "learning_rate": 1.9517101227407326e-05, "loss": 1.2771, "step": 4235 }, { "epoch": 1.261676501796385, "grad_norm": 0.23232734203338623, "learning_rate": 1.9516805063079217e-05, "loss": 1.3007, "step": 4236 }, { "epoch": 1.2619743479960535, "grad_norm": 0.22613781690597534, "learning_rate": 1.9516508810207865e-05, "loss": 1.2769, "step": 4237 }, { "epoch": 1.2622721941957222, "grad_norm": 0.21599330008029938, "learning_rate": 1.9516212468796033e-05, "loss": 1.3, "step": 4238 }, { "epoch": 1.2625700403953908, "grad_norm": 0.2066168487071991, "learning_rate": 1.9515916038846474e-05, "loss": 1.2621, "step": 4239 }, { "epoch": 1.2628678865950596, "grad_norm": 0.2147272527217865, "learning_rate": 1.951561952036195e-05, "loss": 1.2962, "step": 4240 }, { "epoch": 1.263165732794728, "grad_norm": 0.2461201250553131, "learning_rate": 1.9515322913345218e-05, "loss": 1.2731, "step": 4241 }, { "epoch": 1.2634635789943967, "grad_norm": 0.21921150386333466, "learning_rate": 1.951502621779904e-05, "loss": 1.3004, "step": 4242 }, { "epoch": 1.2637614251940654, "grad_norm": 0.21837396919727325, "learning_rate": 1.9514729433726166e-05, "loss": 1.2832, "step": 4243 }, { "epoch": 1.2640592713937342, "grad_norm": 0.21291287243366241, "learning_rate": 1.951443256112937e-05, "loss": 1.2755, "step": 4244 }, { "epoch": 1.2643571175934027, "grad_norm": 0.21658475697040558, "learning_rate": 1.951413560001141e-05, "loss": 1.2783, "step": 4245 }, { "epoch": 1.2646549637930713, "grad_norm": 0.24197445809841156, "learning_rate": 1.951383855037505e-05, "loss": 1.2942, "step": 4246 }, { "epoch": 1.26495280999274, "grad_norm": 0.23163466155529022, "learning_rate": 1.9513541412223045e-05, "loss": 1.3004, "step": 4247 }, { "epoch": 1.2652506561924086, "grad_norm": 0.22297687828540802, "learning_rate": 1.951324418555817e-05, "loss": 1.2846, "step": 4248 }, { "epoch": 1.2655485023920772, "grad_norm": 0.2045927494764328, "learning_rate": 1.9512946870383186e-05, "loss": 1.2692, "step": 4249 }, { "epoch": 1.265846348591746, "grad_norm": 0.2474483996629715, "learning_rate": 1.951264946670086e-05, "loss": 1.2793, "step": 4250 }, { "epoch": 1.2661441947914147, "grad_norm": 0.21718846261501312, "learning_rate": 1.9512351974513963e-05, "loss": 1.294, "step": 4251 }, { "epoch": 1.2664420409910833, "grad_norm": 0.22226892411708832, "learning_rate": 1.9512054393825255e-05, "loss": 1.2919, "step": 4252 }, { "epoch": 1.2667398871907518, "grad_norm": 0.22698166966438293, "learning_rate": 1.9511756724637508e-05, "loss": 1.267, "step": 4253 }, { "epoch": 1.2670377333904206, "grad_norm": 0.21398533880710602, "learning_rate": 1.9511458966953493e-05, "loss": 1.2783, "step": 4254 }, { "epoch": 1.2673355795900891, "grad_norm": 0.2410353273153305, "learning_rate": 1.951116112077598e-05, "loss": 1.304, "step": 4255 }, { "epoch": 1.2676334257897577, "grad_norm": 0.2103150188922882, "learning_rate": 1.9510863186107737e-05, "loss": 1.2822, "step": 4256 }, { "epoch": 1.2679312719894265, "grad_norm": 0.22095987200737, "learning_rate": 1.9510565162951538e-05, "loss": 1.2747, "step": 4257 }, { "epoch": 1.2682291181890952, "grad_norm": 0.20994897186756134, "learning_rate": 1.9510267051310157e-05, "loss": 1.2905, "step": 4258 }, { "epoch": 1.2685269643887638, "grad_norm": 0.19952505826950073, "learning_rate": 1.9509968851186367e-05, "loss": 1.2934, "step": 4259 }, { "epoch": 1.2688248105884323, "grad_norm": 0.22760345041751862, "learning_rate": 1.950967056258294e-05, "loss": 1.2726, "step": 4260 }, { "epoch": 1.269122656788101, "grad_norm": 0.21213096380233765, "learning_rate": 1.9509372185502655e-05, "loss": 1.2858, "step": 4261 }, { "epoch": 1.2694205029877697, "grad_norm": 0.21417930722236633, "learning_rate": 1.9509073719948287e-05, "loss": 1.284, "step": 4262 }, { "epoch": 1.2697183491874382, "grad_norm": 0.21398523449897766, "learning_rate": 1.950877516592261e-05, "loss": 1.2757, "step": 4263 }, { "epoch": 1.270016195387107, "grad_norm": 0.1974363923072815, "learning_rate": 1.9508476523428407e-05, "loss": 1.2671, "step": 4264 }, { "epoch": 1.2703140415867757, "grad_norm": 0.22823457419872284, "learning_rate": 1.950817779246845e-05, "loss": 1.289, "step": 4265 }, { "epoch": 1.2706118877864443, "grad_norm": 0.20725269615650177, "learning_rate": 1.9507878973045524e-05, "loss": 1.2962, "step": 4266 }, { "epoch": 1.2709097339861128, "grad_norm": 0.21264007687568665, "learning_rate": 1.9507580065162405e-05, "loss": 1.2832, "step": 4267 }, { "epoch": 1.2712075801857816, "grad_norm": 0.2234097719192505, "learning_rate": 1.950728106882188e-05, "loss": 1.2933, "step": 4268 }, { "epoch": 1.2715054263854502, "grad_norm": 0.20301519334316254, "learning_rate": 1.9506981984026723e-05, "loss": 1.2858, "step": 4269 }, { "epoch": 1.271803272585119, "grad_norm": 0.20755331218242645, "learning_rate": 1.9506682810779722e-05, "loss": 1.2939, "step": 4270 }, { "epoch": 1.2721011187847875, "grad_norm": 0.2089110016822815, "learning_rate": 1.950638354908366e-05, "loss": 1.3018, "step": 4271 }, { "epoch": 1.2723989649844563, "grad_norm": 0.20934276282787323, "learning_rate": 1.9506084198941316e-05, "loss": 1.2788, "step": 4272 }, { "epoch": 1.2726968111841248, "grad_norm": 0.2058245688676834, "learning_rate": 1.9505784760355485e-05, "loss": 1.279, "step": 4273 }, { "epoch": 1.2729946573837934, "grad_norm": 0.2077820897102356, "learning_rate": 1.9505485233328944e-05, "loss": 1.2879, "step": 4274 }, { "epoch": 1.2732925035834621, "grad_norm": 0.21222275495529175, "learning_rate": 1.9505185617864483e-05, "loss": 1.2802, "step": 4275 }, { "epoch": 1.2735903497831307, "grad_norm": 0.21486896276474, "learning_rate": 1.950488591396489e-05, "loss": 1.2902, "step": 4276 }, { "epoch": 1.2738881959827995, "grad_norm": 0.24208930134773254, "learning_rate": 1.9504586121632957e-05, "loss": 1.2848, "step": 4277 }, { "epoch": 1.274186042182468, "grad_norm": 0.211529940366745, "learning_rate": 1.950428624087146e-05, "loss": 1.3098, "step": 4278 }, { "epoch": 1.2744838883821368, "grad_norm": 0.2138810157775879, "learning_rate": 1.9503986271683206e-05, "loss": 1.2828, "step": 4279 }, { "epoch": 1.2747817345818053, "grad_norm": 0.2132432907819748, "learning_rate": 1.9503686214070975e-05, "loss": 1.2861, "step": 4280 }, { "epoch": 1.2750795807814739, "grad_norm": 0.2315203845500946, "learning_rate": 1.9503386068037568e-05, "loss": 1.2804, "step": 4281 }, { "epoch": 1.2753774269811426, "grad_norm": 0.21458743512630463, "learning_rate": 1.9503085833585766e-05, "loss": 1.2748, "step": 4282 }, { "epoch": 1.2756752731808112, "grad_norm": 0.2249085009098053, "learning_rate": 1.9502785510718368e-05, "loss": 1.2868, "step": 4283 }, { "epoch": 1.27597311938048, "grad_norm": 0.2124832719564438, "learning_rate": 1.9502485099438165e-05, "loss": 1.2944, "step": 4284 }, { "epoch": 1.2762709655801485, "grad_norm": 0.2258549928665161, "learning_rate": 1.950218459974796e-05, "loss": 1.2939, "step": 4285 }, { "epoch": 1.2765688117798173, "grad_norm": 0.22272054851055145, "learning_rate": 1.9501884011650542e-05, "loss": 1.2998, "step": 4286 }, { "epoch": 1.2768666579794858, "grad_norm": 0.21465766429901123, "learning_rate": 1.950158333514871e-05, "loss": 1.2914, "step": 4287 }, { "epoch": 1.2771645041791544, "grad_norm": 0.21221554279327393, "learning_rate": 1.950128257024526e-05, "loss": 1.2734, "step": 4288 }, { "epoch": 1.2774623503788232, "grad_norm": 0.2124350517988205, "learning_rate": 1.950098171694299e-05, "loss": 1.2894, "step": 4289 }, { "epoch": 1.2777601965784917, "grad_norm": 0.2039698362350464, "learning_rate": 1.9500680775244702e-05, "loss": 1.2772, "step": 4290 }, { "epoch": 1.2780580427781605, "grad_norm": 0.2236998826265335, "learning_rate": 1.9500379745153193e-05, "loss": 1.305, "step": 4291 }, { "epoch": 1.278355888977829, "grad_norm": 0.21744026243686676, "learning_rate": 1.9500078626671268e-05, "loss": 1.299, "step": 4292 }, { "epoch": 1.2786537351774978, "grad_norm": 0.215065598487854, "learning_rate": 1.9499777419801722e-05, "loss": 1.2762, "step": 4293 }, { "epoch": 1.2789515813771664, "grad_norm": 0.20630770921707153, "learning_rate": 1.949947612454736e-05, "loss": 1.2982, "step": 4294 }, { "epoch": 1.279249427576835, "grad_norm": 0.22154337167739868, "learning_rate": 1.949917474091099e-05, "loss": 1.2827, "step": 4295 }, { "epoch": 1.2795472737765037, "grad_norm": 0.20596937835216522, "learning_rate": 1.949887326889541e-05, "loss": 1.2794, "step": 4296 }, { "epoch": 1.2798451199761722, "grad_norm": 0.20480772852897644, "learning_rate": 1.949857170850343e-05, "loss": 1.304, "step": 4297 }, { "epoch": 1.280142966175841, "grad_norm": 0.24925456941127777, "learning_rate": 1.9498270059737846e-05, "loss": 1.2814, "step": 4298 }, { "epoch": 1.2804408123755096, "grad_norm": 0.23596645891666412, "learning_rate": 1.9497968322601478e-05, "loss": 1.2875, "step": 4299 }, { "epoch": 1.2807386585751783, "grad_norm": 0.26264750957489014, "learning_rate": 1.9497666497097123e-05, "loss": 1.2729, "step": 4300 }, { "epoch": 1.2810365047748469, "grad_norm": 0.2836257815361023, "learning_rate": 1.9497364583227594e-05, "loss": 1.2886, "step": 4301 }, { "epoch": 1.2813343509745154, "grad_norm": 0.21951961517333984, "learning_rate": 1.9497062580995697e-05, "loss": 1.2787, "step": 4302 }, { "epoch": 1.2816321971741842, "grad_norm": 0.2535304129123688, "learning_rate": 1.9496760490404246e-05, "loss": 1.289, "step": 4303 }, { "epoch": 1.2819300433738527, "grad_norm": 0.22259165346622467, "learning_rate": 1.9496458311456048e-05, "loss": 1.279, "step": 4304 }, { "epoch": 1.2822278895735215, "grad_norm": 0.22210922837257385, "learning_rate": 1.9496156044153914e-05, "loss": 1.2869, "step": 4305 }, { "epoch": 1.28252573577319, "grad_norm": 0.250213086605072, "learning_rate": 1.949585368850066e-05, "loss": 1.2945, "step": 4306 }, { "epoch": 1.2828235819728588, "grad_norm": 0.22138406336307526, "learning_rate": 1.9495551244499092e-05, "loss": 1.2767, "step": 4307 }, { "epoch": 1.2831214281725274, "grad_norm": 0.21412420272827148, "learning_rate": 1.9495248712152035e-05, "loss": 1.279, "step": 4308 }, { "epoch": 1.283419274372196, "grad_norm": 0.22160974144935608, "learning_rate": 1.9494946091462294e-05, "loss": 1.2808, "step": 4309 }, { "epoch": 1.2837171205718647, "grad_norm": 0.2283344566822052, "learning_rate": 1.949464338243269e-05, "loss": 1.2845, "step": 4310 }, { "epoch": 1.2840149667715335, "grad_norm": 0.2155960649251938, "learning_rate": 1.9494340585066033e-05, "loss": 1.2865, "step": 4311 }, { "epoch": 1.284312812971202, "grad_norm": 0.21037152409553528, "learning_rate": 1.9494037699365148e-05, "loss": 1.2828, "step": 4312 }, { "epoch": 1.2846106591708706, "grad_norm": 0.22889646887779236, "learning_rate": 1.949373472533285e-05, "loss": 1.2761, "step": 4313 }, { "epoch": 1.2849085053705394, "grad_norm": 0.23508226871490479, "learning_rate": 1.9493431662971956e-05, "loss": 1.2841, "step": 4314 }, { "epoch": 1.285206351570208, "grad_norm": 0.2230076789855957, "learning_rate": 1.949312851228529e-05, "loss": 1.2909, "step": 4315 }, { "epoch": 1.2855041977698765, "grad_norm": 0.21971538662910461, "learning_rate": 1.9492825273275667e-05, "loss": 1.2876, "step": 4316 }, { "epoch": 1.2858020439695452, "grad_norm": 0.22598077356815338, "learning_rate": 1.9492521945945912e-05, "loss": 1.2783, "step": 4317 }, { "epoch": 1.286099890169214, "grad_norm": 0.2192184180021286, "learning_rate": 1.9492218530298843e-05, "loss": 1.2868, "step": 4318 }, { "epoch": 1.2863977363688825, "grad_norm": 0.225177600979805, "learning_rate": 1.949191502633729e-05, "loss": 1.2822, "step": 4319 }, { "epoch": 1.286695582568551, "grad_norm": 0.23144379258155823, "learning_rate": 1.9491611434064072e-05, "loss": 1.2797, "step": 4320 }, { "epoch": 1.2869934287682199, "grad_norm": 0.21540115773677826, "learning_rate": 1.949130775348201e-05, "loss": 1.275, "step": 4321 }, { "epoch": 1.2872912749678884, "grad_norm": 0.20263580977916718, "learning_rate": 1.9491003984593936e-05, "loss": 1.2993, "step": 4322 }, { "epoch": 1.287589121167557, "grad_norm": 0.20649705827236176, "learning_rate": 1.9490700127402676e-05, "loss": 1.3125, "step": 4323 }, { "epoch": 1.2878869673672257, "grad_norm": 0.22655902802944183, "learning_rate": 1.9490396181911054e-05, "loss": 1.2771, "step": 4324 }, { "epoch": 1.2881848135668945, "grad_norm": 0.21653781831264496, "learning_rate": 1.9490092148121898e-05, "loss": 1.2728, "step": 4325 }, { "epoch": 1.288482659766563, "grad_norm": 0.2190086841583252, "learning_rate": 1.9489788026038038e-05, "loss": 1.2875, "step": 4326 }, { "epoch": 1.2887805059662316, "grad_norm": 0.22180475294589996, "learning_rate": 1.94894838156623e-05, "loss": 1.2889, "step": 4327 }, { "epoch": 1.2890783521659004, "grad_norm": 0.21253114938735962, "learning_rate": 1.9489179516997522e-05, "loss": 1.3026, "step": 4328 }, { "epoch": 1.289376198365569, "grad_norm": 0.21621105074882507, "learning_rate": 1.9488875130046528e-05, "loss": 1.2805, "step": 4329 }, { "epoch": 1.2896740445652375, "grad_norm": 0.2294853925704956, "learning_rate": 1.9488570654812152e-05, "loss": 1.3019, "step": 4330 }, { "epoch": 1.2899718907649063, "grad_norm": 0.21360130608081818, "learning_rate": 1.948826609129723e-05, "loss": 1.3032, "step": 4331 }, { "epoch": 1.290269736964575, "grad_norm": 0.21647316217422485, "learning_rate": 1.948796143950459e-05, "loss": 1.2728, "step": 4332 }, { "epoch": 1.2905675831642436, "grad_norm": 0.21768029034137726, "learning_rate": 1.9487656699437073e-05, "loss": 1.2735, "step": 4333 }, { "epoch": 1.2908654293639121, "grad_norm": 0.2219466269016266, "learning_rate": 1.9487351871097507e-05, "loss": 1.2781, "step": 4334 }, { "epoch": 1.291163275563581, "grad_norm": 0.21864573657512665, "learning_rate": 1.9487046954488737e-05, "loss": 1.292, "step": 4335 }, { "epoch": 1.2914611217632495, "grad_norm": 0.23011457920074463, "learning_rate": 1.9486741949613587e-05, "loss": 1.2991, "step": 4336 }, { "epoch": 1.2917589679629182, "grad_norm": 0.22989176213741302, "learning_rate": 1.9486436856474907e-05, "loss": 1.2688, "step": 4337 }, { "epoch": 1.2920568141625868, "grad_norm": 0.2277642786502838, "learning_rate": 1.948613167507553e-05, "loss": 1.2934, "step": 4338 }, { "epoch": 1.2923546603622555, "grad_norm": 0.21044525504112244, "learning_rate": 1.9485826405418297e-05, "loss": 1.2959, "step": 4339 }, { "epoch": 1.292652506561924, "grad_norm": 0.22159777581691742, "learning_rate": 1.9485521047506045e-05, "loss": 1.302, "step": 4340 }, { "epoch": 1.2929503527615926, "grad_norm": 0.22066263854503632, "learning_rate": 1.948521560134162e-05, "loss": 1.2753, "step": 4341 }, { "epoch": 1.2932481989612614, "grad_norm": 0.20811598002910614, "learning_rate": 1.9484910066927862e-05, "loss": 1.295, "step": 4342 }, { "epoch": 1.29354604516093, "grad_norm": 0.2093423455953598, "learning_rate": 1.9484604444267613e-05, "loss": 1.3022, "step": 4343 }, { "epoch": 1.2938438913605987, "grad_norm": 0.21169036626815796, "learning_rate": 1.9484298733363715e-05, "loss": 1.2762, "step": 4344 }, { "epoch": 1.2941417375602673, "grad_norm": 0.2275347262620926, "learning_rate": 1.9483992934219014e-05, "loss": 1.2982, "step": 4345 }, { "epoch": 1.294439583759936, "grad_norm": 0.20272690057754517, "learning_rate": 1.9483687046836354e-05, "loss": 1.276, "step": 4346 }, { "epoch": 1.2947374299596046, "grad_norm": 0.21795092523097992, "learning_rate": 1.9483381071218583e-05, "loss": 1.2839, "step": 4347 }, { "epoch": 1.2950352761592732, "grad_norm": 0.22539231181144714, "learning_rate": 1.9483075007368544e-05, "loss": 1.2993, "step": 4348 }, { "epoch": 1.295333122358942, "grad_norm": 0.2134052813053131, "learning_rate": 1.948276885528909e-05, "loss": 1.2973, "step": 4349 }, { "epoch": 1.2956309685586105, "grad_norm": 0.21409595012664795, "learning_rate": 1.9482462614983065e-05, "loss": 1.2858, "step": 4350 }, { "epoch": 1.2959288147582793, "grad_norm": 0.2155877649784088, "learning_rate": 1.9482156286453323e-05, "loss": 1.2809, "step": 4351 }, { "epoch": 1.2962266609579478, "grad_norm": 0.22553592920303345, "learning_rate": 1.9481849869702708e-05, "loss": 1.276, "step": 4352 }, { "epoch": 1.2965245071576166, "grad_norm": 0.2257729023694992, "learning_rate": 1.9481543364734075e-05, "loss": 1.2804, "step": 4353 }, { "epoch": 1.2968223533572851, "grad_norm": 0.20933204889297485, "learning_rate": 1.9481236771550275e-05, "loss": 1.2852, "step": 4354 }, { "epoch": 1.2971201995569537, "grad_norm": 0.21046243607997894, "learning_rate": 1.948093009015416e-05, "loss": 1.2729, "step": 4355 }, { "epoch": 1.2974180457566225, "grad_norm": 0.21620683372020721, "learning_rate": 1.948062332054858e-05, "loss": 1.2835, "step": 4356 }, { "epoch": 1.297715891956291, "grad_norm": 0.22007808089256287, "learning_rate": 1.9480316462736394e-05, "loss": 1.2873, "step": 4357 }, { "epoch": 1.2980137381559598, "grad_norm": 0.2278037965297699, "learning_rate": 1.9480009516720457e-05, "loss": 1.291, "step": 4358 }, { "epoch": 1.2983115843556283, "grad_norm": 0.20690450072288513, "learning_rate": 1.947970248250362e-05, "loss": 1.2852, "step": 4359 }, { "epoch": 1.298609430555297, "grad_norm": 0.21415205299854279, "learning_rate": 1.9479395360088744e-05, "loss": 1.2865, "step": 4360 }, { "epoch": 1.2989072767549656, "grad_norm": 0.21674586832523346, "learning_rate": 1.9479088149478688e-05, "loss": 1.2806, "step": 4361 }, { "epoch": 1.2992051229546342, "grad_norm": 0.21887333691120148, "learning_rate": 1.9478780850676303e-05, "loss": 1.3123, "step": 4362 }, { "epoch": 1.299502969154303, "grad_norm": 0.22247789800167084, "learning_rate": 1.9478473463684456e-05, "loss": 1.2868, "step": 4363 }, { "epoch": 1.2998008153539715, "grad_norm": 0.2039203941822052, "learning_rate": 1.9478165988506003e-05, "loss": 1.287, "step": 4364 }, { "epoch": 1.3000986615536403, "grad_norm": 0.22355547547340393, "learning_rate": 1.9477858425143804e-05, "loss": 1.2792, "step": 4365 }, { "epoch": 1.3003965077533088, "grad_norm": 0.21715961396694183, "learning_rate": 1.9477550773600718e-05, "loss": 1.304, "step": 4366 }, { "epoch": 1.3006943539529776, "grad_norm": 0.20409496128559113, "learning_rate": 1.9477243033879615e-05, "loss": 1.2747, "step": 4367 }, { "epoch": 1.3009922001526462, "grad_norm": 0.2287827432155609, "learning_rate": 1.9476935205983355e-05, "loss": 1.2928, "step": 4368 }, { "epoch": 1.3012900463523147, "grad_norm": 0.21227742731571198, "learning_rate": 1.94766272899148e-05, "loss": 1.274, "step": 4369 }, { "epoch": 1.3015878925519835, "grad_norm": 0.2068890929222107, "learning_rate": 1.9476319285676817e-05, "loss": 1.2727, "step": 4370 }, { "epoch": 1.301885738751652, "grad_norm": 0.2015000283718109, "learning_rate": 1.947601119327227e-05, "loss": 1.2805, "step": 4371 }, { "epoch": 1.3021835849513208, "grad_norm": 0.21332566440105438, "learning_rate": 1.9475703012704026e-05, "loss": 1.2968, "step": 4372 }, { "epoch": 1.3024814311509894, "grad_norm": 0.2227504700422287, "learning_rate": 1.947539474397495e-05, "loss": 1.2885, "step": 4373 }, { "epoch": 1.3027792773506581, "grad_norm": 0.22368992865085602, "learning_rate": 1.947508638708792e-05, "loss": 1.2901, "step": 4374 }, { "epoch": 1.3030771235503267, "grad_norm": 0.2185707837343216, "learning_rate": 1.9474777942045787e-05, "loss": 1.2711, "step": 4375 }, { "epoch": 1.3033749697499952, "grad_norm": 0.23849822580814362, "learning_rate": 1.9474469408851436e-05, "loss": 1.3015, "step": 4376 }, { "epoch": 1.303672815949664, "grad_norm": 0.22657613456249237, "learning_rate": 1.9474160787507735e-05, "loss": 1.3063, "step": 4377 }, { "epoch": 1.3039706621493328, "grad_norm": 0.23561915755271912, "learning_rate": 1.947385207801755e-05, "loss": 1.2935, "step": 4378 }, { "epoch": 1.3042685083490013, "grad_norm": 0.23508505523204803, "learning_rate": 1.9473543280383755e-05, "loss": 1.2923, "step": 4379 }, { "epoch": 1.3045663545486699, "grad_norm": 0.23660191893577576, "learning_rate": 1.947323439460923e-05, "loss": 1.2775, "step": 4380 }, { "epoch": 1.3048642007483386, "grad_norm": 0.2111383080482483, "learning_rate": 1.9472925420696836e-05, "loss": 1.2892, "step": 4381 }, { "epoch": 1.3051620469480072, "grad_norm": 0.22012507915496826, "learning_rate": 1.9472616358649458e-05, "loss": 1.273, "step": 4382 }, { "epoch": 1.3054598931476757, "grad_norm": 0.2353675812482834, "learning_rate": 1.947230720846997e-05, "loss": 1.2778, "step": 4383 }, { "epoch": 1.3057577393473445, "grad_norm": 0.2224927544593811, "learning_rate": 1.9471997970161242e-05, "loss": 1.3011, "step": 4384 }, { "epoch": 1.3060555855470133, "grad_norm": 0.22048144042491913, "learning_rate": 1.947168864372616e-05, "loss": 1.2934, "step": 4385 }, { "epoch": 1.3063534317466818, "grad_norm": 0.22624224424362183, "learning_rate": 1.9471379229167597e-05, "loss": 1.2919, "step": 4386 }, { "epoch": 1.3066512779463504, "grad_norm": 0.21678951382637024, "learning_rate": 1.9471069726488432e-05, "loss": 1.2714, "step": 4387 }, { "epoch": 1.3069491241460192, "grad_norm": 0.21833226084709167, "learning_rate": 1.947076013569154e-05, "loss": 1.296, "step": 4388 }, { "epoch": 1.3072469703456877, "grad_norm": 0.22498853504657745, "learning_rate": 1.9470450456779812e-05, "loss": 1.2889, "step": 4389 }, { "epoch": 1.3075448165453563, "grad_norm": 0.20521242916584015, "learning_rate": 1.947014068975612e-05, "loss": 1.2675, "step": 4390 }, { "epoch": 1.307842662745025, "grad_norm": 0.21246260404586792, "learning_rate": 1.9469830834623352e-05, "loss": 1.2813, "step": 4391 }, { "epoch": 1.3081405089446938, "grad_norm": 0.20792168378829956, "learning_rate": 1.946952089138439e-05, "loss": 1.2886, "step": 4392 }, { "epoch": 1.3084383551443624, "grad_norm": 0.22202304005622864, "learning_rate": 1.9469210860042117e-05, "loss": 1.2799, "step": 4393 }, { "epoch": 1.308736201344031, "grad_norm": 0.2078225314617157, "learning_rate": 1.946890074059941e-05, "loss": 1.2934, "step": 4394 }, { "epoch": 1.3090340475436997, "grad_norm": 0.2236662060022354, "learning_rate": 1.9468590533059167e-05, "loss": 1.2811, "step": 4395 }, { "epoch": 1.3093318937433682, "grad_norm": 0.22034013271331787, "learning_rate": 1.9468280237424263e-05, "loss": 1.289, "step": 4396 }, { "epoch": 1.309629739943037, "grad_norm": 0.23171408474445343, "learning_rate": 1.946796985369759e-05, "loss": 1.2669, "step": 4397 }, { "epoch": 1.3099275861427055, "grad_norm": 0.26519349217414856, "learning_rate": 1.9467659381882044e-05, "loss": 1.3003, "step": 4398 }, { "epoch": 1.3102254323423743, "grad_norm": 0.22557543218135834, "learning_rate": 1.9467348821980495e-05, "loss": 1.3045, "step": 4399 }, { "epoch": 1.3105232785420429, "grad_norm": 0.2138628512620926, "learning_rate": 1.946703817399585e-05, "loss": 1.278, "step": 4400 }, { "epoch": 1.3108211247417114, "grad_norm": 0.23420557379722595, "learning_rate": 1.9466727437930987e-05, "loss": 1.2868, "step": 4401 }, { "epoch": 1.3111189709413802, "grad_norm": 0.21214647591114044, "learning_rate": 1.9466416613788806e-05, "loss": 1.2949, "step": 4402 }, { "epoch": 1.3114168171410487, "grad_norm": 0.21572305262088776, "learning_rate": 1.9466105701572193e-05, "loss": 1.2784, "step": 4403 }, { "epoch": 1.3117146633407175, "grad_norm": 0.21837739646434784, "learning_rate": 1.946579470128404e-05, "loss": 1.2985, "step": 4404 }, { "epoch": 1.312012509540386, "grad_norm": 0.22033147513866425, "learning_rate": 1.9465483612927246e-05, "loss": 1.2955, "step": 4405 }, { "epoch": 1.3123103557400548, "grad_norm": 0.21132859587669373, "learning_rate": 1.9465172436504705e-05, "loss": 1.2848, "step": 4406 }, { "epoch": 1.3126082019397234, "grad_norm": 0.22771236300468445, "learning_rate": 1.9464861172019307e-05, "loss": 1.2878, "step": 4407 }, { "epoch": 1.312906048139392, "grad_norm": 0.2159237563610077, "learning_rate": 1.946454981947395e-05, "loss": 1.291, "step": 4408 }, { "epoch": 1.3132038943390607, "grad_norm": 0.22716277837753296, "learning_rate": 1.9464238378871535e-05, "loss": 1.2824, "step": 4409 }, { "epoch": 1.3135017405387293, "grad_norm": 0.21937571465969086, "learning_rate": 1.946392685021495e-05, "loss": 1.2848, "step": 4410 }, { "epoch": 1.313799586738398, "grad_norm": 0.21618427336215973, "learning_rate": 1.9463615233507104e-05, "loss": 1.2919, "step": 4411 }, { "epoch": 1.3140974329380666, "grad_norm": 0.2274778038263321, "learning_rate": 1.946330352875089e-05, "loss": 1.2827, "step": 4412 }, { "epoch": 1.3143952791377354, "grad_norm": 0.2229549139738083, "learning_rate": 1.9462991735949206e-05, "loss": 1.2814, "step": 4413 }, { "epoch": 1.314693125337404, "grad_norm": 0.20636674761772156, "learning_rate": 1.9462679855104963e-05, "loss": 1.2873, "step": 4414 }, { "epoch": 1.3149909715370725, "grad_norm": 0.21750383079051971, "learning_rate": 1.9462367886221054e-05, "loss": 1.2778, "step": 4415 }, { "epoch": 1.3152888177367412, "grad_norm": 0.21384619176387787, "learning_rate": 1.9462055829300382e-05, "loss": 1.2866, "step": 4416 }, { "epoch": 1.3155866639364098, "grad_norm": 0.23439331352710724, "learning_rate": 1.9461743684345855e-05, "loss": 1.2687, "step": 4417 }, { "epoch": 1.3158845101360785, "grad_norm": 0.23477253317832947, "learning_rate": 1.946143145136037e-05, "loss": 1.2966, "step": 4418 }, { "epoch": 1.316182356335747, "grad_norm": 0.21923504769802094, "learning_rate": 1.946111913034684e-05, "loss": 1.2911, "step": 4419 }, { "epoch": 1.3164802025354159, "grad_norm": 0.28623026609420776, "learning_rate": 1.9460806721308167e-05, "loss": 1.2931, "step": 4420 }, { "epoch": 1.3167780487350844, "grad_norm": 0.24381518363952637, "learning_rate": 1.9460494224247255e-05, "loss": 1.2724, "step": 4421 }, { "epoch": 1.317075894934753, "grad_norm": 0.22767974436283112, "learning_rate": 1.9460181639167015e-05, "loss": 1.2802, "step": 4422 }, { "epoch": 1.3173737411344217, "grad_norm": 0.23355694115161896, "learning_rate": 1.9459868966070356e-05, "loss": 1.2672, "step": 4423 }, { "epoch": 1.3176715873340903, "grad_norm": 0.23531609773635864, "learning_rate": 1.9459556204960183e-05, "loss": 1.2877, "step": 4424 }, { "epoch": 1.317969433533759, "grad_norm": 0.21968941390514374, "learning_rate": 1.945924335583941e-05, "loss": 1.2863, "step": 4425 }, { "epoch": 1.3182672797334276, "grad_norm": 0.2364933043718338, "learning_rate": 1.9458930418710947e-05, "loss": 1.2825, "step": 4426 }, { "epoch": 1.3185651259330964, "grad_norm": 0.21582764387130737, "learning_rate": 1.9458617393577705e-05, "loss": 1.2746, "step": 4427 }, { "epoch": 1.318862972132765, "grad_norm": 0.21979326009750366, "learning_rate": 1.9458304280442594e-05, "loss": 1.2879, "step": 4428 }, { "epoch": 1.3191608183324335, "grad_norm": 0.23910918831825256, "learning_rate": 1.945799107930853e-05, "loss": 1.2991, "step": 4429 }, { "epoch": 1.3194586645321023, "grad_norm": 0.2085385024547577, "learning_rate": 1.9457677790178424e-05, "loss": 1.287, "step": 4430 }, { "epoch": 1.3197565107317708, "grad_norm": 0.22950591146945953, "learning_rate": 1.9457364413055196e-05, "loss": 1.2865, "step": 4431 }, { "epoch": 1.3200543569314396, "grad_norm": 0.21736650168895721, "learning_rate": 1.9457050947941755e-05, "loss": 1.2918, "step": 4432 }, { "epoch": 1.3203522031311081, "grad_norm": 0.20605461299419403, "learning_rate": 1.9456737394841024e-05, "loss": 1.2887, "step": 4433 }, { "epoch": 1.320650049330777, "grad_norm": 0.22259333729743958, "learning_rate": 1.945642375375592e-05, "loss": 1.2758, "step": 4434 }, { "epoch": 1.3209478955304454, "grad_norm": 0.21373486518859863, "learning_rate": 1.9456110024689353e-05, "loss": 1.2764, "step": 4435 }, { "epoch": 1.321245741730114, "grad_norm": 0.21873639523983002, "learning_rate": 1.945579620764425e-05, "loss": 1.2999, "step": 4436 }, { "epoch": 1.3215435879297828, "grad_norm": 0.2185695916414261, "learning_rate": 1.9455482302623525e-05, "loss": 1.2976, "step": 4437 }, { "epoch": 1.3218414341294515, "grad_norm": 0.21117202937602997, "learning_rate": 1.9455168309630104e-05, "loss": 1.2917, "step": 4438 }, { "epoch": 1.32213928032912, "grad_norm": 0.21831433475017548, "learning_rate": 1.9454854228666905e-05, "loss": 1.2764, "step": 4439 }, { "epoch": 1.3224371265287886, "grad_norm": 0.22479133307933807, "learning_rate": 1.945454005973685e-05, "loss": 1.2787, "step": 4440 }, { "epoch": 1.3227349727284574, "grad_norm": 0.22005313634872437, "learning_rate": 1.9454225802842865e-05, "loss": 1.2897, "step": 4441 }, { "epoch": 1.323032818928126, "grad_norm": 0.22108127176761627, "learning_rate": 1.9453911457987872e-05, "loss": 1.3085, "step": 4442 }, { "epoch": 1.3233306651277945, "grad_norm": 0.21450850367546082, "learning_rate": 1.9453597025174793e-05, "loss": 1.2766, "step": 4443 }, { "epoch": 1.3236285113274633, "grad_norm": 0.23771388828754425, "learning_rate": 1.9453282504406558e-05, "loss": 1.286, "step": 4444 }, { "epoch": 1.323926357527132, "grad_norm": 0.20356306433677673, "learning_rate": 1.945296789568609e-05, "loss": 1.2827, "step": 4445 }, { "epoch": 1.3242242037268006, "grad_norm": 0.22749559581279755, "learning_rate": 1.9452653199016316e-05, "loss": 1.2841, "step": 4446 }, { "epoch": 1.3245220499264692, "grad_norm": 0.22910209000110626, "learning_rate": 1.945233841440017e-05, "loss": 1.2902, "step": 4447 }, { "epoch": 1.324819896126138, "grad_norm": 0.22924593091011047, "learning_rate": 1.945202354184057e-05, "loss": 1.2734, "step": 4448 }, { "epoch": 1.3251177423258065, "grad_norm": 0.23718386888504028, "learning_rate": 1.9451708581340454e-05, "loss": 1.274, "step": 4449 }, { "epoch": 1.325415588525475, "grad_norm": 0.20674824714660645, "learning_rate": 1.9451393532902748e-05, "loss": 1.3067, "step": 4450 }, { "epoch": 1.3257134347251438, "grad_norm": 0.21308964490890503, "learning_rate": 1.945107839653039e-05, "loss": 1.2862, "step": 4451 }, { "epoch": 1.3260112809248126, "grad_norm": 0.21982550621032715, "learning_rate": 1.94507631722263e-05, "loss": 1.2803, "step": 4452 }, { "epoch": 1.3263091271244811, "grad_norm": 0.21804852783679962, "learning_rate": 1.9450447859993423e-05, "loss": 1.2908, "step": 4453 }, { "epoch": 1.3266069733241497, "grad_norm": 0.21226952970027924, "learning_rate": 1.9450132459834685e-05, "loss": 1.2989, "step": 4454 }, { "epoch": 1.3269048195238184, "grad_norm": 0.21352677047252655, "learning_rate": 1.9449816971753025e-05, "loss": 1.2802, "step": 4455 }, { "epoch": 1.327202665723487, "grad_norm": 0.21268369257450104, "learning_rate": 1.9449501395751374e-05, "loss": 1.2769, "step": 4456 }, { "epoch": 1.3275005119231555, "grad_norm": 0.21231991052627563, "learning_rate": 1.9449185731832667e-05, "loss": 1.2847, "step": 4457 }, { "epoch": 1.3277983581228243, "grad_norm": 0.21618475019931793, "learning_rate": 1.944886997999985e-05, "loss": 1.2858, "step": 4458 }, { "epoch": 1.328096204322493, "grad_norm": 0.21233153343200684, "learning_rate": 1.9448554140255852e-05, "loss": 1.2788, "step": 4459 }, { "epoch": 1.3283940505221616, "grad_norm": 0.23762664198875427, "learning_rate": 1.944823821260361e-05, "loss": 1.2966, "step": 4460 }, { "epoch": 1.3286918967218302, "grad_norm": 0.2210892140865326, "learning_rate": 1.9447922197046075e-05, "loss": 1.2867, "step": 4461 }, { "epoch": 1.328989742921499, "grad_norm": 0.21532797813415527, "learning_rate": 1.9447606093586176e-05, "loss": 1.2907, "step": 4462 }, { "epoch": 1.3292875891211675, "grad_norm": 0.211831733584404, "learning_rate": 1.9447289902226857e-05, "loss": 1.2854, "step": 4463 }, { "epoch": 1.3295854353208363, "grad_norm": 0.22693310678005219, "learning_rate": 1.9446973622971064e-05, "loss": 1.2737, "step": 4464 }, { "epoch": 1.3298832815205048, "grad_norm": 0.24312475323677063, "learning_rate": 1.9446657255821735e-05, "loss": 1.2788, "step": 4465 }, { "epoch": 1.3301811277201736, "grad_norm": 0.21440081298351288, "learning_rate": 1.9446340800781814e-05, "loss": 1.2891, "step": 4466 }, { "epoch": 1.3304789739198422, "grad_norm": 0.21881331503391266, "learning_rate": 1.9446024257854246e-05, "loss": 1.2914, "step": 4467 }, { "epoch": 1.3307768201195107, "grad_norm": 0.22630274295806885, "learning_rate": 1.9445707627041975e-05, "loss": 1.2779, "step": 4468 }, { "epoch": 1.3310746663191795, "grad_norm": 0.22332996129989624, "learning_rate": 1.944539090834795e-05, "loss": 1.3013, "step": 4469 }, { "epoch": 1.331372512518848, "grad_norm": 0.23849384486675262, "learning_rate": 1.9445074101775115e-05, "loss": 1.3143, "step": 4470 }, { "epoch": 1.3316703587185168, "grad_norm": 0.22371995449066162, "learning_rate": 1.9444757207326414e-05, "loss": 1.2949, "step": 4471 }, { "epoch": 1.3319682049181854, "grad_norm": 0.21483688056468964, "learning_rate": 1.9444440225004805e-05, "loss": 1.2912, "step": 4472 }, { "epoch": 1.3322660511178541, "grad_norm": 0.21644139289855957, "learning_rate": 1.9444123154813227e-05, "loss": 1.2831, "step": 4473 }, { "epoch": 1.3325638973175227, "grad_norm": 0.2105797380208969, "learning_rate": 1.944380599675464e-05, "loss": 1.2849, "step": 4474 }, { "epoch": 1.3328617435171912, "grad_norm": 0.21353058516979218, "learning_rate": 1.9443488750831988e-05, "loss": 1.2697, "step": 4475 }, { "epoch": 1.33315958971686, "grad_norm": 0.2092098444700241, "learning_rate": 1.944317141704822e-05, "loss": 1.2927, "step": 4476 }, { "epoch": 1.3334574359165285, "grad_norm": 0.21660619974136353, "learning_rate": 1.9442853995406297e-05, "loss": 1.2914, "step": 4477 }, { "epoch": 1.3337552821161973, "grad_norm": 0.2247992902994156, "learning_rate": 1.9442536485909165e-05, "loss": 1.3109, "step": 4478 }, { "epoch": 1.3340531283158659, "grad_norm": 0.20930930972099304, "learning_rate": 1.9442218888559782e-05, "loss": 1.2843, "step": 4479 }, { "epoch": 1.3343509745155346, "grad_norm": 0.20886363089084625, "learning_rate": 1.9441901203361105e-05, "loss": 1.2605, "step": 4480 }, { "epoch": 1.3346488207152032, "grad_norm": 0.21827760338783264, "learning_rate": 1.944158343031608e-05, "loss": 1.2791, "step": 4481 }, { "epoch": 1.3349466669148717, "grad_norm": 0.2067975103855133, "learning_rate": 1.9441265569427674e-05, "loss": 1.2843, "step": 4482 }, { "epoch": 1.3352445131145405, "grad_norm": 0.2142345905303955, "learning_rate": 1.944094762069884e-05, "loss": 1.2874, "step": 4483 }, { "epoch": 1.335542359314209, "grad_norm": 0.21202720701694489, "learning_rate": 1.9440629584132536e-05, "loss": 1.2677, "step": 4484 }, { "epoch": 1.3358402055138778, "grad_norm": 0.20592474937438965, "learning_rate": 1.944031145973172e-05, "loss": 1.3011, "step": 4485 }, { "epoch": 1.3361380517135464, "grad_norm": 0.21177034080028534, "learning_rate": 1.9439993247499352e-05, "loss": 1.2803, "step": 4486 }, { "epoch": 1.3364358979132152, "grad_norm": 0.2227989137172699, "learning_rate": 1.9439674947438398e-05, "loss": 1.2789, "step": 4487 }, { "epoch": 1.3367337441128837, "grad_norm": 0.22396139800548553, "learning_rate": 1.9439356559551813e-05, "loss": 1.2653, "step": 4488 }, { "epoch": 1.3370315903125523, "grad_norm": 0.22306722402572632, "learning_rate": 1.9439038083842562e-05, "loss": 1.2709, "step": 4489 }, { "epoch": 1.337329436512221, "grad_norm": 0.22983412444591522, "learning_rate": 1.9438719520313606e-05, "loss": 1.2839, "step": 4490 }, { "epoch": 1.3376272827118896, "grad_norm": 0.21686509251594543, "learning_rate": 1.9438400868967916e-05, "loss": 1.2873, "step": 4491 }, { "epoch": 1.3379251289115583, "grad_norm": 0.21184255182743073, "learning_rate": 1.9438082129808443e-05, "loss": 1.2843, "step": 4492 }, { "epoch": 1.338222975111227, "grad_norm": 0.2111564576625824, "learning_rate": 1.9437763302838166e-05, "loss": 1.2969, "step": 4493 }, { "epoch": 1.3385208213108957, "grad_norm": 0.25139352679252625, "learning_rate": 1.9437444388060045e-05, "loss": 1.3023, "step": 4494 }, { "epoch": 1.3388186675105642, "grad_norm": 0.2681314945220947, "learning_rate": 1.9437125385477046e-05, "loss": 1.2981, "step": 4495 }, { "epoch": 1.3391165137102328, "grad_norm": 0.2067553848028183, "learning_rate": 1.9436806295092143e-05, "loss": 1.2917, "step": 4496 }, { "epoch": 1.3394143599099015, "grad_norm": 0.34385374188423157, "learning_rate": 1.9436487116908294e-05, "loss": 1.2805, "step": 4497 }, { "epoch": 1.33971220610957, "grad_norm": 0.2229735404253006, "learning_rate": 1.943616785092848e-05, "loss": 1.2938, "step": 4498 }, { "epoch": 1.3400100523092389, "grad_norm": 0.22187785804271698, "learning_rate": 1.943584849715567e-05, "loss": 1.2912, "step": 4499 }, { "epoch": 1.3403078985089074, "grad_norm": 0.22360198199748993, "learning_rate": 1.943552905559283e-05, "loss": 1.3031, "step": 4500 }, { "epoch": 1.3403078985089074, "eval_loss": 1.3569642305374146, "eval_runtime": 21.1279, "eval_samples_per_second": 82.072, "eval_steps_per_second": 5.159, "step": 4500 }, { "epoch": 1.3406057447085762, "grad_norm": 0.22713731229305267, "learning_rate": 1.9435209526242933e-05, "loss": 1.2966, "step": 4501 }, { "epoch": 1.3409035909082447, "grad_norm": 0.2253570407629013, "learning_rate": 1.9434889909108952e-05, "loss": 1.2839, "step": 4502 }, { "epoch": 1.3412014371079133, "grad_norm": 0.2101002037525177, "learning_rate": 1.9434570204193863e-05, "loss": 1.2798, "step": 4503 }, { "epoch": 1.341499283307582, "grad_norm": 0.2120308130979538, "learning_rate": 1.9434250411500638e-05, "loss": 1.2644, "step": 4504 }, { "epoch": 1.3417971295072508, "grad_norm": 0.22912685573101044, "learning_rate": 1.9433930531032255e-05, "loss": 1.2782, "step": 4505 }, { "epoch": 1.3420949757069194, "grad_norm": 0.22854988276958466, "learning_rate": 1.943361056279169e-05, "loss": 1.2619, "step": 4506 }, { "epoch": 1.342392821906588, "grad_norm": 0.22772414982318878, "learning_rate": 1.9433290506781915e-05, "loss": 1.3025, "step": 4507 }, { "epoch": 1.3426906681062567, "grad_norm": 0.21109721064567566, "learning_rate": 1.9432970363005913e-05, "loss": 1.2858, "step": 4508 }, { "epoch": 1.3429885143059253, "grad_norm": 0.2292889803647995, "learning_rate": 1.943265013146666e-05, "loss": 1.2839, "step": 4509 }, { "epoch": 1.3432863605055938, "grad_norm": 0.23116926848888397, "learning_rate": 1.943232981216714e-05, "loss": 1.2975, "step": 4510 }, { "epoch": 1.3435842067052626, "grad_norm": 0.21780000627040863, "learning_rate": 1.9432009405110323e-05, "loss": 1.2765, "step": 4511 }, { "epoch": 1.3438820529049313, "grad_norm": 0.22388587892055511, "learning_rate": 1.9431688910299203e-05, "loss": 1.2856, "step": 4512 }, { "epoch": 1.3441798991046, "grad_norm": 0.231467604637146, "learning_rate": 1.943136832773675e-05, "loss": 1.283, "step": 4513 }, { "epoch": 1.3444777453042684, "grad_norm": 0.22428205609321594, "learning_rate": 1.9431047657425956e-05, "loss": 1.2697, "step": 4514 }, { "epoch": 1.3447755915039372, "grad_norm": 0.2137261927127838, "learning_rate": 1.94307268993698e-05, "loss": 1.2915, "step": 4515 }, { "epoch": 1.3450734377036058, "grad_norm": 0.2115289568901062, "learning_rate": 1.9430406053571265e-05, "loss": 1.274, "step": 4516 }, { "epoch": 1.3453712839032743, "grad_norm": 0.23327216506004333, "learning_rate": 1.9430085120033338e-05, "loss": 1.2839, "step": 4517 }, { "epoch": 1.345669130102943, "grad_norm": 0.22178760170936584, "learning_rate": 1.9429764098759007e-05, "loss": 1.2852, "step": 4518 }, { "epoch": 1.3459669763026119, "grad_norm": 0.21289120614528656, "learning_rate": 1.9429442989751255e-05, "loss": 1.2921, "step": 4519 }, { "epoch": 1.3462648225022804, "grad_norm": 0.22279572486877441, "learning_rate": 1.942912179301307e-05, "loss": 1.2878, "step": 4520 }, { "epoch": 1.346562668701949, "grad_norm": 0.21776123344898224, "learning_rate": 1.9428800508547444e-05, "loss": 1.282, "step": 4521 }, { "epoch": 1.3468605149016177, "grad_norm": 0.22095216810703278, "learning_rate": 1.9428479136357364e-05, "loss": 1.2896, "step": 4522 }, { "epoch": 1.3471583611012863, "grad_norm": 0.21695873141288757, "learning_rate": 1.9428157676445818e-05, "loss": 1.2797, "step": 4523 }, { "epoch": 1.3474562073009548, "grad_norm": 0.21062494814395905, "learning_rate": 1.9427836128815797e-05, "loss": 1.2861, "step": 4524 }, { "epoch": 1.3477540535006236, "grad_norm": 0.21845440566539764, "learning_rate": 1.9427514493470297e-05, "loss": 1.2738, "step": 4525 }, { "epoch": 1.3480518997002924, "grad_norm": 0.21795763075351715, "learning_rate": 1.942719277041231e-05, "loss": 1.2662, "step": 4526 }, { "epoch": 1.348349745899961, "grad_norm": 0.22721393406391144, "learning_rate": 1.9426870959644822e-05, "loss": 1.2803, "step": 4527 }, { "epoch": 1.3486475920996295, "grad_norm": 0.2117440104484558, "learning_rate": 1.9426549061170834e-05, "loss": 1.2939, "step": 4528 }, { "epoch": 1.3489454382992982, "grad_norm": 0.21315455436706543, "learning_rate": 1.942622707499334e-05, "loss": 1.2853, "step": 4529 }, { "epoch": 1.3492432844989668, "grad_norm": 0.21192117035388947, "learning_rate": 1.9425905001115332e-05, "loss": 1.2913, "step": 4530 }, { "epoch": 1.3495411306986356, "grad_norm": 0.215905100107193, "learning_rate": 1.9425582839539813e-05, "loss": 1.2862, "step": 4531 }, { "epoch": 1.3498389768983041, "grad_norm": 0.21604588627815247, "learning_rate": 1.9425260590269775e-05, "loss": 1.2671, "step": 4532 }, { "epoch": 1.350136823097973, "grad_norm": 0.20911955833435059, "learning_rate": 1.9424938253308217e-05, "loss": 1.2856, "step": 4533 }, { "epoch": 1.3504346692976414, "grad_norm": 0.20078156888484955, "learning_rate": 1.9424615828658138e-05, "loss": 1.2826, "step": 4534 }, { "epoch": 1.35073251549731, "grad_norm": 0.21429602801799774, "learning_rate": 1.942429331632254e-05, "loss": 1.2742, "step": 4535 }, { "epoch": 1.3510303616969788, "grad_norm": 0.2233225554227829, "learning_rate": 1.942397071630442e-05, "loss": 1.2664, "step": 4536 }, { "epoch": 1.3513282078966473, "grad_norm": 0.22216635942459106, "learning_rate": 1.9423648028606786e-05, "loss": 1.2922, "step": 4537 }, { "epoch": 1.351626054096316, "grad_norm": 0.2184571623802185, "learning_rate": 1.9423325253232632e-05, "loss": 1.2873, "step": 4538 }, { "epoch": 1.3519239002959846, "grad_norm": 0.20877744257450104, "learning_rate": 1.9423002390184967e-05, "loss": 1.2935, "step": 4539 }, { "epoch": 1.3522217464956534, "grad_norm": 0.21714043617248535, "learning_rate": 1.942267943946679e-05, "loss": 1.2808, "step": 4540 }, { "epoch": 1.352519592695322, "grad_norm": 0.21297597885131836, "learning_rate": 1.9422356401081107e-05, "loss": 1.2765, "step": 4541 }, { "epoch": 1.3528174388949905, "grad_norm": 0.2160167247056961, "learning_rate": 1.942203327503093e-05, "loss": 1.2935, "step": 4542 }, { "epoch": 1.3531152850946593, "grad_norm": 0.21878854930400848, "learning_rate": 1.9421710061319258e-05, "loss": 1.2857, "step": 4543 }, { "epoch": 1.3534131312943278, "grad_norm": 0.21440044045448303, "learning_rate": 1.9421386759949102e-05, "loss": 1.2822, "step": 4544 }, { "epoch": 1.3537109774939966, "grad_norm": 0.20837941765785217, "learning_rate": 1.9421063370923464e-05, "loss": 1.2863, "step": 4545 }, { "epoch": 1.3540088236936652, "grad_norm": 0.20805011689662933, "learning_rate": 1.9420739894245363e-05, "loss": 1.2725, "step": 4546 }, { "epoch": 1.354306669893334, "grad_norm": 0.23626692593097687, "learning_rate": 1.94204163299178e-05, "loss": 1.2867, "step": 4547 }, { "epoch": 1.3546045160930025, "grad_norm": 0.22463680803775787, "learning_rate": 1.942009267794379e-05, "loss": 1.2947, "step": 4548 }, { "epoch": 1.354902362292671, "grad_norm": 0.2230614572763443, "learning_rate": 1.9419768938326337e-05, "loss": 1.2658, "step": 4549 }, { "epoch": 1.3552002084923398, "grad_norm": 0.22742493450641632, "learning_rate": 1.941944511106846e-05, "loss": 1.3002, "step": 4550 }, { "epoch": 1.3554980546920083, "grad_norm": 0.21157263219356537, "learning_rate": 1.9419121196173175e-05, "loss": 1.2811, "step": 4551 }, { "epoch": 1.3557959008916771, "grad_norm": 0.23207604885101318, "learning_rate": 1.9418797193643488e-05, "loss": 1.2803, "step": 4552 }, { "epoch": 1.3560937470913457, "grad_norm": 0.22329147160053253, "learning_rate": 1.941847310348242e-05, "loss": 1.2926, "step": 4553 }, { "epoch": 1.3563915932910144, "grad_norm": 0.21609167754650116, "learning_rate": 1.9418148925692978e-05, "loss": 1.2772, "step": 4554 }, { "epoch": 1.356689439490683, "grad_norm": 0.21065406501293182, "learning_rate": 1.9417824660278182e-05, "loss": 1.2991, "step": 4555 }, { "epoch": 1.3569872856903515, "grad_norm": 0.21100889146327972, "learning_rate": 1.9417500307241054e-05, "loss": 1.2858, "step": 4556 }, { "epoch": 1.3572851318900203, "grad_norm": 0.22105230391025543, "learning_rate": 1.9417175866584605e-05, "loss": 1.2834, "step": 4557 }, { "epoch": 1.3575829780896889, "grad_norm": 0.2316344827413559, "learning_rate": 1.9416851338311854e-05, "loss": 1.2787, "step": 4558 }, { "epoch": 1.3578808242893576, "grad_norm": 0.21157117187976837, "learning_rate": 1.9416526722425826e-05, "loss": 1.2716, "step": 4559 }, { "epoch": 1.3581786704890262, "grad_norm": 0.21879585087299347, "learning_rate": 1.9416202018929537e-05, "loss": 1.2773, "step": 4560 }, { "epoch": 1.358476516688695, "grad_norm": 0.2180936336517334, "learning_rate": 1.9415877227826007e-05, "loss": 1.3113, "step": 4561 }, { "epoch": 1.3587743628883635, "grad_norm": 0.21200427412986755, "learning_rate": 1.9415552349118263e-05, "loss": 1.2925, "step": 4562 }, { "epoch": 1.359072209088032, "grad_norm": 0.22136510908603668, "learning_rate": 1.941522738280932e-05, "loss": 1.2855, "step": 4563 }, { "epoch": 1.3593700552877008, "grad_norm": 0.2183605581521988, "learning_rate": 1.9414902328902207e-05, "loss": 1.2752, "step": 4564 }, { "epoch": 1.3596679014873694, "grad_norm": 0.21506759524345398, "learning_rate": 1.9414577187399947e-05, "loss": 1.29, "step": 4565 }, { "epoch": 1.3599657476870382, "grad_norm": 0.215349942445755, "learning_rate": 1.9414251958305566e-05, "loss": 1.2857, "step": 4566 }, { "epoch": 1.3602635938867067, "grad_norm": 0.2266671359539032, "learning_rate": 1.9413926641622086e-05, "loss": 1.293, "step": 4567 }, { "epoch": 1.3605614400863755, "grad_norm": 0.218679279088974, "learning_rate": 1.9413601237352536e-05, "loss": 1.2942, "step": 4568 }, { "epoch": 1.360859286286044, "grad_norm": 0.2285957783460617, "learning_rate": 1.941327574549995e-05, "loss": 1.2833, "step": 4569 }, { "epoch": 1.3611571324857126, "grad_norm": 0.2153051793575287, "learning_rate": 1.9412950166067347e-05, "loss": 1.2817, "step": 4570 }, { "epoch": 1.3614549786853813, "grad_norm": 0.20250266790390015, "learning_rate": 1.9412624499057755e-05, "loss": 1.3068, "step": 4571 }, { "epoch": 1.3617528248850501, "grad_norm": 0.21537740528583527, "learning_rate": 1.9412298744474213e-05, "loss": 1.3089, "step": 4572 }, { "epoch": 1.3620506710847187, "grad_norm": 0.21063105762004852, "learning_rate": 1.9411972902319746e-05, "loss": 1.2839, "step": 4573 }, { "epoch": 1.3623485172843872, "grad_norm": 0.22086693346500397, "learning_rate": 1.9411646972597387e-05, "loss": 1.2801, "step": 4574 }, { "epoch": 1.362646363484056, "grad_norm": 0.22338207066059113, "learning_rate": 1.941132095531017e-05, "loss": 1.2862, "step": 4575 }, { "epoch": 1.3629442096837245, "grad_norm": 0.20457307994365692, "learning_rate": 1.9410994850461125e-05, "loss": 1.2701, "step": 4576 }, { "epoch": 1.363242055883393, "grad_norm": 0.20595508813858032, "learning_rate": 1.9410668658053286e-05, "loss": 1.273, "step": 4577 }, { "epoch": 1.3635399020830619, "grad_norm": 0.2182329148054123, "learning_rate": 1.941034237808969e-05, "loss": 1.2864, "step": 4578 }, { "epoch": 1.3638377482827306, "grad_norm": 0.21785813570022583, "learning_rate": 1.9410016010573373e-05, "loss": 1.2858, "step": 4579 }, { "epoch": 1.3641355944823992, "grad_norm": 0.21548466384410858, "learning_rate": 1.9409689555507373e-05, "loss": 1.2748, "step": 4580 }, { "epoch": 1.3644334406820677, "grad_norm": 0.2248479425907135, "learning_rate": 1.940936301289472e-05, "loss": 1.2742, "step": 4581 }, { "epoch": 1.3647312868817365, "grad_norm": 0.21730519831180573, "learning_rate": 1.940903638273846e-05, "loss": 1.2864, "step": 4582 }, { "epoch": 1.365029133081405, "grad_norm": 0.2234262079000473, "learning_rate": 1.9408709665041627e-05, "loss": 1.2886, "step": 4583 }, { "epoch": 1.3653269792810736, "grad_norm": 0.21113720536231995, "learning_rate": 1.9408382859807264e-05, "loss": 1.2748, "step": 4584 }, { "epoch": 1.3656248254807424, "grad_norm": 0.21111853420734406, "learning_rate": 1.940805596703841e-05, "loss": 1.2809, "step": 4585 }, { "epoch": 1.3659226716804111, "grad_norm": 0.21144357323646545, "learning_rate": 1.9407728986738107e-05, "loss": 1.2863, "step": 4586 }, { "epoch": 1.3662205178800797, "grad_norm": 0.20854952931404114, "learning_rate": 1.9407401918909394e-05, "loss": 1.279, "step": 4587 }, { "epoch": 1.3665183640797482, "grad_norm": 0.206075519323349, "learning_rate": 1.9407074763555317e-05, "loss": 1.2763, "step": 4588 }, { "epoch": 1.366816210279417, "grad_norm": 0.21549838781356812, "learning_rate": 1.9406747520678922e-05, "loss": 1.2895, "step": 4589 }, { "epoch": 1.3671140564790856, "grad_norm": 0.21204976737499237, "learning_rate": 1.9406420190283254e-05, "loss": 1.2996, "step": 4590 }, { "epoch": 1.3674119026787541, "grad_norm": 0.19928468763828278, "learning_rate": 1.940609277237135e-05, "loss": 1.2868, "step": 4591 }, { "epoch": 1.367709748878423, "grad_norm": 0.22234809398651123, "learning_rate": 1.9405765266946263e-05, "loss": 1.2798, "step": 4592 }, { "epoch": 1.3680075950780917, "grad_norm": 0.21840263903141022, "learning_rate": 1.9405437674011042e-05, "loss": 1.2761, "step": 4593 }, { "epoch": 1.3683054412777602, "grad_norm": 0.19964468479156494, "learning_rate": 1.940510999356873e-05, "loss": 1.2694, "step": 4594 }, { "epoch": 1.3686032874774288, "grad_norm": 0.21210229396820068, "learning_rate": 1.9404782225622376e-05, "loss": 1.3048, "step": 4595 }, { "epoch": 1.3689011336770975, "grad_norm": 0.20749431848526, "learning_rate": 1.940445437017503e-05, "loss": 1.2658, "step": 4596 }, { "epoch": 1.369198979876766, "grad_norm": 0.21786624193191528, "learning_rate": 1.9404126427229745e-05, "loss": 1.3007, "step": 4597 }, { "epoch": 1.3694968260764349, "grad_norm": 0.2164333164691925, "learning_rate": 1.9403798396789572e-05, "loss": 1.267, "step": 4598 }, { "epoch": 1.3697946722761034, "grad_norm": 0.22057907283306122, "learning_rate": 1.940347027885756e-05, "loss": 1.2628, "step": 4599 }, { "epoch": 1.3700925184757722, "grad_norm": 0.21018175780773163, "learning_rate": 1.9403142073436766e-05, "loss": 1.301, "step": 4600 }, { "epoch": 1.3703903646754407, "grad_norm": 0.224778413772583, "learning_rate": 1.9402813780530235e-05, "loss": 1.2823, "step": 4601 }, { "epoch": 1.3706882108751093, "grad_norm": 0.22259201109409332, "learning_rate": 1.9402485400141032e-05, "loss": 1.2918, "step": 4602 }, { "epoch": 1.370986057074778, "grad_norm": 0.2236838936805725, "learning_rate": 1.9402156932272205e-05, "loss": 1.2994, "step": 4603 }, { "epoch": 1.3712839032744466, "grad_norm": 0.22335247695446014, "learning_rate": 1.9401828376926813e-05, "loss": 1.286, "step": 4604 }, { "epoch": 1.3715817494741154, "grad_norm": 0.20863713324069977, "learning_rate": 1.9401499734107915e-05, "loss": 1.2834, "step": 4605 }, { "epoch": 1.371879595673784, "grad_norm": 0.205750972032547, "learning_rate": 1.940117100381856e-05, "loss": 1.2711, "step": 4606 }, { "epoch": 1.3721774418734527, "grad_norm": 0.21060913801193237, "learning_rate": 1.940084218606182e-05, "loss": 1.2817, "step": 4607 }, { "epoch": 1.3724752880731212, "grad_norm": 0.22086259722709656, "learning_rate": 1.9400513280840744e-05, "loss": 1.2882, "step": 4608 }, { "epoch": 1.3727731342727898, "grad_norm": 0.2112884372472763, "learning_rate": 1.9400184288158393e-05, "loss": 1.2789, "step": 4609 }, { "epoch": 1.3730709804724586, "grad_norm": 0.21684275567531586, "learning_rate": 1.9399855208017828e-05, "loss": 1.2947, "step": 4610 }, { "epoch": 1.3733688266721271, "grad_norm": 0.21600863337516785, "learning_rate": 1.9399526040422114e-05, "loss": 1.2779, "step": 4611 }, { "epoch": 1.373666672871796, "grad_norm": 0.20554274320602417, "learning_rate": 1.9399196785374313e-05, "loss": 1.2645, "step": 4612 }, { "epoch": 1.3739645190714644, "grad_norm": 0.21336773037910461, "learning_rate": 1.939886744287749e-05, "loss": 1.2764, "step": 4613 }, { "epoch": 1.3742623652711332, "grad_norm": 0.22521883249282837, "learning_rate": 1.9398538012934703e-05, "loss": 1.2894, "step": 4614 }, { "epoch": 1.3745602114708018, "grad_norm": 0.22535529732704163, "learning_rate": 1.9398208495549018e-05, "loss": 1.2914, "step": 4615 }, { "epoch": 1.3748580576704703, "grad_norm": 0.20711123943328857, "learning_rate": 1.9397878890723506e-05, "loss": 1.2747, "step": 4616 }, { "epoch": 1.375155903870139, "grad_norm": 0.21560992300510406, "learning_rate": 1.939754919846123e-05, "loss": 1.2903, "step": 4617 }, { "epoch": 1.3754537500698076, "grad_norm": 0.21865800023078918, "learning_rate": 1.9397219418765262e-05, "loss": 1.2694, "step": 4618 }, { "epoch": 1.3757515962694764, "grad_norm": 0.21390901505947113, "learning_rate": 1.939688955163866e-05, "loss": 1.2873, "step": 4619 }, { "epoch": 1.376049442469145, "grad_norm": 0.21455232799053192, "learning_rate": 1.9396559597084507e-05, "loss": 1.2844, "step": 4620 }, { "epoch": 1.3763472886688137, "grad_norm": 0.2182733714580536, "learning_rate": 1.9396229555105863e-05, "loss": 1.2884, "step": 4621 }, { "epoch": 1.3766451348684823, "grad_norm": 0.2096615880727768, "learning_rate": 1.93958994257058e-05, "loss": 1.2687, "step": 4622 }, { "epoch": 1.3769429810681508, "grad_norm": 0.21721862256526947, "learning_rate": 1.9395569208887388e-05, "loss": 1.2844, "step": 4623 }, { "epoch": 1.3772408272678196, "grad_norm": 0.22202299535274506, "learning_rate": 1.9395238904653706e-05, "loss": 1.2824, "step": 4624 }, { "epoch": 1.3775386734674882, "grad_norm": 0.21815629303455353, "learning_rate": 1.9394908513007823e-05, "loss": 1.2884, "step": 4625 }, { "epoch": 1.377836519667157, "grad_norm": 0.23223276436328888, "learning_rate": 1.939457803395281e-05, "loss": 1.3044, "step": 4626 }, { "epoch": 1.3781343658668255, "grad_norm": 0.22536700963974, "learning_rate": 1.9394247467491744e-05, "loss": 1.2686, "step": 4627 }, { "epoch": 1.3784322120664942, "grad_norm": 0.23456823825836182, "learning_rate": 1.9393916813627704e-05, "loss": 1.2788, "step": 4628 }, { "epoch": 1.3787300582661628, "grad_norm": 0.22492651641368866, "learning_rate": 1.9393586072363765e-05, "loss": 1.2945, "step": 4629 }, { "epoch": 1.3790279044658313, "grad_norm": 0.21123461425304413, "learning_rate": 1.9393255243702997e-05, "loss": 1.2868, "step": 4630 }, { "epoch": 1.3793257506655001, "grad_norm": 0.2123865783214569, "learning_rate": 1.9392924327648486e-05, "loss": 1.2751, "step": 4631 }, { "epoch": 1.3796235968651687, "grad_norm": 0.21001622080802917, "learning_rate": 1.939259332420331e-05, "loss": 1.2917, "step": 4632 }, { "epoch": 1.3799214430648374, "grad_norm": 0.21975718438625336, "learning_rate": 1.939226223337055e-05, "loss": 1.2732, "step": 4633 }, { "epoch": 1.380219289264506, "grad_norm": 0.22425499558448792, "learning_rate": 1.9391931055153278e-05, "loss": 1.264, "step": 4634 }, { "epoch": 1.3805171354641748, "grad_norm": 0.21482954919338226, "learning_rate": 1.9391599789554582e-05, "loss": 1.2774, "step": 4635 }, { "epoch": 1.3808149816638433, "grad_norm": 0.20815294981002808, "learning_rate": 1.9391268436577543e-05, "loss": 1.2884, "step": 4636 }, { "epoch": 1.3811128278635119, "grad_norm": 0.2155391275882721, "learning_rate": 1.9390936996225247e-05, "loss": 1.2755, "step": 4637 }, { "epoch": 1.3814106740631806, "grad_norm": 0.22542138397693634, "learning_rate": 1.939060546850077e-05, "loss": 1.2994, "step": 4638 }, { "epoch": 1.3817085202628494, "grad_norm": 0.22146451473236084, "learning_rate": 1.9390273853407205e-05, "loss": 1.2763, "step": 4639 }, { "epoch": 1.382006366462518, "grad_norm": 0.2203773856163025, "learning_rate": 1.938994215094763e-05, "loss": 1.2944, "step": 4640 }, { "epoch": 1.3823042126621865, "grad_norm": 0.21654027700424194, "learning_rate": 1.9389610361125133e-05, "loss": 1.2726, "step": 4641 }, { "epoch": 1.3826020588618553, "grad_norm": 0.22399553656578064, "learning_rate": 1.938927848394281e-05, "loss": 1.2695, "step": 4642 }, { "epoch": 1.3828999050615238, "grad_norm": 0.22837962210178375, "learning_rate": 1.9388946519403733e-05, "loss": 1.3121, "step": 4643 }, { "epoch": 1.3831977512611924, "grad_norm": 0.22870467603206635, "learning_rate": 1.9388614467511003e-05, "loss": 1.2727, "step": 4644 }, { "epoch": 1.3834955974608611, "grad_norm": 0.23329536616802216, "learning_rate": 1.9388282328267703e-05, "loss": 1.277, "step": 4645 }, { "epoch": 1.38379344366053, "grad_norm": 0.22212761640548706, "learning_rate": 1.9387950101676925e-05, "loss": 1.2938, "step": 4646 }, { "epoch": 1.3840912898601985, "grad_norm": 0.21830856800079346, "learning_rate": 1.938761778774176e-05, "loss": 1.2708, "step": 4647 }, { "epoch": 1.384389136059867, "grad_norm": 0.38767769932746887, "learning_rate": 1.93872853864653e-05, "loss": 1.2788, "step": 4648 }, { "epoch": 1.3846869822595358, "grad_norm": 0.22776472568511963, "learning_rate": 1.938695289785064e-05, "loss": 1.2966, "step": 4649 }, { "epoch": 1.3849848284592043, "grad_norm": 0.2177894413471222, "learning_rate": 1.9386620321900868e-05, "loss": 1.2894, "step": 4650 }, { "epoch": 1.385282674658873, "grad_norm": 0.21724140644073486, "learning_rate": 1.9386287658619083e-05, "loss": 1.2736, "step": 4651 }, { "epoch": 1.3855805208585417, "grad_norm": 0.22220289707183838, "learning_rate": 1.9385954908008377e-05, "loss": 1.3074, "step": 4652 }, { "epoch": 1.3858783670582104, "grad_norm": 0.23475584387779236, "learning_rate": 1.938562207007185e-05, "loss": 1.2722, "step": 4653 }, { "epoch": 1.386176213257879, "grad_norm": 0.2162562906742096, "learning_rate": 1.938528914481259e-05, "loss": 1.283, "step": 4654 }, { "epoch": 1.3864740594575475, "grad_norm": 0.21465593576431274, "learning_rate": 1.9384956132233706e-05, "loss": 1.2622, "step": 4655 }, { "epoch": 1.3867719056572163, "grad_norm": 0.2208995521068573, "learning_rate": 1.938462303233829e-05, "loss": 1.2848, "step": 4656 }, { "epoch": 1.3870697518568849, "grad_norm": 0.21462784707546234, "learning_rate": 1.938428984512944e-05, "loss": 1.284, "step": 4657 }, { "epoch": 1.3873675980565536, "grad_norm": 0.20261240005493164, "learning_rate": 1.938395657061026e-05, "loss": 1.2906, "step": 4658 }, { "epoch": 1.3876654442562222, "grad_norm": 0.2288421094417572, "learning_rate": 1.9383623208783845e-05, "loss": 1.2687, "step": 4659 }, { "epoch": 1.387963290455891, "grad_norm": 0.21072624623775482, "learning_rate": 1.9383289759653304e-05, "loss": 1.296, "step": 4660 }, { "epoch": 1.3882611366555595, "grad_norm": 0.22929874062538147, "learning_rate": 1.938295622322173e-05, "loss": 1.287, "step": 4661 }, { "epoch": 1.388558982855228, "grad_norm": 0.22117933630943298, "learning_rate": 1.9382622599492237e-05, "loss": 1.2825, "step": 4662 }, { "epoch": 1.3888568290548968, "grad_norm": 0.2204248011112213, "learning_rate": 1.938228888846792e-05, "loss": 1.2946, "step": 4663 }, { "epoch": 1.3891546752545654, "grad_norm": 0.21365541219711304, "learning_rate": 1.938195509015189e-05, "loss": 1.2806, "step": 4664 }, { "epoch": 1.3894525214542341, "grad_norm": 0.22503085434436798, "learning_rate": 1.938162120454725e-05, "loss": 1.2987, "step": 4665 }, { "epoch": 1.3897503676539027, "grad_norm": 0.22421522438526154, "learning_rate": 1.9381287231657105e-05, "loss": 1.2927, "step": 4666 }, { "epoch": 1.3900482138535715, "grad_norm": 0.21837452054023743, "learning_rate": 1.9380953171484566e-05, "loss": 1.2814, "step": 4667 }, { "epoch": 1.39034606005324, "grad_norm": 0.21142376959323883, "learning_rate": 1.9380619024032734e-05, "loss": 1.2951, "step": 4668 }, { "epoch": 1.3906439062529086, "grad_norm": 0.2263711541891098, "learning_rate": 1.938028478930473e-05, "loss": 1.2858, "step": 4669 }, { "epoch": 1.3909417524525773, "grad_norm": 0.21461977064609528, "learning_rate": 1.937995046730365e-05, "loss": 1.2786, "step": 4670 }, { "epoch": 1.391239598652246, "grad_norm": 0.21937595307826996, "learning_rate": 1.937961605803261e-05, "loss": 1.2999, "step": 4671 }, { "epoch": 1.3915374448519147, "grad_norm": 0.205740287899971, "learning_rate": 1.9379281561494726e-05, "loss": 1.294, "step": 4672 }, { "epoch": 1.3918352910515832, "grad_norm": 0.22866885364055634, "learning_rate": 1.9378946977693106e-05, "loss": 1.2815, "step": 4673 }, { "epoch": 1.392133137251252, "grad_norm": 0.21737025678157806, "learning_rate": 1.937861230663086e-05, "loss": 1.2841, "step": 4674 }, { "epoch": 1.3924309834509205, "grad_norm": 0.21780544519424438, "learning_rate": 1.937827754831111e-05, "loss": 1.2705, "step": 4675 }, { "epoch": 1.392728829650589, "grad_norm": 0.21779504418373108, "learning_rate": 1.937794270273696e-05, "loss": 1.2892, "step": 4676 }, { "epoch": 1.3930266758502579, "grad_norm": 0.21866978704929352, "learning_rate": 1.9377607769911534e-05, "loss": 1.2663, "step": 4677 }, { "epoch": 1.3933245220499264, "grad_norm": 0.22655676305294037, "learning_rate": 1.9377272749837944e-05, "loss": 1.2791, "step": 4678 }, { "epoch": 1.3936223682495952, "grad_norm": 0.20959815382957458, "learning_rate": 1.9376937642519307e-05, "loss": 1.2854, "step": 4679 }, { "epoch": 1.3939202144492637, "grad_norm": 0.2104579508304596, "learning_rate": 1.9376602447958747e-05, "loss": 1.2846, "step": 4680 }, { "epoch": 1.3942180606489325, "grad_norm": 0.21385684609413147, "learning_rate": 1.937626716615937e-05, "loss": 1.2871, "step": 4681 }, { "epoch": 1.394515906848601, "grad_norm": 0.20603463053703308, "learning_rate": 1.9375931797124306e-05, "loss": 1.2704, "step": 4682 }, { "epoch": 1.3948137530482696, "grad_norm": 0.22123649716377258, "learning_rate": 1.9375596340856673e-05, "loss": 1.2796, "step": 4683 }, { "epoch": 1.3951115992479384, "grad_norm": 0.21366514265537262, "learning_rate": 1.937526079735959e-05, "loss": 1.2814, "step": 4684 }, { "epoch": 1.395409445447607, "grad_norm": 0.2190808802843094, "learning_rate": 1.937492516663618e-05, "loss": 1.2726, "step": 4685 }, { "epoch": 1.3957072916472757, "grad_norm": 0.21271266043186188, "learning_rate": 1.9374589448689567e-05, "loss": 1.2728, "step": 4686 }, { "epoch": 1.3960051378469442, "grad_norm": 0.22519513964653015, "learning_rate": 1.937425364352287e-05, "loss": 1.2888, "step": 4687 }, { "epoch": 1.396302984046613, "grad_norm": 0.21729159355163574, "learning_rate": 1.937391775113922e-05, "loss": 1.2858, "step": 4688 }, { "epoch": 1.3966008302462816, "grad_norm": 0.2038884460926056, "learning_rate": 1.9373581771541737e-05, "loss": 1.2831, "step": 4689 }, { "epoch": 1.3968986764459501, "grad_norm": 0.20512959361076355, "learning_rate": 1.937324570473355e-05, "loss": 1.2724, "step": 4690 }, { "epoch": 1.3971965226456189, "grad_norm": 0.22382956743240356, "learning_rate": 1.937290955071778e-05, "loss": 1.3006, "step": 4691 }, { "epoch": 1.3974943688452874, "grad_norm": 0.21378615498542786, "learning_rate": 1.937257330949756e-05, "loss": 1.2821, "step": 4692 }, { "epoch": 1.3977922150449562, "grad_norm": 0.22013157606124878, "learning_rate": 1.937223698107602e-05, "loss": 1.2914, "step": 4693 }, { "epoch": 1.3980900612446248, "grad_norm": 0.21234650909900665, "learning_rate": 1.937190056545628e-05, "loss": 1.274, "step": 4694 }, { "epoch": 1.3983879074442935, "grad_norm": 0.20825199782848358, "learning_rate": 1.9371564062641482e-05, "loss": 1.2729, "step": 4695 }, { "epoch": 1.398685753643962, "grad_norm": 0.2126312106847763, "learning_rate": 1.937122747263475e-05, "loss": 1.2862, "step": 4696 }, { "epoch": 1.3989835998436306, "grad_norm": 0.22088484466075897, "learning_rate": 1.9370890795439215e-05, "loss": 1.2713, "step": 4697 }, { "epoch": 1.3992814460432994, "grad_norm": 0.2233424335718155, "learning_rate": 1.9370554031058013e-05, "loss": 1.2773, "step": 4698 }, { "epoch": 1.3995792922429682, "grad_norm": 0.21901319921016693, "learning_rate": 1.9370217179494274e-05, "loss": 1.2768, "step": 4699 }, { "epoch": 1.3998771384426367, "grad_norm": 0.2273193895816803, "learning_rate": 1.9369880240751132e-05, "loss": 1.2725, "step": 4700 }, { "epoch": 1.4001749846423053, "grad_norm": 0.2120569795370102, "learning_rate": 1.9369543214831725e-05, "loss": 1.2689, "step": 4701 }, { "epoch": 1.400472830841974, "grad_norm": 0.2282368242740631, "learning_rate": 1.9369206101739184e-05, "loss": 1.2655, "step": 4702 }, { "epoch": 1.4007706770416426, "grad_norm": 0.21750496327877045, "learning_rate": 1.936886890147665e-05, "loss": 1.3106, "step": 4703 }, { "epoch": 1.4010685232413111, "grad_norm": 0.21052533388137817, "learning_rate": 1.936853161404726e-05, "loss": 1.2659, "step": 4704 }, { "epoch": 1.40136636944098, "grad_norm": 0.21183964610099792, "learning_rate": 1.9368194239454146e-05, "loss": 1.2824, "step": 4705 }, { "epoch": 1.4016642156406487, "grad_norm": 0.20848579704761505, "learning_rate": 1.9367856777700455e-05, "loss": 1.2781, "step": 4706 }, { "epoch": 1.4019620618403172, "grad_norm": 0.2138061374425888, "learning_rate": 1.936751922878932e-05, "loss": 1.2687, "step": 4707 }, { "epoch": 1.4022599080399858, "grad_norm": 0.2122078537940979, "learning_rate": 1.936718159272389e-05, "loss": 1.2923, "step": 4708 }, { "epoch": 1.4025577542396546, "grad_norm": 0.2128915786743164, "learning_rate": 1.9366843869507296e-05, "loss": 1.2808, "step": 4709 }, { "epoch": 1.4028556004393231, "grad_norm": 0.22548538446426392, "learning_rate": 1.9366506059142688e-05, "loss": 1.2771, "step": 4710 }, { "epoch": 1.4031534466389917, "grad_norm": 0.21388880908489227, "learning_rate": 1.9366168161633206e-05, "loss": 1.284, "step": 4711 }, { "epoch": 1.4034512928386604, "grad_norm": 0.21711203455924988, "learning_rate": 1.9365830176981994e-05, "loss": 1.2683, "step": 4712 }, { "epoch": 1.4037491390383292, "grad_norm": 0.22926999628543854, "learning_rate": 1.9365492105192193e-05, "loss": 1.2879, "step": 4713 }, { "epoch": 1.4040469852379978, "grad_norm": 0.22222432494163513, "learning_rate": 1.936515394626695e-05, "loss": 1.2813, "step": 4714 }, { "epoch": 1.4043448314376663, "grad_norm": 0.3137796223163605, "learning_rate": 1.9364815700209417e-05, "loss": 1.2679, "step": 4715 }, { "epoch": 1.404642677637335, "grad_norm": 0.26504454016685486, "learning_rate": 1.9364477367022738e-05, "loss": 1.2887, "step": 4716 }, { "epoch": 1.4049405238370036, "grad_norm": 0.23855175077915192, "learning_rate": 1.9364138946710057e-05, "loss": 1.2706, "step": 4717 }, { "epoch": 1.4052383700366722, "grad_norm": 0.2073981761932373, "learning_rate": 1.9363800439274528e-05, "loss": 1.2718, "step": 4718 }, { "epoch": 1.405536216236341, "grad_norm": 0.23830677568912506, "learning_rate": 1.9363461844719292e-05, "loss": 1.2737, "step": 4719 }, { "epoch": 1.4058340624360097, "grad_norm": 0.24615485966205597, "learning_rate": 1.936312316304751e-05, "loss": 1.283, "step": 4720 }, { "epoch": 1.4061319086356783, "grad_norm": 0.2343778759241104, "learning_rate": 1.9362784394262327e-05, "loss": 1.2938, "step": 4721 }, { "epoch": 1.4064297548353468, "grad_norm": 0.2170053869485855, "learning_rate": 1.93624455383669e-05, "loss": 1.2877, "step": 4722 }, { "epoch": 1.4067276010350156, "grad_norm": 0.23598410189151764, "learning_rate": 1.9362106595364373e-05, "loss": 1.3068, "step": 4723 }, { "epoch": 1.4070254472346841, "grad_norm": 0.20887908339500427, "learning_rate": 1.9361767565257904e-05, "loss": 1.2805, "step": 4724 }, { "epoch": 1.407323293434353, "grad_norm": 0.20937970280647278, "learning_rate": 1.9361428448050645e-05, "loss": 1.3008, "step": 4725 }, { "epoch": 1.4076211396340215, "grad_norm": 0.2293359786272049, "learning_rate": 1.9361089243745755e-05, "loss": 1.2878, "step": 4726 }, { "epoch": 1.4079189858336902, "grad_norm": 0.22540301084518433, "learning_rate": 1.9360749952346393e-05, "loss": 1.256, "step": 4727 }, { "epoch": 1.4082168320333588, "grad_norm": 0.22422142326831818, "learning_rate": 1.9360410573855707e-05, "loss": 1.2824, "step": 4728 }, { "epoch": 1.4085146782330273, "grad_norm": 0.21460947394371033, "learning_rate": 1.936007110827686e-05, "loss": 1.2958, "step": 4729 }, { "epoch": 1.4088125244326961, "grad_norm": 0.2221807837486267, "learning_rate": 1.9359731555613012e-05, "loss": 1.2928, "step": 4730 }, { "epoch": 1.4091103706323647, "grad_norm": 0.23458969593048096, "learning_rate": 1.9359391915867315e-05, "loss": 1.2903, "step": 4731 }, { "epoch": 1.4094082168320334, "grad_norm": 0.22301706671714783, "learning_rate": 1.9359052189042932e-05, "loss": 1.2735, "step": 4732 }, { "epoch": 1.409706063031702, "grad_norm": 0.21750102937221527, "learning_rate": 1.9358712375143026e-05, "loss": 1.2828, "step": 4733 }, { "epoch": 1.4100039092313708, "grad_norm": 0.24715521931648254, "learning_rate": 1.9358372474170763e-05, "loss": 1.2917, "step": 4734 }, { "epoch": 1.4103017554310393, "grad_norm": 0.21691952645778656, "learning_rate": 1.9358032486129296e-05, "loss": 1.2782, "step": 4735 }, { "epoch": 1.4105996016307079, "grad_norm": 0.23581166565418243, "learning_rate": 1.935769241102179e-05, "loss": 1.2858, "step": 4736 }, { "epoch": 1.4108974478303766, "grad_norm": 0.221417635679245, "learning_rate": 1.935735224885141e-05, "loss": 1.3084, "step": 4737 }, { "epoch": 1.4111952940300452, "grad_norm": 0.21256232261657715, "learning_rate": 1.9357011999621326e-05, "loss": 1.2831, "step": 4738 }, { "epoch": 1.411493140229714, "grad_norm": 0.21626178920269012, "learning_rate": 1.9356671663334697e-05, "loss": 1.2882, "step": 4739 }, { "epoch": 1.4117909864293825, "grad_norm": 0.21899373829364777, "learning_rate": 1.9356331239994698e-05, "loss": 1.2654, "step": 4740 }, { "epoch": 1.4120888326290513, "grad_norm": 0.23099561035633087, "learning_rate": 1.9355990729604482e-05, "loss": 1.2698, "step": 4741 }, { "epoch": 1.4123866788287198, "grad_norm": 0.21576914191246033, "learning_rate": 1.9355650132167228e-05, "loss": 1.2881, "step": 4742 }, { "epoch": 1.4126845250283884, "grad_norm": 0.21933506429195404, "learning_rate": 1.9355309447686107e-05, "loss": 1.2817, "step": 4743 }, { "epoch": 1.4129823712280571, "grad_norm": 0.23537559807300568, "learning_rate": 1.935496867616428e-05, "loss": 1.2902, "step": 4744 }, { "epoch": 1.4132802174277257, "grad_norm": 0.21506574749946594, "learning_rate": 1.9354627817604922e-05, "loss": 1.2804, "step": 4745 }, { "epoch": 1.4135780636273945, "grad_norm": 0.20947204530239105, "learning_rate": 1.93542868720112e-05, "loss": 1.272, "step": 4746 }, { "epoch": 1.413875909827063, "grad_norm": 0.21577829122543335, "learning_rate": 1.9353945839386297e-05, "loss": 1.294, "step": 4747 }, { "epoch": 1.4141737560267318, "grad_norm": 0.22076302766799927, "learning_rate": 1.9353604719733373e-05, "loss": 1.2829, "step": 4748 }, { "epoch": 1.4144716022264003, "grad_norm": 0.2247573882341385, "learning_rate": 1.935326351305561e-05, "loss": 1.273, "step": 4749 }, { "epoch": 1.4147694484260689, "grad_norm": 0.2100122720003128, "learning_rate": 1.935292221935618e-05, "loss": 1.2947, "step": 4750 }, { "epoch": 1.4150672946257377, "grad_norm": 0.2099503129720688, "learning_rate": 1.9352580838638258e-05, "loss": 1.2953, "step": 4751 }, { "epoch": 1.4153651408254062, "grad_norm": 0.21044223010540009, "learning_rate": 1.9352239370905025e-05, "loss": 1.2674, "step": 4752 }, { "epoch": 1.415662987025075, "grad_norm": 0.2194964736700058, "learning_rate": 1.935189781615965e-05, "loss": 1.2783, "step": 4753 }, { "epoch": 1.4159608332247435, "grad_norm": 0.21921008825302124, "learning_rate": 1.935155617440531e-05, "loss": 1.2807, "step": 4754 }, { "epoch": 1.4162586794244123, "grad_norm": 0.2217089831829071, "learning_rate": 1.9351214445645193e-05, "loss": 1.2916, "step": 4755 }, { "epoch": 1.4165565256240809, "grad_norm": 0.2105427384376526, "learning_rate": 1.935087262988247e-05, "loss": 1.2699, "step": 4756 }, { "epoch": 1.4168543718237494, "grad_norm": 0.2272225320339203, "learning_rate": 1.935053072712033e-05, "loss": 1.2817, "step": 4757 }, { "epoch": 1.4171522180234182, "grad_norm": 0.22575689852237701, "learning_rate": 1.9350188737361947e-05, "loss": 1.2978, "step": 4758 }, { "epoch": 1.4174500642230867, "grad_norm": 0.2152029424905777, "learning_rate": 1.93498466606105e-05, "loss": 1.2723, "step": 4759 }, { "epoch": 1.4177479104227555, "grad_norm": 0.22033043205738068, "learning_rate": 1.9349504496869177e-05, "loss": 1.2911, "step": 4760 }, { "epoch": 1.418045756622424, "grad_norm": 0.21459448337554932, "learning_rate": 1.9349162246141165e-05, "loss": 1.282, "step": 4761 }, { "epoch": 1.4183436028220928, "grad_norm": 0.2216905653476715, "learning_rate": 1.934881990842964e-05, "loss": 1.2976, "step": 4762 }, { "epoch": 1.4186414490217614, "grad_norm": 0.21749120950698853, "learning_rate": 1.9348477483737792e-05, "loss": 1.2664, "step": 4763 }, { "epoch": 1.41893929522143, "grad_norm": 0.2163579910993576, "learning_rate": 1.934813497206881e-05, "loss": 1.2757, "step": 4764 }, { "epoch": 1.4192371414210987, "grad_norm": 0.2084561586380005, "learning_rate": 1.934779237342587e-05, "loss": 1.2762, "step": 4765 }, { "epoch": 1.4195349876207675, "grad_norm": 0.22256968915462494, "learning_rate": 1.934744968781217e-05, "loss": 1.2815, "step": 4766 }, { "epoch": 1.419832833820436, "grad_norm": 0.2200809270143509, "learning_rate": 1.934710691523089e-05, "loss": 1.3122, "step": 4767 }, { "epoch": 1.4201306800201046, "grad_norm": 0.23718221485614777, "learning_rate": 1.9346764055685224e-05, "loss": 1.2699, "step": 4768 }, { "epoch": 1.4204285262197733, "grad_norm": 0.20865055918693542, "learning_rate": 1.9346421109178365e-05, "loss": 1.2726, "step": 4769 }, { "epoch": 1.4207263724194419, "grad_norm": 0.21131010353565216, "learning_rate": 1.9346078075713498e-05, "loss": 1.2908, "step": 4770 }, { "epoch": 1.4210242186191104, "grad_norm": 0.22388875484466553, "learning_rate": 1.9345734955293817e-05, "loss": 1.2942, "step": 4771 }, { "epoch": 1.4213220648187792, "grad_norm": 0.2043249011039734, "learning_rate": 1.9345391747922515e-05, "loss": 1.2744, "step": 4772 }, { "epoch": 1.421619911018448, "grad_norm": 0.22854197025299072, "learning_rate": 1.9345048453602782e-05, "loss": 1.2865, "step": 4773 }, { "epoch": 1.4219177572181165, "grad_norm": 0.20839452743530273, "learning_rate": 1.9344705072337815e-05, "loss": 1.2639, "step": 4774 }, { "epoch": 1.422215603417785, "grad_norm": 0.21528539061546326, "learning_rate": 1.9344361604130807e-05, "loss": 1.2805, "step": 4775 }, { "epoch": 1.4225134496174539, "grad_norm": 0.20581011474132538, "learning_rate": 1.9344018048984955e-05, "loss": 1.2673, "step": 4776 }, { "epoch": 1.4228112958171224, "grad_norm": 0.22256039083003998, "learning_rate": 1.9343674406903455e-05, "loss": 1.2769, "step": 4777 }, { "epoch": 1.423109142016791, "grad_norm": 0.22472919523715973, "learning_rate": 1.9343330677889504e-05, "loss": 1.2981, "step": 4778 }, { "epoch": 1.4234069882164597, "grad_norm": 0.21348482370376587, "learning_rate": 1.9342986861946303e-05, "loss": 1.278, "step": 4779 }, { "epoch": 1.4237048344161285, "grad_norm": 0.21071229875087738, "learning_rate": 1.9342642959077044e-05, "loss": 1.2877, "step": 4780 }, { "epoch": 1.424002680615797, "grad_norm": 0.2112603634595871, "learning_rate": 1.9342298969284932e-05, "loss": 1.2761, "step": 4781 }, { "epoch": 1.4243005268154656, "grad_norm": 0.21463853120803833, "learning_rate": 1.9341954892573165e-05, "loss": 1.2662, "step": 4782 }, { "epoch": 1.4245983730151344, "grad_norm": 0.20784740149974823, "learning_rate": 1.9341610728944945e-05, "loss": 1.2803, "step": 4783 }, { "epoch": 1.424896219214803, "grad_norm": 0.21706542372703552, "learning_rate": 1.9341266478403474e-05, "loss": 1.2885, "step": 4784 }, { "epoch": 1.4251940654144715, "grad_norm": 0.2044822871685028, "learning_rate": 1.9340922140951954e-05, "loss": 1.2825, "step": 4785 }, { "epoch": 1.4254919116141402, "grad_norm": 0.21804824471473694, "learning_rate": 1.9340577716593593e-05, "loss": 1.2635, "step": 4786 }, { "epoch": 1.425789757813809, "grad_norm": 0.2229154109954834, "learning_rate": 1.934023320533159e-05, "loss": 1.2888, "step": 4787 }, { "epoch": 1.4260876040134776, "grad_norm": 0.23313355445861816, "learning_rate": 1.9339888607169152e-05, "loss": 1.2746, "step": 4788 }, { "epoch": 1.4263854502131461, "grad_norm": 0.2325923591852188, "learning_rate": 1.9339543922109487e-05, "loss": 1.2875, "step": 4789 }, { "epoch": 1.4266832964128149, "grad_norm": 0.21830318868160248, "learning_rate": 1.9339199150155804e-05, "loss": 1.2793, "step": 4790 }, { "epoch": 1.4269811426124834, "grad_norm": 0.21931092441082, "learning_rate": 1.93388542913113e-05, "loss": 1.2778, "step": 4791 }, { "epoch": 1.4272789888121522, "grad_norm": 0.2196022868156433, "learning_rate": 1.9338509345579196e-05, "loss": 1.2791, "step": 4792 }, { "epoch": 1.4275768350118208, "grad_norm": 0.2201681137084961, "learning_rate": 1.9338164312962694e-05, "loss": 1.2953, "step": 4793 }, { "epoch": 1.4278746812114895, "grad_norm": 0.2099919468164444, "learning_rate": 1.9337819193465007e-05, "loss": 1.2716, "step": 4794 }, { "epoch": 1.428172527411158, "grad_norm": 0.21924568712711334, "learning_rate": 1.9337473987089346e-05, "loss": 1.2851, "step": 4795 }, { "epoch": 1.4284703736108266, "grad_norm": 0.21668729186058044, "learning_rate": 1.933712869383892e-05, "loss": 1.2812, "step": 4796 }, { "epoch": 1.4287682198104954, "grad_norm": 0.20736163854599, "learning_rate": 1.9336783313716946e-05, "loss": 1.2678, "step": 4797 }, { "epoch": 1.429066066010164, "grad_norm": 0.20503593981266022, "learning_rate": 1.9336437846726634e-05, "loss": 1.2802, "step": 4798 }, { "epoch": 1.4293639122098327, "grad_norm": 0.23220981657505035, "learning_rate": 1.9336092292871197e-05, "loss": 1.2623, "step": 4799 }, { "epoch": 1.4296617584095013, "grad_norm": 0.22154389321804047, "learning_rate": 1.9335746652153856e-05, "loss": 1.2771, "step": 4800 }, { "epoch": 1.42995960460917, "grad_norm": 0.2188727706670761, "learning_rate": 1.933540092457782e-05, "loss": 1.2755, "step": 4801 }, { "epoch": 1.4302574508088386, "grad_norm": 0.22202670574188232, "learning_rate": 1.933505511014631e-05, "loss": 1.2695, "step": 4802 }, { "epoch": 1.4305552970085071, "grad_norm": 0.21409845352172852, "learning_rate": 1.9334709208862537e-05, "loss": 1.2808, "step": 4803 }, { "epoch": 1.430853143208176, "grad_norm": 0.21846888959407806, "learning_rate": 1.9334363220729733e-05, "loss": 1.268, "step": 4804 }, { "epoch": 1.4311509894078445, "grad_norm": 0.22371357679367065, "learning_rate": 1.9334017145751102e-05, "loss": 1.2848, "step": 4805 }, { "epoch": 1.4314488356075132, "grad_norm": 0.21155185997486115, "learning_rate": 1.9333670983929872e-05, "loss": 1.2774, "step": 4806 }, { "epoch": 1.4317466818071818, "grad_norm": 0.21358896791934967, "learning_rate": 1.933332473526926e-05, "loss": 1.2747, "step": 4807 }, { "epoch": 1.4320445280068506, "grad_norm": 0.21508902311325073, "learning_rate": 1.9332978399772493e-05, "loss": 1.2974, "step": 4808 }, { "epoch": 1.432342374206519, "grad_norm": 0.22894176840782166, "learning_rate": 1.9332631977442787e-05, "loss": 1.275, "step": 4809 }, { "epoch": 1.4326402204061877, "grad_norm": 0.21540142595767975, "learning_rate": 1.9332285468283368e-05, "loss": 1.2794, "step": 4810 }, { "epoch": 1.4329380666058564, "grad_norm": 0.21178120374679565, "learning_rate": 1.9331938872297454e-05, "loss": 1.2854, "step": 4811 }, { "epoch": 1.433235912805525, "grad_norm": 0.21985602378845215, "learning_rate": 1.9331592189488285e-05, "loss": 1.2958, "step": 4812 }, { "epoch": 1.4335337590051938, "grad_norm": 0.21311499178409576, "learning_rate": 1.933124541985907e-05, "loss": 1.2702, "step": 4813 }, { "epoch": 1.4338316052048623, "grad_norm": 0.21507811546325684, "learning_rate": 1.933089856341304e-05, "loss": 1.2707, "step": 4814 }, { "epoch": 1.434129451404531, "grad_norm": 0.22037610411643982, "learning_rate": 1.933055162015343e-05, "loss": 1.2806, "step": 4815 }, { "epoch": 1.4344272976041996, "grad_norm": 0.21492549777030945, "learning_rate": 1.9330204590083457e-05, "loss": 1.2773, "step": 4816 }, { "epoch": 1.4347251438038682, "grad_norm": 0.21301521360874176, "learning_rate": 1.9329857473206355e-05, "loss": 1.2853, "step": 4817 }, { "epoch": 1.435022990003537, "grad_norm": 0.2319442629814148, "learning_rate": 1.9329510269525358e-05, "loss": 1.2878, "step": 4818 }, { "epoch": 1.4353208362032055, "grad_norm": 0.21806570887565613, "learning_rate": 1.9329162979043687e-05, "loss": 1.2735, "step": 4819 }, { "epoch": 1.4356186824028743, "grad_norm": 0.21726150810718536, "learning_rate": 1.9328815601764577e-05, "loss": 1.3067, "step": 4820 }, { "epoch": 1.4359165286025428, "grad_norm": 0.2200784981250763, "learning_rate": 1.9328468137691266e-05, "loss": 1.303, "step": 4821 }, { "epoch": 1.4362143748022116, "grad_norm": 0.21538648009300232, "learning_rate": 1.9328120586826977e-05, "loss": 1.2801, "step": 4822 }, { "epoch": 1.4365122210018801, "grad_norm": 0.22035753726959229, "learning_rate": 1.9327772949174948e-05, "loss": 1.2773, "step": 4823 }, { "epoch": 1.4368100672015487, "grad_norm": 0.21646147966384888, "learning_rate": 1.9327425224738413e-05, "loss": 1.2709, "step": 4824 }, { "epoch": 1.4371079134012175, "grad_norm": 0.22500768303871155, "learning_rate": 1.932707741352061e-05, "loss": 1.2805, "step": 4825 }, { "epoch": 1.437405759600886, "grad_norm": 0.2312770038843155, "learning_rate": 1.9326729515524772e-05, "loss": 1.2925, "step": 4826 }, { "epoch": 1.4377036058005548, "grad_norm": 0.22772246599197388, "learning_rate": 1.9326381530754134e-05, "loss": 1.2726, "step": 4827 }, { "epoch": 1.4380014520002233, "grad_norm": 0.2224140763282776, "learning_rate": 1.932603345921194e-05, "loss": 1.2772, "step": 4828 }, { "epoch": 1.438299298199892, "grad_norm": 0.22427061200141907, "learning_rate": 1.932568530090142e-05, "loss": 1.3016, "step": 4829 }, { "epoch": 1.4385971443995607, "grad_norm": 0.21568401157855988, "learning_rate": 1.9325337055825818e-05, "loss": 1.2926, "step": 4830 }, { "epoch": 1.4388949905992292, "grad_norm": 0.22633923590183258, "learning_rate": 1.9324988723988377e-05, "loss": 1.2857, "step": 4831 }, { "epoch": 1.439192836798898, "grad_norm": 0.22848960757255554, "learning_rate": 1.932464030539233e-05, "loss": 1.2615, "step": 4832 }, { "epoch": 1.4394906829985668, "grad_norm": 0.21749244630336761, "learning_rate": 1.9324291800040927e-05, "loss": 1.2802, "step": 4833 }, { "epoch": 1.4397885291982353, "grad_norm": 0.2067568451166153, "learning_rate": 1.9323943207937404e-05, "loss": 1.2853, "step": 4834 }, { "epoch": 1.4400863753979039, "grad_norm": 0.21011200547218323, "learning_rate": 1.9323594529085005e-05, "loss": 1.2806, "step": 4835 }, { "epoch": 1.4403842215975726, "grad_norm": 0.21135860681533813, "learning_rate": 1.9323245763486982e-05, "loss": 1.2732, "step": 4836 }, { "epoch": 1.4406820677972412, "grad_norm": 0.21184229850769043, "learning_rate": 1.932289691114657e-05, "loss": 1.2818, "step": 4837 }, { "epoch": 1.4409799139969097, "grad_norm": 0.22322779893875122, "learning_rate": 1.9322547972067016e-05, "loss": 1.2677, "step": 4838 }, { "epoch": 1.4412777601965785, "grad_norm": 0.21357326209545135, "learning_rate": 1.9322198946251572e-05, "loss": 1.2705, "step": 4839 }, { "epoch": 1.4415756063962473, "grad_norm": 0.21662479639053345, "learning_rate": 1.9321849833703484e-05, "loss": 1.2731, "step": 4840 }, { "epoch": 1.4418734525959158, "grad_norm": 0.22164317965507507, "learning_rate": 1.9321500634425995e-05, "loss": 1.2734, "step": 4841 }, { "epoch": 1.4421712987955844, "grad_norm": 0.273386687040329, "learning_rate": 1.9321151348422358e-05, "loss": 1.289, "step": 4842 }, { "epoch": 1.4424691449952531, "grad_norm": 0.22557945549488068, "learning_rate": 1.932080197569582e-05, "loss": 1.2689, "step": 4843 }, { "epoch": 1.4427669911949217, "grad_norm": 0.214786559343338, "learning_rate": 1.932045251624964e-05, "loss": 1.2862, "step": 4844 }, { "epoch": 1.4430648373945902, "grad_norm": 0.2258441299200058, "learning_rate": 1.9320102970087055e-05, "loss": 1.286, "step": 4845 }, { "epoch": 1.443362683594259, "grad_norm": 0.21301913261413574, "learning_rate": 1.9319753337211327e-05, "loss": 1.2759, "step": 4846 }, { "epoch": 1.4436605297939278, "grad_norm": 0.21130412817001343, "learning_rate": 1.9319403617625707e-05, "loss": 1.2989, "step": 4847 }, { "epoch": 1.4439583759935963, "grad_norm": 0.2133743315935135, "learning_rate": 1.931905381133345e-05, "loss": 1.2857, "step": 4848 }, { "epoch": 1.4442562221932649, "grad_norm": 0.21618391573429108, "learning_rate": 1.9318703918337807e-05, "loss": 1.2911, "step": 4849 }, { "epoch": 1.4445540683929337, "grad_norm": 0.21383893489837646, "learning_rate": 1.9318353938642037e-05, "loss": 1.2873, "step": 4850 }, { "epoch": 1.4448519145926022, "grad_norm": 0.21067416667938232, "learning_rate": 1.9318003872249398e-05, "loss": 1.2893, "step": 4851 }, { "epoch": 1.4451497607922708, "grad_norm": 0.21318969130516052, "learning_rate": 1.9317653719163137e-05, "loss": 1.2833, "step": 4852 }, { "epoch": 1.4454476069919395, "grad_norm": 0.21748173236846924, "learning_rate": 1.9317303479386523e-05, "loss": 1.2693, "step": 4853 }, { "epoch": 1.4457454531916083, "grad_norm": 0.22137188911437988, "learning_rate": 1.931695315292281e-05, "loss": 1.2914, "step": 4854 }, { "epoch": 1.4460432993912768, "grad_norm": 0.22523652017116547, "learning_rate": 1.9316602739775255e-05, "loss": 1.2787, "step": 4855 }, { "epoch": 1.4463411455909454, "grad_norm": 0.2076699584722519, "learning_rate": 1.9316252239947123e-05, "loss": 1.265, "step": 4856 }, { "epoch": 1.4466389917906142, "grad_norm": 0.2129327356815338, "learning_rate": 1.9315901653441672e-05, "loss": 1.271, "step": 4857 }, { "epoch": 1.4469368379902827, "grad_norm": 0.21853697299957275, "learning_rate": 1.931555098026216e-05, "loss": 1.2663, "step": 4858 }, { "epoch": 1.4472346841899515, "grad_norm": 0.25575652718544006, "learning_rate": 1.9315200220411862e-05, "loss": 1.2753, "step": 4859 }, { "epoch": 1.44753253038962, "grad_norm": 0.2197348028421402, "learning_rate": 1.931484937389403e-05, "loss": 1.2602, "step": 4860 }, { "epoch": 1.4478303765892888, "grad_norm": 0.23306891322135925, "learning_rate": 1.9314498440711928e-05, "loss": 1.2772, "step": 4861 }, { "epoch": 1.4481282227889574, "grad_norm": 0.2237478792667389, "learning_rate": 1.931414742086883e-05, "loss": 1.283, "step": 4862 }, { "epoch": 1.448426068988626, "grad_norm": 0.21376194059848785, "learning_rate": 1.9313796314367995e-05, "loss": 1.2757, "step": 4863 }, { "epoch": 1.4487239151882947, "grad_norm": 0.2342880517244339, "learning_rate": 1.9313445121212692e-05, "loss": 1.2813, "step": 4864 }, { "epoch": 1.4490217613879632, "grad_norm": 0.21437346935272217, "learning_rate": 1.9313093841406186e-05, "loss": 1.2851, "step": 4865 }, { "epoch": 1.449319607587632, "grad_norm": 0.2158830463886261, "learning_rate": 1.9312742474951747e-05, "loss": 1.2785, "step": 4866 }, { "epoch": 1.4496174537873006, "grad_norm": 0.20946374535560608, "learning_rate": 1.9312391021852644e-05, "loss": 1.28, "step": 4867 }, { "epoch": 1.4499152999869693, "grad_norm": 0.21851618587970734, "learning_rate": 1.9312039482112147e-05, "loss": 1.2771, "step": 4868 }, { "epoch": 1.4502131461866379, "grad_norm": 0.23499397933483124, "learning_rate": 1.931168785573353e-05, "loss": 1.3018, "step": 4869 }, { "epoch": 1.4505109923863064, "grad_norm": 0.2345985770225525, "learning_rate": 1.931133614272006e-05, "loss": 1.2812, "step": 4870 }, { "epoch": 1.4508088385859752, "grad_norm": 0.21713724732398987, "learning_rate": 1.9310984343075006e-05, "loss": 1.2889, "step": 4871 }, { "epoch": 1.4511066847856438, "grad_norm": 0.2307348996400833, "learning_rate": 1.931063245680165e-05, "loss": 1.2784, "step": 4872 }, { "epoch": 1.4514045309853125, "grad_norm": 0.22369886934757233, "learning_rate": 1.931028048390326e-05, "loss": 1.2824, "step": 4873 }, { "epoch": 1.451702377184981, "grad_norm": 0.2125367820262909, "learning_rate": 1.930992842438311e-05, "loss": 1.2762, "step": 4874 }, { "epoch": 1.4520002233846498, "grad_norm": 0.23226135969161987, "learning_rate": 1.930957627824448e-05, "loss": 1.2888, "step": 4875 }, { "epoch": 1.4522980695843184, "grad_norm": 0.22335903346538544, "learning_rate": 1.9309224045490643e-05, "loss": 1.2783, "step": 4876 }, { "epoch": 1.452595915783987, "grad_norm": 0.21399128437042236, "learning_rate": 1.9308871726124877e-05, "loss": 1.2811, "step": 4877 }, { "epoch": 1.4528937619836557, "grad_norm": 0.2137995958328247, "learning_rate": 1.9308519320150463e-05, "loss": 1.283, "step": 4878 }, { "epoch": 1.4531916081833243, "grad_norm": 0.21936951577663422, "learning_rate": 1.9308166827570674e-05, "loss": 1.2782, "step": 4879 }, { "epoch": 1.453489454382993, "grad_norm": 0.22406361997127533, "learning_rate": 1.9307814248388793e-05, "loss": 1.293, "step": 4880 }, { "epoch": 1.4537873005826616, "grad_norm": 0.21134215593338013, "learning_rate": 1.93074615826081e-05, "loss": 1.2849, "step": 4881 }, { "epoch": 1.4540851467823304, "grad_norm": 0.2193523794412613, "learning_rate": 1.9307108830231878e-05, "loss": 1.2924, "step": 4882 }, { "epoch": 1.454382992981999, "grad_norm": 0.21330083906650543, "learning_rate": 1.9306755991263403e-05, "loss": 1.2797, "step": 4883 }, { "epoch": 1.4546808391816675, "grad_norm": 0.21060821413993835, "learning_rate": 1.9306403065705965e-05, "loss": 1.288, "step": 4884 }, { "epoch": 1.4549786853813362, "grad_norm": 0.25724872946739197, "learning_rate": 1.930605005356284e-05, "loss": 1.2907, "step": 4885 }, { "epoch": 1.4552765315810048, "grad_norm": 0.20756061375141144, "learning_rate": 1.930569695483732e-05, "loss": 1.2775, "step": 4886 }, { "epoch": 1.4555743777806736, "grad_norm": 0.21263720095157623, "learning_rate": 1.9305343769532686e-05, "loss": 1.2834, "step": 4887 }, { "epoch": 1.455872223980342, "grad_norm": 0.21779103577136993, "learning_rate": 1.9304990497652224e-05, "loss": 1.2674, "step": 4888 }, { "epoch": 1.4561700701800109, "grad_norm": 0.21574203670024872, "learning_rate": 1.9304637139199225e-05, "loss": 1.2655, "step": 4889 }, { "epoch": 1.4564679163796794, "grad_norm": 0.21916961669921875, "learning_rate": 1.930428369417697e-05, "loss": 1.2925, "step": 4890 }, { "epoch": 1.456765762579348, "grad_norm": 0.21140176057815552, "learning_rate": 1.930393016258875e-05, "loss": 1.288, "step": 4891 }, { "epoch": 1.4570636087790168, "grad_norm": 0.2188664972782135, "learning_rate": 1.9303576544437854e-05, "loss": 1.2685, "step": 4892 }, { "epoch": 1.4573614549786853, "grad_norm": 0.22733062505722046, "learning_rate": 1.9303222839727575e-05, "loss": 1.2931, "step": 4893 }, { "epoch": 1.457659301178354, "grad_norm": 0.21536576747894287, "learning_rate": 1.93028690484612e-05, "loss": 1.296, "step": 4894 }, { "epoch": 1.4579571473780226, "grad_norm": 0.2092432826757431, "learning_rate": 1.9302515170642026e-05, "loss": 1.2639, "step": 4895 }, { "epoch": 1.4582549935776914, "grad_norm": 0.22613051533699036, "learning_rate": 1.930216120627334e-05, "loss": 1.2741, "step": 4896 }, { "epoch": 1.45855283977736, "grad_norm": 0.22264140844345093, "learning_rate": 1.9301807155358437e-05, "loss": 1.2829, "step": 4897 }, { "epoch": 1.4588506859770285, "grad_norm": 0.23095308244228363, "learning_rate": 1.9301453017900608e-05, "loss": 1.279, "step": 4898 }, { "epoch": 1.4591485321766973, "grad_norm": 0.2456885427236557, "learning_rate": 1.9301098793903153e-05, "loss": 1.277, "step": 4899 }, { "epoch": 1.459446378376366, "grad_norm": 0.2145146131515503, "learning_rate": 1.9300744483369363e-05, "loss": 1.2852, "step": 4900 }, { "epoch": 1.4597442245760346, "grad_norm": 0.2277066856622696, "learning_rate": 1.9300390086302542e-05, "loss": 1.2805, "step": 4901 }, { "epoch": 1.4600420707757031, "grad_norm": 0.2226993590593338, "learning_rate": 1.930003560270598e-05, "loss": 1.2858, "step": 4902 }, { "epoch": 1.460339916975372, "grad_norm": 0.2158169001340866, "learning_rate": 1.9299681032582978e-05, "loss": 1.2857, "step": 4903 }, { "epoch": 1.4606377631750405, "grad_norm": 0.22127355635166168, "learning_rate": 1.929932637593683e-05, "loss": 1.2768, "step": 4904 }, { "epoch": 1.460935609374709, "grad_norm": 0.2199540138244629, "learning_rate": 1.9298971632770844e-05, "loss": 1.2587, "step": 4905 }, { "epoch": 1.4612334555743778, "grad_norm": 0.2244499772787094, "learning_rate": 1.9298616803088318e-05, "loss": 1.2593, "step": 4906 }, { "epoch": 1.4615313017740466, "grad_norm": 0.2019004076719284, "learning_rate": 1.9298261886892547e-05, "loss": 1.2638, "step": 4907 }, { "epoch": 1.461829147973715, "grad_norm": 0.2197563350200653, "learning_rate": 1.929790688418684e-05, "loss": 1.2976, "step": 4908 }, { "epoch": 1.4621269941733837, "grad_norm": 0.229153573513031, "learning_rate": 1.92975517949745e-05, "loss": 1.2667, "step": 4909 }, { "epoch": 1.4624248403730524, "grad_norm": 0.2259681075811386, "learning_rate": 1.9297196619258826e-05, "loss": 1.2761, "step": 4910 }, { "epoch": 1.462722686572721, "grad_norm": 0.22611136734485626, "learning_rate": 1.9296841357043124e-05, "loss": 1.2703, "step": 4911 }, { "epoch": 1.4630205327723895, "grad_norm": 0.2154085338115692, "learning_rate": 1.92964860083307e-05, "loss": 1.2735, "step": 4912 }, { "epoch": 1.4633183789720583, "grad_norm": 0.2122606784105301, "learning_rate": 1.9296130573124862e-05, "loss": 1.2843, "step": 4913 }, { "epoch": 1.463616225171727, "grad_norm": 0.23453332483768463, "learning_rate": 1.9295775051428914e-05, "loss": 1.2977, "step": 4914 }, { "epoch": 1.4639140713713956, "grad_norm": 0.2286519557237625, "learning_rate": 1.929541944324617e-05, "loss": 1.2814, "step": 4915 }, { "epoch": 1.4642119175710642, "grad_norm": 0.23382240533828735, "learning_rate": 1.929506374857993e-05, "loss": 1.2839, "step": 4916 }, { "epoch": 1.464509763770733, "grad_norm": 0.21530047059059143, "learning_rate": 1.9294707967433503e-05, "loss": 1.2797, "step": 4917 }, { "epoch": 1.4648076099704015, "grad_norm": 0.22245480120182037, "learning_rate": 1.9294352099810207e-05, "loss": 1.2932, "step": 4918 }, { "epoch": 1.46510545617007, "grad_norm": 0.23918482661247253, "learning_rate": 1.9293996145713348e-05, "loss": 1.2843, "step": 4919 }, { "epoch": 1.4654033023697388, "grad_norm": 0.2126484215259552, "learning_rate": 1.929364010514624e-05, "loss": 1.2931, "step": 4920 }, { "epoch": 1.4657011485694076, "grad_norm": 0.2203528881072998, "learning_rate": 1.929328397811219e-05, "loss": 1.263, "step": 4921 }, { "epoch": 1.4659989947690761, "grad_norm": 0.21826790273189545, "learning_rate": 1.929292776461452e-05, "loss": 1.2683, "step": 4922 }, { "epoch": 1.4662968409687447, "grad_norm": 0.2075348049402237, "learning_rate": 1.9292571464656538e-05, "loss": 1.271, "step": 4923 }, { "epoch": 1.4665946871684135, "grad_norm": 0.23244334757328033, "learning_rate": 1.9292215078241564e-05, "loss": 1.3111, "step": 4924 }, { "epoch": 1.466892533368082, "grad_norm": 0.2200518697500229, "learning_rate": 1.929185860537291e-05, "loss": 1.2752, "step": 4925 }, { "epoch": 1.4671903795677508, "grad_norm": 0.2329683154821396, "learning_rate": 1.9291502046053888e-05, "loss": 1.2694, "step": 4926 }, { "epoch": 1.4674882257674193, "grad_norm": 0.22009992599487305, "learning_rate": 1.9291145400287824e-05, "loss": 1.2882, "step": 4927 }, { "epoch": 1.467786071967088, "grad_norm": 0.21958021819591522, "learning_rate": 1.9290788668078032e-05, "loss": 1.3023, "step": 4928 }, { "epoch": 1.4680839181667567, "grad_norm": 0.22004231810569763, "learning_rate": 1.9290431849427834e-05, "loss": 1.2749, "step": 4929 }, { "epoch": 1.4683817643664252, "grad_norm": 0.21612748503684998, "learning_rate": 1.9290074944340545e-05, "loss": 1.2767, "step": 4930 }, { "epoch": 1.468679610566094, "grad_norm": 0.21939951181411743, "learning_rate": 1.9289717952819487e-05, "loss": 1.2978, "step": 4931 }, { "epoch": 1.4689774567657625, "grad_norm": 0.21950466930866241, "learning_rate": 1.9289360874867987e-05, "loss": 1.2717, "step": 4932 }, { "epoch": 1.4692753029654313, "grad_norm": 0.21353593468666077, "learning_rate": 1.928900371048936e-05, "loss": 1.2974, "step": 4933 }, { "epoch": 1.4695731491650998, "grad_norm": 0.22219714522361755, "learning_rate": 1.928864645968693e-05, "loss": 1.2641, "step": 4934 }, { "epoch": 1.4698709953647686, "grad_norm": 0.21758060157299042, "learning_rate": 1.9288289122464026e-05, "loss": 1.2755, "step": 4935 }, { "epoch": 1.4701688415644372, "grad_norm": 0.2182830423116684, "learning_rate": 1.9287931698823964e-05, "loss": 1.2664, "step": 4936 }, { "epoch": 1.4704666877641057, "grad_norm": 0.22116537392139435, "learning_rate": 1.9287574188770078e-05, "loss": 1.2765, "step": 4937 }, { "epoch": 1.4707645339637745, "grad_norm": 0.19969645142555237, "learning_rate": 1.9287216592305692e-05, "loss": 1.2877, "step": 4938 }, { "epoch": 1.471062380163443, "grad_norm": 0.22789418697357178, "learning_rate": 1.928685890943413e-05, "loss": 1.2806, "step": 4939 }, { "epoch": 1.4713602263631118, "grad_norm": 0.21748170256614685, "learning_rate": 1.9286501140158727e-05, "loss": 1.2716, "step": 4940 }, { "epoch": 1.4716580725627804, "grad_norm": 0.23064807057380676, "learning_rate": 1.92861432844828e-05, "loss": 1.2832, "step": 4941 }, { "epoch": 1.4719559187624491, "grad_norm": 0.25822216272354126, "learning_rate": 1.9285785342409687e-05, "loss": 1.2671, "step": 4942 }, { "epoch": 1.4722537649621177, "grad_norm": 0.24876217544078827, "learning_rate": 1.9285427313942717e-05, "loss": 1.299, "step": 4943 }, { "epoch": 1.4725516111617862, "grad_norm": 0.22561317682266235, "learning_rate": 1.928506919908522e-05, "loss": 1.2657, "step": 4944 }, { "epoch": 1.472849457361455, "grad_norm": 0.2996424436569214, "learning_rate": 1.928471099784053e-05, "loss": 1.295, "step": 4945 }, { "epoch": 1.4731473035611236, "grad_norm": 0.23014278709888458, "learning_rate": 1.928435271021197e-05, "loss": 1.2757, "step": 4946 }, { "epoch": 1.4734451497607923, "grad_norm": 0.23713409900665283, "learning_rate": 1.9283994336202888e-05, "loss": 1.2852, "step": 4947 }, { "epoch": 1.4737429959604609, "grad_norm": 0.22407498955726624, "learning_rate": 1.928363587581661e-05, "loss": 1.2806, "step": 4948 }, { "epoch": 1.4740408421601296, "grad_norm": 0.21705381572246552, "learning_rate": 1.928327732905647e-05, "loss": 1.2663, "step": 4949 }, { "epoch": 1.4743386883597982, "grad_norm": 0.23391960561275482, "learning_rate": 1.928291869592581e-05, "loss": 1.2617, "step": 4950 }, { "epoch": 1.4746365345594667, "grad_norm": 0.20994536578655243, "learning_rate": 1.9282559976427962e-05, "loss": 1.2909, "step": 4951 }, { "epoch": 1.4749343807591355, "grad_norm": 0.2166874259710312, "learning_rate": 1.9282201170566265e-05, "loss": 1.2705, "step": 4952 }, { "epoch": 1.475232226958804, "grad_norm": 0.21621519327163696, "learning_rate": 1.9281842278344053e-05, "loss": 1.2761, "step": 4953 }, { "epoch": 1.4755300731584728, "grad_norm": 0.22492529451847076, "learning_rate": 1.928148329976467e-05, "loss": 1.2766, "step": 4954 }, { "epoch": 1.4758279193581414, "grad_norm": 0.22125613689422607, "learning_rate": 1.9281124234831458e-05, "loss": 1.2636, "step": 4955 }, { "epoch": 1.4761257655578102, "grad_norm": 0.22913573682308197, "learning_rate": 1.9280765083547753e-05, "loss": 1.2956, "step": 4956 }, { "epoch": 1.4764236117574787, "grad_norm": 0.21469125151634216, "learning_rate": 1.9280405845916896e-05, "loss": 1.2854, "step": 4957 }, { "epoch": 1.4767214579571473, "grad_norm": 0.20909914374351501, "learning_rate": 1.928004652194223e-05, "loss": 1.2871, "step": 4958 }, { "epoch": 1.477019304156816, "grad_norm": 0.2232624590396881, "learning_rate": 1.9279687111627107e-05, "loss": 1.2907, "step": 4959 }, { "epoch": 1.4773171503564846, "grad_norm": 0.22202903032302856, "learning_rate": 1.927932761497486e-05, "loss": 1.2755, "step": 4960 }, { "epoch": 1.4776149965561534, "grad_norm": 0.2193203866481781, "learning_rate": 1.9278968031988835e-05, "loss": 1.2709, "step": 4961 }, { "epoch": 1.477912842755822, "grad_norm": 0.22183677554130554, "learning_rate": 1.9278608362672376e-05, "loss": 1.2912, "step": 4962 }, { "epoch": 1.4782106889554907, "grad_norm": 0.22596612572669983, "learning_rate": 1.927824860702884e-05, "loss": 1.2716, "step": 4963 }, { "epoch": 1.4785085351551592, "grad_norm": 0.21625077724456787, "learning_rate": 1.9277888765061562e-05, "loss": 1.2833, "step": 4964 }, { "epoch": 1.4788063813548278, "grad_norm": 0.241022989153862, "learning_rate": 1.92775288367739e-05, "loss": 1.2767, "step": 4965 }, { "epoch": 1.4791042275544966, "grad_norm": 0.21953821182250977, "learning_rate": 1.9277168822169194e-05, "loss": 1.2712, "step": 4966 }, { "epoch": 1.4794020737541653, "grad_norm": 0.21410316228866577, "learning_rate": 1.9276808721250798e-05, "loss": 1.2912, "step": 4967 }, { "epoch": 1.4796999199538339, "grad_norm": 0.21265076100826263, "learning_rate": 1.9276448534022057e-05, "loss": 1.2726, "step": 4968 }, { "epoch": 1.4799977661535024, "grad_norm": 0.22445210814476013, "learning_rate": 1.9276088260486335e-05, "loss": 1.2824, "step": 4969 }, { "epoch": 1.4802956123531712, "grad_norm": 0.21892309188842773, "learning_rate": 1.927572790064697e-05, "loss": 1.2789, "step": 4970 }, { "epoch": 1.4805934585528397, "grad_norm": 0.2171856313943863, "learning_rate": 1.9275367454507324e-05, "loss": 1.2932, "step": 4971 }, { "epoch": 1.4808913047525083, "grad_norm": 0.21604934334754944, "learning_rate": 1.9275006922070743e-05, "loss": 1.2669, "step": 4972 }, { "epoch": 1.481189150952177, "grad_norm": 0.2263178676366806, "learning_rate": 1.9274646303340587e-05, "loss": 1.278, "step": 4973 }, { "epoch": 1.4814869971518458, "grad_norm": 0.22390708327293396, "learning_rate": 1.927428559832021e-05, "loss": 1.2886, "step": 4974 }, { "epoch": 1.4817848433515144, "grad_norm": 0.21440237760543823, "learning_rate": 1.9273924807012966e-05, "loss": 1.2955, "step": 4975 }, { "epoch": 1.482082689551183, "grad_norm": 0.215138778090477, "learning_rate": 1.9273563929422216e-05, "loss": 1.2796, "step": 4976 }, { "epoch": 1.4823805357508517, "grad_norm": 0.21976342797279358, "learning_rate": 1.927320296555131e-05, "loss": 1.2593, "step": 4977 }, { "epoch": 1.4826783819505203, "grad_norm": 0.21736851334571838, "learning_rate": 1.9272841915403612e-05, "loss": 1.28, "step": 4978 }, { "epoch": 1.4829762281501888, "grad_norm": 0.21572822332382202, "learning_rate": 1.9272480778982484e-05, "loss": 1.2929, "step": 4979 }, { "epoch": 1.4832740743498576, "grad_norm": 0.22615979611873627, "learning_rate": 1.927211955629128e-05, "loss": 1.2785, "step": 4980 }, { "epoch": 1.4835719205495264, "grad_norm": 0.2202790528535843, "learning_rate": 1.9271758247333362e-05, "loss": 1.2781, "step": 4981 }, { "epoch": 1.483869766749195, "grad_norm": 0.22541899979114532, "learning_rate": 1.9271396852112094e-05, "loss": 1.2638, "step": 4982 }, { "epoch": 1.4841676129488635, "grad_norm": 0.22249038517475128, "learning_rate": 1.9271035370630838e-05, "loss": 1.2735, "step": 4983 }, { "epoch": 1.4844654591485322, "grad_norm": 0.25325918197631836, "learning_rate": 1.9270673802892954e-05, "loss": 1.2839, "step": 4984 }, { "epoch": 1.4847633053482008, "grad_norm": 0.21583291888237, "learning_rate": 1.927031214890181e-05, "loss": 1.2605, "step": 4985 }, { "epoch": 1.4850611515478696, "grad_norm": 0.23504725098609924, "learning_rate": 1.9269950408660766e-05, "loss": 1.2756, "step": 4986 }, { "epoch": 1.485358997747538, "grad_norm": 0.2224356085062027, "learning_rate": 1.926958858217319e-05, "loss": 1.2851, "step": 4987 }, { "epoch": 1.4856568439472069, "grad_norm": 0.23366491496562958, "learning_rate": 1.926922666944245e-05, "loss": 1.2689, "step": 4988 }, { "epoch": 1.4859546901468754, "grad_norm": 0.2041642814874649, "learning_rate": 1.9268864670471914e-05, "loss": 1.2746, "step": 4989 }, { "epoch": 1.486252536346544, "grad_norm": 0.21593394875526428, "learning_rate": 1.9268502585264946e-05, "loss": 1.2777, "step": 4990 }, { "epoch": 1.4865503825462127, "grad_norm": 0.21973749995231628, "learning_rate": 1.9268140413824915e-05, "loss": 1.2757, "step": 4991 }, { "epoch": 1.4868482287458813, "grad_norm": 0.22880889475345612, "learning_rate": 1.9267778156155198e-05, "loss": 1.2888, "step": 4992 }, { "epoch": 1.48714607494555, "grad_norm": 0.2150540053844452, "learning_rate": 1.9267415812259157e-05, "loss": 1.2677, "step": 4993 }, { "epoch": 1.4874439211452186, "grad_norm": 0.23506714403629303, "learning_rate": 1.9267053382140166e-05, "loss": 1.2886, "step": 4994 }, { "epoch": 1.4877417673448874, "grad_norm": 0.22009918093681335, "learning_rate": 1.9266690865801597e-05, "loss": 1.2688, "step": 4995 }, { "epoch": 1.488039613544556, "grad_norm": 0.229282408952713, "learning_rate": 1.9266328263246824e-05, "loss": 1.2643, "step": 4996 }, { "epoch": 1.4883374597442245, "grad_norm": 0.21965311467647552, "learning_rate": 1.9265965574479218e-05, "loss": 1.2814, "step": 4997 }, { "epoch": 1.4886353059438933, "grad_norm": 0.2273985892534256, "learning_rate": 1.9265602799502154e-05, "loss": 1.293, "step": 4998 }, { "epoch": 1.4889331521435618, "grad_norm": 0.22290019690990448, "learning_rate": 1.926523993831901e-05, "loss": 1.2965, "step": 4999 }, { "epoch": 1.4892309983432306, "grad_norm": 0.20859937369823456, "learning_rate": 1.9264876990933156e-05, "loss": 1.284, "step": 5000 }, { "epoch": 1.4892309983432306, "eval_loss": 1.350777506828308, "eval_runtime": 21.2077, "eval_samples_per_second": 81.763, "eval_steps_per_second": 5.14, "step": 5000 }, { "epoch": 1.4895288445428991, "grad_norm": 0.2520102262496948, "learning_rate": 1.9264513957347978e-05, "loss": 1.2765, "step": 5001 }, { "epoch": 1.489826690742568, "grad_norm": 0.23392906785011292, "learning_rate": 1.9264150837566847e-05, "loss": 1.2827, "step": 5002 }, { "epoch": 1.4901245369422365, "grad_norm": 0.22240546345710754, "learning_rate": 1.9263787631593144e-05, "loss": 1.2861, "step": 5003 }, { "epoch": 1.490422383141905, "grad_norm": 0.2256360948085785, "learning_rate": 1.9263424339430244e-05, "loss": 1.2811, "step": 5004 }, { "epoch": 1.4907202293415738, "grad_norm": 0.2463768869638443, "learning_rate": 1.926306096108153e-05, "loss": 1.2646, "step": 5005 }, { "epoch": 1.4910180755412423, "grad_norm": 0.22218027710914612, "learning_rate": 1.9262697496550388e-05, "loss": 1.2816, "step": 5006 }, { "epoch": 1.491315921740911, "grad_norm": 0.2227209061384201, "learning_rate": 1.926233394584019e-05, "loss": 1.2874, "step": 5007 }, { "epoch": 1.4916137679405796, "grad_norm": 0.22535954415798187, "learning_rate": 1.9261970308954326e-05, "loss": 1.2805, "step": 5008 }, { "epoch": 1.4919116141402484, "grad_norm": 0.21909268200397491, "learning_rate": 1.9261606585896174e-05, "loss": 1.2841, "step": 5009 }, { "epoch": 1.492209460339917, "grad_norm": 0.22318336367607117, "learning_rate": 1.9261242776669123e-05, "loss": 1.2798, "step": 5010 }, { "epoch": 1.4925073065395855, "grad_norm": 0.2382373809814453, "learning_rate": 1.9260878881276555e-05, "loss": 1.2983, "step": 5011 }, { "epoch": 1.4928051527392543, "grad_norm": 0.21087752282619476, "learning_rate": 1.9260514899721854e-05, "loss": 1.2786, "step": 5012 }, { "epoch": 1.4931029989389228, "grad_norm": 0.2340303212404251, "learning_rate": 1.9260150832008408e-05, "loss": 1.2782, "step": 5013 }, { "epoch": 1.4934008451385916, "grad_norm": 0.21488603949546814, "learning_rate": 1.9259786678139605e-05, "loss": 1.3042, "step": 5014 }, { "epoch": 1.4936986913382602, "grad_norm": 0.23216792941093445, "learning_rate": 1.9259422438118835e-05, "loss": 1.2624, "step": 5015 }, { "epoch": 1.493996537537929, "grad_norm": 0.221286341547966, "learning_rate": 1.9259058111949483e-05, "loss": 1.286, "step": 5016 }, { "epoch": 1.4942943837375975, "grad_norm": 0.22693414986133575, "learning_rate": 1.9258693699634937e-05, "loss": 1.275, "step": 5017 }, { "epoch": 1.494592229937266, "grad_norm": 0.2372429221868515, "learning_rate": 1.9258329201178596e-05, "loss": 1.2769, "step": 5018 }, { "epoch": 1.4948900761369348, "grad_norm": 0.23893238604068756, "learning_rate": 1.9257964616583843e-05, "loss": 1.2803, "step": 5019 }, { "epoch": 1.4951879223366034, "grad_norm": 0.21460749208927155, "learning_rate": 1.9257599945854073e-05, "loss": 1.2807, "step": 5020 }, { "epoch": 1.4954857685362721, "grad_norm": 0.23852066695690155, "learning_rate": 1.9257235188992676e-05, "loss": 1.2846, "step": 5021 }, { "epoch": 1.4957836147359407, "grad_norm": 0.22247886657714844, "learning_rate": 1.9256870346003055e-05, "loss": 1.2744, "step": 5022 }, { "epoch": 1.4960814609356095, "grad_norm": 0.22894200682640076, "learning_rate": 1.9256505416888595e-05, "loss": 1.2793, "step": 5023 }, { "epoch": 1.496379307135278, "grad_norm": 0.2209431231021881, "learning_rate": 1.925614040165269e-05, "loss": 1.2655, "step": 5024 }, { "epoch": 1.4966771533349466, "grad_norm": 0.23314248025417328, "learning_rate": 1.9255775300298744e-05, "loss": 1.2803, "step": 5025 }, { "epoch": 1.4969749995346153, "grad_norm": 0.22377417981624603, "learning_rate": 1.9255410112830148e-05, "loss": 1.2884, "step": 5026 }, { "epoch": 1.497272845734284, "grad_norm": 0.22946637868881226, "learning_rate": 1.92550448392503e-05, "loss": 1.2714, "step": 5027 }, { "epoch": 1.4975706919339526, "grad_norm": 0.21838387846946716, "learning_rate": 1.9254679479562607e-05, "loss": 1.2738, "step": 5028 }, { "epoch": 1.4978685381336212, "grad_norm": 0.2359665036201477, "learning_rate": 1.9254314033770456e-05, "loss": 1.272, "step": 5029 }, { "epoch": 1.49816638433329, "grad_norm": 0.22120609879493713, "learning_rate": 1.9253948501877257e-05, "loss": 1.2799, "step": 5030 }, { "epoch": 1.4984642305329585, "grad_norm": 0.22768308222293854, "learning_rate": 1.9253582883886398e-05, "loss": 1.2829, "step": 5031 }, { "epoch": 1.498762076732627, "grad_norm": 0.23316188156604767, "learning_rate": 1.9253217179801297e-05, "loss": 1.2654, "step": 5032 }, { "epoch": 1.4990599229322958, "grad_norm": 0.22771909832954407, "learning_rate": 1.9252851389625343e-05, "loss": 1.2654, "step": 5033 }, { "epoch": 1.4993577691319646, "grad_norm": 0.24694979190826416, "learning_rate": 1.925248551336195e-05, "loss": 1.2577, "step": 5034 }, { "epoch": 1.4996556153316332, "grad_norm": 0.2150806188583374, "learning_rate": 1.9252119551014516e-05, "loss": 1.2957, "step": 5035 }, { "epoch": 1.4999534615313017, "grad_norm": 0.22292621433734894, "learning_rate": 1.9251753502586443e-05, "loss": 1.2721, "step": 5036 }, { "epoch": 1.5002513077309705, "grad_norm": 0.22536718845367432, "learning_rate": 1.925138736808114e-05, "loss": 1.2795, "step": 5037 }, { "epoch": 1.500549153930639, "grad_norm": 0.226253941655159, "learning_rate": 1.9251021147502016e-05, "loss": 1.2814, "step": 5038 }, { "epoch": 1.5008470001303076, "grad_norm": 0.23530824482440948, "learning_rate": 1.9250654840852476e-05, "loss": 1.2819, "step": 5039 }, { "epoch": 1.5011448463299764, "grad_norm": 0.22629491984844208, "learning_rate": 1.9250288448135928e-05, "loss": 1.2728, "step": 5040 }, { "epoch": 1.5014426925296451, "grad_norm": 0.2234068661928177, "learning_rate": 1.924992196935578e-05, "loss": 1.2817, "step": 5041 }, { "epoch": 1.5017405387293137, "grad_norm": 0.21893714368343353, "learning_rate": 1.9249555404515444e-05, "loss": 1.286, "step": 5042 }, { "epoch": 1.5020383849289822, "grad_norm": 0.21677549183368683, "learning_rate": 1.9249188753618328e-05, "loss": 1.2842, "step": 5043 }, { "epoch": 1.502336231128651, "grad_norm": 0.2146213948726654, "learning_rate": 1.9248822016667844e-05, "loss": 1.2838, "step": 5044 }, { "epoch": 1.5026340773283196, "grad_norm": 0.2185962051153183, "learning_rate": 1.924845519366741e-05, "loss": 1.2771, "step": 5045 }, { "epoch": 1.502931923527988, "grad_norm": 0.24388039112091064, "learning_rate": 1.9248088284620428e-05, "loss": 1.2765, "step": 5046 }, { "epoch": 1.5032297697276569, "grad_norm": 0.2095615118741989, "learning_rate": 1.9247721289530318e-05, "loss": 1.2818, "step": 5047 }, { "epoch": 1.5035276159273256, "grad_norm": 0.23394814133644104, "learning_rate": 1.9247354208400492e-05, "loss": 1.2735, "step": 5048 }, { "epoch": 1.5038254621269942, "grad_norm": 0.21744844317436218, "learning_rate": 1.9246987041234372e-05, "loss": 1.2812, "step": 5049 }, { "epoch": 1.5041233083266627, "grad_norm": 0.23882654309272766, "learning_rate": 1.9246619788035363e-05, "loss": 1.2803, "step": 5050 }, { "epoch": 1.5044211545263315, "grad_norm": 0.21487607061862946, "learning_rate": 1.924625244880689e-05, "loss": 1.2784, "step": 5051 }, { "epoch": 1.504719000726, "grad_norm": 0.24575792253017426, "learning_rate": 1.924588502355237e-05, "loss": 1.3009, "step": 5052 }, { "epoch": 1.5050168469256686, "grad_norm": 0.2168632447719574, "learning_rate": 1.9245517512275217e-05, "loss": 1.2894, "step": 5053 }, { "epoch": 1.5053146931253374, "grad_norm": 0.2268664389848709, "learning_rate": 1.9245149914978854e-05, "loss": 1.2835, "step": 5054 }, { "epoch": 1.5056125393250062, "grad_norm": 0.2149057686328888, "learning_rate": 1.9244782231666703e-05, "loss": 1.2715, "step": 5055 }, { "epoch": 1.5059103855246747, "grad_norm": 0.22716905176639557, "learning_rate": 1.9244414462342184e-05, "loss": 1.2766, "step": 5056 }, { "epoch": 1.5062082317243433, "grad_norm": 0.22673822939395905, "learning_rate": 1.924404660700871e-05, "loss": 1.2717, "step": 5057 }, { "epoch": 1.506506077924012, "grad_norm": 0.21987012028694153, "learning_rate": 1.9243678665669715e-05, "loss": 1.2825, "step": 5058 }, { "epoch": 1.5068039241236806, "grad_norm": 0.22434185445308685, "learning_rate": 1.924331063832862e-05, "loss": 1.2698, "step": 5059 }, { "epoch": 1.5071017703233491, "grad_norm": 0.23307372629642487, "learning_rate": 1.9242942524988842e-05, "loss": 1.2887, "step": 5060 }, { "epoch": 1.507399616523018, "grad_norm": 0.2240176945924759, "learning_rate": 1.924257432565381e-05, "loss": 1.2878, "step": 5061 }, { "epoch": 1.5076974627226867, "grad_norm": 0.23216986656188965, "learning_rate": 1.924220604032695e-05, "loss": 1.29, "step": 5062 }, { "epoch": 1.5079953089223552, "grad_norm": 0.22601667046546936, "learning_rate": 1.9241837669011694e-05, "loss": 1.2878, "step": 5063 }, { "epoch": 1.5082931551220238, "grad_norm": 0.21720650792121887, "learning_rate": 1.924146921171146e-05, "loss": 1.2929, "step": 5064 }, { "epoch": 1.5085910013216925, "grad_norm": 0.22596383094787598, "learning_rate": 1.9241100668429685e-05, "loss": 1.2751, "step": 5065 }, { "epoch": 1.5088888475213613, "grad_norm": 0.2102445662021637, "learning_rate": 1.9240732039169786e-05, "loss": 1.2879, "step": 5066 }, { "epoch": 1.5091866937210296, "grad_norm": 0.24772226810455322, "learning_rate": 1.9240363323935206e-05, "loss": 1.2576, "step": 5067 }, { "epoch": 1.5094845399206984, "grad_norm": 0.217487633228302, "learning_rate": 1.9239994522729364e-05, "loss": 1.2735, "step": 5068 }, { "epoch": 1.5097823861203672, "grad_norm": 0.2197350114583969, "learning_rate": 1.9239625635555698e-05, "loss": 1.2637, "step": 5069 }, { "epoch": 1.5100802323200357, "grad_norm": 0.22306449711322784, "learning_rate": 1.923925666241764e-05, "loss": 1.2759, "step": 5070 }, { "epoch": 1.5103780785197043, "grad_norm": 0.2273397296667099, "learning_rate": 1.9238887603318625e-05, "loss": 1.2785, "step": 5071 }, { "epoch": 1.510675924719373, "grad_norm": 0.21052822470664978, "learning_rate": 1.9238518458262075e-05, "loss": 1.2786, "step": 5072 }, { "epoch": 1.5109737709190418, "grad_norm": 0.21630816161632538, "learning_rate": 1.9238149227251437e-05, "loss": 1.2927, "step": 5073 }, { "epoch": 1.5112716171187102, "grad_norm": 0.20805077254772186, "learning_rate": 1.9237779910290144e-05, "loss": 1.285, "step": 5074 }, { "epoch": 1.511569463318379, "grad_norm": 0.22114484012126923, "learning_rate": 1.9237410507381623e-05, "loss": 1.2876, "step": 5075 }, { "epoch": 1.5118673095180477, "grad_norm": 0.22389502823352814, "learning_rate": 1.9237041018529325e-05, "loss": 1.2795, "step": 5076 }, { "epoch": 1.5121651557177163, "grad_norm": 0.21567100286483765, "learning_rate": 1.9236671443736677e-05, "loss": 1.2854, "step": 5077 }, { "epoch": 1.5124630019173848, "grad_norm": 0.22371292114257812, "learning_rate": 1.9236301783007123e-05, "loss": 1.2769, "step": 5078 }, { "epoch": 1.5127608481170536, "grad_norm": 0.22854968905448914, "learning_rate": 1.9235932036344097e-05, "loss": 1.2843, "step": 5079 }, { "epoch": 1.5130586943167224, "grad_norm": 0.21412812173366547, "learning_rate": 1.9235562203751047e-05, "loss": 1.2641, "step": 5080 }, { "epoch": 1.513356540516391, "grad_norm": 0.22908322513103485, "learning_rate": 1.923519228523141e-05, "loss": 1.2798, "step": 5081 }, { "epoch": 1.5136543867160595, "grad_norm": 0.21367821097373962, "learning_rate": 1.923482228078862e-05, "loss": 1.2798, "step": 5082 }, { "epoch": 1.5139522329157282, "grad_norm": 0.33910495042800903, "learning_rate": 1.923445219042613e-05, "loss": 1.2733, "step": 5083 }, { "epoch": 1.5142500791153968, "grad_norm": 0.23164407908916473, "learning_rate": 1.923408201414738e-05, "loss": 1.2749, "step": 5084 }, { "epoch": 1.5145479253150653, "grad_norm": 0.23809514939785004, "learning_rate": 1.9233711751955815e-05, "loss": 1.2724, "step": 5085 }, { "epoch": 1.514845771514734, "grad_norm": 0.20901332795619965, "learning_rate": 1.9233341403854877e-05, "loss": 1.2691, "step": 5086 }, { "epoch": 1.5151436177144029, "grad_norm": 0.22775524854660034, "learning_rate": 1.9232970969848013e-05, "loss": 1.2832, "step": 5087 }, { "epoch": 1.5154414639140714, "grad_norm": 0.21273073554039001, "learning_rate": 1.9232600449938673e-05, "loss": 1.2783, "step": 5088 }, { "epoch": 1.51573931011374, "grad_norm": 0.22500090301036835, "learning_rate": 1.92322298441303e-05, "loss": 1.3053, "step": 5089 }, { "epoch": 1.5160371563134087, "grad_norm": 0.20830382406711578, "learning_rate": 1.923185915242634e-05, "loss": 1.2751, "step": 5090 }, { "epoch": 1.5163350025130773, "grad_norm": 0.21803244948387146, "learning_rate": 1.9231488374830247e-05, "loss": 1.2693, "step": 5091 }, { "epoch": 1.5166328487127458, "grad_norm": 0.21517494320869446, "learning_rate": 1.923111751134547e-05, "loss": 1.2797, "step": 5092 }, { "epoch": 1.5169306949124146, "grad_norm": 0.2202211320400238, "learning_rate": 1.9230746561975455e-05, "loss": 1.2711, "step": 5093 }, { "epoch": 1.5172285411120834, "grad_norm": 0.22223502397537231, "learning_rate": 1.9230375526723657e-05, "loss": 1.2846, "step": 5094 }, { "epoch": 1.517526387311752, "grad_norm": 0.21127375960350037, "learning_rate": 1.923000440559353e-05, "loss": 1.2882, "step": 5095 }, { "epoch": 1.5178242335114205, "grad_norm": 0.21672438085079193, "learning_rate": 1.922963319858852e-05, "loss": 1.281, "step": 5096 }, { "epoch": 1.5181220797110893, "grad_norm": 0.2225533276796341, "learning_rate": 1.9229261905712092e-05, "loss": 1.2891, "step": 5097 }, { "epoch": 1.5184199259107578, "grad_norm": 0.22136686742305756, "learning_rate": 1.9228890526967688e-05, "loss": 1.299, "step": 5098 }, { "epoch": 1.5187177721104264, "grad_norm": 0.22122547030448914, "learning_rate": 1.922851906235877e-05, "loss": 1.2862, "step": 5099 }, { "epoch": 1.5190156183100951, "grad_norm": 0.2353043109178543, "learning_rate": 1.9228147511888795e-05, "loss": 1.2834, "step": 5100 }, { "epoch": 1.519313464509764, "grad_norm": 0.21503135561943054, "learning_rate": 1.9227775875561218e-05, "loss": 1.2634, "step": 5101 }, { "epoch": 1.5196113107094325, "grad_norm": 0.21588893234729767, "learning_rate": 1.9227404153379492e-05, "loss": 1.2919, "step": 5102 }, { "epoch": 1.519909156909101, "grad_norm": 0.2221689373254776, "learning_rate": 1.922703234534708e-05, "loss": 1.2896, "step": 5103 }, { "epoch": 1.5202070031087698, "grad_norm": 0.2100355178117752, "learning_rate": 1.9226660451467443e-05, "loss": 1.2818, "step": 5104 }, { "epoch": 1.5205048493084383, "grad_norm": 0.2249361276626587, "learning_rate": 1.9226288471744042e-05, "loss": 1.2886, "step": 5105 }, { "epoch": 1.5208026955081069, "grad_norm": 0.22217456996440887, "learning_rate": 1.922591640618033e-05, "loss": 1.2585, "step": 5106 }, { "epoch": 1.5211005417077756, "grad_norm": 0.2258172482252121, "learning_rate": 1.9225544254779777e-05, "loss": 1.2908, "step": 5107 }, { "epoch": 1.5213983879074444, "grad_norm": 0.21065488457679749, "learning_rate": 1.922517201754584e-05, "loss": 1.289, "step": 5108 }, { "epoch": 1.521696234107113, "grad_norm": 0.22901864349842072, "learning_rate": 1.9224799694481988e-05, "loss": 1.2786, "step": 5109 }, { "epoch": 1.5219940803067815, "grad_norm": 0.2129736691713333, "learning_rate": 1.922442728559168e-05, "loss": 1.2937, "step": 5110 }, { "epoch": 1.5222919265064503, "grad_norm": 0.21151354908943176, "learning_rate": 1.9224054790878378e-05, "loss": 1.2686, "step": 5111 }, { "epoch": 1.5225897727061188, "grad_norm": 0.21267589926719666, "learning_rate": 1.9223682210345556e-05, "loss": 1.2684, "step": 5112 }, { "epoch": 1.5228876189057874, "grad_norm": 0.22577396035194397, "learning_rate": 1.9223309543996676e-05, "loss": 1.278, "step": 5113 }, { "epoch": 1.5231854651054562, "grad_norm": 0.21641625463962555, "learning_rate": 1.9222936791835205e-05, "loss": 1.2671, "step": 5114 }, { "epoch": 1.523483311305125, "grad_norm": 0.23217715322971344, "learning_rate": 1.9222563953864612e-05, "loss": 1.2881, "step": 5115 }, { "epoch": 1.5237811575047935, "grad_norm": 0.22642222046852112, "learning_rate": 1.9222191030088364e-05, "loss": 1.2736, "step": 5116 }, { "epoch": 1.524079003704462, "grad_norm": 0.2142854481935501, "learning_rate": 1.9221818020509933e-05, "loss": 1.2636, "step": 5117 }, { "epoch": 1.5243768499041308, "grad_norm": 0.22878050804138184, "learning_rate": 1.922144492513279e-05, "loss": 1.2855, "step": 5118 }, { "epoch": 1.5246746961037994, "grad_norm": 0.21787415444850922, "learning_rate": 1.92210717439604e-05, "loss": 1.2707, "step": 5119 }, { "epoch": 1.524972542303468, "grad_norm": 0.23277103900909424, "learning_rate": 1.9220698476996245e-05, "loss": 1.2876, "step": 5120 }, { "epoch": 1.5252703885031367, "grad_norm": 0.21832115948200226, "learning_rate": 1.922032512424379e-05, "loss": 1.2736, "step": 5121 }, { "epoch": 1.5255682347028054, "grad_norm": 0.2242104411125183, "learning_rate": 1.921995168570651e-05, "loss": 1.2613, "step": 5122 }, { "epoch": 1.525866080902474, "grad_norm": 0.21479088068008423, "learning_rate": 1.9219578161387886e-05, "loss": 1.2975, "step": 5123 }, { "epoch": 1.5261639271021425, "grad_norm": 0.21542489528656006, "learning_rate": 1.9219204551291385e-05, "loss": 1.2766, "step": 5124 }, { "epoch": 1.5264617733018113, "grad_norm": 0.21532559394836426, "learning_rate": 1.9218830855420486e-05, "loss": 1.2779, "step": 5125 }, { "epoch": 1.5267596195014799, "grad_norm": 0.23046466708183289, "learning_rate": 1.9218457073778665e-05, "loss": 1.2746, "step": 5126 }, { "epoch": 1.5270574657011484, "grad_norm": 0.21354719996452332, "learning_rate": 1.92180832063694e-05, "loss": 1.2745, "step": 5127 }, { "epoch": 1.5273553119008172, "grad_norm": 0.22858357429504395, "learning_rate": 1.921770925319617e-05, "loss": 1.2891, "step": 5128 }, { "epoch": 1.527653158100486, "grad_norm": 0.21283267438411713, "learning_rate": 1.9217335214262455e-05, "loss": 1.2865, "step": 5129 }, { "epoch": 1.5279510043001545, "grad_norm": 0.2285892367362976, "learning_rate": 1.9216961089571734e-05, "loss": 1.2906, "step": 5130 }, { "epoch": 1.528248850499823, "grad_norm": 0.22981026768684387, "learning_rate": 1.9216586879127486e-05, "loss": 1.2773, "step": 5131 }, { "epoch": 1.5285466966994918, "grad_norm": 0.22597664594650269, "learning_rate": 1.9216212582933197e-05, "loss": 1.2813, "step": 5132 }, { "epoch": 1.5288445428991606, "grad_norm": 0.23075662553310394, "learning_rate": 1.9215838200992344e-05, "loss": 1.2803, "step": 5133 }, { "epoch": 1.529142389098829, "grad_norm": 0.21592989563941956, "learning_rate": 1.9215463733308418e-05, "loss": 1.2742, "step": 5134 }, { "epoch": 1.5294402352984977, "grad_norm": 0.2478303462266922, "learning_rate": 1.9215089179884897e-05, "loss": 1.2767, "step": 5135 }, { "epoch": 1.5297380814981665, "grad_norm": 0.2218201756477356, "learning_rate": 1.9214714540725263e-05, "loss": 1.2676, "step": 5136 }, { "epoch": 1.530035927697835, "grad_norm": 0.22012294828891754, "learning_rate": 1.9214339815833004e-05, "loss": 1.2742, "step": 5137 }, { "epoch": 1.5303337738975036, "grad_norm": 0.21948669850826263, "learning_rate": 1.9213965005211614e-05, "loss": 1.2904, "step": 5138 }, { "epoch": 1.5306316200971724, "grad_norm": 0.20946471393108368, "learning_rate": 1.9213590108864572e-05, "loss": 1.2808, "step": 5139 }, { "epoch": 1.5309294662968411, "grad_norm": 0.2280375212430954, "learning_rate": 1.9213215126795366e-05, "loss": 1.2849, "step": 5140 }, { "epoch": 1.5312273124965095, "grad_norm": 0.2271459996700287, "learning_rate": 1.921284005900749e-05, "loss": 1.2733, "step": 5141 }, { "epoch": 1.5315251586961782, "grad_norm": 0.22264908254146576, "learning_rate": 1.921246490550443e-05, "loss": 1.2628, "step": 5142 }, { "epoch": 1.531823004895847, "grad_norm": 0.20613276958465576, "learning_rate": 1.9212089666289674e-05, "loss": 1.2704, "step": 5143 }, { "epoch": 1.5321208510955155, "grad_norm": 0.28468140959739685, "learning_rate": 1.9211714341366718e-05, "loss": 1.2673, "step": 5144 }, { "epoch": 1.532418697295184, "grad_norm": 0.24806201457977295, "learning_rate": 1.921133893073905e-05, "loss": 1.2897, "step": 5145 }, { "epoch": 1.5327165434948529, "grad_norm": 0.24836388230323792, "learning_rate": 1.9210963434410166e-05, "loss": 1.2792, "step": 5146 }, { "epoch": 1.5330143896945216, "grad_norm": 0.21674907207489014, "learning_rate": 1.921058785238356e-05, "loss": 1.2847, "step": 5147 }, { "epoch": 1.5333122358941902, "grad_norm": 0.30787208676338196, "learning_rate": 1.9210212184662724e-05, "loss": 1.2909, "step": 5148 }, { "epoch": 1.5336100820938587, "grad_norm": 0.24113978445529938, "learning_rate": 1.9209836431251154e-05, "loss": 1.2796, "step": 5149 }, { "epoch": 1.5339079282935275, "grad_norm": 0.22804510593414307, "learning_rate": 1.9209460592152345e-05, "loss": 1.2737, "step": 5150 }, { "epoch": 1.534205774493196, "grad_norm": 0.2252933233976364, "learning_rate": 1.9209084667369793e-05, "loss": 1.2851, "step": 5151 }, { "epoch": 1.5345036206928646, "grad_norm": 0.21543268859386444, "learning_rate": 1.9208708656907e-05, "loss": 1.2846, "step": 5152 }, { "epoch": 1.5348014668925334, "grad_norm": 0.21873699128627777, "learning_rate": 1.920833256076746e-05, "loss": 1.2744, "step": 5153 }, { "epoch": 1.5350993130922022, "grad_norm": 0.23945359885692596, "learning_rate": 1.9207956378954673e-05, "loss": 1.2795, "step": 5154 }, { "epoch": 1.5353971592918707, "grad_norm": 0.2227172702550888, "learning_rate": 1.9207580111472142e-05, "loss": 1.2978, "step": 5155 }, { "epoch": 1.5356950054915393, "grad_norm": 0.212611123919487, "learning_rate": 1.9207203758323362e-05, "loss": 1.2945, "step": 5156 }, { "epoch": 1.535992851691208, "grad_norm": 0.21947996318340302, "learning_rate": 1.920682731951184e-05, "loss": 1.2602, "step": 5157 }, { "epoch": 1.5362906978908766, "grad_norm": 0.23065443336963654, "learning_rate": 1.920645079504108e-05, "loss": 1.2713, "step": 5158 }, { "epoch": 1.5365885440905451, "grad_norm": 0.2252213954925537, "learning_rate": 1.9206074184914575e-05, "loss": 1.2844, "step": 5159 }, { "epoch": 1.536886390290214, "grad_norm": 0.21542248129844666, "learning_rate": 1.9205697489135838e-05, "loss": 1.2723, "step": 5160 }, { "epoch": 1.5371842364898827, "grad_norm": 0.20689259469509125, "learning_rate": 1.9205320707708372e-05, "loss": 1.2522, "step": 5161 }, { "epoch": 1.5374820826895512, "grad_norm": 0.22795073688030243, "learning_rate": 1.920494384063568e-05, "loss": 1.2672, "step": 5162 }, { "epoch": 1.5377799288892198, "grad_norm": 0.23105382919311523, "learning_rate": 1.9204566887921273e-05, "loss": 1.2841, "step": 5163 }, { "epoch": 1.5380777750888885, "grad_norm": 0.2340937703847885, "learning_rate": 1.9204189849568654e-05, "loss": 1.2813, "step": 5164 }, { "epoch": 1.538375621288557, "grad_norm": 0.21824911236763, "learning_rate": 1.9203812725581328e-05, "loss": 1.2751, "step": 5165 }, { "epoch": 1.5386734674882256, "grad_norm": 0.21339337527751923, "learning_rate": 1.920343551596281e-05, "loss": 1.2677, "step": 5166 }, { "epoch": 1.5389713136878944, "grad_norm": 0.22229644656181335, "learning_rate": 1.9203058220716607e-05, "loss": 1.2571, "step": 5167 }, { "epoch": 1.5392691598875632, "grad_norm": 0.22805114090442657, "learning_rate": 1.9202680839846232e-05, "loss": 1.2818, "step": 5168 }, { "epoch": 1.5395670060872317, "grad_norm": 0.2152119129896164, "learning_rate": 1.920230337335519e-05, "loss": 1.2931, "step": 5169 }, { "epoch": 1.5398648522869003, "grad_norm": 0.22499510645866394, "learning_rate": 1.9201925821247e-05, "loss": 1.2619, "step": 5170 }, { "epoch": 1.540162698486569, "grad_norm": 0.21436291933059692, "learning_rate": 1.920154818352517e-05, "loss": 1.2774, "step": 5171 }, { "epoch": 1.5404605446862376, "grad_norm": 0.2081919014453888, "learning_rate": 1.9201170460193213e-05, "loss": 1.295, "step": 5172 }, { "epoch": 1.5407583908859062, "grad_norm": 0.22008489072322845, "learning_rate": 1.9200792651254647e-05, "loss": 1.278, "step": 5173 }, { "epoch": 1.541056237085575, "grad_norm": 0.21873022615909576, "learning_rate": 1.9200414756712985e-05, "loss": 1.2692, "step": 5174 }, { "epoch": 1.5413540832852437, "grad_norm": 0.22118264436721802, "learning_rate": 1.920003677657174e-05, "loss": 1.2963, "step": 5175 }, { "epoch": 1.5416519294849123, "grad_norm": 0.21683140099048615, "learning_rate": 1.9199658710834434e-05, "loss": 1.2995, "step": 5176 }, { "epoch": 1.5419497756845808, "grad_norm": 0.21878816187381744, "learning_rate": 1.9199280559504584e-05, "loss": 1.2843, "step": 5177 }, { "epoch": 1.5422476218842496, "grad_norm": 0.22942353785037994, "learning_rate": 1.9198902322585704e-05, "loss": 1.2767, "step": 5178 }, { "epoch": 1.5425454680839181, "grad_norm": 0.23138710856437683, "learning_rate": 1.9198524000081317e-05, "loss": 1.301, "step": 5179 }, { "epoch": 1.5428433142835867, "grad_norm": 0.21942220628261566, "learning_rate": 1.919814559199494e-05, "loss": 1.2846, "step": 5180 }, { "epoch": 1.5431411604832554, "grad_norm": 0.21210066974163055, "learning_rate": 1.9197767098330097e-05, "loss": 1.2972, "step": 5181 }, { "epoch": 1.5434390066829242, "grad_norm": 0.219122976064682, "learning_rate": 1.9197388519090302e-05, "loss": 1.2776, "step": 5182 }, { "epoch": 1.5437368528825928, "grad_norm": 0.2294389009475708, "learning_rate": 1.919700985427909e-05, "loss": 1.2833, "step": 5183 }, { "epoch": 1.5440346990822613, "grad_norm": 0.2336971014738083, "learning_rate": 1.919663110389997e-05, "loss": 1.2677, "step": 5184 }, { "epoch": 1.54433254528193, "grad_norm": 0.2087344527244568, "learning_rate": 1.9196252267956477e-05, "loss": 1.2745, "step": 5185 }, { "epoch": 1.5446303914815986, "grad_norm": 0.22128574550151825, "learning_rate": 1.9195873346452132e-05, "loss": 1.2865, "step": 5186 }, { "epoch": 1.5449282376812672, "grad_norm": 0.2278864085674286, "learning_rate": 1.9195494339390455e-05, "loss": 1.2697, "step": 5187 }, { "epoch": 1.545226083880936, "grad_norm": 0.2292591631412506, "learning_rate": 1.9195115246774985e-05, "loss": 1.2819, "step": 5188 }, { "epoch": 1.5455239300806047, "grad_norm": 0.21845091879367828, "learning_rate": 1.9194736068609235e-05, "loss": 1.2734, "step": 5189 }, { "epoch": 1.5458217762802733, "grad_norm": 0.2252739667892456, "learning_rate": 1.919435680489674e-05, "loss": 1.2897, "step": 5190 }, { "epoch": 1.5461196224799418, "grad_norm": 0.21534298360347748, "learning_rate": 1.9193977455641025e-05, "loss": 1.2684, "step": 5191 }, { "epoch": 1.5464174686796106, "grad_norm": 0.22747106850147247, "learning_rate": 1.9193598020845626e-05, "loss": 1.2735, "step": 5192 }, { "epoch": 1.5467153148792792, "grad_norm": 0.2073824554681778, "learning_rate": 1.919321850051407e-05, "loss": 1.2672, "step": 5193 }, { "epoch": 1.5470131610789477, "grad_norm": 0.2117050439119339, "learning_rate": 1.9192838894649884e-05, "loss": 1.2871, "step": 5194 }, { "epoch": 1.5473110072786165, "grad_norm": 0.2082703709602356, "learning_rate": 1.9192459203256605e-05, "loss": 1.281, "step": 5195 }, { "epoch": 1.5476088534782853, "grad_norm": 0.21297723054885864, "learning_rate": 1.9192079426337762e-05, "loss": 1.2987, "step": 5196 }, { "epoch": 1.5479066996779538, "grad_norm": 0.21057502925395966, "learning_rate": 1.919169956389689e-05, "loss": 1.2651, "step": 5197 }, { "epoch": 1.5482045458776224, "grad_norm": 0.21725547313690186, "learning_rate": 1.9191319615937523e-05, "loss": 1.2815, "step": 5198 }, { "epoch": 1.5485023920772911, "grad_norm": 0.23164552450180054, "learning_rate": 1.9190939582463195e-05, "loss": 1.2888, "step": 5199 }, { "epoch": 1.54880023827696, "grad_norm": 0.21210889518260956, "learning_rate": 1.9190559463477445e-05, "loss": 1.277, "step": 5200 }, { "epoch": 1.5490980844766282, "grad_norm": 0.22068975865840912, "learning_rate": 1.9190179258983804e-05, "loss": 1.2701, "step": 5201 }, { "epoch": 1.549395930676297, "grad_norm": 0.22226393222808838, "learning_rate": 1.918979896898582e-05, "loss": 1.2522, "step": 5202 }, { "epoch": 1.5496937768759658, "grad_norm": 0.2246939092874527, "learning_rate": 1.918941859348702e-05, "loss": 1.2786, "step": 5203 }, { "epoch": 1.5499916230756343, "grad_norm": 0.2530314028263092, "learning_rate": 1.9189038132490945e-05, "loss": 1.2862, "step": 5204 }, { "epoch": 1.5502894692753029, "grad_norm": 0.20884862542152405, "learning_rate": 1.9188657586001137e-05, "loss": 1.2658, "step": 5205 }, { "epoch": 1.5505873154749716, "grad_norm": 0.22472144663333893, "learning_rate": 1.918827695402114e-05, "loss": 1.2804, "step": 5206 }, { "epoch": 1.5508851616746404, "grad_norm": 0.2202090620994568, "learning_rate": 1.9187896236554488e-05, "loss": 1.2542, "step": 5207 }, { "epoch": 1.5511830078743087, "grad_norm": 0.2156151682138443, "learning_rate": 1.918751543360473e-05, "loss": 1.2698, "step": 5208 }, { "epoch": 1.5514808540739775, "grad_norm": 0.22296875715255737, "learning_rate": 1.9187134545175403e-05, "loss": 1.2626, "step": 5209 }, { "epoch": 1.5517787002736463, "grad_norm": 0.22329306602478027, "learning_rate": 1.9186753571270054e-05, "loss": 1.2853, "step": 5210 }, { "epoch": 1.5520765464733148, "grad_norm": 0.22098888456821442, "learning_rate": 1.918637251189223e-05, "loss": 1.2864, "step": 5211 }, { "epoch": 1.5523743926729834, "grad_norm": 0.23140911757946014, "learning_rate": 1.918599136704547e-05, "loss": 1.2957, "step": 5212 }, { "epoch": 1.5526722388726522, "grad_norm": 0.21242539584636688, "learning_rate": 1.9185610136733322e-05, "loss": 1.2855, "step": 5213 }, { "epoch": 1.552970085072321, "grad_norm": 0.21810154616832733, "learning_rate": 1.918522882095934e-05, "loss": 1.2815, "step": 5214 }, { "epoch": 1.5532679312719895, "grad_norm": 0.2197161167860031, "learning_rate": 1.9184847419727063e-05, "loss": 1.2769, "step": 5215 }, { "epoch": 1.553565777471658, "grad_norm": 0.21979984641075134, "learning_rate": 1.9184465933040042e-05, "loss": 1.2682, "step": 5216 }, { "epoch": 1.5538636236713268, "grad_norm": 0.224289208650589, "learning_rate": 1.9184084360901827e-05, "loss": 1.2852, "step": 5217 }, { "epoch": 1.5541614698709953, "grad_norm": 0.22888165712356567, "learning_rate": 1.9183702703315972e-05, "loss": 1.2904, "step": 5218 }, { "epoch": 1.554459316070664, "grad_norm": 0.21977858245372772, "learning_rate": 1.918332096028602e-05, "loss": 1.2669, "step": 5219 }, { "epoch": 1.5547571622703327, "grad_norm": 0.22267065942287445, "learning_rate": 1.9182939131815527e-05, "loss": 1.2853, "step": 5220 }, { "epoch": 1.5550550084700014, "grad_norm": 0.21301211416721344, "learning_rate": 1.9182557217908046e-05, "loss": 1.2736, "step": 5221 }, { "epoch": 1.55535285466967, "grad_norm": 0.21961882710456848, "learning_rate": 1.918217521856713e-05, "loss": 1.284, "step": 5222 }, { "epoch": 1.5556507008693385, "grad_norm": 0.21925969421863556, "learning_rate": 1.9181793133796332e-05, "loss": 1.2796, "step": 5223 }, { "epoch": 1.5559485470690073, "grad_norm": 0.2270103543996811, "learning_rate": 1.918141096359921e-05, "loss": 1.2936, "step": 5224 }, { "epoch": 1.5562463932686759, "grad_norm": 0.24820318818092346, "learning_rate": 1.918102870797931e-05, "loss": 1.2593, "step": 5225 }, { "epoch": 1.5565442394683444, "grad_norm": 0.21502672135829926, "learning_rate": 1.9180646366940202e-05, "loss": 1.2757, "step": 5226 }, { "epoch": 1.5568420856680132, "grad_norm": 0.216362863779068, "learning_rate": 1.9180263940485434e-05, "loss": 1.2978, "step": 5227 }, { "epoch": 1.557139931867682, "grad_norm": 0.21232004463672638, "learning_rate": 1.9179881428618567e-05, "loss": 1.266, "step": 5228 }, { "epoch": 1.5574377780673505, "grad_norm": 0.22477225959300995, "learning_rate": 1.9179498831343162e-05, "loss": 1.2728, "step": 5229 }, { "epoch": 1.557735624267019, "grad_norm": 0.2526955306529999, "learning_rate": 1.9179116148662774e-05, "loss": 1.2824, "step": 5230 }, { "epoch": 1.5580334704666878, "grad_norm": 0.22520771622657776, "learning_rate": 1.9178733380580967e-05, "loss": 1.2768, "step": 5231 }, { "epoch": 1.5583313166663564, "grad_norm": 0.22579000890254974, "learning_rate": 1.91783505271013e-05, "loss": 1.2743, "step": 5232 }, { "epoch": 1.558629162866025, "grad_norm": 0.2351585179567337, "learning_rate": 1.9177967588227334e-05, "loss": 1.2989, "step": 5233 }, { "epoch": 1.5589270090656937, "grad_norm": 0.23377393186092377, "learning_rate": 1.9177584563962638e-05, "loss": 1.2784, "step": 5234 }, { "epoch": 1.5592248552653625, "grad_norm": 0.23362720012664795, "learning_rate": 1.917720145431077e-05, "loss": 1.2926, "step": 5235 }, { "epoch": 1.559522701465031, "grad_norm": 0.21404823660850525, "learning_rate": 1.9176818259275293e-05, "loss": 1.2722, "step": 5236 }, { "epoch": 1.5598205476646996, "grad_norm": 0.217382550239563, "learning_rate": 1.9176434978859776e-05, "loss": 1.2655, "step": 5237 }, { "epoch": 1.5601183938643683, "grad_norm": 0.2208022028207779, "learning_rate": 1.9176051613067787e-05, "loss": 1.2751, "step": 5238 }, { "epoch": 1.560416240064037, "grad_norm": 0.21439604461193085, "learning_rate": 1.9175668161902886e-05, "loss": 1.2818, "step": 5239 }, { "epoch": 1.5607140862637054, "grad_norm": 0.24205389618873596, "learning_rate": 1.917528462536865e-05, "loss": 1.2943, "step": 5240 }, { "epoch": 1.5610119324633742, "grad_norm": 0.21855804324150085, "learning_rate": 1.9174901003468638e-05, "loss": 1.2813, "step": 5241 }, { "epoch": 1.561309778663043, "grad_norm": 0.22007018327713013, "learning_rate": 1.917451729620642e-05, "loss": 1.2668, "step": 5242 }, { "epoch": 1.5616076248627115, "grad_norm": 0.2210848033428192, "learning_rate": 1.9174133503585573e-05, "loss": 1.2733, "step": 5243 }, { "epoch": 1.56190547106238, "grad_norm": 0.2746601402759552, "learning_rate": 1.9173749625609664e-05, "loss": 1.2664, "step": 5244 }, { "epoch": 1.5622033172620489, "grad_norm": 0.23702725768089294, "learning_rate": 1.9173365662282264e-05, "loss": 1.2771, "step": 5245 }, { "epoch": 1.5625011634617174, "grad_norm": 0.2350674420595169, "learning_rate": 1.9172981613606946e-05, "loss": 1.275, "step": 5246 }, { "epoch": 1.562799009661386, "grad_norm": 0.22329159080982208, "learning_rate": 1.9172597479587282e-05, "loss": 1.2868, "step": 5247 }, { "epoch": 1.5630968558610547, "grad_norm": 0.22355787456035614, "learning_rate": 1.9172213260226842e-05, "loss": 1.2586, "step": 5248 }, { "epoch": 1.5633947020607235, "grad_norm": 0.2204594910144806, "learning_rate": 1.9171828955529213e-05, "loss": 1.2765, "step": 5249 }, { "epoch": 1.563692548260392, "grad_norm": 0.2157185971736908, "learning_rate": 1.917144456549796e-05, "loss": 1.2695, "step": 5250 }, { "epoch": 1.5639903944600606, "grad_norm": 0.22540999948978424, "learning_rate": 1.917106009013666e-05, "loss": 1.2733, "step": 5251 }, { "epoch": 1.5642882406597294, "grad_norm": 0.22991201281547546, "learning_rate": 1.9170675529448895e-05, "loss": 1.278, "step": 5252 }, { "epoch": 1.564586086859398, "grad_norm": 0.2228827178478241, "learning_rate": 1.9170290883438238e-05, "loss": 1.2795, "step": 5253 }, { "epoch": 1.5648839330590665, "grad_norm": 0.21096153557300568, "learning_rate": 1.916990615210827e-05, "loss": 1.2773, "step": 5254 }, { "epoch": 1.5651817792587353, "grad_norm": 0.21494264900684357, "learning_rate": 1.916952133546257e-05, "loss": 1.2758, "step": 5255 }, { "epoch": 1.565479625458404, "grad_norm": 0.22339381277561188, "learning_rate": 1.9169136433504724e-05, "loss": 1.2691, "step": 5256 }, { "epoch": 1.5657774716580726, "grad_norm": 0.21925175189971924, "learning_rate": 1.9168751446238306e-05, "loss": 1.2597, "step": 5257 }, { "epoch": 1.5660753178577411, "grad_norm": 0.22400447726249695, "learning_rate": 1.91683663736669e-05, "loss": 1.2653, "step": 5258 }, { "epoch": 1.56637316405741, "grad_norm": 0.25193363428115845, "learning_rate": 1.9167981215794086e-05, "loss": 1.26, "step": 5259 }, { "epoch": 1.5666710102570787, "grad_norm": 0.2602287232875824, "learning_rate": 1.916759597262345e-05, "loss": 1.2733, "step": 5260 }, { "epoch": 1.566968856456747, "grad_norm": 0.2307026982307434, "learning_rate": 1.9167210644158577e-05, "loss": 1.2741, "step": 5261 }, { "epoch": 1.5672667026564158, "grad_norm": 0.22563683986663818, "learning_rate": 1.9166825230403047e-05, "loss": 1.2717, "step": 5262 }, { "epoch": 1.5675645488560845, "grad_norm": 0.3646620512008667, "learning_rate": 1.9166439731360454e-05, "loss": 1.2654, "step": 5263 }, { "epoch": 1.567862395055753, "grad_norm": 0.25848421454429626, "learning_rate": 1.916605414703438e-05, "loss": 1.2853, "step": 5264 }, { "epoch": 1.5681602412554216, "grad_norm": 0.24908843636512756, "learning_rate": 1.9165668477428414e-05, "loss": 1.2734, "step": 5265 }, { "epoch": 1.5684580874550904, "grad_norm": 0.2249835878610611, "learning_rate": 1.9165282722546146e-05, "loss": 1.2919, "step": 5266 }, { "epoch": 1.5687559336547592, "grad_norm": 0.254253089427948, "learning_rate": 1.9164896882391158e-05, "loss": 1.2778, "step": 5267 }, { "epoch": 1.5690537798544275, "grad_norm": 0.24973627924919128, "learning_rate": 1.9164510956967043e-05, "loss": 1.2732, "step": 5268 }, { "epoch": 1.5693516260540963, "grad_norm": 0.22042717039585114, "learning_rate": 1.9164124946277396e-05, "loss": 1.2998, "step": 5269 }, { "epoch": 1.569649472253765, "grad_norm": 0.22058488428592682, "learning_rate": 1.9163738850325806e-05, "loss": 1.2594, "step": 5270 }, { "epoch": 1.5699473184534336, "grad_norm": 0.2343423217535019, "learning_rate": 1.916335266911586e-05, "loss": 1.2808, "step": 5271 }, { "epoch": 1.5702451646531022, "grad_norm": 0.2160743623971939, "learning_rate": 1.916296640265116e-05, "loss": 1.2746, "step": 5272 }, { "epoch": 1.570543010852771, "grad_norm": 0.21926793456077576, "learning_rate": 1.9162580050935293e-05, "loss": 1.2807, "step": 5273 }, { "epoch": 1.5708408570524397, "grad_norm": 0.21966791152954102, "learning_rate": 1.9162193613971854e-05, "loss": 1.2708, "step": 5274 }, { "epoch": 1.571138703252108, "grad_norm": 0.21941600739955902, "learning_rate": 1.9161807091764442e-05, "loss": 1.2507, "step": 5275 }, { "epoch": 1.5714365494517768, "grad_norm": 0.23557275533676147, "learning_rate": 1.916142048431665e-05, "loss": 1.2921, "step": 5276 }, { "epoch": 1.5717343956514456, "grad_norm": 0.2285400778055191, "learning_rate": 1.916103379163208e-05, "loss": 1.2745, "step": 5277 }, { "epoch": 1.5720322418511141, "grad_norm": 0.2255667895078659, "learning_rate": 1.9160647013714323e-05, "loss": 1.2766, "step": 5278 }, { "epoch": 1.5723300880507827, "grad_norm": 0.21725843846797943, "learning_rate": 1.9160260150566978e-05, "loss": 1.2735, "step": 5279 }, { "epoch": 1.5726279342504514, "grad_norm": 0.2289770096540451, "learning_rate": 1.9159873202193648e-05, "loss": 1.2771, "step": 5280 }, { "epoch": 1.5729257804501202, "grad_norm": 0.21951895952224731, "learning_rate": 1.9159486168597934e-05, "loss": 1.2729, "step": 5281 }, { "epoch": 1.5732236266497888, "grad_norm": 0.21915505826473236, "learning_rate": 1.9159099049783435e-05, "loss": 1.2707, "step": 5282 }, { "epoch": 1.5735214728494573, "grad_norm": 0.22136370837688446, "learning_rate": 1.915871184575375e-05, "loss": 1.2576, "step": 5283 }, { "epoch": 1.573819319049126, "grad_norm": 0.2282637357711792, "learning_rate": 1.9158324556512483e-05, "loss": 1.2801, "step": 5284 }, { "epoch": 1.5741171652487946, "grad_norm": 0.22993646562099457, "learning_rate": 1.9157937182063243e-05, "loss": 1.2758, "step": 5285 }, { "epoch": 1.5744150114484632, "grad_norm": 0.21509593725204468, "learning_rate": 1.9157549722409628e-05, "loss": 1.2826, "step": 5286 }, { "epoch": 1.574712857648132, "grad_norm": 0.22301043570041656, "learning_rate": 1.9157162177555242e-05, "loss": 1.2861, "step": 5287 }, { "epoch": 1.5750107038478007, "grad_norm": 0.22497759759426117, "learning_rate": 1.9156774547503694e-05, "loss": 1.264, "step": 5288 }, { "epoch": 1.5753085500474693, "grad_norm": 0.22139903903007507, "learning_rate": 1.915638683225859e-05, "loss": 1.2825, "step": 5289 }, { "epoch": 1.5756063962471378, "grad_norm": 0.22024013102054596, "learning_rate": 1.9155999031823534e-05, "loss": 1.2887, "step": 5290 }, { "epoch": 1.5759042424468066, "grad_norm": 0.2222883701324463, "learning_rate": 1.915561114620214e-05, "loss": 1.2856, "step": 5291 }, { "epoch": 1.5762020886464752, "grad_norm": 0.22331392765045166, "learning_rate": 1.915522317539801e-05, "loss": 1.2867, "step": 5292 }, { "epoch": 1.5764999348461437, "grad_norm": 0.22336934506893158, "learning_rate": 1.915483511941476e-05, "loss": 1.2722, "step": 5293 }, { "epoch": 1.5767977810458125, "grad_norm": 0.20129472017288208, "learning_rate": 1.9154446978256e-05, "loss": 1.257, "step": 5294 }, { "epoch": 1.5770956272454812, "grad_norm": 0.2179460972547531, "learning_rate": 1.9154058751925335e-05, "loss": 1.2743, "step": 5295 }, { "epoch": 1.5773934734451498, "grad_norm": 0.21097329258918762, "learning_rate": 1.915367044042638e-05, "loss": 1.2743, "step": 5296 }, { "epoch": 1.5776913196448183, "grad_norm": 0.21877439320087433, "learning_rate": 1.9153282043762753e-05, "loss": 1.2652, "step": 5297 }, { "epoch": 1.5779891658444871, "grad_norm": 0.21587024629116058, "learning_rate": 1.9152893561938058e-05, "loss": 1.2801, "step": 5298 }, { "epoch": 1.5782870120441557, "grad_norm": 0.2207348793745041, "learning_rate": 1.915250499495592e-05, "loss": 1.269, "step": 5299 }, { "epoch": 1.5785848582438242, "grad_norm": 0.21685463190078735, "learning_rate": 1.9152116342819942e-05, "loss": 1.2649, "step": 5300 }, { "epoch": 1.578882704443493, "grad_norm": 0.2112230360507965, "learning_rate": 1.9151727605533753e-05, "loss": 1.256, "step": 5301 }, { "epoch": 1.5791805506431618, "grad_norm": 0.20439797639846802, "learning_rate": 1.9151338783100962e-05, "loss": 1.2688, "step": 5302 }, { "epoch": 1.5794783968428303, "grad_norm": 0.21740297973155975, "learning_rate": 1.9150949875525185e-05, "loss": 1.2851, "step": 5303 }, { "epoch": 1.5797762430424989, "grad_norm": 0.2155197411775589, "learning_rate": 1.9150560882810047e-05, "loss": 1.2613, "step": 5304 }, { "epoch": 1.5800740892421676, "grad_norm": 0.2102223038673401, "learning_rate": 1.9150171804959163e-05, "loss": 1.2895, "step": 5305 }, { "epoch": 1.5803719354418362, "grad_norm": 0.22810573875904083, "learning_rate": 1.9149782641976152e-05, "loss": 1.2826, "step": 5306 }, { "epoch": 1.5806697816415047, "grad_norm": 0.2198859602212906, "learning_rate": 1.9149393393864636e-05, "loss": 1.2809, "step": 5307 }, { "epoch": 1.5809676278411735, "grad_norm": 0.22619320452213287, "learning_rate": 1.9149004060628237e-05, "loss": 1.2689, "step": 5308 }, { "epoch": 1.5812654740408423, "grad_norm": 0.2240327000617981, "learning_rate": 1.914861464227058e-05, "loss": 1.2979, "step": 5309 }, { "epoch": 1.5815633202405108, "grad_norm": 0.217616006731987, "learning_rate": 1.9148225138795285e-05, "loss": 1.2831, "step": 5310 }, { "epoch": 1.5818611664401794, "grad_norm": 0.23388800024986267, "learning_rate": 1.9147835550205974e-05, "loss": 1.2849, "step": 5311 }, { "epoch": 1.5821590126398482, "grad_norm": 0.22546786069869995, "learning_rate": 1.914744587650627e-05, "loss": 1.2875, "step": 5312 }, { "epoch": 1.5824568588395167, "grad_norm": 0.22454583644866943, "learning_rate": 1.914705611769981e-05, "loss": 1.2806, "step": 5313 }, { "epoch": 1.5827547050391853, "grad_norm": 0.21439944207668304, "learning_rate": 1.9146666273790208e-05, "loss": 1.261, "step": 5314 }, { "epoch": 1.583052551238854, "grad_norm": 0.2200402319431305, "learning_rate": 1.9146276344781096e-05, "loss": 1.2879, "step": 5315 }, { "epoch": 1.5833503974385228, "grad_norm": 0.22738364338874817, "learning_rate": 1.9145886330676104e-05, "loss": 1.2963, "step": 5316 }, { "epoch": 1.5836482436381913, "grad_norm": 0.2065533846616745, "learning_rate": 1.9145496231478855e-05, "loss": 1.2747, "step": 5317 }, { "epoch": 1.58394608983786, "grad_norm": 0.21450136601924896, "learning_rate": 1.9145106047192983e-05, "loss": 1.2746, "step": 5318 }, { "epoch": 1.5842439360375287, "grad_norm": 0.2229100465774536, "learning_rate": 1.9144715777822113e-05, "loss": 1.2716, "step": 5319 }, { "epoch": 1.5845417822371972, "grad_norm": 0.21140901744365692, "learning_rate": 1.9144325423369883e-05, "loss": 1.2734, "step": 5320 }, { "epoch": 1.5848396284368658, "grad_norm": 0.21822868287563324, "learning_rate": 1.9143934983839923e-05, "loss": 1.2735, "step": 5321 }, { "epoch": 1.5851374746365345, "grad_norm": 0.23037533462047577, "learning_rate": 1.9143544459235864e-05, "loss": 1.2803, "step": 5322 }, { "epoch": 1.5854353208362033, "grad_norm": 0.2217172235250473, "learning_rate": 1.914315384956134e-05, "loss": 1.2738, "step": 5323 }, { "epoch": 1.5857331670358719, "grad_norm": 0.2185138761997223, "learning_rate": 1.9142763154819982e-05, "loss": 1.2855, "step": 5324 }, { "epoch": 1.5860310132355404, "grad_norm": 0.2042170614004135, "learning_rate": 1.9142372375015426e-05, "loss": 1.271, "step": 5325 }, { "epoch": 1.5863288594352092, "grad_norm": 0.23105546832084656, "learning_rate": 1.9141981510151314e-05, "loss": 1.2881, "step": 5326 }, { "epoch": 1.586626705634878, "grad_norm": 0.2183239758014679, "learning_rate": 1.9141590560231277e-05, "loss": 1.2661, "step": 5327 }, { "epoch": 1.5869245518345463, "grad_norm": 0.2241998314857483, "learning_rate": 1.914119952525895e-05, "loss": 1.2898, "step": 5328 }, { "epoch": 1.587222398034215, "grad_norm": 0.22711949050426483, "learning_rate": 1.914080840523798e-05, "loss": 1.2874, "step": 5329 }, { "epoch": 1.5875202442338838, "grad_norm": 0.21991921961307526, "learning_rate": 1.9140417200171995e-05, "loss": 1.2931, "step": 5330 }, { "epoch": 1.5878180904335524, "grad_norm": 0.23235772550106049, "learning_rate": 1.9140025910064645e-05, "loss": 1.2815, "step": 5331 }, { "epoch": 1.588115936633221, "grad_norm": 0.22265605628490448, "learning_rate": 1.9139634534919563e-05, "loss": 1.2811, "step": 5332 }, { "epoch": 1.5884137828328897, "grad_norm": 0.23455749452114105, "learning_rate": 1.9139243074740394e-05, "loss": 1.2648, "step": 5333 }, { "epoch": 1.5887116290325585, "grad_norm": 0.21858735382556915, "learning_rate": 1.913885152953078e-05, "loss": 1.2931, "step": 5334 }, { "epoch": 1.5890094752322268, "grad_norm": 0.2214006781578064, "learning_rate": 1.9138459899294363e-05, "loss": 1.2714, "step": 5335 }, { "epoch": 1.5893073214318956, "grad_norm": 0.22406496107578278, "learning_rate": 1.9138068184034786e-05, "loss": 1.2749, "step": 5336 }, { "epoch": 1.5896051676315643, "grad_norm": 0.22526974976062775, "learning_rate": 1.9137676383755696e-05, "loss": 1.2689, "step": 5337 }, { "epoch": 1.589903013831233, "grad_norm": 0.2210036814212799, "learning_rate": 1.913728449846073e-05, "loss": 1.2855, "step": 5338 }, { "epoch": 1.5902008600309014, "grad_norm": 0.21717329323291779, "learning_rate": 1.9136892528153548e-05, "loss": 1.2754, "step": 5339 }, { "epoch": 1.5904987062305702, "grad_norm": 0.22751939296722412, "learning_rate": 1.9136500472837785e-05, "loss": 1.294, "step": 5340 }, { "epoch": 1.590796552430239, "grad_norm": 0.21087098121643066, "learning_rate": 1.9136108332517095e-05, "loss": 1.2682, "step": 5341 }, { "epoch": 1.5910943986299075, "grad_norm": 0.21704800426959991, "learning_rate": 1.913571610719512e-05, "loss": 1.2899, "step": 5342 }, { "epoch": 1.591392244829576, "grad_norm": 0.22615809738636017, "learning_rate": 1.913532379687552e-05, "loss": 1.2861, "step": 5343 }, { "epoch": 1.5916900910292449, "grad_norm": 0.2318710833787918, "learning_rate": 1.9134931401561935e-05, "loss": 1.2758, "step": 5344 }, { "epoch": 1.5919879372289134, "grad_norm": 0.22961898148059845, "learning_rate": 1.9134538921258023e-05, "loss": 1.2824, "step": 5345 }, { "epoch": 1.592285783428582, "grad_norm": 0.22651560604572296, "learning_rate": 1.913414635596743e-05, "loss": 1.2778, "step": 5346 }, { "epoch": 1.5925836296282507, "grad_norm": 0.2375839352607727, "learning_rate": 1.913375370569381e-05, "loss": 1.2988, "step": 5347 }, { "epoch": 1.5928814758279195, "grad_norm": 0.24498562514781952, "learning_rate": 1.9133360970440816e-05, "loss": 1.2686, "step": 5348 }, { "epoch": 1.593179322027588, "grad_norm": 0.21912312507629395, "learning_rate": 1.91329681502121e-05, "loss": 1.269, "step": 5349 }, { "epoch": 1.5934771682272566, "grad_norm": 0.22842130064964294, "learning_rate": 1.9132575245011326e-05, "loss": 1.2766, "step": 5350 }, { "epoch": 1.5937750144269254, "grad_norm": 0.25212031602859497, "learning_rate": 1.9132182254842138e-05, "loss": 1.2784, "step": 5351 }, { "epoch": 1.594072860626594, "grad_norm": 0.2245430201292038, "learning_rate": 1.91317891797082e-05, "loss": 1.2805, "step": 5352 }, { "epoch": 1.5943707068262625, "grad_norm": 0.2176782637834549, "learning_rate": 1.9131396019613163e-05, "loss": 1.286, "step": 5353 }, { "epoch": 1.5946685530259312, "grad_norm": 0.22105389833450317, "learning_rate": 1.913100277456069e-05, "loss": 1.2804, "step": 5354 }, { "epoch": 1.5949663992256, "grad_norm": 0.20954161882400513, "learning_rate": 1.913060944455444e-05, "loss": 1.2603, "step": 5355 }, { "epoch": 1.5952642454252686, "grad_norm": 0.2434096336364746, "learning_rate": 1.9130216029598068e-05, "loss": 1.283, "step": 5356 }, { "epoch": 1.5955620916249371, "grad_norm": 0.25870367884635925, "learning_rate": 1.9129822529695236e-05, "loss": 1.2779, "step": 5357 }, { "epoch": 1.595859937824606, "grad_norm": 0.24894842505455017, "learning_rate": 1.912942894484961e-05, "loss": 1.2618, "step": 5358 }, { "epoch": 1.5961577840242744, "grad_norm": 0.2341824173927307, "learning_rate": 1.912903527506484e-05, "loss": 1.2819, "step": 5359 }, { "epoch": 1.596455630223943, "grad_norm": 0.3381546437740326, "learning_rate": 1.9128641520344602e-05, "loss": 1.2792, "step": 5360 }, { "epoch": 1.5967534764236118, "grad_norm": 0.25865137577056885, "learning_rate": 1.9128247680692556e-05, "loss": 1.2574, "step": 5361 }, { "epoch": 1.5970513226232805, "grad_norm": 0.23169195652008057, "learning_rate": 1.912785375611236e-05, "loss": 1.2807, "step": 5362 }, { "epoch": 1.597349168822949, "grad_norm": 0.21453149616718292, "learning_rate": 1.9127459746607685e-05, "loss": 1.2875, "step": 5363 }, { "epoch": 1.5976470150226176, "grad_norm": 0.210612952709198, "learning_rate": 1.9127065652182192e-05, "loss": 1.2623, "step": 5364 }, { "epoch": 1.5979448612222864, "grad_norm": 0.23503714799880981, "learning_rate": 1.9126671472839553e-05, "loss": 1.2784, "step": 5365 }, { "epoch": 1.598242707421955, "grad_norm": 0.2105993777513504, "learning_rate": 1.912627720858343e-05, "loss": 1.2633, "step": 5366 }, { "epoch": 1.5985405536216235, "grad_norm": 0.21981845796108246, "learning_rate": 1.9125882859417497e-05, "loss": 1.2679, "step": 5367 }, { "epoch": 1.5988383998212923, "grad_norm": 0.22042503952980042, "learning_rate": 1.912548842534542e-05, "loss": 1.2765, "step": 5368 }, { "epoch": 1.599136246020961, "grad_norm": 0.22172664105892181, "learning_rate": 1.9125093906370866e-05, "loss": 1.2834, "step": 5369 }, { "epoch": 1.5994340922206296, "grad_norm": 0.21650701761245728, "learning_rate": 1.9124699302497513e-05, "loss": 1.2773, "step": 5370 }, { "epoch": 1.5997319384202981, "grad_norm": 0.2130105346441269, "learning_rate": 1.9124304613729025e-05, "loss": 1.2701, "step": 5371 }, { "epoch": 1.600029784619967, "grad_norm": 0.22273163497447968, "learning_rate": 1.9123909840069076e-05, "loss": 1.2689, "step": 5372 }, { "epoch": 1.6003276308196355, "grad_norm": 0.2219657152891159, "learning_rate": 1.912351498152134e-05, "loss": 1.2946, "step": 5373 }, { "epoch": 1.600625477019304, "grad_norm": 0.22636285424232483, "learning_rate": 1.9123120038089487e-05, "loss": 1.2741, "step": 5374 }, { "epoch": 1.6009233232189728, "grad_norm": 0.2128158062696457, "learning_rate": 1.91227250097772e-05, "loss": 1.2898, "step": 5375 }, { "epoch": 1.6012211694186416, "grad_norm": 0.2251681089401245, "learning_rate": 1.912232989658815e-05, "loss": 1.2741, "step": 5376 }, { "epoch": 1.6015190156183101, "grad_norm": 0.2215256243944168, "learning_rate": 1.9121934698526012e-05, "loss": 1.2867, "step": 5377 }, { "epoch": 1.6018168618179787, "grad_norm": 0.22041627764701843, "learning_rate": 1.9121539415594462e-05, "loss": 1.2929, "step": 5378 }, { "epoch": 1.6021147080176474, "grad_norm": 0.2318020761013031, "learning_rate": 1.9121144047797177e-05, "loss": 1.2863, "step": 5379 }, { "epoch": 1.602412554217316, "grad_norm": 0.23751366138458252, "learning_rate": 1.912074859513784e-05, "loss": 1.2709, "step": 5380 }, { "epoch": 1.6027104004169845, "grad_norm": 0.22492028772830963, "learning_rate": 1.9120353057620127e-05, "loss": 1.2606, "step": 5381 }, { "epoch": 1.6030082466166533, "grad_norm": 0.22688385844230652, "learning_rate": 1.911995743524772e-05, "loss": 1.2795, "step": 5382 }, { "epoch": 1.603306092816322, "grad_norm": 0.2196384072303772, "learning_rate": 1.9119561728024295e-05, "loss": 1.2696, "step": 5383 }, { "epoch": 1.6036039390159906, "grad_norm": 0.21395279467105865, "learning_rate": 1.911916593595354e-05, "loss": 1.2871, "step": 5384 }, { "epoch": 1.6039017852156592, "grad_norm": 0.21836192905902863, "learning_rate": 1.9118770059039133e-05, "loss": 1.2748, "step": 5385 }, { "epoch": 1.604199631415328, "grad_norm": 0.21494531631469727, "learning_rate": 1.9118374097284758e-05, "loss": 1.2866, "step": 5386 }, { "epoch": 1.6044974776149965, "grad_norm": 0.23001225292682648, "learning_rate": 1.9117978050694104e-05, "loss": 1.2732, "step": 5387 }, { "epoch": 1.604795323814665, "grad_norm": 0.2237677127122879, "learning_rate": 1.911758191927085e-05, "loss": 1.2887, "step": 5388 }, { "epoch": 1.6050931700143338, "grad_norm": 0.21367740631103516, "learning_rate": 1.911718570301868e-05, "loss": 1.2662, "step": 5389 }, { "epoch": 1.6053910162140026, "grad_norm": 0.2201278805732727, "learning_rate": 1.9116789401941284e-05, "loss": 1.2671, "step": 5390 }, { "epoch": 1.6056888624136711, "grad_norm": 0.21882885694503784, "learning_rate": 1.911639301604235e-05, "loss": 1.2796, "step": 5391 }, { "epoch": 1.6059867086133397, "grad_norm": 0.25696447491645813, "learning_rate": 1.911599654532556e-05, "loss": 1.2805, "step": 5392 }, { "epoch": 1.6062845548130085, "grad_norm": 0.2266143560409546, "learning_rate": 1.9115599989794615e-05, "loss": 1.2896, "step": 5393 }, { "epoch": 1.6065824010126772, "grad_norm": 0.24198952317237854, "learning_rate": 1.911520334945319e-05, "loss": 1.2588, "step": 5394 }, { "epoch": 1.6068802472123456, "grad_norm": 0.23393751680850983, "learning_rate": 1.911480662430499e-05, "loss": 1.2726, "step": 5395 }, { "epoch": 1.6071780934120143, "grad_norm": 0.2551330626010895, "learning_rate": 1.911440981435369e-05, "loss": 1.2901, "step": 5396 }, { "epoch": 1.6074759396116831, "grad_norm": 0.21567265689373016, "learning_rate": 1.911401291960299e-05, "loss": 1.2778, "step": 5397 }, { "epoch": 1.6077737858113517, "grad_norm": 0.24062630534172058, "learning_rate": 1.9113615940056586e-05, "loss": 1.2726, "step": 5398 }, { "epoch": 1.6080716320110202, "grad_norm": 0.2298053652048111, "learning_rate": 1.9113218875718166e-05, "loss": 1.27, "step": 5399 }, { "epoch": 1.608369478210689, "grad_norm": 0.22689305245876312, "learning_rate": 1.9112821726591427e-05, "loss": 1.2807, "step": 5400 }, { "epoch": 1.6086673244103578, "grad_norm": 0.2515493333339691, "learning_rate": 1.9112424492680064e-05, "loss": 1.2797, "step": 5401 }, { "epoch": 1.608965170610026, "grad_norm": 0.22785691916942596, "learning_rate": 1.9112027173987774e-05, "loss": 1.2781, "step": 5402 }, { "epoch": 1.6092630168096949, "grad_norm": 0.21993227303028107, "learning_rate": 1.9111629770518246e-05, "loss": 1.2627, "step": 5403 }, { "epoch": 1.6095608630093636, "grad_norm": 0.23092100024223328, "learning_rate": 1.911123228227519e-05, "loss": 1.2953, "step": 5404 }, { "epoch": 1.6098587092090322, "grad_norm": 0.22650252282619476, "learning_rate": 1.911083470926229e-05, "loss": 1.2655, "step": 5405 }, { "epoch": 1.6101565554087007, "grad_norm": 0.2112005203962326, "learning_rate": 1.9110437051483256e-05, "loss": 1.2698, "step": 5406 }, { "epoch": 1.6104544016083695, "grad_norm": 0.2540183961391449, "learning_rate": 1.9110039308941784e-05, "loss": 1.2653, "step": 5407 }, { "epoch": 1.6107522478080383, "grad_norm": 0.21758641302585602, "learning_rate": 1.9109641481641575e-05, "loss": 1.2744, "step": 5408 }, { "epoch": 1.6110500940077068, "grad_norm": 0.21442918479442596, "learning_rate": 1.9109243569586327e-05, "loss": 1.2684, "step": 5409 }, { "epoch": 1.6113479402073754, "grad_norm": 0.22990816831588745, "learning_rate": 1.9108845572779748e-05, "loss": 1.2761, "step": 5410 }, { "epoch": 1.6116457864070441, "grad_norm": 0.22343116998672485, "learning_rate": 1.910844749122554e-05, "loss": 1.2808, "step": 5411 }, { "epoch": 1.6119436326067127, "grad_norm": 0.22033905982971191, "learning_rate": 1.9108049324927403e-05, "loss": 1.2882, "step": 5412 }, { "epoch": 1.6122414788063812, "grad_norm": 0.22083336114883423, "learning_rate": 1.9107651073889038e-05, "loss": 1.2621, "step": 5413 }, { "epoch": 1.61253932500605, "grad_norm": 0.20866520702838898, "learning_rate": 1.910725273811416e-05, "loss": 1.2732, "step": 5414 }, { "epoch": 1.6128371712057188, "grad_norm": 0.22707264125347137, "learning_rate": 1.910685431760647e-05, "loss": 1.2922, "step": 5415 }, { "epoch": 1.6131350174053873, "grad_norm": 0.2184765338897705, "learning_rate": 1.9106455812369677e-05, "loss": 1.263, "step": 5416 }, { "epoch": 1.613432863605056, "grad_norm": 0.23042844235897064, "learning_rate": 1.9106057222407486e-05, "loss": 1.2661, "step": 5417 }, { "epoch": 1.6137307098047247, "grad_norm": 0.22253674268722534, "learning_rate": 1.9105658547723607e-05, "loss": 1.2839, "step": 5418 }, { "epoch": 1.6140285560043932, "grad_norm": 0.22392839193344116, "learning_rate": 1.9105259788321746e-05, "loss": 1.2831, "step": 5419 }, { "epoch": 1.6143264022040618, "grad_norm": 0.21724386513233185, "learning_rate": 1.9104860944205615e-05, "loss": 1.2766, "step": 5420 }, { "epoch": 1.6146242484037305, "grad_norm": 0.2196635752916336, "learning_rate": 1.910446201537893e-05, "loss": 1.2854, "step": 5421 }, { "epoch": 1.6149220946033993, "grad_norm": 0.2713010609149933, "learning_rate": 1.9104063001845398e-05, "loss": 1.2843, "step": 5422 }, { "epoch": 1.6152199408030679, "grad_norm": 0.22543871402740479, "learning_rate": 1.9103663903608728e-05, "loss": 1.2726, "step": 5423 }, { "epoch": 1.6155177870027364, "grad_norm": 0.2172880917787552, "learning_rate": 1.9103264720672635e-05, "loss": 1.2741, "step": 5424 }, { "epoch": 1.6158156332024052, "grad_norm": 0.23078133165836334, "learning_rate": 1.910286545304084e-05, "loss": 1.2789, "step": 5425 }, { "epoch": 1.6161134794020737, "grad_norm": 0.22541062533855438, "learning_rate": 1.9102466100717048e-05, "loss": 1.2723, "step": 5426 }, { "epoch": 1.6164113256017423, "grad_norm": 0.23872913420200348, "learning_rate": 1.910206666370498e-05, "loss": 1.2655, "step": 5427 }, { "epoch": 1.616709171801411, "grad_norm": 0.24908356368541718, "learning_rate": 1.9101667142008353e-05, "loss": 1.2914, "step": 5428 }, { "epoch": 1.6170070180010798, "grad_norm": 0.2230306714773178, "learning_rate": 1.9101267535630882e-05, "loss": 1.2852, "step": 5429 }, { "epoch": 1.6173048642007484, "grad_norm": 0.21355973184108734, "learning_rate": 1.910086784457628e-05, "loss": 1.2782, "step": 5430 }, { "epoch": 1.617602710400417, "grad_norm": 0.2299388200044632, "learning_rate": 1.9100468068848275e-05, "loss": 1.2765, "step": 5431 }, { "epoch": 1.6179005566000857, "grad_norm": 0.23457807302474976, "learning_rate": 1.9100068208450583e-05, "loss": 1.2703, "step": 5432 }, { "epoch": 1.6181984027997542, "grad_norm": 0.2189137488603592, "learning_rate": 1.909966826338692e-05, "loss": 1.2696, "step": 5433 }, { "epoch": 1.6184962489994228, "grad_norm": 0.24128244817256927, "learning_rate": 1.9099268233661016e-05, "loss": 1.2749, "step": 5434 }, { "epoch": 1.6187940951990916, "grad_norm": 0.2196899801492691, "learning_rate": 1.9098868119276585e-05, "loss": 1.2658, "step": 5435 }, { "epoch": 1.6190919413987603, "grad_norm": 0.2194526493549347, "learning_rate": 1.909846792023735e-05, "loss": 1.2688, "step": 5436 }, { "epoch": 1.6193897875984289, "grad_norm": 0.22628456354141235, "learning_rate": 1.9098067636547038e-05, "loss": 1.2692, "step": 5437 }, { "epoch": 1.6196876337980974, "grad_norm": 0.2313448041677475, "learning_rate": 1.909766726820937e-05, "loss": 1.2814, "step": 5438 }, { "epoch": 1.6199854799977662, "grad_norm": 0.24621717631816864, "learning_rate": 1.9097266815228074e-05, "loss": 1.2846, "step": 5439 }, { "epoch": 1.6202833261974348, "grad_norm": 0.22433489561080933, "learning_rate": 1.9096866277606873e-05, "loss": 1.2617, "step": 5440 }, { "epoch": 1.6205811723971033, "grad_norm": 0.2487044781446457, "learning_rate": 1.9096465655349496e-05, "loss": 1.2588, "step": 5441 }, { "epoch": 1.620879018596772, "grad_norm": 0.21417568624019623, "learning_rate": 1.909606494845967e-05, "loss": 1.2712, "step": 5442 }, { "epoch": 1.6211768647964409, "grad_norm": 0.23787125945091248, "learning_rate": 1.9095664156941123e-05, "loss": 1.2558, "step": 5443 }, { "epoch": 1.6214747109961094, "grad_norm": 0.243920236825943, "learning_rate": 1.9095263280797584e-05, "loss": 1.2645, "step": 5444 }, { "epoch": 1.621772557195778, "grad_norm": 0.24374152719974518, "learning_rate": 1.909486232003278e-05, "loss": 1.2905, "step": 5445 }, { "epoch": 1.6220704033954467, "grad_norm": 0.23641370236873627, "learning_rate": 1.909446127465044e-05, "loss": 1.265, "step": 5446 }, { "epoch": 1.6223682495951153, "grad_norm": 0.21501170098781586, "learning_rate": 1.9094060144654306e-05, "loss": 1.2737, "step": 5447 }, { "epoch": 1.6226660957947838, "grad_norm": 0.2302922159433365, "learning_rate": 1.9093658930048097e-05, "loss": 1.2672, "step": 5448 }, { "epoch": 1.6229639419944526, "grad_norm": 0.24240373075008392, "learning_rate": 1.9093257630835554e-05, "loss": 1.2801, "step": 5449 }, { "epoch": 1.6232617881941214, "grad_norm": 0.22276267409324646, "learning_rate": 1.909285624702041e-05, "loss": 1.2984, "step": 5450 }, { "epoch": 1.62355963439379, "grad_norm": 0.2276402711868286, "learning_rate": 1.9092454778606395e-05, "loss": 1.2742, "step": 5451 }, { "epoch": 1.6238574805934585, "grad_norm": 0.23029804229736328, "learning_rate": 1.9092053225597245e-05, "loss": 1.2524, "step": 5452 }, { "epoch": 1.6241553267931272, "grad_norm": 0.22955401241779327, "learning_rate": 1.9091651587996704e-05, "loss": 1.261, "step": 5453 }, { "epoch": 1.6244531729927958, "grad_norm": 0.22547177970409393, "learning_rate": 1.9091249865808498e-05, "loss": 1.2697, "step": 5454 }, { "epoch": 1.6247510191924643, "grad_norm": 0.23433789610862732, "learning_rate": 1.9090848059036372e-05, "loss": 1.276, "step": 5455 }, { "epoch": 1.6250488653921331, "grad_norm": 0.22249680757522583, "learning_rate": 1.9090446167684062e-05, "loss": 1.2781, "step": 5456 }, { "epoch": 1.6253467115918019, "grad_norm": 0.23831702768802643, "learning_rate": 1.9090044191755305e-05, "loss": 1.2812, "step": 5457 }, { "epoch": 1.6256445577914704, "grad_norm": 0.21816882491111755, "learning_rate": 1.9089642131253843e-05, "loss": 1.2686, "step": 5458 }, { "epoch": 1.625942403991139, "grad_norm": 0.2549525201320648, "learning_rate": 1.9089239986183415e-05, "loss": 1.2765, "step": 5459 }, { "epoch": 1.6262402501908078, "grad_norm": 0.2201966941356659, "learning_rate": 1.9088837756547765e-05, "loss": 1.2544, "step": 5460 }, { "epoch": 1.6265380963904765, "grad_norm": 0.22870147228240967, "learning_rate": 1.9088435442350638e-05, "loss": 1.2857, "step": 5461 }, { "epoch": 1.6268359425901449, "grad_norm": 0.2498757541179657, "learning_rate": 1.908803304359577e-05, "loss": 1.2667, "step": 5462 }, { "epoch": 1.6271337887898136, "grad_norm": 0.21831880509853363, "learning_rate": 1.9087630560286906e-05, "loss": 1.2645, "step": 5463 }, { "epoch": 1.6274316349894824, "grad_norm": 0.229718878865242, "learning_rate": 1.9087227992427796e-05, "loss": 1.2711, "step": 5464 }, { "epoch": 1.627729481189151, "grad_norm": 0.22852644324302673, "learning_rate": 1.9086825340022183e-05, "loss": 1.2653, "step": 5465 }, { "epoch": 1.6280273273888195, "grad_norm": 0.21467527747154236, "learning_rate": 1.9086422603073812e-05, "loss": 1.286, "step": 5466 }, { "epoch": 1.6283251735884883, "grad_norm": 0.2270808070898056, "learning_rate": 1.908601978158643e-05, "loss": 1.2717, "step": 5467 }, { "epoch": 1.628623019788157, "grad_norm": 0.23573139309883118, "learning_rate": 1.9085616875563788e-05, "loss": 1.2708, "step": 5468 }, { "epoch": 1.6289208659878254, "grad_norm": 0.22102656960487366, "learning_rate": 1.908521388500963e-05, "loss": 1.2806, "step": 5469 }, { "epoch": 1.6292187121874941, "grad_norm": 0.223962664604187, "learning_rate": 1.9084810809927708e-05, "loss": 1.2603, "step": 5470 }, { "epoch": 1.629516558387163, "grad_norm": 0.24056734144687653, "learning_rate": 1.908440765032177e-05, "loss": 1.2711, "step": 5471 }, { "epoch": 1.6298144045868315, "grad_norm": 0.23388129472732544, "learning_rate": 1.9084004406195568e-05, "loss": 1.2893, "step": 5472 }, { "epoch": 1.6301122507865, "grad_norm": 0.21446532011032104, "learning_rate": 1.9083601077552857e-05, "loss": 1.2731, "step": 5473 }, { "epoch": 1.6304100969861688, "grad_norm": 0.23439285159111023, "learning_rate": 1.9083197664397386e-05, "loss": 1.2745, "step": 5474 }, { "epoch": 1.6307079431858376, "grad_norm": 0.24088014662265778, "learning_rate": 1.9082794166732906e-05, "loss": 1.2797, "step": 5475 }, { "epoch": 1.631005789385506, "grad_norm": 0.21608074009418488, "learning_rate": 1.9082390584563178e-05, "loss": 1.2807, "step": 5476 }, { "epoch": 1.6313036355851747, "grad_norm": 0.228294238448143, "learning_rate": 1.9081986917891952e-05, "loss": 1.2894, "step": 5477 }, { "epoch": 1.6316014817848434, "grad_norm": 0.2253601998090744, "learning_rate": 1.9081583166722986e-05, "loss": 1.2725, "step": 5478 }, { "epoch": 1.631899327984512, "grad_norm": 0.2199680656194687, "learning_rate": 1.908117933106003e-05, "loss": 1.272, "step": 5479 }, { "epoch": 1.6321971741841805, "grad_norm": 0.22012652456760406, "learning_rate": 1.9080775410906854e-05, "loss": 1.2612, "step": 5480 }, { "epoch": 1.6324950203838493, "grad_norm": 0.2143363207578659, "learning_rate": 1.9080371406267205e-05, "loss": 1.2676, "step": 5481 }, { "epoch": 1.632792866583518, "grad_norm": 0.23318170011043549, "learning_rate": 1.907996731714484e-05, "loss": 1.2822, "step": 5482 }, { "epoch": 1.6330907127831866, "grad_norm": 0.2162562608718872, "learning_rate": 1.907956314354353e-05, "loss": 1.2734, "step": 5483 }, { "epoch": 1.6333885589828552, "grad_norm": 0.22819747030735016, "learning_rate": 1.9079158885467027e-05, "loss": 1.2855, "step": 5484 }, { "epoch": 1.633686405182524, "grad_norm": 0.25651898980140686, "learning_rate": 1.90787545429191e-05, "loss": 1.2655, "step": 5485 }, { "epoch": 1.6339842513821925, "grad_norm": 0.22409702837467194, "learning_rate": 1.9078350115903496e-05, "loss": 1.2695, "step": 5486 }, { "epoch": 1.634282097581861, "grad_norm": 0.23563243448734283, "learning_rate": 1.9077945604423994e-05, "loss": 1.2765, "step": 5487 }, { "epoch": 1.6345799437815298, "grad_norm": 0.2208978831768036, "learning_rate": 1.9077541008484347e-05, "loss": 1.2881, "step": 5488 }, { "epoch": 1.6348777899811986, "grad_norm": 0.23069295287132263, "learning_rate": 1.9077136328088325e-05, "loss": 1.2965, "step": 5489 }, { "epoch": 1.6351756361808671, "grad_norm": 0.2202451229095459, "learning_rate": 1.907673156323969e-05, "loss": 1.27, "step": 5490 }, { "epoch": 1.6354734823805357, "grad_norm": 0.21095435321331024, "learning_rate": 1.9076326713942208e-05, "loss": 1.2546, "step": 5491 }, { "epoch": 1.6357713285802045, "grad_norm": 0.2126925140619278, "learning_rate": 1.9075921780199648e-05, "loss": 1.2693, "step": 5492 }, { "epoch": 1.636069174779873, "grad_norm": 0.21833592653274536, "learning_rate": 1.9075516762015777e-05, "loss": 1.2755, "step": 5493 }, { "epoch": 1.6363670209795416, "grad_norm": 0.22189980745315552, "learning_rate": 1.907511165939436e-05, "loss": 1.2861, "step": 5494 }, { "epoch": 1.6366648671792103, "grad_norm": 0.21767203509807587, "learning_rate": 1.907470647233917e-05, "loss": 1.2742, "step": 5495 }, { "epoch": 1.636962713378879, "grad_norm": 0.21431377530097961, "learning_rate": 1.9074301200853976e-05, "loss": 1.2577, "step": 5496 }, { "epoch": 1.6372605595785477, "grad_norm": 0.2187560349702835, "learning_rate": 1.9073895844942548e-05, "loss": 1.2672, "step": 5497 }, { "epoch": 1.6375584057782162, "grad_norm": 0.22370322048664093, "learning_rate": 1.9073490404608654e-05, "loss": 1.2764, "step": 5498 }, { "epoch": 1.637856251977885, "grad_norm": 0.2242758572101593, "learning_rate": 1.907308487985607e-05, "loss": 1.2857, "step": 5499 }, { "epoch": 1.6381540981775535, "grad_norm": 0.20985734462738037, "learning_rate": 1.907267927068857e-05, "loss": 1.2697, "step": 5500 }, { "epoch": 1.6381540981775535, "eval_loss": 1.3477442264556885, "eval_runtime": 21.5816, "eval_samples_per_second": 80.346, "eval_steps_per_second": 5.051, "step": 5500 }, { "epoch": 1.638451944377222, "grad_norm": 0.22465425729751587, "learning_rate": 1.9072273577109923e-05, "loss": 1.2718, "step": 5501 }, { "epoch": 1.6387497905768909, "grad_norm": 0.23549233376979828, "learning_rate": 1.9071867799123913e-05, "loss": 1.2783, "step": 5502 }, { "epoch": 1.6390476367765596, "grad_norm": 0.22173607349395752, "learning_rate": 1.90714619367343e-05, "loss": 1.2814, "step": 5503 }, { "epoch": 1.6393454829762282, "grad_norm": 0.24669671058654785, "learning_rate": 1.9071055989944873e-05, "loss": 1.2796, "step": 5504 }, { "epoch": 1.6396433291758967, "grad_norm": 0.2293134182691574, "learning_rate": 1.9070649958759406e-05, "loss": 1.2793, "step": 5505 }, { "epoch": 1.6399411753755655, "grad_norm": 0.23103909194469452, "learning_rate": 1.9070243843181675e-05, "loss": 1.2993, "step": 5506 }, { "epoch": 1.640239021575234, "grad_norm": 0.21258032321929932, "learning_rate": 1.9069837643215457e-05, "loss": 1.2903, "step": 5507 }, { "epoch": 1.6405368677749026, "grad_norm": 0.2531338334083557, "learning_rate": 1.9069431358864535e-05, "loss": 1.2822, "step": 5508 }, { "epoch": 1.6408347139745714, "grad_norm": 0.21448421478271484, "learning_rate": 1.9069024990132688e-05, "loss": 1.269, "step": 5509 }, { "epoch": 1.6411325601742401, "grad_norm": 0.21856717765331268, "learning_rate": 1.9068618537023695e-05, "loss": 1.2749, "step": 5510 }, { "epoch": 1.6414304063739087, "grad_norm": 0.23338472843170166, "learning_rate": 1.9068211999541336e-05, "loss": 1.2791, "step": 5511 }, { "epoch": 1.6417282525735772, "grad_norm": 0.22949013113975525, "learning_rate": 1.90678053776894e-05, "loss": 1.2751, "step": 5512 }, { "epoch": 1.642026098773246, "grad_norm": 0.25006377696990967, "learning_rate": 1.906739867147166e-05, "loss": 1.2781, "step": 5513 }, { "epoch": 1.6423239449729146, "grad_norm": 0.23023587465286255, "learning_rate": 1.906699188089191e-05, "loss": 1.2832, "step": 5514 }, { "epoch": 1.6426217911725831, "grad_norm": 0.2393902689218521, "learning_rate": 1.9066585005953934e-05, "loss": 1.2796, "step": 5515 }, { "epoch": 1.6429196373722519, "grad_norm": 0.22710615396499634, "learning_rate": 1.906617804666151e-05, "loss": 1.2771, "step": 5516 }, { "epoch": 1.6432174835719207, "grad_norm": 0.3001965582370758, "learning_rate": 1.9065771003018433e-05, "loss": 1.2713, "step": 5517 }, { "epoch": 1.6435153297715892, "grad_norm": 0.2223108857870102, "learning_rate": 1.906536387502848e-05, "loss": 1.2755, "step": 5518 }, { "epoch": 1.6438131759712578, "grad_norm": 0.2387503832578659, "learning_rate": 1.906495666269545e-05, "loss": 1.2922, "step": 5519 }, { "epoch": 1.6441110221709265, "grad_norm": 0.23094727098941803, "learning_rate": 1.9064549366023124e-05, "loss": 1.2711, "step": 5520 }, { "epoch": 1.6444088683705953, "grad_norm": 0.22692003846168518, "learning_rate": 1.9064141985015293e-05, "loss": 1.2765, "step": 5521 }, { "epoch": 1.6447067145702636, "grad_norm": 0.2327008694410324, "learning_rate": 1.9063734519675748e-05, "loss": 1.2811, "step": 5522 }, { "epoch": 1.6450045607699324, "grad_norm": 0.23037296533584595, "learning_rate": 1.906332697000828e-05, "loss": 1.286, "step": 5523 }, { "epoch": 1.6453024069696012, "grad_norm": 0.22375428676605225, "learning_rate": 1.906291933601668e-05, "loss": 1.2741, "step": 5524 }, { "epoch": 1.6456002531692697, "grad_norm": 0.22224177420139313, "learning_rate": 1.9062511617704743e-05, "loss": 1.2722, "step": 5525 }, { "epoch": 1.6458980993689383, "grad_norm": 0.22655761241912842, "learning_rate": 1.9062103815076257e-05, "loss": 1.2644, "step": 5526 }, { "epoch": 1.646195945568607, "grad_norm": 0.22255229949951172, "learning_rate": 1.906169592813502e-05, "loss": 1.271, "step": 5527 }, { "epoch": 1.6464937917682758, "grad_norm": 0.2182021141052246, "learning_rate": 1.9061287956884834e-05, "loss": 1.2652, "step": 5528 }, { "epoch": 1.6467916379679441, "grad_norm": 0.23365797102451324, "learning_rate": 1.9060879901329482e-05, "loss": 1.2886, "step": 5529 }, { "epoch": 1.647089484167613, "grad_norm": 0.2474723905324936, "learning_rate": 1.9060471761472766e-05, "loss": 1.2832, "step": 5530 }, { "epoch": 1.6473873303672817, "grad_norm": 0.20822176337242126, "learning_rate": 1.9060063537318484e-05, "loss": 1.2621, "step": 5531 }, { "epoch": 1.6476851765669502, "grad_norm": 0.22447814047336578, "learning_rate": 1.9059655228870434e-05, "loss": 1.276, "step": 5532 }, { "epoch": 1.6479830227666188, "grad_norm": 0.22813937067985535, "learning_rate": 1.905924683613241e-05, "loss": 1.2783, "step": 5533 }, { "epoch": 1.6482808689662876, "grad_norm": 0.22755442559719086, "learning_rate": 1.905883835910822e-05, "loss": 1.2882, "step": 5534 }, { "epoch": 1.6485787151659563, "grad_norm": 0.22414061427116394, "learning_rate": 1.9058429797801657e-05, "loss": 1.2656, "step": 5535 }, { "epoch": 1.6488765613656247, "grad_norm": 0.22432328760623932, "learning_rate": 1.9058021152216527e-05, "loss": 1.2664, "step": 5536 }, { "epoch": 1.6491744075652934, "grad_norm": 0.2137758731842041, "learning_rate": 1.9057612422356633e-05, "loss": 1.266, "step": 5537 }, { "epoch": 1.6494722537649622, "grad_norm": 0.22579067945480347, "learning_rate": 1.9057203608225773e-05, "loss": 1.2476, "step": 5538 }, { "epoch": 1.6497700999646308, "grad_norm": 0.2272815704345703, "learning_rate": 1.9056794709827752e-05, "loss": 1.2715, "step": 5539 }, { "epoch": 1.6500679461642993, "grad_norm": 0.22341889142990112, "learning_rate": 1.9056385727166376e-05, "loss": 1.2826, "step": 5540 }, { "epoch": 1.650365792363968, "grad_norm": 0.24031281471252441, "learning_rate": 1.905597666024545e-05, "loss": 1.268, "step": 5541 }, { "epoch": 1.6506636385636368, "grad_norm": 0.24679391086101532, "learning_rate": 1.9055567509068777e-05, "loss": 1.3016, "step": 5542 }, { "epoch": 1.6509614847633054, "grad_norm": 0.23233523964881897, "learning_rate": 1.905515827364017e-05, "loss": 1.2812, "step": 5543 }, { "epoch": 1.651259330962974, "grad_norm": 0.21395809948444366, "learning_rate": 1.9054748953963427e-05, "loss": 1.2742, "step": 5544 }, { "epoch": 1.6515571771626427, "grad_norm": 0.2266373187303543, "learning_rate": 1.9054339550042364e-05, "loss": 1.2536, "step": 5545 }, { "epoch": 1.6518550233623113, "grad_norm": 0.2295445203781128, "learning_rate": 1.9053930061880788e-05, "loss": 1.2784, "step": 5546 }, { "epoch": 1.6521528695619798, "grad_norm": 0.22353899478912354, "learning_rate": 1.9053520489482506e-05, "loss": 1.2576, "step": 5547 }, { "epoch": 1.6524507157616486, "grad_norm": 0.22203253209590912, "learning_rate": 1.9053110832851335e-05, "loss": 1.2885, "step": 5548 }, { "epoch": 1.6527485619613174, "grad_norm": 0.2155544012784958, "learning_rate": 1.905270109199108e-05, "loss": 1.2789, "step": 5549 }, { "epoch": 1.653046408160986, "grad_norm": 0.22099432349205017, "learning_rate": 1.9052291266905553e-05, "loss": 1.2626, "step": 5550 }, { "epoch": 1.6533442543606545, "grad_norm": 0.22569872438907623, "learning_rate": 1.9051881357598575e-05, "loss": 1.2599, "step": 5551 }, { "epoch": 1.6536421005603232, "grad_norm": 0.23275689780712128, "learning_rate": 1.9051471364073954e-05, "loss": 1.2762, "step": 5552 }, { "epoch": 1.6539399467599918, "grad_norm": 0.22771941125392914, "learning_rate": 1.9051061286335498e-05, "loss": 1.2906, "step": 5553 }, { "epoch": 1.6542377929596603, "grad_norm": 0.21359166502952576, "learning_rate": 1.9050651124387035e-05, "loss": 1.261, "step": 5554 }, { "epoch": 1.654535639159329, "grad_norm": 0.2313777059316635, "learning_rate": 1.9050240878232375e-05, "loss": 1.2778, "step": 5555 }, { "epoch": 1.6548334853589979, "grad_norm": 0.2221272587776184, "learning_rate": 1.9049830547875334e-05, "loss": 1.2712, "step": 5556 }, { "epoch": 1.6551313315586664, "grad_norm": 0.20936013758182526, "learning_rate": 1.9049420133319732e-05, "loss": 1.2664, "step": 5557 }, { "epoch": 1.655429177758335, "grad_norm": 0.2277703881263733, "learning_rate": 1.9049009634569384e-05, "loss": 1.2766, "step": 5558 }, { "epoch": 1.6557270239580038, "grad_norm": 0.2331382781267166, "learning_rate": 1.904859905162811e-05, "loss": 1.2557, "step": 5559 }, { "epoch": 1.6560248701576723, "grad_norm": 0.2420787364244461, "learning_rate": 1.9048188384499736e-05, "loss": 1.2863, "step": 5560 }, { "epoch": 1.6563227163573409, "grad_norm": 0.21740663051605225, "learning_rate": 1.9047777633188077e-05, "loss": 1.2483, "step": 5561 }, { "epoch": 1.6566205625570096, "grad_norm": 0.23249472677707672, "learning_rate": 1.9047366797696954e-05, "loss": 1.2588, "step": 5562 }, { "epoch": 1.6569184087566784, "grad_norm": 0.21488188207149506, "learning_rate": 1.9046955878030195e-05, "loss": 1.2798, "step": 5563 }, { "epoch": 1.657216254956347, "grad_norm": 0.24667470157146454, "learning_rate": 1.9046544874191617e-05, "loss": 1.2621, "step": 5564 }, { "epoch": 1.6575141011560155, "grad_norm": 0.23690661787986755, "learning_rate": 1.9046133786185045e-05, "loss": 1.2844, "step": 5565 }, { "epoch": 1.6578119473556843, "grad_norm": 0.23039014637470245, "learning_rate": 1.904572261401431e-05, "loss": 1.2688, "step": 5566 }, { "epoch": 1.6581097935553528, "grad_norm": 0.24273911118507385, "learning_rate": 1.904531135768323e-05, "loss": 1.2863, "step": 5567 }, { "epoch": 1.6584076397550214, "grad_norm": 0.21636506915092468, "learning_rate": 1.9044900017195627e-05, "loss": 1.2591, "step": 5568 }, { "epoch": 1.6587054859546901, "grad_norm": 0.22625398635864258, "learning_rate": 1.9044488592555343e-05, "loss": 1.2691, "step": 5569 }, { "epoch": 1.659003332154359, "grad_norm": 0.23376969993114471, "learning_rate": 1.90440770837662e-05, "loss": 1.2652, "step": 5570 }, { "epoch": 1.6593011783540275, "grad_norm": 0.21269488334655762, "learning_rate": 1.9043665490832018e-05, "loss": 1.2799, "step": 5571 }, { "epoch": 1.659599024553696, "grad_norm": 0.22330698370933533, "learning_rate": 1.9043253813756634e-05, "loss": 1.2787, "step": 5572 }, { "epoch": 1.6598968707533648, "grad_norm": 0.21178990602493286, "learning_rate": 1.9042842052543878e-05, "loss": 1.2599, "step": 5573 }, { "epoch": 1.6601947169530333, "grad_norm": 0.21995270252227783, "learning_rate": 1.904243020719758e-05, "loss": 1.2697, "step": 5574 }, { "epoch": 1.6604925631527019, "grad_norm": 0.25758859515190125, "learning_rate": 1.904201827772157e-05, "loss": 1.2784, "step": 5575 }, { "epoch": 1.6607904093523707, "grad_norm": 0.3004457652568817, "learning_rate": 1.9041606264119683e-05, "loss": 1.2754, "step": 5576 }, { "epoch": 1.6610882555520394, "grad_norm": 0.27928078174591064, "learning_rate": 1.9041194166395753e-05, "loss": 1.2803, "step": 5577 }, { "epoch": 1.661386101751708, "grad_norm": 0.22976548969745636, "learning_rate": 1.9040781984553612e-05, "loss": 1.2912, "step": 5578 }, { "epoch": 1.6616839479513765, "grad_norm": 0.3062029480934143, "learning_rate": 1.9040369718597098e-05, "loss": 1.2855, "step": 5579 }, { "epoch": 1.6619817941510453, "grad_norm": 0.2543890178203583, "learning_rate": 1.903995736853004e-05, "loss": 1.2666, "step": 5580 }, { "epoch": 1.6622796403507138, "grad_norm": 0.22848451137542725, "learning_rate": 1.903954493435628e-05, "loss": 1.2911, "step": 5581 }, { "epoch": 1.6625774865503824, "grad_norm": 0.22023822367191315, "learning_rate": 1.9039132416079656e-05, "loss": 1.2682, "step": 5582 }, { "epoch": 1.6628753327500512, "grad_norm": 0.24857406318187714, "learning_rate": 1.9038719813704e-05, "loss": 1.264, "step": 5583 }, { "epoch": 1.66317317894972, "grad_norm": 0.24410122632980347, "learning_rate": 1.9038307127233157e-05, "loss": 1.2985, "step": 5584 }, { "epoch": 1.6634710251493885, "grad_norm": 0.2163051813840866, "learning_rate": 1.9037894356670964e-05, "loss": 1.2765, "step": 5585 }, { "epoch": 1.663768871349057, "grad_norm": 0.22405213117599487, "learning_rate": 1.9037481502021266e-05, "loss": 1.2696, "step": 5586 }, { "epoch": 1.6640667175487258, "grad_norm": 0.2406502664089203, "learning_rate": 1.9037068563287894e-05, "loss": 1.2746, "step": 5587 }, { "epoch": 1.6643645637483946, "grad_norm": 0.23189301788806915, "learning_rate": 1.90366555404747e-05, "loss": 1.2684, "step": 5588 }, { "epoch": 1.664662409948063, "grad_norm": 0.21800807118415833, "learning_rate": 1.9036242433585517e-05, "loss": 1.2672, "step": 5589 }, { "epoch": 1.6649602561477317, "grad_norm": 0.2630351781845093, "learning_rate": 1.90358292426242e-05, "loss": 1.2678, "step": 5590 }, { "epoch": 1.6652581023474005, "grad_norm": 0.22028449177742004, "learning_rate": 1.9035415967594587e-05, "loss": 1.2878, "step": 5591 }, { "epoch": 1.665555948547069, "grad_norm": 0.22171668708324432, "learning_rate": 1.903500260850052e-05, "loss": 1.2765, "step": 5592 }, { "epoch": 1.6658537947467376, "grad_norm": 0.22152946889400482, "learning_rate": 1.9034589165345848e-05, "loss": 1.2744, "step": 5593 }, { "epoch": 1.6661516409464063, "grad_norm": 0.2194407433271408, "learning_rate": 1.903417563813442e-05, "loss": 1.2786, "step": 5594 }, { "epoch": 1.666449487146075, "grad_norm": 0.22076888382434845, "learning_rate": 1.903376202687008e-05, "loss": 1.2613, "step": 5595 }, { "epoch": 1.6667473333457434, "grad_norm": 0.22642038762569427, "learning_rate": 1.903334833155668e-05, "loss": 1.2887, "step": 5596 }, { "epoch": 1.6670451795454122, "grad_norm": 0.23118306696414948, "learning_rate": 1.9032934552198066e-05, "loss": 1.2798, "step": 5597 }, { "epoch": 1.667343025745081, "grad_norm": 0.22751539945602417, "learning_rate": 1.9032520688798085e-05, "loss": 1.2701, "step": 5598 }, { "epoch": 1.6676408719447495, "grad_norm": 0.21959461271762848, "learning_rate": 1.9032106741360593e-05, "loss": 1.2881, "step": 5599 }, { "epoch": 1.667938718144418, "grad_norm": 0.23361878097057343, "learning_rate": 1.9031692709889437e-05, "loss": 1.2481, "step": 5600 }, { "epoch": 1.6682365643440868, "grad_norm": 0.21968644857406616, "learning_rate": 1.9031278594388472e-05, "loss": 1.289, "step": 5601 }, { "epoch": 1.6685344105437556, "grad_norm": 0.21639177203178406, "learning_rate": 1.903086439486155e-05, "loss": 1.2789, "step": 5602 }, { "epoch": 1.6688322567434242, "grad_norm": 0.22558465600013733, "learning_rate": 1.9030450111312526e-05, "loss": 1.2613, "step": 5603 }, { "epoch": 1.6691301029430927, "grad_norm": 0.2187129706144333, "learning_rate": 1.9030035743745253e-05, "loss": 1.284, "step": 5604 }, { "epoch": 1.6694279491427615, "grad_norm": 0.23081013560295105, "learning_rate": 1.9029621292163587e-05, "loss": 1.2707, "step": 5605 }, { "epoch": 1.66972579534243, "grad_norm": 0.22469882667064667, "learning_rate": 1.9029206756571384e-05, "loss": 1.2798, "step": 5606 }, { "epoch": 1.6700236415420986, "grad_norm": 0.21861881017684937, "learning_rate": 1.90287921369725e-05, "loss": 1.2668, "step": 5607 }, { "epoch": 1.6703214877417674, "grad_norm": 0.22377395629882812, "learning_rate": 1.9028377433370787e-05, "loss": 1.2768, "step": 5608 }, { "epoch": 1.6706193339414361, "grad_norm": 0.22810766100883484, "learning_rate": 1.9027962645770114e-05, "loss": 1.2574, "step": 5609 }, { "epoch": 1.6709171801411047, "grad_norm": 0.2153186798095703, "learning_rate": 1.9027547774174337e-05, "loss": 1.2804, "step": 5610 }, { "epoch": 1.6712150263407732, "grad_norm": 0.21898439526557922, "learning_rate": 1.9027132818587308e-05, "loss": 1.2558, "step": 5611 }, { "epoch": 1.671512872540442, "grad_norm": 0.21713757514953613, "learning_rate": 1.9026717779012897e-05, "loss": 1.2703, "step": 5612 }, { "epoch": 1.6718107187401106, "grad_norm": 0.22190316021442413, "learning_rate": 1.9026302655454962e-05, "loss": 1.2768, "step": 5613 }, { "epoch": 1.672108564939779, "grad_norm": 0.21604092419147491, "learning_rate": 1.902588744791737e-05, "loss": 1.2789, "step": 5614 }, { "epoch": 1.6724064111394479, "grad_norm": 0.21985818445682526, "learning_rate": 1.9025472156403976e-05, "loss": 1.2885, "step": 5615 }, { "epoch": 1.6727042573391167, "grad_norm": 0.22372137010097504, "learning_rate": 1.9025056780918646e-05, "loss": 1.2875, "step": 5616 }, { "epoch": 1.6730021035387852, "grad_norm": 0.22135815024375916, "learning_rate": 1.9024641321465246e-05, "loss": 1.2765, "step": 5617 }, { "epoch": 1.6732999497384538, "grad_norm": 0.22023016214370728, "learning_rate": 1.902422577804764e-05, "loss": 1.2767, "step": 5618 }, { "epoch": 1.6735977959381225, "grad_norm": 0.24457132816314697, "learning_rate": 1.9023810150669702e-05, "loss": 1.2516, "step": 5619 }, { "epoch": 1.673895642137791, "grad_norm": 0.22207579016685486, "learning_rate": 1.9023394439335287e-05, "loss": 1.2702, "step": 5620 }, { "epoch": 1.6741934883374596, "grad_norm": 0.22879260778427124, "learning_rate": 1.9022978644048275e-05, "loss": 1.2526, "step": 5621 }, { "epoch": 1.6744913345371284, "grad_norm": 0.22566340863704681, "learning_rate": 1.902256276481252e-05, "loss": 1.2749, "step": 5622 }, { "epoch": 1.6747891807367972, "grad_norm": 0.2589960992336273, "learning_rate": 1.90221468016319e-05, "loss": 1.2624, "step": 5623 }, { "epoch": 1.6750870269364657, "grad_norm": 0.24110759794712067, "learning_rate": 1.9021730754510288e-05, "loss": 1.2804, "step": 5624 }, { "epoch": 1.6753848731361343, "grad_norm": 0.24634456634521484, "learning_rate": 1.902131462345155e-05, "loss": 1.2848, "step": 5625 }, { "epoch": 1.675682719335803, "grad_norm": 0.23470130562782288, "learning_rate": 1.902089840845956e-05, "loss": 1.2949, "step": 5626 }, { "epoch": 1.6759805655354716, "grad_norm": 0.32685351371765137, "learning_rate": 1.902048210953818e-05, "loss": 1.2794, "step": 5627 }, { "epoch": 1.6762784117351401, "grad_norm": 0.23882338404655457, "learning_rate": 1.9020065726691302e-05, "loss": 1.275, "step": 5628 }, { "epoch": 1.676576257934809, "grad_norm": 0.24273385107517242, "learning_rate": 1.9019649259922787e-05, "loss": 1.2843, "step": 5629 }, { "epoch": 1.6768741041344777, "grad_norm": 0.23705027997493744, "learning_rate": 1.9019232709236516e-05, "loss": 1.2632, "step": 5630 }, { "epoch": 1.6771719503341462, "grad_norm": 0.2242920696735382, "learning_rate": 1.9018816074636358e-05, "loss": 1.2862, "step": 5631 }, { "epoch": 1.6774697965338148, "grad_norm": 0.23454338312149048, "learning_rate": 1.9018399356126195e-05, "loss": 1.2653, "step": 5632 }, { "epoch": 1.6777676427334836, "grad_norm": 0.25819316506385803, "learning_rate": 1.90179825537099e-05, "loss": 1.2643, "step": 5633 }, { "epoch": 1.678065488933152, "grad_norm": 0.2194097489118576, "learning_rate": 1.901756566739135e-05, "loss": 1.2764, "step": 5634 }, { "epoch": 1.6783633351328207, "grad_norm": 0.23922835290431976, "learning_rate": 1.901714869717443e-05, "loss": 1.2685, "step": 5635 }, { "epoch": 1.6786611813324894, "grad_norm": 0.24081382155418396, "learning_rate": 1.9016731643063018e-05, "loss": 1.2866, "step": 5636 }, { "epoch": 1.6789590275321582, "grad_norm": 0.23041003942489624, "learning_rate": 1.9016314505060987e-05, "loss": 1.2781, "step": 5637 }, { "epoch": 1.6792568737318267, "grad_norm": 0.21383997797966003, "learning_rate": 1.9015897283172232e-05, "loss": 1.262, "step": 5638 }, { "epoch": 1.6795547199314953, "grad_norm": 0.2681867182254791, "learning_rate": 1.901547997740062e-05, "loss": 1.2789, "step": 5639 }, { "epoch": 1.679852566131164, "grad_norm": 0.23258072137832642, "learning_rate": 1.9015062587750036e-05, "loss": 1.2826, "step": 5640 }, { "epoch": 1.6801504123308326, "grad_norm": 0.23819170892238617, "learning_rate": 1.9014645114224374e-05, "loss": 1.2643, "step": 5641 }, { "epoch": 1.6804482585305012, "grad_norm": 0.2216070592403412, "learning_rate": 1.9014227556827505e-05, "loss": 1.2629, "step": 5642 }, { "epoch": 1.68074610473017, "grad_norm": 0.23336848616600037, "learning_rate": 1.9013809915563322e-05, "loss": 1.2682, "step": 5643 }, { "epoch": 1.6810439509298387, "grad_norm": 0.24089714884757996, "learning_rate": 1.901339219043571e-05, "loss": 1.2665, "step": 5644 }, { "epoch": 1.6813417971295073, "grad_norm": 0.2426704615354538, "learning_rate": 1.9012974381448553e-05, "loss": 1.2665, "step": 5645 }, { "epoch": 1.6816396433291758, "grad_norm": 0.22386778891086578, "learning_rate": 1.901255648860574e-05, "loss": 1.2746, "step": 5646 }, { "epoch": 1.6819374895288446, "grad_norm": 0.23168200254440308, "learning_rate": 1.9012138511911156e-05, "loss": 1.2739, "step": 5647 }, { "epoch": 1.6822353357285131, "grad_norm": 0.2353355884552002, "learning_rate": 1.9011720451368693e-05, "loss": 1.2616, "step": 5648 }, { "epoch": 1.6825331819281817, "grad_norm": 0.22925683856010437, "learning_rate": 1.901130230698224e-05, "loss": 1.2615, "step": 5649 }, { "epoch": 1.6828310281278505, "grad_norm": 0.23452536761760712, "learning_rate": 1.9010884078755688e-05, "loss": 1.2678, "step": 5650 }, { "epoch": 1.6831288743275192, "grad_norm": 0.22609202563762665, "learning_rate": 1.9010465766692924e-05, "loss": 1.281, "step": 5651 }, { "epoch": 1.6834267205271878, "grad_norm": 0.22634583711624146, "learning_rate": 1.901004737079785e-05, "loss": 1.2736, "step": 5652 }, { "epoch": 1.6837245667268563, "grad_norm": 0.22738151252269745, "learning_rate": 1.9009628891074346e-05, "loss": 1.286, "step": 5653 }, { "epoch": 1.684022412926525, "grad_norm": 0.22332048416137695, "learning_rate": 1.9009210327526314e-05, "loss": 1.2824, "step": 5654 }, { "epoch": 1.6843202591261939, "grad_norm": 0.2146749645471573, "learning_rate": 1.9008791680157643e-05, "loss": 1.2672, "step": 5655 }, { "epoch": 1.6846181053258622, "grad_norm": 0.28092333674430847, "learning_rate": 1.900837294897223e-05, "loss": 1.2681, "step": 5656 }, { "epoch": 1.684915951525531, "grad_norm": 0.21779432892799377, "learning_rate": 1.9007954133973974e-05, "loss": 1.2946, "step": 5657 }, { "epoch": 1.6852137977251997, "grad_norm": 0.23540149629116058, "learning_rate": 1.9007535235166768e-05, "loss": 1.2891, "step": 5658 }, { "epoch": 1.6855116439248683, "grad_norm": 0.23689579963684082, "learning_rate": 1.900711625255451e-05, "loss": 1.2635, "step": 5659 }, { "epoch": 1.6858094901245368, "grad_norm": 0.2266598492860794, "learning_rate": 1.90066971861411e-05, "loss": 1.2671, "step": 5660 }, { "epoch": 1.6861073363242056, "grad_norm": 0.21305103600025177, "learning_rate": 1.9006278035930434e-05, "loss": 1.2679, "step": 5661 }, { "epoch": 1.6864051825238744, "grad_norm": 0.2813588082790375, "learning_rate": 1.9005858801926416e-05, "loss": 1.2576, "step": 5662 }, { "epoch": 1.6867030287235427, "grad_norm": 0.21802647411823273, "learning_rate": 1.9005439484132943e-05, "loss": 1.2721, "step": 5663 }, { "epoch": 1.6870008749232115, "grad_norm": 0.22629772126674652, "learning_rate": 1.9005020082553915e-05, "loss": 1.2528, "step": 5664 }, { "epoch": 1.6872987211228803, "grad_norm": 0.2283496856689453, "learning_rate": 1.900460059719324e-05, "loss": 1.2587, "step": 5665 }, { "epoch": 1.6875965673225488, "grad_norm": 0.22529184818267822, "learning_rate": 1.9004181028054813e-05, "loss": 1.2721, "step": 5666 }, { "epoch": 1.6878944135222174, "grad_norm": 0.2219720184803009, "learning_rate": 1.9003761375142543e-05, "loss": 1.259, "step": 5667 }, { "epoch": 1.6881922597218861, "grad_norm": 0.22435903549194336, "learning_rate": 1.9003341638460334e-05, "loss": 1.266, "step": 5668 }, { "epoch": 1.688490105921555, "grad_norm": 0.23676352202892303, "learning_rate": 1.900292181801209e-05, "loss": 1.2651, "step": 5669 }, { "epoch": 1.6887879521212235, "grad_norm": 0.21435624361038208, "learning_rate": 1.900250191380171e-05, "loss": 1.2782, "step": 5670 }, { "epoch": 1.689085798320892, "grad_norm": 0.22388912737369537, "learning_rate": 1.9002081925833117e-05, "loss": 1.2797, "step": 5671 }, { "epoch": 1.6893836445205608, "grad_norm": 0.23809774219989777, "learning_rate": 1.9001661854110207e-05, "loss": 1.279, "step": 5672 }, { "epoch": 1.6896814907202293, "grad_norm": 0.22046956419944763, "learning_rate": 1.900124169863689e-05, "loss": 1.2641, "step": 5673 }, { "epoch": 1.6899793369198979, "grad_norm": 0.22175170481204987, "learning_rate": 1.9000821459417074e-05, "loss": 1.2809, "step": 5674 }, { "epoch": 1.6902771831195667, "grad_norm": 0.25351229310035706, "learning_rate": 1.9000401136454675e-05, "loss": 1.2721, "step": 5675 }, { "epoch": 1.6905750293192354, "grad_norm": 0.21347540616989136, "learning_rate": 1.89999807297536e-05, "loss": 1.2853, "step": 5676 }, { "epoch": 1.690872875518904, "grad_norm": 0.22976148128509521, "learning_rate": 1.8999560239317752e-05, "loss": 1.2745, "step": 5677 }, { "epoch": 1.6911707217185725, "grad_norm": 0.22690346837043762, "learning_rate": 1.8999139665151056e-05, "loss": 1.2798, "step": 5678 }, { "epoch": 1.6914685679182413, "grad_norm": 0.2327873408794403, "learning_rate": 1.899871900725742e-05, "loss": 1.2544, "step": 5679 }, { "epoch": 1.6917664141179098, "grad_norm": 0.22844178974628448, "learning_rate": 1.899829826564076e-05, "loss": 1.2614, "step": 5680 }, { "epoch": 1.6920642603175784, "grad_norm": 0.21966998279094696, "learning_rate": 1.8997877440304982e-05, "loss": 1.2887, "step": 5681 }, { "epoch": 1.6923621065172472, "grad_norm": 0.2034863829612732, "learning_rate": 1.899745653125401e-05, "loss": 1.2698, "step": 5682 }, { "epoch": 1.692659952716916, "grad_norm": 0.21531030535697937, "learning_rate": 1.899703553849176e-05, "loss": 1.2692, "step": 5683 }, { "epoch": 1.6929577989165845, "grad_norm": 0.22363252937793732, "learning_rate": 1.8996614462022146e-05, "loss": 1.2721, "step": 5684 }, { "epoch": 1.693255645116253, "grad_norm": 0.22647421061992645, "learning_rate": 1.899619330184908e-05, "loss": 1.2728, "step": 5685 }, { "epoch": 1.6935534913159218, "grad_norm": 0.22268177568912506, "learning_rate": 1.899577205797649e-05, "loss": 1.2978, "step": 5686 }, { "epoch": 1.6938513375155904, "grad_norm": 0.22444170713424683, "learning_rate": 1.8995350730408295e-05, "loss": 1.2535, "step": 5687 }, { "epoch": 1.694149183715259, "grad_norm": 0.2262217104434967, "learning_rate": 1.899492931914841e-05, "loss": 1.2644, "step": 5688 }, { "epoch": 1.6944470299149277, "grad_norm": 0.22429290413856506, "learning_rate": 1.8994507824200756e-05, "loss": 1.2883, "step": 5689 }, { "epoch": 1.6947448761145965, "grad_norm": 0.22514869272708893, "learning_rate": 1.8994086245569258e-05, "loss": 1.2642, "step": 5690 }, { "epoch": 1.695042722314265, "grad_norm": 0.24482673406600952, "learning_rate": 1.8993664583257833e-05, "loss": 1.2783, "step": 5691 }, { "epoch": 1.6953405685139336, "grad_norm": 0.24208861589431763, "learning_rate": 1.899324283727041e-05, "loss": 1.2807, "step": 5692 }, { "epoch": 1.6956384147136023, "grad_norm": 0.23001578450202942, "learning_rate": 1.899282100761091e-05, "loss": 1.2644, "step": 5693 }, { "epoch": 1.6959362609132709, "grad_norm": 0.21940132975578308, "learning_rate": 1.899239909428326e-05, "loss": 1.2892, "step": 5694 }, { "epoch": 1.6962341071129394, "grad_norm": 0.22105544805526733, "learning_rate": 1.8991977097291377e-05, "loss": 1.2838, "step": 5695 }, { "epoch": 1.6965319533126082, "grad_norm": 0.21267855167388916, "learning_rate": 1.89915550166392e-05, "loss": 1.2605, "step": 5696 }, { "epoch": 1.696829799512277, "grad_norm": 0.23278282582759857, "learning_rate": 1.8991132852330643e-05, "loss": 1.2889, "step": 5697 }, { "epoch": 1.6971276457119455, "grad_norm": 0.22636377811431885, "learning_rate": 1.8990710604369648e-05, "loss": 1.2899, "step": 5698 }, { "epoch": 1.697425491911614, "grad_norm": 0.22682049870491028, "learning_rate": 1.899028827276013e-05, "loss": 1.2736, "step": 5699 }, { "epoch": 1.6977233381112828, "grad_norm": 0.22536416351795197, "learning_rate": 1.8989865857506025e-05, "loss": 1.2765, "step": 5700 }, { "epoch": 1.6980211843109514, "grad_norm": 0.23606960475444794, "learning_rate": 1.8989443358611265e-05, "loss": 1.2651, "step": 5701 }, { "epoch": 1.69831903051062, "grad_norm": 0.25915801525115967, "learning_rate": 1.8989020776079778e-05, "loss": 1.2807, "step": 5702 }, { "epoch": 1.6986168767102887, "grad_norm": 0.24296355247497559, "learning_rate": 1.898859810991549e-05, "loss": 1.2767, "step": 5703 }, { "epoch": 1.6989147229099575, "grad_norm": 0.21679812669754028, "learning_rate": 1.8988175360122346e-05, "loss": 1.2886, "step": 5704 }, { "epoch": 1.699212569109626, "grad_norm": 0.22106091678142548, "learning_rate": 1.8987752526704267e-05, "loss": 1.2668, "step": 5705 }, { "epoch": 1.6995104153092946, "grad_norm": 0.21959993243217468, "learning_rate": 1.8987329609665197e-05, "loss": 1.2683, "step": 5706 }, { "epoch": 1.6998082615089634, "grad_norm": 0.23039567470550537, "learning_rate": 1.8986906609009065e-05, "loss": 1.2553, "step": 5707 }, { "epoch": 1.700106107708632, "grad_norm": 0.2438964694738388, "learning_rate": 1.8986483524739806e-05, "loss": 1.2739, "step": 5708 }, { "epoch": 1.7004039539083005, "grad_norm": 0.21674993634223938, "learning_rate": 1.898606035686136e-05, "loss": 1.2762, "step": 5709 }, { "epoch": 1.7007018001079692, "grad_norm": 0.31936731934547424, "learning_rate": 1.8985637105377663e-05, "loss": 1.2676, "step": 5710 }, { "epoch": 1.700999646307638, "grad_norm": 0.2805282771587372, "learning_rate": 1.898521377029265e-05, "loss": 1.2837, "step": 5711 }, { "epoch": 1.7012974925073066, "grad_norm": 0.25507208704948425, "learning_rate": 1.8984790351610262e-05, "loss": 1.2639, "step": 5712 }, { "epoch": 1.701595338706975, "grad_norm": 0.22662752866744995, "learning_rate": 1.898436684933444e-05, "loss": 1.2698, "step": 5713 }, { "epoch": 1.7018931849066439, "grad_norm": 0.24285344779491425, "learning_rate": 1.898394326346912e-05, "loss": 1.2742, "step": 5714 }, { "epoch": 1.7021910311063124, "grad_norm": 0.23638954758644104, "learning_rate": 1.8983519594018247e-05, "loss": 1.2848, "step": 5715 }, { "epoch": 1.702488877305981, "grad_norm": 0.22729748487472534, "learning_rate": 1.898309584098576e-05, "loss": 1.2802, "step": 5716 }, { "epoch": 1.7027867235056497, "grad_norm": 0.21747556328773499, "learning_rate": 1.8982672004375603e-05, "loss": 1.2615, "step": 5717 }, { "epoch": 1.7030845697053185, "grad_norm": 0.22167259454727173, "learning_rate": 1.898224808419172e-05, "loss": 1.2657, "step": 5718 }, { "epoch": 1.703382415904987, "grad_norm": 0.22318674623966217, "learning_rate": 1.8981824080438054e-05, "loss": 1.272, "step": 5719 }, { "epoch": 1.7036802621046556, "grad_norm": 0.21981275081634521, "learning_rate": 1.898139999311855e-05, "loss": 1.2678, "step": 5720 }, { "epoch": 1.7039781083043244, "grad_norm": 0.2351217269897461, "learning_rate": 1.898097582223715e-05, "loss": 1.2922, "step": 5721 }, { "epoch": 1.7042759545039932, "grad_norm": 0.22308693826198578, "learning_rate": 1.8980551567797808e-05, "loss": 1.2752, "step": 5722 }, { "epoch": 1.7045738007036615, "grad_norm": 0.22976164519786835, "learning_rate": 1.898012722980447e-05, "loss": 1.2582, "step": 5723 }, { "epoch": 1.7048716469033303, "grad_norm": 0.2237618863582611, "learning_rate": 1.8979702808261077e-05, "loss": 1.2789, "step": 5724 }, { "epoch": 1.705169493102999, "grad_norm": 0.21517914533615112, "learning_rate": 1.8979278303171584e-05, "loss": 1.2639, "step": 5725 }, { "epoch": 1.7054673393026676, "grad_norm": 0.21843688189983368, "learning_rate": 1.8978853714539935e-05, "loss": 1.2656, "step": 5726 }, { "epoch": 1.7057651855023361, "grad_norm": 0.2246447652578354, "learning_rate": 1.897842904237009e-05, "loss": 1.2754, "step": 5727 }, { "epoch": 1.706063031702005, "grad_norm": 0.22413119673728943, "learning_rate": 1.8978004286665986e-05, "loss": 1.2816, "step": 5728 }, { "epoch": 1.7063608779016737, "grad_norm": 0.22636112570762634, "learning_rate": 1.897757944743159e-05, "loss": 1.273, "step": 5729 }, { "epoch": 1.706658724101342, "grad_norm": 0.2248656004667282, "learning_rate": 1.8977154524670845e-05, "loss": 1.281, "step": 5730 }, { "epoch": 1.7069565703010108, "grad_norm": 0.2276724874973297, "learning_rate": 1.8976729518387708e-05, "loss": 1.2885, "step": 5731 }, { "epoch": 1.7072544165006796, "grad_norm": 0.24606406688690186, "learning_rate": 1.897630442858613e-05, "loss": 1.286, "step": 5732 }, { "epoch": 1.707552262700348, "grad_norm": 0.2130737155675888, "learning_rate": 1.897587925527007e-05, "loss": 1.2562, "step": 5733 }, { "epoch": 1.7078501089000167, "grad_norm": 0.21556521952152252, "learning_rate": 1.897545399844348e-05, "loss": 1.2737, "step": 5734 }, { "epoch": 1.7081479550996854, "grad_norm": 0.2331210970878601, "learning_rate": 1.8975028658110323e-05, "loss": 1.2682, "step": 5735 }, { "epoch": 1.7084458012993542, "grad_norm": 0.2184014916419983, "learning_rate": 1.897460323427455e-05, "loss": 1.2624, "step": 5736 }, { "epoch": 1.7087436474990227, "grad_norm": 0.22465340793132782, "learning_rate": 1.8974177726940117e-05, "loss": 1.2729, "step": 5737 }, { "epoch": 1.7090414936986913, "grad_norm": 0.22078350186347961, "learning_rate": 1.897375213611099e-05, "loss": 1.2585, "step": 5738 }, { "epoch": 1.70933933989836, "grad_norm": 0.2483685463666916, "learning_rate": 1.8973326461791126e-05, "loss": 1.2659, "step": 5739 }, { "epoch": 1.7096371860980286, "grad_norm": 0.21920223534107208, "learning_rate": 1.8972900703984484e-05, "loss": 1.2585, "step": 5740 }, { "epoch": 1.7099350322976972, "grad_norm": 0.22167551517486572, "learning_rate": 1.8972474862695024e-05, "loss": 1.2615, "step": 5741 }, { "epoch": 1.710232878497366, "grad_norm": 0.22482536733150482, "learning_rate": 1.8972048937926713e-05, "loss": 1.265, "step": 5742 }, { "epoch": 1.7105307246970347, "grad_norm": 0.21366822719573975, "learning_rate": 1.897162292968351e-05, "loss": 1.2745, "step": 5743 }, { "epoch": 1.7108285708967033, "grad_norm": 0.24560382962226868, "learning_rate": 1.8971196837969378e-05, "loss": 1.2714, "step": 5744 }, { "epoch": 1.7111264170963718, "grad_norm": 0.22219218313694, "learning_rate": 1.8970770662788285e-05, "loss": 1.2911, "step": 5745 }, { "epoch": 1.7114242632960406, "grad_norm": 0.22853435575962067, "learning_rate": 1.897034440414419e-05, "loss": 1.2725, "step": 5746 }, { "epoch": 1.7117221094957091, "grad_norm": 0.21046991646289825, "learning_rate": 1.896991806204107e-05, "loss": 1.2789, "step": 5747 }, { "epoch": 1.7120199556953777, "grad_norm": 0.2427307814359665, "learning_rate": 1.8969491636482878e-05, "loss": 1.2907, "step": 5748 }, { "epoch": 1.7123178018950465, "grad_norm": 0.2283223271369934, "learning_rate": 1.896906512747359e-05, "loss": 1.2799, "step": 5749 }, { "epoch": 1.7126156480947152, "grad_norm": 0.2287558764219284, "learning_rate": 1.8968638535017167e-05, "loss": 1.2705, "step": 5750 }, { "epoch": 1.7129134942943838, "grad_norm": 0.22238564491271973, "learning_rate": 1.8968211859117586e-05, "loss": 1.2601, "step": 5751 }, { "epoch": 1.7132113404940523, "grad_norm": 0.23806633055210114, "learning_rate": 1.8967785099778818e-05, "loss": 1.2689, "step": 5752 }, { "epoch": 1.713509186693721, "grad_norm": 0.2507690489292145, "learning_rate": 1.8967358257004825e-05, "loss": 1.2793, "step": 5753 }, { "epoch": 1.7138070328933896, "grad_norm": 0.23303022980690002, "learning_rate": 1.896693133079958e-05, "loss": 1.2722, "step": 5754 }, { "epoch": 1.7141048790930582, "grad_norm": 0.2564085125923157, "learning_rate": 1.896650432116706e-05, "loss": 1.2659, "step": 5755 }, { "epoch": 1.714402725292727, "grad_norm": 0.23091602325439453, "learning_rate": 1.8966077228111236e-05, "loss": 1.2664, "step": 5756 }, { "epoch": 1.7147005714923957, "grad_norm": 0.23390372097492218, "learning_rate": 1.896565005163608e-05, "loss": 1.2728, "step": 5757 }, { "epoch": 1.7149984176920643, "grad_norm": 0.21289603412151337, "learning_rate": 1.8965222791745568e-05, "loss": 1.2745, "step": 5758 }, { "epoch": 1.7152962638917328, "grad_norm": 0.22639428079128265, "learning_rate": 1.8964795448443672e-05, "loss": 1.2932, "step": 5759 }, { "epoch": 1.7155941100914016, "grad_norm": 0.2241027057170868, "learning_rate": 1.896436802173437e-05, "loss": 1.2693, "step": 5760 }, { "epoch": 1.7158919562910702, "grad_norm": 0.2254457324743271, "learning_rate": 1.8963940511621645e-05, "loss": 1.2649, "step": 5761 }, { "epoch": 1.7161898024907387, "grad_norm": 0.2425682544708252, "learning_rate": 1.8963512918109462e-05, "loss": 1.2729, "step": 5762 }, { "epoch": 1.7164876486904075, "grad_norm": 0.25813499093055725, "learning_rate": 1.896308524120181e-05, "loss": 1.2625, "step": 5763 }, { "epoch": 1.7167854948900763, "grad_norm": 0.222854882478714, "learning_rate": 1.8962657480902664e-05, "loss": 1.277, "step": 5764 }, { "epoch": 1.7170833410897448, "grad_norm": 0.25514060258865356, "learning_rate": 1.8962229637216004e-05, "loss": 1.2709, "step": 5765 }, { "epoch": 1.7173811872894134, "grad_norm": 0.2230352759361267, "learning_rate": 1.8961801710145807e-05, "loss": 1.2725, "step": 5766 }, { "epoch": 1.7176790334890821, "grad_norm": 0.24023233354091644, "learning_rate": 1.8961373699696063e-05, "loss": 1.2764, "step": 5767 }, { "epoch": 1.7179768796887507, "grad_norm": 0.2421426922082901, "learning_rate": 1.8960945605870744e-05, "loss": 1.2625, "step": 5768 }, { "epoch": 1.7182747258884192, "grad_norm": 0.22517035901546478, "learning_rate": 1.8960517428673843e-05, "loss": 1.2519, "step": 5769 }, { "epoch": 1.718572572088088, "grad_norm": 0.22730892896652222, "learning_rate": 1.8960089168109333e-05, "loss": 1.2722, "step": 5770 }, { "epoch": 1.7188704182877568, "grad_norm": 0.21366451680660248, "learning_rate": 1.8959660824181206e-05, "loss": 1.2722, "step": 5771 }, { "epoch": 1.7191682644874253, "grad_norm": 0.22521375119686127, "learning_rate": 1.8959232396893446e-05, "loss": 1.267, "step": 5772 }, { "epoch": 1.7194661106870939, "grad_norm": 0.21800497174263, "learning_rate": 1.8958803886250038e-05, "loss": 1.268, "step": 5773 }, { "epoch": 1.7197639568867626, "grad_norm": 0.23374035954475403, "learning_rate": 1.895837529225497e-05, "loss": 1.2784, "step": 5774 }, { "epoch": 1.7200618030864312, "grad_norm": 0.22994539141654968, "learning_rate": 1.8957946614912227e-05, "loss": 1.2686, "step": 5775 }, { "epoch": 1.7203596492860997, "grad_norm": 0.22978772222995758, "learning_rate": 1.89575178542258e-05, "loss": 1.2733, "step": 5776 }, { "epoch": 1.7206574954857685, "grad_norm": 0.23379528522491455, "learning_rate": 1.8957089010199678e-05, "loss": 1.2754, "step": 5777 }, { "epoch": 1.7209553416854373, "grad_norm": 0.24608808755874634, "learning_rate": 1.895666008283785e-05, "loss": 1.2859, "step": 5778 }, { "epoch": 1.7212531878851058, "grad_norm": 0.22173528373241425, "learning_rate": 1.8956231072144308e-05, "loss": 1.2824, "step": 5779 }, { "epoch": 1.7215510340847744, "grad_norm": 0.26645976305007935, "learning_rate": 1.8955801978123037e-05, "loss": 1.278, "step": 5780 }, { "epoch": 1.7218488802844432, "grad_norm": 0.26630792021751404, "learning_rate": 1.895537280077804e-05, "loss": 1.2726, "step": 5781 }, { "epoch": 1.7221467264841117, "grad_norm": 0.2354707568883896, "learning_rate": 1.8954943540113305e-05, "loss": 1.266, "step": 5782 }, { "epoch": 1.7224445726837803, "grad_norm": 0.2792515754699707, "learning_rate": 1.8954514196132825e-05, "loss": 1.2721, "step": 5783 }, { "epoch": 1.722742418883449, "grad_norm": 0.2280869483947754, "learning_rate": 1.8954084768840593e-05, "loss": 1.2803, "step": 5784 }, { "epoch": 1.7230402650831178, "grad_norm": 0.2183830738067627, "learning_rate": 1.8953655258240608e-05, "loss": 1.281, "step": 5785 }, { "epoch": 1.7233381112827864, "grad_norm": 0.21691355109214783, "learning_rate": 1.895322566433686e-05, "loss": 1.2719, "step": 5786 }, { "epoch": 1.723635957482455, "grad_norm": 0.22357288002967834, "learning_rate": 1.8952795987133354e-05, "loss": 1.267, "step": 5787 }, { "epoch": 1.7239338036821237, "grad_norm": 0.23809656500816345, "learning_rate": 1.8952366226634086e-05, "loss": 1.2514, "step": 5788 }, { "epoch": 1.7242316498817924, "grad_norm": 0.22516104578971863, "learning_rate": 1.8951936382843052e-05, "loss": 1.291, "step": 5789 }, { "epoch": 1.7245294960814608, "grad_norm": 0.21597300469875336, "learning_rate": 1.8951506455764247e-05, "loss": 1.2602, "step": 5790 }, { "epoch": 1.7248273422811295, "grad_norm": 0.2319001704454422, "learning_rate": 1.895107644540168e-05, "loss": 1.2872, "step": 5791 }, { "epoch": 1.7251251884807983, "grad_norm": 0.23265139758586884, "learning_rate": 1.8950646351759346e-05, "loss": 1.2777, "step": 5792 }, { "epoch": 1.7254230346804669, "grad_norm": 0.22303816676139832, "learning_rate": 1.895021617484125e-05, "loss": 1.2735, "step": 5793 }, { "epoch": 1.7257208808801354, "grad_norm": 0.22523610293865204, "learning_rate": 1.894978591465139e-05, "loss": 1.2815, "step": 5794 }, { "epoch": 1.7260187270798042, "grad_norm": 0.3229084014892578, "learning_rate": 1.894935557119377e-05, "loss": 1.258, "step": 5795 }, { "epoch": 1.726316573279473, "grad_norm": 0.28496530652046204, "learning_rate": 1.8948925144472395e-05, "loss": 1.28, "step": 5796 }, { "epoch": 1.7266144194791413, "grad_norm": 0.2660283148288727, "learning_rate": 1.8948494634491273e-05, "loss": 1.282, "step": 5797 }, { "epoch": 1.72691226567881, "grad_norm": 0.22563010454177856, "learning_rate": 1.8948064041254405e-05, "loss": 1.2706, "step": 5798 }, { "epoch": 1.7272101118784788, "grad_norm": 0.2682509422302246, "learning_rate": 1.89476333647658e-05, "loss": 1.263, "step": 5799 }, { "epoch": 1.7275079580781474, "grad_norm": 0.27108532190322876, "learning_rate": 1.894720260502946e-05, "loss": 1.2838, "step": 5800 }, { "epoch": 1.727805804277816, "grad_norm": 0.24455232918262482, "learning_rate": 1.89467717620494e-05, "loss": 1.2903, "step": 5801 }, { "epoch": 1.7281036504774847, "grad_norm": 0.22039395570755005, "learning_rate": 1.894634083582962e-05, "loss": 1.2701, "step": 5802 }, { "epoch": 1.7284014966771535, "grad_norm": 0.22624199092388153, "learning_rate": 1.8945909826374134e-05, "loss": 1.2706, "step": 5803 }, { "epoch": 1.728699342876822, "grad_norm": 0.23424167931079865, "learning_rate": 1.8945478733686956e-05, "loss": 1.276, "step": 5804 }, { "epoch": 1.7289971890764906, "grad_norm": 0.22864419221878052, "learning_rate": 1.8945047557772095e-05, "loss": 1.2696, "step": 5805 }, { "epoch": 1.7292950352761594, "grad_norm": 0.22675685584545135, "learning_rate": 1.8944616298633556e-05, "loss": 1.2791, "step": 5806 }, { "epoch": 1.729592881475828, "grad_norm": 0.22480760514736176, "learning_rate": 1.8944184956275356e-05, "loss": 1.2534, "step": 5807 }, { "epoch": 1.7298907276754965, "grad_norm": 0.30962008237838745, "learning_rate": 1.8943753530701508e-05, "loss": 1.2785, "step": 5808 }, { "epoch": 1.7301885738751652, "grad_norm": 0.24147537350654602, "learning_rate": 1.8943322021916028e-05, "loss": 1.2602, "step": 5809 }, { "epoch": 1.730486420074834, "grad_norm": 0.22445976734161377, "learning_rate": 1.8942890429922927e-05, "loss": 1.2734, "step": 5810 }, { "epoch": 1.7307842662745025, "grad_norm": 0.21948538720607758, "learning_rate": 1.8942458754726223e-05, "loss": 1.2628, "step": 5811 }, { "epoch": 1.731082112474171, "grad_norm": 0.2405489981174469, "learning_rate": 1.894202699632993e-05, "loss": 1.2878, "step": 5812 }, { "epoch": 1.7313799586738399, "grad_norm": 0.22903725504875183, "learning_rate": 1.8941595154738067e-05, "loss": 1.2749, "step": 5813 }, { "epoch": 1.7316778048735084, "grad_norm": 0.22374002635478973, "learning_rate": 1.8941163229954652e-05, "loss": 1.2691, "step": 5814 }, { "epoch": 1.731975651073177, "grad_norm": 0.2340724915266037, "learning_rate": 1.89407312219837e-05, "loss": 1.2703, "step": 5815 }, { "epoch": 1.7322734972728457, "grad_norm": 0.21973590552806854, "learning_rate": 1.894029913082924e-05, "loss": 1.2837, "step": 5816 }, { "epoch": 1.7325713434725145, "grad_norm": 0.23547907173633575, "learning_rate": 1.8939866956495278e-05, "loss": 1.2656, "step": 5817 }, { "epoch": 1.732869189672183, "grad_norm": 0.22913868725299835, "learning_rate": 1.8939434698985842e-05, "loss": 1.268, "step": 5818 }, { "epoch": 1.7331670358718516, "grad_norm": 0.23191285133361816, "learning_rate": 1.893900235830496e-05, "loss": 1.2481, "step": 5819 }, { "epoch": 1.7334648820715204, "grad_norm": 0.24176964163780212, "learning_rate": 1.8938569934456645e-05, "loss": 1.2914, "step": 5820 }, { "epoch": 1.733762728271189, "grad_norm": 0.22274360060691833, "learning_rate": 1.8938137427444926e-05, "loss": 1.2792, "step": 5821 }, { "epoch": 1.7340605744708575, "grad_norm": 0.24947451055049896, "learning_rate": 1.8937704837273817e-05, "loss": 1.2547, "step": 5822 }, { "epoch": 1.7343584206705263, "grad_norm": 0.22832182049751282, "learning_rate": 1.8937272163947356e-05, "loss": 1.2718, "step": 5823 }, { "epoch": 1.734656266870195, "grad_norm": 0.22086654603481293, "learning_rate": 1.893683940746956e-05, "loss": 1.2781, "step": 5824 }, { "epoch": 1.7349541130698636, "grad_norm": 0.23569034039974213, "learning_rate": 1.8936406567844464e-05, "loss": 1.2669, "step": 5825 }, { "epoch": 1.7352519592695321, "grad_norm": 0.2154817283153534, "learning_rate": 1.8935973645076087e-05, "loss": 1.2679, "step": 5826 }, { "epoch": 1.735549805469201, "grad_norm": 0.24194398522377014, "learning_rate": 1.8935540639168458e-05, "loss": 1.2664, "step": 5827 }, { "epoch": 1.7358476516688695, "grad_norm": 0.23178546130657196, "learning_rate": 1.8935107550125606e-05, "loss": 1.2703, "step": 5828 }, { "epoch": 1.736145497868538, "grad_norm": 0.22296249866485596, "learning_rate": 1.8934674377951562e-05, "loss": 1.2711, "step": 5829 }, { "epoch": 1.7364433440682068, "grad_norm": 0.23277190327644348, "learning_rate": 1.8934241122650355e-05, "loss": 1.2765, "step": 5830 }, { "epoch": 1.7367411902678755, "grad_norm": 0.22290737926959991, "learning_rate": 1.8933807784226014e-05, "loss": 1.2785, "step": 5831 }, { "epoch": 1.737039036467544, "grad_norm": 0.21856017410755157, "learning_rate": 1.8933374362682574e-05, "loss": 1.2878, "step": 5832 }, { "epoch": 1.7373368826672126, "grad_norm": 0.23294885456562042, "learning_rate": 1.8932940858024073e-05, "loss": 1.2905, "step": 5833 }, { "epoch": 1.7376347288668814, "grad_norm": 0.22651009261608124, "learning_rate": 1.893250727025453e-05, "loss": 1.2739, "step": 5834 }, { "epoch": 1.73793257506655, "grad_norm": 0.23007501661777496, "learning_rate": 1.893207359937799e-05, "loss": 1.2755, "step": 5835 }, { "epoch": 1.7382304212662185, "grad_norm": 0.22142191231250763, "learning_rate": 1.8931639845398486e-05, "loss": 1.2853, "step": 5836 }, { "epoch": 1.7385282674658873, "grad_norm": 0.21657319366931915, "learning_rate": 1.8931206008320048e-05, "loss": 1.2577, "step": 5837 }, { "epoch": 1.738826113665556, "grad_norm": 0.2249903827905655, "learning_rate": 1.893077208814672e-05, "loss": 1.2808, "step": 5838 }, { "epoch": 1.7391239598652246, "grad_norm": 0.22896753251552582, "learning_rate": 1.8930338084882535e-05, "loss": 1.2823, "step": 5839 }, { "epoch": 1.7394218060648932, "grad_norm": 0.23422539234161377, "learning_rate": 1.8929903998531527e-05, "loss": 1.2683, "step": 5840 }, { "epoch": 1.739719652264562, "grad_norm": 0.24125337600708008, "learning_rate": 1.8929469829097744e-05, "loss": 1.2699, "step": 5841 }, { "epoch": 1.7400174984642305, "grad_norm": 0.2250746190547943, "learning_rate": 1.892903557658522e-05, "loss": 1.2704, "step": 5842 }, { "epoch": 1.740315344663899, "grad_norm": 0.23315951228141785, "learning_rate": 1.8928601240997997e-05, "loss": 1.2914, "step": 5843 }, { "epoch": 1.7406131908635678, "grad_norm": 0.22729183733463287, "learning_rate": 1.8928166822340114e-05, "loss": 1.2746, "step": 5844 }, { "epoch": 1.7409110370632366, "grad_norm": 0.21855691075325012, "learning_rate": 1.8927732320615617e-05, "loss": 1.2745, "step": 5845 }, { "epoch": 1.7412088832629051, "grad_norm": 0.21247674524784088, "learning_rate": 1.8927297735828542e-05, "loss": 1.2718, "step": 5846 }, { "epoch": 1.7415067294625737, "grad_norm": 0.2240409553050995, "learning_rate": 1.8926863067982938e-05, "loss": 1.2597, "step": 5847 }, { "epoch": 1.7418045756622424, "grad_norm": 0.21879000961780548, "learning_rate": 1.8926428317082846e-05, "loss": 1.2859, "step": 5848 }, { "epoch": 1.7421024218619112, "grad_norm": 0.22439351677894592, "learning_rate": 1.892599348313231e-05, "loss": 1.2642, "step": 5849 }, { "epoch": 1.7424002680615795, "grad_norm": 0.21980057656764984, "learning_rate": 1.8925558566135377e-05, "loss": 1.256, "step": 5850 }, { "epoch": 1.7426981142612483, "grad_norm": 0.21809187531471252, "learning_rate": 1.8925123566096095e-05, "loss": 1.2719, "step": 5851 }, { "epoch": 1.742995960460917, "grad_norm": 0.21508194506168365, "learning_rate": 1.892468848301851e-05, "loss": 1.2876, "step": 5852 }, { "epoch": 1.7432938066605856, "grad_norm": 0.2727757692337036, "learning_rate": 1.892425331690667e-05, "loss": 1.2659, "step": 5853 }, { "epoch": 1.7435916528602542, "grad_norm": 0.25025418400764465, "learning_rate": 1.8923818067764624e-05, "loss": 1.2769, "step": 5854 }, { "epoch": 1.743889499059923, "grad_norm": 0.25716692209243774, "learning_rate": 1.8923382735596422e-05, "loss": 1.266, "step": 5855 }, { "epoch": 1.7441873452595917, "grad_norm": 0.22011108696460724, "learning_rate": 1.8922947320406113e-05, "loss": 1.2651, "step": 5856 }, { "epoch": 1.74448519145926, "grad_norm": 0.45085522532463074, "learning_rate": 1.892251182219775e-05, "loss": 1.275, "step": 5857 }, { "epoch": 1.7447830376589288, "grad_norm": 0.21316717565059662, "learning_rate": 1.892207624097538e-05, "loss": 1.2488, "step": 5858 }, { "epoch": 1.7450808838585976, "grad_norm": 0.22340159118175507, "learning_rate": 1.8921640576743058e-05, "loss": 1.2693, "step": 5859 }, { "epoch": 1.7453787300582662, "grad_norm": 0.24735619127750397, "learning_rate": 1.892120482950484e-05, "loss": 1.2788, "step": 5860 }, { "epoch": 1.7456765762579347, "grad_norm": 0.2385658472776413, "learning_rate": 1.892076899926478e-05, "loss": 1.2886, "step": 5861 }, { "epoch": 1.7459744224576035, "grad_norm": 0.21086426079273224, "learning_rate": 1.892033308602693e-05, "loss": 1.2661, "step": 5862 }, { "epoch": 1.7462722686572723, "grad_norm": 0.22353291511535645, "learning_rate": 1.891989708979535e-05, "loss": 1.2966, "step": 5863 }, { "epoch": 1.7465701148569406, "grad_norm": 0.2436157613992691, "learning_rate": 1.8919461010574086e-05, "loss": 1.273, "step": 5864 }, { "epoch": 1.7468679610566094, "grad_norm": 0.22707544267177582, "learning_rate": 1.891902484836721e-05, "loss": 1.266, "step": 5865 }, { "epoch": 1.7471658072562781, "grad_norm": 0.21741141378879547, "learning_rate": 1.8918588603178768e-05, "loss": 1.277, "step": 5866 }, { "epoch": 1.7474636534559467, "grad_norm": 0.22682414948940277, "learning_rate": 1.8918152275012828e-05, "loss": 1.2706, "step": 5867 }, { "epoch": 1.7477614996556152, "grad_norm": 0.2459396868944168, "learning_rate": 1.891771586387344e-05, "loss": 1.2613, "step": 5868 }, { "epoch": 1.748059345855284, "grad_norm": 0.23079337179660797, "learning_rate": 1.8917279369764673e-05, "loss": 1.2719, "step": 5869 }, { "epoch": 1.7483571920549528, "grad_norm": 0.21382039785385132, "learning_rate": 1.8916842792690583e-05, "loss": 1.2711, "step": 5870 }, { "epoch": 1.7486550382546213, "grad_norm": 0.21237541735172272, "learning_rate": 1.8916406132655233e-05, "loss": 1.2748, "step": 5871 }, { "epoch": 1.7489528844542899, "grad_norm": 0.234297513961792, "learning_rate": 1.8915969389662686e-05, "loss": 1.2761, "step": 5872 }, { "epoch": 1.7492507306539586, "grad_norm": 0.22806483507156372, "learning_rate": 1.8915532563717008e-05, "loss": 1.2742, "step": 5873 }, { "epoch": 1.7495485768536272, "grad_norm": 0.22773486375808716, "learning_rate": 1.8915095654822256e-05, "loss": 1.2631, "step": 5874 }, { "epoch": 1.7498464230532957, "grad_norm": 0.21805758774280548, "learning_rate": 1.8914658662982502e-05, "loss": 1.2566, "step": 5875 }, { "epoch": 1.7501442692529645, "grad_norm": 0.22487327456474304, "learning_rate": 1.891422158820181e-05, "loss": 1.2862, "step": 5876 }, { "epoch": 1.7504421154526333, "grad_norm": 0.24331773817539215, "learning_rate": 1.8913784430484246e-05, "loss": 1.296, "step": 5877 }, { "epoch": 1.7507399616523018, "grad_norm": 0.243910551071167, "learning_rate": 1.8913347189833876e-05, "loss": 1.2776, "step": 5878 }, { "epoch": 1.7510378078519704, "grad_norm": 0.21218135952949524, "learning_rate": 1.891290986625477e-05, "loss": 1.2821, "step": 5879 }, { "epoch": 1.7513356540516392, "grad_norm": 0.2238103151321411, "learning_rate": 1.8912472459750994e-05, "loss": 1.2799, "step": 5880 }, { "epoch": 1.7516335002513077, "grad_norm": 0.21185216307640076, "learning_rate": 1.8912034970326617e-05, "loss": 1.2613, "step": 5881 }, { "epoch": 1.7519313464509763, "grad_norm": 0.22607696056365967, "learning_rate": 1.8911597397985714e-05, "loss": 1.2659, "step": 5882 }, { "epoch": 1.752229192650645, "grad_norm": 0.22042007744312286, "learning_rate": 1.8911159742732357e-05, "loss": 1.2667, "step": 5883 }, { "epoch": 1.7525270388503138, "grad_norm": 0.2211650311946869, "learning_rate": 1.8910722004570612e-05, "loss": 1.2725, "step": 5884 }, { "epoch": 1.7528248850499824, "grad_norm": 0.21940304338932037, "learning_rate": 1.8910284183504552e-05, "loss": 1.2713, "step": 5885 }, { "epoch": 1.753122731249651, "grad_norm": 0.23272915184497833, "learning_rate": 1.8909846279538256e-05, "loss": 1.274, "step": 5886 }, { "epoch": 1.7534205774493197, "grad_norm": 0.216901957988739, "learning_rate": 1.8909408292675797e-05, "loss": 1.2692, "step": 5887 }, { "epoch": 1.7537184236489882, "grad_norm": 0.23030175268650055, "learning_rate": 1.8908970222921243e-05, "loss": 1.2913, "step": 5888 }, { "epoch": 1.7540162698486568, "grad_norm": 0.23435144126415253, "learning_rate": 1.8908532070278677e-05, "loss": 1.2575, "step": 5889 }, { "epoch": 1.7543141160483255, "grad_norm": 0.22257715463638306, "learning_rate": 1.8908093834752176e-05, "loss": 1.2525, "step": 5890 }, { "epoch": 1.7546119622479943, "grad_norm": 0.22075094282627106, "learning_rate": 1.8907655516345808e-05, "loss": 1.2593, "step": 5891 }, { "epoch": 1.7549098084476629, "grad_norm": 0.22186720371246338, "learning_rate": 1.8907217115063663e-05, "loss": 1.2783, "step": 5892 }, { "epoch": 1.7552076546473314, "grad_norm": 0.22689685225486755, "learning_rate": 1.8906778630909814e-05, "loss": 1.2973, "step": 5893 }, { "epoch": 1.7555055008470002, "grad_norm": 0.21796102821826935, "learning_rate": 1.890634006388834e-05, "loss": 1.2591, "step": 5894 }, { "epoch": 1.7558033470466687, "grad_norm": 0.2169775366783142, "learning_rate": 1.8905901414003322e-05, "loss": 1.2691, "step": 5895 }, { "epoch": 1.7561011932463373, "grad_norm": 0.2183646261692047, "learning_rate": 1.8905462681258845e-05, "loss": 1.2689, "step": 5896 }, { "epoch": 1.756399039446006, "grad_norm": 0.22654369473457336, "learning_rate": 1.8905023865658984e-05, "loss": 1.2911, "step": 5897 }, { "epoch": 1.7566968856456748, "grad_norm": 0.22667363286018372, "learning_rate": 1.8904584967207825e-05, "loss": 1.2637, "step": 5898 }, { "epoch": 1.7569947318453434, "grad_norm": 0.21776628494262695, "learning_rate": 1.890414598590945e-05, "loss": 1.2585, "step": 5899 }, { "epoch": 1.757292578045012, "grad_norm": 0.2251240313053131, "learning_rate": 1.890370692176795e-05, "loss": 1.2642, "step": 5900 }, { "epoch": 1.7575904242446807, "grad_norm": 0.2164892703294754, "learning_rate": 1.8903267774787402e-05, "loss": 1.2602, "step": 5901 }, { "epoch": 1.7578882704443493, "grad_norm": 0.22920046746730804, "learning_rate": 1.8902828544971896e-05, "loss": 1.2565, "step": 5902 }, { "epoch": 1.7581861166440178, "grad_norm": 0.20191632211208344, "learning_rate": 1.8902389232325515e-05, "loss": 1.2731, "step": 5903 }, { "epoch": 1.7584839628436866, "grad_norm": 0.2176978886127472, "learning_rate": 1.890194983685235e-05, "loss": 1.2786, "step": 5904 }, { "epoch": 1.7587818090433553, "grad_norm": 0.21307040750980377, "learning_rate": 1.8901510358556486e-05, "loss": 1.2699, "step": 5905 }, { "epoch": 1.759079655243024, "grad_norm": 0.21998471021652222, "learning_rate": 1.8901070797442017e-05, "loss": 1.2714, "step": 5906 }, { "epoch": 1.7593775014426924, "grad_norm": 0.2163301706314087, "learning_rate": 1.8900631153513024e-05, "loss": 1.27, "step": 5907 }, { "epoch": 1.7596753476423612, "grad_norm": 0.2146802842617035, "learning_rate": 1.8900191426773608e-05, "loss": 1.2677, "step": 5908 }, { "epoch": 1.7599731938420298, "grad_norm": 0.21443694829940796, "learning_rate": 1.8899751617227854e-05, "loss": 1.2646, "step": 5909 }, { "epoch": 1.7602710400416983, "grad_norm": 0.231244295835495, "learning_rate": 1.8899311724879853e-05, "loss": 1.2732, "step": 5910 }, { "epoch": 1.760568886241367, "grad_norm": 0.2077794224023819, "learning_rate": 1.8898871749733697e-05, "loss": 1.2414, "step": 5911 }, { "epoch": 1.7608667324410359, "grad_norm": 0.2148183137178421, "learning_rate": 1.8898431691793485e-05, "loss": 1.2753, "step": 5912 }, { "epoch": 1.7611645786407044, "grad_norm": 0.21195048093795776, "learning_rate": 1.8897991551063304e-05, "loss": 1.2664, "step": 5913 }, { "epoch": 1.761462424840373, "grad_norm": 0.2143191397190094, "learning_rate": 1.8897551327547257e-05, "loss": 1.2615, "step": 5914 }, { "epoch": 1.7617602710400417, "grad_norm": 0.220404714345932, "learning_rate": 1.8897111021249433e-05, "loss": 1.2815, "step": 5915 }, { "epoch": 1.7620581172397105, "grad_norm": 0.22406984865665436, "learning_rate": 1.8896670632173936e-05, "loss": 1.2687, "step": 5916 }, { "epoch": 1.7623559634393788, "grad_norm": 0.2176969051361084, "learning_rate": 1.8896230160324854e-05, "loss": 1.2726, "step": 5917 }, { "epoch": 1.7626538096390476, "grad_norm": 0.21982218325138092, "learning_rate": 1.889578960570629e-05, "loss": 1.2676, "step": 5918 }, { "epoch": 1.7629516558387164, "grad_norm": 0.2140255570411682, "learning_rate": 1.8895348968322346e-05, "loss": 1.2717, "step": 5919 }, { "epoch": 1.763249502038385, "grad_norm": 0.21119576692581177, "learning_rate": 1.8894908248177116e-05, "loss": 1.2695, "step": 5920 }, { "epoch": 1.7635473482380535, "grad_norm": 0.21961624920368195, "learning_rate": 1.8894467445274703e-05, "loss": 1.2634, "step": 5921 }, { "epoch": 1.7638451944377223, "grad_norm": 0.22621069848537445, "learning_rate": 1.889402655961921e-05, "loss": 1.2667, "step": 5922 }, { "epoch": 1.764143040637391, "grad_norm": 0.21127013862133026, "learning_rate": 1.8893585591214734e-05, "loss": 1.2783, "step": 5923 }, { "epoch": 1.7644408868370594, "grad_norm": 0.21664802730083466, "learning_rate": 1.889314454006538e-05, "loss": 1.2698, "step": 5924 }, { "epoch": 1.7647387330367281, "grad_norm": 0.22002741694450378, "learning_rate": 1.8892703406175257e-05, "loss": 1.2673, "step": 5925 }, { "epoch": 1.765036579236397, "grad_norm": 0.2275383025407791, "learning_rate": 1.8892262189548462e-05, "loss": 1.2581, "step": 5926 }, { "epoch": 1.7653344254360654, "grad_norm": 0.21121346950531006, "learning_rate": 1.88918208901891e-05, "loss": 1.275, "step": 5927 }, { "epoch": 1.765632271635734, "grad_norm": 0.21943354606628418, "learning_rate": 1.889137950810128e-05, "loss": 1.2757, "step": 5928 }, { "epoch": 1.7659301178354028, "grad_norm": 0.2101525366306305, "learning_rate": 1.8890938043289106e-05, "loss": 1.2684, "step": 5929 }, { "epoch": 1.7662279640350715, "grad_norm": 0.22099937498569489, "learning_rate": 1.8890496495756692e-05, "loss": 1.2712, "step": 5930 }, { "epoch": 1.76652581023474, "grad_norm": 0.23087163269519806, "learning_rate": 1.8890054865508137e-05, "loss": 1.273, "step": 5931 }, { "epoch": 1.7668236564344086, "grad_norm": 0.21452589333057404, "learning_rate": 1.8889613152547556e-05, "loss": 1.2845, "step": 5932 }, { "epoch": 1.7671215026340774, "grad_norm": 0.21434038877487183, "learning_rate": 1.8889171356879056e-05, "loss": 1.2606, "step": 5933 }, { "epoch": 1.767419348833746, "grad_norm": 0.22064903378486633, "learning_rate": 1.8888729478506747e-05, "loss": 1.2735, "step": 5934 }, { "epoch": 1.7677171950334145, "grad_norm": 0.2192641794681549, "learning_rate": 1.8888287517434746e-05, "loss": 1.2742, "step": 5935 }, { "epoch": 1.7680150412330833, "grad_norm": 0.21743933856487274, "learning_rate": 1.8887845473667155e-05, "loss": 1.2558, "step": 5936 }, { "epoch": 1.768312887432752, "grad_norm": 0.22595295310020447, "learning_rate": 1.888740334720809e-05, "loss": 1.2664, "step": 5937 }, { "epoch": 1.7686107336324206, "grad_norm": 0.21765536069869995, "learning_rate": 1.888696113806167e-05, "loss": 1.2871, "step": 5938 }, { "epoch": 1.7689085798320892, "grad_norm": 0.2340780645608902, "learning_rate": 1.888651884623201e-05, "loss": 1.2686, "step": 5939 }, { "epoch": 1.769206426031758, "grad_norm": 0.2243206799030304, "learning_rate": 1.8886076471723216e-05, "loss": 1.2568, "step": 5940 }, { "epoch": 1.7695042722314265, "grad_norm": 0.2287902683019638, "learning_rate": 1.888563401453941e-05, "loss": 1.2711, "step": 5941 }, { "epoch": 1.769802118431095, "grad_norm": 0.23100508749485016, "learning_rate": 1.8885191474684706e-05, "loss": 1.2832, "step": 5942 }, { "epoch": 1.7700999646307638, "grad_norm": 0.21319617331027985, "learning_rate": 1.8884748852163223e-05, "loss": 1.2907, "step": 5943 }, { "epoch": 1.7703978108304326, "grad_norm": 0.2116445004940033, "learning_rate": 1.888430614697908e-05, "loss": 1.2801, "step": 5944 }, { "epoch": 1.7706956570301011, "grad_norm": 0.22597062587738037, "learning_rate": 1.888386335913639e-05, "loss": 1.2662, "step": 5945 }, { "epoch": 1.7709935032297697, "grad_norm": 0.23162300884723663, "learning_rate": 1.8883420488639278e-05, "loss": 1.284, "step": 5946 }, { "epoch": 1.7712913494294384, "grad_norm": 0.21227452158927917, "learning_rate": 1.8882977535491867e-05, "loss": 1.2554, "step": 5947 }, { "epoch": 1.771589195629107, "grad_norm": 0.2118864804506302, "learning_rate": 1.8882534499698272e-05, "loss": 1.2644, "step": 5948 }, { "epoch": 1.7718870418287755, "grad_norm": 0.22540844976902008, "learning_rate": 1.888209138126262e-05, "loss": 1.2756, "step": 5949 }, { "epoch": 1.7721848880284443, "grad_norm": 0.2292068600654602, "learning_rate": 1.888164818018903e-05, "loss": 1.282, "step": 5950 }, { "epoch": 1.772482734228113, "grad_norm": 0.2195514440536499, "learning_rate": 1.8881204896481625e-05, "loss": 1.2861, "step": 5951 }, { "epoch": 1.7727805804277816, "grad_norm": 0.22650204598903656, "learning_rate": 1.8880761530144536e-05, "loss": 1.2663, "step": 5952 }, { "epoch": 1.7730784266274502, "grad_norm": 0.21833503246307373, "learning_rate": 1.888031808118188e-05, "loss": 1.2659, "step": 5953 }, { "epoch": 1.773376272827119, "grad_norm": 0.21655771136283875, "learning_rate": 1.8879874549597783e-05, "loss": 1.2811, "step": 5954 }, { "epoch": 1.7736741190267875, "grad_norm": 0.2311340719461441, "learning_rate": 1.887943093539638e-05, "loss": 1.2689, "step": 5955 }, { "epoch": 1.773971965226456, "grad_norm": 0.21346105635166168, "learning_rate": 1.8878987238581786e-05, "loss": 1.2483, "step": 5956 }, { "epoch": 1.7742698114261248, "grad_norm": 0.220946803689003, "learning_rate": 1.8878543459158143e-05, "loss": 1.2655, "step": 5957 }, { "epoch": 1.7745676576257936, "grad_norm": 0.2175082564353943, "learning_rate": 1.887809959712957e-05, "loss": 1.269, "step": 5958 }, { "epoch": 1.7748655038254622, "grad_norm": 0.21422596275806427, "learning_rate": 1.88776556525002e-05, "loss": 1.2663, "step": 5959 }, { "epoch": 1.7751633500251307, "grad_norm": 0.2292601764202118, "learning_rate": 1.887721162527416e-05, "loss": 1.2774, "step": 5960 }, { "epoch": 1.7754611962247995, "grad_norm": 0.23035894334316254, "learning_rate": 1.8876767515455586e-05, "loss": 1.2631, "step": 5961 }, { "epoch": 1.775759042424468, "grad_norm": 0.22301332652568817, "learning_rate": 1.8876323323048612e-05, "loss": 1.2722, "step": 5962 }, { "epoch": 1.7760568886241366, "grad_norm": 0.22273315489292145, "learning_rate": 1.8875879048057362e-05, "loss": 1.2859, "step": 5963 }, { "epoch": 1.7763547348238053, "grad_norm": 0.2204713076353073, "learning_rate": 1.8875434690485973e-05, "loss": 1.2511, "step": 5964 }, { "epoch": 1.7766525810234741, "grad_norm": 0.21425484120845795, "learning_rate": 1.8874990250338582e-05, "loss": 1.2644, "step": 5965 }, { "epoch": 1.7769504272231427, "grad_norm": 0.22761522233486176, "learning_rate": 1.8874545727619327e-05, "loss": 1.2703, "step": 5966 }, { "epoch": 1.7772482734228112, "grad_norm": 0.23269104957580566, "learning_rate": 1.8874101122332332e-05, "loss": 1.2499, "step": 5967 }, { "epoch": 1.77754611962248, "grad_norm": 0.2156340777873993, "learning_rate": 1.8873656434481744e-05, "loss": 1.288, "step": 5968 }, { "epoch": 1.7778439658221485, "grad_norm": 0.21951022744178772, "learning_rate": 1.88732116640717e-05, "loss": 1.2656, "step": 5969 }, { "epoch": 1.778141812021817, "grad_norm": 0.23585747182369232, "learning_rate": 1.887276681110633e-05, "loss": 1.2686, "step": 5970 }, { "epoch": 1.7784396582214859, "grad_norm": 0.2331666797399521, "learning_rate": 1.8872321875589782e-05, "loss": 1.2705, "step": 5971 }, { "epoch": 1.7787375044211546, "grad_norm": 0.2160840481519699, "learning_rate": 1.887187685752619e-05, "loss": 1.2581, "step": 5972 }, { "epoch": 1.7790353506208232, "grad_norm": 0.21729782223701477, "learning_rate": 1.8871431756919696e-05, "loss": 1.2701, "step": 5973 }, { "epoch": 1.7793331968204917, "grad_norm": 0.22713887691497803, "learning_rate": 1.887098657377444e-05, "loss": 1.2599, "step": 5974 }, { "epoch": 1.7796310430201605, "grad_norm": 0.22662410140037537, "learning_rate": 1.8870541308094567e-05, "loss": 1.2516, "step": 5975 }, { "epoch": 1.779928889219829, "grad_norm": 0.21475499868392944, "learning_rate": 1.8870095959884218e-05, "loss": 1.2561, "step": 5976 }, { "epoch": 1.7802267354194976, "grad_norm": 0.2308451533317566, "learning_rate": 1.8869650529147537e-05, "loss": 1.2583, "step": 5977 }, { "epoch": 1.7805245816191664, "grad_norm": 0.22059254348278046, "learning_rate": 1.8869205015888667e-05, "loss": 1.2695, "step": 5978 }, { "epoch": 1.7808224278188352, "grad_norm": 0.22483398020267487, "learning_rate": 1.8868759420111753e-05, "loss": 1.2714, "step": 5979 }, { "epoch": 1.7811202740185037, "grad_norm": 0.22063012421131134, "learning_rate": 1.886831374182094e-05, "loss": 1.266, "step": 5980 }, { "epoch": 1.7814181202181723, "grad_norm": 0.2195800244808197, "learning_rate": 1.886786798102038e-05, "loss": 1.2838, "step": 5981 }, { "epoch": 1.781715966417841, "grad_norm": 0.22838428616523743, "learning_rate": 1.886742213771421e-05, "loss": 1.2724, "step": 5982 }, { "epoch": 1.7820138126175098, "grad_norm": 0.2218025028705597, "learning_rate": 1.8866976211906588e-05, "loss": 1.2689, "step": 5983 }, { "epoch": 1.7823116588171781, "grad_norm": 0.21701163053512573, "learning_rate": 1.886653020360166e-05, "loss": 1.2733, "step": 5984 }, { "epoch": 1.782609505016847, "grad_norm": 0.24475444853305817, "learning_rate": 1.8866084112803574e-05, "loss": 1.2692, "step": 5985 }, { "epoch": 1.7829073512165157, "grad_norm": 0.2269645631313324, "learning_rate": 1.886563793951648e-05, "loss": 1.2877, "step": 5986 }, { "epoch": 1.7832051974161842, "grad_norm": 0.2226429432630539, "learning_rate": 1.886519168374453e-05, "loss": 1.2505, "step": 5987 }, { "epoch": 1.7835030436158528, "grad_norm": 0.21845246851444244, "learning_rate": 1.886474534549188e-05, "loss": 1.2501, "step": 5988 }, { "epoch": 1.7838008898155215, "grad_norm": 0.21259817481040955, "learning_rate": 1.8864298924762673e-05, "loss": 1.2859, "step": 5989 }, { "epoch": 1.7840987360151903, "grad_norm": 0.21492299437522888, "learning_rate": 1.886385242156107e-05, "loss": 1.2627, "step": 5990 }, { "epoch": 1.7843965822148586, "grad_norm": 0.22381509840488434, "learning_rate": 1.8863405835891225e-05, "loss": 1.278, "step": 5991 }, { "epoch": 1.7846944284145274, "grad_norm": 0.22142374515533447, "learning_rate": 1.8862959167757294e-05, "loss": 1.2726, "step": 5992 }, { "epoch": 1.7849922746141962, "grad_norm": 0.22224482893943787, "learning_rate": 1.886251241716343e-05, "loss": 1.2676, "step": 5993 }, { "epoch": 1.7852901208138647, "grad_norm": 0.21727298200130463, "learning_rate": 1.8862065584113783e-05, "loss": 1.2637, "step": 5994 }, { "epoch": 1.7855879670135333, "grad_norm": 0.21374741196632385, "learning_rate": 1.8861618668612523e-05, "loss": 1.2681, "step": 5995 }, { "epoch": 1.785885813213202, "grad_norm": 0.22146710753440857, "learning_rate": 1.88611716706638e-05, "loss": 1.2724, "step": 5996 }, { "epoch": 1.7861836594128708, "grad_norm": 0.22528040409088135, "learning_rate": 1.8860724590271775e-05, "loss": 1.2778, "step": 5997 }, { "epoch": 1.7864815056125394, "grad_norm": 0.22059322893619537, "learning_rate": 1.8860277427440608e-05, "loss": 1.2506, "step": 5998 }, { "epoch": 1.786779351812208, "grad_norm": 0.22748437523841858, "learning_rate": 1.885983018217446e-05, "loss": 1.2794, "step": 5999 }, { "epoch": 1.7870771980118767, "grad_norm": 0.2144092172384262, "learning_rate": 1.8859382854477484e-05, "loss": 1.2594, "step": 6000 }, { "epoch": 1.7870771980118767, "eval_loss": 1.3452653884887695, "eval_runtime": 23.4453, "eval_samples_per_second": 73.959, "eval_steps_per_second": 4.649, "step": 6000 }, { "epoch": 1.7873750442115452, "grad_norm": 0.228704035282135, "learning_rate": 1.8858935444353856e-05, "loss": 1.2717, "step": 6001 }, { "epoch": 1.7876728904112138, "grad_norm": 0.21824924647808075, "learning_rate": 1.885848795180773e-05, "loss": 1.2632, "step": 6002 }, { "epoch": 1.7879707366108826, "grad_norm": 0.23707593977451324, "learning_rate": 1.8858040376843272e-05, "loss": 1.2665, "step": 6003 }, { "epoch": 1.7882685828105513, "grad_norm": 0.23337838053703308, "learning_rate": 1.8857592719464644e-05, "loss": 1.2768, "step": 6004 }, { "epoch": 1.78856642901022, "grad_norm": 0.23033075034618378, "learning_rate": 1.8857144979676013e-05, "loss": 1.2716, "step": 6005 }, { "epoch": 1.7888642752098884, "grad_norm": 0.2230740189552307, "learning_rate": 1.885669715748154e-05, "loss": 1.2663, "step": 6006 }, { "epoch": 1.7891621214095572, "grad_norm": 0.22400899231433868, "learning_rate": 1.8856249252885398e-05, "loss": 1.2693, "step": 6007 }, { "epoch": 1.7894599676092258, "grad_norm": 0.2205016165971756, "learning_rate": 1.8855801265891753e-05, "loss": 1.2592, "step": 6008 }, { "epoch": 1.7897578138088943, "grad_norm": 0.220457524061203, "learning_rate": 1.885535319650477e-05, "loss": 1.2593, "step": 6009 }, { "epoch": 1.790055660008563, "grad_norm": 0.22970102727413177, "learning_rate": 1.8854905044728617e-05, "loss": 1.2751, "step": 6010 }, { "epoch": 1.7903535062082319, "grad_norm": 0.22625747323036194, "learning_rate": 1.8854456810567467e-05, "loss": 1.286, "step": 6011 }, { "epoch": 1.7906513524079004, "grad_norm": 0.2235589474439621, "learning_rate": 1.885400849402549e-05, "loss": 1.2779, "step": 6012 }, { "epoch": 1.790949198607569, "grad_norm": 0.22546285390853882, "learning_rate": 1.8853560095106857e-05, "loss": 1.2683, "step": 6013 }, { "epoch": 1.7912470448072377, "grad_norm": 0.22701458632946014, "learning_rate": 1.8853111613815738e-05, "loss": 1.2497, "step": 6014 }, { "epoch": 1.7915448910069063, "grad_norm": 0.22597983479499817, "learning_rate": 1.885266305015631e-05, "loss": 1.2692, "step": 6015 }, { "epoch": 1.7918427372065748, "grad_norm": 0.22108972072601318, "learning_rate": 1.885221440413274e-05, "loss": 1.2684, "step": 6016 }, { "epoch": 1.7921405834062436, "grad_norm": 0.21375659108161926, "learning_rate": 1.8851765675749203e-05, "loss": 1.2581, "step": 6017 }, { "epoch": 1.7924384296059124, "grad_norm": 0.2176457941532135, "learning_rate": 1.8851316865009877e-05, "loss": 1.274, "step": 6018 }, { "epoch": 1.792736275805581, "grad_norm": 0.23012582957744598, "learning_rate": 1.885086797191894e-05, "loss": 1.2551, "step": 6019 }, { "epoch": 1.7930341220052495, "grad_norm": 0.22473666071891785, "learning_rate": 1.8850418996480565e-05, "loss": 1.2645, "step": 6020 }, { "epoch": 1.7933319682049182, "grad_norm": 0.22484521567821503, "learning_rate": 1.8849969938698927e-05, "loss": 1.2701, "step": 6021 }, { "epoch": 1.7936298144045868, "grad_norm": 0.211605042219162, "learning_rate": 1.884952079857821e-05, "loss": 1.2729, "step": 6022 }, { "epoch": 1.7939276606042553, "grad_norm": 0.2143273800611496, "learning_rate": 1.8849071576122584e-05, "loss": 1.2842, "step": 6023 }, { "epoch": 1.7942255068039241, "grad_norm": 0.21585173904895782, "learning_rate": 1.8848622271336235e-05, "loss": 1.2512, "step": 6024 }, { "epoch": 1.794523353003593, "grad_norm": 0.2228710949420929, "learning_rate": 1.8848172884223345e-05, "loss": 1.2592, "step": 6025 }, { "epoch": 1.7948211992032614, "grad_norm": 0.23387007415294647, "learning_rate": 1.8847723414788093e-05, "loss": 1.2594, "step": 6026 }, { "epoch": 1.79511904540293, "grad_norm": 0.22081926465034485, "learning_rate": 1.8847273863034658e-05, "loss": 1.2696, "step": 6027 }, { "epoch": 1.7954168916025988, "grad_norm": 0.21065343916416168, "learning_rate": 1.8846824228967222e-05, "loss": 1.2764, "step": 6028 }, { "epoch": 1.7957147378022673, "grad_norm": 0.2157675176858902, "learning_rate": 1.8846374512589972e-05, "loss": 1.2786, "step": 6029 }, { "epoch": 1.7960125840019359, "grad_norm": 0.2320483773946762, "learning_rate": 1.8845924713907094e-05, "loss": 1.2599, "step": 6030 }, { "epoch": 1.7963104302016046, "grad_norm": 0.2298630177974701, "learning_rate": 1.8845474832922764e-05, "loss": 1.2671, "step": 6031 }, { "epoch": 1.7966082764012734, "grad_norm": 0.25772473216056824, "learning_rate": 1.884502486964118e-05, "loss": 1.2763, "step": 6032 }, { "epoch": 1.796906122600942, "grad_norm": 0.2270805686712265, "learning_rate": 1.884457482406652e-05, "loss": 1.2822, "step": 6033 }, { "epoch": 1.7972039688006105, "grad_norm": 0.24416397511959076, "learning_rate": 1.884412469620297e-05, "loss": 1.2835, "step": 6034 }, { "epoch": 1.7975018150002793, "grad_norm": 0.2223682850599289, "learning_rate": 1.884367448605472e-05, "loss": 1.2612, "step": 6035 }, { "epoch": 1.7977996611999478, "grad_norm": 0.26283374428749084, "learning_rate": 1.8843224193625967e-05, "loss": 1.2863, "step": 6036 }, { "epoch": 1.7980975073996164, "grad_norm": 0.23235592246055603, "learning_rate": 1.8842773818920887e-05, "loss": 1.2712, "step": 6037 }, { "epoch": 1.7983953535992852, "grad_norm": 0.2454880326986313, "learning_rate": 1.884232336194368e-05, "loss": 1.2395, "step": 6038 }, { "epoch": 1.798693199798954, "grad_norm": 0.23668311536312103, "learning_rate": 1.8841872822698528e-05, "loss": 1.2666, "step": 6039 }, { "epoch": 1.7989910459986225, "grad_norm": 0.2175203114748001, "learning_rate": 1.8841422201189633e-05, "loss": 1.2679, "step": 6040 }, { "epoch": 1.799288892198291, "grad_norm": 0.22800126671791077, "learning_rate": 1.884097149742118e-05, "loss": 1.264, "step": 6041 }, { "epoch": 1.7995867383979598, "grad_norm": 0.2290138304233551, "learning_rate": 1.8840520711397367e-05, "loss": 1.2478, "step": 6042 }, { "epoch": 1.7998845845976283, "grad_norm": 0.24279721081256866, "learning_rate": 1.8840069843122384e-05, "loss": 1.2856, "step": 6043 }, { "epoch": 1.800182430797297, "grad_norm": 0.23835624754428864, "learning_rate": 1.8839618892600427e-05, "loss": 1.2635, "step": 6044 }, { "epoch": 1.8004802769969657, "grad_norm": 0.21752871572971344, "learning_rate": 1.8839167859835695e-05, "loss": 1.2784, "step": 6045 }, { "epoch": 1.8007781231966344, "grad_norm": 0.23671525716781616, "learning_rate": 1.8838716744832385e-05, "loss": 1.2689, "step": 6046 }, { "epoch": 1.801075969396303, "grad_norm": 0.22538290917873383, "learning_rate": 1.8838265547594684e-05, "loss": 1.2965, "step": 6047 }, { "epoch": 1.8013738155959715, "grad_norm": 0.2299932986497879, "learning_rate": 1.8837814268126798e-05, "loss": 1.267, "step": 6048 }, { "epoch": 1.8016716617956403, "grad_norm": 0.2351229190826416, "learning_rate": 1.8837362906432928e-05, "loss": 1.2631, "step": 6049 }, { "epoch": 1.801969507995309, "grad_norm": 0.22970789670944214, "learning_rate": 1.8836911462517268e-05, "loss": 1.2736, "step": 6050 }, { "epoch": 1.8022673541949774, "grad_norm": 0.22385859489440918, "learning_rate": 1.883645993638402e-05, "loss": 1.2694, "step": 6051 }, { "epoch": 1.8025652003946462, "grad_norm": 0.2234676629304886, "learning_rate": 1.8836008328037384e-05, "loss": 1.2633, "step": 6052 }, { "epoch": 1.802863046594315, "grad_norm": 0.2438935488462448, "learning_rate": 1.8835556637481566e-05, "loss": 1.276, "step": 6053 }, { "epoch": 1.8031608927939835, "grad_norm": 0.2130393236875534, "learning_rate": 1.883510486472076e-05, "loss": 1.2765, "step": 6054 }, { "epoch": 1.803458738993652, "grad_norm": 0.2313595861196518, "learning_rate": 1.8834653009759177e-05, "loss": 1.292, "step": 6055 }, { "epoch": 1.8037565851933208, "grad_norm": 0.2259809374809265, "learning_rate": 1.883420107260102e-05, "loss": 1.2533, "step": 6056 }, { "epoch": 1.8040544313929896, "grad_norm": 0.22493532299995422, "learning_rate": 1.883374905325049e-05, "loss": 1.281, "step": 6057 }, { "epoch": 1.804352277592658, "grad_norm": 0.2182721346616745, "learning_rate": 1.8833296951711793e-05, "loss": 1.2652, "step": 6058 }, { "epoch": 1.8046501237923267, "grad_norm": 0.22102218866348267, "learning_rate": 1.883284476798914e-05, "loss": 1.2588, "step": 6059 }, { "epoch": 1.8049479699919955, "grad_norm": 0.2173367142677307, "learning_rate": 1.8832392502086736e-05, "loss": 1.2644, "step": 6060 }, { "epoch": 1.805245816191664, "grad_norm": 0.22203823924064636, "learning_rate": 1.8831940154008785e-05, "loss": 1.2599, "step": 6061 }, { "epoch": 1.8055436623913326, "grad_norm": 0.21372835338115692, "learning_rate": 1.88314877237595e-05, "loss": 1.2772, "step": 6062 }, { "epoch": 1.8058415085910013, "grad_norm": 0.22985833883285522, "learning_rate": 1.8831035211343088e-05, "loss": 1.2799, "step": 6063 }, { "epoch": 1.8061393547906701, "grad_norm": 0.21987077593803406, "learning_rate": 1.883058261676376e-05, "loss": 1.2606, "step": 6064 }, { "epoch": 1.8064372009903387, "grad_norm": 0.22975841164588928, "learning_rate": 1.883012994002573e-05, "loss": 1.2736, "step": 6065 }, { "epoch": 1.8067350471900072, "grad_norm": 0.22226479649543762, "learning_rate": 1.8829677181133202e-05, "loss": 1.2683, "step": 6066 }, { "epoch": 1.807032893389676, "grad_norm": 0.22626134753227234, "learning_rate": 1.8829224340090398e-05, "loss": 1.2783, "step": 6067 }, { "epoch": 1.8073307395893445, "grad_norm": 0.21422359347343445, "learning_rate": 1.882877141690152e-05, "loss": 1.2742, "step": 6068 }, { "epoch": 1.807628585789013, "grad_norm": 0.23135624825954437, "learning_rate": 1.882831841157079e-05, "loss": 1.2614, "step": 6069 }, { "epoch": 1.8079264319886819, "grad_norm": 0.24068643152713776, "learning_rate": 1.8827865324102426e-05, "loss": 1.2823, "step": 6070 }, { "epoch": 1.8082242781883506, "grad_norm": 0.22447143495082855, "learning_rate": 1.8827412154500634e-05, "loss": 1.2554, "step": 6071 }, { "epoch": 1.8085221243880192, "grad_norm": 0.22670595347881317, "learning_rate": 1.8826958902769636e-05, "loss": 1.2683, "step": 6072 }, { "epoch": 1.8088199705876877, "grad_norm": 0.21980814635753632, "learning_rate": 1.8826505568913644e-05, "loss": 1.2773, "step": 6073 }, { "epoch": 1.8091178167873565, "grad_norm": 0.23693813383579254, "learning_rate": 1.882605215293688e-05, "loss": 1.2541, "step": 6074 }, { "epoch": 1.809415662987025, "grad_norm": 0.2204945832490921, "learning_rate": 1.8825598654843563e-05, "loss": 1.2771, "step": 6075 }, { "epoch": 1.8097135091866936, "grad_norm": 0.22734081745147705, "learning_rate": 1.882514507463791e-05, "loss": 1.2766, "step": 6076 }, { "epoch": 1.8100113553863624, "grad_norm": 0.2248615026473999, "learning_rate": 1.8824691412324144e-05, "loss": 1.2861, "step": 6077 }, { "epoch": 1.8103092015860311, "grad_norm": 0.25749897956848145, "learning_rate": 1.882423766790648e-05, "loss": 1.2697, "step": 6078 }, { "epoch": 1.8106070477856997, "grad_norm": 0.21990980207920074, "learning_rate": 1.882378384138915e-05, "loss": 1.2709, "step": 6079 }, { "epoch": 1.8109048939853682, "grad_norm": 0.22228093445301056, "learning_rate": 1.8823329932776367e-05, "loss": 1.2844, "step": 6080 }, { "epoch": 1.811202740185037, "grad_norm": 0.22391878068447113, "learning_rate": 1.8822875942072355e-05, "loss": 1.2648, "step": 6081 }, { "epoch": 1.8115005863847056, "grad_norm": 0.22353194653987885, "learning_rate": 1.8822421869281344e-05, "loss": 1.2751, "step": 6082 }, { "epoch": 1.8117984325843741, "grad_norm": 0.21903499960899353, "learning_rate": 1.882196771440755e-05, "loss": 1.2792, "step": 6083 }, { "epoch": 1.812096278784043, "grad_norm": 0.22099357843399048, "learning_rate": 1.8821513477455205e-05, "loss": 1.2714, "step": 6084 }, { "epoch": 1.8123941249837117, "grad_norm": 0.21618127822875977, "learning_rate": 1.8821059158428534e-05, "loss": 1.2577, "step": 6085 }, { "epoch": 1.8126919711833802, "grad_norm": 0.21984921395778656, "learning_rate": 1.8820604757331763e-05, "loss": 1.2557, "step": 6086 }, { "epoch": 1.8129898173830488, "grad_norm": 0.22133424878120422, "learning_rate": 1.8820150274169115e-05, "loss": 1.2793, "step": 6087 }, { "epoch": 1.8132876635827175, "grad_norm": 0.22182853519916534, "learning_rate": 1.8819695708944827e-05, "loss": 1.279, "step": 6088 }, { "epoch": 1.813585509782386, "grad_norm": 0.22566376626491547, "learning_rate": 1.8819241061663124e-05, "loss": 1.2718, "step": 6089 }, { "epoch": 1.8138833559820546, "grad_norm": 0.2206290364265442, "learning_rate": 1.8818786332328236e-05, "loss": 1.2608, "step": 6090 }, { "epoch": 1.8141812021817234, "grad_norm": 0.21377462148666382, "learning_rate": 1.8818331520944396e-05, "loss": 1.2419, "step": 6091 }, { "epoch": 1.8144790483813922, "grad_norm": 0.2487218827009201, "learning_rate": 1.8817876627515834e-05, "loss": 1.2748, "step": 6092 }, { "epoch": 1.8147768945810607, "grad_norm": 0.22330373525619507, "learning_rate": 1.8817421652046777e-05, "loss": 1.2595, "step": 6093 }, { "epoch": 1.8150747407807293, "grad_norm": 0.23433241248130798, "learning_rate": 1.8816966594541465e-05, "loss": 1.2688, "step": 6094 }, { "epoch": 1.815372586980398, "grad_norm": 0.2255052775144577, "learning_rate": 1.8816511455004133e-05, "loss": 1.2642, "step": 6095 }, { "epoch": 1.8156704331800666, "grad_norm": 0.22542732954025269, "learning_rate": 1.8816056233439008e-05, "loss": 1.2643, "step": 6096 }, { "epoch": 1.8159682793797352, "grad_norm": 0.23201413452625275, "learning_rate": 1.8815600929850333e-05, "loss": 1.2647, "step": 6097 }, { "epoch": 1.816266125579404, "grad_norm": 0.23117925226688385, "learning_rate": 1.881514554424234e-05, "loss": 1.2517, "step": 6098 }, { "epoch": 1.8165639717790727, "grad_norm": 0.22815728187561035, "learning_rate": 1.8814690076619264e-05, "loss": 1.2622, "step": 6099 }, { "epoch": 1.8168618179787412, "grad_norm": 0.22319307923316956, "learning_rate": 1.8814234526985346e-05, "loss": 1.2719, "step": 6100 }, { "epoch": 1.8171596641784098, "grad_norm": 0.2264396995306015, "learning_rate": 1.8813778895344826e-05, "loss": 1.2772, "step": 6101 }, { "epoch": 1.8174575103780786, "grad_norm": 0.23161791265010834, "learning_rate": 1.8813323181701938e-05, "loss": 1.2572, "step": 6102 }, { "epoch": 1.8177553565777471, "grad_norm": 0.228708416223526, "learning_rate": 1.8812867386060928e-05, "loss": 1.2874, "step": 6103 }, { "epoch": 1.8180532027774157, "grad_norm": 0.21395297348499298, "learning_rate": 1.881241150842603e-05, "loss": 1.2519, "step": 6104 }, { "epoch": 1.8183510489770844, "grad_norm": 0.22581659257411957, "learning_rate": 1.8811955548801492e-05, "loss": 1.2628, "step": 6105 }, { "epoch": 1.8186488951767532, "grad_norm": 0.23664312064647675, "learning_rate": 1.881149950719155e-05, "loss": 1.2683, "step": 6106 }, { "epoch": 1.8189467413764218, "grad_norm": 0.23028619587421417, "learning_rate": 1.881104338360045e-05, "loss": 1.2732, "step": 6107 }, { "epoch": 1.8192445875760903, "grad_norm": 0.21617329120635986, "learning_rate": 1.8810587178032434e-05, "loss": 1.2808, "step": 6108 }, { "epoch": 1.819542433775759, "grad_norm": 0.21901915967464447, "learning_rate": 1.8810130890491755e-05, "loss": 1.2636, "step": 6109 }, { "epoch": 1.8198402799754279, "grad_norm": 0.23325960338115692, "learning_rate": 1.8809674520982643e-05, "loss": 1.281, "step": 6110 }, { "epoch": 1.8201381261750962, "grad_norm": 0.2988772392272949, "learning_rate": 1.880921806950936e-05, "loss": 1.2565, "step": 6111 }, { "epoch": 1.820435972374765, "grad_norm": 0.26141929626464844, "learning_rate": 1.880876153607614e-05, "loss": 1.2591, "step": 6112 }, { "epoch": 1.8207338185744337, "grad_norm": 0.27643170952796936, "learning_rate": 1.8808304920687238e-05, "loss": 1.2612, "step": 6113 }, { "epoch": 1.8210316647741023, "grad_norm": 0.3000950217247009, "learning_rate": 1.8807848223346895e-05, "loss": 1.2705, "step": 6114 }, { "epoch": 1.8213295109737708, "grad_norm": 0.21227096021175385, "learning_rate": 1.8807391444059368e-05, "loss": 1.2747, "step": 6115 }, { "epoch": 1.8216273571734396, "grad_norm": 0.22193902730941772, "learning_rate": 1.8806934582828908e-05, "loss": 1.2743, "step": 6116 }, { "epoch": 1.8219252033731084, "grad_norm": 0.22305090725421906, "learning_rate": 1.8806477639659757e-05, "loss": 1.2796, "step": 6117 }, { "epoch": 1.8222230495727767, "grad_norm": 0.22063802182674408, "learning_rate": 1.880602061455617e-05, "loss": 1.2766, "step": 6118 }, { "epoch": 1.8225208957724455, "grad_norm": 0.24384748935699463, "learning_rate": 1.8805563507522403e-05, "loss": 1.2711, "step": 6119 }, { "epoch": 1.8228187419721142, "grad_norm": 0.23025380074977875, "learning_rate": 1.8805106318562702e-05, "loss": 1.2682, "step": 6120 }, { "epoch": 1.8231165881717828, "grad_norm": 0.233215793967247, "learning_rate": 1.8804649047681328e-05, "loss": 1.2661, "step": 6121 }, { "epoch": 1.8234144343714513, "grad_norm": 0.21864233911037445, "learning_rate": 1.880419169488253e-05, "loss": 1.2664, "step": 6122 }, { "epoch": 1.8237122805711201, "grad_norm": 0.23096491396427155, "learning_rate": 1.8803734260170565e-05, "loss": 1.2575, "step": 6123 }, { "epoch": 1.8240101267707889, "grad_norm": 0.2311975210905075, "learning_rate": 1.880327674354969e-05, "loss": 1.2569, "step": 6124 }, { "epoch": 1.8243079729704572, "grad_norm": 0.24429549276828766, "learning_rate": 1.8802819145024157e-05, "loss": 1.2552, "step": 6125 }, { "epoch": 1.824605819170126, "grad_norm": 0.23141895234584808, "learning_rate": 1.880236146459823e-05, "loss": 1.2498, "step": 6126 }, { "epoch": 1.8249036653697948, "grad_norm": 0.21364538371562958, "learning_rate": 1.8801903702276164e-05, "loss": 1.2581, "step": 6127 }, { "epoch": 1.8252015115694633, "grad_norm": 0.21442024409770966, "learning_rate": 1.880144585806222e-05, "loss": 1.2776, "step": 6128 }, { "epoch": 1.8254993577691319, "grad_norm": 0.2209845632314682, "learning_rate": 1.880098793196065e-05, "loss": 1.2797, "step": 6129 }, { "epoch": 1.8257972039688006, "grad_norm": 0.20947112143039703, "learning_rate": 1.8800529923975726e-05, "loss": 1.2527, "step": 6130 }, { "epoch": 1.8260950501684694, "grad_norm": 0.21361005306243896, "learning_rate": 1.8800071834111704e-05, "loss": 1.2467, "step": 6131 }, { "epoch": 1.826392896368138, "grad_norm": 0.2133372575044632, "learning_rate": 1.879961366237284e-05, "loss": 1.2806, "step": 6132 }, { "epoch": 1.8266907425678065, "grad_norm": 0.22230187058448792, "learning_rate": 1.8799155408763407e-05, "loss": 1.2621, "step": 6133 }, { "epoch": 1.8269885887674753, "grad_norm": 0.22517280280590057, "learning_rate": 1.879869707328766e-05, "loss": 1.2427, "step": 6134 }, { "epoch": 1.8272864349671438, "grad_norm": 0.2517028748989105, "learning_rate": 1.8798238655949873e-05, "loss": 1.2958, "step": 6135 }, { "epoch": 1.8275842811668124, "grad_norm": 0.25225141644477844, "learning_rate": 1.8797780156754303e-05, "loss": 1.2589, "step": 6136 }, { "epoch": 1.8278821273664811, "grad_norm": 0.22715818881988525, "learning_rate": 1.8797321575705216e-05, "loss": 1.2772, "step": 6137 }, { "epoch": 1.82817997356615, "grad_norm": 0.23603639006614685, "learning_rate": 1.879686291280688e-05, "loss": 1.258, "step": 6138 }, { "epoch": 1.8284778197658185, "grad_norm": 0.2428574413061142, "learning_rate": 1.8796404168063564e-05, "loss": 1.2743, "step": 6139 }, { "epoch": 1.828775665965487, "grad_norm": 0.21798360347747803, "learning_rate": 1.8795945341479537e-05, "loss": 1.2665, "step": 6140 }, { "epoch": 1.8290735121651558, "grad_norm": 0.25193336606025696, "learning_rate": 1.8795486433059067e-05, "loss": 1.2804, "step": 6141 }, { "epoch": 1.8293713583648243, "grad_norm": 0.24949511885643005, "learning_rate": 1.879502744280642e-05, "loss": 1.2491, "step": 6142 }, { "epoch": 1.829669204564493, "grad_norm": 0.22949185967445374, "learning_rate": 1.8794568370725867e-05, "loss": 1.2611, "step": 6143 }, { "epoch": 1.8299670507641617, "grad_norm": 0.3252366781234741, "learning_rate": 1.8794109216821686e-05, "loss": 1.2688, "step": 6144 }, { "epoch": 1.8302648969638304, "grad_norm": 0.2781335711479187, "learning_rate": 1.879364998109814e-05, "loss": 1.2864, "step": 6145 }, { "epoch": 1.830562743163499, "grad_norm": 0.248800590634346, "learning_rate": 1.879319066355951e-05, "loss": 1.2746, "step": 6146 }, { "epoch": 1.8308605893631675, "grad_norm": 0.2256658971309662, "learning_rate": 1.8792731264210063e-05, "loss": 1.2705, "step": 6147 }, { "epoch": 1.8311584355628363, "grad_norm": 0.32897138595581055, "learning_rate": 1.8792271783054072e-05, "loss": 1.2777, "step": 6148 }, { "epoch": 1.8314562817625049, "grad_norm": 0.22394399344921112, "learning_rate": 1.879181222009582e-05, "loss": 1.2753, "step": 6149 }, { "epoch": 1.8317541279621734, "grad_norm": 0.22119250893592834, "learning_rate": 1.8791352575339577e-05, "loss": 1.2642, "step": 6150 }, { "epoch": 1.8320519741618422, "grad_norm": 0.2309536635875702, "learning_rate": 1.879089284878962e-05, "loss": 1.2729, "step": 6151 }, { "epoch": 1.832349820361511, "grad_norm": 0.22146302461624146, "learning_rate": 1.8790433040450227e-05, "loss": 1.2569, "step": 6152 }, { "epoch": 1.8326476665611795, "grad_norm": 0.22456271946430206, "learning_rate": 1.878997315032568e-05, "loss": 1.2745, "step": 6153 }, { "epoch": 1.832945512760848, "grad_norm": 0.2310352325439453, "learning_rate": 1.8789513178420246e-05, "loss": 1.2688, "step": 6154 }, { "epoch": 1.8332433589605168, "grad_norm": 0.24492207169532776, "learning_rate": 1.8789053124738213e-05, "loss": 1.2598, "step": 6155 }, { "epoch": 1.8335412051601854, "grad_norm": 0.2163112908601761, "learning_rate": 1.8788592989283863e-05, "loss": 1.2659, "step": 6156 }, { "epoch": 1.833839051359854, "grad_norm": 0.22027339041233063, "learning_rate": 1.8788132772061476e-05, "loss": 1.2685, "step": 6157 }, { "epoch": 1.8341368975595227, "grad_norm": 0.22436580061912537, "learning_rate": 1.878767247307533e-05, "loss": 1.2487, "step": 6158 }, { "epoch": 1.8344347437591915, "grad_norm": 0.2317281812429428, "learning_rate": 1.8787212092329713e-05, "loss": 1.265, "step": 6159 }, { "epoch": 1.83473258995886, "grad_norm": 0.2352665215730667, "learning_rate": 1.87867516298289e-05, "loss": 1.2836, "step": 6160 }, { "epoch": 1.8350304361585286, "grad_norm": 0.2499164193868637, "learning_rate": 1.878629108557718e-05, "loss": 1.2638, "step": 6161 }, { "epoch": 1.8353282823581973, "grad_norm": 0.2163504809141159, "learning_rate": 1.8785830459578845e-05, "loss": 1.2662, "step": 6162 }, { "epoch": 1.8356261285578659, "grad_norm": 0.2350332885980606, "learning_rate": 1.878536975183817e-05, "loss": 1.2497, "step": 6163 }, { "epoch": 1.8359239747575344, "grad_norm": 0.23730915784835815, "learning_rate": 1.8784908962359443e-05, "loss": 1.2606, "step": 6164 }, { "epoch": 1.8362218209572032, "grad_norm": 0.23150528967380524, "learning_rate": 1.8784448091146953e-05, "loss": 1.2739, "step": 6165 }, { "epoch": 1.836519667156872, "grad_norm": 0.29068508744239807, "learning_rate": 1.8783987138204992e-05, "loss": 1.2667, "step": 6166 }, { "epoch": 1.8368175133565405, "grad_norm": 0.21419180929660797, "learning_rate": 1.878352610353784e-05, "loss": 1.2655, "step": 6167 }, { "epoch": 1.837115359556209, "grad_norm": 0.2460484802722931, "learning_rate": 1.8783064987149796e-05, "loss": 1.2737, "step": 6168 }, { "epoch": 1.8374132057558779, "grad_norm": 0.2468438595533371, "learning_rate": 1.8782603789045143e-05, "loss": 1.283, "step": 6169 }, { "epoch": 1.8377110519555464, "grad_norm": 0.22701148688793182, "learning_rate": 1.8782142509228174e-05, "loss": 1.2797, "step": 6170 }, { "epoch": 1.838008898155215, "grad_norm": 0.2519301772117615, "learning_rate": 1.8781681147703178e-05, "loss": 1.2766, "step": 6171 }, { "epoch": 1.8383067443548837, "grad_norm": 0.2273082137107849, "learning_rate": 1.8781219704474457e-05, "loss": 1.2673, "step": 6172 }, { "epoch": 1.8386045905545525, "grad_norm": 0.22925065457820892, "learning_rate": 1.8780758179546295e-05, "loss": 1.2771, "step": 6173 }, { "epoch": 1.838902436754221, "grad_norm": 0.22185562551021576, "learning_rate": 1.878029657292299e-05, "loss": 1.2721, "step": 6174 }, { "epoch": 1.8392002829538896, "grad_norm": 0.22061800956726074, "learning_rate": 1.877983488460883e-05, "loss": 1.2774, "step": 6175 }, { "epoch": 1.8394981291535584, "grad_norm": 0.22943070530891418, "learning_rate": 1.8779373114608116e-05, "loss": 1.26, "step": 6176 }, { "epoch": 1.8397959753532271, "grad_norm": 0.2462579756975174, "learning_rate": 1.8778911262925147e-05, "loss": 1.2783, "step": 6177 }, { "epoch": 1.8400938215528955, "grad_norm": 0.24037913978099823, "learning_rate": 1.877844932956422e-05, "loss": 1.2715, "step": 6178 }, { "epoch": 1.8403916677525642, "grad_norm": 0.2180246114730835, "learning_rate": 1.8777987314529625e-05, "loss": 1.2808, "step": 6179 }, { "epoch": 1.840689513952233, "grad_norm": 0.23191234469413757, "learning_rate": 1.877752521782567e-05, "loss": 1.2812, "step": 6180 }, { "epoch": 1.8409873601519016, "grad_norm": 0.22117017209529877, "learning_rate": 1.8777063039456644e-05, "loss": 1.2523, "step": 6181 }, { "epoch": 1.8412852063515701, "grad_norm": 0.22852277755737305, "learning_rate": 1.8776600779426856e-05, "loss": 1.2662, "step": 6182 }, { "epoch": 1.8415830525512389, "grad_norm": 0.22987455129623413, "learning_rate": 1.87761384377406e-05, "loss": 1.2726, "step": 6183 }, { "epoch": 1.8418808987509077, "grad_norm": 0.21900035440921783, "learning_rate": 1.8775676014402187e-05, "loss": 1.2724, "step": 6184 }, { "epoch": 1.842178744950576, "grad_norm": 0.23168663680553436, "learning_rate": 1.8775213509415913e-05, "loss": 1.2658, "step": 6185 }, { "epoch": 1.8424765911502448, "grad_norm": 0.23066608607769012, "learning_rate": 1.8774750922786077e-05, "loss": 1.2753, "step": 6186 }, { "epoch": 1.8427744373499135, "grad_norm": 0.23946154117584229, "learning_rate": 1.877428825451699e-05, "loss": 1.2471, "step": 6187 }, { "epoch": 1.843072283549582, "grad_norm": 0.2212175577878952, "learning_rate": 1.877382550461295e-05, "loss": 1.2615, "step": 6188 }, { "epoch": 1.8433701297492506, "grad_norm": 0.2367374747991562, "learning_rate": 1.8773362673078274e-05, "loss": 1.2637, "step": 6189 }, { "epoch": 1.8436679759489194, "grad_norm": 0.23277902603149414, "learning_rate": 1.8772899759917257e-05, "loss": 1.2641, "step": 6190 }, { "epoch": 1.8439658221485882, "grad_norm": 0.22886283695697784, "learning_rate": 1.877243676513421e-05, "loss": 1.273, "step": 6191 }, { "epoch": 1.8442636683482567, "grad_norm": 0.2127494364976883, "learning_rate": 1.877197368873344e-05, "loss": 1.2625, "step": 6192 }, { "epoch": 1.8445615145479253, "grad_norm": 0.21837003529071808, "learning_rate": 1.8771510530719253e-05, "loss": 1.2756, "step": 6193 }, { "epoch": 1.844859360747594, "grad_norm": 0.22271862626075745, "learning_rate": 1.877104729109596e-05, "loss": 1.2807, "step": 6194 }, { "epoch": 1.8451572069472626, "grad_norm": 0.2360060065984726, "learning_rate": 1.8770583969867876e-05, "loss": 1.2673, "step": 6195 }, { "epoch": 1.8454550531469311, "grad_norm": 0.22648479044437408, "learning_rate": 1.87701205670393e-05, "loss": 1.2547, "step": 6196 }, { "epoch": 1.8457528993466, "grad_norm": 0.2194075882434845, "learning_rate": 1.876965708261456e-05, "loss": 1.2599, "step": 6197 }, { "epoch": 1.8460507455462687, "grad_norm": 0.2420044094324112, "learning_rate": 1.876919351659795e-05, "loss": 1.2629, "step": 6198 }, { "epoch": 1.8463485917459372, "grad_norm": 0.22396166622638702, "learning_rate": 1.87687298689938e-05, "loss": 1.2693, "step": 6199 }, { "epoch": 1.8466464379456058, "grad_norm": 0.25090858340263367, "learning_rate": 1.876826613980641e-05, "loss": 1.2653, "step": 6200 }, { "epoch": 1.8469442841452746, "grad_norm": 0.22225917875766754, "learning_rate": 1.8767802329040105e-05, "loss": 1.2446, "step": 6201 }, { "epoch": 1.8472421303449431, "grad_norm": 0.2735970616340637, "learning_rate": 1.8767338436699193e-05, "loss": 1.2748, "step": 6202 }, { "epoch": 1.8475399765446117, "grad_norm": 0.23733359575271606, "learning_rate": 1.876687446278799e-05, "loss": 1.294, "step": 6203 }, { "epoch": 1.8478378227442804, "grad_norm": 0.24605488777160645, "learning_rate": 1.8766410407310817e-05, "loss": 1.2947, "step": 6204 }, { "epoch": 1.8481356689439492, "grad_norm": 0.23347657918930054, "learning_rate": 1.8765946270271986e-05, "loss": 1.2678, "step": 6205 }, { "epoch": 1.8484335151436178, "grad_norm": 0.241032212972641, "learning_rate": 1.8765482051675823e-05, "loss": 1.2678, "step": 6206 }, { "epoch": 1.8487313613432863, "grad_norm": 0.2312811017036438, "learning_rate": 1.8765017751526642e-05, "loss": 1.2826, "step": 6207 }, { "epoch": 1.849029207542955, "grad_norm": 0.2209426313638687, "learning_rate": 1.8764553369828765e-05, "loss": 1.2546, "step": 6208 }, { "epoch": 1.8493270537426236, "grad_norm": 0.22778834402561188, "learning_rate": 1.876408890658651e-05, "loss": 1.2604, "step": 6209 }, { "epoch": 1.8496248999422922, "grad_norm": 0.2315870225429535, "learning_rate": 1.8763624361804198e-05, "loss": 1.2737, "step": 6210 }, { "epoch": 1.849922746141961, "grad_norm": 0.2249654233455658, "learning_rate": 1.8763159735486153e-05, "loss": 1.292, "step": 6211 }, { "epoch": 1.8502205923416297, "grad_norm": 0.2213476002216339, "learning_rate": 1.87626950276367e-05, "loss": 1.2748, "step": 6212 }, { "epoch": 1.8505184385412983, "grad_norm": 0.22676432132720947, "learning_rate": 1.876223023826016e-05, "loss": 1.2649, "step": 6213 }, { "epoch": 1.8508162847409668, "grad_norm": 0.24635079503059387, "learning_rate": 1.876176536736085e-05, "loss": 1.2729, "step": 6214 }, { "epoch": 1.8511141309406356, "grad_norm": 0.2461482584476471, "learning_rate": 1.876130041494311e-05, "loss": 1.2471, "step": 6215 }, { "epoch": 1.8514119771403041, "grad_norm": 0.22454310953617096, "learning_rate": 1.876083538101126e-05, "loss": 1.2645, "step": 6216 }, { "epoch": 1.8517098233399727, "grad_norm": 0.26606985926628113, "learning_rate": 1.876037026556962e-05, "loss": 1.2822, "step": 6217 }, { "epoch": 1.8520076695396415, "grad_norm": 0.2288210541009903, "learning_rate": 1.8759905068622523e-05, "loss": 1.2579, "step": 6218 }, { "epoch": 1.8523055157393102, "grad_norm": 0.21645639836788177, "learning_rate": 1.8759439790174297e-05, "loss": 1.2595, "step": 6219 }, { "epoch": 1.8526033619389788, "grad_norm": 0.22343234717845917, "learning_rate": 1.875897443022927e-05, "loss": 1.2691, "step": 6220 }, { "epoch": 1.8529012081386473, "grad_norm": 0.24432212114334106, "learning_rate": 1.8758508988791773e-05, "loss": 1.2626, "step": 6221 }, { "epoch": 1.853199054338316, "grad_norm": 0.22526809573173523, "learning_rate": 1.875804346586614e-05, "loss": 1.2552, "step": 6222 }, { "epoch": 1.8534969005379847, "grad_norm": 0.2272035777568817, "learning_rate": 1.8757577861456687e-05, "loss": 1.2497, "step": 6223 }, { "epoch": 1.8537947467376532, "grad_norm": 0.24414905905723572, "learning_rate": 1.8757112175567765e-05, "loss": 1.2659, "step": 6224 }, { "epoch": 1.854092592937322, "grad_norm": 0.21001029014587402, "learning_rate": 1.8756646408203695e-05, "loss": 1.2712, "step": 6225 }, { "epoch": 1.8543904391369908, "grad_norm": 0.26856371760368347, "learning_rate": 1.8756180559368812e-05, "loss": 1.2719, "step": 6226 }, { "epoch": 1.8546882853366593, "grad_norm": 0.2772649824619293, "learning_rate": 1.8755714629067455e-05, "loss": 1.2714, "step": 6227 }, { "epoch": 1.8549861315363279, "grad_norm": 0.24141082167625427, "learning_rate": 1.8755248617303955e-05, "loss": 1.2489, "step": 6228 }, { "epoch": 1.8552839777359966, "grad_norm": 0.3939058184623718, "learning_rate": 1.875478252408265e-05, "loss": 1.2715, "step": 6229 }, { "epoch": 1.8555818239356652, "grad_norm": 0.247290700674057, "learning_rate": 1.8754316349407875e-05, "loss": 1.2684, "step": 6230 }, { "epoch": 1.8558796701353337, "grad_norm": 0.2563334107398987, "learning_rate": 1.8753850093283966e-05, "loss": 1.2531, "step": 6231 }, { "epoch": 1.8561775163350025, "grad_norm": 0.23085132241249084, "learning_rate": 1.875338375571526e-05, "loss": 1.2906, "step": 6232 }, { "epoch": 1.8564753625346713, "grad_norm": 0.21739709377288818, "learning_rate": 1.87529173367061e-05, "loss": 1.2734, "step": 6233 }, { "epoch": 1.8567732087343398, "grad_norm": 0.2239406406879425, "learning_rate": 1.8752450836260823e-05, "loss": 1.2716, "step": 6234 }, { "epoch": 1.8570710549340084, "grad_norm": 0.25117847323417664, "learning_rate": 1.8751984254383773e-05, "loss": 1.2792, "step": 6235 }, { "epoch": 1.8573689011336771, "grad_norm": 0.2235771268606186, "learning_rate": 1.8751517591079284e-05, "loss": 1.268, "step": 6236 }, { "epoch": 1.8576667473333457, "grad_norm": 0.21495643258094788, "learning_rate": 1.8751050846351703e-05, "loss": 1.2744, "step": 6237 }, { "epoch": 1.8579645935330142, "grad_norm": 0.2274235486984253, "learning_rate": 1.875058402020537e-05, "loss": 1.2881, "step": 6238 }, { "epoch": 1.858262439732683, "grad_norm": 0.2241227924823761, "learning_rate": 1.875011711264463e-05, "loss": 1.2549, "step": 6239 }, { "epoch": 1.8585602859323518, "grad_norm": 0.22675253450870514, "learning_rate": 1.8749650123673828e-05, "loss": 1.2695, "step": 6240 }, { "epoch": 1.8588581321320203, "grad_norm": 0.21627160906791687, "learning_rate": 1.8749183053297305e-05, "loss": 1.2546, "step": 6241 }, { "epoch": 1.8591559783316889, "grad_norm": 0.23366791009902954, "learning_rate": 1.874871590151941e-05, "loss": 1.2689, "step": 6242 }, { "epoch": 1.8594538245313577, "grad_norm": 0.23596471548080444, "learning_rate": 1.874824866834449e-05, "loss": 1.2607, "step": 6243 }, { "epoch": 1.8597516707310264, "grad_norm": 0.22348742187023163, "learning_rate": 1.8747781353776885e-05, "loss": 1.2695, "step": 6244 }, { "epoch": 1.8600495169306948, "grad_norm": 0.2204684019088745, "learning_rate": 1.8747313957820955e-05, "loss": 1.2751, "step": 6245 }, { "epoch": 1.8603473631303635, "grad_norm": 0.22575503587722778, "learning_rate": 1.8746846480481036e-05, "loss": 1.264, "step": 6246 }, { "epoch": 1.8606452093300323, "grad_norm": 0.22473333775997162, "learning_rate": 1.8746378921761484e-05, "loss": 1.2665, "step": 6247 }, { "epoch": 1.8609430555297009, "grad_norm": 0.23724524676799774, "learning_rate": 1.874591128166665e-05, "loss": 1.265, "step": 6248 }, { "epoch": 1.8612409017293694, "grad_norm": 0.2172134816646576, "learning_rate": 1.8745443560200885e-05, "loss": 1.2743, "step": 6249 }, { "epoch": 1.8615387479290382, "grad_norm": 0.21474622189998627, "learning_rate": 1.8744975757368532e-05, "loss": 1.2775, "step": 6250 }, { "epoch": 1.861836594128707, "grad_norm": 0.21865083277225494, "learning_rate": 1.8744507873173955e-05, "loss": 1.2752, "step": 6251 }, { "epoch": 1.8621344403283753, "grad_norm": 0.2198323756456375, "learning_rate": 1.8744039907621504e-05, "loss": 1.2606, "step": 6252 }, { "epoch": 1.862432286528044, "grad_norm": 0.22255510091781616, "learning_rate": 1.874357186071553e-05, "loss": 1.27, "step": 6253 }, { "epoch": 1.8627301327277128, "grad_norm": 0.23042772710323334, "learning_rate": 1.8743103732460382e-05, "loss": 1.2692, "step": 6254 }, { "epoch": 1.8630279789273814, "grad_norm": 0.21695443987846375, "learning_rate": 1.874263552286043e-05, "loss": 1.2658, "step": 6255 }, { "epoch": 1.86332582512705, "grad_norm": 0.21861781179904938, "learning_rate": 1.8742167231920018e-05, "loss": 1.2713, "step": 6256 }, { "epoch": 1.8636236713267187, "grad_norm": 0.22876958549022675, "learning_rate": 1.874169885964351e-05, "loss": 1.2901, "step": 6257 }, { "epoch": 1.8639215175263875, "grad_norm": 0.21717725694179535, "learning_rate": 1.874123040603526e-05, "loss": 1.2666, "step": 6258 }, { "epoch": 1.864219363726056, "grad_norm": 0.23413631319999695, "learning_rate": 1.8740761871099625e-05, "loss": 1.2719, "step": 6259 }, { "epoch": 1.8645172099257246, "grad_norm": 0.2251828908920288, "learning_rate": 1.874029325484097e-05, "loss": 1.2702, "step": 6260 }, { "epoch": 1.8648150561253933, "grad_norm": 0.22034551203250885, "learning_rate": 1.873982455726365e-05, "loss": 1.2629, "step": 6261 }, { "epoch": 1.8651129023250619, "grad_norm": 0.2504265308380127, "learning_rate": 1.8739355778372025e-05, "loss": 1.2801, "step": 6262 }, { "epoch": 1.8654107485247304, "grad_norm": 0.22186152637004852, "learning_rate": 1.8738886918170463e-05, "loss": 1.2736, "step": 6263 }, { "epoch": 1.8657085947243992, "grad_norm": 0.22834238409996033, "learning_rate": 1.873841797666332e-05, "loss": 1.2809, "step": 6264 }, { "epoch": 1.866006440924068, "grad_norm": 0.22512505948543549, "learning_rate": 1.873794895385496e-05, "loss": 1.2801, "step": 6265 }, { "epoch": 1.8663042871237365, "grad_norm": 0.22817987203598022, "learning_rate": 1.873747984974975e-05, "loss": 1.2527, "step": 6266 }, { "epoch": 1.866602133323405, "grad_norm": 0.22437958419322968, "learning_rate": 1.8737010664352048e-05, "loss": 1.2597, "step": 6267 }, { "epoch": 1.8668999795230738, "grad_norm": 0.21601814031600952, "learning_rate": 1.8736541397666226e-05, "loss": 1.2644, "step": 6268 }, { "epoch": 1.8671978257227424, "grad_norm": 0.23356673121452332, "learning_rate": 1.8736072049696648e-05, "loss": 1.265, "step": 6269 }, { "epoch": 1.867495671922411, "grad_norm": 0.2248130738735199, "learning_rate": 1.8735602620447676e-05, "loss": 1.2676, "step": 6270 }, { "epoch": 1.8677935181220797, "grad_norm": 0.2332877218723297, "learning_rate": 1.873513310992369e-05, "loss": 1.2774, "step": 6271 }, { "epoch": 1.8680913643217485, "grad_norm": 0.23454421758651733, "learning_rate": 1.8734663518129045e-05, "loss": 1.2721, "step": 6272 }, { "epoch": 1.868389210521417, "grad_norm": 0.2268686145544052, "learning_rate": 1.873419384506811e-05, "loss": 1.2748, "step": 6273 }, { "epoch": 1.8686870567210856, "grad_norm": 0.23339509963989258, "learning_rate": 1.8733724090745268e-05, "loss": 1.2777, "step": 6274 }, { "epoch": 1.8689849029207544, "grad_norm": 0.24625654518604279, "learning_rate": 1.8733254255164874e-05, "loss": 1.2638, "step": 6275 }, { "epoch": 1.869282749120423, "grad_norm": 0.22551506757736206, "learning_rate": 1.8732784338331313e-05, "loss": 1.2716, "step": 6276 }, { "epoch": 1.8695805953200915, "grad_norm": 0.25346723198890686, "learning_rate": 1.8732314340248946e-05, "loss": 1.2674, "step": 6277 }, { "epoch": 1.8698784415197602, "grad_norm": 0.23279167711734772, "learning_rate": 1.8731844260922153e-05, "loss": 1.2584, "step": 6278 }, { "epoch": 1.870176287719429, "grad_norm": 0.25668370723724365, "learning_rate": 1.8731374100355302e-05, "loss": 1.2447, "step": 6279 }, { "epoch": 1.8704741339190976, "grad_norm": 0.25280889868736267, "learning_rate": 1.873090385855277e-05, "loss": 1.2716, "step": 6280 }, { "epoch": 1.870771980118766, "grad_norm": 0.24993520975112915, "learning_rate": 1.873043353551894e-05, "loss": 1.2531, "step": 6281 }, { "epoch": 1.8710698263184349, "grad_norm": 0.22116729617118835, "learning_rate": 1.872996313125817e-05, "loss": 1.2654, "step": 6282 }, { "epoch": 1.8713676725181034, "grad_norm": 0.23692280054092407, "learning_rate": 1.872949264577485e-05, "loss": 1.2655, "step": 6283 }, { "epoch": 1.871665518717772, "grad_norm": 0.2352658212184906, "learning_rate": 1.8729022079073358e-05, "loss": 1.2655, "step": 6284 }, { "epoch": 1.8719633649174408, "grad_norm": 0.21809260547161102, "learning_rate": 1.8728551431158068e-05, "loss": 1.2623, "step": 6285 }, { "epoch": 1.8722612111171095, "grad_norm": 0.23344051837921143, "learning_rate": 1.8728080702033354e-05, "loss": 1.2654, "step": 6286 }, { "epoch": 1.872559057316778, "grad_norm": 0.22276152670383453, "learning_rate": 1.8727609891703603e-05, "loss": 1.2576, "step": 6287 }, { "epoch": 1.8728569035164466, "grad_norm": 0.22614987194538116, "learning_rate": 1.8727139000173197e-05, "loss": 1.2522, "step": 6288 }, { "epoch": 1.8731547497161154, "grad_norm": 0.21375888586044312, "learning_rate": 1.8726668027446507e-05, "loss": 1.2701, "step": 6289 }, { "epoch": 1.873452595915784, "grad_norm": 0.2416151911020279, "learning_rate": 1.8726196973527922e-05, "loss": 1.281, "step": 6290 }, { "epoch": 1.8737504421154525, "grad_norm": 0.24732106924057007, "learning_rate": 1.8725725838421825e-05, "loss": 1.269, "step": 6291 }, { "epoch": 1.8740482883151213, "grad_norm": 0.23814208805561066, "learning_rate": 1.8725254622132597e-05, "loss": 1.2635, "step": 6292 }, { "epoch": 1.87434613451479, "grad_norm": 0.2380426526069641, "learning_rate": 1.8724783324664626e-05, "loss": 1.2612, "step": 6293 }, { "epoch": 1.8746439807144586, "grad_norm": 0.23904134333133698, "learning_rate": 1.8724311946022293e-05, "loss": 1.2597, "step": 6294 }, { "epoch": 1.8749418269141271, "grad_norm": 0.21786151826381683, "learning_rate": 1.8723840486209984e-05, "loss": 1.2771, "step": 6295 }, { "epoch": 1.875239673113796, "grad_norm": 0.2555582821369171, "learning_rate": 1.872336894523209e-05, "loss": 1.2563, "step": 6296 }, { "epoch": 1.8755375193134645, "grad_norm": 0.2183697521686554, "learning_rate": 1.8722897323092988e-05, "loss": 1.2642, "step": 6297 }, { "epoch": 1.875835365513133, "grad_norm": 0.22678542137145996, "learning_rate": 1.8722425619797074e-05, "loss": 1.2594, "step": 6298 }, { "epoch": 1.8761332117128018, "grad_norm": 0.2133307158946991, "learning_rate": 1.8721953835348734e-05, "loss": 1.2621, "step": 6299 }, { "epoch": 1.8764310579124706, "grad_norm": 0.23420582711696625, "learning_rate": 1.8721481969752363e-05, "loss": 1.2645, "step": 6300 }, { "epoch": 1.876728904112139, "grad_norm": 0.2238420844078064, "learning_rate": 1.8721010023012343e-05, "loss": 1.2645, "step": 6301 }, { "epoch": 1.8770267503118077, "grad_norm": 0.22619526088237762, "learning_rate": 1.872053799513307e-05, "loss": 1.2644, "step": 6302 }, { "epoch": 1.8773245965114764, "grad_norm": 0.2313038855791092, "learning_rate": 1.872006588611893e-05, "loss": 1.2555, "step": 6303 }, { "epoch": 1.877622442711145, "grad_norm": 0.22440804541110992, "learning_rate": 1.871959369597432e-05, "loss": 1.2806, "step": 6304 }, { "epoch": 1.8779202889108135, "grad_norm": 0.2729954421520233, "learning_rate": 1.871912142470364e-05, "loss": 1.269, "step": 6305 }, { "epoch": 1.8782181351104823, "grad_norm": 0.21770896017551422, "learning_rate": 1.8718649072311272e-05, "loss": 1.2796, "step": 6306 }, { "epoch": 1.878515981310151, "grad_norm": 0.22921176254749298, "learning_rate": 1.8718176638801614e-05, "loss": 1.2685, "step": 6307 }, { "epoch": 1.8788138275098196, "grad_norm": 0.2256852388381958, "learning_rate": 1.8717704124179065e-05, "loss": 1.2633, "step": 6308 }, { "epoch": 1.8791116737094882, "grad_norm": 0.2169460952281952, "learning_rate": 1.8717231528448015e-05, "loss": 1.2879, "step": 6309 }, { "epoch": 1.879409519909157, "grad_norm": 0.2836572825908661, "learning_rate": 1.8716758851612873e-05, "loss": 1.2723, "step": 6310 }, { "epoch": 1.8797073661088257, "grad_norm": 0.22437360882759094, "learning_rate": 1.8716286093678023e-05, "loss": 1.2715, "step": 6311 }, { "epoch": 1.880005212308494, "grad_norm": 0.22923672199249268, "learning_rate": 1.8715813254647872e-05, "loss": 1.2609, "step": 6312 }, { "epoch": 1.8803030585081628, "grad_norm": 0.22592346370220184, "learning_rate": 1.8715340334526812e-05, "loss": 1.2837, "step": 6313 }, { "epoch": 1.8806009047078316, "grad_norm": 0.21839545667171478, "learning_rate": 1.871486733331925e-05, "loss": 1.2558, "step": 6314 }, { "epoch": 1.8808987509075001, "grad_norm": 0.22045713663101196, "learning_rate": 1.871439425102959e-05, "loss": 1.2804, "step": 6315 }, { "epoch": 1.8811965971071687, "grad_norm": 0.23273544013500214, "learning_rate": 1.8713921087662223e-05, "loss": 1.2772, "step": 6316 }, { "epoch": 1.8814944433068375, "grad_norm": 0.22192887961864471, "learning_rate": 1.871344784322155e-05, "loss": 1.2484, "step": 6317 }, { "epoch": 1.8817922895065062, "grad_norm": 0.23370422422885895, "learning_rate": 1.871297451771199e-05, "loss": 1.2679, "step": 6318 }, { "epoch": 1.8820901357061746, "grad_norm": 0.22910785675048828, "learning_rate": 1.8712501111137933e-05, "loss": 1.2686, "step": 6319 }, { "epoch": 1.8823879819058433, "grad_norm": 0.22994323074817657, "learning_rate": 1.8712027623503785e-05, "loss": 1.2549, "step": 6320 }, { "epoch": 1.882685828105512, "grad_norm": 0.2222004234790802, "learning_rate": 1.8711554054813955e-05, "loss": 1.2761, "step": 6321 }, { "epoch": 1.8829836743051807, "grad_norm": 0.22731554508209229, "learning_rate": 1.871108040507285e-05, "loss": 1.2623, "step": 6322 }, { "epoch": 1.8832815205048492, "grad_norm": 0.22629621624946594, "learning_rate": 1.8710606674284874e-05, "loss": 1.2814, "step": 6323 }, { "epoch": 1.883579366704518, "grad_norm": 0.22215241193771362, "learning_rate": 1.8710132862454432e-05, "loss": 1.2745, "step": 6324 }, { "epoch": 1.8838772129041867, "grad_norm": 0.2270025908946991, "learning_rate": 1.8709658969585936e-05, "loss": 1.2704, "step": 6325 }, { "epoch": 1.8841750591038553, "grad_norm": 0.24180828034877777, "learning_rate": 1.8709184995683797e-05, "loss": 1.2672, "step": 6326 }, { "epoch": 1.8844729053035238, "grad_norm": 0.23187457025051117, "learning_rate": 1.870871094075242e-05, "loss": 1.2845, "step": 6327 }, { "epoch": 1.8847707515031926, "grad_norm": 0.2269025444984436, "learning_rate": 1.870823680479622e-05, "loss": 1.263, "step": 6328 }, { "epoch": 1.8850685977028612, "grad_norm": 0.21496471762657166, "learning_rate": 1.87077625878196e-05, "loss": 1.2658, "step": 6329 }, { "epoch": 1.8853664439025297, "grad_norm": 0.26027464866638184, "learning_rate": 1.8707288289826983e-05, "loss": 1.262, "step": 6330 }, { "epoch": 1.8856642901021985, "grad_norm": 0.23373070359230042, "learning_rate": 1.8706813910822776e-05, "loss": 1.2617, "step": 6331 }, { "epoch": 1.8859621363018673, "grad_norm": 0.23181870579719543, "learning_rate": 1.8706339450811396e-05, "loss": 1.2688, "step": 6332 }, { "epoch": 1.8862599825015358, "grad_norm": 0.22112888097763062, "learning_rate": 1.870586490979725e-05, "loss": 1.2649, "step": 6333 }, { "epoch": 1.8865578287012044, "grad_norm": 0.2553446590900421, "learning_rate": 1.8705390287784758e-05, "loss": 1.2698, "step": 6334 }, { "epoch": 1.8868556749008731, "grad_norm": 0.22858518362045288, "learning_rate": 1.870491558477834e-05, "loss": 1.2796, "step": 6335 }, { "epoch": 1.8871535211005417, "grad_norm": 0.2378801852464676, "learning_rate": 1.8704440800782403e-05, "loss": 1.2609, "step": 6336 }, { "epoch": 1.8874513673002102, "grad_norm": 0.22767691314220428, "learning_rate": 1.8703965935801373e-05, "loss": 1.2714, "step": 6337 }, { "epoch": 1.887749213499879, "grad_norm": 0.22173963487148285, "learning_rate": 1.8703490989839664e-05, "loss": 1.2564, "step": 6338 }, { "epoch": 1.8880470596995478, "grad_norm": 0.26524287462234497, "learning_rate": 1.8703015962901692e-05, "loss": 1.2921, "step": 6339 }, { "epoch": 1.8883449058992163, "grad_norm": 0.23315204679965973, "learning_rate": 1.8702540854991884e-05, "loss": 1.2585, "step": 6340 }, { "epoch": 1.8886427520988849, "grad_norm": 0.22819599509239197, "learning_rate": 1.8702065666114658e-05, "loss": 1.2513, "step": 6341 }, { "epoch": 1.8889405982985537, "grad_norm": 0.21942868828773499, "learning_rate": 1.870159039627443e-05, "loss": 1.2547, "step": 6342 }, { "epoch": 1.8892384444982222, "grad_norm": 0.2668503522872925, "learning_rate": 1.8701115045475627e-05, "loss": 1.2617, "step": 6343 }, { "epoch": 1.8895362906978908, "grad_norm": 0.2508845329284668, "learning_rate": 1.8700639613722667e-05, "loss": 1.2818, "step": 6344 }, { "epoch": 1.8898341368975595, "grad_norm": 0.23692795634269714, "learning_rate": 1.870016410101998e-05, "loss": 1.261, "step": 6345 }, { "epoch": 1.8901319830972283, "grad_norm": 0.30562636256217957, "learning_rate": 1.8699688507371987e-05, "loss": 1.2886, "step": 6346 }, { "epoch": 1.8904298292968968, "grad_norm": 0.23118507862091064, "learning_rate": 1.869921283278311e-05, "loss": 1.2845, "step": 6347 }, { "epoch": 1.8907276754965654, "grad_norm": 0.2323632538318634, "learning_rate": 1.8698737077257776e-05, "loss": 1.259, "step": 6348 }, { "epoch": 1.8910255216962342, "grad_norm": 0.22015786170959473, "learning_rate": 1.869826124080041e-05, "loss": 1.2548, "step": 6349 }, { "epoch": 1.8913233678959027, "grad_norm": 0.2369198501110077, "learning_rate": 1.869778532341545e-05, "loss": 1.2668, "step": 6350 }, { "epoch": 1.8916212140955713, "grad_norm": 0.22735393047332764, "learning_rate": 1.8697309325107308e-05, "loss": 1.2682, "step": 6351 }, { "epoch": 1.89191906029524, "grad_norm": 0.2261963188648224, "learning_rate": 1.869683324588042e-05, "loss": 1.2662, "step": 6352 }, { "epoch": 1.8922169064949088, "grad_norm": 0.23152479529380798, "learning_rate": 1.869635708573922e-05, "loss": 1.2597, "step": 6353 }, { "epoch": 1.8925147526945774, "grad_norm": 0.2275117188692093, "learning_rate": 1.8695880844688133e-05, "loss": 1.2793, "step": 6354 }, { "epoch": 1.892812598894246, "grad_norm": 0.2778516411781311, "learning_rate": 1.869540452273159e-05, "loss": 1.2615, "step": 6355 }, { "epoch": 1.8931104450939147, "grad_norm": 0.22918972373008728, "learning_rate": 1.869492811987402e-05, "loss": 1.2645, "step": 6356 }, { "epoch": 1.8934082912935832, "grad_norm": 0.241749107837677, "learning_rate": 1.8694451636119858e-05, "loss": 1.2554, "step": 6357 }, { "epoch": 1.8937061374932518, "grad_norm": 0.2322414517402649, "learning_rate": 1.8693975071473537e-05, "loss": 1.2894, "step": 6358 }, { "epoch": 1.8940039836929206, "grad_norm": 0.297078937292099, "learning_rate": 1.8693498425939497e-05, "loss": 1.2644, "step": 6359 }, { "epoch": 1.8943018298925893, "grad_norm": 0.24310941994190216, "learning_rate": 1.8693021699522162e-05, "loss": 1.2822, "step": 6360 }, { "epoch": 1.8945996760922579, "grad_norm": 0.24547787010669708, "learning_rate": 1.8692544892225975e-05, "loss": 1.2514, "step": 6361 }, { "epoch": 1.8948975222919264, "grad_norm": 0.23121975362300873, "learning_rate": 1.869206800405537e-05, "loss": 1.2825, "step": 6362 }, { "epoch": 1.8951953684915952, "grad_norm": 0.2518481910228729, "learning_rate": 1.869159103501478e-05, "loss": 1.2399, "step": 6363 }, { "epoch": 1.8954932146912638, "grad_norm": 0.2387373447418213, "learning_rate": 1.8691113985108652e-05, "loss": 1.2723, "step": 6364 }, { "epoch": 1.8957910608909323, "grad_norm": 0.23863528668880463, "learning_rate": 1.8690636854341414e-05, "loss": 1.2809, "step": 6365 }, { "epoch": 1.896088907090601, "grad_norm": 0.23983988165855408, "learning_rate": 1.869015964271751e-05, "loss": 1.2794, "step": 6366 }, { "epoch": 1.8963867532902698, "grad_norm": 0.22375057637691498, "learning_rate": 1.8689682350241384e-05, "loss": 1.2825, "step": 6367 }, { "epoch": 1.8966845994899384, "grad_norm": 0.2556236982345581, "learning_rate": 1.868920497691747e-05, "loss": 1.2732, "step": 6368 }, { "epoch": 1.896982445689607, "grad_norm": 0.2415783554315567, "learning_rate": 1.868872752275021e-05, "loss": 1.2763, "step": 6369 }, { "epoch": 1.8972802918892757, "grad_norm": 0.23676668107509613, "learning_rate": 1.868824998774405e-05, "loss": 1.2794, "step": 6370 }, { "epoch": 1.8975781380889443, "grad_norm": 0.22184059023857117, "learning_rate": 1.8687772371903427e-05, "loss": 1.2596, "step": 6371 }, { "epoch": 1.8978759842886128, "grad_norm": 0.23676683008670807, "learning_rate": 1.8687294675232795e-05, "loss": 1.2698, "step": 6372 }, { "epoch": 1.8981738304882816, "grad_norm": 0.23678049445152283, "learning_rate": 1.868681689773659e-05, "loss": 1.2804, "step": 6373 }, { "epoch": 1.8984716766879504, "grad_norm": 0.22727486491203308, "learning_rate": 1.8686339039419254e-05, "loss": 1.2666, "step": 6374 }, { "epoch": 1.898769522887619, "grad_norm": 0.2718203663825989, "learning_rate": 1.8685861100285242e-05, "loss": 1.2587, "step": 6375 }, { "epoch": 1.8990673690872875, "grad_norm": 0.2525746822357178, "learning_rate": 1.8685383080339e-05, "loss": 1.2758, "step": 6376 }, { "epoch": 1.8993652152869562, "grad_norm": 0.24332743883132935, "learning_rate": 1.868490497958497e-05, "loss": 1.2773, "step": 6377 }, { "epoch": 1.899663061486625, "grad_norm": 0.3178490698337555, "learning_rate": 1.86844267980276e-05, "loss": 1.2591, "step": 6378 }, { "epoch": 1.8999609076862933, "grad_norm": 0.23794011771678925, "learning_rate": 1.8683948535671344e-05, "loss": 1.2536, "step": 6379 }, { "epoch": 1.900258753885962, "grad_norm": 0.2222364842891693, "learning_rate": 1.868347019252065e-05, "loss": 1.2539, "step": 6380 }, { "epoch": 1.9005566000856309, "grad_norm": 0.22782839834690094, "learning_rate": 1.8682991768579965e-05, "loss": 1.2592, "step": 6381 }, { "epoch": 1.9008544462852994, "grad_norm": 0.2816076874732971, "learning_rate": 1.8682513263853743e-05, "loss": 1.2584, "step": 6382 }, { "epoch": 1.901152292484968, "grad_norm": 0.22128011286258698, "learning_rate": 1.868203467834644e-05, "loss": 1.27, "step": 6383 }, { "epoch": 1.9014501386846367, "grad_norm": 0.26034650206565857, "learning_rate": 1.8681556012062502e-05, "loss": 1.2779, "step": 6384 }, { "epoch": 1.9017479848843055, "grad_norm": 0.22287800908088684, "learning_rate": 1.868107726500638e-05, "loss": 1.2539, "step": 6385 }, { "epoch": 1.9020458310839738, "grad_norm": 0.2690623998641968, "learning_rate": 1.8680598437182537e-05, "loss": 1.2748, "step": 6386 }, { "epoch": 1.9023436772836426, "grad_norm": 0.22985732555389404, "learning_rate": 1.8680119528595427e-05, "loss": 1.2684, "step": 6387 }, { "epoch": 1.9026415234833114, "grad_norm": 0.26795485615730286, "learning_rate": 1.8679640539249498e-05, "loss": 1.2669, "step": 6388 }, { "epoch": 1.90293936968298, "grad_norm": 0.23416420817375183, "learning_rate": 1.8679161469149214e-05, "loss": 1.2389, "step": 6389 }, { "epoch": 1.9032372158826485, "grad_norm": 0.3261023759841919, "learning_rate": 1.8678682318299032e-05, "loss": 1.2779, "step": 6390 }, { "epoch": 1.9035350620823173, "grad_norm": 0.2772911787033081, "learning_rate": 1.8678203086703402e-05, "loss": 1.2659, "step": 6391 }, { "epoch": 1.903832908281986, "grad_norm": 0.27153968811035156, "learning_rate": 1.8677723774366792e-05, "loss": 1.2812, "step": 6392 }, { "epoch": 1.9041307544816546, "grad_norm": 0.22545918822288513, "learning_rate": 1.8677244381293655e-05, "loss": 1.2555, "step": 6393 }, { "epoch": 1.9044286006813231, "grad_norm": 0.3606754541397095, "learning_rate": 1.8676764907488455e-05, "loss": 1.271, "step": 6394 }, { "epoch": 1.904726446880992, "grad_norm": 0.23116520047187805, "learning_rate": 1.8676285352955653e-05, "loss": 1.265, "step": 6395 }, { "epoch": 1.9050242930806605, "grad_norm": 0.21856485307216644, "learning_rate": 1.8675805717699705e-05, "loss": 1.2694, "step": 6396 }, { "epoch": 1.905322139280329, "grad_norm": 0.22345422208309174, "learning_rate": 1.8675326001725086e-05, "loss": 1.2642, "step": 6397 }, { "epoch": 1.9056199854799978, "grad_norm": 0.22112628817558289, "learning_rate": 1.8674846205036243e-05, "loss": 1.2568, "step": 6398 }, { "epoch": 1.9059178316796666, "grad_norm": 0.21793977916240692, "learning_rate": 1.8674366327637652e-05, "loss": 1.2516, "step": 6399 }, { "epoch": 1.906215677879335, "grad_norm": 0.22539812326431274, "learning_rate": 1.8673886369533772e-05, "loss": 1.2735, "step": 6400 }, { "epoch": 1.9065135240790037, "grad_norm": 0.23870933055877686, "learning_rate": 1.8673406330729072e-05, "loss": 1.2719, "step": 6401 }, { "epoch": 1.9068113702786724, "grad_norm": 0.22308455407619476, "learning_rate": 1.8672926211228018e-05, "loss": 1.2672, "step": 6402 }, { "epoch": 1.907109216478341, "grad_norm": 0.2290334850549698, "learning_rate": 1.8672446011035074e-05, "loss": 1.2778, "step": 6403 }, { "epoch": 1.9074070626780095, "grad_norm": 0.22054670751094818, "learning_rate": 1.867196573015471e-05, "loss": 1.2746, "step": 6404 }, { "epoch": 1.9077049088776783, "grad_norm": 0.22691991925239563, "learning_rate": 1.8671485368591393e-05, "loss": 1.2681, "step": 6405 }, { "epoch": 1.908002755077347, "grad_norm": 0.2267318069934845, "learning_rate": 1.8671004926349592e-05, "loss": 1.2529, "step": 6406 }, { "epoch": 1.9083006012770156, "grad_norm": 0.22116418182849884, "learning_rate": 1.867052440343378e-05, "loss": 1.2737, "step": 6407 }, { "epoch": 1.9085984474766842, "grad_norm": 0.2230682671070099, "learning_rate": 1.867004379984842e-05, "loss": 1.2628, "step": 6408 }, { "epoch": 1.908896293676353, "grad_norm": 0.226077601313591, "learning_rate": 1.8669563115598e-05, "loss": 1.284, "step": 6409 }, { "epoch": 1.9091941398760215, "grad_norm": 0.21638865768909454, "learning_rate": 1.8669082350686973e-05, "loss": 1.2772, "step": 6410 }, { "epoch": 1.90949198607569, "grad_norm": 0.23876522481441498, "learning_rate": 1.866860150511982e-05, "loss": 1.2668, "step": 6411 }, { "epoch": 1.9097898322753588, "grad_norm": 0.22125433385372162, "learning_rate": 1.8668120578901022e-05, "loss": 1.2522, "step": 6412 }, { "epoch": 1.9100876784750276, "grad_norm": 0.22033749520778656, "learning_rate": 1.866763957203504e-05, "loss": 1.2667, "step": 6413 }, { "epoch": 1.9103855246746961, "grad_norm": 0.22628949582576752, "learning_rate": 1.866715848452636e-05, "loss": 1.2686, "step": 6414 }, { "epoch": 1.9106833708743647, "grad_norm": 0.25021255016326904, "learning_rate": 1.8666677316379453e-05, "loss": 1.2807, "step": 6415 }, { "epoch": 1.9109812170740335, "grad_norm": 0.2332649976015091, "learning_rate": 1.8666196067598793e-05, "loss": 1.285, "step": 6416 }, { "epoch": 1.911279063273702, "grad_norm": 0.2220938801765442, "learning_rate": 1.8665714738188866e-05, "loss": 1.257, "step": 6417 }, { "epoch": 1.9115769094733706, "grad_norm": 0.2171488106250763, "learning_rate": 1.8665233328154142e-05, "loss": 1.2579, "step": 6418 }, { "epoch": 1.9118747556730393, "grad_norm": 0.2283460795879364, "learning_rate": 1.8664751837499104e-05, "loss": 1.2701, "step": 6419 }, { "epoch": 1.912172601872708, "grad_norm": 0.2154398411512375, "learning_rate": 1.866427026622823e-05, "loss": 1.2668, "step": 6420 }, { "epoch": 1.9124704480723766, "grad_norm": 0.21827664971351624, "learning_rate": 1.8663788614346003e-05, "loss": 1.2886, "step": 6421 }, { "epoch": 1.9127682942720452, "grad_norm": 0.22104908525943756, "learning_rate": 1.8663306881856906e-05, "loss": 1.2813, "step": 6422 }, { "epoch": 1.913066140471714, "grad_norm": 0.2262624204158783, "learning_rate": 1.866282506876541e-05, "loss": 1.2605, "step": 6423 }, { "epoch": 1.9133639866713825, "grad_norm": 0.23348413407802582, "learning_rate": 1.866234317507601e-05, "loss": 1.2662, "step": 6424 }, { "epoch": 1.913661832871051, "grad_norm": 0.22634485363960266, "learning_rate": 1.8661861200793187e-05, "loss": 1.2569, "step": 6425 }, { "epoch": 1.9139596790707198, "grad_norm": 0.23128573596477509, "learning_rate": 1.866137914592142e-05, "loss": 1.2505, "step": 6426 }, { "epoch": 1.9142575252703886, "grad_norm": 0.2224595546722412, "learning_rate": 1.86608970104652e-05, "loss": 1.2555, "step": 6427 }, { "epoch": 1.9145553714700572, "grad_norm": 0.21892890334129333, "learning_rate": 1.8660414794429003e-05, "loss": 1.2743, "step": 6428 }, { "epoch": 1.9148532176697257, "grad_norm": 0.24438150227069855, "learning_rate": 1.8659932497817328e-05, "loss": 1.299, "step": 6429 }, { "epoch": 1.9151510638693945, "grad_norm": 0.21374495327472687, "learning_rate": 1.8659450120634656e-05, "loss": 1.261, "step": 6430 }, { "epoch": 1.915448910069063, "grad_norm": 0.24549736082553864, "learning_rate": 1.8658967662885472e-05, "loss": 1.2712, "step": 6431 }, { "epoch": 1.9157467562687316, "grad_norm": 0.2284165322780609, "learning_rate": 1.8658485124574274e-05, "loss": 1.2549, "step": 6432 }, { "epoch": 1.9160446024684004, "grad_norm": 0.2533683478832245, "learning_rate": 1.8658002505705543e-05, "loss": 1.2804, "step": 6433 }, { "epoch": 1.9163424486680691, "grad_norm": 0.21719850599765778, "learning_rate": 1.8657519806283772e-05, "loss": 1.2756, "step": 6434 }, { "epoch": 1.9166402948677377, "grad_norm": 0.23185434937477112, "learning_rate": 1.865703702631345e-05, "loss": 1.2539, "step": 6435 }, { "epoch": 1.9169381410674062, "grad_norm": 0.2468571960926056, "learning_rate": 1.8656554165799074e-05, "loss": 1.2479, "step": 6436 }, { "epoch": 1.917235987267075, "grad_norm": 0.24522361159324646, "learning_rate": 1.8656071224745132e-05, "loss": 1.2761, "step": 6437 }, { "epoch": 1.9175338334667438, "grad_norm": 0.22193396091461182, "learning_rate": 1.8655588203156118e-05, "loss": 1.2677, "step": 6438 }, { "epoch": 1.917831679666412, "grad_norm": 0.2236398607492447, "learning_rate": 1.8655105101036523e-05, "loss": 1.2674, "step": 6439 }, { "epoch": 1.9181295258660809, "grad_norm": 0.24252815544605255, "learning_rate": 1.865462191839085e-05, "loss": 1.2843, "step": 6440 }, { "epoch": 1.9184273720657496, "grad_norm": 0.24570755660533905, "learning_rate": 1.865413865522359e-05, "loss": 1.2619, "step": 6441 }, { "epoch": 1.9187252182654182, "grad_norm": 0.22457174956798553, "learning_rate": 1.8653655311539234e-05, "loss": 1.2696, "step": 6442 }, { "epoch": 1.9190230644650867, "grad_norm": 0.2523292303085327, "learning_rate": 1.8653171887342287e-05, "loss": 1.2577, "step": 6443 }, { "epoch": 1.9193209106647555, "grad_norm": 0.2227764129638672, "learning_rate": 1.8652688382637243e-05, "loss": 1.2581, "step": 6444 }, { "epoch": 1.9196187568644243, "grad_norm": 0.25271645188331604, "learning_rate": 1.8652204797428602e-05, "loss": 1.2701, "step": 6445 }, { "epoch": 1.9199166030640926, "grad_norm": 0.23058974742889404, "learning_rate": 1.8651721131720857e-05, "loss": 1.2532, "step": 6446 }, { "epoch": 1.9202144492637614, "grad_norm": 0.2599303126335144, "learning_rate": 1.865123738551852e-05, "loss": 1.264, "step": 6447 }, { "epoch": 1.9205122954634302, "grad_norm": 0.23858413100242615, "learning_rate": 1.8650753558826083e-05, "loss": 1.2429, "step": 6448 }, { "epoch": 1.9208101416630987, "grad_norm": 0.2499847710132599, "learning_rate": 1.865026965164805e-05, "loss": 1.2853, "step": 6449 }, { "epoch": 1.9211079878627673, "grad_norm": 0.2274659425020218, "learning_rate": 1.864978566398892e-05, "loss": 1.2637, "step": 6450 }, { "epoch": 1.921405834062436, "grad_norm": 0.24274186789989471, "learning_rate": 1.86493015958532e-05, "loss": 1.2734, "step": 6451 }, { "epoch": 1.9217036802621048, "grad_norm": 0.2235558032989502, "learning_rate": 1.864881744724539e-05, "loss": 1.2783, "step": 6452 }, { "epoch": 1.9220015264617731, "grad_norm": 0.2452191561460495, "learning_rate": 1.8648333218169998e-05, "loss": 1.2716, "step": 6453 }, { "epoch": 1.922299372661442, "grad_norm": 0.23667138814926147, "learning_rate": 1.864784890863153e-05, "loss": 1.2676, "step": 6454 }, { "epoch": 1.9225972188611107, "grad_norm": 0.23631134629249573, "learning_rate": 1.8647364518634488e-05, "loss": 1.2631, "step": 6455 }, { "epoch": 1.9228950650607792, "grad_norm": 0.2375621795654297, "learning_rate": 1.864688004818338e-05, "loss": 1.2724, "step": 6456 }, { "epoch": 1.9231929112604478, "grad_norm": 0.2185078263282776, "learning_rate": 1.8646395497282718e-05, "loss": 1.263, "step": 6457 }, { "epoch": 1.9234907574601166, "grad_norm": 0.22520285844802856, "learning_rate": 1.8645910865937e-05, "loss": 1.269, "step": 6458 }, { "epoch": 1.9237886036597853, "grad_norm": 0.24822598695755005, "learning_rate": 1.8645426154150744e-05, "loss": 1.2605, "step": 6459 }, { "epoch": 1.9240864498594539, "grad_norm": 0.26536160707473755, "learning_rate": 1.8644941361928458e-05, "loss": 1.2835, "step": 6460 }, { "epoch": 1.9243842960591224, "grad_norm": 0.2194330245256424, "learning_rate": 1.864445648927465e-05, "loss": 1.2877, "step": 6461 }, { "epoch": 1.9246821422587912, "grad_norm": 0.3066021502017975, "learning_rate": 1.8643971536193835e-05, "loss": 1.24, "step": 6462 }, { "epoch": 1.9249799884584597, "grad_norm": 0.2796516418457031, "learning_rate": 1.8643486502690517e-05, "loss": 1.2638, "step": 6463 }, { "epoch": 1.9252778346581283, "grad_norm": 0.27320075035095215, "learning_rate": 1.864300138876922e-05, "loss": 1.2736, "step": 6464 }, { "epoch": 1.925575680857797, "grad_norm": 0.24786368012428284, "learning_rate": 1.8642516194434448e-05, "loss": 1.2495, "step": 6465 }, { "epoch": 1.9258735270574658, "grad_norm": 0.23576407134532928, "learning_rate": 1.864203091969072e-05, "loss": 1.284, "step": 6466 }, { "epoch": 1.9261713732571344, "grad_norm": 0.22346080839633942, "learning_rate": 1.864154556454255e-05, "loss": 1.2677, "step": 6467 }, { "epoch": 1.926469219456803, "grad_norm": 0.23468804359436035, "learning_rate": 1.8641060128994452e-05, "loss": 1.256, "step": 6468 }, { "epoch": 1.9267670656564717, "grad_norm": 0.22079093754291534, "learning_rate": 1.8640574613050946e-05, "loss": 1.2681, "step": 6469 }, { "epoch": 1.9270649118561403, "grad_norm": 0.23220165073871613, "learning_rate": 1.8640089016716545e-05, "loss": 1.2501, "step": 6470 }, { "epoch": 1.9273627580558088, "grad_norm": 0.251128613948822, "learning_rate": 1.863960333999577e-05, "loss": 1.2586, "step": 6471 }, { "epoch": 1.9276606042554776, "grad_norm": 0.2199133038520813, "learning_rate": 1.863911758289314e-05, "loss": 1.2721, "step": 6472 }, { "epoch": 1.9279584504551464, "grad_norm": 0.22844639420509338, "learning_rate": 1.863863174541317e-05, "loss": 1.2687, "step": 6473 }, { "epoch": 1.928256296654815, "grad_norm": 0.22571970522403717, "learning_rate": 1.863814582756039e-05, "loss": 1.268, "step": 6474 }, { "epoch": 1.9285541428544835, "grad_norm": 0.21965770423412323, "learning_rate": 1.8637659829339307e-05, "loss": 1.2592, "step": 6475 }, { "epoch": 1.9288519890541522, "grad_norm": 0.21255746483802795, "learning_rate": 1.863717375075445e-05, "loss": 1.238, "step": 6476 }, { "epoch": 1.9291498352538208, "grad_norm": 0.22225402295589447, "learning_rate": 1.863668759181034e-05, "loss": 1.2421, "step": 6477 }, { "epoch": 1.9294476814534893, "grad_norm": 0.2166040986776352, "learning_rate": 1.863620135251151e-05, "loss": 1.2648, "step": 6478 }, { "epoch": 1.929745527653158, "grad_norm": 0.2302701473236084, "learning_rate": 1.863571503286247e-05, "loss": 1.2662, "step": 6479 }, { "epoch": 1.9300433738528269, "grad_norm": 0.26717764139175415, "learning_rate": 1.863522863286775e-05, "loss": 1.2623, "step": 6480 }, { "epoch": 1.9303412200524954, "grad_norm": 0.39063355326652527, "learning_rate": 1.8634742152531875e-05, "loss": 1.2732, "step": 6481 }, { "epoch": 1.930639066252164, "grad_norm": 0.27181732654571533, "learning_rate": 1.863425559185937e-05, "loss": 1.2676, "step": 6482 }, { "epoch": 1.9309369124518327, "grad_norm": 0.24872452020645142, "learning_rate": 1.863376895085477e-05, "loss": 1.2705, "step": 6483 }, { "epoch": 1.9312347586515013, "grad_norm": 0.2171843945980072, "learning_rate": 1.863328222952259e-05, "loss": 1.2668, "step": 6484 }, { "epoch": 1.9315326048511698, "grad_norm": 0.21550366282463074, "learning_rate": 1.8632795427867365e-05, "loss": 1.262, "step": 6485 }, { "epoch": 1.9318304510508386, "grad_norm": 0.2369561344385147, "learning_rate": 1.8632308545893625e-05, "loss": 1.2616, "step": 6486 }, { "epoch": 1.9321282972505074, "grad_norm": 0.24757346510887146, "learning_rate": 1.8631821583605898e-05, "loss": 1.2698, "step": 6487 }, { "epoch": 1.932426143450176, "grad_norm": 0.23112964630126953, "learning_rate": 1.8631334541008717e-05, "loss": 1.2723, "step": 6488 }, { "epoch": 1.9327239896498445, "grad_norm": 0.2273424118757248, "learning_rate": 1.863084741810661e-05, "loss": 1.2709, "step": 6489 }, { "epoch": 1.9330218358495133, "grad_norm": 0.2369772493839264, "learning_rate": 1.863036021490411e-05, "loss": 1.2515, "step": 6490 }, { "epoch": 1.9333196820491818, "grad_norm": 0.23573176562786102, "learning_rate": 1.862987293140575e-05, "loss": 1.2516, "step": 6491 }, { "epoch": 1.9336175282488504, "grad_norm": 0.23649613559246063, "learning_rate": 1.8629385567616067e-05, "loss": 1.2681, "step": 6492 }, { "epoch": 1.9339153744485191, "grad_norm": 0.22525697946548462, "learning_rate": 1.862889812353959e-05, "loss": 1.2441, "step": 6493 }, { "epoch": 1.934213220648188, "grad_norm": 0.22657667100429535, "learning_rate": 1.8628410599180858e-05, "loss": 1.2802, "step": 6494 }, { "epoch": 1.9345110668478565, "grad_norm": 0.2088024765253067, "learning_rate": 1.8627922994544408e-05, "loss": 1.2486, "step": 6495 }, { "epoch": 1.934808913047525, "grad_norm": 0.24303029477596283, "learning_rate": 1.862743530963477e-05, "loss": 1.2791, "step": 6496 }, { "epoch": 1.9351067592471938, "grad_norm": 0.22134655714035034, "learning_rate": 1.862694754445649e-05, "loss": 1.2701, "step": 6497 }, { "epoch": 1.9354046054468623, "grad_norm": 0.2253435254096985, "learning_rate": 1.86264596990141e-05, "loss": 1.2556, "step": 6498 }, { "epoch": 1.9357024516465309, "grad_norm": 0.22450168430805206, "learning_rate": 1.8625971773312138e-05, "loss": 1.259, "step": 6499 }, { "epoch": 1.9360002978461996, "grad_norm": 0.22210893034934998, "learning_rate": 1.8625483767355146e-05, "loss": 1.254, "step": 6500 }, { "epoch": 1.9360002978461996, "eval_loss": 1.3413488864898682, "eval_runtime": 22.2616, "eval_samples_per_second": 77.892, "eval_steps_per_second": 4.896, "step": 6500 }, { "epoch": 1.9362981440458684, "grad_norm": 0.21273358166217804, "learning_rate": 1.862499568114767e-05, "loss": 1.268, "step": 6501 }, { "epoch": 1.936595990245537, "grad_norm": 0.2237015664577484, "learning_rate": 1.862450751469424e-05, "loss": 1.2594, "step": 6502 }, { "epoch": 1.9368938364452055, "grad_norm": 0.22538568079471588, "learning_rate": 1.8624019267999407e-05, "loss": 1.2558, "step": 6503 }, { "epoch": 1.9371916826448743, "grad_norm": 0.22177664935588837, "learning_rate": 1.8623530941067707e-05, "loss": 1.2603, "step": 6504 }, { "epoch": 1.937489528844543, "grad_norm": 0.22911646962165833, "learning_rate": 1.8623042533903687e-05, "loss": 1.2778, "step": 6505 }, { "epoch": 1.9377873750442114, "grad_norm": 0.221419095993042, "learning_rate": 1.8622554046511895e-05, "loss": 1.2661, "step": 6506 }, { "epoch": 1.9380852212438802, "grad_norm": 0.2237306833267212, "learning_rate": 1.8622065478896863e-05, "loss": 1.2711, "step": 6507 }, { "epoch": 1.938383067443549, "grad_norm": 0.23403052985668182, "learning_rate": 1.862157683106315e-05, "loss": 1.2563, "step": 6508 }, { "epoch": 1.9386809136432175, "grad_norm": 0.22545550763607025, "learning_rate": 1.8621088103015297e-05, "loss": 1.2692, "step": 6509 }, { "epoch": 1.938978759842886, "grad_norm": 0.22285085916519165, "learning_rate": 1.8620599294757853e-05, "loss": 1.2472, "step": 6510 }, { "epoch": 1.9392766060425548, "grad_norm": 0.2223236858844757, "learning_rate": 1.862011040629536e-05, "loss": 1.2892, "step": 6511 }, { "epoch": 1.9395744522422236, "grad_norm": 0.2175566852092743, "learning_rate": 1.8619621437632374e-05, "loss": 1.2471, "step": 6512 }, { "epoch": 1.939872298441892, "grad_norm": 0.2539050877094269, "learning_rate": 1.861913238877344e-05, "loss": 1.2717, "step": 6513 }, { "epoch": 1.9401701446415607, "grad_norm": 0.24455590546131134, "learning_rate": 1.861864325972311e-05, "loss": 1.2586, "step": 6514 }, { "epoch": 1.9404679908412295, "grad_norm": 0.22603529691696167, "learning_rate": 1.861815405048593e-05, "loss": 1.2603, "step": 6515 }, { "epoch": 1.940765837040898, "grad_norm": 0.22951674461364746, "learning_rate": 1.8617664761066457e-05, "loss": 1.2715, "step": 6516 }, { "epoch": 1.9410636832405666, "grad_norm": 0.23248666524887085, "learning_rate": 1.8617175391469243e-05, "loss": 1.2792, "step": 6517 }, { "epoch": 1.9413615294402353, "grad_norm": 0.25327444076538086, "learning_rate": 1.861668594169884e-05, "loss": 1.2724, "step": 6518 }, { "epoch": 1.941659375639904, "grad_norm": 0.22093702852725983, "learning_rate": 1.8616196411759797e-05, "loss": 1.2654, "step": 6519 }, { "epoch": 1.9419572218395726, "grad_norm": 0.23170721530914307, "learning_rate": 1.861570680165668e-05, "loss": 1.2813, "step": 6520 }, { "epoch": 1.9422550680392412, "grad_norm": 0.230038583278656, "learning_rate": 1.8615217111394032e-05, "loss": 1.2554, "step": 6521 }, { "epoch": 1.94255291423891, "grad_norm": 0.24214862287044525, "learning_rate": 1.8614727340976417e-05, "loss": 1.2699, "step": 6522 }, { "epoch": 1.9428507604385785, "grad_norm": 0.23127305507659912, "learning_rate": 1.8614237490408387e-05, "loss": 1.2652, "step": 6523 }, { "epoch": 1.943148606638247, "grad_norm": 0.22470897436141968, "learning_rate": 1.8613747559694502e-05, "loss": 1.2639, "step": 6524 }, { "epoch": 1.9434464528379158, "grad_norm": 0.23439840972423553, "learning_rate": 1.8613257548839317e-05, "loss": 1.2694, "step": 6525 }, { "epoch": 1.9437442990375846, "grad_norm": 0.22015082836151123, "learning_rate": 1.86127674578474e-05, "loss": 1.2629, "step": 6526 }, { "epoch": 1.9440421452372532, "grad_norm": 0.2245972901582718, "learning_rate": 1.86122772867233e-05, "loss": 1.2758, "step": 6527 }, { "epoch": 1.9443399914369217, "grad_norm": 0.2232385128736496, "learning_rate": 1.8611787035471583e-05, "loss": 1.2594, "step": 6528 }, { "epoch": 1.9446378376365905, "grad_norm": 0.22602230310440063, "learning_rate": 1.8611296704096813e-05, "loss": 1.2651, "step": 6529 }, { "epoch": 1.944935683836259, "grad_norm": 0.2488381415605545, "learning_rate": 1.8610806292603545e-05, "loss": 1.2696, "step": 6530 }, { "epoch": 1.9452335300359276, "grad_norm": 0.22943571209907532, "learning_rate": 1.8610315800996342e-05, "loss": 1.2712, "step": 6531 }, { "epoch": 1.9455313762355964, "grad_norm": 0.23891986906528473, "learning_rate": 1.8609825229279775e-05, "loss": 1.2666, "step": 6532 }, { "epoch": 1.9458292224352651, "grad_norm": 0.29686233401298523, "learning_rate": 1.86093345774584e-05, "loss": 1.2642, "step": 6533 }, { "epoch": 1.9461270686349337, "grad_norm": 0.2327146977186203, "learning_rate": 1.8608843845536794e-05, "loss": 1.2575, "step": 6534 }, { "epoch": 1.9464249148346022, "grad_norm": 0.22958245873451233, "learning_rate": 1.8608353033519507e-05, "loss": 1.258, "step": 6535 }, { "epoch": 1.946722761034271, "grad_norm": 0.21778999269008636, "learning_rate": 1.860786214141111e-05, "loss": 1.2575, "step": 6536 }, { "epoch": 1.9470206072339395, "grad_norm": 0.24746260046958923, "learning_rate": 1.860737116921618e-05, "loss": 1.2735, "step": 6537 }, { "epoch": 1.947318453433608, "grad_norm": 0.2289373278617859, "learning_rate": 1.8606880116939273e-05, "loss": 1.2666, "step": 6538 }, { "epoch": 1.9476162996332769, "grad_norm": 0.24286894500255585, "learning_rate": 1.8606388984584968e-05, "loss": 1.2663, "step": 6539 }, { "epoch": 1.9479141458329456, "grad_norm": 0.22748291492462158, "learning_rate": 1.8605897772157826e-05, "loss": 1.2649, "step": 6540 }, { "epoch": 1.9482119920326142, "grad_norm": 0.2637212574481964, "learning_rate": 1.860540647966242e-05, "loss": 1.2526, "step": 6541 }, { "epoch": 1.9485098382322827, "grad_norm": 0.25491830706596375, "learning_rate": 1.860491510710332e-05, "loss": 1.2761, "step": 6542 }, { "epoch": 1.9488076844319515, "grad_norm": 0.23540496826171875, "learning_rate": 1.8604423654485103e-05, "loss": 1.2788, "step": 6543 }, { "epoch": 1.94910553063162, "grad_norm": 0.2479553520679474, "learning_rate": 1.8603932121812334e-05, "loss": 1.2588, "step": 6544 }, { "epoch": 1.9494033768312886, "grad_norm": 0.21612919867038727, "learning_rate": 1.860344050908959e-05, "loss": 1.2431, "step": 6545 }, { "epoch": 1.9497012230309574, "grad_norm": 0.23982878029346466, "learning_rate": 1.8602948816321446e-05, "loss": 1.2685, "step": 6546 }, { "epoch": 1.9499990692306262, "grad_norm": 0.21687816083431244, "learning_rate": 1.8602457043512475e-05, "loss": 1.2629, "step": 6547 }, { "epoch": 1.9502969154302947, "grad_norm": 0.2796175479888916, "learning_rate": 1.8601965190667252e-05, "loss": 1.2692, "step": 6548 }, { "epoch": 1.9505947616299633, "grad_norm": 0.2920897603034973, "learning_rate": 1.860147325779035e-05, "loss": 1.2651, "step": 6549 }, { "epoch": 1.950892607829632, "grad_norm": 0.22344376146793365, "learning_rate": 1.8600981244886354e-05, "loss": 1.2678, "step": 6550 }, { "epoch": 1.9511904540293006, "grad_norm": 0.5246843099594116, "learning_rate": 1.8600489151959834e-05, "loss": 1.2464, "step": 6551 }, { "epoch": 1.9514883002289691, "grad_norm": 0.2972206473350525, "learning_rate": 1.8599996979015372e-05, "loss": 1.2628, "step": 6552 }, { "epoch": 1.951786146428638, "grad_norm": 0.27029719948768616, "learning_rate": 1.8599504726057548e-05, "loss": 1.2553, "step": 6553 }, { "epoch": 1.9520839926283067, "grad_norm": 0.23671601712703705, "learning_rate": 1.859901239309094e-05, "loss": 1.2696, "step": 6554 }, { "epoch": 1.9523818388279752, "grad_norm": 0.22710232436656952, "learning_rate": 1.859851998012013e-05, "loss": 1.2607, "step": 6555 }, { "epoch": 1.9526796850276438, "grad_norm": 0.25671708583831787, "learning_rate": 1.8598027487149696e-05, "loss": 1.2546, "step": 6556 }, { "epoch": 1.9529775312273125, "grad_norm": 0.250498503446579, "learning_rate": 1.8597534914184224e-05, "loss": 1.2697, "step": 6557 }, { "epoch": 1.953275377426981, "grad_norm": 0.23408140242099762, "learning_rate": 1.8597042261228298e-05, "loss": 1.2725, "step": 6558 }, { "epoch": 1.9535732236266496, "grad_norm": 0.21811716258525848, "learning_rate": 1.8596549528286495e-05, "loss": 1.2573, "step": 6559 }, { "epoch": 1.9538710698263184, "grad_norm": 0.22456933557987213, "learning_rate": 1.8596056715363403e-05, "loss": 1.267, "step": 6560 }, { "epoch": 1.9541689160259872, "grad_norm": 0.24056674540042877, "learning_rate": 1.859556382246361e-05, "loss": 1.2706, "step": 6561 }, { "epoch": 1.9544667622256557, "grad_norm": 0.22696880996227264, "learning_rate": 1.8595070849591697e-05, "loss": 1.2634, "step": 6562 }, { "epoch": 1.9547646084253243, "grad_norm": 0.22648760676383972, "learning_rate": 1.8594577796752252e-05, "loss": 1.2745, "step": 6563 }, { "epoch": 1.955062454624993, "grad_norm": 0.22334381937980652, "learning_rate": 1.8594084663949867e-05, "loss": 1.2625, "step": 6564 }, { "epoch": 1.9553603008246616, "grad_norm": 0.22345022857189178, "learning_rate": 1.8593591451189124e-05, "loss": 1.2702, "step": 6565 }, { "epoch": 1.9556581470243302, "grad_norm": 0.22984817624092102, "learning_rate": 1.8593098158474614e-05, "loss": 1.2753, "step": 6566 }, { "epoch": 1.955955993223999, "grad_norm": 0.2276160567998886, "learning_rate": 1.8592604785810927e-05, "loss": 1.2684, "step": 6567 }, { "epoch": 1.9562538394236677, "grad_norm": 0.22295142710208893, "learning_rate": 1.859211133320265e-05, "loss": 1.2669, "step": 6568 }, { "epoch": 1.9565516856233363, "grad_norm": 0.214729905128479, "learning_rate": 1.859161780065438e-05, "loss": 1.2693, "step": 6569 }, { "epoch": 1.9568495318230048, "grad_norm": 0.21655964851379395, "learning_rate": 1.85911241881707e-05, "loss": 1.2537, "step": 6570 }, { "epoch": 1.9571473780226736, "grad_norm": 0.23342803120613098, "learning_rate": 1.8590630495756214e-05, "loss": 1.2574, "step": 6571 }, { "epoch": 1.9574452242223424, "grad_norm": 0.22633297741413116, "learning_rate": 1.8590136723415507e-05, "loss": 1.2549, "step": 6572 }, { "epoch": 1.9577430704220107, "grad_norm": 0.21886546909809113, "learning_rate": 1.8589642871153177e-05, "loss": 1.2662, "step": 6573 }, { "epoch": 1.9580409166216795, "grad_norm": 0.21713319420814514, "learning_rate": 1.8589148938973816e-05, "loss": 1.2763, "step": 6574 }, { "epoch": 1.9583387628213482, "grad_norm": 0.22585688531398773, "learning_rate": 1.858865492688202e-05, "loss": 1.2773, "step": 6575 }, { "epoch": 1.9586366090210168, "grad_norm": 0.22475779056549072, "learning_rate": 1.8588160834882385e-05, "loss": 1.2741, "step": 6576 }, { "epoch": 1.9589344552206853, "grad_norm": 0.220277339220047, "learning_rate": 1.858766666297951e-05, "loss": 1.2547, "step": 6577 }, { "epoch": 1.959232301420354, "grad_norm": 0.22024206817150116, "learning_rate": 1.8587172411177993e-05, "loss": 1.27, "step": 6578 }, { "epoch": 1.9595301476200229, "grad_norm": 0.21963857114315033, "learning_rate": 1.8586678079482427e-05, "loss": 1.2442, "step": 6579 }, { "epoch": 1.9598279938196912, "grad_norm": 0.23683059215545654, "learning_rate": 1.8586183667897417e-05, "loss": 1.2902, "step": 6580 }, { "epoch": 1.96012584001936, "grad_norm": 0.23136749863624573, "learning_rate": 1.8585689176427558e-05, "loss": 1.2657, "step": 6581 }, { "epoch": 1.9604236862190287, "grad_norm": 0.23223896324634552, "learning_rate": 1.8585194605077457e-05, "loss": 1.2644, "step": 6582 }, { "epoch": 1.9607215324186973, "grad_norm": 0.2299426645040512, "learning_rate": 1.8584699953851712e-05, "loss": 1.2734, "step": 6583 }, { "epoch": 1.9610193786183658, "grad_norm": 0.22070257365703583, "learning_rate": 1.8584205222754924e-05, "loss": 1.2609, "step": 6584 }, { "epoch": 1.9613172248180346, "grad_norm": 0.2285425364971161, "learning_rate": 1.85837104117917e-05, "loss": 1.2711, "step": 6585 }, { "epoch": 1.9616150710177034, "grad_norm": 0.2327187955379486, "learning_rate": 1.8583215520966638e-05, "loss": 1.2698, "step": 6586 }, { "epoch": 1.961912917217372, "grad_norm": 0.2283291071653366, "learning_rate": 1.858272055028435e-05, "loss": 1.2511, "step": 6587 }, { "epoch": 1.9622107634170405, "grad_norm": 0.2311006486415863, "learning_rate": 1.858222549974943e-05, "loss": 1.2515, "step": 6588 }, { "epoch": 1.9625086096167093, "grad_norm": 0.23775631189346313, "learning_rate": 1.858173036936649e-05, "loss": 1.2527, "step": 6589 }, { "epoch": 1.9628064558163778, "grad_norm": 0.2357291877269745, "learning_rate": 1.8581235159140144e-05, "loss": 1.2771, "step": 6590 }, { "epoch": 1.9631043020160464, "grad_norm": 0.22054970264434814, "learning_rate": 1.8580739869074987e-05, "loss": 1.274, "step": 6591 }, { "epoch": 1.9634021482157151, "grad_norm": 0.22896882891654968, "learning_rate": 1.8580244499175634e-05, "loss": 1.2659, "step": 6592 }, { "epoch": 1.963699994415384, "grad_norm": 0.24621307849884033, "learning_rate": 1.8579749049446695e-05, "loss": 1.2692, "step": 6593 }, { "epoch": 1.9639978406150524, "grad_norm": 0.2353813201189041, "learning_rate": 1.857925351989277e-05, "loss": 1.2431, "step": 6594 }, { "epoch": 1.964295686814721, "grad_norm": 0.21590836346149445, "learning_rate": 1.8578757910518485e-05, "loss": 1.25, "step": 6595 }, { "epoch": 1.9645935330143898, "grad_norm": 0.21876215934753418, "learning_rate": 1.8578262221328436e-05, "loss": 1.2567, "step": 6596 }, { "epoch": 1.9648913792140583, "grad_norm": 0.22484147548675537, "learning_rate": 1.8577766452327243e-05, "loss": 1.2633, "step": 6597 }, { "epoch": 1.9651892254137269, "grad_norm": 0.22392192482948303, "learning_rate": 1.8577270603519517e-05, "loss": 1.2672, "step": 6598 }, { "epoch": 1.9654870716133956, "grad_norm": 0.24420735239982605, "learning_rate": 1.8576774674909873e-05, "loss": 1.2646, "step": 6599 }, { "epoch": 1.9657849178130644, "grad_norm": 0.2220325618982315, "learning_rate": 1.857627866650292e-05, "loss": 1.2623, "step": 6600 }, { "epoch": 1.966082764012733, "grad_norm": 0.22404667735099792, "learning_rate": 1.8575782578303278e-05, "loss": 1.2443, "step": 6601 }, { "epoch": 1.9663806102124015, "grad_norm": 0.22577859461307526, "learning_rate": 1.857528641031556e-05, "loss": 1.2638, "step": 6602 }, { "epoch": 1.9666784564120703, "grad_norm": 0.23402155935764313, "learning_rate": 1.8574790162544382e-05, "loss": 1.273, "step": 6603 }, { "epoch": 1.9669763026117388, "grad_norm": 0.22014889121055603, "learning_rate": 1.8574293834994363e-05, "loss": 1.2592, "step": 6604 }, { "epoch": 1.9672741488114074, "grad_norm": 0.22578471899032593, "learning_rate": 1.857379742767012e-05, "loss": 1.2805, "step": 6605 }, { "epoch": 1.9675719950110762, "grad_norm": 0.21594074368476868, "learning_rate": 1.8573300940576268e-05, "loss": 1.2556, "step": 6606 }, { "epoch": 1.967869841210745, "grad_norm": 0.23016615211963654, "learning_rate": 1.8572804373717432e-05, "loss": 1.2615, "step": 6607 }, { "epoch": 1.9681676874104135, "grad_norm": 0.22241929173469543, "learning_rate": 1.857230772709823e-05, "loss": 1.258, "step": 6608 }, { "epoch": 1.968465533610082, "grad_norm": 0.22699490189552307, "learning_rate": 1.8571811000723282e-05, "loss": 1.2653, "step": 6609 }, { "epoch": 1.9687633798097508, "grad_norm": 0.2268591970205307, "learning_rate": 1.8571314194597208e-05, "loss": 1.248, "step": 6610 }, { "epoch": 1.9690612260094194, "grad_norm": 0.21655899286270142, "learning_rate": 1.8570817308724632e-05, "loss": 1.2607, "step": 6611 }, { "epoch": 1.969359072209088, "grad_norm": 0.23562391102313995, "learning_rate": 1.857032034311018e-05, "loss": 1.2725, "step": 6612 }, { "epoch": 1.9696569184087567, "grad_norm": 0.22721697390079498, "learning_rate": 1.856982329775847e-05, "loss": 1.2697, "step": 6613 }, { "epoch": 1.9699547646084254, "grad_norm": 0.22269241511821747, "learning_rate": 1.856932617267413e-05, "loss": 1.269, "step": 6614 }, { "epoch": 1.970252610808094, "grad_norm": 0.2151211053133011, "learning_rate": 1.856882896786178e-05, "loss": 1.273, "step": 6615 }, { "epoch": 1.9705504570077625, "grad_norm": 0.22445057332515717, "learning_rate": 1.8568331683326054e-05, "loss": 1.2638, "step": 6616 }, { "epoch": 1.9708483032074313, "grad_norm": 0.23451349139213562, "learning_rate": 1.8567834319071577e-05, "loss": 1.2591, "step": 6617 }, { "epoch": 1.9711461494070999, "grad_norm": 0.22861187160015106, "learning_rate": 1.856733687510297e-05, "loss": 1.268, "step": 6618 }, { "epoch": 1.9714439956067684, "grad_norm": 0.22528326511383057, "learning_rate": 1.8566839351424866e-05, "loss": 1.2644, "step": 6619 }, { "epoch": 1.9717418418064372, "grad_norm": 0.23457497358322144, "learning_rate": 1.8566341748041895e-05, "loss": 1.2612, "step": 6620 }, { "epoch": 1.972039688006106, "grad_norm": 0.21923547983169556, "learning_rate": 1.8565844064958684e-05, "loss": 1.2498, "step": 6621 }, { "epoch": 1.9723375342057745, "grad_norm": 0.2287691980600357, "learning_rate": 1.8565346302179864e-05, "loss": 1.2554, "step": 6622 }, { "epoch": 1.972635380405443, "grad_norm": 0.2202005833387375, "learning_rate": 1.8564848459710066e-05, "loss": 1.2664, "step": 6623 }, { "epoch": 1.9729332266051118, "grad_norm": 0.23299351334571838, "learning_rate": 1.8564350537553927e-05, "loss": 1.2563, "step": 6624 }, { "epoch": 1.9732310728047804, "grad_norm": 0.22319965064525604, "learning_rate": 1.856385253571607e-05, "loss": 1.2551, "step": 6625 }, { "epoch": 1.973528919004449, "grad_norm": 0.2123069018125534, "learning_rate": 1.8563354454201133e-05, "loss": 1.2541, "step": 6626 }, { "epoch": 1.9738267652041177, "grad_norm": 0.21652795374393463, "learning_rate": 1.8562856293013753e-05, "loss": 1.2479, "step": 6627 }, { "epoch": 1.9741246114037865, "grad_norm": 0.2229059785604477, "learning_rate": 1.8562358052158556e-05, "loss": 1.2518, "step": 6628 }, { "epoch": 1.974422457603455, "grad_norm": 0.22529351711273193, "learning_rate": 1.8561859731640192e-05, "loss": 1.262, "step": 6629 }, { "epoch": 1.9747203038031236, "grad_norm": 0.2385956346988678, "learning_rate": 1.8561361331463286e-05, "loss": 1.2775, "step": 6630 }, { "epoch": 1.9750181500027923, "grad_norm": 0.23154640197753906, "learning_rate": 1.8560862851632478e-05, "loss": 1.2669, "step": 6631 }, { "epoch": 1.975315996202461, "grad_norm": 0.2195601463317871, "learning_rate": 1.8560364292152405e-05, "loss": 1.2649, "step": 6632 }, { "epoch": 1.9756138424021294, "grad_norm": 0.21839290857315063, "learning_rate": 1.8559865653027707e-05, "loss": 1.2638, "step": 6633 }, { "epoch": 1.9759116886017982, "grad_norm": 0.2357621043920517, "learning_rate": 1.855936693426302e-05, "loss": 1.2662, "step": 6634 }, { "epoch": 1.976209534801467, "grad_norm": 0.2239905297756195, "learning_rate": 1.8558868135862992e-05, "loss": 1.2662, "step": 6635 }, { "epoch": 1.9765073810011355, "grad_norm": 0.22861617803573608, "learning_rate": 1.8558369257832255e-05, "loss": 1.2731, "step": 6636 }, { "epoch": 1.976805227200804, "grad_norm": 0.22513408958911896, "learning_rate": 1.8557870300175454e-05, "loss": 1.2796, "step": 6637 }, { "epoch": 1.9771030734004729, "grad_norm": 0.23186704516410828, "learning_rate": 1.8557371262897235e-05, "loss": 1.2547, "step": 6638 }, { "epoch": 1.9774009196001416, "grad_norm": 0.21897944808006287, "learning_rate": 1.8556872146002234e-05, "loss": 1.2791, "step": 6639 }, { "epoch": 1.97769876579981, "grad_norm": 0.22537602484226227, "learning_rate": 1.85563729494951e-05, "loss": 1.2689, "step": 6640 }, { "epoch": 1.9779966119994787, "grad_norm": 0.21599406003952026, "learning_rate": 1.8555873673380472e-05, "loss": 1.2476, "step": 6641 }, { "epoch": 1.9782944581991475, "grad_norm": 0.22699366509914398, "learning_rate": 1.8555374317663e-05, "loss": 1.2488, "step": 6642 }, { "epoch": 1.978592304398816, "grad_norm": 0.23197655379772186, "learning_rate": 1.8554874882347333e-05, "loss": 1.2647, "step": 6643 }, { "epoch": 1.9788901505984846, "grad_norm": 0.22245697677135468, "learning_rate": 1.8554375367438106e-05, "loss": 1.2524, "step": 6644 }, { "epoch": 1.9791879967981534, "grad_norm": 0.22904571890830994, "learning_rate": 1.855387577293998e-05, "loss": 1.2691, "step": 6645 }, { "epoch": 1.9794858429978222, "grad_norm": 0.23290230333805084, "learning_rate": 1.8553376098857593e-05, "loss": 1.2771, "step": 6646 }, { "epoch": 1.9797836891974905, "grad_norm": 0.22315536439418793, "learning_rate": 1.8552876345195597e-05, "loss": 1.2729, "step": 6647 }, { "epoch": 1.9800815353971593, "grad_norm": 0.21373401582241058, "learning_rate": 1.8552376511958646e-05, "loss": 1.2454, "step": 6648 }, { "epoch": 1.980379381596828, "grad_norm": 0.2315627634525299, "learning_rate": 1.8551876599151387e-05, "loss": 1.2519, "step": 6649 }, { "epoch": 1.9806772277964966, "grad_norm": 0.2316744327545166, "learning_rate": 1.855137660677847e-05, "loss": 1.2665, "step": 6650 }, { "epoch": 1.9809750739961651, "grad_norm": 0.21895499527454376, "learning_rate": 1.8550876534844545e-05, "loss": 1.2584, "step": 6651 }, { "epoch": 1.981272920195834, "grad_norm": 0.22238340973854065, "learning_rate": 1.855037638335427e-05, "loss": 1.2672, "step": 6652 }, { "epoch": 1.9815707663955027, "grad_norm": 0.22337839007377625, "learning_rate": 1.8549876152312297e-05, "loss": 1.2557, "step": 6653 }, { "epoch": 1.9818686125951712, "grad_norm": 0.22859609127044678, "learning_rate": 1.8549375841723278e-05, "loss": 1.2647, "step": 6654 }, { "epoch": 1.9821664587948398, "grad_norm": 0.2241157442331314, "learning_rate": 1.8548875451591865e-05, "loss": 1.2656, "step": 6655 }, { "epoch": 1.9824643049945085, "grad_norm": 0.22195187211036682, "learning_rate": 1.854837498192272e-05, "loss": 1.257, "step": 6656 }, { "epoch": 1.982762151194177, "grad_norm": 0.23629960417747498, "learning_rate": 1.8547874432720498e-05, "loss": 1.2572, "step": 6657 }, { "epoch": 1.9830599973938456, "grad_norm": 0.22006109356880188, "learning_rate": 1.8547373803989854e-05, "loss": 1.2616, "step": 6658 }, { "epoch": 1.9833578435935144, "grad_norm": 0.2148144692182541, "learning_rate": 1.8546873095735445e-05, "loss": 1.2821, "step": 6659 }, { "epoch": 1.9836556897931832, "grad_norm": 0.22610807418823242, "learning_rate": 1.8546372307961934e-05, "loss": 1.2518, "step": 6660 }, { "epoch": 1.9839535359928517, "grad_norm": 0.23026686906814575, "learning_rate": 1.8545871440673972e-05, "loss": 1.2849, "step": 6661 }, { "epoch": 1.9842513821925203, "grad_norm": 0.22461266815662384, "learning_rate": 1.854537049387623e-05, "loss": 1.238, "step": 6662 }, { "epoch": 1.984549228392189, "grad_norm": 0.23078782856464386, "learning_rate": 1.854486946757336e-05, "loss": 1.2672, "step": 6663 }, { "epoch": 1.9848470745918576, "grad_norm": 0.21966131031513214, "learning_rate": 1.8544368361770026e-05, "loss": 1.2612, "step": 6664 }, { "epoch": 1.9851449207915262, "grad_norm": 0.22543630003929138, "learning_rate": 1.8543867176470892e-05, "loss": 1.2782, "step": 6665 }, { "epoch": 1.985442766991195, "grad_norm": 0.24705076217651367, "learning_rate": 1.8543365911680616e-05, "loss": 1.27, "step": 6666 }, { "epoch": 1.9857406131908637, "grad_norm": 0.21681620180606842, "learning_rate": 1.854286456740387e-05, "loss": 1.2607, "step": 6667 }, { "epoch": 1.9860384593905323, "grad_norm": 0.22567439079284668, "learning_rate": 1.8542363143645315e-05, "loss": 1.2603, "step": 6668 }, { "epoch": 1.9863363055902008, "grad_norm": 0.23189802467823029, "learning_rate": 1.8541861640409613e-05, "loss": 1.2477, "step": 6669 }, { "epoch": 1.9866341517898696, "grad_norm": 0.23018525540828705, "learning_rate": 1.854136005770143e-05, "loss": 1.2612, "step": 6670 }, { "epoch": 1.9869319979895381, "grad_norm": 0.22216768562793732, "learning_rate": 1.8540858395525435e-05, "loss": 1.2805, "step": 6671 }, { "epoch": 1.9872298441892067, "grad_norm": 0.22187797725200653, "learning_rate": 1.8540356653886297e-05, "loss": 1.2497, "step": 6672 }, { "epoch": 1.9875276903888754, "grad_norm": 0.21225842833518982, "learning_rate": 1.853985483278868e-05, "loss": 1.2488, "step": 6673 }, { "epoch": 1.9878255365885442, "grad_norm": 0.2173687219619751, "learning_rate": 1.8539352932237258e-05, "loss": 1.2607, "step": 6674 }, { "epoch": 1.9881233827882128, "grad_norm": 0.23274730145931244, "learning_rate": 1.8538850952236695e-05, "loss": 1.2768, "step": 6675 }, { "epoch": 1.9884212289878813, "grad_norm": 0.22934773564338684, "learning_rate": 1.853834889279167e-05, "loss": 1.2659, "step": 6676 }, { "epoch": 1.98871907518755, "grad_norm": 0.216777041554451, "learning_rate": 1.853784675390684e-05, "loss": 1.2558, "step": 6677 }, { "epoch": 1.9890169213872186, "grad_norm": 0.22982190549373627, "learning_rate": 1.8537344535586888e-05, "loss": 1.2812, "step": 6678 }, { "epoch": 1.9893147675868872, "grad_norm": 0.21795396506786346, "learning_rate": 1.8536842237836484e-05, "loss": 1.2637, "step": 6679 }, { "epoch": 1.989612613786556, "grad_norm": 0.22477737069129944, "learning_rate": 1.8536339860660302e-05, "loss": 1.2525, "step": 6680 }, { "epoch": 1.9899104599862247, "grad_norm": 0.22227859497070312, "learning_rate": 1.8535837404063014e-05, "loss": 1.2558, "step": 6681 }, { "epoch": 1.9902083061858933, "grad_norm": 0.22554489970207214, "learning_rate": 1.8535334868049297e-05, "loss": 1.2551, "step": 6682 }, { "epoch": 1.9905061523855618, "grad_norm": 0.22632233798503876, "learning_rate": 1.853483225262382e-05, "loss": 1.2681, "step": 6683 }, { "epoch": 1.9908039985852306, "grad_norm": 0.23057745397090912, "learning_rate": 1.853432955779127e-05, "loss": 1.2752, "step": 6684 }, { "epoch": 1.9911018447848992, "grad_norm": 0.2378273457288742, "learning_rate": 1.8533826783556318e-05, "loss": 1.2526, "step": 6685 }, { "epoch": 1.9913996909845677, "grad_norm": 0.22854778170585632, "learning_rate": 1.8533323929923643e-05, "loss": 1.261, "step": 6686 }, { "epoch": 1.9916975371842365, "grad_norm": 0.22855976223945618, "learning_rate": 1.853282099689792e-05, "loss": 1.2621, "step": 6687 }, { "epoch": 1.9919953833839052, "grad_norm": 0.2171647548675537, "learning_rate": 1.853231798448383e-05, "loss": 1.2632, "step": 6688 }, { "epoch": 1.9922932295835738, "grad_norm": 0.22770820558071136, "learning_rate": 1.853181489268606e-05, "loss": 1.2534, "step": 6689 }, { "epoch": 1.9925910757832423, "grad_norm": 0.22236798703670502, "learning_rate": 1.8531311721509278e-05, "loss": 1.2623, "step": 6690 }, { "epoch": 1.9928889219829111, "grad_norm": 0.24259105324745178, "learning_rate": 1.8530808470958176e-05, "loss": 1.256, "step": 6691 }, { "epoch": 1.9931867681825797, "grad_norm": 0.2270156890153885, "learning_rate": 1.853030514103743e-05, "loss": 1.2743, "step": 6692 }, { "epoch": 1.9934846143822482, "grad_norm": 0.21757015585899353, "learning_rate": 1.852980173175173e-05, "loss": 1.27, "step": 6693 }, { "epoch": 1.993782460581917, "grad_norm": 0.22424596548080444, "learning_rate": 1.852929824310575e-05, "loss": 1.2655, "step": 6694 }, { "epoch": 1.9940803067815858, "grad_norm": 0.23281924426555634, "learning_rate": 1.8528794675104183e-05, "loss": 1.267, "step": 6695 }, { "epoch": 1.9943781529812543, "grad_norm": 0.22783441841602325, "learning_rate": 1.8528291027751705e-05, "loss": 1.2778, "step": 6696 }, { "epoch": 1.9946759991809229, "grad_norm": 0.22311203181743622, "learning_rate": 1.8527787301053013e-05, "loss": 1.2722, "step": 6697 }, { "epoch": 1.9949738453805916, "grad_norm": 0.22183965146541595, "learning_rate": 1.8527283495012788e-05, "loss": 1.2545, "step": 6698 }, { "epoch": 1.9952716915802604, "grad_norm": 0.2308036834001541, "learning_rate": 1.8526779609635714e-05, "loss": 1.263, "step": 6699 }, { "epoch": 1.9955695377799287, "grad_norm": 0.22947841882705688, "learning_rate": 1.8526275644926482e-05, "loss": 1.2741, "step": 6700 }, { "epoch": 1.9958673839795975, "grad_norm": 0.22482331097126007, "learning_rate": 1.8525771600889783e-05, "loss": 1.2663, "step": 6701 }, { "epoch": 1.9961652301792663, "grad_norm": 0.2146119326353073, "learning_rate": 1.8525267477530304e-05, "loss": 1.2715, "step": 6702 }, { "epoch": 1.9964630763789348, "grad_norm": 0.21766456961631775, "learning_rate": 1.852476327485274e-05, "loss": 1.2624, "step": 6703 }, { "epoch": 1.9967609225786034, "grad_norm": 0.21831873059272766, "learning_rate": 1.8524258992861775e-05, "loss": 1.2799, "step": 6704 }, { "epoch": 1.9970587687782722, "grad_norm": 0.22991088032722473, "learning_rate": 1.8523754631562102e-05, "loss": 1.2683, "step": 6705 }, { "epoch": 1.997356614977941, "grad_norm": 0.21384309232234955, "learning_rate": 1.8523250190958417e-05, "loss": 1.2463, "step": 6706 }, { "epoch": 1.9976544611776093, "grad_norm": 0.2210661917924881, "learning_rate": 1.8522745671055413e-05, "loss": 1.2649, "step": 6707 }, { "epoch": 1.997952307377278, "grad_norm": 0.22385594248771667, "learning_rate": 1.852224107185778e-05, "loss": 1.2675, "step": 6708 }, { "epoch": 1.9982501535769468, "grad_norm": 0.21237967908382416, "learning_rate": 1.8521736393370216e-05, "loss": 1.2692, "step": 6709 }, { "epoch": 1.9985479997766153, "grad_norm": 0.21873606741428375, "learning_rate": 1.852123163559742e-05, "loss": 1.2527, "step": 6710 }, { "epoch": 1.998845845976284, "grad_norm": 0.22441710531711578, "learning_rate": 1.8520726798544084e-05, "loss": 1.2538, "step": 6711 }, { "epoch": 1.9991436921759527, "grad_norm": 0.23059618473052979, "learning_rate": 1.8520221882214898e-05, "loss": 1.2514, "step": 6712 }, { "epoch": 1.9994415383756214, "grad_norm": 0.22172501683235168, "learning_rate": 1.8519716886614572e-05, "loss": 1.2666, "step": 6713 }, { "epoch": 1.9997393845752898, "grad_norm": 0.22111006081104279, "learning_rate": 1.8519211811747798e-05, "loss": 1.2677, "step": 6714 }, { "epoch": 2.0000372307749585, "grad_norm": 0.21776264905929565, "learning_rate": 1.8518706657619276e-05, "loss": 1.2746, "step": 6715 }, { "epoch": 2.0003350769746273, "grad_norm": 0.22761105000972748, "learning_rate": 1.8518201424233705e-05, "loss": 1.2487, "step": 6716 }, { "epoch": 2.000632923174296, "grad_norm": 0.22182194888591766, "learning_rate": 1.851769611159579e-05, "loss": 1.2601, "step": 6717 }, { "epoch": 2.0009307693739644, "grad_norm": 0.22404825687408447, "learning_rate": 1.8517190719710226e-05, "loss": 1.2548, "step": 6718 }, { "epoch": 2.001228615573633, "grad_norm": 0.2240694761276245, "learning_rate": 1.8516685248581724e-05, "loss": 1.2632, "step": 6719 }, { "epoch": 2.001526461773302, "grad_norm": 0.22302211821079254, "learning_rate": 1.8516179698214974e-05, "loss": 1.25, "step": 6720 }, { "epoch": 2.0018243079729703, "grad_norm": 0.21828891336917877, "learning_rate": 1.8515674068614692e-05, "loss": 1.2722, "step": 6721 }, { "epoch": 2.002122154172639, "grad_norm": 0.22998200356960297, "learning_rate": 1.8515168359785573e-05, "loss": 1.2611, "step": 6722 }, { "epoch": 2.002420000372308, "grad_norm": 0.21669180691242218, "learning_rate": 1.851466257173233e-05, "loss": 1.2631, "step": 6723 }, { "epoch": 2.0027178465719766, "grad_norm": 0.22056011855602264, "learning_rate": 1.8514156704459663e-05, "loss": 1.2469, "step": 6724 }, { "epoch": 2.003015692771645, "grad_norm": 0.22987788915634155, "learning_rate": 1.851365075797228e-05, "loss": 1.2668, "step": 6725 }, { "epoch": 2.0033135389713137, "grad_norm": 0.22454120218753815, "learning_rate": 1.851314473227489e-05, "loss": 1.2687, "step": 6726 }, { "epoch": 2.0036113851709825, "grad_norm": 0.22449935972690582, "learning_rate": 1.8512638627372198e-05, "loss": 1.2532, "step": 6727 }, { "epoch": 2.003909231370651, "grad_norm": 0.2300426959991455, "learning_rate": 1.851213244326892e-05, "loss": 1.2568, "step": 6728 }, { "epoch": 2.0042070775703196, "grad_norm": 0.21341969072818756, "learning_rate": 1.851162617996975e-05, "loss": 1.2561, "step": 6729 }, { "epoch": 2.0045049237699883, "grad_norm": 0.22083446383476257, "learning_rate": 1.8511119837479413e-05, "loss": 1.2485, "step": 6730 }, { "epoch": 2.004802769969657, "grad_norm": 0.2172650694847107, "learning_rate": 1.8510613415802617e-05, "loss": 1.2642, "step": 6731 }, { "epoch": 2.0051006161693254, "grad_norm": 0.21617619693279266, "learning_rate": 1.8510106914944072e-05, "loss": 1.2704, "step": 6732 }, { "epoch": 2.005398462368994, "grad_norm": 0.21791894733905792, "learning_rate": 1.8509600334908486e-05, "loss": 1.2641, "step": 6733 }, { "epoch": 2.005696308568663, "grad_norm": 0.22550319135189056, "learning_rate": 1.850909367570058e-05, "loss": 1.2607, "step": 6734 }, { "epoch": 2.0059941547683313, "grad_norm": 0.22665993869304657, "learning_rate": 1.8508586937325063e-05, "loss": 1.2568, "step": 6735 }, { "epoch": 2.006292000968, "grad_norm": 0.22282226383686066, "learning_rate": 1.8508080119786652e-05, "loss": 1.2451, "step": 6736 }, { "epoch": 2.006589847167669, "grad_norm": 0.21354874968528748, "learning_rate": 1.8507573223090062e-05, "loss": 1.2531, "step": 6737 }, { "epoch": 2.0068876933673376, "grad_norm": 0.219691663980484, "learning_rate": 1.8507066247240005e-05, "loss": 1.2587, "step": 6738 }, { "epoch": 2.007185539567006, "grad_norm": 0.22585265338420868, "learning_rate": 1.8506559192241203e-05, "loss": 1.2709, "step": 6739 }, { "epoch": 2.0074833857666747, "grad_norm": 0.2313259243965149, "learning_rate": 1.850605205809837e-05, "loss": 1.262, "step": 6740 }, { "epoch": 2.0077812319663435, "grad_norm": 0.23173195123672485, "learning_rate": 1.850554484481623e-05, "loss": 1.2685, "step": 6741 }, { "epoch": 2.008079078166012, "grad_norm": 0.24330610036849976, "learning_rate": 1.8505037552399497e-05, "loss": 1.2708, "step": 6742 }, { "epoch": 2.0083769243656806, "grad_norm": 0.22211681306362152, "learning_rate": 1.850453018085289e-05, "loss": 1.2603, "step": 6743 }, { "epoch": 2.0086747705653494, "grad_norm": 0.22535398602485657, "learning_rate": 1.850402273018113e-05, "loss": 1.2603, "step": 6744 }, { "epoch": 2.008972616765018, "grad_norm": 0.21972212195396423, "learning_rate": 1.8503515200388945e-05, "loss": 1.263, "step": 6745 }, { "epoch": 2.0092704629646865, "grad_norm": 0.22704623639583588, "learning_rate": 1.850300759148105e-05, "loss": 1.2467, "step": 6746 }, { "epoch": 2.0095683091643552, "grad_norm": 0.23253682255744934, "learning_rate": 1.8502499903462165e-05, "loss": 1.2538, "step": 6747 }, { "epoch": 2.009866155364024, "grad_norm": 0.22977900505065918, "learning_rate": 1.8501992136337022e-05, "loss": 1.282, "step": 6748 }, { "epoch": 2.0101640015636923, "grad_norm": 0.23483891785144806, "learning_rate": 1.850148429011034e-05, "loss": 1.2598, "step": 6749 }, { "epoch": 2.010461847763361, "grad_norm": 0.2185533195734024, "learning_rate": 1.850097636478685e-05, "loss": 1.244, "step": 6750 }, { "epoch": 2.01075969396303, "grad_norm": 0.2195434421300888, "learning_rate": 1.8500468360371268e-05, "loss": 1.252, "step": 6751 }, { "epoch": 2.0110575401626987, "grad_norm": 0.2338484823703766, "learning_rate": 1.8499960276868324e-05, "loss": 1.2842, "step": 6752 }, { "epoch": 2.011355386362367, "grad_norm": 0.23106524348258972, "learning_rate": 1.849945211428275e-05, "loss": 1.267, "step": 6753 }, { "epoch": 2.0116532325620358, "grad_norm": 0.229380264878273, "learning_rate": 1.8498943872619266e-05, "loss": 1.2556, "step": 6754 }, { "epoch": 2.0119510787617045, "grad_norm": 0.2258760929107666, "learning_rate": 1.8498435551882607e-05, "loss": 1.2758, "step": 6755 }, { "epoch": 2.012248924961373, "grad_norm": 0.2275063544511795, "learning_rate": 1.84979271520775e-05, "loss": 1.2667, "step": 6756 }, { "epoch": 2.0125467711610416, "grad_norm": 0.22864452004432678, "learning_rate": 1.8497418673208677e-05, "loss": 1.2592, "step": 6757 }, { "epoch": 2.0128446173607104, "grad_norm": 0.22659514844417572, "learning_rate": 1.8496910115280865e-05, "loss": 1.2486, "step": 6758 }, { "epoch": 2.013142463560379, "grad_norm": 0.2362157255411148, "learning_rate": 1.8496401478298798e-05, "loss": 1.2685, "step": 6759 }, { "epoch": 2.0134403097600475, "grad_norm": 0.2306687980890274, "learning_rate": 1.8495892762267208e-05, "loss": 1.2639, "step": 6760 }, { "epoch": 2.0137381559597163, "grad_norm": 0.23007617890834808, "learning_rate": 1.849538396719083e-05, "loss": 1.2479, "step": 6761 }, { "epoch": 2.014036002159385, "grad_norm": 0.24096724390983582, "learning_rate": 1.8494875093074392e-05, "loss": 1.257, "step": 6762 }, { "epoch": 2.0143338483590534, "grad_norm": 0.22791236639022827, "learning_rate": 1.8494366139922634e-05, "loss": 1.2679, "step": 6763 }, { "epoch": 2.014631694558722, "grad_norm": 0.2302304208278656, "learning_rate": 1.849385710774029e-05, "loss": 1.2734, "step": 6764 }, { "epoch": 2.014929540758391, "grad_norm": 0.2323758900165558, "learning_rate": 1.8493347996532097e-05, "loss": 1.255, "step": 6765 }, { "epoch": 2.0152273869580597, "grad_norm": 0.22466978430747986, "learning_rate": 1.849283880630279e-05, "loss": 1.2689, "step": 6766 }, { "epoch": 2.015525233157728, "grad_norm": 0.23928982019424438, "learning_rate": 1.8492329537057102e-05, "loss": 1.2644, "step": 6767 }, { "epoch": 2.015823079357397, "grad_norm": 0.2315400242805481, "learning_rate": 1.849182018879978e-05, "loss": 1.2554, "step": 6768 }, { "epoch": 2.0161209255570656, "grad_norm": 0.23092469573020935, "learning_rate": 1.849131076153556e-05, "loss": 1.241, "step": 6769 }, { "epoch": 2.016418771756734, "grad_norm": 0.21835115551948547, "learning_rate": 1.8490801255269176e-05, "loss": 1.2558, "step": 6770 }, { "epoch": 2.0167166179564027, "grad_norm": 0.2587580978870392, "learning_rate": 1.849029167000538e-05, "loss": 1.249, "step": 6771 }, { "epoch": 2.0170144641560714, "grad_norm": 0.2355932891368866, "learning_rate": 1.84897820057489e-05, "loss": 1.2595, "step": 6772 }, { "epoch": 2.01731231035574, "grad_norm": 0.24647144973278046, "learning_rate": 1.848927226250448e-05, "loss": 1.2677, "step": 6773 }, { "epoch": 2.0176101565554085, "grad_norm": 0.2150038331747055, "learning_rate": 1.848876244027687e-05, "loss": 1.2575, "step": 6774 }, { "epoch": 2.0179080027550773, "grad_norm": 0.248734250664711, "learning_rate": 1.8488252539070815e-05, "loss": 1.2691, "step": 6775 }, { "epoch": 2.018205848954746, "grad_norm": 0.22936883568763733, "learning_rate": 1.848774255889105e-05, "loss": 1.2564, "step": 6776 }, { "epoch": 2.018503695154415, "grad_norm": 0.23732532560825348, "learning_rate": 1.8487232499742323e-05, "loss": 1.2576, "step": 6777 }, { "epoch": 2.018801541354083, "grad_norm": 0.2249889373779297, "learning_rate": 1.8486722361629377e-05, "loss": 1.2661, "step": 6778 }, { "epoch": 2.019099387553752, "grad_norm": 0.21732540428638458, "learning_rate": 1.848621214455697e-05, "loss": 1.2626, "step": 6779 }, { "epoch": 2.0193972337534207, "grad_norm": 0.2325662225484848, "learning_rate": 1.8485701848529835e-05, "loss": 1.2554, "step": 6780 }, { "epoch": 2.019695079953089, "grad_norm": 0.2356565296649933, "learning_rate": 1.848519147355272e-05, "loss": 1.2618, "step": 6781 }, { "epoch": 2.019992926152758, "grad_norm": 0.23597142100334167, "learning_rate": 1.8484681019630386e-05, "loss": 1.2529, "step": 6782 }, { "epoch": 2.0202907723524266, "grad_norm": 0.2436504364013672, "learning_rate": 1.8484170486767574e-05, "loss": 1.2584, "step": 6783 }, { "epoch": 2.0205886185520954, "grad_norm": 0.22701208293437958, "learning_rate": 1.8483659874969034e-05, "loss": 1.2499, "step": 6784 }, { "epoch": 2.0208864647517637, "grad_norm": 0.27323466539382935, "learning_rate": 1.8483149184239515e-05, "loss": 1.2675, "step": 6785 }, { "epoch": 2.0211843109514325, "grad_norm": 0.22097352147102356, "learning_rate": 1.848263841458377e-05, "loss": 1.2548, "step": 6786 }, { "epoch": 2.0214821571511012, "grad_norm": 0.2183672934770584, "learning_rate": 1.8482127566006556e-05, "loss": 1.266, "step": 6787 }, { "epoch": 2.0217800033507696, "grad_norm": 0.2181762307882309, "learning_rate": 1.848161663851262e-05, "loss": 1.2596, "step": 6788 }, { "epoch": 2.0220778495504383, "grad_norm": 0.2295231968164444, "learning_rate": 1.8481105632106718e-05, "loss": 1.2494, "step": 6789 }, { "epoch": 2.022375695750107, "grad_norm": 0.22780396044254303, "learning_rate": 1.8480594546793602e-05, "loss": 1.2513, "step": 6790 }, { "epoch": 2.022673541949776, "grad_norm": 0.23354724049568176, "learning_rate": 1.8480083382578033e-05, "loss": 1.2711, "step": 6791 }, { "epoch": 2.022971388149444, "grad_norm": 0.2310808002948761, "learning_rate": 1.847957213946476e-05, "loss": 1.2674, "step": 6792 }, { "epoch": 2.023269234349113, "grad_norm": 0.25081607699394226, "learning_rate": 1.847906081745854e-05, "loss": 1.2685, "step": 6793 }, { "epoch": 2.0235670805487818, "grad_norm": 0.23171740770339966, "learning_rate": 1.8478549416564132e-05, "loss": 1.2666, "step": 6794 }, { "epoch": 2.02386492674845, "grad_norm": 0.25872913002967834, "learning_rate": 1.84780379367863e-05, "loss": 1.2565, "step": 6795 }, { "epoch": 2.024162772948119, "grad_norm": 0.2311062514781952, "learning_rate": 1.8477526378129794e-05, "loss": 1.2614, "step": 6796 }, { "epoch": 2.0244606191477876, "grad_norm": 0.2204236090183258, "learning_rate": 1.8477014740599376e-05, "loss": 1.2553, "step": 6797 }, { "epoch": 2.0247584653474564, "grad_norm": 0.23132838308811188, "learning_rate": 1.8476503024199806e-05, "loss": 1.2441, "step": 6798 }, { "epoch": 2.0250563115471247, "grad_norm": 0.22377510368824005, "learning_rate": 1.8475991228935847e-05, "loss": 1.2536, "step": 6799 }, { "epoch": 2.0253541577467935, "grad_norm": 0.22365237772464752, "learning_rate": 1.847547935481226e-05, "loss": 1.2769, "step": 6800 }, { "epoch": 2.0256520039464623, "grad_norm": 0.23301854729652405, "learning_rate": 1.8474967401833807e-05, "loss": 1.2469, "step": 6801 }, { "epoch": 2.0259498501461306, "grad_norm": 0.21874602138996124, "learning_rate": 1.847445537000525e-05, "loss": 1.2656, "step": 6802 }, { "epoch": 2.0262476963457994, "grad_norm": 0.23669880628585815, "learning_rate": 1.8473943259331358e-05, "loss": 1.2685, "step": 6803 }, { "epoch": 2.026545542545468, "grad_norm": 0.22757285833358765, "learning_rate": 1.8473431069816887e-05, "loss": 1.2484, "step": 6804 }, { "epoch": 2.026843388745137, "grad_norm": 0.23288923501968384, "learning_rate": 1.847291880146661e-05, "loss": 1.2371, "step": 6805 }, { "epoch": 2.0271412349448052, "grad_norm": 0.22801439464092255, "learning_rate": 1.8472406454285287e-05, "loss": 1.2581, "step": 6806 }, { "epoch": 2.027439081144474, "grad_norm": 0.21672135591506958, "learning_rate": 1.847189402827769e-05, "loss": 1.2546, "step": 6807 }, { "epoch": 2.027736927344143, "grad_norm": 0.22586698830127716, "learning_rate": 1.8471381523448583e-05, "loss": 1.258, "step": 6808 }, { "epoch": 2.028034773543811, "grad_norm": 0.23314444720745087, "learning_rate": 1.847086893980274e-05, "loss": 1.2656, "step": 6809 }, { "epoch": 2.02833261974348, "grad_norm": 0.24733838438987732, "learning_rate": 1.847035627734492e-05, "loss": 1.2514, "step": 6810 }, { "epoch": 2.0286304659431487, "grad_norm": 0.23412738740444183, "learning_rate": 1.84698435360799e-05, "loss": 1.2699, "step": 6811 }, { "epoch": 2.0289283121428174, "grad_norm": 0.2705174386501312, "learning_rate": 1.846933071601245e-05, "loss": 1.2667, "step": 6812 }, { "epoch": 2.0292261583424858, "grad_norm": 0.22393466532230377, "learning_rate": 1.8468817817147343e-05, "loss": 1.2575, "step": 6813 }, { "epoch": 2.0295240045421545, "grad_norm": 0.22294221818447113, "learning_rate": 1.846830483948934e-05, "loss": 1.261, "step": 6814 }, { "epoch": 2.0298218507418233, "grad_norm": 0.2212774157524109, "learning_rate": 1.846779178304323e-05, "loss": 1.2584, "step": 6815 }, { "epoch": 2.0301196969414916, "grad_norm": 0.29387468099594116, "learning_rate": 1.8467278647813775e-05, "loss": 1.2542, "step": 6816 }, { "epoch": 2.0304175431411604, "grad_norm": 0.2356308400630951, "learning_rate": 1.8466765433805754e-05, "loss": 1.2584, "step": 6817 }, { "epoch": 2.030715389340829, "grad_norm": 0.22963428497314453, "learning_rate": 1.8466252141023937e-05, "loss": 1.2508, "step": 6818 }, { "epoch": 2.031013235540498, "grad_norm": 0.22229976952075958, "learning_rate": 1.8465738769473107e-05, "loss": 1.2522, "step": 6819 }, { "epoch": 2.0313110817401663, "grad_norm": 0.22314177453517914, "learning_rate": 1.8465225319158033e-05, "loss": 1.2671, "step": 6820 }, { "epoch": 2.031608927939835, "grad_norm": 0.2310333549976349, "learning_rate": 1.8464711790083496e-05, "loss": 1.2392, "step": 6821 }, { "epoch": 2.031906774139504, "grad_norm": 0.22531390190124512, "learning_rate": 1.8464198182254273e-05, "loss": 1.2593, "step": 6822 }, { "epoch": 2.032204620339172, "grad_norm": 0.23093685507774353, "learning_rate": 1.8463684495675146e-05, "loss": 1.2418, "step": 6823 }, { "epoch": 2.032502466538841, "grad_norm": 0.22888115048408508, "learning_rate": 1.8463170730350887e-05, "loss": 1.2571, "step": 6824 }, { "epoch": 2.0328003127385097, "grad_norm": 0.23483408987522125, "learning_rate": 1.846265688628628e-05, "loss": 1.2541, "step": 6825 }, { "epoch": 2.0330981589381785, "grad_norm": 0.23314787447452545, "learning_rate": 1.8462142963486105e-05, "loss": 1.2576, "step": 6826 }, { "epoch": 2.033396005137847, "grad_norm": 0.2304399311542511, "learning_rate": 1.8461628961955148e-05, "loss": 1.2925, "step": 6827 }, { "epoch": 2.0336938513375156, "grad_norm": 0.22697293758392334, "learning_rate": 1.8461114881698184e-05, "loss": 1.2547, "step": 6828 }, { "epoch": 2.0339916975371843, "grad_norm": 0.22943907976150513, "learning_rate": 1.846060072272e-05, "loss": 1.2579, "step": 6829 }, { "epoch": 2.0342895437368527, "grad_norm": 0.2288867086172104, "learning_rate": 1.8460086485025382e-05, "loss": 1.2745, "step": 6830 }, { "epoch": 2.0345873899365214, "grad_norm": 0.22665607929229736, "learning_rate": 1.8459572168619105e-05, "loss": 1.2611, "step": 6831 }, { "epoch": 2.03488523613619, "grad_norm": 0.21407344937324524, "learning_rate": 1.8459057773505968e-05, "loss": 1.2605, "step": 6832 }, { "epoch": 2.035183082335859, "grad_norm": 0.23433037102222443, "learning_rate": 1.8458543299690747e-05, "loss": 1.2699, "step": 6833 }, { "epoch": 2.0354809285355273, "grad_norm": 0.2505665719509125, "learning_rate": 1.845802874717823e-05, "loss": 1.246, "step": 6834 }, { "epoch": 2.035778774735196, "grad_norm": 0.25351783633232117, "learning_rate": 1.8457514115973202e-05, "loss": 1.2471, "step": 6835 }, { "epoch": 2.036076620934865, "grad_norm": 0.22248394787311554, "learning_rate": 1.845699940608046e-05, "loss": 1.2662, "step": 6836 }, { "epoch": 2.036374467134533, "grad_norm": 0.3558101952075958, "learning_rate": 1.8456484617504787e-05, "loss": 1.2368, "step": 6837 }, { "epoch": 2.036672313334202, "grad_norm": 0.28423941135406494, "learning_rate": 1.845596975025097e-05, "loss": 1.2575, "step": 6838 }, { "epoch": 2.0369701595338707, "grad_norm": 0.2607453763484955, "learning_rate": 1.8455454804323802e-05, "loss": 1.267, "step": 6839 }, { "epoch": 2.0372680057335395, "grad_norm": 0.22378839552402496, "learning_rate": 1.8454939779728077e-05, "loss": 1.2569, "step": 6840 }, { "epoch": 2.037565851933208, "grad_norm": 0.2533341348171234, "learning_rate": 1.8454424676468582e-05, "loss": 1.2545, "step": 6841 }, { "epoch": 2.0378636981328766, "grad_norm": 0.261690616607666, "learning_rate": 1.8453909494550108e-05, "loss": 1.249, "step": 6842 }, { "epoch": 2.0381615443325454, "grad_norm": 0.23489874601364136, "learning_rate": 1.8453394233977455e-05, "loss": 1.2653, "step": 6843 }, { "epoch": 2.038459390532214, "grad_norm": 0.2246517837047577, "learning_rate": 1.8452878894755414e-05, "loss": 1.2666, "step": 6844 }, { "epoch": 2.0387572367318825, "grad_norm": 0.2273196429014206, "learning_rate": 1.8452363476888777e-05, "loss": 1.269, "step": 6845 }, { "epoch": 2.0390550829315512, "grad_norm": 0.23828180134296417, "learning_rate": 1.8451847980382344e-05, "loss": 1.2609, "step": 6846 }, { "epoch": 2.03935292913122, "grad_norm": 0.2287394106388092, "learning_rate": 1.8451332405240906e-05, "loss": 1.2601, "step": 6847 }, { "epoch": 2.0396507753308883, "grad_norm": 0.22577506303787231, "learning_rate": 1.8450816751469264e-05, "loss": 1.2557, "step": 6848 }, { "epoch": 2.039948621530557, "grad_norm": 0.23476409912109375, "learning_rate": 1.8450301019072212e-05, "loss": 1.2583, "step": 6849 }, { "epoch": 2.040246467730226, "grad_norm": 0.2375391721725464, "learning_rate": 1.844978520805455e-05, "loss": 1.2736, "step": 6850 }, { "epoch": 2.0405443139298947, "grad_norm": 0.2250700443983078, "learning_rate": 1.844926931842108e-05, "loss": 1.2566, "step": 6851 }, { "epoch": 2.040842160129563, "grad_norm": 0.21955886483192444, "learning_rate": 1.8448753350176597e-05, "loss": 1.2436, "step": 6852 }, { "epoch": 2.0411400063292318, "grad_norm": 0.2313331663608551, "learning_rate": 1.8448237303325905e-05, "loss": 1.2628, "step": 6853 }, { "epoch": 2.0414378525289005, "grad_norm": 0.2249409407377243, "learning_rate": 1.8447721177873802e-05, "loss": 1.2607, "step": 6854 }, { "epoch": 2.041735698728569, "grad_norm": 0.23042838275432587, "learning_rate": 1.8447204973825092e-05, "loss": 1.2428, "step": 6855 }, { "epoch": 2.0420335449282376, "grad_norm": 0.23948293924331665, "learning_rate": 1.844668869118458e-05, "loss": 1.2417, "step": 6856 }, { "epoch": 2.0423313911279064, "grad_norm": 0.22150684893131256, "learning_rate": 1.8446172329957063e-05, "loss": 1.2508, "step": 6857 }, { "epoch": 2.042629237327575, "grad_norm": 0.22669459879398346, "learning_rate": 1.844565589014735e-05, "loss": 1.2677, "step": 6858 }, { "epoch": 2.0429270835272435, "grad_norm": 0.24645200371742249, "learning_rate": 1.844513937176025e-05, "loss": 1.2655, "step": 6859 }, { "epoch": 2.0432249297269123, "grad_norm": 0.2355499267578125, "learning_rate": 1.844462277480056e-05, "loss": 1.24, "step": 6860 }, { "epoch": 2.043522775926581, "grad_norm": 0.22788940370082855, "learning_rate": 1.8444106099273086e-05, "loss": 1.2664, "step": 6861 }, { "epoch": 2.0438206221262494, "grad_norm": 0.22359760105609894, "learning_rate": 1.8443589345182644e-05, "loss": 1.2529, "step": 6862 }, { "epoch": 2.044118468325918, "grad_norm": 0.2289741486310959, "learning_rate": 1.8443072512534036e-05, "loss": 1.2696, "step": 6863 }, { "epoch": 2.044416314525587, "grad_norm": 0.2220015972852707, "learning_rate": 1.844255560133207e-05, "loss": 1.2524, "step": 6864 }, { "epoch": 2.0447141607252557, "grad_norm": 0.23483262956142426, "learning_rate": 1.844203861158156e-05, "loss": 1.2668, "step": 6865 }, { "epoch": 2.045012006924924, "grad_norm": 0.2522992491722107, "learning_rate": 1.8441521543287312e-05, "loss": 1.2647, "step": 6866 }, { "epoch": 2.045309853124593, "grad_norm": 0.23144623637199402, "learning_rate": 1.8441004396454136e-05, "loss": 1.2507, "step": 6867 }, { "epoch": 2.0456076993242616, "grad_norm": 0.22831657528877258, "learning_rate": 1.8440487171086844e-05, "loss": 1.2746, "step": 6868 }, { "epoch": 2.04590554552393, "grad_norm": 0.22541166841983795, "learning_rate": 1.843996986719025e-05, "loss": 1.2666, "step": 6869 }, { "epoch": 2.0462033917235987, "grad_norm": 0.2273152619600296, "learning_rate": 1.8439452484769167e-05, "loss": 1.2753, "step": 6870 }, { "epoch": 2.0465012379232674, "grad_norm": 0.2365824282169342, "learning_rate": 1.8438935023828405e-05, "loss": 1.2541, "step": 6871 }, { "epoch": 2.046799084122936, "grad_norm": 0.2436744123697281, "learning_rate": 1.8438417484372785e-05, "loss": 1.2623, "step": 6872 }, { "epoch": 2.0470969303226045, "grad_norm": 0.22438614070415497, "learning_rate": 1.8437899866407117e-05, "loss": 1.2677, "step": 6873 }, { "epoch": 2.0473947765222733, "grad_norm": 0.21692584455013275, "learning_rate": 1.8437382169936217e-05, "loss": 1.2601, "step": 6874 }, { "epoch": 2.047692622721942, "grad_norm": 0.23106350004673004, "learning_rate": 1.8436864394964905e-05, "loss": 1.2381, "step": 6875 }, { "epoch": 2.0479904689216104, "grad_norm": 0.23219044506549835, "learning_rate": 1.8436346541497993e-05, "loss": 1.2798, "step": 6876 }, { "epoch": 2.048288315121279, "grad_norm": 0.2407381385564804, "learning_rate": 1.84358286095403e-05, "loss": 1.2648, "step": 6877 }, { "epoch": 2.048586161320948, "grad_norm": 0.23298059403896332, "learning_rate": 1.8435310599096653e-05, "loss": 1.2559, "step": 6878 }, { "epoch": 2.0488840075206167, "grad_norm": 0.22220873832702637, "learning_rate": 1.843479251017186e-05, "loss": 1.2796, "step": 6879 }, { "epoch": 2.049181853720285, "grad_norm": 0.23294870555400848, "learning_rate": 1.843427434277075e-05, "loss": 1.2622, "step": 6880 }, { "epoch": 2.049479699919954, "grad_norm": 0.21809184551239014, "learning_rate": 1.843375609689814e-05, "loss": 1.2516, "step": 6881 }, { "epoch": 2.0497775461196226, "grad_norm": 0.2328861504793167, "learning_rate": 1.8433237772558856e-05, "loss": 1.2596, "step": 6882 }, { "epoch": 2.050075392319291, "grad_norm": 0.2302510291337967, "learning_rate": 1.843271936975771e-05, "loss": 1.2471, "step": 6883 }, { "epoch": 2.0503732385189597, "grad_norm": 0.24519892036914825, "learning_rate": 1.8432200888499533e-05, "loss": 1.2461, "step": 6884 }, { "epoch": 2.0506710847186285, "grad_norm": 0.2264488786458969, "learning_rate": 1.843168232878915e-05, "loss": 1.242, "step": 6885 }, { "epoch": 2.0509689309182972, "grad_norm": 0.2383405566215515, "learning_rate": 1.8431163690631383e-05, "loss": 1.2608, "step": 6886 }, { "epoch": 2.0512667771179656, "grad_norm": 0.22933566570281982, "learning_rate": 1.8430644974031057e-05, "loss": 1.2852, "step": 6887 }, { "epoch": 2.0515646233176343, "grad_norm": 0.2428789883852005, "learning_rate": 1.8430126178992996e-05, "loss": 1.2723, "step": 6888 }, { "epoch": 2.051862469517303, "grad_norm": 0.2358553111553192, "learning_rate": 1.8429607305522036e-05, "loss": 1.239, "step": 6889 }, { "epoch": 2.0521603157169714, "grad_norm": 0.22229686379432678, "learning_rate": 1.8429088353622994e-05, "loss": 1.2562, "step": 6890 }, { "epoch": 2.05245816191664, "grad_norm": 0.2688424289226532, "learning_rate": 1.8428569323300706e-05, "loss": 1.2959, "step": 6891 }, { "epoch": 2.052756008116309, "grad_norm": 0.22803576290607452, "learning_rate": 1.8428050214559992e-05, "loss": 1.2387, "step": 6892 }, { "epoch": 2.0530538543159778, "grad_norm": 0.23481722176074982, "learning_rate": 1.842753102740569e-05, "loss": 1.2629, "step": 6893 }, { "epoch": 2.053351700515646, "grad_norm": 0.23635835945606232, "learning_rate": 1.8427011761842627e-05, "loss": 1.2751, "step": 6894 }, { "epoch": 2.053649546715315, "grad_norm": 0.21708984673023224, "learning_rate": 1.8426492417875636e-05, "loss": 1.2518, "step": 6895 }, { "epoch": 2.0539473929149836, "grad_norm": 0.2377351075410843, "learning_rate": 1.8425972995509545e-05, "loss": 1.2634, "step": 6896 }, { "epoch": 2.054245239114652, "grad_norm": 0.2265872359275818, "learning_rate": 1.8425453494749193e-05, "loss": 1.2558, "step": 6897 }, { "epoch": 2.0545430853143207, "grad_norm": 0.2197393923997879, "learning_rate": 1.8424933915599408e-05, "loss": 1.2535, "step": 6898 }, { "epoch": 2.0548409315139895, "grad_norm": 0.2214498519897461, "learning_rate": 1.8424414258065028e-05, "loss": 1.2559, "step": 6899 }, { "epoch": 2.0551387777136583, "grad_norm": 0.23770642280578613, "learning_rate": 1.842389452215088e-05, "loss": 1.2604, "step": 6900 }, { "epoch": 2.0554366239133266, "grad_norm": 0.2686885595321655, "learning_rate": 1.8423374707861808e-05, "loss": 1.2723, "step": 6901 }, { "epoch": 2.0557344701129954, "grad_norm": 0.24521741271018982, "learning_rate": 1.8422854815202645e-05, "loss": 1.2574, "step": 6902 }, { "epoch": 2.056032316312664, "grad_norm": 0.233657568693161, "learning_rate": 1.842233484417823e-05, "loss": 1.2642, "step": 6903 }, { "epoch": 2.0563301625123325, "grad_norm": 0.2279568612575531, "learning_rate": 1.84218147947934e-05, "loss": 1.2602, "step": 6904 }, { "epoch": 2.0566280087120012, "grad_norm": 0.22631187736988068, "learning_rate": 1.842129466705299e-05, "loss": 1.2694, "step": 6905 }, { "epoch": 2.05692585491167, "grad_norm": 0.2367851287126541, "learning_rate": 1.8420774460961846e-05, "loss": 1.2638, "step": 6906 }, { "epoch": 2.057223701111339, "grad_norm": 0.23497571051120758, "learning_rate": 1.84202541765248e-05, "loss": 1.2538, "step": 6907 }, { "epoch": 2.057521547311007, "grad_norm": 0.22271881997585297, "learning_rate": 1.84197338137467e-05, "loss": 1.2397, "step": 6908 }, { "epoch": 2.057819393510676, "grad_norm": 0.2256689965724945, "learning_rate": 1.841921337263238e-05, "loss": 1.274, "step": 6909 }, { "epoch": 2.0581172397103447, "grad_norm": 0.2230585217475891, "learning_rate": 1.8418692853186687e-05, "loss": 1.2515, "step": 6910 }, { "epoch": 2.0584150859100134, "grad_norm": 0.23039759695529938, "learning_rate": 1.8418172255414463e-05, "loss": 1.257, "step": 6911 }, { "epoch": 2.0587129321096818, "grad_norm": 0.22781473398208618, "learning_rate": 1.8417651579320555e-05, "loss": 1.2669, "step": 6912 }, { "epoch": 2.0590107783093505, "grad_norm": 0.23078389465808868, "learning_rate": 1.8417130824909797e-05, "loss": 1.2602, "step": 6913 }, { "epoch": 2.0593086245090193, "grad_norm": 0.23632900416851044, "learning_rate": 1.841660999218705e-05, "loss": 1.2497, "step": 6914 }, { "epoch": 2.0596064707086876, "grad_norm": 0.22319194674491882, "learning_rate": 1.8416089081157142e-05, "loss": 1.2702, "step": 6915 }, { "epoch": 2.0599043169083564, "grad_norm": 0.23077838122844696, "learning_rate": 1.8415568091824934e-05, "loss": 1.2526, "step": 6916 }, { "epoch": 2.060202163108025, "grad_norm": 0.24915198981761932, "learning_rate": 1.8415047024195263e-05, "loss": 1.2477, "step": 6917 }, { "epoch": 2.060500009307694, "grad_norm": 0.22520673274993896, "learning_rate": 1.8414525878272986e-05, "loss": 1.2594, "step": 6918 }, { "epoch": 2.0607978555073623, "grad_norm": 0.22590255737304688, "learning_rate": 1.8414004654062943e-05, "loss": 1.2708, "step": 6919 }, { "epoch": 2.061095701707031, "grad_norm": 0.22848273813724518, "learning_rate": 1.8413483351569986e-05, "loss": 1.2622, "step": 6920 }, { "epoch": 2.0613935479067, "grad_norm": 0.22753171622753143, "learning_rate": 1.8412961970798974e-05, "loss": 1.2611, "step": 6921 }, { "epoch": 2.061691394106368, "grad_norm": 0.23591996729373932, "learning_rate": 1.8412440511754745e-05, "loss": 1.2688, "step": 6922 }, { "epoch": 2.061989240306037, "grad_norm": 0.2515062987804413, "learning_rate": 1.8411918974442154e-05, "loss": 1.2748, "step": 6923 }, { "epoch": 2.0622870865057057, "grad_norm": 0.41803601384162903, "learning_rate": 1.841139735886606e-05, "loss": 1.2672, "step": 6924 }, { "epoch": 2.0625849327053745, "grad_norm": 0.2785837650299072, "learning_rate": 1.841087566503131e-05, "loss": 1.255, "step": 6925 }, { "epoch": 2.062882778905043, "grad_norm": 0.28804776072502136, "learning_rate": 1.8410353892942757e-05, "loss": 1.2779, "step": 6926 }, { "epoch": 2.0631806251047116, "grad_norm": 0.23044361174106598, "learning_rate": 1.8409832042605258e-05, "loss": 1.244, "step": 6927 }, { "epoch": 2.0634784713043803, "grad_norm": 0.225747749209404, "learning_rate": 1.840931011402367e-05, "loss": 1.2545, "step": 6928 }, { "epoch": 2.0637763175040487, "grad_norm": 0.26567065715789795, "learning_rate": 1.8408788107202844e-05, "loss": 1.2725, "step": 6929 }, { "epoch": 2.0640741637037174, "grad_norm": 0.25946280360221863, "learning_rate": 1.8408266022147643e-05, "loss": 1.2642, "step": 6930 }, { "epoch": 2.064372009903386, "grad_norm": 0.23841392993927002, "learning_rate": 1.8407743858862915e-05, "loss": 1.2548, "step": 6931 }, { "epoch": 2.064669856103055, "grad_norm": 0.2300664484500885, "learning_rate": 1.840722161735353e-05, "loss": 1.2523, "step": 6932 }, { "epoch": 2.0649677023027233, "grad_norm": 0.23629789054393768, "learning_rate": 1.840669929762434e-05, "loss": 1.2669, "step": 6933 }, { "epoch": 2.065265548502392, "grad_norm": 0.23946943879127502, "learning_rate": 1.8406176899680203e-05, "loss": 1.2745, "step": 6934 }, { "epoch": 2.065563394702061, "grad_norm": 0.23306971788406372, "learning_rate": 1.8405654423525984e-05, "loss": 1.2748, "step": 6935 }, { "epoch": 2.065861240901729, "grad_norm": 0.21573369204998016, "learning_rate": 1.840513186916654e-05, "loss": 1.2631, "step": 6936 }, { "epoch": 2.066159087101398, "grad_norm": 0.23127424716949463, "learning_rate": 1.8404609236606736e-05, "loss": 1.271, "step": 6937 }, { "epoch": 2.0664569333010667, "grad_norm": 0.234597310423851, "learning_rate": 1.8404086525851434e-05, "loss": 1.2648, "step": 6938 }, { "epoch": 2.0667547795007355, "grad_norm": 0.22970066964626312, "learning_rate": 1.8403563736905498e-05, "loss": 1.2408, "step": 6939 }, { "epoch": 2.067052625700404, "grad_norm": 0.22580933570861816, "learning_rate": 1.840304086977379e-05, "loss": 1.2748, "step": 6940 }, { "epoch": 2.0673504719000726, "grad_norm": 0.2299324870109558, "learning_rate": 1.8402517924461173e-05, "loss": 1.2589, "step": 6941 }, { "epoch": 2.0676483180997414, "grad_norm": 0.2186276614665985, "learning_rate": 1.840199490097251e-05, "loss": 1.2525, "step": 6942 }, { "epoch": 2.0679461642994097, "grad_norm": 0.2327873557806015, "learning_rate": 1.840147179931268e-05, "loss": 1.2632, "step": 6943 }, { "epoch": 2.0682440104990785, "grad_norm": 0.23202918469905853, "learning_rate": 1.8400948619486538e-05, "loss": 1.2711, "step": 6944 }, { "epoch": 2.0685418566987472, "grad_norm": 0.236239954829216, "learning_rate": 1.8400425361498953e-05, "loss": 1.2541, "step": 6945 }, { "epoch": 2.068839702898416, "grad_norm": 0.21637538075447083, "learning_rate": 1.8399902025354798e-05, "loss": 1.2592, "step": 6946 }, { "epoch": 2.0691375490980843, "grad_norm": 0.22838181257247925, "learning_rate": 1.839937861105894e-05, "loss": 1.2749, "step": 6947 }, { "epoch": 2.069435395297753, "grad_norm": 0.2435142993927002, "learning_rate": 1.839885511861625e-05, "loss": 1.2732, "step": 6948 }, { "epoch": 2.069733241497422, "grad_norm": 0.23763881623744965, "learning_rate": 1.8398331548031595e-05, "loss": 1.2604, "step": 6949 }, { "epoch": 2.07003108769709, "grad_norm": 0.2310408502817154, "learning_rate": 1.8397807899309847e-05, "loss": 1.2513, "step": 6950 }, { "epoch": 2.070328933896759, "grad_norm": 0.22499972581863403, "learning_rate": 1.839728417245588e-05, "loss": 1.2467, "step": 6951 }, { "epoch": 2.0706267800964278, "grad_norm": 0.22741033136844635, "learning_rate": 1.8396760367474565e-05, "loss": 1.2614, "step": 6952 }, { "epoch": 2.0709246262960965, "grad_norm": 0.23971451818943024, "learning_rate": 1.839623648437078e-05, "loss": 1.2663, "step": 6953 }, { "epoch": 2.071222472495765, "grad_norm": 0.22412295639514923, "learning_rate": 1.8395712523149392e-05, "loss": 1.2601, "step": 6954 }, { "epoch": 2.0715203186954336, "grad_norm": 0.22380006313323975, "learning_rate": 1.839518848381528e-05, "loss": 1.2614, "step": 6955 }, { "epoch": 2.0718181648951024, "grad_norm": 0.2436816543340683, "learning_rate": 1.8394664366373317e-05, "loss": 1.2687, "step": 6956 }, { "epoch": 2.0721160110947707, "grad_norm": 0.24516181647777557, "learning_rate": 1.8394140170828382e-05, "loss": 1.2494, "step": 6957 }, { "epoch": 2.0724138572944395, "grad_norm": 0.21680797636508942, "learning_rate": 1.8393615897185352e-05, "loss": 1.2705, "step": 6958 }, { "epoch": 2.0727117034941083, "grad_norm": 0.22413359582424164, "learning_rate": 1.8393091545449103e-05, "loss": 1.2561, "step": 6959 }, { "epoch": 2.073009549693777, "grad_norm": 0.21973325312137604, "learning_rate": 1.8392567115624514e-05, "loss": 1.2504, "step": 6960 }, { "epoch": 2.0733073958934454, "grad_norm": 0.2217159867286682, "learning_rate": 1.8392042607716467e-05, "loss": 1.2565, "step": 6961 }, { "epoch": 2.073605242093114, "grad_norm": 0.22410368919372559, "learning_rate": 1.839151802172984e-05, "loss": 1.2835, "step": 6962 }, { "epoch": 2.073903088292783, "grad_norm": 0.23028461635112762, "learning_rate": 1.839099335766951e-05, "loss": 1.2482, "step": 6963 }, { "epoch": 2.0742009344924517, "grad_norm": 0.23592053353786469, "learning_rate": 1.8390468615540366e-05, "loss": 1.2453, "step": 6964 }, { "epoch": 2.07449878069212, "grad_norm": 0.23165811598300934, "learning_rate": 1.8389943795347284e-05, "loss": 1.2465, "step": 6965 }, { "epoch": 2.074796626891789, "grad_norm": 0.24902459979057312, "learning_rate": 1.8389418897095145e-05, "loss": 1.2702, "step": 6966 }, { "epoch": 2.0750944730914576, "grad_norm": 0.24945801496505737, "learning_rate": 1.8388893920788843e-05, "loss": 1.2578, "step": 6967 }, { "epoch": 2.075392319291126, "grad_norm": 0.22417689859867096, "learning_rate": 1.8388368866433252e-05, "loss": 1.2624, "step": 6968 }, { "epoch": 2.0756901654907947, "grad_norm": 0.24911852180957794, "learning_rate": 1.8387843734033258e-05, "loss": 1.2526, "step": 6969 }, { "epoch": 2.0759880116904634, "grad_norm": 0.23068749904632568, "learning_rate": 1.8387318523593754e-05, "loss": 1.2449, "step": 6970 }, { "epoch": 2.0762858578901318, "grad_norm": 0.2273305356502533, "learning_rate": 1.838679323511962e-05, "loss": 1.2681, "step": 6971 }, { "epoch": 2.0765837040898005, "grad_norm": 0.22893273830413818, "learning_rate": 1.8386267868615747e-05, "loss": 1.265, "step": 6972 }, { "epoch": 2.0768815502894693, "grad_norm": 0.23285570740699768, "learning_rate": 1.8385742424087022e-05, "loss": 1.2615, "step": 6973 }, { "epoch": 2.077179396489138, "grad_norm": 0.24007824063301086, "learning_rate": 1.838521690153833e-05, "loss": 1.2699, "step": 6974 }, { "epoch": 2.0774772426888064, "grad_norm": 0.22753021121025085, "learning_rate": 1.8384691300974563e-05, "loss": 1.261, "step": 6975 }, { "epoch": 2.077775088888475, "grad_norm": 0.2304292917251587, "learning_rate": 1.8384165622400613e-05, "loss": 1.255, "step": 6976 }, { "epoch": 2.078072935088144, "grad_norm": 0.24333694577217102, "learning_rate": 1.838363986582137e-05, "loss": 1.2575, "step": 6977 }, { "epoch": 2.0783707812878127, "grad_norm": 0.22278420627117157, "learning_rate": 1.838311403124172e-05, "loss": 1.2443, "step": 6978 }, { "epoch": 2.078668627487481, "grad_norm": 0.22980521619319916, "learning_rate": 1.8382588118666564e-05, "loss": 1.2649, "step": 6979 }, { "epoch": 2.07896647368715, "grad_norm": 0.22375096380710602, "learning_rate": 1.838206212810079e-05, "loss": 1.249, "step": 6980 }, { "epoch": 2.0792643198868186, "grad_norm": 0.21912263333797455, "learning_rate": 1.8381536059549298e-05, "loss": 1.2602, "step": 6981 }, { "epoch": 2.079562166086487, "grad_norm": 0.23730771243572235, "learning_rate": 1.838100991301697e-05, "loss": 1.2677, "step": 6982 }, { "epoch": 2.0798600122861557, "grad_norm": 0.21935300529003143, "learning_rate": 1.8380483688508713e-05, "loss": 1.2646, "step": 6983 }, { "epoch": 2.0801578584858245, "grad_norm": 0.22099409997463226, "learning_rate": 1.8379957386029417e-05, "loss": 1.2589, "step": 6984 }, { "epoch": 2.0804557046854932, "grad_norm": 0.2275436371564865, "learning_rate": 1.8379431005583977e-05, "loss": 1.2545, "step": 6985 }, { "epoch": 2.0807535508851616, "grad_norm": 0.22090966999530792, "learning_rate": 1.83789045471773e-05, "loss": 1.2587, "step": 6986 }, { "epoch": 2.0810513970848303, "grad_norm": 0.23452916741371155, "learning_rate": 1.8378378010814276e-05, "loss": 1.2536, "step": 6987 }, { "epoch": 2.081349243284499, "grad_norm": 0.24162159860134125, "learning_rate": 1.8377851396499804e-05, "loss": 1.2646, "step": 6988 }, { "epoch": 2.0816470894841674, "grad_norm": 0.22596555948257446, "learning_rate": 1.8377324704238785e-05, "loss": 1.2561, "step": 6989 }, { "epoch": 2.081944935683836, "grad_norm": 0.23950867354869843, "learning_rate": 1.8376797934036118e-05, "loss": 1.2505, "step": 6990 }, { "epoch": 2.082242781883505, "grad_norm": 0.23692114651203156, "learning_rate": 1.8376271085896706e-05, "loss": 1.254, "step": 6991 }, { "epoch": 2.0825406280831738, "grad_norm": 0.2381732314825058, "learning_rate": 1.8375744159825452e-05, "loss": 1.276, "step": 6992 }, { "epoch": 2.082838474282842, "grad_norm": 0.22579234838485718, "learning_rate": 1.8375217155827255e-05, "loss": 1.2728, "step": 6993 }, { "epoch": 2.083136320482511, "grad_norm": 0.22766689956188202, "learning_rate": 1.837469007390702e-05, "loss": 1.2296, "step": 6994 }, { "epoch": 2.0834341666821796, "grad_norm": 0.2368287295103073, "learning_rate": 1.8374162914069652e-05, "loss": 1.2648, "step": 6995 }, { "epoch": 2.083732012881848, "grad_norm": 0.21219930052757263, "learning_rate": 1.8373635676320052e-05, "loss": 1.2447, "step": 6996 }, { "epoch": 2.0840298590815167, "grad_norm": 0.23508693277835846, "learning_rate": 1.8373108360663126e-05, "loss": 1.2649, "step": 6997 }, { "epoch": 2.0843277052811855, "grad_norm": 0.23294752836227417, "learning_rate": 1.8372580967103787e-05, "loss": 1.246, "step": 6998 }, { "epoch": 2.0846255514808543, "grad_norm": 0.2368563860654831, "learning_rate": 1.8372053495646934e-05, "loss": 1.2609, "step": 6999 }, { "epoch": 2.0849233976805226, "grad_norm": 0.25801020860671997, "learning_rate": 1.8371525946297474e-05, "loss": 1.2652, "step": 7000 }, { "epoch": 2.0849233976805226, "eval_loss": 1.3425718545913696, "eval_runtime": 21.1491, "eval_samples_per_second": 81.989, "eval_steps_per_second": 5.154, "step": 7000 }, { "epoch": 2.0852212438801914, "grad_norm": 0.23416325449943542, "learning_rate": 1.8370998319060322e-05, "loss": 1.2779, "step": 7001 }, { "epoch": 2.08551909007986, "grad_norm": 0.22796592116355896, "learning_rate": 1.837047061394038e-05, "loss": 1.2707, "step": 7002 }, { "epoch": 2.0858169362795285, "grad_norm": 0.24770605564117432, "learning_rate": 1.8369942830942567e-05, "loss": 1.2526, "step": 7003 }, { "epoch": 2.0861147824791972, "grad_norm": 0.23605570197105408, "learning_rate": 1.8369414970071783e-05, "loss": 1.2479, "step": 7004 }, { "epoch": 2.086412628678866, "grad_norm": 0.23263491690158844, "learning_rate": 1.8368887031332945e-05, "loss": 1.2474, "step": 7005 }, { "epoch": 2.086710474878535, "grad_norm": 0.24133925139904022, "learning_rate": 1.836835901473096e-05, "loss": 1.2501, "step": 7006 }, { "epoch": 2.087008321078203, "grad_norm": 0.20939506590366364, "learning_rate": 1.836783092027075e-05, "loss": 1.2444, "step": 7007 }, { "epoch": 2.087306167277872, "grad_norm": 0.22284309566020966, "learning_rate": 1.8367302747957216e-05, "loss": 1.2544, "step": 7008 }, { "epoch": 2.0876040134775407, "grad_norm": 0.24253371357917786, "learning_rate": 1.8366774497795284e-05, "loss": 1.2658, "step": 7009 }, { "epoch": 2.087901859677209, "grad_norm": 0.23892003297805786, "learning_rate": 1.8366246169789858e-05, "loss": 1.2367, "step": 7010 }, { "epoch": 2.0881997058768778, "grad_norm": 0.22930456697940826, "learning_rate": 1.8365717763945862e-05, "loss": 1.2601, "step": 7011 }, { "epoch": 2.0884975520765465, "grad_norm": 0.2401750683784485, "learning_rate": 1.836518928026821e-05, "loss": 1.2688, "step": 7012 }, { "epoch": 2.0887953982762153, "grad_norm": 0.2856822609901428, "learning_rate": 1.8364660718761816e-05, "loss": 1.2573, "step": 7013 }, { "epoch": 2.0890932444758836, "grad_norm": 0.2995435297489166, "learning_rate": 1.83641320794316e-05, "loss": 1.2656, "step": 7014 }, { "epoch": 2.0893910906755524, "grad_norm": 0.24359357357025146, "learning_rate": 1.836360336228248e-05, "loss": 1.2537, "step": 7015 }, { "epoch": 2.089688936875221, "grad_norm": 0.5386900901794434, "learning_rate": 1.8363074567319374e-05, "loss": 1.2737, "step": 7016 }, { "epoch": 2.0899867830748895, "grad_norm": 0.2589731216430664, "learning_rate": 1.8362545694547202e-05, "loss": 1.2579, "step": 7017 }, { "epoch": 2.0902846292745583, "grad_norm": 0.273444801568985, "learning_rate": 1.836201674397089e-05, "loss": 1.2638, "step": 7018 }, { "epoch": 2.090582475474227, "grad_norm": 0.23925411701202393, "learning_rate": 1.8361487715595353e-05, "loss": 1.2682, "step": 7019 }, { "epoch": 2.090880321673896, "grad_norm": 0.2418997436761856, "learning_rate": 1.8360958609425512e-05, "loss": 1.2489, "step": 7020 }, { "epoch": 2.091178167873564, "grad_norm": 0.2647736072540283, "learning_rate": 1.8360429425466297e-05, "loss": 1.2583, "step": 7021 }, { "epoch": 2.091476014073233, "grad_norm": 0.2629542350769043, "learning_rate": 1.8359900163722622e-05, "loss": 1.259, "step": 7022 }, { "epoch": 2.0917738602729017, "grad_norm": 0.24343015253543854, "learning_rate": 1.835937082419942e-05, "loss": 1.2715, "step": 7023 }, { "epoch": 2.09207170647257, "grad_norm": 0.21744422614574432, "learning_rate": 1.835884140690161e-05, "loss": 1.2327, "step": 7024 }, { "epoch": 2.092369552672239, "grad_norm": 0.2299731820821762, "learning_rate": 1.8358311911834122e-05, "loss": 1.2448, "step": 7025 }, { "epoch": 2.0926673988719076, "grad_norm": 0.25502943992614746, "learning_rate": 1.8357782339001877e-05, "loss": 1.2617, "step": 7026 }, { "epoch": 2.0929652450715763, "grad_norm": 0.22891417145729065, "learning_rate": 1.835725268840981e-05, "loss": 1.2534, "step": 7027 }, { "epoch": 2.0932630912712447, "grad_norm": 0.2278069704771042, "learning_rate": 1.835672296006284e-05, "loss": 1.2672, "step": 7028 }, { "epoch": 2.0935609374709134, "grad_norm": 0.22990724444389343, "learning_rate": 1.8356193153965897e-05, "loss": 1.2635, "step": 7029 }, { "epoch": 2.093858783670582, "grad_norm": 0.23280049860477448, "learning_rate": 1.8355663270123916e-05, "loss": 1.2564, "step": 7030 }, { "epoch": 2.094156629870251, "grad_norm": 0.2301468402147293, "learning_rate": 1.8355133308541827e-05, "loss": 1.2705, "step": 7031 }, { "epoch": 2.0944544760699193, "grad_norm": 0.227765753865242, "learning_rate": 1.8354603269224554e-05, "loss": 1.2808, "step": 7032 }, { "epoch": 2.094752322269588, "grad_norm": 0.22093096375465393, "learning_rate": 1.8354073152177032e-05, "loss": 1.2482, "step": 7033 }, { "epoch": 2.095050168469257, "grad_norm": 0.21572571992874146, "learning_rate": 1.835354295740419e-05, "loss": 1.2628, "step": 7034 }, { "epoch": 2.095348014668925, "grad_norm": 0.23582948744297028, "learning_rate": 1.8353012684910968e-05, "loss": 1.2565, "step": 7035 }, { "epoch": 2.095645860868594, "grad_norm": 0.2237798422574997, "learning_rate": 1.8352482334702296e-05, "loss": 1.2664, "step": 7036 }, { "epoch": 2.0959437070682627, "grad_norm": 0.22031529247760773, "learning_rate": 1.8351951906783108e-05, "loss": 1.2567, "step": 7037 }, { "epoch": 2.096241553267931, "grad_norm": 0.22599531710147858, "learning_rate": 1.8351421401158337e-05, "loss": 1.2484, "step": 7038 }, { "epoch": 2.0965393994676, "grad_norm": 0.22428205609321594, "learning_rate": 1.835089081783292e-05, "loss": 1.2602, "step": 7039 }, { "epoch": 2.0968372456672686, "grad_norm": 0.2303728461265564, "learning_rate": 1.8350360156811796e-05, "loss": 1.2655, "step": 7040 }, { "epoch": 2.0971350918669374, "grad_norm": 0.22629369795322418, "learning_rate": 1.83498294180999e-05, "loss": 1.2672, "step": 7041 }, { "epoch": 2.0974329380666057, "grad_norm": 0.22464722394943237, "learning_rate": 1.834929860170217e-05, "loss": 1.2596, "step": 7042 }, { "epoch": 2.0977307842662745, "grad_norm": 0.2226731777191162, "learning_rate": 1.8348767707623544e-05, "loss": 1.2639, "step": 7043 }, { "epoch": 2.0980286304659432, "grad_norm": 0.24072232842445374, "learning_rate": 1.8348236735868963e-05, "loss": 1.2646, "step": 7044 }, { "epoch": 2.098326476665612, "grad_norm": 0.21738742291927338, "learning_rate": 1.8347705686443365e-05, "loss": 1.2569, "step": 7045 }, { "epoch": 2.0986243228652803, "grad_norm": 0.22332030534744263, "learning_rate": 1.8347174559351693e-05, "loss": 1.2646, "step": 7046 }, { "epoch": 2.098922169064949, "grad_norm": 0.23183809220790863, "learning_rate": 1.834664335459889e-05, "loss": 1.2605, "step": 7047 }, { "epoch": 2.099220015264618, "grad_norm": 0.23424433171749115, "learning_rate": 1.8346112072189894e-05, "loss": 1.2593, "step": 7048 }, { "epoch": 2.099517861464286, "grad_norm": 0.2322787046432495, "learning_rate": 1.834558071212965e-05, "loss": 1.234, "step": 7049 }, { "epoch": 2.099815707663955, "grad_norm": 0.23153313994407654, "learning_rate": 1.8345049274423102e-05, "loss": 1.2555, "step": 7050 }, { "epoch": 2.1001135538636237, "grad_norm": 0.22688642144203186, "learning_rate": 1.8344517759075193e-05, "loss": 1.2356, "step": 7051 }, { "epoch": 2.1004114000632925, "grad_norm": 0.22773028910160065, "learning_rate": 1.8343986166090873e-05, "loss": 1.25, "step": 7052 }, { "epoch": 2.100709246262961, "grad_norm": 0.23619934916496277, "learning_rate": 1.8343454495475083e-05, "loss": 1.2741, "step": 7053 }, { "epoch": 2.1010070924626296, "grad_norm": 0.22756488621234894, "learning_rate": 1.8342922747232768e-05, "loss": 1.2498, "step": 7054 }, { "epoch": 2.1013049386622984, "grad_norm": 0.2158007025718689, "learning_rate": 1.834239092136888e-05, "loss": 1.2508, "step": 7055 }, { "epoch": 2.1016027848619667, "grad_norm": 0.21473854780197144, "learning_rate": 1.8341859017888363e-05, "loss": 1.2704, "step": 7056 }, { "epoch": 2.1019006310616355, "grad_norm": 0.22193129360675812, "learning_rate": 1.834132703679617e-05, "loss": 1.2839, "step": 7057 }, { "epoch": 2.1021984772613043, "grad_norm": 0.23289480805397034, "learning_rate": 1.8340794978097248e-05, "loss": 1.266, "step": 7058 }, { "epoch": 2.102496323460973, "grad_norm": 0.22643692791461945, "learning_rate": 1.8340262841796546e-05, "loss": 1.2644, "step": 7059 }, { "epoch": 2.1027941696606414, "grad_norm": 0.2265244424343109, "learning_rate": 1.8339730627899017e-05, "loss": 1.2534, "step": 7060 }, { "epoch": 2.10309201586031, "grad_norm": 0.22465166449546814, "learning_rate": 1.8339198336409615e-05, "loss": 1.243, "step": 7061 }, { "epoch": 2.103389862059979, "grad_norm": 0.2255132794380188, "learning_rate": 1.8338665967333288e-05, "loss": 1.2651, "step": 7062 }, { "epoch": 2.1036877082596472, "grad_norm": 0.23886889219284058, "learning_rate": 1.833813352067499e-05, "loss": 1.2605, "step": 7063 }, { "epoch": 2.103985554459316, "grad_norm": 0.2307594269514084, "learning_rate": 1.8337600996439678e-05, "loss": 1.2612, "step": 7064 }, { "epoch": 2.104283400658985, "grad_norm": 0.23562608659267426, "learning_rate": 1.83370683946323e-05, "loss": 1.2712, "step": 7065 }, { "epoch": 2.1045812468586536, "grad_norm": 0.22392936050891876, "learning_rate": 1.8336535715257818e-05, "loss": 1.2533, "step": 7066 }, { "epoch": 2.104879093058322, "grad_norm": 0.233314648270607, "learning_rate": 1.8336002958321185e-05, "loss": 1.2763, "step": 7067 }, { "epoch": 2.1051769392579907, "grad_norm": 0.2265716791152954, "learning_rate": 1.8335470123827356e-05, "loss": 1.248, "step": 7068 }, { "epoch": 2.1054747854576594, "grad_norm": 0.2358282506465912, "learning_rate": 1.833493721178129e-05, "loss": 1.2695, "step": 7069 }, { "epoch": 2.1057726316573278, "grad_norm": 0.2352137416601181, "learning_rate": 1.8334404222187953e-05, "loss": 1.2627, "step": 7070 }, { "epoch": 2.1060704778569965, "grad_norm": 0.23004014790058136, "learning_rate": 1.8333871155052286e-05, "loss": 1.2474, "step": 7071 }, { "epoch": 2.1063683240566653, "grad_norm": 0.24043875932693481, "learning_rate": 1.8333338010379264e-05, "loss": 1.2649, "step": 7072 }, { "epoch": 2.106666170256334, "grad_norm": 0.23524345457553864, "learning_rate": 1.8332804788173843e-05, "loss": 1.262, "step": 7073 }, { "epoch": 2.1069640164560024, "grad_norm": 0.22107917070388794, "learning_rate": 1.8332271488440985e-05, "loss": 1.2511, "step": 7074 }, { "epoch": 2.107261862655671, "grad_norm": 0.2195582240819931, "learning_rate": 1.8331738111185648e-05, "loss": 1.2771, "step": 7075 }, { "epoch": 2.10755970885534, "grad_norm": 0.23306547105312347, "learning_rate": 1.8331204656412796e-05, "loss": 1.2649, "step": 7076 }, { "epoch": 2.1078575550550083, "grad_norm": 0.2380416840314865, "learning_rate": 1.8330671124127394e-05, "loss": 1.273, "step": 7077 }, { "epoch": 2.108155401254677, "grad_norm": 0.22080141305923462, "learning_rate": 1.8330137514334405e-05, "loss": 1.2669, "step": 7078 }, { "epoch": 2.108453247454346, "grad_norm": 0.22362235188484192, "learning_rate": 1.8329603827038792e-05, "loss": 1.2627, "step": 7079 }, { "epoch": 2.1087510936540146, "grad_norm": 0.2260538935661316, "learning_rate": 1.8329070062245522e-05, "loss": 1.2521, "step": 7080 }, { "epoch": 2.109048939853683, "grad_norm": 0.22199952602386475, "learning_rate": 1.8328536219959563e-05, "loss": 1.2398, "step": 7081 }, { "epoch": 2.1093467860533517, "grad_norm": 0.22181862592697144, "learning_rate": 1.8328002300185878e-05, "loss": 1.2525, "step": 7082 }, { "epoch": 2.1096446322530205, "grad_norm": 0.23434802889823914, "learning_rate": 1.8327468302929437e-05, "loss": 1.2468, "step": 7083 }, { "epoch": 2.109942478452689, "grad_norm": 0.2293621003627777, "learning_rate": 1.8326934228195205e-05, "loss": 1.2695, "step": 7084 }, { "epoch": 2.1102403246523576, "grad_norm": 0.22341133654117584, "learning_rate": 1.8326400075988157e-05, "loss": 1.2357, "step": 7085 }, { "epoch": 2.1105381708520263, "grad_norm": 0.24247558414936066, "learning_rate": 1.8325865846313255e-05, "loss": 1.2589, "step": 7086 }, { "epoch": 2.110836017051695, "grad_norm": 0.22479978203773499, "learning_rate": 1.8325331539175475e-05, "loss": 1.2624, "step": 7087 }, { "epoch": 2.1111338632513634, "grad_norm": 0.22271622717380524, "learning_rate": 1.832479715457979e-05, "loss": 1.263, "step": 7088 }, { "epoch": 2.111431709451032, "grad_norm": 0.2161114364862442, "learning_rate": 1.8324262692531162e-05, "loss": 1.2583, "step": 7089 }, { "epoch": 2.111729555650701, "grad_norm": 0.21842052042484283, "learning_rate": 1.8323728153034576e-05, "loss": 1.2611, "step": 7090 }, { "epoch": 2.1120274018503693, "grad_norm": 0.22679197788238525, "learning_rate": 1.8323193536094998e-05, "loss": 1.2384, "step": 7091 }, { "epoch": 2.112325248050038, "grad_norm": 0.2320265918970108, "learning_rate": 1.8322658841717404e-05, "loss": 1.2473, "step": 7092 }, { "epoch": 2.112623094249707, "grad_norm": 0.22139060497283936, "learning_rate": 1.8322124069906765e-05, "loss": 1.2461, "step": 7093 }, { "epoch": 2.1129209404493756, "grad_norm": 0.2263459712266922, "learning_rate": 1.8321589220668062e-05, "loss": 1.2569, "step": 7094 }, { "epoch": 2.113218786649044, "grad_norm": 0.22318021953105927, "learning_rate": 1.8321054294006265e-05, "loss": 1.2519, "step": 7095 }, { "epoch": 2.1135166328487127, "grad_norm": 0.21863384544849396, "learning_rate": 1.832051928992636e-05, "loss": 1.2587, "step": 7096 }, { "epoch": 2.1138144790483815, "grad_norm": 0.2100280076265335, "learning_rate": 1.8319984208433318e-05, "loss": 1.2587, "step": 7097 }, { "epoch": 2.1141123252480503, "grad_norm": 0.22919198870658875, "learning_rate": 1.8319449049532115e-05, "loss": 1.2711, "step": 7098 }, { "epoch": 2.1144101714477186, "grad_norm": 0.24317790567874908, "learning_rate": 1.8318913813227738e-05, "loss": 1.2668, "step": 7099 }, { "epoch": 2.1147080176473874, "grad_norm": 0.23060758411884308, "learning_rate": 1.831837849952516e-05, "loss": 1.2724, "step": 7100 }, { "epoch": 2.115005863847056, "grad_norm": 0.22312051057815552, "learning_rate": 1.8317843108429362e-05, "loss": 1.2454, "step": 7101 }, { "epoch": 2.1153037100467245, "grad_norm": 0.23319660127162933, "learning_rate": 1.8317307639945332e-05, "loss": 1.2777, "step": 7102 }, { "epoch": 2.1156015562463932, "grad_norm": 0.22555091977119446, "learning_rate": 1.8316772094078043e-05, "loss": 1.2523, "step": 7103 }, { "epoch": 2.115899402446062, "grad_norm": 0.22101464867591858, "learning_rate": 1.8316236470832484e-05, "loss": 1.2671, "step": 7104 }, { "epoch": 2.1161972486457303, "grad_norm": 0.22322897613048553, "learning_rate": 1.8315700770213637e-05, "loss": 1.2543, "step": 7105 }, { "epoch": 2.116495094845399, "grad_norm": 0.2308381050825119, "learning_rate": 1.8315164992226482e-05, "loss": 1.2727, "step": 7106 }, { "epoch": 2.116792941045068, "grad_norm": 0.2280161827802658, "learning_rate": 1.831462913687601e-05, "loss": 1.2642, "step": 7107 }, { "epoch": 2.1170907872447366, "grad_norm": 0.24484100937843323, "learning_rate": 1.83140932041672e-05, "loss": 1.2837, "step": 7108 }, { "epoch": 2.117388633444405, "grad_norm": 0.23114639520645142, "learning_rate": 1.8313557194105046e-05, "loss": 1.2573, "step": 7109 }, { "epoch": 2.1176864796440737, "grad_norm": 0.2284366935491562, "learning_rate": 1.831302110669453e-05, "loss": 1.2669, "step": 7110 }, { "epoch": 2.1179843258437425, "grad_norm": 0.224045529961586, "learning_rate": 1.8312484941940635e-05, "loss": 1.2434, "step": 7111 }, { "epoch": 2.1182821720434113, "grad_norm": 0.24650663137435913, "learning_rate": 1.831194869984836e-05, "loss": 1.2562, "step": 7112 }, { "epoch": 2.1185800182430796, "grad_norm": 0.22716335952281952, "learning_rate": 1.8311412380422688e-05, "loss": 1.2444, "step": 7113 }, { "epoch": 2.1188778644427484, "grad_norm": 0.2214750498533249, "learning_rate": 1.831087598366861e-05, "loss": 1.2655, "step": 7114 }, { "epoch": 2.119175710642417, "grad_norm": 0.22327202558517456, "learning_rate": 1.8310339509591118e-05, "loss": 1.2576, "step": 7115 }, { "epoch": 2.1194735568420855, "grad_norm": 0.2163681983947754, "learning_rate": 1.8309802958195197e-05, "loss": 1.2498, "step": 7116 }, { "epoch": 2.1197714030417543, "grad_norm": 0.22627170383930206, "learning_rate": 1.8309266329485847e-05, "loss": 1.2722, "step": 7117 }, { "epoch": 2.120069249241423, "grad_norm": 0.22877243161201477, "learning_rate": 1.8308729623468056e-05, "loss": 1.2592, "step": 7118 }, { "epoch": 2.120367095441092, "grad_norm": 0.22487469017505646, "learning_rate": 1.830819284014682e-05, "loss": 1.2586, "step": 7119 }, { "epoch": 2.12066494164076, "grad_norm": 0.23687994480133057, "learning_rate": 1.8307655979527133e-05, "loss": 1.2619, "step": 7120 }, { "epoch": 2.120962787840429, "grad_norm": 0.21765540540218353, "learning_rate": 1.8307119041613988e-05, "loss": 1.255, "step": 7121 }, { "epoch": 2.1212606340400977, "grad_norm": 0.22309458255767822, "learning_rate": 1.8306582026412384e-05, "loss": 1.2648, "step": 7122 }, { "epoch": 2.121558480239766, "grad_norm": 0.22108714282512665, "learning_rate": 1.830604493392731e-05, "loss": 1.2516, "step": 7123 }, { "epoch": 2.121856326439435, "grad_norm": 0.23164448142051697, "learning_rate": 1.8305507764163772e-05, "loss": 1.285, "step": 7124 }, { "epoch": 2.1221541726391036, "grad_norm": 0.23148779571056366, "learning_rate": 1.830497051712676e-05, "loss": 1.2271, "step": 7125 }, { "epoch": 2.1224520188387723, "grad_norm": 0.22546571493148804, "learning_rate": 1.830443319282128e-05, "loss": 1.2637, "step": 7126 }, { "epoch": 2.1227498650384407, "grad_norm": 0.22211135923862457, "learning_rate": 1.8303895791252327e-05, "loss": 1.255, "step": 7127 }, { "epoch": 2.1230477112381094, "grad_norm": 0.2260240763425827, "learning_rate": 1.83033583124249e-05, "loss": 1.2687, "step": 7128 }, { "epoch": 2.123345557437778, "grad_norm": 0.23260627686977386, "learning_rate": 1.8302820756344e-05, "loss": 1.2624, "step": 7129 }, { "epoch": 2.1236434036374465, "grad_norm": 0.2216912806034088, "learning_rate": 1.830228312301463e-05, "loss": 1.2567, "step": 7130 }, { "epoch": 2.1239412498371153, "grad_norm": 0.2314852923154831, "learning_rate": 1.8301745412441793e-05, "loss": 1.2652, "step": 7131 }, { "epoch": 2.124239096036784, "grad_norm": 0.23260203003883362, "learning_rate": 1.8301207624630487e-05, "loss": 1.2622, "step": 7132 }, { "epoch": 2.124536942236453, "grad_norm": 0.22038774192333221, "learning_rate": 1.8300669759585724e-05, "loss": 1.2579, "step": 7133 }, { "epoch": 2.124834788436121, "grad_norm": 0.22577354311943054, "learning_rate": 1.83001318173125e-05, "loss": 1.2662, "step": 7134 }, { "epoch": 2.12513263463579, "grad_norm": 0.22296591103076935, "learning_rate": 1.829959379781582e-05, "loss": 1.2715, "step": 7135 }, { "epoch": 2.1254304808354587, "grad_norm": 0.22403176128864288, "learning_rate": 1.8299055701100696e-05, "loss": 1.2463, "step": 7136 }, { "epoch": 2.125728327035127, "grad_norm": 0.23563499748706818, "learning_rate": 1.8298517527172128e-05, "loss": 1.2562, "step": 7137 }, { "epoch": 2.126026173234796, "grad_norm": 0.22584551572799683, "learning_rate": 1.8297979276035128e-05, "loss": 1.2479, "step": 7138 }, { "epoch": 2.1263240194344646, "grad_norm": 0.23392264544963837, "learning_rate": 1.8297440947694703e-05, "loss": 1.2683, "step": 7139 }, { "epoch": 2.1266218656341334, "grad_norm": 0.23094266653060913, "learning_rate": 1.8296902542155862e-05, "loss": 1.258, "step": 7140 }, { "epoch": 2.1269197118338017, "grad_norm": 0.22943681478500366, "learning_rate": 1.829636405942361e-05, "loss": 1.2705, "step": 7141 }, { "epoch": 2.1272175580334705, "grad_norm": 0.23909318447113037, "learning_rate": 1.829582549950296e-05, "loss": 1.2741, "step": 7142 }, { "epoch": 2.1275154042331392, "grad_norm": 0.22960753738880157, "learning_rate": 1.829528686239892e-05, "loss": 1.2576, "step": 7143 }, { "epoch": 2.1278132504328076, "grad_norm": 0.22400975227355957, "learning_rate": 1.8294748148116505e-05, "loss": 1.2507, "step": 7144 }, { "epoch": 2.1281110966324763, "grad_norm": 0.2300107777118683, "learning_rate": 1.829420935666073e-05, "loss": 1.2666, "step": 7145 }, { "epoch": 2.128408942832145, "grad_norm": 0.23621103167533875, "learning_rate": 1.8293670488036598e-05, "loss": 1.2505, "step": 7146 }, { "epoch": 2.128706789031814, "grad_norm": 0.22813810408115387, "learning_rate": 1.8293131542249135e-05, "loss": 1.2746, "step": 7147 }, { "epoch": 2.129004635231482, "grad_norm": 0.21974337100982666, "learning_rate": 1.829259251930334e-05, "loss": 1.2543, "step": 7148 }, { "epoch": 2.129302481431151, "grad_norm": 0.21717719733715057, "learning_rate": 1.8292053419204244e-05, "loss": 1.2689, "step": 7149 }, { "epoch": 2.1296003276308197, "grad_norm": 0.22804971039295197, "learning_rate": 1.8291514241956853e-05, "loss": 1.2672, "step": 7150 }, { "epoch": 2.1298981738304885, "grad_norm": 0.23685619235038757, "learning_rate": 1.8290974987566183e-05, "loss": 1.2711, "step": 7151 }, { "epoch": 2.130196020030157, "grad_norm": 0.23363293707370758, "learning_rate": 1.8290435656037256e-05, "loss": 1.2595, "step": 7152 }, { "epoch": 2.1304938662298256, "grad_norm": 0.22478599846363068, "learning_rate": 1.828989624737509e-05, "loss": 1.2615, "step": 7153 }, { "epoch": 2.1307917124294944, "grad_norm": 0.2230616807937622, "learning_rate": 1.8289356761584698e-05, "loss": 1.2661, "step": 7154 }, { "epoch": 2.1310895586291627, "grad_norm": 0.22488093376159668, "learning_rate": 1.8288817198671103e-05, "loss": 1.255, "step": 7155 }, { "epoch": 2.1313874048288315, "grad_norm": 0.22709622979164124, "learning_rate": 1.828827755863933e-05, "loss": 1.2545, "step": 7156 }, { "epoch": 2.1316852510285003, "grad_norm": 0.2322174310684204, "learning_rate": 1.8287737841494387e-05, "loss": 1.2603, "step": 7157 }, { "epoch": 2.1319830972281686, "grad_norm": 0.22935152053833008, "learning_rate": 1.828719804724131e-05, "loss": 1.2581, "step": 7158 }, { "epoch": 2.1322809434278374, "grad_norm": 0.22376160323619843, "learning_rate": 1.828665817588511e-05, "loss": 1.2679, "step": 7159 }, { "epoch": 2.132578789627506, "grad_norm": 0.22763124108314514, "learning_rate": 1.8286118227430816e-05, "loss": 1.2655, "step": 7160 }, { "epoch": 2.132876635827175, "grad_norm": 0.22681979835033417, "learning_rate": 1.828557820188345e-05, "loss": 1.2472, "step": 7161 }, { "epoch": 2.1331744820268432, "grad_norm": 0.23133428394794464, "learning_rate": 1.8285038099248032e-05, "loss": 1.2594, "step": 7162 }, { "epoch": 2.133472328226512, "grad_norm": 0.22428368031978607, "learning_rate": 1.8284497919529592e-05, "loss": 1.2625, "step": 7163 }, { "epoch": 2.1337701744261808, "grad_norm": 0.22396983206272125, "learning_rate": 1.828395766273316e-05, "loss": 1.2425, "step": 7164 }, { "epoch": 2.1340680206258495, "grad_norm": 0.21950854361057281, "learning_rate": 1.8283417328863752e-05, "loss": 1.2489, "step": 7165 }, { "epoch": 2.134365866825518, "grad_norm": 0.2416682243347168, "learning_rate": 1.82828769179264e-05, "loss": 1.2673, "step": 7166 }, { "epoch": 2.1346637130251866, "grad_norm": 0.21806782484054565, "learning_rate": 1.828233642992614e-05, "loss": 1.2661, "step": 7167 }, { "epoch": 2.1349615592248554, "grad_norm": 0.22544948756694794, "learning_rate": 1.828179586486799e-05, "loss": 1.262, "step": 7168 }, { "epoch": 2.1352594054245237, "grad_norm": 0.22799630463123322, "learning_rate": 1.8281255222756977e-05, "loss": 1.2529, "step": 7169 }, { "epoch": 2.1355572516241925, "grad_norm": 0.2227330505847931, "learning_rate": 1.828071450359814e-05, "loss": 1.2513, "step": 7170 }, { "epoch": 2.1358550978238613, "grad_norm": 0.22194263339042664, "learning_rate": 1.8280173707396507e-05, "loss": 1.2589, "step": 7171 }, { "epoch": 2.1361529440235296, "grad_norm": 0.2329639047384262, "learning_rate": 1.827963283415711e-05, "loss": 1.2685, "step": 7172 }, { "epoch": 2.1364507902231984, "grad_norm": 0.23032844066619873, "learning_rate": 1.8279091883884977e-05, "loss": 1.2414, "step": 7173 }, { "epoch": 2.136748636422867, "grad_norm": 0.2239466905593872, "learning_rate": 1.8278550856585142e-05, "loss": 1.2429, "step": 7174 }, { "epoch": 2.137046482622536, "grad_norm": 0.22458164393901825, "learning_rate": 1.8278009752262647e-05, "loss": 1.2576, "step": 7175 }, { "epoch": 2.1373443288222043, "grad_norm": 0.22926200926303864, "learning_rate": 1.8277468570922515e-05, "loss": 1.2485, "step": 7176 }, { "epoch": 2.137642175021873, "grad_norm": 0.21348576247692108, "learning_rate": 1.827692731256979e-05, "loss": 1.2591, "step": 7177 }, { "epoch": 2.137940021221542, "grad_norm": 0.22748205065727234, "learning_rate": 1.82763859772095e-05, "loss": 1.2777, "step": 7178 }, { "epoch": 2.1382378674212106, "grad_norm": 0.22369438409805298, "learning_rate": 1.8275844564846688e-05, "loss": 1.2521, "step": 7179 }, { "epoch": 2.138535713620879, "grad_norm": 0.23350432515144348, "learning_rate": 1.8275303075486386e-05, "loss": 1.2588, "step": 7180 }, { "epoch": 2.1388335598205477, "grad_norm": 0.2353038489818573, "learning_rate": 1.8274761509133635e-05, "loss": 1.2676, "step": 7181 }, { "epoch": 2.1391314060202165, "grad_norm": 0.22810761630535126, "learning_rate": 1.8274219865793477e-05, "loss": 1.2504, "step": 7182 }, { "epoch": 2.139429252219885, "grad_norm": 0.21917693316936493, "learning_rate": 1.8273678145470947e-05, "loss": 1.2681, "step": 7183 }, { "epoch": 2.1397270984195536, "grad_norm": 0.23463785648345947, "learning_rate": 1.8273136348171082e-05, "loss": 1.2598, "step": 7184 }, { "epoch": 2.1400249446192223, "grad_norm": 0.2273704707622528, "learning_rate": 1.827259447389893e-05, "loss": 1.2754, "step": 7185 }, { "epoch": 2.140322790818891, "grad_norm": 0.2283557802438736, "learning_rate": 1.8272052522659525e-05, "loss": 1.2435, "step": 7186 }, { "epoch": 2.1406206370185594, "grad_norm": 0.23130172491073608, "learning_rate": 1.827151049445792e-05, "loss": 1.2578, "step": 7187 }, { "epoch": 2.140918483218228, "grad_norm": 0.2366407811641693, "learning_rate": 1.8270968389299145e-05, "loss": 1.247, "step": 7188 }, { "epoch": 2.141216329417897, "grad_norm": 0.22454547882080078, "learning_rate": 1.8270426207188252e-05, "loss": 1.2624, "step": 7189 }, { "epoch": 2.1415141756175653, "grad_norm": 0.22233931720256805, "learning_rate": 1.8269883948130283e-05, "loss": 1.2723, "step": 7190 }, { "epoch": 2.141812021817234, "grad_norm": 0.23232373595237732, "learning_rate": 1.8269341612130284e-05, "loss": 1.257, "step": 7191 }, { "epoch": 2.142109868016903, "grad_norm": 0.22737377882003784, "learning_rate": 1.82687991991933e-05, "loss": 1.2609, "step": 7192 }, { "epoch": 2.1424077142165716, "grad_norm": 0.2243301272392273, "learning_rate": 1.8268256709324383e-05, "loss": 1.2515, "step": 7193 }, { "epoch": 2.14270556041624, "grad_norm": 0.23105941712856293, "learning_rate": 1.826771414252857e-05, "loss": 1.2595, "step": 7194 }, { "epoch": 2.1430034066159087, "grad_norm": 0.22792844474315643, "learning_rate": 1.826717149881091e-05, "loss": 1.2664, "step": 7195 }, { "epoch": 2.1433012528155775, "grad_norm": 0.22798797488212585, "learning_rate": 1.8266628778176465e-05, "loss": 1.2624, "step": 7196 }, { "epoch": 2.143599099015246, "grad_norm": 0.24311818182468414, "learning_rate": 1.826608598063027e-05, "loss": 1.258, "step": 7197 }, { "epoch": 2.1438969452149146, "grad_norm": 0.2258772999048233, "learning_rate": 1.826554310617738e-05, "loss": 1.2578, "step": 7198 }, { "epoch": 2.1441947914145834, "grad_norm": 0.2354695200920105, "learning_rate": 1.8265000154822846e-05, "loss": 1.2454, "step": 7199 }, { "epoch": 2.144492637614252, "grad_norm": 0.24303671717643738, "learning_rate": 1.8264457126571723e-05, "loss": 1.277, "step": 7200 }, { "epoch": 2.1447904838139205, "grad_norm": 0.23406969010829926, "learning_rate": 1.8263914021429057e-05, "loss": 1.2581, "step": 7201 }, { "epoch": 2.1450883300135892, "grad_norm": 0.2457258403301239, "learning_rate": 1.8263370839399906e-05, "loss": 1.2546, "step": 7202 }, { "epoch": 2.145386176213258, "grad_norm": 0.2412896603345871, "learning_rate": 1.8262827580489322e-05, "loss": 1.2597, "step": 7203 }, { "epoch": 2.1456840224129263, "grad_norm": 0.2142343819141388, "learning_rate": 1.826228424470236e-05, "loss": 1.2576, "step": 7204 }, { "epoch": 2.145981868612595, "grad_norm": 0.2583129107952118, "learning_rate": 1.826174083204407e-05, "loss": 1.2615, "step": 7205 }, { "epoch": 2.146279714812264, "grad_norm": 0.22269707918167114, "learning_rate": 1.8261197342519513e-05, "loss": 1.2541, "step": 7206 }, { "epoch": 2.1465775610119326, "grad_norm": 0.23978152871131897, "learning_rate": 1.8260653776133746e-05, "loss": 1.2631, "step": 7207 }, { "epoch": 2.146875407211601, "grad_norm": 0.2298610359430313, "learning_rate": 1.8260110132891825e-05, "loss": 1.2583, "step": 7208 }, { "epoch": 2.1471732534112697, "grad_norm": 0.22163039445877075, "learning_rate": 1.825956641279881e-05, "loss": 1.2533, "step": 7209 }, { "epoch": 2.1474710996109385, "grad_norm": 0.23827357590198517, "learning_rate": 1.8259022615859756e-05, "loss": 1.2775, "step": 7210 }, { "epoch": 2.147768945810607, "grad_norm": 0.24706870317459106, "learning_rate": 1.8258478742079725e-05, "loss": 1.268, "step": 7211 }, { "epoch": 2.1480667920102756, "grad_norm": 0.23246964812278748, "learning_rate": 1.8257934791463774e-05, "loss": 1.2754, "step": 7212 }, { "epoch": 2.1483646382099444, "grad_norm": 0.22534261643886566, "learning_rate": 1.8257390764016968e-05, "loss": 1.2704, "step": 7213 }, { "epoch": 2.148662484409613, "grad_norm": 0.2393663376569748, "learning_rate": 1.8256846659744364e-05, "loss": 1.273, "step": 7214 }, { "epoch": 2.1489603306092815, "grad_norm": 0.21977217495441437, "learning_rate": 1.8256302478651033e-05, "loss": 1.2625, "step": 7215 }, { "epoch": 2.1492581768089503, "grad_norm": 0.24201923608779907, "learning_rate": 1.8255758220742025e-05, "loss": 1.2478, "step": 7216 }, { "epoch": 2.149556023008619, "grad_norm": 0.24218101799488068, "learning_rate": 1.8255213886022412e-05, "loss": 1.2552, "step": 7217 }, { "epoch": 2.149853869208288, "grad_norm": 0.2628040909767151, "learning_rate": 1.825466947449726e-05, "loss": 1.2653, "step": 7218 }, { "epoch": 2.150151715407956, "grad_norm": 0.23015199601650238, "learning_rate": 1.825412498617163e-05, "loss": 1.2422, "step": 7219 }, { "epoch": 2.150449561607625, "grad_norm": 0.2507378160953522, "learning_rate": 1.8253580421050587e-05, "loss": 1.2785, "step": 7220 }, { "epoch": 2.1507474078072937, "grad_norm": 0.2190084457397461, "learning_rate": 1.8253035779139203e-05, "loss": 1.2675, "step": 7221 }, { "epoch": 2.151045254006962, "grad_norm": 0.23176588118076324, "learning_rate": 1.825249106044254e-05, "loss": 1.2642, "step": 7222 }, { "epoch": 2.1513431002066308, "grad_norm": 0.22592073678970337, "learning_rate": 1.8251946264965668e-05, "loss": 1.2487, "step": 7223 }, { "epoch": 2.1516409464062995, "grad_norm": 0.24401065707206726, "learning_rate": 1.8251401392713655e-05, "loss": 1.2684, "step": 7224 }, { "epoch": 2.151938792605968, "grad_norm": 0.23437196016311646, "learning_rate": 1.825085644369157e-05, "loss": 1.2643, "step": 7225 }, { "epoch": 2.1522366388056366, "grad_norm": 0.2282634824514389, "learning_rate": 1.8250311417904488e-05, "loss": 1.2717, "step": 7226 }, { "epoch": 2.1525344850053054, "grad_norm": 0.23535984754562378, "learning_rate": 1.8249766315357474e-05, "loss": 1.2717, "step": 7227 }, { "epoch": 2.152832331204974, "grad_norm": 0.22330625355243683, "learning_rate": 1.82492211360556e-05, "loss": 1.2542, "step": 7228 }, { "epoch": 2.1531301774046425, "grad_norm": 0.2360328733921051, "learning_rate": 1.824867588000394e-05, "loss": 1.2545, "step": 7229 }, { "epoch": 2.1534280236043113, "grad_norm": 0.2204810082912445, "learning_rate": 1.824813054720757e-05, "loss": 1.2444, "step": 7230 }, { "epoch": 2.15372586980398, "grad_norm": 0.23123958706855774, "learning_rate": 1.8247585137671562e-05, "loss": 1.247, "step": 7231 }, { "epoch": 2.154023716003649, "grad_norm": 0.22586317360401154, "learning_rate": 1.8247039651400984e-05, "loss": 1.2506, "step": 7232 }, { "epoch": 2.154321562203317, "grad_norm": 0.22761893272399902, "learning_rate": 1.8246494088400918e-05, "loss": 1.2599, "step": 7233 }, { "epoch": 2.154619408402986, "grad_norm": 0.2438083440065384, "learning_rate": 1.8245948448676438e-05, "loss": 1.2562, "step": 7234 }, { "epoch": 2.1549172546026547, "grad_norm": 0.22488285601139069, "learning_rate": 1.8245402732232622e-05, "loss": 1.2649, "step": 7235 }, { "epoch": 2.155215100802323, "grad_norm": 0.24174481630325317, "learning_rate": 1.8244856939074544e-05, "loss": 1.2448, "step": 7236 }, { "epoch": 2.155512947001992, "grad_norm": 0.23343084752559662, "learning_rate": 1.8244311069207285e-05, "loss": 1.2557, "step": 7237 }, { "epoch": 2.1558107932016606, "grad_norm": 0.27241039276123047, "learning_rate": 1.824376512263592e-05, "loss": 1.2537, "step": 7238 }, { "epoch": 2.156108639401329, "grad_norm": 0.22729940712451935, "learning_rate": 1.8243219099365534e-05, "loss": 1.2502, "step": 7239 }, { "epoch": 2.1564064856009977, "grad_norm": 0.23571279644966125, "learning_rate": 1.8242672999401202e-05, "loss": 1.2629, "step": 7240 }, { "epoch": 2.1567043318006665, "grad_norm": 0.23134379088878632, "learning_rate": 1.824212682274801e-05, "loss": 1.2681, "step": 7241 }, { "epoch": 2.1570021780003352, "grad_norm": 0.23377610743045807, "learning_rate": 1.8241580569411038e-05, "loss": 1.2497, "step": 7242 }, { "epoch": 2.1573000242000036, "grad_norm": 0.2495405226945877, "learning_rate": 1.824103423939536e-05, "loss": 1.2578, "step": 7243 }, { "epoch": 2.1575978703996723, "grad_norm": 0.24279369413852692, "learning_rate": 1.824048783270607e-05, "loss": 1.2564, "step": 7244 }, { "epoch": 2.157895716599341, "grad_norm": 0.24328194558620453, "learning_rate": 1.8239941349348246e-05, "loss": 1.2721, "step": 7245 }, { "epoch": 2.15819356279901, "grad_norm": 0.2252289354801178, "learning_rate": 1.8239394789326978e-05, "loss": 1.2542, "step": 7246 }, { "epoch": 2.158491408998678, "grad_norm": 0.22826309502124786, "learning_rate": 1.8238848152647345e-05, "loss": 1.2661, "step": 7247 }, { "epoch": 2.158789255198347, "grad_norm": 0.23042802512645721, "learning_rate": 1.823830143931443e-05, "loss": 1.2625, "step": 7248 }, { "epoch": 2.1590871013980157, "grad_norm": 0.23423734307289124, "learning_rate": 1.8237754649333334e-05, "loss": 1.27, "step": 7249 }, { "epoch": 2.159384947597684, "grad_norm": 0.2401869148015976, "learning_rate": 1.8237207782709124e-05, "loss": 1.2711, "step": 7250 }, { "epoch": 2.159682793797353, "grad_norm": 0.24477271735668182, "learning_rate": 1.8236660839446908e-05, "loss": 1.272, "step": 7251 }, { "epoch": 2.1599806399970216, "grad_norm": 0.23227417469024658, "learning_rate": 1.8236113819551758e-05, "loss": 1.2616, "step": 7252 }, { "epoch": 2.1602784861966904, "grad_norm": 0.2281360775232315, "learning_rate": 1.8235566723028776e-05, "loss": 1.2566, "step": 7253 }, { "epoch": 2.1605763323963587, "grad_norm": 0.3357749581336975, "learning_rate": 1.8235019549883045e-05, "loss": 1.2721, "step": 7254 }, { "epoch": 2.1608741785960275, "grad_norm": 0.26791083812713623, "learning_rate": 1.823447230011966e-05, "loss": 1.2501, "step": 7255 }, { "epoch": 2.1611720247956963, "grad_norm": 0.2542476952075958, "learning_rate": 1.8233924973743707e-05, "loss": 1.2634, "step": 7256 }, { "epoch": 2.1614698709953646, "grad_norm": 0.22172990441322327, "learning_rate": 1.823337757076028e-05, "loss": 1.2648, "step": 7257 }, { "epoch": 2.1617677171950334, "grad_norm": 0.2839290499687195, "learning_rate": 1.823283009117448e-05, "loss": 1.2669, "step": 7258 }, { "epoch": 2.162065563394702, "grad_norm": 0.23253114521503448, "learning_rate": 1.823228253499139e-05, "loss": 1.2503, "step": 7259 }, { "epoch": 2.162363409594371, "grad_norm": 0.23647059500217438, "learning_rate": 1.823173490221611e-05, "loss": 1.2686, "step": 7260 }, { "epoch": 2.1626612557940392, "grad_norm": 0.24149256944656372, "learning_rate": 1.8231187192853732e-05, "loss": 1.2444, "step": 7261 }, { "epoch": 2.162959101993708, "grad_norm": 0.233870729804039, "learning_rate": 1.8230639406909357e-05, "loss": 1.2642, "step": 7262 }, { "epoch": 2.1632569481933768, "grad_norm": 0.21807897090911865, "learning_rate": 1.8230091544388074e-05, "loss": 1.2575, "step": 7263 }, { "epoch": 2.163554794393045, "grad_norm": 0.24832923710346222, "learning_rate": 1.8229543605294985e-05, "loss": 1.2598, "step": 7264 }, { "epoch": 2.163852640592714, "grad_norm": 0.23019640147686005, "learning_rate": 1.822899558963519e-05, "loss": 1.2529, "step": 7265 }, { "epoch": 2.1641504867923826, "grad_norm": 0.22573243081569672, "learning_rate": 1.8228447497413785e-05, "loss": 1.2497, "step": 7266 }, { "epoch": 2.1644483329920514, "grad_norm": 0.24307508766651154, "learning_rate": 1.8227899328635867e-05, "loss": 1.239, "step": 7267 }, { "epoch": 2.1647461791917197, "grad_norm": 0.23096969723701477, "learning_rate": 1.822735108330654e-05, "loss": 1.2417, "step": 7268 }, { "epoch": 2.1650440253913885, "grad_norm": 0.2541373074054718, "learning_rate": 1.8226802761430905e-05, "loss": 1.263, "step": 7269 }, { "epoch": 2.1653418715910573, "grad_norm": 0.23824550211429596, "learning_rate": 1.8226254363014058e-05, "loss": 1.2667, "step": 7270 }, { "epoch": 2.1656397177907256, "grad_norm": 0.23206298053264618, "learning_rate": 1.8225705888061107e-05, "loss": 1.2544, "step": 7271 }, { "epoch": 2.1659375639903944, "grad_norm": 0.21915820240974426, "learning_rate": 1.8225157336577153e-05, "loss": 1.2517, "step": 7272 }, { "epoch": 2.166235410190063, "grad_norm": 0.25287652015686035, "learning_rate": 1.82246087085673e-05, "loss": 1.2436, "step": 7273 }, { "epoch": 2.166533256389732, "grad_norm": 0.24380943179130554, "learning_rate": 1.8224060004036652e-05, "loss": 1.2671, "step": 7274 }, { "epoch": 2.1668311025894003, "grad_norm": 0.24164150655269623, "learning_rate": 1.8223511222990313e-05, "loss": 1.2468, "step": 7275 }, { "epoch": 2.167128948789069, "grad_norm": 0.23552227020263672, "learning_rate": 1.822296236543339e-05, "loss": 1.277, "step": 7276 }, { "epoch": 2.167426794988738, "grad_norm": 0.22162701189517975, "learning_rate": 1.822241343137099e-05, "loss": 1.2572, "step": 7277 }, { "epoch": 2.167724641188406, "grad_norm": 0.23937855660915375, "learning_rate": 1.822186442080822e-05, "loss": 1.2497, "step": 7278 }, { "epoch": 2.168022487388075, "grad_norm": 0.22830797731876373, "learning_rate": 1.8221315333750187e-05, "loss": 1.2713, "step": 7279 }, { "epoch": 2.1683203335877437, "grad_norm": 0.2521073520183563, "learning_rate": 1.8220766170202e-05, "loss": 1.2629, "step": 7280 }, { "epoch": 2.1686181797874124, "grad_norm": 0.2280997335910797, "learning_rate": 1.822021693016877e-05, "loss": 1.2534, "step": 7281 }, { "epoch": 2.1689160259870808, "grad_norm": 0.22700686752796173, "learning_rate": 1.82196676136556e-05, "loss": 1.2643, "step": 7282 }, { "epoch": 2.1692138721867495, "grad_norm": 0.22790886461734772, "learning_rate": 1.8219118220667616e-05, "loss": 1.2464, "step": 7283 }, { "epoch": 2.1695117183864183, "grad_norm": 0.2317965030670166, "learning_rate": 1.8218568751209914e-05, "loss": 1.2647, "step": 7284 }, { "epoch": 2.169809564586087, "grad_norm": 0.2157122939825058, "learning_rate": 1.8218019205287613e-05, "loss": 1.246, "step": 7285 }, { "epoch": 2.1701074107857554, "grad_norm": 0.22612899541854858, "learning_rate": 1.8217469582905826e-05, "loss": 1.2781, "step": 7286 }, { "epoch": 2.170405256985424, "grad_norm": 0.21930953860282898, "learning_rate": 1.8216919884069663e-05, "loss": 1.2552, "step": 7287 }, { "epoch": 2.170703103185093, "grad_norm": 0.2246462106704712, "learning_rate": 1.8216370108784243e-05, "loss": 1.2596, "step": 7288 }, { "epoch": 2.1710009493847613, "grad_norm": 0.2279270887374878, "learning_rate": 1.821582025705468e-05, "loss": 1.2403, "step": 7289 }, { "epoch": 2.17129879558443, "grad_norm": 0.23340153694152832, "learning_rate": 1.8215270328886084e-05, "loss": 1.2641, "step": 7290 }, { "epoch": 2.171596641784099, "grad_norm": 0.225138857960701, "learning_rate": 1.8214720324283584e-05, "loss": 1.2596, "step": 7291 }, { "epoch": 2.171894487983767, "grad_norm": 0.22960343956947327, "learning_rate": 1.8214170243252284e-05, "loss": 1.2507, "step": 7292 }, { "epoch": 2.172192334183436, "grad_norm": 0.25454360246658325, "learning_rate": 1.8213620085797308e-05, "loss": 1.262, "step": 7293 }, { "epoch": 2.1724901803831047, "grad_norm": 0.2196982353925705, "learning_rate": 1.8213069851923775e-05, "loss": 1.2573, "step": 7294 }, { "epoch": 2.1727880265827735, "grad_norm": 0.314749151468277, "learning_rate": 1.8212519541636798e-05, "loss": 1.244, "step": 7295 }, { "epoch": 2.173085872782442, "grad_norm": 0.32642850279808044, "learning_rate": 1.821196915494151e-05, "loss": 1.2591, "step": 7296 }, { "epoch": 2.1733837189821106, "grad_norm": 0.27008360624313354, "learning_rate": 1.821141869184302e-05, "loss": 1.2594, "step": 7297 }, { "epoch": 2.1736815651817794, "grad_norm": 0.5230419039726257, "learning_rate": 1.8210868152346456e-05, "loss": 1.2637, "step": 7298 }, { "epoch": 2.173979411381448, "grad_norm": 0.24664689600467682, "learning_rate": 1.8210317536456934e-05, "loss": 1.2583, "step": 7299 }, { "epoch": 2.1742772575811165, "grad_norm": 0.2564030587673187, "learning_rate": 1.820976684417958e-05, "loss": 1.2536, "step": 7300 }, { "epoch": 2.1745751037807852, "grad_norm": 0.2287197709083557, "learning_rate": 1.8209216075519522e-05, "loss": 1.2753, "step": 7301 }, { "epoch": 2.174872949980454, "grad_norm": 0.22164995968341827, "learning_rate": 1.8208665230481878e-05, "loss": 1.2506, "step": 7302 }, { "epoch": 2.1751707961801223, "grad_norm": 0.24038252234458923, "learning_rate": 1.8208114309071776e-05, "loss": 1.2522, "step": 7303 }, { "epoch": 2.175468642379791, "grad_norm": 0.25487661361694336, "learning_rate": 1.820756331129434e-05, "loss": 1.2614, "step": 7304 }, { "epoch": 2.17576648857946, "grad_norm": 0.25161856412887573, "learning_rate": 1.82070122371547e-05, "loss": 1.2811, "step": 7305 }, { "epoch": 2.176064334779128, "grad_norm": 0.23490729928016663, "learning_rate": 1.820646108665798e-05, "loss": 1.2512, "step": 7306 }, { "epoch": 2.176362180978797, "grad_norm": 0.22478432953357697, "learning_rate": 1.8205909859809307e-05, "loss": 1.2576, "step": 7307 }, { "epoch": 2.1766600271784657, "grad_norm": 0.23823775351047516, "learning_rate": 1.820535855661381e-05, "loss": 1.2573, "step": 7308 }, { "epoch": 2.1769578733781345, "grad_norm": 0.23629657924175262, "learning_rate": 1.8204807177076617e-05, "loss": 1.2723, "step": 7309 }, { "epoch": 2.177255719577803, "grad_norm": 0.22689202427864075, "learning_rate": 1.8204255721202867e-05, "loss": 1.2475, "step": 7310 }, { "epoch": 2.1775535657774716, "grad_norm": 0.22683067619800568, "learning_rate": 1.820370418899768e-05, "loss": 1.2508, "step": 7311 }, { "epoch": 2.1778514119771404, "grad_norm": 0.23133999109268188, "learning_rate": 1.8203152580466187e-05, "loss": 1.2549, "step": 7312 }, { "epoch": 2.178149258176809, "grad_norm": 0.23448750376701355, "learning_rate": 1.820260089561353e-05, "loss": 1.247, "step": 7313 }, { "epoch": 2.1784471043764775, "grad_norm": 0.2387056052684784, "learning_rate": 1.8202049134444837e-05, "loss": 1.2543, "step": 7314 }, { "epoch": 2.1787449505761463, "grad_norm": 0.2189805805683136, "learning_rate": 1.820149729696524e-05, "loss": 1.2484, "step": 7315 }, { "epoch": 2.179042796775815, "grad_norm": 0.22526004910469055, "learning_rate": 1.820094538317987e-05, "loss": 1.2571, "step": 7316 }, { "epoch": 2.1793406429754834, "grad_norm": 0.23390717804431915, "learning_rate": 1.820039339309387e-05, "loss": 1.2411, "step": 7317 }, { "epoch": 2.179638489175152, "grad_norm": 0.22101286053657532, "learning_rate": 1.8199841326712368e-05, "loss": 1.2657, "step": 7318 }, { "epoch": 2.179936335374821, "grad_norm": 0.24784044921398163, "learning_rate": 1.8199289184040507e-05, "loss": 1.2403, "step": 7319 }, { "epoch": 2.1802341815744897, "grad_norm": 0.22040356695652008, "learning_rate": 1.819873696508342e-05, "loss": 1.2663, "step": 7320 }, { "epoch": 2.180532027774158, "grad_norm": 0.23102213442325592, "learning_rate": 1.8198184669846243e-05, "loss": 1.2511, "step": 7321 }, { "epoch": 2.1808298739738268, "grad_norm": 0.233845517039299, "learning_rate": 1.819763229833412e-05, "loss": 1.2594, "step": 7322 }, { "epoch": 2.1811277201734955, "grad_norm": 0.22688744962215424, "learning_rate": 1.8197079850552188e-05, "loss": 1.2525, "step": 7323 }, { "epoch": 2.181425566373164, "grad_norm": 0.21882183849811554, "learning_rate": 1.8196527326505585e-05, "loss": 1.2547, "step": 7324 }, { "epoch": 2.1817234125728326, "grad_norm": 0.23973548412322998, "learning_rate": 1.8195974726199454e-05, "loss": 1.2478, "step": 7325 }, { "epoch": 2.1820212587725014, "grad_norm": 0.23163484036922455, "learning_rate": 1.8195422049638935e-05, "loss": 1.2569, "step": 7326 }, { "epoch": 2.18231910497217, "grad_norm": 0.23362986743450165, "learning_rate": 1.8194869296829167e-05, "loss": 1.2596, "step": 7327 }, { "epoch": 2.1826169511718385, "grad_norm": 0.2282031625509262, "learning_rate": 1.81943164677753e-05, "loss": 1.2459, "step": 7328 }, { "epoch": 2.1829147973715073, "grad_norm": 0.22614647448062897, "learning_rate": 1.819376356248247e-05, "loss": 1.2479, "step": 7329 }, { "epoch": 2.183212643571176, "grad_norm": 0.22355595231056213, "learning_rate": 1.819321058095583e-05, "loss": 1.2726, "step": 7330 }, { "epoch": 2.1835104897708444, "grad_norm": 0.2299981415271759, "learning_rate": 1.8192657523200514e-05, "loss": 1.255, "step": 7331 }, { "epoch": 2.183808335970513, "grad_norm": 0.22681733965873718, "learning_rate": 1.8192104389221677e-05, "loss": 1.2557, "step": 7332 }, { "epoch": 2.184106182170182, "grad_norm": 0.2261442244052887, "learning_rate": 1.8191551179024462e-05, "loss": 1.2635, "step": 7333 }, { "epoch": 2.1844040283698507, "grad_norm": 0.2253769338130951, "learning_rate": 1.819099789261401e-05, "loss": 1.266, "step": 7334 }, { "epoch": 2.184701874569519, "grad_norm": 0.224657341837883, "learning_rate": 1.819044452999548e-05, "loss": 1.2601, "step": 7335 }, { "epoch": 2.184999720769188, "grad_norm": 0.22582083940505981, "learning_rate": 1.818989109117401e-05, "loss": 1.2542, "step": 7336 }, { "epoch": 2.1852975669688566, "grad_norm": 0.22553744912147522, "learning_rate": 1.818933757615476e-05, "loss": 1.2606, "step": 7337 }, { "epoch": 2.185595413168525, "grad_norm": 0.2209191471338272, "learning_rate": 1.8188783984942865e-05, "loss": 1.2512, "step": 7338 }, { "epoch": 2.1858932593681937, "grad_norm": 0.23108714818954468, "learning_rate": 1.818823031754349e-05, "loss": 1.2642, "step": 7339 }, { "epoch": 2.1861911055678624, "grad_norm": 0.22774134576320648, "learning_rate": 1.818767657396178e-05, "loss": 1.2722, "step": 7340 }, { "epoch": 2.186488951767531, "grad_norm": 0.22755049169063568, "learning_rate": 1.8187122754202884e-05, "loss": 1.2588, "step": 7341 }, { "epoch": 2.1867867979671995, "grad_norm": 0.22437641024589539, "learning_rate": 1.818656885827196e-05, "loss": 1.2487, "step": 7342 }, { "epoch": 2.1870846441668683, "grad_norm": 0.23329325020313263, "learning_rate": 1.818601488617416e-05, "loss": 1.2527, "step": 7343 }, { "epoch": 2.187382490366537, "grad_norm": 0.22857099771499634, "learning_rate": 1.8185460837914635e-05, "loss": 1.2625, "step": 7344 }, { "epoch": 2.1876803365662054, "grad_norm": 0.2302483767271042, "learning_rate": 1.8184906713498544e-05, "loss": 1.2655, "step": 7345 }, { "epoch": 2.187978182765874, "grad_norm": 0.23071642220020294, "learning_rate": 1.8184352512931044e-05, "loss": 1.2666, "step": 7346 }, { "epoch": 2.188276028965543, "grad_norm": 0.23209547996520996, "learning_rate": 1.818379823621728e-05, "loss": 1.264, "step": 7347 }, { "epoch": 2.1885738751652117, "grad_norm": 0.2276519536972046, "learning_rate": 1.8183243883362424e-05, "loss": 1.2591, "step": 7348 }, { "epoch": 2.18887172136488, "grad_norm": 0.23050251603126526, "learning_rate": 1.8182689454371622e-05, "loss": 1.25, "step": 7349 }, { "epoch": 2.189169567564549, "grad_norm": 0.2253330498933792, "learning_rate": 1.818213494925004e-05, "loss": 1.2655, "step": 7350 }, { "epoch": 2.1894674137642176, "grad_norm": 0.2334011048078537, "learning_rate": 1.8181580368002833e-05, "loss": 1.2629, "step": 7351 }, { "epoch": 2.1897652599638864, "grad_norm": 0.22390751540660858, "learning_rate": 1.8181025710635163e-05, "loss": 1.249, "step": 7352 }, { "epoch": 2.1900631061635547, "grad_norm": 0.2347511351108551, "learning_rate": 1.8180470977152188e-05, "loss": 1.2778, "step": 7353 }, { "epoch": 2.1903609523632235, "grad_norm": 0.22769004106521606, "learning_rate": 1.8179916167559067e-05, "loss": 1.2693, "step": 7354 }, { "epoch": 2.1906587985628923, "grad_norm": 0.2368936538696289, "learning_rate": 1.817936128186097e-05, "loss": 1.2525, "step": 7355 }, { "epoch": 2.1909566447625606, "grad_norm": 0.2429242581129074, "learning_rate": 1.8178806320063054e-05, "loss": 1.2667, "step": 7356 }, { "epoch": 2.1912544909622294, "grad_norm": 0.2240075021982193, "learning_rate": 1.817825128217048e-05, "loss": 1.2665, "step": 7357 }, { "epoch": 2.191552337161898, "grad_norm": 0.22910600900650024, "learning_rate": 1.8177696168188417e-05, "loss": 1.2589, "step": 7358 }, { "epoch": 2.1918501833615665, "grad_norm": 0.2249964326620102, "learning_rate": 1.817714097812203e-05, "loss": 1.2532, "step": 7359 }, { "epoch": 2.1921480295612352, "grad_norm": 0.23178963363170624, "learning_rate": 1.817658571197648e-05, "loss": 1.2586, "step": 7360 }, { "epoch": 2.192445875760904, "grad_norm": 0.25021347403526306, "learning_rate": 1.8176030369756935e-05, "loss": 1.2577, "step": 7361 }, { "epoch": 2.1927437219605728, "grad_norm": 0.2263617217540741, "learning_rate": 1.8175474951468564e-05, "loss": 1.2564, "step": 7362 }, { "epoch": 2.193041568160241, "grad_norm": 0.24502508342266083, "learning_rate": 1.8174919457116532e-05, "loss": 1.2547, "step": 7363 }, { "epoch": 2.19333941435991, "grad_norm": 0.23017960786819458, "learning_rate": 1.8174363886706004e-05, "loss": 1.255, "step": 7364 }, { "epoch": 2.1936372605595786, "grad_norm": 0.2292788326740265, "learning_rate": 1.8173808240242156e-05, "loss": 1.2461, "step": 7365 }, { "epoch": 2.1939351067592474, "grad_norm": 0.22475972771644592, "learning_rate": 1.817325251773016e-05, "loss": 1.2611, "step": 7366 }, { "epoch": 2.1942329529589157, "grad_norm": 0.23396527767181396, "learning_rate": 1.8172696719175172e-05, "loss": 1.2573, "step": 7367 }, { "epoch": 2.1945307991585845, "grad_norm": 0.24701645970344543, "learning_rate": 1.8172140844582377e-05, "loss": 1.2705, "step": 7368 }, { "epoch": 2.1948286453582533, "grad_norm": 0.24896377325057983, "learning_rate": 1.8171584893956943e-05, "loss": 1.254, "step": 7369 }, { "epoch": 2.1951264915579216, "grad_norm": 0.24997586011886597, "learning_rate": 1.817102886730404e-05, "loss": 1.2639, "step": 7370 }, { "epoch": 2.1954243377575904, "grad_norm": 0.22887980937957764, "learning_rate": 1.817047276462884e-05, "loss": 1.2589, "step": 7371 }, { "epoch": 2.195722183957259, "grad_norm": 0.22723832726478577, "learning_rate": 1.8169916585936523e-05, "loss": 1.2462, "step": 7372 }, { "epoch": 2.1960200301569275, "grad_norm": 0.25205981731414795, "learning_rate": 1.8169360331232258e-05, "loss": 1.2354, "step": 7373 }, { "epoch": 2.1963178763565963, "grad_norm": 0.23514196276664734, "learning_rate": 1.8168804000521222e-05, "loss": 1.2519, "step": 7374 }, { "epoch": 2.196615722556265, "grad_norm": 0.23175553977489471, "learning_rate": 1.8168247593808594e-05, "loss": 1.2463, "step": 7375 }, { "epoch": 2.196913568755934, "grad_norm": 0.22872765362262726, "learning_rate": 1.816769111109955e-05, "loss": 1.2668, "step": 7376 }, { "epoch": 2.197211414955602, "grad_norm": 0.22961491346359253, "learning_rate": 1.816713455239926e-05, "loss": 1.238, "step": 7377 }, { "epoch": 2.197509261155271, "grad_norm": 0.233178973197937, "learning_rate": 1.816657791771291e-05, "loss": 1.2572, "step": 7378 }, { "epoch": 2.1978071073549397, "grad_norm": 0.23030194640159607, "learning_rate": 1.816602120704568e-05, "loss": 1.2474, "step": 7379 }, { "epoch": 2.1981049535546084, "grad_norm": 0.23031818866729736, "learning_rate": 1.8165464420402742e-05, "loss": 1.2478, "step": 7380 }, { "epoch": 2.1984027997542768, "grad_norm": 0.2297825664281845, "learning_rate": 1.816490755778928e-05, "loss": 1.2449, "step": 7381 }, { "epoch": 2.1987006459539455, "grad_norm": 0.2581939995288849, "learning_rate": 1.816435061921048e-05, "loss": 1.2573, "step": 7382 }, { "epoch": 2.1989984921536143, "grad_norm": 0.23687393963336945, "learning_rate": 1.8163793604671516e-05, "loss": 1.2747, "step": 7383 }, { "epoch": 2.1992963383532826, "grad_norm": 0.22601406276226044, "learning_rate": 1.8163236514177575e-05, "loss": 1.258, "step": 7384 }, { "epoch": 2.1995941845529514, "grad_norm": 0.2287813276052475, "learning_rate": 1.816267934773384e-05, "loss": 1.2549, "step": 7385 }, { "epoch": 2.19989203075262, "grad_norm": 0.2404824048280716, "learning_rate": 1.816212210534549e-05, "loss": 1.264, "step": 7386 }, { "epoch": 2.200189876952289, "grad_norm": 0.2244536280632019, "learning_rate": 1.8161564787017716e-05, "loss": 1.2583, "step": 7387 }, { "epoch": 2.2004877231519573, "grad_norm": 0.23286356031894684, "learning_rate": 1.81610073927557e-05, "loss": 1.2674, "step": 7388 }, { "epoch": 2.200785569351626, "grad_norm": 0.2290877103805542, "learning_rate": 1.8160449922564627e-05, "loss": 1.2494, "step": 7389 }, { "epoch": 2.201083415551295, "grad_norm": 0.2407354712486267, "learning_rate": 1.8159892376449685e-05, "loss": 1.2622, "step": 7390 }, { "epoch": 2.201381261750963, "grad_norm": 0.22834835946559906, "learning_rate": 1.815933475441606e-05, "loss": 1.259, "step": 7391 }, { "epoch": 2.201679107950632, "grad_norm": 0.23079723119735718, "learning_rate": 1.8158777056468942e-05, "loss": 1.2622, "step": 7392 }, { "epoch": 2.2019769541503007, "grad_norm": 0.22950461506843567, "learning_rate": 1.815821928261352e-05, "loss": 1.2458, "step": 7393 }, { "epoch": 2.2022748003499695, "grad_norm": 0.2347557693719864, "learning_rate": 1.8157661432854982e-05, "loss": 1.258, "step": 7394 }, { "epoch": 2.202572646549638, "grad_norm": 0.24329420924186707, "learning_rate": 1.8157103507198522e-05, "loss": 1.2569, "step": 7395 }, { "epoch": 2.2028704927493066, "grad_norm": 0.23572076857089996, "learning_rate": 1.8156545505649323e-05, "loss": 1.2505, "step": 7396 }, { "epoch": 2.2031683389489753, "grad_norm": 0.23422683775424957, "learning_rate": 1.815598742821258e-05, "loss": 1.2532, "step": 7397 }, { "epoch": 2.2034661851486437, "grad_norm": 0.22762665152549744, "learning_rate": 1.8155429274893493e-05, "loss": 1.2584, "step": 7398 }, { "epoch": 2.2037640313483124, "grad_norm": 0.2506447732448578, "learning_rate": 1.8154871045697243e-05, "loss": 1.2506, "step": 7399 }, { "epoch": 2.204061877547981, "grad_norm": 0.24103914201259613, "learning_rate": 1.815431274062903e-05, "loss": 1.262, "step": 7400 }, { "epoch": 2.20435972374765, "grad_norm": 0.23970681428909302, "learning_rate": 1.815375435969405e-05, "loss": 1.2573, "step": 7401 }, { "epoch": 2.2046575699473183, "grad_norm": 0.2336505949497223, "learning_rate": 1.8153195902897495e-05, "loss": 1.2459, "step": 7402 }, { "epoch": 2.204955416146987, "grad_norm": 0.27376458048820496, "learning_rate": 1.8152637370244557e-05, "loss": 1.2671, "step": 7403 }, { "epoch": 2.205253262346656, "grad_norm": 0.2327093482017517, "learning_rate": 1.8152078761740438e-05, "loss": 1.2613, "step": 7404 }, { "epoch": 2.205551108546324, "grad_norm": 0.2462681084871292, "learning_rate": 1.815152007739034e-05, "loss": 1.2664, "step": 7405 }, { "epoch": 2.205848954745993, "grad_norm": 0.2664211690425873, "learning_rate": 1.815096131719945e-05, "loss": 1.2528, "step": 7406 }, { "epoch": 2.2061468009456617, "grad_norm": 0.246301531791687, "learning_rate": 1.8150402481172973e-05, "loss": 1.2496, "step": 7407 }, { "epoch": 2.2064446471453305, "grad_norm": 0.25915271043777466, "learning_rate": 1.8149843569316107e-05, "loss": 1.2721, "step": 7408 }, { "epoch": 2.206742493344999, "grad_norm": 0.23023463785648346, "learning_rate": 1.814928458163405e-05, "loss": 1.2565, "step": 7409 }, { "epoch": 2.2070403395446676, "grad_norm": 0.2400447577238083, "learning_rate": 1.8148725518132005e-05, "loss": 1.2508, "step": 7410 }, { "epoch": 2.2073381857443364, "grad_norm": 0.226594939827919, "learning_rate": 1.8148166378815178e-05, "loss": 1.269, "step": 7411 }, { "epoch": 2.2076360319440047, "grad_norm": 0.2427109032869339, "learning_rate": 1.8147607163688763e-05, "loss": 1.2441, "step": 7412 }, { "epoch": 2.2079338781436735, "grad_norm": 0.23050576448440552, "learning_rate": 1.8147047872757964e-05, "loss": 1.2628, "step": 7413 }, { "epoch": 2.2082317243433423, "grad_norm": 0.23181301355361938, "learning_rate": 1.8146488506027996e-05, "loss": 1.2536, "step": 7414 }, { "epoch": 2.208529570543011, "grad_norm": 0.22633394598960876, "learning_rate": 1.8145929063504043e-05, "loss": 1.2426, "step": 7415 }, { "epoch": 2.2088274167426794, "grad_norm": 0.22863715887069702, "learning_rate": 1.814536954519133e-05, "loss": 1.2457, "step": 7416 }, { "epoch": 2.209125262942348, "grad_norm": 0.23071017861366272, "learning_rate": 1.8144809951095052e-05, "loss": 1.2585, "step": 7417 }, { "epoch": 2.209423109142017, "grad_norm": 0.2343638688325882, "learning_rate": 1.8144250281220412e-05, "loss": 1.2429, "step": 7418 }, { "epoch": 2.2097209553416857, "grad_norm": 0.23640675842761993, "learning_rate": 1.814369053557263e-05, "loss": 1.2481, "step": 7419 }, { "epoch": 2.210018801541354, "grad_norm": 0.23015083372592926, "learning_rate": 1.81431307141569e-05, "loss": 1.2751, "step": 7420 }, { "epoch": 2.2103166477410228, "grad_norm": 0.22590556740760803, "learning_rate": 1.814257081697844e-05, "loss": 1.2459, "step": 7421 }, { "epoch": 2.2106144939406915, "grad_norm": 0.23690733313560486, "learning_rate": 1.8142010844042454e-05, "loss": 1.2473, "step": 7422 }, { "epoch": 2.21091234014036, "grad_norm": 0.22188520431518555, "learning_rate": 1.8141450795354155e-05, "loss": 1.2513, "step": 7423 }, { "epoch": 2.2112101863400286, "grad_norm": 0.2462684065103531, "learning_rate": 1.8140890670918755e-05, "loss": 1.2594, "step": 7424 }, { "epoch": 2.2115080325396974, "grad_norm": 0.23598931729793549, "learning_rate": 1.814033047074146e-05, "loss": 1.2569, "step": 7425 }, { "epoch": 2.2118058787393657, "grad_norm": 0.24653668701648712, "learning_rate": 1.8139770194827485e-05, "loss": 1.2657, "step": 7426 }, { "epoch": 2.2121037249390345, "grad_norm": 0.27435415983200073, "learning_rate": 1.8139209843182043e-05, "loss": 1.2661, "step": 7427 }, { "epoch": 2.2124015711387033, "grad_norm": 0.22956374287605286, "learning_rate": 1.8138649415810348e-05, "loss": 1.2546, "step": 7428 }, { "epoch": 2.212699417338372, "grad_norm": 0.33536839485168457, "learning_rate": 1.813808891271761e-05, "loss": 1.2586, "step": 7429 }, { "epoch": 2.2129972635380404, "grad_norm": 0.2709580063819885, "learning_rate": 1.8137528333909048e-05, "loss": 1.276, "step": 7430 }, { "epoch": 2.213295109737709, "grad_norm": 0.2404656857252121, "learning_rate": 1.813696767938988e-05, "loss": 1.2687, "step": 7431 }, { "epoch": 2.213592955937378, "grad_norm": 0.23287378251552582, "learning_rate": 1.8136406949165315e-05, "loss": 1.2365, "step": 7432 }, { "epoch": 2.2138908021370467, "grad_norm": 0.255045086145401, "learning_rate": 1.8135846143240575e-05, "loss": 1.2567, "step": 7433 }, { "epoch": 2.214188648336715, "grad_norm": 0.234689861536026, "learning_rate": 1.8135285261620882e-05, "loss": 1.2636, "step": 7434 }, { "epoch": 2.214486494536384, "grad_norm": 0.23581352829933167, "learning_rate": 1.8134724304311443e-05, "loss": 1.2582, "step": 7435 }, { "epoch": 2.2147843407360526, "grad_norm": 0.2270398736000061, "learning_rate": 1.8134163271317483e-05, "loss": 1.2559, "step": 7436 }, { "epoch": 2.215082186935721, "grad_norm": 0.24003979563713074, "learning_rate": 1.8133602162644225e-05, "loss": 1.2608, "step": 7437 }, { "epoch": 2.2153800331353897, "grad_norm": 0.23116470873355865, "learning_rate": 1.813304097829688e-05, "loss": 1.2734, "step": 7438 }, { "epoch": 2.2156778793350584, "grad_norm": 0.25134536623954773, "learning_rate": 1.813247971828068e-05, "loss": 1.2421, "step": 7439 }, { "epoch": 2.2159757255347268, "grad_norm": 0.2228419929742813, "learning_rate": 1.8131918382600843e-05, "loss": 1.2741, "step": 7440 }, { "epoch": 2.2162735717343955, "grad_norm": 0.22911174595355988, "learning_rate": 1.813135697126259e-05, "loss": 1.2747, "step": 7441 }, { "epoch": 2.2165714179340643, "grad_norm": 0.22643044590950012, "learning_rate": 1.8130795484271147e-05, "loss": 1.2472, "step": 7442 }, { "epoch": 2.216869264133733, "grad_norm": 0.2381702959537506, "learning_rate": 1.8130233921631733e-05, "loss": 1.26, "step": 7443 }, { "epoch": 2.2171671103334014, "grad_norm": 0.2607860863208771, "learning_rate": 1.8129672283349577e-05, "loss": 1.2545, "step": 7444 }, { "epoch": 2.21746495653307, "grad_norm": 0.22908510267734528, "learning_rate": 1.8129110569429906e-05, "loss": 1.2681, "step": 7445 }, { "epoch": 2.217762802732739, "grad_norm": 0.2557067573070526, "learning_rate": 1.812854877987794e-05, "loss": 1.2713, "step": 7446 }, { "epoch": 2.2180606489324077, "grad_norm": 0.24011479318141937, "learning_rate": 1.812798691469891e-05, "loss": 1.2477, "step": 7447 }, { "epoch": 2.218358495132076, "grad_norm": 0.2298644781112671, "learning_rate": 1.8127424973898046e-05, "loss": 1.2533, "step": 7448 }, { "epoch": 2.218656341331745, "grad_norm": 0.23344075679779053, "learning_rate": 1.8126862957480572e-05, "loss": 1.2462, "step": 7449 }, { "epoch": 2.2189541875314136, "grad_norm": 0.24125604331493378, "learning_rate": 1.8126300865451716e-05, "loss": 1.2507, "step": 7450 }, { "epoch": 2.219252033731082, "grad_norm": 0.24440434575080872, "learning_rate": 1.812573869781671e-05, "loss": 1.256, "step": 7451 }, { "epoch": 2.2195498799307507, "grad_norm": 0.24648922681808472, "learning_rate": 1.8125176454580785e-05, "loss": 1.2559, "step": 7452 }, { "epoch": 2.2198477261304195, "grad_norm": 0.3261624276638031, "learning_rate": 1.812461413574917e-05, "loss": 1.2543, "step": 7453 }, { "epoch": 2.2201455723300882, "grad_norm": 0.2771762013435364, "learning_rate": 1.81240517413271e-05, "loss": 1.245, "step": 7454 }, { "epoch": 2.2204434185297566, "grad_norm": 0.2676098942756653, "learning_rate": 1.81234892713198e-05, "loss": 1.2548, "step": 7455 }, { "epoch": 2.2207412647294253, "grad_norm": 0.33100032806396484, "learning_rate": 1.8122926725732513e-05, "loss": 1.2602, "step": 7456 }, { "epoch": 2.221039110929094, "grad_norm": 0.23672987520694733, "learning_rate": 1.812236410457047e-05, "loss": 1.2741, "step": 7457 }, { "epoch": 2.2213369571287624, "grad_norm": 0.238791361451149, "learning_rate": 1.8121801407838903e-05, "loss": 1.2719, "step": 7458 }, { "epoch": 2.221634803328431, "grad_norm": 0.23731465637683868, "learning_rate": 1.8121238635543043e-05, "loss": 1.2602, "step": 7459 }, { "epoch": 2.2219326495281, "grad_norm": 0.22845380008220673, "learning_rate": 1.8120675787688134e-05, "loss": 1.256, "step": 7460 }, { "epoch": 2.2222304957277688, "grad_norm": 0.24039316177368164, "learning_rate": 1.812011286427941e-05, "loss": 1.2407, "step": 7461 }, { "epoch": 2.222528341927437, "grad_norm": 0.22779229283332825, "learning_rate": 1.811954986532211e-05, "loss": 1.2619, "step": 7462 }, { "epoch": 2.222826188127106, "grad_norm": 0.2333533763885498, "learning_rate": 1.8118986790821468e-05, "loss": 1.2559, "step": 7463 }, { "epoch": 2.2231240343267746, "grad_norm": 0.23861506581306458, "learning_rate": 1.8118423640782724e-05, "loss": 1.2662, "step": 7464 }, { "epoch": 2.223421880526443, "grad_norm": 0.22525864839553833, "learning_rate": 1.811786041521112e-05, "loss": 1.2527, "step": 7465 }, { "epoch": 2.2237197267261117, "grad_norm": 0.23941968381404877, "learning_rate": 1.8117297114111894e-05, "loss": 1.2569, "step": 7466 }, { "epoch": 2.2240175729257805, "grad_norm": 0.2459057718515396, "learning_rate": 1.8116733737490292e-05, "loss": 1.2558, "step": 7467 }, { "epoch": 2.2243154191254493, "grad_norm": 0.2310280203819275, "learning_rate": 1.8116170285351545e-05, "loss": 1.2646, "step": 7468 }, { "epoch": 2.2246132653251176, "grad_norm": 0.22232785820960999, "learning_rate": 1.81156067577009e-05, "loss": 1.2502, "step": 7469 }, { "epoch": 2.2249111115247864, "grad_norm": 0.23295165598392487, "learning_rate": 1.811504315454361e-05, "loss": 1.2465, "step": 7470 }, { "epoch": 2.225208957724455, "grad_norm": 0.24131250381469727, "learning_rate": 1.8114479475884906e-05, "loss": 1.2508, "step": 7471 }, { "epoch": 2.2255068039241235, "grad_norm": 0.24913492798805237, "learning_rate": 1.8113915721730036e-05, "loss": 1.254, "step": 7472 }, { "epoch": 2.2258046501237922, "grad_norm": 0.24638479948043823, "learning_rate": 1.8113351892084242e-05, "loss": 1.2634, "step": 7473 }, { "epoch": 2.226102496323461, "grad_norm": 0.22303815186023712, "learning_rate": 1.8112787986952776e-05, "loss": 1.2719, "step": 7474 }, { "epoch": 2.22640034252313, "grad_norm": 0.2328311800956726, "learning_rate": 1.8112224006340887e-05, "loss": 1.2465, "step": 7475 }, { "epoch": 2.226698188722798, "grad_norm": 0.23571890592575073, "learning_rate": 1.811165995025381e-05, "loss": 1.2768, "step": 7476 }, { "epoch": 2.226996034922467, "grad_norm": 0.25322532653808594, "learning_rate": 1.8111095818696805e-05, "loss": 1.2453, "step": 7477 }, { "epoch": 2.2272938811221357, "grad_norm": 0.276750385761261, "learning_rate": 1.8110531611675112e-05, "loss": 1.2583, "step": 7478 }, { "epoch": 2.227591727321804, "grad_norm": 0.28301140666007996, "learning_rate": 1.8109967329193986e-05, "loss": 1.2395, "step": 7479 }, { "epoch": 2.2278895735214728, "grad_norm": 0.24565206468105316, "learning_rate": 1.8109402971258676e-05, "loss": 1.2594, "step": 7480 }, { "epoch": 2.2281874197211415, "grad_norm": 0.3332265615463257, "learning_rate": 1.8108838537874428e-05, "loss": 1.2633, "step": 7481 }, { "epoch": 2.2284852659208103, "grad_norm": 0.2968139946460724, "learning_rate": 1.81082740290465e-05, "loss": 1.2752, "step": 7482 }, { "epoch": 2.2287831121204786, "grad_norm": 0.2606860399246216, "learning_rate": 1.810770944478014e-05, "loss": 1.2588, "step": 7483 }, { "epoch": 2.2290809583201474, "grad_norm": 0.2384524643421173, "learning_rate": 1.8107144785080604e-05, "loss": 1.2421, "step": 7484 }, { "epoch": 2.229378804519816, "grad_norm": 0.28585508465766907, "learning_rate": 1.810658004995314e-05, "loss": 1.2511, "step": 7485 }, { "epoch": 2.229676650719485, "grad_norm": 0.29733842611312866, "learning_rate": 1.810601523940301e-05, "loss": 1.2553, "step": 7486 }, { "epoch": 2.2299744969191533, "grad_norm": 0.23562990128993988, "learning_rate": 1.8105450353435463e-05, "loss": 1.2835, "step": 7487 }, { "epoch": 2.230272343118822, "grad_norm": 0.23298020660877228, "learning_rate": 1.8104885392055755e-05, "loss": 1.2612, "step": 7488 }, { "epoch": 2.230570189318491, "grad_norm": 0.23380275070667267, "learning_rate": 1.8104320355269145e-05, "loss": 1.2566, "step": 7489 }, { "epoch": 2.230868035518159, "grad_norm": 0.2470693737268448, "learning_rate": 1.8103755243080893e-05, "loss": 1.2517, "step": 7490 }, { "epoch": 2.231165881717828, "grad_norm": 0.22034691274166107, "learning_rate": 1.8103190055496246e-05, "loss": 1.2417, "step": 7491 }, { "epoch": 2.2314637279174967, "grad_norm": 0.23405425250530243, "learning_rate": 1.8102624792520472e-05, "loss": 1.2783, "step": 7492 }, { "epoch": 2.231761574117165, "grad_norm": 0.2418336421251297, "learning_rate": 1.8102059454158824e-05, "loss": 1.2496, "step": 7493 }, { "epoch": 2.232059420316834, "grad_norm": 0.23358824849128723, "learning_rate": 1.8101494040416566e-05, "loss": 1.2414, "step": 7494 }, { "epoch": 2.2323572665165026, "grad_norm": 0.2360301911830902, "learning_rate": 1.810092855129896e-05, "loss": 1.2538, "step": 7495 }, { "epoch": 2.2326551127161713, "grad_norm": 0.2237144261598587, "learning_rate": 1.8100362986811262e-05, "loss": 1.2534, "step": 7496 }, { "epoch": 2.2329529589158397, "grad_norm": 0.23044529557228088, "learning_rate": 1.809979734695874e-05, "loss": 1.2422, "step": 7497 }, { "epoch": 2.2332508051155084, "grad_norm": 0.24516120553016663, "learning_rate": 1.809923163174665e-05, "loss": 1.2648, "step": 7498 }, { "epoch": 2.233548651315177, "grad_norm": 0.24140438437461853, "learning_rate": 1.8098665841180262e-05, "loss": 1.2455, "step": 7499 }, { "epoch": 2.233846497514846, "grad_norm": 0.22544613480567932, "learning_rate": 1.8098099975264834e-05, "loss": 1.2529, "step": 7500 }, { "epoch": 2.233846497514846, "eval_loss": 1.343520164489746, "eval_runtime": 20.7156, "eval_samples_per_second": 83.705, "eval_steps_per_second": 5.262, "step": 7500 }, { "epoch": 2.2341443437145143, "grad_norm": 0.23673491179943085, "learning_rate": 1.8097534034005636e-05, "loss": 1.2488, "step": 7501 }, { "epoch": 2.234442189914183, "grad_norm": 0.22079885005950928, "learning_rate": 1.809696801740793e-05, "loss": 1.2609, "step": 7502 }, { "epoch": 2.234740036113852, "grad_norm": 0.2393401712179184, "learning_rate": 1.809640192547698e-05, "loss": 1.2668, "step": 7503 }, { "epoch": 2.23503788231352, "grad_norm": 0.23307958245277405, "learning_rate": 1.809583575821806e-05, "loss": 1.238, "step": 7504 }, { "epoch": 2.235335728513189, "grad_norm": 0.2271372377872467, "learning_rate": 1.809526951563643e-05, "loss": 1.2503, "step": 7505 }, { "epoch": 2.2356335747128577, "grad_norm": 0.23181135952472687, "learning_rate": 1.8094703197737364e-05, "loss": 1.2561, "step": 7506 }, { "epoch": 2.2359314209125265, "grad_norm": 0.22521166503429413, "learning_rate": 1.809413680452613e-05, "loss": 1.2544, "step": 7507 }, { "epoch": 2.236229267112195, "grad_norm": 0.22727955877780914, "learning_rate": 1.8093570336007996e-05, "loss": 1.2475, "step": 7508 }, { "epoch": 2.2365271133118636, "grad_norm": 0.22818197309970856, "learning_rate": 1.8093003792188227e-05, "loss": 1.2526, "step": 7509 }, { "epoch": 2.2368249595115324, "grad_norm": 0.23357020318508148, "learning_rate": 1.8092437173072105e-05, "loss": 1.2754, "step": 7510 }, { "epoch": 2.2371228057112007, "grad_norm": 0.23212124407291412, "learning_rate": 1.8091870478664898e-05, "loss": 1.2467, "step": 7511 }, { "epoch": 2.2374206519108695, "grad_norm": 0.2305627465248108, "learning_rate": 1.809130370897187e-05, "loss": 1.282, "step": 7512 }, { "epoch": 2.2377184981105382, "grad_norm": 0.23068830370903015, "learning_rate": 1.8090736863998307e-05, "loss": 1.2518, "step": 7513 }, { "epoch": 2.238016344310207, "grad_norm": 0.23182810842990875, "learning_rate": 1.8090169943749477e-05, "loss": 1.2579, "step": 7514 }, { "epoch": 2.2383141905098753, "grad_norm": 0.21896368265151978, "learning_rate": 1.8089602948230653e-05, "loss": 1.2613, "step": 7515 }, { "epoch": 2.238612036709544, "grad_norm": 0.23936767876148224, "learning_rate": 1.8089035877447114e-05, "loss": 1.2482, "step": 7516 }, { "epoch": 2.238909882909213, "grad_norm": 0.22754739224910736, "learning_rate": 1.808846873140413e-05, "loss": 1.2672, "step": 7517 }, { "epoch": 2.239207729108881, "grad_norm": 0.2435189187526703, "learning_rate": 1.808790151010698e-05, "loss": 1.2532, "step": 7518 }, { "epoch": 2.23950557530855, "grad_norm": 0.2510552406311035, "learning_rate": 1.808733421356095e-05, "loss": 1.2701, "step": 7519 }, { "epoch": 2.2398034215082188, "grad_norm": 0.22681453824043274, "learning_rate": 1.8086766841771305e-05, "loss": 1.2505, "step": 7520 }, { "epoch": 2.2401012677078875, "grad_norm": 0.25242292881011963, "learning_rate": 1.808619939474333e-05, "loss": 1.2735, "step": 7521 }, { "epoch": 2.240399113907556, "grad_norm": 0.2699648141860962, "learning_rate": 1.8085631872482306e-05, "loss": 1.2509, "step": 7522 }, { "epoch": 2.2406969601072246, "grad_norm": 0.22968195378780365, "learning_rate": 1.8085064274993507e-05, "loss": 1.2615, "step": 7523 }, { "epoch": 2.2409948063068934, "grad_norm": 0.22539275884628296, "learning_rate": 1.8084496602282223e-05, "loss": 1.2528, "step": 7524 }, { "epoch": 2.2412926525065617, "grad_norm": 0.24039646983146667, "learning_rate": 1.8083928854353732e-05, "loss": 1.2634, "step": 7525 }, { "epoch": 2.2415904987062305, "grad_norm": 0.23207874596118927, "learning_rate": 1.808336103121331e-05, "loss": 1.2787, "step": 7526 }, { "epoch": 2.2418883449058993, "grad_norm": 0.23065651953220367, "learning_rate": 1.808279313286625e-05, "loss": 1.2545, "step": 7527 }, { "epoch": 2.242186191105568, "grad_norm": 0.23620425164699554, "learning_rate": 1.8082225159317827e-05, "loss": 1.2435, "step": 7528 }, { "epoch": 2.2424840373052364, "grad_norm": 0.22482654452323914, "learning_rate": 1.8081657110573327e-05, "loss": 1.2587, "step": 7529 }, { "epoch": 2.242781883504905, "grad_norm": 0.228297159075737, "learning_rate": 1.808108898663804e-05, "loss": 1.2665, "step": 7530 }, { "epoch": 2.243079729704574, "grad_norm": 0.23502565920352936, "learning_rate": 1.808052078751725e-05, "loss": 1.2735, "step": 7531 }, { "epoch": 2.2433775759042422, "grad_norm": 0.24171894788742065, "learning_rate": 1.8079952513216238e-05, "loss": 1.2627, "step": 7532 }, { "epoch": 2.243675422103911, "grad_norm": 0.22185218334197998, "learning_rate": 1.8079384163740296e-05, "loss": 1.2512, "step": 7533 }, { "epoch": 2.24397326830358, "grad_norm": 0.22304552793502808, "learning_rate": 1.8078815739094714e-05, "loss": 1.2444, "step": 7534 }, { "epoch": 2.2442711145032486, "grad_norm": 0.26106178760528564, "learning_rate": 1.807824723928478e-05, "loss": 1.2564, "step": 7535 }, { "epoch": 2.244568960702917, "grad_norm": 0.37261083722114563, "learning_rate": 1.8077678664315775e-05, "loss": 1.2324, "step": 7536 }, { "epoch": 2.2448668069025857, "grad_norm": 0.29698267579078674, "learning_rate": 1.8077110014192997e-05, "loss": 1.274, "step": 7537 }, { "epoch": 2.2451646531022544, "grad_norm": 0.2707362473011017, "learning_rate": 1.8076541288921733e-05, "loss": 1.251, "step": 7538 }, { "epoch": 2.2454624993019228, "grad_norm": 0.2235717624425888, "learning_rate": 1.807597248850728e-05, "loss": 1.2532, "step": 7539 }, { "epoch": 2.2457603455015915, "grad_norm": 0.4530438780784607, "learning_rate": 1.8075403612954926e-05, "loss": 1.2535, "step": 7540 }, { "epoch": 2.2460581917012603, "grad_norm": 0.23756301403045654, "learning_rate": 1.8074834662269957e-05, "loss": 1.261, "step": 7541 }, { "epoch": 2.246356037900929, "grad_norm": 0.23115527629852295, "learning_rate": 1.807426563645768e-05, "loss": 1.2605, "step": 7542 }, { "epoch": 2.2466538841005974, "grad_norm": 0.22856971621513367, "learning_rate": 1.8073696535523383e-05, "loss": 1.2416, "step": 7543 }, { "epoch": 2.246951730300266, "grad_norm": 0.2366601526737213, "learning_rate": 1.8073127359472355e-05, "loss": 1.245, "step": 7544 }, { "epoch": 2.247249576499935, "grad_norm": 0.22170701622962952, "learning_rate": 1.8072558108309902e-05, "loss": 1.2462, "step": 7545 }, { "epoch": 2.2475474226996033, "grad_norm": 0.23127001523971558, "learning_rate": 1.8071988782041308e-05, "loss": 1.2541, "step": 7546 }, { "epoch": 2.247845268899272, "grad_norm": 0.22585774958133698, "learning_rate": 1.8071419380671883e-05, "loss": 1.2622, "step": 7547 }, { "epoch": 2.248143115098941, "grad_norm": 0.2309402972459793, "learning_rate": 1.8070849904206916e-05, "loss": 1.2733, "step": 7548 }, { "epoch": 2.2484409612986096, "grad_norm": 0.22721268236637115, "learning_rate": 1.807028035265171e-05, "loss": 1.2347, "step": 7549 }, { "epoch": 2.248738807498278, "grad_norm": 0.2178564965724945, "learning_rate": 1.806971072601156e-05, "loss": 1.2525, "step": 7550 }, { "epoch": 2.2490366536979467, "grad_norm": 0.23110532760620117, "learning_rate": 1.8069141024291768e-05, "loss": 1.2599, "step": 7551 }, { "epoch": 2.2493344998976155, "grad_norm": 0.2313280552625656, "learning_rate": 1.8068571247497636e-05, "loss": 1.246, "step": 7552 }, { "epoch": 2.2496323460972842, "grad_norm": 0.22944484651088715, "learning_rate": 1.806800139563446e-05, "loss": 1.2483, "step": 7553 }, { "epoch": 2.2499301922969526, "grad_norm": 0.23092588782310486, "learning_rate": 1.806743146870755e-05, "loss": 1.2585, "step": 7554 }, { "epoch": 2.2502280384966213, "grad_norm": 0.22942329943180084, "learning_rate": 1.80668614667222e-05, "loss": 1.2677, "step": 7555 }, { "epoch": 2.25052588469629, "grad_norm": 0.22068588435649872, "learning_rate": 1.8066291389683717e-05, "loss": 1.2677, "step": 7556 }, { "epoch": 2.2508237308959584, "grad_norm": 0.22939462959766388, "learning_rate": 1.8065721237597403e-05, "loss": 1.2464, "step": 7557 }, { "epoch": 2.251121577095627, "grad_norm": 0.2231801152229309, "learning_rate": 1.806515101046857e-05, "loss": 1.2502, "step": 7558 }, { "epoch": 2.251419423295296, "grad_norm": 0.22507058084011078, "learning_rate": 1.806458070830251e-05, "loss": 1.2541, "step": 7559 }, { "epoch": 2.2517172694949643, "grad_norm": 0.22735922038555145, "learning_rate": 1.806401033110454e-05, "loss": 1.2492, "step": 7560 }, { "epoch": 2.252015115694633, "grad_norm": 0.22040493786334991, "learning_rate": 1.806343987887997e-05, "loss": 1.2396, "step": 7561 }, { "epoch": 2.252312961894302, "grad_norm": 0.23119086027145386, "learning_rate": 1.8062869351634095e-05, "loss": 1.2757, "step": 7562 }, { "epoch": 2.2526108080939706, "grad_norm": 0.24003298580646515, "learning_rate": 1.806229874937223e-05, "loss": 1.2753, "step": 7563 }, { "epoch": 2.252908654293639, "grad_norm": 0.2215633988380432, "learning_rate": 1.8061728072099682e-05, "loss": 1.2654, "step": 7564 }, { "epoch": 2.2532065004933077, "grad_norm": 0.22103771567344666, "learning_rate": 1.806115731982176e-05, "loss": 1.2547, "step": 7565 }, { "epoch": 2.2535043466929765, "grad_norm": 0.23959650099277496, "learning_rate": 1.8060586492543777e-05, "loss": 1.265, "step": 7566 }, { "epoch": 2.2538021928926453, "grad_norm": 0.23556892573833466, "learning_rate": 1.8060015590271045e-05, "loss": 1.2506, "step": 7567 }, { "epoch": 2.2541000390923136, "grad_norm": 0.22280603647232056, "learning_rate": 1.8059444613008873e-05, "loss": 1.2708, "step": 7568 }, { "epoch": 2.2543978852919824, "grad_norm": 0.2173597514629364, "learning_rate": 1.8058873560762567e-05, "loss": 1.2472, "step": 7569 }, { "epoch": 2.254695731491651, "grad_norm": 0.22834259271621704, "learning_rate": 1.8058302433537454e-05, "loss": 1.2568, "step": 7570 }, { "epoch": 2.2549935776913195, "grad_norm": 0.23622368276119232, "learning_rate": 1.8057731231338836e-05, "loss": 1.265, "step": 7571 }, { "epoch": 2.2552914238909882, "grad_norm": 0.22571687400341034, "learning_rate": 1.8057159954172032e-05, "loss": 1.2505, "step": 7572 }, { "epoch": 2.255589270090657, "grad_norm": 0.22665946185588837, "learning_rate": 1.805658860204236e-05, "loss": 1.2539, "step": 7573 }, { "epoch": 2.2558871162903253, "grad_norm": 0.23141254484653473, "learning_rate": 1.8056017174955127e-05, "loss": 1.2576, "step": 7574 }, { "epoch": 2.256184962489994, "grad_norm": 0.21993815898895264, "learning_rate": 1.805544567291566e-05, "loss": 1.247, "step": 7575 }, { "epoch": 2.256482808689663, "grad_norm": 0.227073073387146, "learning_rate": 1.8054874095929267e-05, "loss": 1.2491, "step": 7576 }, { "epoch": 2.2567806548893317, "grad_norm": 0.2353278547525406, "learning_rate": 1.8054302444001274e-05, "loss": 1.2501, "step": 7577 }, { "epoch": 2.257078501089, "grad_norm": 0.23063941299915314, "learning_rate": 1.805373071713699e-05, "loss": 1.2669, "step": 7578 }, { "epoch": 2.2573763472886688, "grad_norm": 0.23481705784797668, "learning_rate": 1.8053158915341743e-05, "loss": 1.2677, "step": 7579 }, { "epoch": 2.2576741934883375, "grad_norm": 0.22463823854923248, "learning_rate": 1.8052587038620852e-05, "loss": 1.2582, "step": 7580 }, { "epoch": 2.2579720396880063, "grad_norm": 0.22420097887516022, "learning_rate": 1.8052015086979632e-05, "loss": 1.2425, "step": 7581 }, { "epoch": 2.2582698858876746, "grad_norm": 0.23631395399570465, "learning_rate": 1.805144306042341e-05, "loss": 1.2444, "step": 7582 }, { "epoch": 2.2585677320873434, "grad_norm": 0.2266579270362854, "learning_rate": 1.8050870958957504e-05, "loss": 1.2528, "step": 7583 }, { "epoch": 2.258865578287012, "grad_norm": 0.2310836911201477, "learning_rate": 1.805029878258724e-05, "loss": 1.2459, "step": 7584 }, { "epoch": 2.2591634244866805, "grad_norm": 0.23519113659858704, "learning_rate": 1.804972653131794e-05, "loss": 1.2859, "step": 7585 }, { "epoch": 2.2594612706863493, "grad_norm": 0.2356254607439041, "learning_rate": 1.8049154205154928e-05, "loss": 1.2536, "step": 7586 }, { "epoch": 2.259759116886018, "grad_norm": 0.2175767570734024, "learning_rate": 1.804858180410353e-05, "loss": 1.2559, "step": 7587 }, { "epoch": 2.260056963085687, "grad_norm": 0.23540367186069489, "learning_rate": 1.804800932816907e-05, "loss": 1.2646, "step": 7588 }, { "epoch": 2.260354809285355, "grad_norm": 0.22534309327602386, "learning_rate": 1.8047436777356875e-05, "loss": 1.2613, "step": 7589 }, { "epoch": 2.260652655485024, "grad_norm": 0.23676803708076477, "learning_rate": 1.804686415167227e-05, "loss": 1.2586, "step": 7590 }, { "epoch": 2.2609505016846927, "grad_norm": 0.23621177673339844, "learning_rate": 1.804629145112059e-05, "loss": 1.2566, "step": 7591 }, { "epoch": 2.261248347884361, "grad_norm": 0.22155267000198364, "learning_rate": 1.8045718675707152e-05, "loss": 1.2438, "step": 7592 }, { "epoch": 2.26154619408403, "grad_norm": 0.23292042315006256, "learning_rate": 1.8045145825437294e-05, "loss": 1.259, "step": 7593 }, { "epoch": 2.2618440402836986, "grad_norm": 0.22824744880199432, "learning_rate": 1.804457290031634e-05, "loss": 1.2487, "step": 7594 }, { "epoch": 2.2621418864833673, "grad_norm": 0.23176303505897522, "learning_rate": 1.804399990034963e-05, "loss": 1.2323, "step": 7595 }, { "epoch": 2.2624397326830357, "grad_norm": 0.24980776011943817, "learning_rate": 1.804342682554248e-05, "loss": 1.2654, "step": 7596 }, { "epoch": 2.2627375788827044, "grad_norm": 0.24396789073944092, "learning_rate": 1.8042853675900235e-05, "loss": 1.27, "step": 7597 }, { "epoch": 2.263035425082373, "grad_norm": 0.23645758628845215, "learning_rate": 1.8042280451428222e-05, "loss": 1.2403, "step": 7598 }, { "epoch": 2.2633332712820415, "grad_norm": 0.23644089698791504, "learning_rate": 1.8041707152131772e-05, "loss": 1.2389, "step": 7599 }, { "epoch": 2.2636311174817103, "grad_norm": 0.2311485856771469, "learning_rate": 1.804113377801622e-05, "loss": 1.2662, "step": 7600 }, { "epoch": 2.263928963681379, "grad_norm": 0.23153501749038696, "learning_rate": 1.8040560329086908e-05, "loss": 1.2583, "step": 7601 }, { "epoch": 2.264226809881048, "grad_norm": 0.23604245483875275, "learning_rate": 1.8039986805349167e-05, "loss": 1.2504, "step": 7602 }, { "epoch": 2.264524656080716, "grad_norm": 0.22809988260269165, "learning_rate": 1.8039413206808326e-05, "loss": 1.2422, "step": 7603 }, { "epoch": 2.264822502280385, "grad_norm": 0.2308470755815506, "learning_rate": 1.803883953346973e-05, "loss": 1.2558, "step": 7604 }, { "epoch": 2.2651203484800537, "grad_norm": 0.23038657009601593, "learning_rate": 1.803826578533871e-05, "loss": 1.266, "step": 7605 }, { "epoch": 2.2654181946797225, "grad_norm": 0.22560255229473114, "learning_rate": 1.803769196242061e-05, "loss": 1.2569, "step": 7606 }, { "epoch": 2.265716040879391, "grad_norm": 0.23066085577011108, "learning_rate": 1.8037118064720767e-05, "loss": 1.2543, "step": 7607 }, { "epoch": 2.2660138870790596, "grad_norm": 0.2334594577550888, "learning_rate": 1.803654409224452e-05, "loss": 1.2505, "step": 7608 }, { "epoch": 2.2663117332787284, "grad_norm": 0.24509860575199127, "learning_rate": 1.8035970044997212e-05, "loss": 1.2572, "step": 7609 }, { "epoch": 2.2666095794783967, "grad_norm": 0.23509445786476135, "learning_rate": 1.803539592298418e-05, "loss": 1.2586, "step": 7610 }, { "epoch": 2.2669074256780655, "grad_norm": 0.2275058478116989, "learning_rate": 1.803482172621076e-05, "loss": 1.2646, "step": 7611 }, { "epoch": 2.2672052718777342, "grad_norm": 0.2272939532995224, "learning_rate": 1.803424745468231e-05, "loss": 1.2736, "step": 7612 }, { "epoch": 2.2675031180774026, "grad_norm": 0.2450646609067917, "learning_rate": 1.8033673108404157e-05, "loss": 1.2436, "step": 7613 }, { "epoch": 2.2678009642770713, "grad_norm": 0.2289837896823883, "learning_rate": 1.8033098687381656e-05, "loss": 1.2453, "step": 7614 }, { "epoch": 2.26809881047674, "grad_norm": 0.23882275819778442, "learning_rate": 1.8032524191620143e-05, "loss": 1.2587, "step": 7615 }, { "epoch": 2.268396656676409, "grad_norm": 0.23556388914585114, "learning_rate": 1.8031949621124967e-05, "loss": 1.2615, "step": 7616 }, { "epoch": 2.268694502876077, "grad_norm": 0.2411070019006729, "learning_rate": 1.803137497590148e-05, "loss": 1.2636, "step": 7617 }, { "epoch": 2.268992349075746, "grad_norm": 0.23291894793510437, "learning_rate": 1.8030800255955014e-05, "loss": 1.2397, "step": 7618 }, { "epoch": 2.2692901952754148, "grad_norm": 0.22428922355175018, "learning_rate": 1.803022546129093e-05, "loss": 1.2575, "step": 7619 }, { "epoch": 2.2695880414750835, "grad_norm": 0.2388489842414856, "learning_rate": 1.8029650591914566e-05, "loss": 1.2649, "step": 7620 }, { "epoch": 2.269885887674752, "grad_norm": 0.22928275167942047, "learning_rate": 1.8029075647831274e-05, "loss": 1.2601, "step": 7621 }, { "epoch": 2.2701837338744206, "grad_norm": 0.2210404872894287, "learning_rate": 1.8028500629046408e-05, "loss": 1.2673, "step": 7622 }, { "epoch": 2.2704815800740894, "grad_norm": 0.22059838473796844, "learning_rate": 1.802792553556531e-05, "loss": 1.2481, "step": 7623 }, { "epoch": 2.2707794262737577, "grad_norm": 0.22074320912361145, "learning_rate": 1.8027350367393337e-05, "loss": 1.2415, "step": 7624 }, { "epoch": 2.2710772724734265, "grad_norm": 0.24399973452091217, "learning_rate": 1.8026775124535835e-05, "loss": 1.2522, "step": 7625 }, { "epoch": 2.2713751186730953, "grad_norm": 0.23464760184288025, "learning_rate": 1.8026199806998163e-05, "loss": 1.2716, "step": 7626 }, { "epoch": 2.2716729648727636, "grad_norm": 0.2399558275938034, "learning_rate": 1.8025624414785663e-05, "loss": 1.2717, "step": 7627 }, { "epoch": 2.2719708110724324, "grad_norm": 0.23549529910087585, "learning_rate": 1.8025048947903698e-05, "loss": 1.2663, "step": 7628 }, { "epoch": 2.272268657272101, "grad_norm": 0.231827050447464, "learning_rate": 1.802447340635762e-05, "loss": 1.2398, "step": 7629 }, { "epoch": 2.27256650347177, "grad_norm": 0.23063279688358307, "learning_rate": 1.8023897790152778e-05, "loss": 1.245, "step": 7630 }, { "epoch": 2.2728643496714382, "grad_norm": 0.23611456155776978, "learning_rate": 1.8023322099294533e-05, "loss": 1.2436, "step": 7631 }, { "epoch": 2.273162195871107, "grad_norm": 0.22566045820713043, "learning_rate": 1.8022746333788243e-05, "loss": 1.2592, "step": 7632 }, { "epoch": 2.273460042070776, "grad_norm": 0.23024094104766846, "learning_rate": 1.8022170493639258e-05, "loss": 1.2664, "step": 7633 }, { "epoch": 2.2737578882704446, "grad_norm": 0.25157734751701355, "learning_rate": 1.8021594578852942e-05, "loss": 1.2835, "step": 7634 }, { "epoch": 2.274055734470113, "grad_norm": 0.2292707860469818, "learning_rate": 1.802101858943465e-05, "loss": 1.2735, "step": 7635 }, { "epoch": 2.2743535806697817, "grad_norm": 0.23525993525981903, "learning_rate": 1.8020442525389742e-05, "loss": 1.2605, "step": 7636 }, { "epoch": 2.2746514268694504, "grad_norm": 0.24633367359638214, "learning_rate": 1.8019866386723582e-05, "loss": 1.253, "step": 7637 }, { "epoch": 2.2749492730691188, "grad_norm": 0.267260879278183, "learning_rate": 1.801929017344152e-05, "loss": 1.2732, "step": 7638 }, { "epoch": 2.2752471192687875, "grad_norm": 0.236179381608963, "learning_rate": 1.801871388554892e-05, "loss": 1.2599, "step": 7639 }, { "epoch": 2.2755449654684563, "grad_norm": 0.24293148517608643, "learning_rate": 1.801813752305115e-05, "loss": 1.2631, "step": 7640 }, { "epoch": 2.2758428116681246, "grad_norm": 0.2451498657464981, "learning_rate": 1.801756108595357e-05, "loss": 1.246, "step": 7641 }, { "epoch": 2.2761406578677934, "grad_norm": 0.2330862134695053, "learning_rate": 1.801698457426154e-05, "loss": 1.2497, "step": 7642 }, { "epoch": 2.276438504067462, "grad_norm": 0.2343113273382187, "learning_rate": 1.8016407987980427e-05, "loss": 1.2576, "step": 7643 }, { "epoch": 2.276736350267131, "grad_norm": 0.24515904486179352, "learning_rate": 1.8015831327115592e-05, "loss": 1.2512, "step": 7644 }, { "epoch": 2.2770341964667993, "grad_norm": 0.24204449355602264, "learning_rate": 1.8015254591672403e-05, "loss": 1.2312, "step": 7645 }, { "epoch": 2.277332042666468, "grad_norm": 0.2208164781332016, "learning_rate": 1.8014677781656226e-05, "loss": 1.2421, "step": 7646 }, { "epoch": 2.277629888866137, "grad_norm": 0.23881329596042633, "learning_rate": 1.801410089707243e-05, "loss": 1.2381, "step": 7647 }, { "epoch": 2.2779277350658056, "grad_norm": 0.23398655652999878, "learning_rate": 1.8013523937926375e-05, "loss": 1.2551, "step": 7648 }, { "epoch": 2.278225581265474, "grad_norm": 0.22927062213420868, "learning_rate": 1.801294690422343e-05, "loss": 1.2526, "step": 7649 }, { "epoch": 2.2785234274651427, "grad_norm": 0.24090448021888733, "learning_rate": 1.8012369795968972e-05, "loss": 1.2652, "step": 7650 }, { "epoch": 2.2788212736648115, "grad_norm": 0.22747567296028137, "learning_rate": 1.801179261316836e-05, "loss": 1.2431, "step": 7651 }, { "epoch": 2.27911911986448, "grad_norm": 0.24036529660224915, "learning_rate": 1.8011215355826976e-05, "loss": 1.2688, "step": 7652 }, { "epoch": 2.2794169660641486, "grad_norm": 0.2285313606262207, "learning_rate": 1.801063802395018e-05, "loss": 1.2636, "step": 7653 }, { "epoch": 2.2797148122638173, "grad_norm": 0.28348928689956665, "learning_rate": 1.8010060617543346e-05, "loss": 1.2525, "step": 7654 }, { "epoch": 2.280012658463486, "grad_norm": 0.24212366342544556, "learning_rate": 1.8009483136611847e-05, "loss": 1.2716, "step": 7655 }, { "epoch": 2.2803105046631544, "grad_norm": 0.2637227475643158, "learning_rate": 1.800890558116106e-05, "loss": 1.2647, "step": 7656 }, { "epoch": 2.280608350862823, "grad_norm": 0.23166510462760925, "learning_rate": 1.8008327951196352e-05, "loss": 1.2686, "step": 7657 }, { "epoch": 2.280906197062492, "grad_norm": 0.28819966316223145, "learning_rate": 1.80077502467231e-05, "loss": 1.2638, "step": 7658 }, { "epoch": 2.2812040432621603, "grad_norm": 0.26619935035705566, "learning_rate": 1.8007172467746677e-05, "loss": 1.2556, "step": 7659 }, { "epoch": 2.281501889461829, "grad_norm": 0.2544212341308594, "learning_rate": 1.8006594614272462e-05, "loss": 1.2497, "step": 7660 }, { "epoch": 2.281799735661498, "grad_norm": 0.226546049118042, "learning_rate": 1.800601668630583e-05, "loss": 1.2616, "step": 7661 }, { "epoch": 2.2820975818611666, "grad_norm": 0.27174046635627747, "learning_rate": 1.8005438683852158e-05, "loss": 1.244, "step": 7662 }, { "epoch": 2.282395428060835, "grad_norm": 0.23841282725334167, "learning_rate": 1.800486060691682e-05, "loss": 1.2624, "step": 7663 }, { "epoch": 2.2826932742605037, "grad_norm": 0.2460107058286667, "learning_rate": 1.8004282455505202e-05, "loss": 1.2618, "step": 7664 }, { "epoch": 2.2829911204601725, "grad_norm": 0.24947495758533478, "learning_rate": 1.800370422962268e-05, "loss": 1.273, "step": 7665 }, { "epoch": 2.283288966659841, "grad_norm": 0.22599336504936218, "learning_rate": 1.8003125929274628e-05, "loss": 1.2569, "step": 7666 }, { "epoch": 2.2835868128595096, "grad_norm": 0.26056331396102905, "learning_rate": 1.8002547554466433e-05, "loss": 1.2623, "step": 7667 }, { "epoch": 2.2838846590591784, "grad_norm": 0.2334567755460739, "learning_rate": 1.800196910520347e-05, "loss": 1.2523, "step": 7668 }, { "epoch": 2.284182505258847, "grad_norm": 0.24040871858596802, "learning_rate": 1.800139058149113e-05, "loss": 1.2563, "step": 7669 }, { "epoch": 2.2844803514585155, "grad_norm": 0.24634890258312225, "learning_rate": 1.8000811983334788e-05, "loss": 1.2549, "step": 7670 }, { "epoch": 2.2847781976581842, "grad_norm": 0.23977108299732208, "learning_rate": 1.8000233310739828e-05, "loss": 1.2502, "step": 7671 }, { "epoch": 2.285076043857853, "grad_norm": 0.2574748694896698, "learning_rate": 1.799965456371164e-05, "loss": 1.2725, "step": 7672 }, { "epoch": 2.285373890057522, "grad_norm": 0.2517244219779968, "learning_rate": 1.7999075742255602e-05, "loss": 1.2605, "step": 7673 }, { "epoch": 2.28567173625719, "grad_norm": 0.23184464871883392, "learning_rate": 1.79984968463771e-05, "loss": 1.2518, "step": 7674 }, { "epoch": 2.285969582456859, "grad_norm": 0.24130327999591827, "learning_rate": 1.799791787608152e-05, "loss": 1.2655, "step": 7675 }, { "epoch": 2.2862674286565277, "grad_norm": 0.23445512354373932, "learning_rate": 1.7997338831374254e-05, "loss": 1.2527, "step": 7676 }, { "epoch": 2.286565274856196, "grad_norm": 0.2374877631664276, "learning_rate": 1.7996759712260683e-05, "loss": 1.26, "step": 7677 }, { "epoch": 2.2868631210558648, "grad_norm": 0.23821516335010529, "learning_rate": 1.7996180518746197e-05, "loss": 1.2467, "step": 7678 }, { "epoch": 2.2871609672555335, "grad_norm": 0.2226201891899109, "learning_rate": 1.7995601250836184e-05, "loss": 1.2445, "step": 7679 }, { "epoch": 2.287458813455202, "grad_norm": 0.2388964295387268, "learning_rate": 1.7995021908536037e-05, "loss": 1.2637, "step": 7680 }, { "epoch": 2.2877566596548706, "grad_norm": 0.22276601195335388, "learning_rate": 1.7994442491851145e-05, "loss": 1.2652, "step": 7681 }, { "epoch": 2.2880545058545394, "grad_norm": 0.31236881017684937, "learning_rate": 1.7993863000786893e-05, "loss": 1.2496, "step": 7682 }, { "epoch": 2.288352352054208, "grad_norm": 0.2572900950908661, "learning_rate": 1.799328343534868e-05, "loss": 1.2493, "step": 7683 }, { "epoch": 2.2886501982538765, "grad_norm": 0.24791483581066132, "learning_rate": 1.7992703795541895e-05, "loss": 1.2462, "step": 7684 }, { "epoch": 2.2889480444535453, "grad_norm": 0.23369233310222626, "learning_rate": 1.799212408137193e-05, "loss": 1.2589, "step": 7685 }, { "epoch": 2.289245890653214, "grad_norm": 0.253845751285553, "learning_rate": 1.799154429284418e-05, "loss": 1.2519, "step": 7686 }, { "epoch": 2.289543736852883, "grad_norm": 0.2269347906112671, "learning_rate": 1.799096442996404e-05, "loss": 1.2529, "step": 7687 }, { "epoch": 2.289841583052551, "grad_norm": 0.24698413908481598, "learning_rate": 1.7990384492736903e-05, "loss": 1.2442, "step": 7688 }, { "epoch": 2.29013942925222, "grad_norm": 0.23516568541526794, "learning_rate": 1.7989804481168168e-05, "loss": 1.2529, "step": 7689 }, { "epoch": 2.2904372754518887, "grad_norm": 0.22608694434165955, "learning_rate": 1.798922439526323e-05, "loss": 1.2546, "step": 7690 }, { "epoch": 2.290735121651557, "grad_norm": 0.23483608663082123, "learning_rate": 1.7988644235027478e-05, "loss": 1.2534, "step": 7691 }, { "epoch": 2.291032967851226, "grad_norm": 0.24747517704963684, "learning_rate": 1.7988064000466322e-05, "loss": 1.2554, "step": 7692 }, { "epoch": 2.2913308140508946, "grad_norm": 0.22679506242275238, "learning_rate": 1.7987483691585156e-05, "loss": 1.2411, "step": 7693 }, { "epoch": 2.291628660250563, "grad_norm": 0.26324447989463806, "learning_rate": 1.7986903308389375e-05, "loss": 1.2531, "step": 7694 }, { "epoch": 2.2919265064502317, "grad_norm": 0.23506203293800354, "learning_rate": 1.7986322850884387e-05, "loss": 1.2573, "step": 7695 }, { "epoch": 2.2922243526499004, "grad_norm": 0.2332557588815689, "learning_rate": 1.7985742319075584e-05, "loss": 1.2588, "step": 7696 }, { "epoch": 2.292522198849569, "grad_norm": 0.23972269892692566, "learning_rate": 1.7985161712968372e-05, "loss": 1.2614, "step": 7697 }, { "epoch": 2.2928200450492375, "grad_norm": 0.23484289646148682, "learning_rate": 1.798458103256815e-05, "loss": 1.2537, "step": 7698 }, { "epoch": 2.2931178912489063, "grad_norm": 0.22184839844703674, "learning_rate": 1.7984000277880325e-05, "loss": 1.2558, "step": 7699 }, { "epoch": 2.293415737448575, "grad_norm": 0.2516103982925415, "learning_rate": 1.7983419448910298e-05, "loss": 1.2572, "step": 7700 }, { "epoch": 2.293713583648244, "grad_norm": 0.2929055392742157, "learning_rate": 1.798283854566347e-05, "loss": 1.2588, "step": 7701 }, { "epoch": 2.294011429847912, "grad_norm": 0.2357298880815506, "learning_rate": 1.798225756814525e-05, "loss": 1.2466, "step": 7702 }, { "epoch": 2.294309276047581, "grad_norm": 0.23832914233207703, "learning_rate": 1.798167651636104e-05, "loss": 1.2322, "step": 7703 }, { "epoch": 2.2946071222472497, "grad_norm": 0.24009771645069122, "learning_rate": 1.798109539031625e-05, "loss": 1.2525, "step": 7704 }, { "epoch": 2.294904968446918, "grad_norm": 0.2367497831583023, "learning_rate": 1.7980514190016283e-05, "loss": 1.2573, "step": 7705 }, { "epoch": 2.295202814646587, "grad_norm": 0.23563018441200256, "learning_rate": 1.797993291546655e-05, "loss": 1.2634, "step": 7706 }, { "epoch": 2.2955006608462556, "grad_norm": 0.23822346329689026, "learning_rate": 1.7979351566672454e-05, "loss": 1.2506, "step": 7707 }, { "epoch": 2.295798507045924, "grad_norm": 0.24498026072978973, "learning_rate": 1.797877014363941e-05, "loss": 1.2654, "step": 7708 }, { "epoch": 2.2960963532455927, "grad_norm": 0.3724459111690521, "learning_rate": 1.7978188646372818e-05, "loss": 1.2516, "step": 7709 }, { "epoch": 2.2963941994452615, "grad_norm": 0.2689630389213562, "learning_rate": 1.79776070748781e-05, "loss": 1.2433, "step": 7710 }, { "epoch": 2.2966920456449302, "grad_norm": 0.2751366198062897, "learning_rate": 1.797702542916066e-05, "loss": 1.2307, "step": 7711 }, { "epoch": 2.2969898918445986, "grad_norm": 0.298320472240448, "learning_rate": 1.7976443709225912e-05, "loss": 1.263, "step": 7712 }, { "epoch": 2.2972877380442673, "grad_norm": 0.29242566227912903, "learning_rate": 1.7975861915079263e-05, "loss": 1.247, "step": 7713 }, { "epoch": 2.297585584243936, "grad_norm": 0.23475217819213867, "learning_rate": 1.7975280046726135e-05, "loss": 1.2609, "step": 7714 }, { "epoch": 2.297883430443605, "grad_norm": 0.26329460740089417, "learning_rate": 1.7974698104171934e-05, "loss": 1.2601, "step": 7715 }, { "epoch": 2.298181276643273, "grad_norm": 0.22359760105609894, "learning_rate": 1.797411608742208e-05, "loss": 1.2702, "step": 7716 }, { "epoch": 2.298479122842942, "grad_norm": 0.2478565275669098, "learning_rate": 1.797353399648198e-05, "loss": 1.2587, "step": 7717 }, { "epoch": 2.2987769690426108, "grad_norm": 0.2553616464138031, "learning_rate": 1.7972951831357056e-05, "loss": 1.2609, "step": 7718 }, { "epoch": 2.299074815242279, "grad_norm": 0.22546374797821045, "learning_rate": 1.7972369592052726e-05, "loss": 1.2599, "step": 7719 }, { "epoch": 2.299372661441948, "grad_norm": 0.24717041850090027, "learning_rate": 1.79717872785744e-05, "loss": 1.2556, "step": 7720 }, { "epoch": 2.2996705076416166, "grad_norm": 0.23478208482265472, "learning_rate": 1.79712048909275e-05, "loss": 1.2639, "step": 7721 }, { "epoch": 2.2999683538412854, "grad_norm": 0.33374375104904175, "learning_rate": 1.797062242911745e-05, "loss": 1.2531, "step": 7722 }, { "epoch": 2.3002662000409537, "grad_norm": 0.24589771032333374, "learning_rate": 1.797003989314966e-05, "loss": 1.2477, "step": 7723 }, { "epoch": 2.3005640462406225, "grad_norm": 0.24167700111865997, "learning_rate": 1.7969457283029554e-05, "loss": 1.2493, "step": 7724 }, { "epoch": 2.3008618924402913, "grad_norm": 0.229723259806633, "learning_rate": 1.796887459876255e-05, "loss": 1.2463, "step": 7725 }, { "epoch": 2.30115973863996, "grad_norm": 0.23125611245632172, "learning_rate": 1.7968291840354073e-05, "loss": 1.2453, "step": 7726 }, { "epoch": 2.3014575848396284, "grad_norm": 0.23347200453281403, "learning_rate": 1.7967709007809544e-05, "loss": 1.2579, "step": 7727 }, { "epoch": 2.301755431039297, "grad_norm": 0.2410845160484314, "learning_rate": 1.7967126101134386e-05, "loss": 1.2632, "step": 7728 }, { "epoch": 2.302053277238966, "grad_norm": 0.22133436799049377, "learning_rate": 1.7966543120334016e-05, "loss": 1.2621, "step": 7729 }, { "epoch": 2.3023511234386342, "grad_norm": 0.24848565459251404, "learning_rate": 1.796596006541387e-05, "loss": 1.2479, "step": 7730 }, { "epoch": 2.302648969638303, "grad_norm": 0.2426963597536087, "learning_rate": 1.7965376936379358e-05, "loss": 1.2469, "step": 7731 }, { "epoch": 2.302946815837972, "grad_norm": 0.23645684123039246, "learning_rate": 1.7964793733235916e-05, "loss": 1.2509, "step": 7732 }, { "epoch": 2.30324466203764, "grad_norm": 0.233712837100029, "learning_rate": 1.796421045598897e-05, "loss": 1.2359, "step": 7733 }, { "epoch": 2.303542508237309, "grad_norm": 0.2517712116241455, "learning_rate": 1.796362710464394e-05, "loss": 1.2602, "step": 7734 }, { "epoch": 2.3038403544369777, "grad_norm": 0.24844689667224884, "learning_rate": 1.796304367920626e-05, "loss": 1.2486, "step": 7735 }, { "epoch": 2.3041382006366464, "grad_norm": 0.2320019155740738, "learning_rate": 1.7962460179681357e-05, "loss": 1.2745, "step": 7736 }, { "epoch": 2.3044360468363148, "grad_norm": 0.2363097369670868, "learning_rate": 1.796187660607465e-05, "loss": 1.2531, "step": 7737 }, { "epoch": 2.3047338930359835, "grad_norm": 0.24142713844776154, "learning_rate": 1.7961292958391585e-05, "loss": 1.2652, "step": 7738 }, { "epoch": 2.3050317392356523, "grad_norm": 0.23480938374996185, "learning_rate": 1.796070923663758e-05, "loss": 1.2615, "step": 7739 }, { "epoch": 2.305329585435321, "grad_norm": 0.23894020915031433, "learning_rate": 1.7960125440818073e-05, "loss": 1.2363, "step": 7740 }, { "epoch": 2.3056274316349894, "grad_norm": 0.27292248606681824, "learning_rate": 1.7959541570938487e-05, "loss": 1.2273, "step": 7741 }, { "epoch": 2.305925277834658, "grad_norm": 0.25207120180130005, "learning_rate": 1.7958957627004265e-05, "loss": 1.2726, "step": 7742 }, { "epoch": 2.306223124034327, "grad_norm": 0.23265618085861206, "learning_rate": 1.7958373609020833e-05, "loss": 1.2501, "step": 7743 }, { "epoch": 2.3065209702339953, "grad_norm": 0.24379467964172363, "learning_rate": 1.7957789516993623e-05, "loss": 1.2544, "step": 7744 }, { "epoch": 2.306818816433664, "grad_norm": 0.33975672721862793, "learning_rate": 1.7957205350928076e-05, "loss": 1.2577, "step": 7745 }, { "epoch": 2.307116662633333, "grad_norm": 0.353736013174057, "learning_rate": 1.7956621110829624e-05, "loss": 1.2658, "step": 7746 }, { "epoch": 2.307414508833001, "grad_norm": 0.27018821239471436, "learning_rate": 1.79560367967037e-05, "loss": 1.2548, "step": 7747 }, { "epoch": 2.30771235503267, "grad_norm": 0.621097207069397, "learning_rate": 1.7955452408555744e-05, "loss": 1.2628, "step": 7748 }, { "epoch": 2.3080102012323387, "grad_norm": 0.2451144903898239, "learning_rate": 1.7954867946391192e-05, "loss": 1.27, "step": 7749 }, { "epoch": 2.3083080474320075, "grad_norm": 0.2555001974105835, "learning_rate": 1.795428341021548e-05, "loss": 1.2554, "step": 7750 }, { "epoch": 2.308605893631676, "grad_norm": 0.25533974170684814, "learning_rate": 1.795369880003405e-05, "loss": 1.2467, "step": 7751 }, { "epoch": 2.3089037398313446, "grad_norm": 0.22745493054389954, "learning_rate": 1.7953114115852336e-05, "loss": 1.2617, "step": 7752 }, { "epoch": 2.3092015860310133, "grad_norm": 0.22543756663799286, "learning_rate": 1.7952529357675783e-05, "loss": 1.2543, "step": 7753 }, { "epoch": 2.309499432230682, "grad_norm": 0.26410216093063354, "learning_rate": 1.795194452550983e-05, "loss": 1.2555, "step": 7754 }, { "epoch": 2.3097972784303504, "grad_norm": 0.24005134403705597, "learning_rate": 1.7951359619359917e-05, "loss": 1.2544, "step": 7755 }, { "epoch": 2.310095124630019, "grad_norm": 0.2326568216085434, "learning_rate": 1.7950774639231486e-05, "loss": 1.2521, "step": 7756 }, { "epoch": 2.310392970829688, "grad_norm": 0.21785187721252441, "learning_rate": 1.7950189585129977e-05, "loss": 1.2254, "step": 7757 }, { "epoch": 2.3106908170293563, "grad_norm": 0.2363487184047699, "learning_rate": 1.7949604457060845e-05, "loss": 1.2518, "step": 7758 }, { "epoch": 2.310988663229025, "grad_norm": 0.23210032284259796, "learning_rate": 1.7949019255029517e-05, "loss": 1.272, "step": 7759 }, { "epoch": 2.311286509428694, "grad_norm": 0.23723196983337402, "learning_rate": 1.794843397904145e-05, "loss": 1.2384, "step": 7760 }, { "epoch": 2.311584355628362, "grad_norm": 0.22842244803905487, "learning_rate": 1.7947848629102082e-05, "loss": 1.2448, "step": 7761 }, { "epoch": 2.311882201828031, "grad_norm": 0.22675757110118866, "learning_rate": 1.7947263205216864e-05, "loss": 1.2597, "step": 7762 }, { "epoch": 2.3121800480276997, "grad_norm": 0.246642604470253, "learning_rate": 1.7946677707391244e-05, "loss": 1.2428, "step": 7763 }, { "epoch": 2.3124778942273685, "grad_norm": 0.22891004383563995, "learning_rate": 1.7946092135630665e-05, "loss": 1.2654, "step": 7764 }, { "epoch": 2.312775740427037, "grad_norm": 0.23032474517822266, "learning_rate": 1.7945506489940573e-05, "loss": 1.2489, "step": 7765 }, { "epoch": 2.3130735866267056, "grad_norm": 0.2301180064678192, "learning_rate": 1.7944920770326422e-05, "loss": 1.2439, "step": 7766 }, { "epoch": 2.3133714328263744, "grad_norm": 0.22702735662460327, "learning_rate": 1.794433497679366e-05, "loss": 1.2643, "step": 7767 }, { "epoch": 2.313669279026043, "grad_norm": 0.22810158133506775, "learning_rate": 1.7943749109347742e-05, "loss": 1.2681, "step": 7768 }, { "epoch": 2.3139671252257115, "grad_norm": 0.2263939380645752, "learning_rate": 1.7943163167994108e-05, "loss": 1.2475, "step": 7769 }, { "epoch": 2.3142649714253802, "grad_norm": 0.23496605455875397, "learning_rate": 1.7942577152738218e-05, "loss": 1.2654, "step": 7770 }, { "epoch": 2.314562817625049, "grad_norm": 0.23172979056835175, "learning_rate": 1.794199106358552e-05, "loss": 1.255, "step": 7771 }, { "epoch": 2.3148606638247173, "grad_norm": 0.23240801692008972, "learning_rate": 1.794140490054147e-05, "loss": 1.2627, "step": 7772 }, { "epoch": 2.315158510024386, "grad_norm": 0.23094674944877625, "learning_rate": 1.794081866361152e-05, "loss": 1.2584, "step": 7773 }, { "epoch": 2.315456356224055, "grad_norm": 0.23189008235931396, "learning_rate": 1.794023235280112e-05, "loss": 1.2571, "step": 7774 }, { "epoch": 2.315754202423723, "grad_norm": 0.22598449885845184, "learning_rate": 1.7939645968115734e-05, "loss": 1.258, "step": 7775 }, { "epoch": 2.316052048623392, "grad_norm": 0.2240254282951355, "learning_rate": 1.793905950956081e-05, "loss": 1.2351, "step": 7776 }, { "epoch": 2.3163498948230608, "grad_norm": 0.22057874500751495, "learning_rate": 1.7938472977141814e-05, "loss": 1.2552, "step": 7777 }, { "epoch": 2.3166477410227295, "grad_norm": 0.21995866298675537, "learning_rate": 1.793788637086419e-05, "loss": 1.2783, "step": 7778 }, { "epoch": 2.316945587222398, "grad_norm": 0.23166123032569885, "learning_rate": 1.7937299690733402e-05, "loss": 1.2578, "step": 7779 }, { "epoch": 2.3172434334220666, "grad_norm": 0.23654724657535553, "learning_rate": 1.793671293675491e-05, "loss": 1.2624, "step": 7780 }, { "epoch": 2.3175412796217354, "grad_norm": 0.22917701303958893, "learning_rate": 1.7936126108934174e-05, "loss": 1.2544, "step": 7781 }, { "epoch": 2.317839125821404, "grad_norm": 0.23114502429962158, "learning_rate": 1.7935539207276648e-05, "loss": 1.271, "step": 7782 }, { "epoch": 2.3181369720210725, "grad_norm": 0.23824429512023926, "learning_rate": 1.7934952231787797e-05, "loss": 1.2563, "step": 7783 }, { "epoch": 2.3184348182207413, "grad_norm": 0.23625443875789642, "learning_rate": 1.7934365182473082e-05, "loss": 1.2535, "step": 7784 }, { "epoch": 2.31873266442041, "grad_norm": 0.2254304736852646, "learning_rate": 1.793377805933796e-05, "loss": 1.2567, "step": 7785 }, { "epoch": 2.3190305106200784, "grad_norm": 0.2224108725786209, "learning_rate": 1.79331908623879e-05, "loss": 1.2665, "step": 7786 }, { "epoch": 2.319328356819747, "grad_norm": 0.22796858847141266, "learning_rate": 1.7932603591628363e-05, "loss": 1.2637, "step": 7787 }, { "epoch": 2.319626203019416, "grad_norm": 0.22788214683532715, "learning_rate": 1.793201624706481e-05, "loss": 1.2578, "step": 7788 }, { "epoch": 2.3199240492190847, "grad_norm": 0.22808735072612762, "learning_rate": 1.793142882870271e-05, "loss": 1.2542, "step": 7789 }, { "epoch": 2.320221895418753, "grad_norm": 0.21906575560569763, "learning_rate": 1.7930841336547525e-05, "loss": 1.2411, "step": 7790 }, { "epoch": 2.320519741618422, "grad_norm": 0.22968780994415283, "learning_rate": 1.7930253770604726e-05, "loss": 1.2583, "step": 7791 }, { "epoch": 2.3208175878180906, "grad_norm": 0.2303999662399292, "learning_rate": 1.792966613087977e-05, "loss": 1.2609, "step": 7792 }, { "epoch": 2.3211154340177593, "grad_norm": 0.2238573133945465, "learning_rate": 1.7929078417378135e-05, "loss": 1.2414, "step": 7793 }, { "epoch": 2.3214132802174277, "grad_norm": 0.22726310789585114, "learning_rate": 1.792849063010528e-05, "loss": 1.256, "step": 7794 }, { "epoch": 2.3217111264170964, "grad_norm": 0.22456595301628113, "learning_rate": 1.7927902769066682e-05, "loss": 1.2592, "step": 7795 }, { "epoch": 2.322008972616765, "grad_norm": 0.2278863936662674, "learning_rate": 1.7927314834267804e-05, "loss": 1.2402, "step": 7796 }, { "epoch": 2.3223068188164335, "grad_norm": 0.2422952800989151, "learning_rate": 1.7926726825714117e-05, "loss": 1.2547, "step": 7797 }, { "epoch": 2.3226046650161023, "grad_norm": 0.22926056385040283, "learning_rate": 1.7926138743411095e-05, "loss": 1.2543, "step": 7798 }, { "epoch": 2.322902511215771, "grad_norm": 0.23135961592197418, "learning_rate": 1.792555058736421e-05, "loss": 1.2626, "step": 7799 }, { "epoch": 2.3232003574154394, "grad_norm": 0.22803236544132233, "learning_rate": 1.7924962357578928e-05, "loss": 1.2424, "step": 7800 }, { "epoch": 2.323498203615108, "grad_norm": 0.22060105204582214, "learning_rate": 1.7924374054060725e-05, "loss": 1.2479, "step": 7801 }, { "epoch": 2.323796049814777, "grad_norm": 0.2315826714038849, "learning_rate": 1.7923785676815078e-05, "loss": 1.2546, "step": 7802 }, { "epoch": 2.3240938960144457, "grad_norm": 0.2271241992712021, "learning_rate": 1.7923197225847457e-05, "loss": 1.2718, "step": 7803 }, { "epoch": 2.324391742214114, "grad_norm": 0.23083896934986115, "learning_rate": 1.792260870116334e-05, "loss": 1.2639, "step": 7804 }, { "epoch": 2.324689588413783, "grad_norm": 0.2252204567193985, "learning_rate": 1.7922020102768197e-05, "loss": 1.2534, "step": 7805 }, { "epoch": 2.3249874346134516, "grad_norm": 0.21705453097820282, "learning_rate": 1.792143143066751e-05, "loss": 1.2344, "step": 7806 }, { "epoch": 2.3252852808131204, "grad_norm": 0.23861151933670044, "learning_rate": 1.7920842684866756e-05, "loss": 1.2557, "step": 7807 }, { "epoch": 2.3255831270127887, "grad_norm": 0.22201643884181976, "learning_rate": 1.7920253865371407e-05, "loss": 1.236, "step": 7808 }, { "epoch": 2.3258809732124575, "grad_norm": 0.22820638120174408, "learning_rate": 1.7919664972186946e-05, "loss": 1.2531, "step": 7809 }, { "epoch": 2.3261788194121262, "grad_norm": 0.23169077932834625, "learning_rate": 1.7919076005318852e-05, "loss": 1.2574, "step": 7810 }, { "epoch": 2.3264766656117946, "grad_norm": 0.2316010743379593, "learning_rate": 1.79184869647726e-05, "loss": 1.2705, "step": 7811 }, { "epoch": 2.3267745118114633, "grad_norm": 0.22276555001735687, "learning_rate": 1.791789785055368e-05, "loss": 1.2446, "step": 7812 }, { "epoch": 2.327072358011132, "grad_norm": 0.22381189465522766, "learning_rate": 1.791730866266756e-05, "loss": 1.2644, "step": 7813 }, { "epoch": 2.3273702042108004, "grad_norm": 0.23511162400245667, "learning_rate": 1.7916719401119733e-05, "loss": 1.2624, "step": 7814 }, { "epoch": 2.327668050410469, "grad_norm": 0.21948397159576416, "learning_rate": 1.7916130065915675e-05, "loss": 1.2624, "step": 7815 }, { "epoch": 2.327965896610138, "grad_norm": 0.22322896122932434, "learning_rate": 1.7915540657060873e-05, "loss": 1.2579, "step": 7816 }, { "epoch": 2.3282637428098067, "grad_norm": 0.22729644179344177, "learning_rate": 1.791495117456081e-05, "loss": 1.2524, "step": 7817 }, { "epoch": 2.328561589009475, "grad_norm": 0.22460895776748657, "learning_rate": 1.7914361618420966e-05, "loss": 1.267, "step": 7818 }, { "epoch": 2.328859435209144, "grad_norm": 0.2263486236333847, "learning_rate": 1.7913771988646832e-05, "loss": 1.2576, "step": 7819 }, { "epoch": 2.3291572814088126, "grad_norm": 0.22927837073802948, "learning_rate": 1.791318228524389e-05, "loss": 1.2536, "step": 7820 }, { "epoch": 2.3294551276084814, "grad_norm": 0.23089389503002167, "learning_rate": 1.7912592508217627e-05, "loss": 1.2668, "step": 7821 }, { "epoch": 2.3297529738081497, "grad_norm": 0.23298054933547974, "learning_rate": 1.7912002657573533e-05, "loss": 1.253, "step": 7822 }, { "epoch": 2.3300508200078185, "grad_norm": 0.23412643373012543, "learning_rate": 1.7911412733317096e-05, "loss": 1.2693, "step": 7823 }, { "epoch": 2.3303486662074873, "grad_norm": 0.21775412559509277, "learning_rate": 1.7910822735453797e-05, "loss": 1.2557, "step": 7824 }, { "epoch": 2.3306465124071556, "grad_norm": 0.22747136652469635, "learning_rate": 1.7910232663989135e-05, "loss": 1.2553, "step": 7825 }, { "epoch": 2.3309443586068244, "grad_norm": 0.22300876677036285, "learning_rate": 1.7909642518928593e-05, "loss": 1.2591, "step": 7826 }, { "epoch": 2.331242204806493, "grad_norm": 0.22167746722698212, "learning_rate": 1.790905230027767e-05, "loss": 1.2485, "step": 7827 }, { "epoch": 2.3315400510061615, "grad_norm": 0.22107626497745514, "learning_rate": 1.790846200804185e-05, "loss": 1.2575, "step": 7828 }, { "epoch": 2.3318378972058302, "grad_norm": 0.2367164045572281, "learning_rate": 1.790787164222662e-05, "loss": 1.2628, "step": 7829 }, { "epoch": 2.332135743405499, "grad_norm": 0.22507604956626892, "learning_rate": 1.790728120283749e-05, "loss": 1.2579, "step": 7830 }, { "epoch": 2.332433589605168, "grad_norm": 0.22825001180171967, "learning_rate": 1.7906690689879935e-05, "loss": 1.2661, "step": 7831 }, { "epoch": 2.332731435804836, "grad_norm": 0.2338763177394867, "learning_rate": 1.790610010335946e-05, "loss": 1.2355, "step": 7832 }, { "epoch": 2.333029282004505, "grad_norm": 0.22398051619529724, "learning_rate": 1.790550944328156e-05, "loss": 1.2439, "step": 7833 }, { "epoch": 2.3333271282041737, "grad_norm": 0.2237871140241623, "learning_rate": 1.7904918709651723e-05, "loss": 1.2651, "step": 7834 }, { "epoch": 2.3336249744038424, "grad_norm": 0.22252443432807922, "learning_rate": 1.790432790247545e-05, "loss": 1.2581, "step": 7835 }, { "epoch": 2.3339228206035108, "grad_norm": 0.21413569152355194, "learning_rate": 1.7903737021758237e-05, "loss": 1.2473, "step": 7836 }, { "epoch": 2.3342206668031795, "grad_norm": 0.2181364744901657, "learning_rate": 1.7903146067505582e-05, "loss": 1.2618, "step": 7837 }, { "epoch": 2.3345185130028483, "grad_norm": 0.22701337933540344, "learning_rate": 1.790255503972298e-05, "loss": 1.2466, "step": 7838 }, { "epoch": 2.3348163592025166, "grad_norm": 0.2238103300333023, "learning_rate": 1.7901963938415935e-05, "loss": 1.2459, "step": 7839 }, { "epoch": 2.3351142054021854, "grad_norm": 0.230665922164917, "learning_rate": 1.7901372763589947e-05, "loss": 1.2502, "step": 7840 }, { "epoch": 2.335412051601854, "grad_norm": 0.2242044061422348, "learning_rate": 1.790078151525051e-05, "loss": 1.2501, "step": 7841 }, { "epoch": 2.3357098978015225, "grad_norm": 0.23318134248256683, "learning_rate": 1.790019019340313e-05, "loss": 1.2467, "step": 7842 }, { "epoch": 2.3360077440011913, "grad_norm": 0.2387678474187851, "learning_rate": 1.7899598798053306e-05, "loss": 1.2434, "step": 7843 }, { "epoch": 2.33630559020086, "grad_norm": 0.22364795207977295, "learning_rate": 1.789900732920654e-05, "loss": 1.2656, "step": 7844 }, { "epoch": 2.336603436400529, "grad_norm": 0.22308523952960968, "learning_rate": 1.7898415786868338e-05, "loss": 1.2808, "step": 7845 }, { "epoch": 2.336901282600197, "grad_norm": 0.23038873076438904, "learning_rate": 1.78978241710442e-05, "loss": 1.2387, "step": 7846 }, { "epoch": 2.337199128799866, "grad_norm": 0.22550682723522186, "learning_rate": 1.7897232481739634e-05, "loss": 1.2587, "step": 7847 }, { "epoch": 2.3374969749995347, "grad_norm": 0.22604312002658844, "learning_rate": 1.789664071896014e-05, "loss": 1.2436, "step": 7848 }, { "epoch": 2.3377948211992035, "grad_norm": 0.2248874008655548, "learning_rate": 1.789604888271123e-05, "loss": 1.2602, "step": 7849 }, { "epoch": 2.338092667398872, "grad_norm": 0.22980491816997528, "learning_rate": 1.7895456972998406e-05, "loss": 1.2608, "step": 7850 }, { "epoch": 2.3383905135985406, "grad_norm": 0.22804193198680878, "learning_rate": 1.7894864989827176e-05, "loss": 1.2568, "step": 7851 }, { "epoch": 2.3386883597982093, "grad_norm": 0.2315458208322525, "learning_rate": 1.7894272933203048e-05, "loss": 1.2587, "step": 7852 }, { "epoch": 2.3389862059978777, "grad_norm": 0.22518876194953918, "learning_rate": 1.7893680803131528e-05, "loss": 1.2641, "step": 7853 }, { "epoch": 2.3392840521975464, "grad_norm": 0.2149897813796997, "learning_rate": 1.789308859961813e-05, "loss": 1.2486, "step": 7854 }, { "epoch": 2.339581898397215, "grad_norm": 0.23355364799499512, "learning_rate": 1.7892496322668363e-05, "loss": 1.2454, "step": 7855 }, { "epoch": 2.339879744596884, "grad_norm": 0.22749406099319458, "learning_rate": 1.7891903972287733e-05, "loss": 1.2615, "step": 7856 }, { "epoch": 2.3401775907965523, "grad_norm": 0.23475691676139832, "learning_rate": 1.7891311548481754e-05, "loss": 1.2476, "step": 7857 }, { "epoch": 2.340475436996221, "grad_norm": 0.24041210114955902, "learning_rate": 1.789071905125594e-05, "loss": 1.2372, "step": 7858 }, { "epoch": 2.34077328319589, "grad_norm": 0.24427035450935364, "learning_rate": 1.78901264806158e-05, "loss": 1.2559, "step": 7859 }, { "epoch": 2.3410711293955586, "grad_norm": 0.2267962247133255, "learning_rate": 1.7889533836566845e-05, "loss": 1.2533, "step": 7860 }, { "epoch": 2.341368975595227, "grad_norm": 0.22979292273521423, "learning_rate": 1.7888941119114597e-05, "loss": 1.2531, "step": 7861 }, { "epoch": 2.3416668217948957, "grad_norm": 0.2190021276473999, "learning_rate": 1.7888348328264563e-05, "loss": 1.2542, "step": 7862 }, { "epoch": 2.3419646679945645, "grad_norm": 0.23141010105609894, "learning_rate": 1.7887755464022265e-05, "loss": 1.2546, "step": 7863 }, { "epoch": 2.342262514194233, "grad_norm": 0.22548598051071167, "learning_rate": 1.7887162526393212e-05, "loss": 1.2325, "step": 7864 }, { "epoch": 2.3425603603939016, "grad_norm": 0.22789525985717773, "learning_rate": 1.7886569515382927e-05, "loss": 1.2457, "step": 7865 }, { "epoch": 2.3428582065935704, "grad_norm": 0.2295307219028473, "learning_rate": 1.7885976430996922e-05, "loss": 1.2633, "step": 7866 }, { "epoch": 2.3431560527932387, "grad_norm": 0.2266976684331894, "learning_rate": 1.7885383273240716e-05, "loss": 1.2625, "step": 7867 }, { "epoch": 2.3434538989929075, "grad_norm": 0.22434692084789276, "learning_rate": 1.7884790042119826e-05, "loss": 1.2661, "step": 7868 }, { "epoch": 2.3437517451925762, "grad_norm": 0.23144297301769257, "learning_rate": 1.7884196737639777e-05, "loss": 1.2534, "step": 7869 }, { "epoch": 2.344049591392245, "grad_norm": 0.24207472801208496, "learning_rate": 1.788360335980609e-05, "loss": 1.2495, "step": 7870 }, { "epoch": 2.3443474375919133, "grad_norm": 0.22699899971485138, "learning_rate": 1.7883009908624276e-05, "loss": 1.2512, "step": 7871 }, { "epoch": 2.344645283791582, "grad_norm": 0.23398232460021973, "learning_rate": 1.7882416384099867e-05, "loss": 1.2441, "step": 7872 }, { "epoch": 2.344943129991251, "grad_norm": 0.2263219654560089, "learning_rate": 1.7881822786238376e-05, "loss": 1.2361, "step": 7873 }, { "epoch": 2.3452409761909196, "grad_norm": 0.22675180435180664, "learning_rate": 1.7881229115045333e-05, "loss": 1.2534, "step": 7874 }, { "epoch": 2.345538822390588, "grad_norm": 0.23145712912082672, "learning_rate": 1.7880635370526257e-05, "loss": 1.2602, "step": 7875 }, { "epoch": 2.3458366685902567, "grad_norm": 0.225954070687294, "learning_rate": 1.7880041552686674e-05, "loss": 1.2608, "step": 7876 }, { "epoch": 2.3461345147899255, "grad_norm": 0.2244177609682083, "learning_rate": 1.7879447661532107e-05, "loss": 1.226, "step": 7877 }, { "epoch": 2.346432360989594, "grad_norm": 0.22313684225082397, "learning_rate": 1.7878853697068085e-05, "loss": 1.2526, "step": 7878 }, { "epoch": 2.3467302071892626, "grad_norm": 0.23761752247810364, "learning_rate": 1.787825965930013e-05, "loss": 1.2514, "step": 7879 }, { "epoch": 2.3470280533889314, "grad_norm": 0.2322452962398529, "learning_rate": 1.7877665548233775e-05, "loss": 1.2581, "step": 7880 }, { "epoch": 2.3473258995885997, "grad_norm": 0.22980079054832458, "learning_rate": 1.7877071363874542e-05, "loss": 1.2764, "step": 7881 }, { "epoch": 2.3476237457882685, "grad_norm": 0.24619711935520172, "learning_rate": 1.7876477106227958e-05, "loss": 1.2592, "step": 7882 }, { "epoch": 2.3479215919879373, "grad_norm": 0.23735593259334564, "learning_rate": 1.7875882775299557e-05, "loss": 1.2743, "step": 7883 }, { "epoch": 2.348219438187606, "grad_norm": 0.22482122480869293, "learning_rate": 1.7875288371094867e-05, "loss": 1.2553, "step": 7884 }, { "epoch": 2.3485172843872744, "grad_norm": 0.2246832549571991, "learning_rate": 1.7874693893619415e-05, "loss": 1.2675, "step": 7885 }, { "epoch": 2.348815130586943, "grad_norm": 0.2285926192998886, "learning_rate": 1.7874099342878733e-05, "loss": 1.26, "step": 7886 }, { "epoch": 2.349112976786612, "grad_norm": 0.2274121195077896, "learning_rate": 1.7873504718878362e-05, "loss": 1.2479, "step": 7887 }, { "epoch": 2.3494108229862807, "grad_norm": 0.2367192953824997, "learning_rate": 1.7872910021623816e-05, "loss": 1.2493, "step": 7888 }, { "epoch": 2.349708669185949, "grad_norm": 0.22948838770389557, "learning_rate": 1.7872315251120643e-05, "loss": 1.2585, "step": 7889 }, { "epoch": 2.350006515385618, "grad_norm": 0.23225794732570648, "learning_rate": 1.7871720407374375e-05, "loss": 1.2211, "step": 7890 }, { "epoch": 2.3503043615852865, "grad_norm": 0.22939930856227875, "learning_rate": 1.787112549039054e-05, "loss": 1.2483, "step": 7891 }, { "epoch": 2.350602207784955, "grad_norm": 0.242367222905159, "learning_rate": 1.787053050017468e-05, "loss": 1.2621, "step": 7892 }, { "epoch": 2.3509000539846236, "grad_norm": 0.23725159466266632, "learning_rate": 1.786993543673232e-05, "loss": 1.2481, "step": 7893 }, { "epoch": 2.3511979001842924, "grad_norm": 0.2657933235168457, "learning_rate": 1.7869340300069012e-05, "loss": 1.2441, "step": 7894 }, { "epoch": 2.3514957463839607, "grad_norm": 0.2347121685743332, "learning_rate": 1.786874509019028e-05, "loss": 1.2523, "step": 7895 }, { "epoch": 2.3517935925836295, "grad_norm": 0.23327769339084625, "learning_rate": 1.7868149807101666e-05, "loss": 1.2558, "step": 7896 }, { "epoch": 2.3520914387832983, "grad_norm": 0.22707679867744446, "learning_rate": 1.786755445080871e-05, "loss": 1.2515, "step": 7897 }, { "epoch": 2.352389284982967, "grad_norm": 0.2269888073205948, "learning_rate": 1.786695902131695e-05, "loss": 1.2621, "step": 7898 }, { "epoch": 2.3526871311826354, "grad_norm": 0.22768308222293854, "learning_rate": 1.7866363518631925e-05, "loss": 1.2543, "step": 7899 }, { "epoch": 2.352984977382304, "grad_norm": 0.21840879321098328, "learning_rate": 1.7865767942759177e-05, "loss": 1.2501, "step": 7900 }, { "epoch": 2.353282823581973, "grad_norm": 0.23985709249973297, "learning_rate": 1.786517229370425e-05, "loss": 1.2419, "step": 7901 }, { "epoch": 2.3535806697816417, "grad_norm": 0.2266675978899002, "learning_rate": 1.7864576571472678e-05, "loss": 1.2625, "step": 7902 }, { "epoch": 2.35387851598131, "grad_norm": 0.23214082419872284, "learning_rate": 1.7863980776070007e-05, "loss": 1.2592, "step": 7903 }, { "epoch": 2.354176362180979, "grad_norm": 0.23541660606861115, "learning_rate": 1.7863384907501784e-05, "loss": 1.2702, "step": 7904 }, { "epoch": 2.3544742083806476, "grad_norm": 0.23487745225429535, "learning_rate": 1.786278896577355e-05, "loss": 1.2521, "step": 7905 }, { "epoch": 2.354772054580316, "grad_norm": 0.23422829806804657, "learning_rate": 1.7862192950890846e-05, "loss": 1.2473, "step": 7906 }, { "epoch": 2.3550699007799847, "grad_norm": 0.2254904955625534, "learning_rate": 1.7861596862859224e-05, "loss": 1.2518, "step": 7907 }, { "epoch": 2.3553677469796535, "grad_norm": 0.23697738349437714, "learning_rate": 1.7861000701684225e-05, "loss": 1.2688, "step": 7908 }, { "epoch": 2.355665593179322, "grad_norm": 0.22762492299079895, "learning_rate": 1.7860404467371398e-05, "loss": 1.2344, "step": 7909 }, { "epoch": 2.3559634393789906, "grad_norm": 0.24504396319389343, "learning_rate": 1.785980815992629e-05, "loss": 1.256, "step": 7910 }, { "epoch": 2.3562612855786593, "grad_norm": 0.2187090963125229, "learning_rate": 1.785921177935445e-05, "loss": 1.2615, "step": 7911 }, { "epoch": 2.356559131778328, "grad_norm": 0.23367871344089508, "learning_rate": 1.785861532566142e-05, "loss": 1.2485, "step": 7912 }, { "epoch": 2.3568569779779964, "grad_norm": 0.23267200589179993, "learning_rate": 1.7858018798852758e-05, "loss": 1.2613, "step": 7913 }, { "epoch": 2.357154824177665, "grad_norm": 0.24347952008247375, "learning_rate": 1.7857422198934012e-05, "loss": 1.2732, "step": 7914 }, { "epoch": 2.357452670377334, "grad_norm": 0.22890640795230865, "learning_rate": 1.785682552591073e-05, "loss": 1.2683, "step": 7915 }, { "epoch": 2.3577505165770027, "grad_norm": 0.2342701554298401, "learning_rate": 1.7856228779788462e-05, "loss": 1.2664, "step": 7916 }, { "epoch": 2.358048362776671, "grad_norm": 0.2646051049232483, "learning_rate": 1.7855631960572764e-05, "loss": 1.2586, "step": 7917 }, { "epoch": 2.35834620897634, "grad_norm": 0.25664758682250977, "learning_rate": 1.7855035068269192e-05, "loss": 1.267, "step": 7918 }, { "epoch": 2.3586440551760086, "grad_norm": 0.25148266553878784, "learning_rate": 1.785443810288329e-05, "loss": 1.2465, "step": 7919 }, { "epoch": 2.358941901375677, "grad_norm": 0.2698698043823242, "learning_rate": 1.7853841064420617e-05, "loss": 1.247, "step": 7920 }, { "epoch": 2.3592397475753457, "grad_norm": 0.2338656485080719, "learning_rate": 1.785324395288673e-05, "loss": 1.2559, "step": 7921 }, { "epoch": 2.3595375937750145, "grad_norm": 0.24385720491409302, "learning_rate": 1.7852646768287182e-05, "loss": 1.2537, "step": 7922 }, { "epoch": 2.3598354399746833, "grad_norm": 0.24207328259944916, "learning_rate": 1.7852049510627526e-05, "loss": 1.2706, "step": 7923 }, { "epoch": 2.3601332861743516, "grad_norm": 0.2335837334394455, "learning_rate": 1.7851452179913327e-05, "loss": 1.2443, "step": 7924 }, { "epoch": 2.3604311323740204, "grad_norm": 0.24712742865085602, "learning_rate": 1.7850854776150136e-05, "loss": 1.2575, "step": 7925 }, { "epoch": 2.360728978573689, "grad_norm": 0.23162207007408142, "learning_rate": 1.785025729934351e-05, "loss": 1.2437, "step": 7926 }, { "epoch": 2.361026824773358, "grad_norm": 0.2364334911108017, "learning_rate": 1.784965974949901e-05, "loss": 1.2456, "step": 7927 }, { "epoch": 2.3613246709730262, "grad_norm": 0.22042107582092285, "learning_rate": 1.7849062126622204e-05, "loss": 1.2403, "step": 7928 }, { "epoch": 2.361622517172695, "grad_norm": 0.2693556249141693, "learning_rate": 1.7848464430718637e-05, "loss": 1.2756, "step": 7929 }, { "epoch": 2.3619203633723638, "grad_norm": 0.2263534814119339, "learning_rate": 1.784786666179388e-05, "loss": 1.258, "step": 7930 }, { "epoch": 2.362218209572032, "grad_norm": 0.25349611043930054, "learning_rate": 1.7847268819853493e-05, "loss": 1.2588, "step": 7931 }, { "epoch": 2.362516055771701, "grad_norm": 0.23080848157405853, "learning_rate": 1.7846670904903032e-05, "loss": 1.2447, "step": 7932 }, { "epoch": 2.3628139019713696, "grad_norm": 0.23246009647846222, "learning_rate": 1.784607291694807e-05, "loss": 1.2686, "step": 7933 }, { "epoch": 2.363111748171038, "grad_norm": 0.23326291143894196, "learning_rate": 1.7845474855994166e-05, "loss": 1.2531, "step": 7934 }, { "epoch": 2.3634095943707067, "grad_norm": 0.23655524849891663, "learning_rate": 1.784487672204688e-05, "loss": 1.2538, "step": 7935 }, { "epoch": 2.3637074405703755, "grad_norm": 0.21734671294689178, "learning_rate": 1.7844278515111785e-05, "loss": 1.2333, "step": 7936 }, { "epoch": 2.3640052867700443, "grad_norm": 0.23813478648662567, "learning_rate": 1.784368023519444e-05, "loss": 1.2702, "step": 7937 }, { "epoch": 2.3643031329697126, "grad_norm": 0.22581513226032257, "learning_rate": 1.7843081882300414e-05, "loss": 1.2531, "step": 7938 }, { "epoch": 2.3646009791693814, "grad_norm": 0.2356468141078949, "learning_rate": 1.7842483456435275e-05, "loss": 1.2545, "step": 7939 }, { "epoch": 2.36489882536905, "grad_norm": 0.23121199011802673, "learning_rate": 1.784188495760459e-05, "loss": 1.2583, "step": 7940 }, { "epoch": 2.365196671568719, "grad_norm": 0.3000968098640442, "learning_rate": 1.7841286385813922e-05, "loss": 1.2641, "step": 7941 }, { "epoch": 2.3654945177683873, "grad_norm": 0.26428544521331787, "learning_rate": 1.7840687741068852e-05, "loss": 1.2495, "step": 7942 }, { "epoch": 2.365792363968056, "grad_norm": 0.2293512225151062, "learning_rate": 1.7840089023374937e-05, "loss": 1.2388, "step": 7943 }, { "epoch": 2.366090210167725, "grad_norm": 0.25424709916114807, "learning_rate": 1.7839490232737756e-05, "loss": 1.246, "step": 7944 }, { "epoch": 2.366388056367393, "grad_norm": 0.22907119989395142, "learning_rate": 1.783889136916288e-05, "loss": 1.2467, "step": 7945 }, { "epoch": 2.366685902567062, "grad_norm": 0.24755051732063293, "learning_rate": 1.7838292432655874e-05, "loss": 1.2576, "step": 7946 }, { "epoch": 2.3669837487667307, "grad_norm": 0.22845561802387238, "learning_rate": 1.7837693423222314e-05, "loss": 1.266, "step": 7947 }, { "epoch": 2.367281594966399, "grad_norm": 0.23387497663497925, "learning_rate": 1.7837094340867775e-05, "loss": 1.2731, "step": 7948 }, { "epoch": 2.3675794411660678, "grad_norm": 0.2515455186367035, "learning_rate": 1.7836495185597828e-05, "loss": 1.2627, "step": 7949 }, { "epoch": 2.3678772873657365, "grad_norm": 0.22718527913093567, "learning_rate": 1.783589595741805e-05, "loss": 1.2673, "step": 7950 }, { "epoch": 2.3681751335654053, "grad_norm": 0.2656766474246979, "learning_rate": 1.7835296656334017e-05, "loss": 1.2578, "step": 7951 }, { "epoch": 2.3684729797650736, "grad_norm": 0.22987836599349976, "learning_rate": 1.78346972823513e-05, "loss": 1.2566, "step": 7952 }, { "epoch": 2.3687708259647424, "grad_norm": 0.2829611301422119, "learning_rate": 1.7834097835475475e-05, "loss": 1.2561, "step": 7953 }, { "epoch": 2.369068672164411, "grad_norm": 0.25986650586128235, "learning_rate": 1.7833498315712126e-05, "loss": 1.2722, "step": 7954 }, { "epoch": 2.36936651836408, "grad_norm": 0.24209918081760406, "learning_rate": 1.783289872306683e-05, "loss": 1.252, "step": 7955 }, { "epoch": 2.3696643645637483, "grad_norm": 0.3400126099586487, "learning_rate": 1.7832299057545158e-05, "loss": 1.245, "step": 7956 }, { "epoch": 2.369962210763417, "grad_norm": 0.27828526496887207, "learning_rate": 1.78316993191527e-05, "loss": 1.247, "step": 7957 }, { "epoch": 2.370260056963086, "grad_norm": 0.27232053875923157, "learning_rate": 1.7831099507895026e-05, "loss": 1.2762, "step": 7958 }, { "epoch": 2.370557903162754, "grad_norm": 0.2513825595378876, "learning_rate": 1.783049962377772e-05, "loss": 1.2554, "step": 7959 }, { "epoch": 2.370855749362423, "grad_norm": 0.28506922721862793, "learning_rate": 1.7829899666806363e-05, "loss": 1.2699, "step": 7960 }, { "epoch": 2.3711535955620917, "grad_norm": 0.2429376244544983, "learning_rate": 1.7829299636986536e-05, "loss": 1.2713, "step": 7961 }, { "epoch": 2.37145144176176, "grad_norm": 0.2558785080909729, "learning_rate": 1.7828699534323828e-05, "loss": 1.245, "step": 7962 }, { "epoch": 2.371749287961429, "grad_norm": 0.23596563935279846, "learning_rate": 1.7828099358823818e-05, "loss": 1.244, "step": 7963 }, { "epoch": 2.3720471341610976, "grad_norm": 0.2452450841665268, "learning_rate": 1.7827499110492086e-05, "loss": 1.2415, "step": 7964 }, { "epoch": 2.3723449803607664, "grad_norm": 0.24221771955490112, "learning_rate": 1.7826898789334223e-05, "loss": 1.2358, "step": 7965 }, { "epoch": 2.3726428265604347, "grad_norm": 0.2362058162689209, "learning_rate": 1.7826298395355806e-05, "loss": 1.2669, "step": 7966 }, { "epoch": 2.3729406727601035, "grad_norm": 0.22643139958381653, "learning_rate": 1.7825697928562433e-05, "loss": 1.245, "step": 7967 }, { "epoch": 2.3732385189597722, "grad_norm": 0.23953528702259064, "learning_rate": 1.7825097388959682e-05, "loss": 1.2377, "step": 7968 }, { "epoch": 2.373536365159441, "grad_norm": 0.24942715466022491, "learning_rate": 1.7824496776553143e-05, "loss": 1.2499, "step": 7969 }, { "epoch": 2.3738342113591093, "grad_norm": 0.23290188610553741, "learning_rate": 1.7823896091348403e-05, "loss": 1.2614, "step": 7970 }, { "epoch": 2.374132057558778, "grad_norm": 0.239968404173851, "learning_rate": 1.782329533335105e-05, "loss": 1.2475, "step": 7971 }, { "epoch": 2.374429903758447, "grad_norm": 0.25155937671661377, "learning_rate": 1.7822694502566675e-05, "loss": 1.2543, "step": 7972 }, { "epoch": 2.374727749958115, "grad_norm": 0.2466360330581665, "learning_rate": 1.7822093599000868e-05, "loss": 1.2458, "step": 7973 }, { "epoch": 2.375025596157784, "grad_norm": 0.23671437799930573, "learning_rate": 1.782149262265922e-05, "loss": 1.2507, "step": 7974 }, { "epoch": 2.3753234423574527, "grad_norm": 0.2404567152261734, "learning_rate": 1.782089157354732e-05, "loss": 1.26, "step": 7975 }, { "epoch": 2.375621288557121, "grad_norm": 0.23791678249835968, "learning_rate": 1.7820290451670767e-05, "loss": 1.234, "step": 7976 }, { "epoch": 2.37591913475679, "grad_norm": 0.23145097494125366, "learning_rate": 1.7819689257035144e-05, "loss": 1.249, "step": 7977 }, { "epoch": 2.3762169809564586, "grad_norm": 0.2443423867225647, "learning_rate": 1.781908798964605e-05, "loss": 1.2532, "step": 7978 }, { "epoch": 2.3765148271561274, "grad_norm": 0.23321878910064697, "learning_rate": 1.781848664950908e-05, "loss": 1.2391, "step": 7979 }, { "epoch": 2.3768126733557957, "grad_norm": 0.21833214163780212, "learning_rate": 1.7817885236629824e-05, "loss": 1.2568, "step": 7980 }, { "epoch": 2.3771105195554645, "grad_norm": 0.23264311254024506, "learning_rate": 1.7817283751013882e-05, "loss": 1.2725, "step": 7981 }, { "epoch": 2.3774083657551333, "grad_norm": 0.23435115814208984, "learning_rate": 1.781668219266685e-05, "loss": 1.246, "step": 7982 }, { "epoch": 2.377706211954802, "grad_norm": 0.22859029471874237, "learning_rate": 1.7816080561594322e-05, "loss": 1.2546, "step": 7983 }, { "epoch": 2.3780040581544704, "grad_norm": 0.23380321264266968, "learning_rate": 1.7815478857801896e-05, "loss": 1.2607, "step": 7984 }, { "epoch": 2.378301904354139, "grad_norm": 0.23703013360500336, "learning_rate": 1.781487708129517e-05, "loss": 1.2652, "step": 7985 }, { "epoch": 2.378599750553808, "grad_norm": 0.23852579295635223, "learning_rate": 1.7814275232079748e-05, "loss": 1.2528, "step": 7986 }, { "epoch": 2.3788975967534762, "grad_norm": 0.2664247751235962, "learning_rate": 1.7813673310161227e-05, "loss": 1.2535, "step": 7987 }, { "epoch": 2.379195442953145, "grad_norm": 0.2276148945093155, "learning_rate": 1.7813071315545202e-05, "loss": 1.2507, "step": 7988 }, { "epoch": 2.3794932891528138, "grad_norm": 0.2421445995569229, "learning_rate": 1.7812469248237277e-05, "loss": 1.2532, "step": 7989 }, { "epoch": 2.3797911353524825, "grad_norm": 0.22952455282211304, "learning_rate": 1.7811867108243056e-05, "loss": 1.2545, "step": 7990 }, { "epoch": 2.380088981552151, "grad_norm": 0.23287591338157654, "learning_rate": 1.781126489556814e-05, "loss": 1.2528, "step": 7991 }, { "epoch": 2.3803868277518196, "grad_norm": 0.24828951060771942, "learning_rate": 1.7810662610218132e-05, "loss": 1.2648, "step": 7992 }, { "epoch": 2.3806846739514884, "grad_norm": 0.24405096471309662, "learning_rate": 1.7810060252198634e-05, "loss": 1.2523, "step": 7993 }, { "epoch": 2.380982520151157, "grad_norm": 0.23873582482337952, "learning_rate": 1.780945782151525e-05, "loss": 1.2441, "step": 7994 }, { "epoch": 2.3812803663508255, "grad_norm": 0.24015365540981293, "learning_rate": 1.7808855318173586e-05, "loss": 1.2482, "step": 7995 }, { "epoch": 2.3815782125504943, "grad_norm": 0.24203690886497498, "learning_rate": 1.780825274217925e-05, "loss": 1.2496, "step": 7996 }, { "epoch": 2.381876058750163, "grad_norm": 0.3029463589191437, "learning_rate": 1.7807650093537844e-05, "loss": 1.2459, "step": 7997 }, { "epoch": 2.3821739049498314, "grad_norm": 0.3219797611236572, "learning_rate": 1.780704737225498e-05, "loss": 1.2558, "step": 7998 }, { "epoch": 2.3824717511495, "grad_norm": 0.2756625711917877, "learning_rate": 1.7806444578336258e-05, "loss": 1.2429, "step": 7999 }, { "epoch": 2.382769597349169, "grad_norm": 0.23735983669757843, "learning_rate": 1.78058417117873e-05, "loss": 1.2544, "step": 8000 }, { "epoch": 2.382769597349169, "eval_loss": 1.3381941318511963, "eval_runtime": 20.3569, "eval_samples_per_second": 85.18, "eval_steps_per_second": 5.354, "step": 8000 }, { "epoch": 2.3830674435488373, "grad_norm": 0.23161669075489044, "learning_rate": 1.7805238772613694e-05, "loss": 1.2388, "step": 8001 }, { "epoch": 2.383365289748506, "grad_norm": 0.25970157980918884, "learning_rate": 1.780463576082107e-05, "loss": 1.2579, "step": 8002 }, { "epoch": 2.383663135948175, "grad_norm": 0.2822204530239105, "learning_rate": 1.7804032676415028e-05, "loss": 1.2651, "step": 8003 }, { "epoch": 2.3839609821478436, "grad_norm": 0.24108311533927917, "learning_rate": 1.780342951940118e-05, "loss": 1.2437, "step": 8004 }, { "epoch": 2.384258828347512, "grad_norm": 0.4408750534057617, "learning_rate": 1.780282628978514e-05, "loss": 1.2393, "step": 8005 }, { "epoch": 2.3845566745471807, "grad_norm": 0.2541865110397339, "learning_rate": 1.7802222987572522e-05, "loss": 1.2684, "step": 8006 }, { "epoch": 2.3848545207468494, "grad_norm": 0.2612745463848114, "learning_rate": 1.780161961276893e-05, "loss": 1.2633, "step": 8007 }, { "epoch": 2.385152366946518, "grad_norm": 0.24160808324813843, "learning_rate": 1.780101616537999e-05, "loss": 1.2692, "step": 8008 }, { "epoch": 2.3854502131461865, "grad_norm": 0.22500190138816833, "learning_rate": 1.7800412645411306e-05, "loss": 1.2589, "step": 8009 }, { "epoch": 2.3857480593458553, "grad_norm": 0.234347864985466, "learning_rate": 1.7799809052868503e-05, "loss": 1.2352, "step": 8010 }, { "epoch": 2.386045905545524, "grad_norm": 0.25336188077926636, "learning_rate": 1.7799205387757186e-05, "loss": 1.2487, "step": 8011 }, { "epoch": 2.3863437517451924, "grad_norm": 0.23233574628829956, "learning_rate": 1.7798601650082976e-05, "loss": 1.2508, "step": 8012 }, { "epoch": 2.386641597944861, "grad_norm": 0.22018462419509888, "learning_rate": 1.7797997839851497e-05, "loss": 1.2236, "step": 8013 }, { "epoch": 2.38693944414453, "grad_norm": 0.2367081195116043, "learning_rate": 1.7797393957068356e-05, "loss": 1.2528, "step": 8014 }, { "epoch": 2.3872372903441983, "grad_norm": 0.2383645623922348, "learning_rate": 1.7796790001739174e-05, "loss": 1.2541, "step": 8015 }, { "epoch": 2.387535136543867, "grad_norm": 0.24189260601997375, "learning_rate": 1.7796185973869575e-05, "loss": 1.2545, "step": 8016 }, { "epoch": 2.387832982743536, "grad_norm": 0.23932260274887085, "learning_rate": 1.7795581873465174e-05, "loss": 1.2445, "step": 8017 }, { "epoch": 2.3881308289432046, "grad_norm": 0.22369612753391266, "learning_rate": 1.7794977700531598e-05, "loss": 1.2392, "step": 8018 }, { "epoch": 2.388428675142873, "grad_norm": 0.2379557341337204, "learning_rate": 1.7794373455074458e-05, "loss": 1.257, "step": 8019 }, { "epoch": 2.3887265213425417, "grad_norm": 0.23386241495609283, "learning_rate": 1.7793769137099385e-05, "loss": 1.236, "step": 8020 }, { "epoch": 2.3890243675422105, "grad_norm": 0.2289877086877823, "learning_rate": 1.7793164746611997e-05, "loss": 1.2564, "step": 8021 }, { "epoch": 2.3893222137418793, "grad_norm": 0.22471486032009125, "learning_rate": 1.779256028361792e-05, "loss": 1.2457, "step": 8022 }, { "epoch": 2.3896200599415476, "grad_norm": 0.23774881660938263, "learning_rate": 1.779195574812277e-05, "loss": 1.2581, "step": 8023 }, { "epoch": 2.3899179061412164, "grad_norm": 0.24830865859985352, "learning_rate": 1.7791351140132182e-05, "loss": 1.2566, "step": 8024 }, { "epoch": 2.390215752340885, "grad_norm": 0.24003386497497559, "learning_rate": 1.7790746459651775e-05, "loss": 1.2417, "step": 8025 }, { "epoch": 2.3905135985405535, "grad_norm": 0.22913935780525208, "learning_rate": 1.7790141706687177e-05, "loss": 1.2518, "step": 8026 }, { "epoch": 2.3908114447402222, "grad_norm": 0.23181475698947906, "learning_rate": 1.7789536881244017e-05, "loss": 1.2511, "step": 8027 }, { "epoch": 2.391109290939891, "grad_norm": 0.22720636427402496, "learning_rate": 1.7788931983327914e-05, "loss": 1.2594, "step": 8028 }, { "epoch": 2.3914071371395593, "grad_norm": 0.23401761054992676, "learning_rate": 1.7788327012944508e-05, "loss": 1.2505, "step": 8029 }, { "epoch": 2.391704983339228, "grad_norm": 0.21780389547348022, "learning_rate": 1.7787721970099414e-05, "loss": 1.2346, "step": 8030 }, { "epoch": 2.392002829538897, "grad_norm": 0.2231125831604004, "learning_rate": 1.7787116854798273e-05, "loss": 1.2501, "step": 8031 }, { "epoch": 2.3923006757385656, "grad_norm": 0.24449528753757477, "learning_rate": 1.7786511667046706e-05, "loss": 1.2587, "step": 8032 }, { "epoch": 2.392598521938234, "grad_norm": 0.23590652644634247, "learning_rate": 1.778590640685035e-05, "loss": 1.2819, "step": 8033 }, { "epoch": 2.3928963681379027, "grad_norm": 0.23900920152664185, "learning_rate": 1.7785301074214835e-05, "loss": 1.2559, "step": 8034 }, { "epoch": 2.3931942143375715, "grad_norm": 0.23054538667201996, "learning_rate": 1.778469566914579e-05, "loss": 1.2668, "step": 8035 }, { "epoch": 2.3934920605372403, "grad_norm": 0.23224182426929474, "learning_rate": 1.7784090191648845e-05, "loss": 1.2539, "step": 8036 }, { "epoch": 2.3937899067369086, "grad_norm": 0.23819798231124878, "learning_rate": 1.7783484641729643e-05, "loss": 1.2313, "step": 8037 }, { "epoch": 2.3940877529365774, "grad_norm": 0.23229552805423737, "learning_rate": 1.7782879019393813e-05, "loss": 1.2407, "step": 8038 }, { "epoch": 2.394385599136246, "grad_norm": 0.236992746591568, "learning_rate": 1.7782273324646987e-05, "loss": 1.272, "step": 8039 }, { "epoch": 2.3946834453359145, "grad_norm": 0.22687511146068573, "learning_rate": 1.77816675574948e-05, "loss": 1.2616, "step": 8040 }, { "epoch": 2.3949812915355833, "grad_norm": 0.22410380840301514, "learning_rate": 1.7781061717942895e-05, "loss": 1.2527, "step": 8041 }, { "epoch": 2.395279137735252, "grad_norm": 0.2301110178232193, "learning_rate": 1.7780455805996902e-05, "loss": 1.2534, "step": 8042 }, { "epoch": 2.3955769839349204, "grad_norm": 0.22933273017406464, "learning_rate": 1.7779849821662463e-05, "loss": 1.2395, "step": 8043 }, { "epoch": 2.395874830134589, "grad_norm": 0.2381003051996231, "learning_rate": 1.7779243764945214e-05, "loss": 1.2589, "step": 8044 }, { "epoch": 2.396172676334258, "grad_norm": 0.2247830033302307, "learning_rate": 1.7778637635850792e-05, "loss": 1.2513, "step": 8045 }, { "epoch": 2.3964705225339267, "grad_norm": 0.2232261747121811, "learning_rate": 1.7778031434384834e-05, "loss": 1.2609, "step": 8046 }, { "epoch": 2.396768368733595, "grad_norm": 0.22561782598495483, "learning_rate": 1.777742516055299e-05, "loss": 1.2477, "step": 8047 }, { "epoch": 2.3970662149332638, "grad_norm": 0.2415585070848465, "learning_rate": 1.777681881436089e-05, "loss": 1.2602, "step": 8048 }, { "epoch": 2.3973640611329325, "grad_norm": 0.22911863029003143, "learning_rate": 1.777621239581418e-05, "loss": 1.2599, "step": 8049 }, { "epoch": 2.3976619073326013, "grad_norm": 0.23407961428165436, "learning_rate": 1.7775605904918505e-05, "loss": 1.2643, "step": 8050 }, { "epoch": 2.3979597535322696, "grad_norm": 0.22826075553894043, "learning_rate": 1.77749993416795e-05, "loss": 1.2462, "step": 8051 }, { "epoch": 2.3982575997319384, "grad_norm": 0.2386498898267746, "learning_rate": 1.7774392706102818e-05, "loss": 1.2543, "step": 8052 }, { "epoch": 2.398555445931607, "grad_norm": 0.2155432254076004, "learning_rate": 1.777378599819409e-05, "loss": 1.2602, "step": 8053 }, { "epoch": 2.3988532921312755, "grad_norm": 0.23106734454631805, "learning_rate": 1.7773179217958976e-05, "loss": 1.2488, "step": 8054 }, { "epoch": 2.3991511383309443, "grad_norm": 0.2344607561826706, "learning_rate": 1.7772572365403112e-05, "loss": 1.2502, "step": 8055 }, { "epoch": 2.399448984530613, "grad_norm": 0.21689283847808838, "learning_rate": 1.7771965440532145e-05, "loss": 1.2509, "step": 8056 }, { "epoch": 2.399746830730282, "grad_norm": 0.23053377866744995, "learning_rate": 1.7771358443351724e-05, "loss": 1.2453, "step": 8057 }, { "epoch": 2.40004467692995, "grad_norm": 0.22964678704738617, "learning_rate": 1.7770751373867494e-05, "loss": 1.246, "step": 8058 }, { "epoch": 2.400342523129619, "grad_norm": 0.21987757086753845, "learning_rate": 1.7770144232085105e-05, "loss": 1.2348, "step": 8059 }, { "epoch": 2.4006403693292877, "grad_norm": 0.2204911708831787, "learning_rate": 1.776953701801021e-05, "loss": 1.244, "step": 8060 }, { "epoch": 2.4009382155289565, "grad_norm": 0.21743136644363403, "learning_rate": 1.7768929731648447e-05, "loss": 1.235, "step": 8061 }, { "epoch": 2.401236061728625, "grad_norm": 0.22693593800067902, "learning_rate": 1.7768322373005474e-05, "loss": 1.2409, "step": 8062 }, { "epoch": 2.4015339079282936, "grad_norm": 0.23844876885414124, "learning_rate": 1.776771494208694e-05, "loss": 1.2527, "step": 8063 }, { "epoch": 2.4018317541279623, "grad_norm": 0.23794607818126678, "learning_rate": 1.77671074388985e-05, "loss": 1.2596, "step": 8064 }, { "epoch": 2.4021296003276307, "grad_norm": 0.2254185527563095, "learning_rate": 1.7766499863445804e-05, "loss": 1.2373, "step": 8065 }, { "epoch": 2.4024274465272994, "grad_norm": 0.21888013184070587, "learning_rate": 1.77658922157345e-05, "loss": 1.2601, "step": 8066 }, { "epoch": 2.402725292726968, "grad_norm": 0.25763723254203796, "learning_rate": 1.776528449577025e-05, "loss": 1.256, "step": 8067 }, { "epoch": 2.4030231389266365, "grad_norm": 0.2353525310754776, "learning_rate": 1.77646767035587e-05, "loss": 1.2649, "step": 8068 }, { "epoch": 2.4033209851263053, "grad_norm": 0.24496106803417206, "learning_rate": 1.776406883910551e-05, "loss": 1.2587, "step": 8069 }, { "epoch": 2.403618831325974, "grad_norm": 0.24062785506248474, "learning_rate": 1.7763460902416333e-05, "loss": 1.2642, "step": 8070 }, { "epoch": 2.403916677525643, "grad_norm": 0.23483458161354065, "learning_rate": 1.7762852893496825e-05, "loss": 1.2403, "step": 8071 }, { "epoch": 2.404214523725311, "grad_norm": 0.23496435582637787, "learning_rate": 1.7762244812352648e-05, "loss": 1.2517, "step": 8072 }, { "epoch": 2.40451236992498, "grad_norm": 0.23324401676654816, "learning_rate": 1.7761636658989452e-05, "loss": 1.2488, "step": 8073 }, { "epoch": 2.4048102161246487, "grad_norm": 0.2287825495004654, "learning_rate": 1.77610284334129e-05, "loss": 1.2413, "step": 8074 }, { "epoch": 2.4051080623243175, "grad_norm": 0.2442363053560257, "learning_rate": 1.7760420135628652e-05, "loss": 1.2495, "step": 8075 }, { "epoch": 2.405405908523986, "grad_norm": 0.23072031140327454, "learning_rate": 1.775981176564236e-05, "loss": 1.2583, "step": 8076 }, { "epoch": 2.4057037547236546, "grad_norm": 0.23112305998802185, "learning_rate": 1.7759203323459693e-05, "loss": 1.2585, "step": 8077 }, { "epoch": 2.4060016009233234, "grad_norm": 0.24222977459430695, "learning_rate": 1.7758594809086308e-05, "loss": 1.2456, "step": 8078 }, { "epoch": 2.4062994471229917, "grad_norm": 0.22285297513008118, "learning_rate": 1.7757986222527864e-05, "loss": 1.2626, "step": 8079 }, { "epoch": 2.4065972933226605, "grad_norm": 0.22241798043251038, "learning_rate": 1.775737756379003e-05, "loss": 1.2452, "step": 8080 }, { "epoch": 2.4068951395223293, "grad_norm": 0.24153169989585876, "learning_rate": 1.7756768832878463e-05, "loss": 1.246, "step": 8081 }, { "epoch": 2.4071929857219976, "grad_norm": 0.23785129189491272, "learning_rate": 1.7756160029798824e-05, "loss": 1.2595, "step": 8082 }, { "epoch": 2.4074908319216664, "grad_norm": 0.23555125296115875, "learning_rate": 1.775555115455679e-05, "loss": 1.2484, "step": 8083 }, { "epoch": 2.407788678121335, "grad_norm": 0.24767501652240753, "learning_rate": 1.775494220715801e-05, "loss": 1.2438, "step": 8084 }, { "epoch": 2.408086524321004, "grad_norm": 0.2364848554134369, "learning_rate": 1.7754333187608163e-05, "loss": 1.2685, "step": 8085 }, { "epoch": 2.4083843705206722, "grad_norm": 0.24467679858207703, "learning_rate": 1.7753724095912906e-05, "loss": 1.2494, "step": 8086 }, { "epoch": 2.408682216720341, "grad_norm": 0.2808564305305481, "learning_rate": 1.775311493207791e-05, "loss": 1.2435, "step": 8087 }, { "epoch": 2.4089800629200098, "grad_norm": 0.22737839818000793, "learning_rate": 1.775250569610884e-05, "loss": 1.2472, "step": 8088 }, { "epoch": 2.4092779091196785, "grad_norm": 0.2532578110694885, "learning_rate": 1.775189638801137e-05, "loss": 1.261, "step": 8089 }, { "epoch": 2.409575755319347, "grad_norm": 0.23660686612129211, "learning_rate": 1.7751287007791163e-05, "loss": 1.2489, "step": 8090 }, { "epoch": 2.4098736015190156, "grad_norm": 0.2513912618160248, "learning_rate": 1.775067755545389e-05, "loss": 1.2674, "step": 8091 }, { "epoch": 2.4101714477186844, "grad_norm": 0.23578931391239166, "learning_rate": 1.775006803100522e-05, "loss": 1.2571, "step": 8092 }, { "epoch": 2.4104692939183527, "grad_norm": 0.2183685600757599, "learning_rate": 1.774945843445083e-05, "loss": 1.2296, "step": 8093 }, { "epoch": 2.4107671401180215, "grad_norm": 0.310764342546463, "learning_rate": 1.7748848765796385e-05, "loss": 1.2464, "step": 8094 }, { "epoch": 2.4110649863176903, "grad_norm": 0.33203524351119995, "learning_rate": 1.774823902504756e-05, "loss": 1.2397, "step": 8095 }, { "epoch": 2.4113628325173586, "grad_norm": 0.2394951581954956, "learning_rate": 1.7747629212210033e-05, "loss": 1.2333, "step": 8096 }, { "epoch": 2.4116606787170274, "grad_norm": 0.46868062019348145, "learning_rate": 1.7747019327289465e-05, "loss": 1.2342, "step": 8097 }, { "epoch": 2.411958524916696, "grad_norm": 0.29705339670181274, "learning_rate": 1.774640937029154e-05, "loss": 1.2477, "step": 8098 }, { "epoch": 2.412256371116365, "grad_norm": 0.2741549611091614, "learning_rate": 1.774579934122193e-05, "loss": 1.2415, "step": 8099 }, { "epoch": 2.4125542173160333, "grad_norm": 0.23555107414722443, "learning_rate": 1.7745189240086313e-05, "loss": 1.2648, "step": 8100 }, { "epoch": 2.412852063515702, "grad_norm": 0.22980466485023499, "learning_rate": 1.7744579066890363e-05, "loss": 1.2498, "step": 8101 }, { "epoch": 2.413149909715371, "grad_norm": 0.24976091086864471, "learning_rate": 1.7743968821639757e-05, "loss": 1.2497, "step": 8102 }, { "epoch": 2.4134477559150396, "grad_norm": 0.256198912858963, "learning_rate": 1.7743358504340173e-05, "loss": 1.2587, "step": 8103 }, { "epoch": 2.413745602114708, "grad_norm": 0.2384638488292694, "learning_rate": 1.774274811499729e-05, "loss": 1.258, "step": 8104 }, { "epoch": 2.4140434483143767, "grad_norm": 0.22125479578971863, "learning_rate": 1.7742137653616787e-05, "loss": 1.2666, "step": 8105 }, { "epoch": 2.4143412945140454, "grad_norm": 0.23319008946418762, "learning_rate": 1.774152712020434e-05, "loss": 1.2417, "step": 8106 }, { "epoch": 2.4146391407137138, "grad_norm": 0.24594014883041382, "learning_rate": 1.7740916514765638e-05, "loss": 1.2693, "step": 8107 }, { "epoch": 2.4149369869133825, "grad_norm": 0.23496028780937195, "learning_rate": 1.7740305837306353e-05, "loss": 1.2401, "step": 8108 }, { "epoch": 2.4152348331130513, "grad_norm": 0.22659975290298462, "learning_rate": 1.773969508783217e-05, "loss": 1.2624, "step": 8109 }, { "epoch": 2.41553267931272, "grad_norm": 0.22997890412807465, "learning_rate": 1.7739084266348772e-05, "loss": 1.2691, "step": 8110 }, { "epoch": 2.4158305255123884, "grad_norm": 0.2514461278915405, "learning_rate": 1.773847337286184e-05, "loss": 1.2614, "step": 8111 }, { "epoch": 2.416128371712057, "grad_norm": 0.2418598085641861, "learning_rate": 1.773786240737706e-05, "loss": 1.2583, "step": 8112 }, { "epoch": 2.416426217911726, "grad_norm": 0.2289545089006424, "learning_rate": 1.7737251369900118e-05, "loss": 1.2521, "step": 8113 }, { "epoch": 2.4167240641113943, "grad_norm": 0.22397193312644958, "learning_rate": 1.7736640260436693e-05, "loss": 1.2571, "step": 8114 }, { "epoch": 2.417021910311063, "grad_norm": 0.2338794767856598, "learning_rate": 1.7736029078992477e-05, "loss": 1.2574, "step": 8115 }, { "epoch": 2.417319756510732, "grad_norm": 0.235575869679451, "learning_rate": 1.7735417825573154e-05, "loss": 1.234, "step": 8116 }, { "epoch": 2.4176176027104006, "grad_norm": 0.2261354774236679, "learning_rate": 1.773480650018441e-05, "loss": 1.2426, "step": 8117 }, { "epoch": 2.417915448910069, "grad_norm": 0.22895053029060364, "learning_rate": 1.773419510283193e-05, "loss": 1.2641, "step": 8118 }, { "epoch": 2.4182132951097377, "grad_norm": 0.23475541174411774, "learning_rate": 1.7733583633521412e-05, "loss": 1.2676, "step": 8119 }, { "epoch": 2.4185111413094065, "grad_norm": 0.24860520660877228, "learning_rate": 1.7732972092258535e-05, "loss": 1.2355, "step": 8120 }, { "epoch": 2.418808987509075, "grad_norm": 0.22481386363506317, "learning_rate": 1.773236047904899e-05, "loss": 1.2404, "step": 8121 }, { "epoch": 2.4191068337087436, "grad_norm": 0.23251011967658997, "learning_rate": 1.7731748793898472e-05, "loss": 1.24, "step": 8122 }, { "epoch": 2.4194046799084123, "grad_norm": 0.237873375415802, "learning_rate": 1.7731137036812674e-05, "loss": 1.2382, "step": 8123 }, { "epoch": 2.419702526108081, "grad_norm": 0.2502184808254242, "learning_rate": 1.773052520779728e-05, "loss": 1.248, "step": 8124 }, { "epoch": 2.4200003723077494, "grad_norm": 0.23529601097106934, "learning_rate": 1.7729913306857987e-05, "loss": 1.2515, "step": 8125 }, { "epoch": 2.420298218507418, "grad_norm": 0.22526681423187256, "learning_rate": 1.7729301334000486e-05, "loss": 1.2516, "step": 8126 }, { "epoch": 2.420596064707087, "grad_norm": 0.23735322058200836, "learning_rate": 1.772868928923047e-05, "loss": 1.2629, "step": 8127 }, { "epoch": 2.4208939109067558, "grad_norm": 0.22896255552768707, "learning_rate": 1.772807717255364e-05, "loss": 1.2577, "step": 8128 }, { "epoch": 2.421191757106424, "grad_norm": 0.23871658742427826, "learning_rate": 1.772746498397568e-05, "loss": 1.2457, "step": 8129 }, { "epoch": 2.421489603306093, "grad_norm": 0.23917968571186066, "learning_rate": 1.7726852723502296e-05, "loss": 1.2349, "step": 8130 }, { "epoch": 2.4217874495057616, "grad_norm": 0.22797973453998566, "learning_rate": 1.772624039113918e-05, "loss": 1.2533, "step": 8131 }, { "epoch": 2.42208529570543, "grad_norm": 0.23172640800476074, "learning_rate": 1.7725627986892028e-05, "loss": 1.2257, "step": 8132 }, { "epoch": 2.4223831419050987, "grad_norm": 0.2374148964881897, "learning_rate": 1.772501551076654e-05, "loss": 1.2345, "step": 8133 }, { "epoch": 2.4226809881047675, "grad_norm": 0.2308272123336792, "learning_rate": 1.7724402962768413e-05, "loss": 1.2586, "step": 8134 }, { "epoch": 2.422978834304436, "grad_norm": 0.23716746270656586, "learning_rate": 1.7723790342903353e-05, "loss": 1.2633, "step": 8135 }, { "epoch": 2.4232766805041046, "grad_norm": 0.23746775090694427, "learning_rate": 1.7723177651177046e-05, "loss": 1.2371, "step": 8136 }, { "epoch": 2.4235745267037734, "grad_norm": 0.25077927112579346, "learning_rate": 1.77225648875952e-05, "loss": 1.2462, "step": 8137 }, { "epoch": 2.423872372903442, "grad_norm": 0.22465792298316956, "learning_rate": 1.7721952052163517e-05, "loss": 1.2437, "step": 8138 }, { "epoch": 2.4241702191031105, "grad_norm": 0.2292993813753128, "learning_rate": 1.77213391448877e-05, "loss": 1.2634, "step": 8139 }, { "epoch": 2.4244680653027793, "grad_norm": 0.22933197021484375, "learning_rate": 1.7720726165773444e-05, "loss": 1.2498, "step": 8140 }, { "epoch": 2.424765911502448, "grad_norm": 0.2276661992073059, "learning_rate": 1.7720113114826462e-05, "loss": 1.2562, "step": 8141 }, { "epoch": 2.425063757702117, "grad_norm": 0.2251230925321579, "learning_rate": 1.771949999205245e-05, "loss": 1.2293, "step": 8142 }, { "epoch": 2.425361603901785, "grad_norm": 0.2342134565114975, "learning_rate": 1.7718886797457118e-05, "loss": 1.2502, "step": 8143 }, { "epoch": 2.425659450101454, "grad_norm": 0.23211674392223358, "learning_rate": 1.7718273531046167e-05, "loss": 1.2669, "step": 8144 }, { "epoch": 2.4259572963011227, "grad_norm": 0.24708154797554016, "learning_rate": 1.7717660192825304e-05, "loss": 1.2456, "step": 8145 }, { "epoch": 2.426255142500791, "grad_norm": 0.2347024530172348, "learning_rate": 1.7717046782800236e-05, "loss": 1.2599, "step": 8146 }, { "epoch": 2.4265529887004598, "grad_norm": 0.2513992190361023, "learning_rate": 1.7716433300976667e-05, "loss": 1.2559, "step": 8147 }, { "epoch": 2.4268508349001285, "grad_norm": 0.23596066236495972, "learning_rate": 1.7715819747360313e-05, "loss": 1.2442, "step": 8148 }, { "epoch": 2.427148681099797, "grad_norm": 0.2340855449438095, "learning_rate": 1.7715206121956874e-05, "loss": 1.2506, "step": 8149 }, { "epoch": 2.4274465272994656, "grad_norm": 0.22590996325016022, "learning_rate": 1.771459242477206e-05, "loss": 1.2454, "step": 8150 }, { "epoch": 2.4277443734991344, "grad_norm": 0.23030374944210052, "learning_rate": 1.7713978655811583e-05, "loss": 1.2528, "step": 8151 }, { "epoch": 2.428042219698803, "grad_norm": 0.24000777304172516, "learning_rate": 1.7713364815081154e-05, "loss": 1.2424, "step": 8152 }, { "epoch": 2.4283400658984715, "grad_norm": 0.24114497005939484, "learning_rate": 1.7712750902586485e-05, "loss": 1.2519, "step": 8153 }, { "epoch": 2.4286379120981403, "grad_norm": 0.23782244324684143, "learning_rate": 1.7712136918333285e-05, "loss": 1.2687, "step": 8154 }, { "epoch": 2.428935758297809, "grad_norm": 0.23510950803756714, "learning_rate": 1.7711522862327267e-05, "loss": 1.2696, "step": 8155 }, { "epoch": 2.429233604497478, "grad_norm": 0.24703507125377655, "learning_rate": 1.7710908734574147e-05, "loss": 1.2383, "step": 8156 }, { "epoch": 2.429531450697146, "grad_norm": 0.22979791462421417, "learning_rate": 1.7710294535079633e-05, "loss": 1.2644, "step": 8157 }, { "epoch": 2.429829296896815, "grad_norm": 0.23439759016036987, "learning_rate": 1.7709680263849445e-05, "loss": 1.2621, "step": 8158 }, { "epoch": 2.4301271430964837, "grad_norm": 0.2432500123977661, "learning_rate": 1.770906592088929e-05, "loss": 1.2688, "step": 8159 }, { "epoch": 2.430424989296152, "grad_norm": 0.28154420852661133, "learning_rate": 1.77084515062049e-05, "loss": 1.2555, "step": 8160 }, { "epoch": 2.430722835495821, "grad_norm": 0.22606149315834045, "learning_rate": 1.7707837019801975e-05, "loss": 1.256, "step": 8161 }, { "epoch": 2.4310206816954896, "grad_norm": 0.21935401856899261, "learning_rate": 1.770722246168624e-05, "loss": 1.2425, "step": 8162 }, { "epoch": 2.431318527895158, "grad_norm": 0.23840375244617462, "learning_rate": 1.770660783186341e-05, "loss": 1.2658, "step": 8163 }, { "epoch": 2.4316163740948267, "grad_norm": 0.24386407434940338, "learning_rate": 1.7705993130339204e-05, "loss": 1.257, "step": 8164 }, { "epoch": 2.4319142202944954, "grad_norm": 0.2271236926317215, "learning_rate": 1.770537835711934e-05, "loss": 1.2692, "step": 8165 }, { "epoch": 2.432212066494164, "grad_norm": 0.23163583874702454, "learning_rate": 1.7704763512209543e-05, "loss": 1.2615, "step": 8166 }, { "epoch": 2.4325099126938325, "grad_norm": 0.23572777211666107, "learning_rate": 1.770414859561553e-05, "loss": 1.2308, "step": 8167 }, { "epoch": 2.4328077588935013, "grad_norm": 0.26480162143707275, "learning_rate": 1.7703533607343017e-05, "loss": 1.2534, "step": 8168 }, { "epoch": 2.43310560509317, "grad_norm": 0.25801411271095276, "learning_rate": 1.7702918547397734e-05, "loss": 1.2568, "step": 8169 }, { "epoch": 2.433403451292839, "grad_norm": 0.28933751583099365, "learning_rate": 1.77023034157854e-05, "loss": 1.2473, "step": 8170 }, { "epoch": 2.433701297492507, "grad_norm": 0.23392869532108307, "learning_rate": 1.7701688212511738e-05, "loss": 1.2563, "step": 8171 }, { "epoch": 2.433999143692176, "grad_norm": 0.24702076613903046, "learning_rate": 1.7701072937582473e-05, "loss": 1.238, "step": 8172 }, { "epoch": 2.4342969898918447, "grad_norm": 0.2662452459335327, "learning_rate": 1.7700457591003327e-05, "loss": 1.2503, "step": 8173 }, { "epoch": 2.434594836091513, "grad_norm": 0.23712895810604095, "learning_rate": 1.7699842172780028e-05, "loss": 1.2525, "step": 8174 }, { "epoch": 2.434892682291182, "grad_norm": 0.3020852506160736, "learning_rate": 1.7699226682918297e-05, "loss": 1.2593, "step": 8175 }, { "epoch": 2.4351905284908506, "grad_norm": 0.2775361239910126, "learning_rate": 1.7698611121423867e-05, "loss": 1.2568, "step": 8176 }, { "epoch": 2.4354883746905194, "grad_norm": 0.269408643245697, "learning_rate": 1.769799548830246e-05, "loss": 1.2647, "step": 8177 }, { "epoch": 2.4357862208901877, "grad_norm": 0.2418518215417862, "learning_rate": 1.769737978355981e-05, "loss": 1.2479, "step": 8178 }, { "epoch": 2.4360840670898565, "grad_norm": 0.3381626307964325, "learning_rate": 1.7696764007201638e-05, "loss": 1.2746, "step": 8179 }, { "epoch": 2.4363819132895252, "grad_norm": 0.23209111392498016, "learning_rate": 1.7696148159233676e-05, "loss": 1.2399, "step": 8180 }, { "epoch": 2.4366797594891936, "grad_norm": 0.24149450659751892, "learning_rate": 1.7695532239661655e-05, "loss": 1.2634, "step": 8181 }, { "epoch": 2.4369776056888623, "grad_norm": 0.2611485719680786, "learning_rate": 1.7694916248491304e-05, "loss": 1.2582, "step": 8182 }, { "epoch": 2.437275451888531, "grad_norm": 0.22782889008522034, "learning_rate": 1.7694300185728353e-05, "loss": 1.2535, "step": 8183 }, { "epoch": 2.4375732980882, "grad_norm": 0.22121752798557281, "learning_rate": 1.7693684051378538e-05, "loss": 1.2483, "step": 8184 }, { "epoch": 2.437871144287868, "grad_norm": 0.24565903842449188, "learning_rate": 1.7693067845447588e-05, "loss": 1.2679, "step": 8185 }, { "epoch": 2.438168990487537, "grad_norm": 0.24552184343338013, "learning_rate": 1.7692451567941232e-05, "loss": 1.2434, "step": 8186 }, { "epoch": 2.4384668366872058, "grad_norm": 0.2353091686964035, "learning_rate": 1.7691835218865217e-05, "loss": 1.2488, "step": 8187 }, { "epoch": 2.438764682886874, "grad_norm": 0.2381007969379425, "learning_rate": 1.7691218798225262e-05, "loss": 1.2654, "step": 8188 }, { "epoch": 2.439062529086543, "grad_norm": 0.23425203561782837, "learning_rate": 1.7690602306027114e-05, "loss": 1.2554, "step": 8189 }, { "epoch": 2.4393603752862116, "grad_norm": 0.23426920175552368, "learning_rate": 1.7689985742276504e-05, "loss": 1.2618, "step": 8190 }, { "epoch": 2.4396582214858804, "grad_norm": 0.24380579590797424, "learning_rate": 1.7689369106979166e-05, "loss": 1.2411, "step": 8191 }, { "epoch": 2.4399560676855487, "grad_norm": 0.22629253566265106, "learning_rate": 1.7688752400140842e-05, "loss": 1.2417, "step": 8192 }, { "epoch": 2.4402539138852175, "grad_norm": 0.22987054288387299, "learning_rate": 1.7688135621767262e-05, "loss": 1.2657, "step": 8193 }, { "epoch": 2.4405517600848863, "grad_norm": 0.2381831705570221, "learning_rate": 1.7687518771864173e-05, "loss": 1.237, "step": 8194 }, { "epoch": 2.440849606284555, "grad_norm": 0.24247990548610687, "learning_rate": 1.7686901850437316e-05, "loss": 1.2607, "step": 8195 }, { "epoch": 2.4411474524842234, "grad_norm": 0.22796709835529327, "learning_rate": 1.768628485749242e-05, "loss": 1.2441, "step": 8196 }, { "epoch": 2.441445298683892, "grad_norm": 0.24143260717391968, "learning_rate": 1.768566779303523e-05, "loss": 1.2676, "step": 8197 }, { "epoch": 2.441743144883561, "grad_norm": 0.23797409236431122, "learning_rate": 1.7685050657071488e-05, "loss": 1.2605, "step": 8198 }, { "epoch": 2.4420409910832293, "grad_norm": 0.23557357490062714, "learning_rate": 1.7684433449606938e-05, "loss": 1.2568, "step": 8199 }, { "epoch": 2.442338837282898, "grad_norm": 0.22420549392700195, "learning_rate": 1.7683816170647322e-05, "loss": 1.2412, "step": 8200 }, { "epoch": 2.442636683482567, "grad_norm": 0.23614062368869781, "learning_rate": 1.768319882019838e-05, "loss": 1.2578, "step": 8201 }, { "epoch": 2.442934529682235, "grad_norm": 0.2433299571275711, "learning_rate": 1.7682581398265856e-05, "loss": 1.2596, "step": 8202 }, { "epoch": 2.443232375881904, "grad_norm": 0.24343116581439972, "learning_rate": 1.7681963904855495e-05, "loss": 1.2633, "step": 8203 }, { "epoch": 2.4435302220815727, "grad_norm": 0.23356005549430847, "learning_rate": 1.7681346339973044e-05, "loss": 1.2509, "step": 8204 }, { "epoch": 2.4438280682812414, "grad_norm": 0.24294687807559967, "learning_rate": 1.7680728703624247e-05, "loss": 1.2458, "step": 8205 }, { "epoch": 2.4441259144809098, "grad_norm": 0.24111899733543396, "learning_rate": 1.7680110995814855e-05, "loss": 1.2502, "step": 8206 }, { "epoch": 2.4444237606805785, "grad_norm": 0.23431457579135895, "learning_rate": 1.7679493216550607e-05, "loss": 1.2385, "step": 8207 }, { "epoch": 2.4447216068802473, "grad_norm": 0.24026180803775787, "learning_rate": 1.7678875365837253e-05, "loss": 1.2535, "step": 8208 }, { "epoch": 2.445019453079916, "grad_norm": 0.23439113795757294, "learning_rate": 1.7678257443680546e-05, "loss": 1.2459, "step": 8209 }, { "epoch": 2.4453172992795844, "grad_norm": 0.24530130624771118, "learning_rate": 1.7677639450086232e-05, "loss": 1.2664, "step": 8210 }, { "epoch": 2.445615145479253, "grad_norm": 0.24152901768684387, "learning_rate": 1.7677021385060057e-05, "loss": 1.2488, "step": 8211 }, { "epoch": 2.445912991678922, "grad_norm": 0.2546330690383911, "learning_rate": 1.767640324860778e-05, "loss": 1.2481, "step": 8212 }, { "epoch": 2.4462108378785903, "grad_norm": 0.24023675918579102, "learning_rate": 1.7675785040735146e-05, "loss": 1.265, "step": 8213 }, { "epoch": 2.446508684078259, "grad_norm": 0.24142986536026, "learning_rate": 1.7675166761447905e-05, "loss": 1.2538, "step": 8214 }, { "epoch": 2.446806530277928, "grad_norm": 0.2541647255420685, "learning_rate": 1.7674548410751815e-05, "loss": 1.2538, "step": 8215 }, { "epoch": 2.447104376477596, "grad_norm": 0.23540470004081726, "learning_rate": 1.7673929988652627e-05, "loss": 1.2587, "step": 8216 }, { "epoch": 2.447402222677265, "grad_norm": 0.23725782334804535, "learning_rate": 1.7673311495156093e-05, "loss": 1.2451, "step": 8217 }, { "epoch": 2.4477000688769337, "grad_norm": 0.2404332160949707, "learning_rate": 1.767269293026797e-05, "loss": 1.251, "step": 8218 }, { "epoch": 2.4479979150766025, "grad_norm": 0.22643688321113586, "learning_rate": 1.767207429399401e-05, "loss": 1.2541, "step": 8219 }, { "epoch": 2.448295761276271, "grad_norm": 0.2497372180223465, "learning_rate": 1.7671455586339973e-05, "loss": 1.2448, "step": 8220 }, { "epoch": 2.4485936074759396, "grad_norm": 0.22984126210212708, "learning_rate": 1.7670836807311614e-05, "loss": 1.2538, "step": 8221 }, { "epoch": 2.4488914536756083, "grad_norm": 0.2346605509519577, "learning_rate": 1.7670217956914685e-05, "loss": 1.2471, "step": 8222 }, { "epoch": 2.449189299875277, "grad_norm": 0.23125341534614563, "learning_rate": 1.766959903515495e-05, "loss": 1.234, "step": 8223 }, { "epoch": 2.4494871460749454, "grad_norm": 0.23240256309509277, "learning_rate": 1.7668980042038166e-05, "loss": 1.2543, "step": 8224 }, { "epoch": 2.449784992274614, "grad_norm": 0.2352667599916458, "learning_rate": 1.7668360977570085e-05, "loss": 1.2489, "step": 8225 }, { "epoch": 2.450082838474283, "grad_norm": 0.23893482983112335, "learning_rate": 1.7667741841756477e-05, "loss": 1.2503, "step": 8226 }, { "epoch": 2.4503806846739513, "grad_norm": 0.23530250787734985, "learning_rate": 1.7667122634603097e-05, "loss": 1.2487, "step": 8227 }, { "epoch": 2.45067853087362, "grad_norm": 0.22781215608119965, "learning_rate": 1.7666503356115706e-05, "loss": 1.2583, "step": 8228 }, { "epoch": 2.450976377073289, "grad_norm": 0.23113566637039185, "learning_rate": 1.766588400630007e-05, "loss": 1.2665, "step": 8229 }, { "epoch": 2.451274223272957, "grad_norm": 0.23249532282352448, "learning_rate": 1.7665264585161947e-05, "loss": 1.2568, "step": 8230 }, { "epoch": 2.451572069472626, "grad_norm": 0.2413317710161209, "learning_rate": 1.7664645092707102e-05, "loss": 1.2554, "step": 8231 }, { "epoch": 2.4518699156722947, "grad_norm": 0.33798786997795105, "learning_rate": 1.7664025528941295e-05, "loss": 1.258, "step": 8232 }, { "epoch": 2.4521677618719635, "grad_norm": 0.330740362405777, "learning_rate": 1.7663405893870296e-05, "loss": 1.2459, "step": 8233 }, { "epoch": 2.452465608071632, "grad_norm": 0.28560367226600647, "learning_rate": 1.7662786187499865e-05, "loss": 1.2482, "step": 8234 }, { "epoch": 2.4527634542713006, "grad_norm": 0.5696402788162231, "learning_rate": 1.7662166409835772e-05, "loss": 1.2583, "step": 8235 }, { "epoch": 2.4530613004709694, "grad_norm": 0.24457116425037384, "learning_rate": 1.7661546560883784e-05, "loss": 1.2402, "step": 8236 }, { "epoch": 2.453359146670638, "grad_norm": 0.24923206865787506, "learning_rate": 1.766092664064966e-05, "loss": 1.2388, "step": 8237 }, { "epoch": 2.4536569928703065, "grad_norm": 0.24296973645687103, "learning_rate": 1.766030664913917e-05, "loss": 1.2567, "step": 8238 }, { "epoch": 2.4539548390699752, "grad_norm": 0.2376331090927124, "learning_rate": 1.765968658635809e-05, "loss": 1.2694, "step": 8239 }, { "epoch": 2.454252685269644, "grad_norm": 0.2334982305765152, "learning_rate": 1.7659066452312184e-05, "loss": 1.2507, "step": 8240 }, { "epoch": 2.4545505314693123, "grad_norm": 0.25672271847724915, "learning_rate": 1.7658446247007224e-05, "loss": 1.2584, "step": 8241 }, { "epoch": 2.454848377668981, "grad_norm": 0.24763455986976624, "learning_rate": 1.7657825970448975e-05, "loss": 1.2332, "step": 8242 }, { "epoch": 2.45514622386865, "grad_norm": 0.23960673809051514, "learning_rate": 1.7657205622643214e-05, "loss": 1.2475, "step": 8243 }, { "epoch": 2.4554440700683187, "grad_norm": 0.22872363030910492, "learning_rate": 1.7656585203595708e-05, "loss": 1.2622, "step": 8244 }, { "epoch": 2.455741916267987, "grad_norm": 0.22929058969020844, "learning_rate": 1.7655964713312232e-05, "loss": 1.2567, "step": 8245 }, { "epoch": 2.4560397624676558, "grad_norm": 0.23774370551109314, "learning_rate": 1.7655344151798554e-05, "loss": 1.2499, "step": 8246 }, { "epoch": 2.4563376086673245, "grad_norm": 0.23429562151432037, "learning_rate": 1.7654723519060457e-05, "loss": 1.2536, "step": 8247 }, { "epoch": 2.456635454866993, "grad_norm": 0.22949855029582977, "learning_rate": 1.7654102815103708e-05, "loss": 1.2442, "step": 8248 }, { "epoch": 2.4569333010666616, "grad_norm": 0.2358241230249405, "learning_rate": 1.7653482039934084e-05, "loss": 1.254, "step": 8249 }, { "epoch": 2.4572311472663304, "grad_norm": 0.23049359023571014, "learning_rate": 1.765286119355736e-05, "loss": 1.2416, "step": 8250 }, { "epoch": 2.457528993465999, "grad_norm": 0.22333648800849915, "learning_rate": 1.7652240275979312e-05, "loss": 1.2468, "step": 8251 }, { "epoch": 2.4578268396656675, "grad_norm": 0.21928377449512482, "learning_rate": 1.7651619287205722e-05, "loss": 1.2514, "step": 8252 }, { "epoch": 2.4581246858653363, "grad_norm": 0.23795348405838013, "learning_rate": 1.7650998227242357e-05, "loss": 1.2476, "step": 8253 }, { "epoch": 2.458422532065005, "grad_norm": 0.2292827069759369, "learning_rate": 1.7650377096095007e-05, "loss": 1.2392, "step": 8254 }, { "epoch": 2.4587203782646734, "grad_norm": 0.22743940353393555, "learning_rate": 1.7649755893769446e-05, "loss": 1.2573, "step": 8255 }, { "epoch": 2.459018224464342, "grad_norm": 0.22143808007240295, "learning_rate": 1.764913462027145e-05, "loss": 1.2518, "step": 8256 }, { "epoch": 2.459316070664011, "grad_norm": 0.22706541419029236, "learning_rate": 1.7648513275606802e-05, "loss": 1.2339, "step": 8257 }, { "epoch": 2.4596139168636797, "grad_norm": 0.23309893906116486, "learning_rate": 1.7647891859781284e-05, "loss": 1.2525, "step": 8258 }, { "epoch": 2.459911763063348, "grad_norm": 0.22900895774364471, "learning_rate": 1.764727037280068e-05, "loss": 1.2633, "step": 8259 }, { "epoch": 2.460209609263017, "grad_norm": 0.22730329632759094, "learning_rate": 1.7646648814670765e-05, "loss": 1.2409, "step": 8260 }, { "epoch": 2.4605074554626856, "grad_norm": 0.22470271587371826, "learning_rate": 1.764602718539733e-05, "loss": 1.2409, "step": 8261 }, { "epoch": 2.4608053016623543, "grad_norm": 0.2320781648159027, "learning_rate": 1.7645405484986152e-05, "loss": 1.2581, "step": 8262 }, { "epoch": 2.4611031478620227, "grad_norm": 0.21992452442646027, "learning_rate": 1.764478371344302e-05, "loss": 1.2258, "step": 8263 }, { "epoch": 2.4614009940616914, "grad_norm": 0.21929426491260529, "learning_rate": 1.7644161870773715e-05, "loss": 1.2574, "step": 8264 }, { "epoch": 2.46169884026136, "grad_norm": 0.23645782470703125, "learning_rate": 1.7643539956984026e-05, "loss": 1.26, "step": 8265 }, { "epoch": 2.4619966864610285, "grad_norm": 0.23062999546527863, "learning_rate": 1.7642917972079737e-05, "loss": 1.2329, "step": 8266 }, { "epoch": 2.4622945326606973, "grad_norm": 0.23762331902980804, "learning_rate": 1.7642295916066636e-05, "loss": 1.2575, "step": 8267 }, { "epoch": 2.462592378860366, "grad_norm": 0.23018376529216766, "learning_rate": 1.7641673788950506e-05, "loss": 1.2355, "step": 8268 }, { "epoch": 2.4628902250600344, "grad_norm": 0.22822463512420654, "learning_rate": 1.764105159073714e-05, "loss": 1.2329, "step": 8269 }, { "epoch": 2.463188071259703, "grad_norm": 0.23241938650608063, "learning_rate": 1.7640429321432335e-05, "loss": 1.2513, "step": 8270 }, { "epoch": 2.463485917459372, "grad_norm": 0.23877792060375214, "learning_rate": 1.7639806981041864e-05, "loss": 1.247, "step": 8271 }, { "epoch": 2.4637837636590407, "grad_norm": 0.2348979413509369, "learning_rate": 1.7639184569571522e-05, "loss": 1.255, "step": 8272 }, { "epoch": 2.464081609858709, "grad_norm": 0.22891142964363098, "learning_rate": 1.7638562087027106e-05, "loss": 1.255, "step": 8273 }, { "epoch": 2.464379456058378, "grad_norm": 0.23356257379055023, "learning_rate": 1.7637939533414407e-05, "loss": 1.2586, "step": 8274 }, { "epoch": 2.4646773022580466, "grad_norm": 0.2342851608991623, "learning_rate": 1.763731690873921e-05, "loss": 1.2519, "step": 8275 }, { "epoch": 2.4649751484577154, "grad_norm": 0.2412482500076294, "learning_rate": 1.7636694213007316e-05, "loss": 1.243, "step": 8276 }, { "epoch": 2.4652729946573837, "grad_norm": 0.22454293072223663, "learning_rate": 1.7636071446224508e-05, "loss": 1.235, "step": 8277 }, { "epoch": 2.4655708408570525, "grad_norm": 0.23977334797382355, "learning_rate": 1.7635448608396592e-05, "loss": 1.2599, "step": 8278 }, { "epoch": 2.4658686870567212, "grad_norm": 0.23532359302043915, "learning_rate": 1.7634825699529357e-05, "loss": 1.2615, "step": 8279 }, { "epoch": 2.4661665332563896, "grad_norm": 0.2302032858133316, "learning_rate": 1.76342027196286e-05, "loss": 1.2655, "step": 8280 }, { "epoch": 2.4664643794560583, "grad_norm": 0.2321433573961258, "learning_rate": 1.7633579668700114e-05, "loss": 1.2524, "step": 8281 }, { "epoch": 2.466762225655727, "grad_norm": 0.2206072211265564, "learning_rate": 1.7632956546749696e-05, "loss": 1.256, "step": 8282 }, { "epoch": 2.4670600718553954, "grad_norm": 0.2309189736843109, "learning_rate": 1.763233335378315e-05, "loss": 1.2492, "step": 8283 }, { "epoch": 2.467357918055064, "grad_norm": 0.2347707450389862, "learning_rate": 1.7631710089806265e-05, "loss": 1.2542, "step": 8284 }, { "epoch": 2.467655764254733, "grad_norm": 0.23785153031349182, "learning_rate": 1.7631086754824845e-05, "loss": 1.2479, "step": 8285 }, { "epoch": 2.4679536104544018, "grad_norm": 0.2298462986946106, "learning_rate": 1.7630463348844694e-05, "loss": 1.2641, "step": 8286 }, { "epoch": 2.46825145665407, "grad_norm": 0.23382051289081573, "learning_rate": 1.76298398718716e-05, "loss": 1.2599, "step": 8287 }, { "epoch": 2.468549302853739, "grad_norm": 0.2526465356349945, "learning_rate": 1.7629216323911377e-05, "loss": 1.29, "step": 8288 }, { "epoch": 2.4688471490534076, "grad_norm": 0.2504581809043884, "learning_rate": 1.7628592704969815e-05, "loss": 1.2599, "step": 8289 }, { "epoch": 2.4691449952530764, "grad_norm": 0.2236238569021225, "learning_rate": 1.762796901505272e-05, "loss": 1.2481, "step": 8290 }, { "epoch": 2.4694428414527447, "grad_norm": 0.22754114866256714, "learning_rate": 1.76273452541659e-05, "loss": 1.234, "step": 8291 }, { "epoch": 2.4697406876524135, "grad_norm": 0.23117542266845703, "learning_rate": 1.7626721422315153e-05, "loss": 1.2252, "step": 8292 }, { "epoch": 2.4700385338520823, "grad_norm": 0.23307673633098602, "learning_rate": 1.7626097519506285e-05, "loss": 1.2422, "step": 8293 }, { "epoch": 2.4703363800517506, "grad_norm": 0.23654675483703613, "learning_rate": 1.76254735457451e-05, "loss": 1.2352, "step": 8294 }, { "epoch": 2.4706342262514194, "grad_norm": 0.23664255440235138, "learning_rate": 1.76248495010374e-05, "loss": 1.2593, "step": 8295 }, { "epoch": 2.470932072451088, "grad_norm": 0.2272440791130066, "learning_rate": 1.7624225385388998e-05, "loss": 1.2538, "step": 8296 }, { "epoch": 2.4712299186507565, "grad_norm": 0.22650983929634094, "learning_rate": 1.7623601198805695e-05, "loss": 1.2515, "step": 8297 }, { "epoch": 2.4715277648504252, "grad_norm": 0.23413802683353424, "learning_rate": 1.76229769412933e-05, "loss": 1.2614, "step": 8298 }, { "epoch": 2.471825611050094, "grad_norm": 0.2297011911869049, "learning_rate": 1.7622352612857622e-05, "loss": 1.256, "step": 8299 }, { "epoch": 2.472123457249763, "grad_norm": 0.23197448253631592, "learning_rate": 1.762172821350447e-05, "loss": 1.237, "step": 8300 }, { "epoch": 2.472421303449431, "grad_norm": 0.23182253539562225, "learning_rate": 1.7621103743239652e-05, "loss": 1.2585, "step": 8301 }, { "epoch": 2.4727191496491, "grad_norm": 0.2433755099773407, "learning_rate": 1.7620479202068977e-05, "loss": 1.2453, "step": 8302 }, { "epoch": 2.4730169958487687, "grad_norm": 0.22609774768352509, "learning_rate": 1.761985458999826e-05, "loss": 1.2519, "step": 8303 }, { "epoch": 2.4733148420484374, "grad_norm": 0.24316880106925964, "learning_rate": 1.7619229907033307e-05, "loss": 1.2365, "step": 8304 }, { "epoch": 2.4736126882481058, "grad_norm": 0.2347557693719864, "learning_rate": 1.7618605153179933e-05, "loss": 1.2437, "step": 8305 }, { "epoch": 2.4739105344477745, "grad_norm": 0.2216932475566864, "learning_rate": 1.7617980328443952e-05, "loss": 1.2369, "step": 8306 }, { "epoch": 2.4742083806474433, "grad_norm": 0.24111506342887878, "learning_rate": 1.7617355432831175e-05, "loss": 1.2569, "step": 8307 }, { "epoch": 2.4745062268471116, "grad_norm": 0.23701366782188416, "learning_rate": 1.7616730466347415e-05, "loss": 1.2542, "step": 8308 }, { "epoch": 2.4748040730467804, "grad_norm": 0.23881372809410095, "learning_rate": 1.7616105428998487e-05, "loss": 1.2449, "step": 8309 }, { "epoch": 2.475101919246449, "grad_norm": 0.23791971802711487, "learning_rate": 1.7615480320790208e-05, "loss": 1.2613, "step": 8310 }, { "epoch": 2.475399765446118, "grad_norm": 0.2507789433002472, "learning_rate": 1.7614855141728395e-05, "loss": 1.2469, "step": 8311 }, { "epoch": 2.4756976116457863, "grad_norm": 0.22700440883636475, "learning_rate": 1.761422989181886e-05, "loss": 1.2544, "step": 8312 }, { "epoch": 2.475995457845455, "grad_norm": 0.23406459391117096, "learning_rate": 1.7613604571067425e-05, "loss": 1.2394, "step": 8313 }, { "epoch": 2.476293304045124, "grad_norm": 0.227818563580513, "learning_rate": 1.7612979179479903e-05, "loss": 1.2446, "step": 8314 }, { "epoch": 2.4765911502447926, "grad_norm": 0.23256555199623108, "learning_rate": 1.7612353717062117e-05, "loss": 1.2715, "step": 8315 }, { "epoch": 2.476888996444461, "grad_norm": 0.22174721956253052, "learning_rate": 1.7611728183819888e-05, "loss": 1.2467, "step": 8316 }, { "epoch": 2.4771868426441297, "grad_norm": 0.23079760372638702, "learning_rate": 1.761110257975903e-05, "loss": 1.2595, "step": 8317 }, { "epoch": 2.4774846888437985, "grad_norm": 0.23321036994457245, "learning_rate": 1.7610476904885363e-05, "loss": 1.2507, "step": 8318 }, { "epoch": 2.477782535043467, "grad_norm": 0.22463597357273102, "learning_rate": 1.7609851159204716e-05, "loss": 1.2277, "step": 8319 }, { "epoch": 2.4780803812431356, "grad_norm": 0.23679356276988983, "learning_rate": 1.7609225342722906e-05, "loss": 1.2531, "step": 8320 }, { "epoch": 2.4783782274428043, "grad_norm": 0.22336843609809875, "learning_rate": 1.7608599455445753e-05, "loss": 1.2311, "step": 8321 }, { "epoch": 2.4786760736424727, "grad_norm": 0.23606246709823608, "learning_rate": 1.7607973497379083e-05, "loss": 1.2543, "step": 8322 }, { "epoch": 2.4789739198421414, "grad_norm": 0.23542477190494537, "learning_rate": 1.7607347468528718e-05, "loss": 1.231, "step": 8323 }, { "epoch": 2.47927176604181, "grad_norm": 0.2172466218471527, "learning_rate": 1.760672136890049e-05, "loss": 1.251, "step": 8324 }, { "epoch": 2.479569612241479, "grad_norm": 0.23283933103084564, "learning_rate": 1.7606095198500213e-05, "loss": 1.2563, "step": 8325 }, { "epoch": 2.4798674584411473, "grad_norm": 0.23413917422294617, "learning_rate": 1.7605468957333722e-05, "loss": 1.2655, "step": 8326 }, { "epoch": 2.480165304640816, "grad_norm": 0.22806623578071594, "learning_rate": 1.760484264540684e-05, "loss": 1.2641, "step": 8327 }, { "epoch": 2.480463150840485, "grad_norm": 0.22838027775287628, "learning_rate": 1.7604216262725392e-05, "loss": 1.2412, "step": 8328 }, { "epoch": 2.4807609970401536, "grad_norm": 0.23101909458637238, "learning_rate": 1.7603589809295205e-05, "loss": 1.2404, "step": 8329 }, { "epoch": 2.481058843239822, "grad_norm": 0.22244912385940552, "learning_rate": 1.7602963285122115e-05, "loss": 1.2399, "step": 8330 }, { "epoch": 2.4813566894394907, "grad_norm": 0.22440123558044434, "learning_rate": 1.7602336690211944e-05, "loss": 1.2486, "step": 8331 }, { "epoch": 2.4816545356391595, "grad_norm": 0.24199633300304413, "learning_rate": 1.7601710024570524e-05, "loss": 1.2348, "step": 8332 }, { "epoch": 2.481952381838828, "grad_norm": 0.2698337137699127, "learning_rate": 1.7601083288203685e-05, "loss": 1.2496, "step": 8333 }, { "epoch": 2.4822502280384966, "grad_norm": 0.24751141667366028, "learning_rate": 1.7600456481117257e-05, "loss": 1.2594, "step": 8334 }, { "epoch": 2.4825480742381654, "grad_norm": 0.23082788288593292, "learning_rate": 1.7599829603317075e-05, "loss": 1.2473, "step": 8335 }, { "epoch": 2.4828459204378337, "grad_norm": 0.21850734949111938, "learning_rate": 1.7599202654808972e-05, "loss": 1.2519, "step": 8336 }, { "epoch": 2.4831437666375025, "grad_norm": 0.2733902335166931, "learning_rate": 1.7598575635598775e-05, "loss": 1.253, "step": 8337 }, { "epoch": 2.4834416128371712, "grad_norm": 0.2962580919265747, "learning_rate": 1.759794854569232e-05, "loss": 1.2546, "step": 8338 }, { "epoch": 2.48373945903684, "grad_norm": 0.23880480229854584, "learning_rate": 1.7597321385095445e-05, "loss": 1.2532, "step": 8339 }, { "epoch": 2.4840373052365083, "grad_norm": 0.31910011172294617, "learning_rate": 1.7596694153813984e-05, "loss": 1.2539, "step": 8340 }, { "epoch": 2.484335151436177, "grad_norm": 0.29905804991722107, "learning_rate": 1.759606685185377e-05, "loss": 1.2373, "step": 8341 }, { "epoch": 2.484632997635846, "grad_norm": 0.25430893898010254, "learning_rate": 1.759543947922064e-05, "loss": 1.2527, "step": 8342 }, { "epoch": 2.4849308438355147, "grad_norm": 0.30623292922973633, "learning_rate": 1.7594812035920434e-05, "loss": 1.2703, "step": 8343 }, { "epoch": 2.485228690035183, "grad_norm": 0.2231859266757965, "learning_rate": 1.7594184521958986e-05, "loss": 1.2616, "step": 8344 }, { "epoch": 2.4855265362348518, "grad_norm": 0.24325911700725555, "learning_rate": 1.7593556937342136e-05, "loss": 1.2464, "step": 8345 }, { "epoch": 2.4858243824345205, "grad_norm": 0.23927530646324158, "learning_rate": 1.7592929282075722e-05, "loss": 1.2496, "step": 8346 }, { "epoch": 2.486122228634189, "grad_norm": 0.24509276449680328, "learning_rate": 1.7592301556165584e-05, "loss": 1.2581, "step": 8347 }, { "epoch": 2.4864200748338576, "grad_norm": 0.2705968916416168, "learning_rate": 1.7591673759617564e-05, "loss": 1.2318, "step": 8348 }, { "epoch": 2.4867179210335264, "grad_norm": 0.27921196818351746, "learning_rate": 1.7591045892437503e-05, "loss": 1.242, "step": 8349 }, { "epoch": 2.4870157672331947, "grad_norm": 0.27228739857673645, "learning_rate": 1.7590417954631238e-05, "loss": 1.2626, "step": 8350 }, { "epoch": 2.4873136134328635, "grad_norm": 0.4469757080078125, "learning_rate": 1.7589789946204617e-05, "loss": 1.2477, "step": 8351 }, { "epoch": 2.4876114596325323, "grad_norm": 0.28334805369377136, "learning_rate": 1.758916186716348e-05, "loss": 1.2571, "step": 8352 }, { "epoch": 2.487909305832201, "grad_norm": 0.2805827558040619, "learning_rate": 1.758853371751367e-05, "loss": 1.2554, "step": 8353 }, { "epoch": 2.4882071520318694, "grad_norm": 0.2342253029346466, "learning_rate": 1.758790549726103e-05, "loss": 1.2336, "step": 8354 }, { "epoch": 2.488504998231538, "grad_norm": 0.26948752999305725, "learning_rate": 1.758727720641141e-05, "loss": 1.2533, "step": 8355 }, { "epoch": 2.488802844431207, "grad_norm": 0.26174086332321167, "learning_rate": 1.7586648844970652e-05, "loss": 1.2521, "step": 8356 }, { "epoch": 2.4891006906308757, "grad_norm": 0.23120491206645966, "learning_rate": 1.7586020412944603e-05, "loss": 1.2392, "step": 8357 }, { "epoch": 2.489398536830544, "grad_norm": 0.24739497900009155, "learning_rate": 1.758539191033911e-05, "loss": 1.2678, "step": 8358 }, { "epoch": 2.489696383030213, "grad_norm": 0.23574678599834442, "learning_rate": 1.758476333716002e-05, "loss": 1.2505, "step": 8359 }, { "epoch": 2.4899942292298816, "grad_norm": 0.24132049083709717, "learning_rate": 1.7584134693413178e-05, "loss": 1.2475, "step": 8360 }, { "epoch": 2.49029207542955, "grad_norm": 0.22077466547489166, "learning_rate": 1.758350597910444e-05, "loss": 1.2442, "step": 8361 }, { "epoch": 2.4905899216292187, "grad_norm": 0.2397473156452179, "learning_rate": 1.7582877194239652e-05, "loss": 1.2504, "step": 8362 }, { "epoch": 2.4908877678288874, "grad_norm": 0.24787132441997528, "learning_rate": 1.7582248338824662e-05, "loss": 1.2522, "step": 8363 }, { "epoch": 2.4911856140285558, "grad_norm": 0.23322662711143494, "learning_rate": 1.7581619412865322e-05, "loss": 1.2616, "step": 8364 }, { "epoch": 2.4914834602282245, "grad_norm": 0.2274780422449112, "learning_rate": 1.7580990416367484e-05, "loss": 1.2441, "step": 8365 }, { "epoch": 2.4917813064278933, "grad_norm": 0.23379875719547272, "learning_rate": 1.7580361349337006e-05, "loss": 1.2425, "step": 8366 }, { "epoch": 2.492079152627562, "grad_norm": 0.23417527973651886, "learning_rate": 1.757973221177973e-05, "loss": 1.2451, "step": 8367 }, { "epoch": 2.4923769988272304, "grad_norm": 0.23419426381587982, "learning_rate": 1.7579103003701514e-05, "loss": 1.2589, "step": 8368 }, { "epoch": 2.492674845026899, "grad_norm": 0.23790061473846436, "learning_rate": 1.757847372510821e-05, "loss": 1.2371, "step": 8369 }, { "epoch": 2.492972691226568, "grad_norm": 0.22814206779003143, "learning_rate": 1.757784437600568e-05, "loss": 1.2725, "step": 8370 }, { "epoch": 2.4932705374262367, "grad_norm": 0.2249986231327057, "learning_rate": 1.757721495639977e-05, "loss": 1.2418, "step": 8371 }, { "epoch": 2.493568383625905, "grad_norm": 0.24132882058620453, "learning_rate": 1.7576585466296346e-05, "loss": 1.2399, "step": 8372 }, { "epoch": 2.493866229825574, "grad_norm": 0.24062736332416534, "learning_rate": 1.7575955905701257e-05, "loss": 1.2523, "step": 8373 }, { "epoch": 2.4941640760252426, "grad_norm": 0.23082296550273895, "learning_rate": 1.7575326274620362e-05, "loss": 1.2506, "step": 8374 }, { "epoch": 2.494461922224911, "grad_norm": 0.23409095406532288, "learning_rate": 1.757469657305952e-05, "loss": 1.2485, "step": 8375 }, { "epoch": 2.4947597684245797, "grad_norm": 0.23145711421966553, "learning_rate": 1.757406680102459e-05, "loss": 1.2593, "step": 8376 }, { "epoch": 2.4950576146242485, "grad_norm": 0.23635199666023254, "learning_rate": 1.7573436958521428e-05, "loss": 1.2491, "step": 8377 }, { "epoch": 2.4953554608239172, "grad_norm": 0.23779116570949554, "learning_rate": 1.75728070455559e-05, "loss": 1.2788, "step": 8378 }, { "epoch": 2.4956533070235856, "grad_norm": 0.23136919736862183, "learning_rate": 1.7572177062133863e-05, "loss": 1.2767, "step": 8379 }, { "epoch": 2.4959511532232543, "grad_norm": 0.2295750081539154, "learning_rate": 1.7571547008261175e-05, "loss": 1.2734, "step": 8380 }, { "epoch": 2.496248999422923, "grad_norm": 0.23830348253250122, "learning_rate": 1.7570916883943704e-05, "loss": 1.2625, "step": 8381 }, { "epoch": 2.496546845622592, "grad_norm": 0.2223721295595169, "learning_rate": 1.7570286689187312e-05, "loss": 1.2436, "step": 8382 }, { "epoch": 2.49684469182226, "grad_norm": 0.29212528467178345, "learning_rate": 1.7569656423997858e-05, "loss": 1.2531, "step": 8383 }, { "epoch": 2.497142538021929, "grad_norm": 0.2426663041114807, "learning_rate": 1.756902608838121e-05, "loss": 1.2533, "step": 8384 }, { "epoch": 2.4974403842215978, "grad_norm": 0.27768445014953613, "learning_rate": 1.7568395682343226e-05, "loss": 1.231, "step": 8385 }, { "epoch": 2.497738230421266, "grad_norm": 0.23751646280288696, "learning_rate": 1.756776520588978e-05, "loss": 1.2624, "step": 8386 }, { "epoch": 2.498036076620935, "grad_norm": 0.26912587881088257, "learning_rate": 1.7567134659026734e-05, "loss": 1.2522, "step": 8387 }, { "epoch": 2.4983339228206036, "grad_norm": 0.2330433577299118, "learning_rate": 1.7566504041759954e-05, "loss": 1.2488, "step": 8388 }, { "epoch": 2.498631769020272, "grad_norm": 0.24332500994205475, "learning_rate": 1.756587335409531e-05, "loss": 1.2642, "step": 8389 }, { "epoch": 2.4989296152199407, "grad_norm": 0.22631646692752838, "learning_rate": 1.7565242596038664e-05, "loss": 1.2486, "step": 8390 }, { "epoch": 2.4992274614196095, "grad_norm": 0.268858939409256, "learning_rate": 1.7564611767595888e-05, "loss": 1.2471, "step": 8391 }, { "epoch": 2.4995253076192783, "grad_norm": 0.2353544533252716, "learning_rate": 1.7563980868772853e-05, "loss": 1.2551, "step": 8392 }, { "epoch": 2.4998231538189466, "grad_norm": 0.23257684707641602, "learning_rate": 1.7563349899575427e-05, "loss": 1.2442, "step": 8393 }, { "epoch": 2.5001210000186154, "grad_norm": 0.24033115804195404, "learning_rate": 1.756271886000948e-05, "loss": 1.2563, "step": 8394 }, { "epoch": 2.500418846218284, "grad_norm": 0.2385256290435791, "learning_rate": 1.7562087750080885e-05, "loss": 1.2446, "step": 8395 }, { "epoch": 2.500716692417953, "grad_norm": 0.2354026585817337, "learning_rate": 1.756145656979551e-05, "loss": 1.2405, "step": 8396 }, { "epoch": 2.5010145386176212, "grad_norm": 0.23773229122161865, "learning_rate": 1.756082531915923e-05, "loss": 1.2423, "step": 8397 }, { "epoch": 2.50131238481729, "grad_norm": 0.38805413246154785, "learning_rate": 1.7560193998177922e-05, "loss": 1.2595, "step": 8398 }, { "epoch": 2.501610231016959, "grad_norm": 0.32371848821640015, "learning_rate": 1.755956260685745e-05, "loss": 1.2408, "step": 8399 }, { "epoch": 2.501908077216627, "grad_norm": 0.28234052658081055, "learning_rate": 1.7558931145203697e-05, "loss": 1.2483, "step": 8400 }, { "epoch": 2.502205923416296, "grad_norm": 0.3353901505470276, "learning_rate": 1.7558299613222534e-05, "loss": 1.263, "step": 8401 }, { "epoch": 2.5025037696159647, "grad_norm": 0.23078328371047974, "learning_rate": 1.755766801091984e-05, "loss": 1.2556, "step": 8402 }, { "epoch": 2.502801615815633, "grad_norm": 0.23619240522384644, "learning_rate": 1.7557036338301486e-05, "loss": 1.2478, "step": 8403 }, { "epoch": 2.5030994620153018, "grad_norm": 0.2363365888595581, "learning_rate": 1.755640459537335e-05, "loss": 1.236, "step": 8404 }, { "epoch": 2.5033973082149705, "grad_norm": 0.2406965047121048, "learning_rate": 1.7555772782141315e-05, "loss": 1.2811, "step": 8405 }, { "epoch": 2.5036951544146393, "grad_norm": 0.23396317660808563, "learning_rate": 1.7555140898611257e-05, "loss": 1.2602, "step": 8406 }, { "epoch": 2.5039930006143076, "grad_norm": 0.23090584576129913, "learning_rate": 1.7554508944789055e-05, "loss": 1.2536, "step": 8407 }, { "epoch": 2.5042908468139764, "grad_norm": 0.2459108680486679, "learning_rate": 1.7553876920680584e-05, "loss": 1.2713, "step": 8408 }, { "epoch": 2.504588693013645, "grad_norm": 0.22830830514431, "learning_rate": 1.755324482629173e-05, "loss": 1.2472, "step": 8409 }, { "epoch": 2.504886539213314, "grad_norm": 0.23577916622161865, "learning_rate": 1.7552612661628373e-05, "loss": 1.2472, "step": 8410 }, { "epoch": 2.5051843854129823, "grad_norm": 0.24144499003887177, "learning_rate": 1.755198042669639e-05, "loss": 1.2436, "step": 8411 }, { "epoch": 2.505482231612651, "grad_norm": 0.2292211502790451, "learning_rate": 1.755134812150167e-05, "loss": 1.2664, "step": 8412 }, { "epoch": 2.50578007781232, "grad_norm": 0.2611818313598633, "learning_rate": 1.755071574605009e-05, "loss": 1.2461, "step": 8413 }, { "epoch": 2.506077924011988, "grad_norm": 0.2922768294811249, "learning_rate": 1.755008330034754e-05, "loss": 1.2442, "step": 8414 }, { "epoch": 2.506375770211657, "grad_norm": 0.24702337384223938, "learning_rate": 1.7549450784399894e-05, "loss": 1.2644, "step": 8415 }, { "epoch": 2.5066736164113257, "grad_norm": 0.22738125920295715, "learning_rate": 1.7548818198213048e-05, "loss": 1.2602, "step": 8416 }, { "epoch": 2.506971462610994, "grad_norm": 0.258711040019989, "learning_rate": 1.7548185541792883e-05, "loss": 1.2542, "step": 8417 }, { "epoch": 2.507269308810663, "grad_norm": 0.2498316615819931, "learning_rate": 1.754755281514528e-05, "loss": 1.2729, "step": 8418 }, { "epoch": 2.5075671550103316, "grad_norm": 0.25507616996765137, "learning_rate": 1.7546920018276136e-05, "loss": 1.2532, "step": 8419 }, { "epoch": 2.5078650012100003, "grad_norm": 0.2289208471775055, "learning_rate": 1.7546287151191332e-05, "loss": 1.2541, "step": 8420 }, { "epoch": 2.508162847409669, "grad_norm": 0.2770848274230957, "learning_rate": 1.7545654213896756e-05, "loss": 1.2645, "step": 8421 }, { "epoch": 2.5084606936093374, "grad_norm": 0.2718277871608734, "learning_rate": 1.7545021206398297e-05, "loss": 1.2522, "step": 8422 }, { "epoch": 2.508758539809006, "grad_norm": 0.2465401440858841, "learning_rate": 1.754438812870185e-05, "loss": 1.2502, "step": 8423 }, { "epoch": 2.509056386008675, "grad_norm": 0.3208794593811035, "learning_rate": 1.7543754980813298e-05, "loss": 1.2609, "step": 8424 }, { "epoch": 2.5093542322083433, "grad_norm": 0.24295663833618164, "learning_rate": 1.754312176273853e-05, "loss": 1.2328, "step": 8425 }, { "epoch": 2.509652078408012, "grad_norm": 0.25769492983818054, "learning_rate": 1.7542488474483446e-05, "loss": 1.2591, "step": 8426 }, { "epoch": 2.509949924607681, "grad_norm": 0.2720673680305481, "learning_rate": 1.7541855116053935e-05, "loss": 1.2625, "step": 8427 }, { "epoch": 2.510247770807349, "grad_norm": 0.22432737052440643, "learning_rate": 1.754122168745589e-05, "loss": 1.2352, "step": 8428 }, { "epoch": 2.510545617007018, "grad_norm": 0.2628263831138611, "learning_rate": 1.7540588188695197e-05, "loss": 1.262, "step": 8429 }, { "epoch": 2.5108434632066867, "grad_norm": 0.2659337520599365, "learning_rate": 1.753995461977776e-05, "loss": 1.2459, "step": 8430 }, { "epoch": 2.511141309406355, "grad_norm": 0.23539134860038757, "learning_rate": 1.7539320980709466e-05, "loss": 1.2491, "step": 8431 }, { "epoch": 2.511439155606024, "grad_norm": 0.2545980215072632, "learning_rate": 1.7538687271496214e-05, "loss": 1.2634, "step": 8432 }, { "epoch": 2.5117370018056926, "grad_norm": 0.2392890602350235, "learning_rate": 1.7538053492143902e-05, "loss": 1.2412, "step": 8433 }, { "epoch": 2.5120348480053614, "grad_norm": 0.25248026847839355, "learning_rate": 1.7537419642658423e-05, "loss": 1.2418, "step": 8434 }, { "epoch": 2.51233269420503, "grad_norm": 0.3714005947113037, "learning_rate": 1.7536785723045674e-05, "loss": 1.2549, "step": 8435 }, { "epoch": 2.5126305404046985, "grad_norm": 0.3023560047149658, "learning_rate": 1.7536151733311557e-05, "loss": 1.2344, "step": 8436 }, { "epoch": 2.5129283866043672, "grad_norm": 0.282634973526001, "learning_rate": 1.753551767346197e-05, "loss": 1.2538, "step": 8437 }, { "epoch": 2.513226232804036, "grad_norm": 0.31312984228134155, "learning_rate": 1.7534883543502804e-05, "loss": 1.2514, "step": 8438 }, { "epoch": 2.5135240790037043, "grad_norm": 0.2593906819820404, "learning_rate": 1.7534249343439967e-05, "loss": 1.2328, "step": 8439 }, { "epoch": 2.513821925203373, "grad_norm": 0.25793519616127014, "learning_rate": 1.7533615073279363e-05, "loss": 1.2425, "step": 8440 }, { "epoch": 2.514119771403042, "grad_norm": 0.2638314366340637, "learning_rate": 1.753298073302688e-05, "loss": 1.2366, "step": 8441 }, { "epoch": 2.51441761760271, "grad_norm": 0.23106567561626434, "learning_rate": 1.7532346322688434e-05, "loss": 1.2503, "step": 8442 }, { "epoch": 2.514715463802379, "grad_norm": 0.3165026605129242, "learning_rate": 1.753171184226992e-05, "loss": 1.2629, "step": 8443 }, { "epoch": 2.5150133100020478, "grad_norm": 0.2291480451822281, "learning_rate": 1.7531077291777242e-05, "loss": 1.2405, "step": 8444 }, { "epoch": 2.515311156201716, "grad_norm": 0.24233947694301605, "learning_rate": 1.75304426712163e-05, "loss": 1.245, "step": 8445 }, { "epoch": 2.515609002401385, "grad_norm": 0.2441457360982895, "learning_rate": 1.7529807980593006e-05, "loss": 1.2502, "step": 8446 }, { "epoch": 2.5159068486010536, "grad_norm": 0.21896037459373474, "learning_rate": 1.752917321991326e-05, "loss": 1.2495, "step": 8447 }, { "epoch": 2.5162046948007224, "grad_norm": 0.2734503448009491, "learning_rate": 1.752853838918297e-05, "loss": 1.2435, "step": 8448 }, { "epoch": 2.516502541000391, "grad_norm": 0.23741139471530914, "learning_rate": 1.7527903488408044e-05, "loss": 1.258, "step": 8449 }, { "epoch": 2.5168003872000595, "grad_norm": 0.24481236934661865, "learning_rate": 1.7527268517594383e-05, "loss": 1.2531, "step": 8450 }, { "epoch": 2.5170982333997283, "grad_norm": 0.23682580888271332, "learning_rate": 1.75266334767479e-05, "loss": 1.2584, "step": 8451 }, { "epoch": 2.517396079599397, "grad_norm": 0.29860150814056396, "learning_rate": 1.75259983658745e-05, "loss": 1.2407, "step": 8452 }, { "epoch": 2.5176939257990654, "grad_norm": 0.23807692527770996, "learning_rate": 1.7525363184980095e-05, "loss": 1.2344, "step": 8453 }, { "epoch": 2.517991771998734, "grad_norm": 0.25496482849121094, "learning_rate": 1.7524727934070596e-05, "loss": 1.2577, "step": 8454 }, { "epoch": 2.518289618198403, "grad_norm": 0.2405746430158615, "learning_rate": 1.752409261315191e-05, "loss": 1.2408, "step": 8455 }, { "epoch": 2.5185874643980712, "grad_norm": 0.31379228830337524, "learning_rate": 1.7523457222229944e-05, "loss": 1.256, "step": 8456 }, { "epoch": 2.51888531059774, "grad_norm": 0.292829304933548, "learning_rate": 1.7522821761310616e-05, "loss": 1.2628, "step": 8457 }, { "epoch": 2.519183156797409, "grad_norm": 0.2625550925731659, "learning_rate": 1.752218623039984e-05, "loss": 1.2408, "step": 8458 }, { "epoch": 2.519481002997077, "grad_norm": 0.2893580198287964, "learning_rate": 1.7521550629503524e-05, "loss": 1.2384, "step": 8459 }, { "epoch": 2.519778849196746, "grad_norm": 0.24691055715084076, "learning_rate": 1.752091495862758e-05, "loss": 1.2414, "step": 8460 }, { "epoch": 2.5200766953964147, "grad_norm": 0.26145702600479126, "learning_rate": 1.752027921777793e-05, "loss": 1.2303, "step": 8461 }, { "epoch": 2.5203745415960834, "grad_norm": 0.22267287969589233, "learning_rate": 1.751964340696048e-05, "loss": 1.24, "step": 8462 }, { "epoch": 2.520672387795752, "grad_norm": 0.26279503107070923, "learning_rate": 1.751900752618115e-05, "loss": 1.2453, "step": 8463 }, { "epoch": 2.5209702339954205, "grad_norm": 0.23793211579322815, "learning_rate": 1.751837157544586e-05, "loss": 1.2428, "step": 8464 }, { "epoch": 2.5212680801950893, "grad_norm": 0.24436452984809875, "learning_rate": 1.7517735554760518e-05, "loss": 1.2452, "step": 8465 }, { "epoch": 2.521565926394758, "grad_norm": 0.24439586699008942, "learning_rate": 1.7517099464131045e-05, "loss": 1.2544, "step": 8466 }, { "epoch": 2.5218637725944264, "grad_norm": 0.2703554630279541, "learning_rate": 1.7516463303563364e-05, "loss": 1.2608, "step": 8467 }, { "epoch": 2.522161618794095, "grad_norm": 0.2439625859260559, "learning_rate": 1.7515827073063388e-05, "loss": 1.2464, "step": 8468 }, { "epoch": 2.522459464993764, "grad_norm": 0.2397177666425705, "learning_rate": 1.7515190772637038e-05, "loss": 1.2514, "step": 8469 }, { "epoch": 2.5227573111934323, "grad_norm": 0.23239301145076752, "learning_rate": 1.7514554402290235e-05, "loss": 1.2628, "step": 8470 }, { "epoch": 2.523055157393101, "grad_norm": 0.2251158356666565, "learning_rate": 1.75139179620289e-05, "loss": 1.2434, "step": 8471 }, { "epoch": 2.52335300359277, "grad_norm": 0.23615312576293945, "learning_rate": 1.751328145185895e-05, "loss": 1.2347, "step": 8472 }, { "epoch": 2.5236508497924386, "grad_norm": 0.2576223909854889, "learning_rate": 1.7512644871786312e-05, "loss": 1.2346, "step": 8473 }, { "epoch": 2.523948695992107, "grad_norm": 0.2452310174703598, "learning_rate": 1.751200822181691e-05, "loss": 1.2472, "step": 8474 }, { "epoch": 2.5242465421917757, "grad_norm": 0.25342339277267456, "learning_rate": 1.751137150195666e-05, "loss": 1.2573, "step": 8475 }, { "epoch": 2.5245443883914445, "grad_norm": 0.3129463493824005, "learning_rate": 1.7510734712211494e-05, "loss": 1.2415, "step": 8476 }, { "epoch": 2.5248422345911132, "grad_norm": 0.30068057775497437, "learning_rate": 1.751009785258733e-05, "loss": 1.2541, "step": 8477 }, { "epoch": 2.5251400807907816, "grad_norm": 0.23290516436100006, "learning_rate": 1.75094609230901e-05, "loss": 1.2629, "step": 8478 }, { "epoch": 2.5254379269904503, "grad_norm": 0.3935732841491699, "learning_rate": 1.7508823923725723e-05, "loss": 1.2463, "step": 8479 }, { "epoch": 2.525735773190119, "grad_norm": 0.30810996890068054, "learning_rate": 1.750818685450013e-05, "loss": 1.2537, "step": 8480 }, { "epoch": 2.5260336193897874, "grad_norm": 0.28003424406051636, "learning_rate": 1.7507549715419245e-05, "loss": 1.2443, "step": 8481 }, { "epoch": 2.526331465589456, "grad_norm": 0.23418410122394562, "learning_rate": 1.7506912506489002e-05, "loss": 1.2507, "step": 8482 }, { "epoch": 2.526629311789125, "grad_norm": 0.32362639904022217, "learning_rate": 1.750627522771532e-05, "loss": 1.2509, "step": 8483 }, { "epoch": 2.5269271579887933, "grad_norm": 0.2395038902759552, "learning_rate": 1.7505637879104137e-05, "loss": 1.2429, "step": 8484 }, { "epoch": 2.527225004188462, "grad_norm": 0.24742889404296875, "learning_rate": 1.7505000460661378e-05, "loss": 1.2534, "step": 8485 }, { "epoch": 2.527522850388131, "grad_norm": 0.2470606416463852, "learning_rate": 1.7504362972392977e-05, "loss": 1.2462, "step": 8486 }, { "epoch": 2.5278206965877996, "grad_norm": 0.23291617631912231, "learning_rate": 1.750372541430486e-05, "loss": 1.2446, "step": 8487 }, { "epoch": 2.5281185427874684, "grad_norm": 0.22437015175819397, "learning_rate": 1.7503087786402962e-05, "loss": 1.23, "step": 8488 }, { "epoch": 2.5284163889871367, "grad_norm": 0.2314341962337494, "learning_rate": 1.750245008869322e-05, "loss": 1.2428, "step": 8489 }, { "epoch": 2.5287142351868055, "grad_norm": 0.24021659791469574, "learning_rate": 1.7501812321181556e-05, "loss": 1.25, "step": 8490 }, { "epoch": 2.5290120813864743, "grad_norm": 0.23177403211593628, "learning_rate": 1.7501174483873914e-05, "loss": 1.2478, "step": 8491 }, { "epoch": 2.5293099275861426, "grad_norm": 0.22764109075069427, "learning_rate": 1.750053657677622e-05, "loss": 1.2393, "step": 8492 }, { "epoch": 2.5296077737858114, "grad_norm": 0.24774837493896484, "learning_rate": 1.7499898599894415e-05, "loss": 1.2576, "step": 8493 }, { "epoch": 2.52990561998548, "grad_norm": 0.22911469638347626, "learning_rate": 1.7499260553234434e-05, "loss": 1.2438, "step": 8494 }, { "epoch": 2.5302034661851485, "grad_norm": 0.24415606260299683, "learning_rate": 1.749862243680221e-05, "loss": 1.2526, "step": 8495 }, { "epoch": 2.5305013123848172, "grad_norm": 0.24845975637435913, "learning_rate": 1.7497984250603687e-05, "loss": 1.2468, "step": 8496 }, { "epoch": 2.530799158584486, "grad_norm": 0.2423754185438156, "learning_rate": 1.7497345994644795e-05, "loss": 1.261, "step": 8497 }, { "epoch": 2.5310970047841543, "grad_norm": 0.2517605125904083, "learning_rate": 1.7496707668931474e-05, "loss": 1.2638, "step": 8498 }, { "epoch": 2.531394850983823, "grad_norm": 0.2364327609539032, "learning_rate": 1.7496069273469664e-05, "loss": 1.2519, "step": 8499 }, { "epoch": 2.531692697183492, "grad_norm": 0.2540130019187927, "learning_rate": 1.7495430808265307e-05, "loss": 1.2511, "step": 8500 }, { "epoch": 2.531692697183492, "eval_loss": 1.3396437168121338, "eval_runtime": 19.6651, "eval_samples_per_second": 88.177, "eval_steps_per_second": 5.543, "step": 8500 }, { "epoch": 2.5319905433831607, "grad_norm": 0.23343361914157867, "learning_rate": 1.7494792273324337e-05, "loss": 1.2657, "step": 8501 }, { "epoch": 2.5322883895828294, "grad_norm": 0.238958477973938, "learning_rate": 1.7494153668652702e-05, "loss": 1.2398, "step": 8502 }, { "epoch": 2.5325862357824978, "grad_norm": 0.2486771047115326, "learning_rate": 1.7493514994256336e-05, "loss": 1.2617, "step": 8503 }, { "epoch": 2.5328840819821665, "grad_norm": 0.24094340205192566, "learning_rate": 1.749287625014119e-05, "loss": 1.2398, "step": 8504 }, { "epoch": 2.5331819281818353, "grad_norm": 0.24892079830169678, "learning_rate": 1.74922374363132e-05, "loss": 1.2634, "step": 8505 }, { "epoch": 2.5334797743815036, "grad_norm": 0.24563492834568024, "learning_rate": 1.749159855277831e-05, "loss": 1.2632, "step": 8506 }, { "epoch": 2.5337776205811724, "grad_norm": 0.268160343170166, "learning_rate": 1.7490959599542467e-05, "loss": 1.2442, "step": 8507 }, { "epoch": 2.534075466780841, "grad_norm": 0.25259169936180115, "learning_rate": 1.7490320576611613e-05, "loss": 1.2319, "step": 8508 }, { "epoch": 2.5343733129805095, "grad_norm": 0.29176679253578186, "learning_rate": 1.7489681483991696e-05, "loss": 1.2304, "step": 8509 }, { "epoch": 2.5346711591801783, "grad_norm": 0.24003355205059052, "learning_rate": 1.748904232168866e-05, "loss": 1.2592, "step": 8510 }, { "epoch": 2.534969005379847, "grad_norm": 0.24610748887062073, "learning_rate": 1.7488403089708455e-05, "loss": 1.2395, "step": 8511 }, { "epoch": 2.5352668515795154, "grad_norm": 0.24857553839683533, "learning_rate": 1.7487763788057022e-05, "loss": 1.2578, "step": 8512 }, { "epoch": 2.535564697779184, "grad_norm": 0.2514202296733856, "learning_rate": 1.7487124416740315e-05, "loss": 1.2422, "step": 8513 }, { "epoch": 2.535862543978853, "grad_norm": 0.26457077264785767, "learning_rate": 1.7486484975764278e-05, "loss": 1.2681, "step": 8514 }, { "epoch": 2.5361603901785217, "grad_norm": 0.2324807494878769, "learning_rate": 1.7485845465134866e-05, "loss": 1.2478, "step": 8515 }, { "epoch": 2.5364582363781905, "grad_norm": 0.2839811146259308, "learning_rate": 1.7485205884858024e-05, "loss": 1.246, "step": 8516 }, { "epoch": 2.536756082577859, "grad_norm": 0.24369819462299347, "learning_rate": 1.7484566234939705e-05, "loss": 1.2394, "step": 8517 }, { "epoch": 2.5370539287775276, "grad_norm": 0.24273979663848877, "learning_rate": 1.7483926515385862e-05, "loss": 1.2449, "step": 8518 }, { "epoch": 2.5373517749771963, "grad_norm": 0.2644827365875244, "learning_rate": 1.748328672620244e-05, "loss": 1.2445, "step": 8519 }, { "epoch": 2.5376496211768647, "grad_norm": 0.23556312918663025, "learning_rate": 1.7482646867395396e-05, "loss": 1.2518, "step": 8520 }, { "epoch": 2.5379474673765334, "grad_norm": 0.2538624107837677, "learning_rate": 1.7482006938970685e-05, "loss": 1.2402, "step": 8521 }, { "epoch": 2.538245313576202, "grad_norm": 0.23888514935970306, "learning_rate": 1.7481366940934256e-05, "loss": 1.2346, "step": 8522 }, { "epoch": 2.5385431597758705, "grad_norm": 0.2887038290500641, "learning_rate": 1.748072687329207e-05, "loss": 1.244, "step": 8523 }, { "epoch": 2.5388410059755393, "grad_norm": 0.3057018518447876, "learning_rate": 1.7480086736050076e-05, "loss": 1.2572, "step": 8524 }, { "epoch": 2.539138852175208, "grad_norm": 0.2414216697216034, "learning_rate": 1.7479446529214232e-05, "loss": 1.2524, "step": 8525 }, { "epoch": 2.5394366983748764, "grad_norm": 0.4179585874080658, "learning_rate": 1.7478806252790497e-05, "loss": 1.2575, "step": 8526 }, { "epoch": 2.539734544574545, "grad_norm": 0.33019116520881653, "learning_rate": 1.7478165906784826e-05, "loss": 1.2562, "step": 8527 }, { "epoch": 2.540032390774214, "grad_norm": 0.2703005373477936, "learning_rate": 1.747752549120317e-05, "loss": 1.2565, "step": 8528 }, { "epoch": 2.5403302369738827, "grad_norm": 0.23929455876350403, "learning_rate": 1.74768850060515e-05, "loss": 1.2523, "step": 8529 }, { "epoch": 2.5406280831735515, "grad_norm": 0.30801793932914734, "learning_rate": 1.7476244451335767e-05, "loss": 1.2488, "step": 8530 }, { "epoch": 2.54092592937322, "grad_norm": 0.2433696687221527, "learning_rate": 1.747560382706193e-05, "loss": 1.2386, "step": 8531 }, { "epoch": 2.5412237755728886, "grad_norm": 0.241928830742836, "learning_rate": 1.7474963133235955e-05, "loss": 1.2512, "step": 8532 }, { "epoch": 2.5415216217725574, "grad_norm": 0.2384069710969925, "learning_rate": 1.7474322369863797e-05, "loss": 1.2702, "step": 8533 }, { "epoch": 2.5418194679722257, "grad_norm": 0.2398209571838379, "learning_rate": 1.7473681536951424e-05, "loss": 1.2574, "step": 8534 }, { "epoch": 2.5421173141718945, "grad_norm": 0.23053257167339325, "learning_rate": 1.747304063450479e-05, "loss": 1.2515, "step": 8535 }, { "epoch": 2.5424151603715632, "grad_norm": 0.23549732565879822, "learning_rate": 1.7472399662529865e-05, "loss": 1.2402, "step": 8536 }, { "epoch": 2.5427130065712316, "grad_norm": 0.2632571756839752, "learning_rate": 1.747175862103261e-05, "loss": 1.2505, "step": 8537 }, { "epoch": 2.5430108527709003, "grad_norm": 0.23406517505645752, "learning_rate": 1.7471117510018988e-05, "loss": 1.2592, "step": 8538 }, { "epoch": 2.543308698970569, "grad_norm": 0.23157420754432678, "learning_rate": 1.7470476329494962e-05, "loss": 1.2542, "step": 8539 }, { "epoch": 2.543606545170238, "grad_norm": 0.2320341169834137, "learning_rate": 1.7469835079466502e-05, "loss": 1.2353, "step": 8540 }, { "epoch": 2.543904391369906, "grad_norm": 0.2633756697177887, "learning_rate": 1.7469193759939576e-05, "loss": 1.2778, "step": 8541 }, { "epoch": 2.544202237569575, "grad_norm": 0.2357613444328308, "learning_rate": 1.7468552370920145e-05, "loss": 1.2499, "step": 8542 }, { "epoch": 2.5445000837692437, "grad_norm": 0.23448744416236877, "learning_rate": 1.7467910912414177e-05, "loss": 1.2647, "step": 8543 }, { "epoch": 2.5447979299689125, "grad_norm": 0.24945658445358276, "learning_rate": 1.7467269384427644e-05, "loss": 1.2528, "step": 8544 }, { "epoch": 2.545095776168581, "grad_norm": 0.23232224583625793, "learning_rate": 1.746662778696651e-05, "loss": 1.2527, "step": 8545 }, { "epoch": 2.5453936223682496, "grad_norm": 0.2522088587284088, "learning_rate": 1.7465986120036746e-05, "loss": 1.2477, "step": 8546 }, { "epoch": 2.5456914685679184, "grad_norm": 0.2554646134376526, "learning_rate": 1.7465344383644326e-05, "loss": 1.2545, "step": 8547 }, { "epoch": 2.5459893147675867, "grad_norm": 0.2312830537557602, "learning_rate": 1.7464702577795215e-05, "loss": 1.253, "step": 8548 }, { "epoch": 2.5462871609672555, "grad_norm": 0.25109291076660156, "learning_rate": 1.7464060702495386e-05, "loss": 1.2509, "step": 8549 }, { "epoch": 2.5465850071669243, "grad_norm": 0.257561594247818, "learning_rate": 1.7463418757750815e-05, "loss": 1.2446, "step": 8550 }, { "epoch": 2.5468828533665926, "grad_norm": 0.22968845069408417, "learning_rate": 1.7462776743567465e-05, "loss": 1.2351, "step": 8551 }, { "epoch": 2.5471806995662614, "grad_norm": 0.2320071905851364, "learning_rate": 1.746213465995132e-05, "loss": 1.2455, "step": 8552 }, { "epoch": 2.54747854576593, "grad_norm": 0.24376271665096283, "learning_rate": 1.7461492506908348e-05, "loss": 1.2612, "step": 8553 }, { "epoch": 2.547776391965599, "grad_norm": 0.23271118104457855, "learning_rate": 1.7460850284444527e-05, "loss": 1.272, "step": 8554 }, { "epoch": 2.5480742381652677, "grad_norm": 0.25115910172462463, "learning_rate": 1.7460207992565827e-05, "loss": 1.2402, "step": 8555 }, { "epoch": 2.548372084364936, "grad_norm": 0.2954905331134796, "learning_rate": 1.745956563127823e-05, "loss": 1.2444, "step": 8556 }, { "epoch": 2.548669930564605, "grad_norm": 0.25200897455215454, "learning_rate": 1.7458923200587705e-05, "loss": 1.2584, "step": 8557 }, { "epoch": 2.5489677767642736, "grad_norm": 0.27656984329223633, "learning_rate": 1.7458280700500232e-05, "loss": 1.2443, "step": 8558 }, { "epoch": 2.549265622963942, "grad_norm": 0.47544968128204346, "learning_rate": 1.7457638131021795e-05, "loss": 1.2482, "step": 8559 }, { "epoch": 2.5495634691636107, "grad_norm": 0.30716463923454285, "learning_rate": 1.7456995492158366e-05, "loss": 1.2535, "step": 8560 }, { "epoch": 2.5498613153632794, "grad_norm": 0.2898482084274292, "learning_rate": 1.745635278391592e-05, "loss": 1.243, "step": 8561 }, { "epoch": 2.5501591615629478, "grad_norm": 0.23438648879528046, "learning_rate": 1.745571000630045e-05, "loss": 1.2324, "step": 8562 }, { "epoch": 2.5504570077626165, "grad_norm": 0.28218358755111694, "learning_rate": 1.7455067159317924e-05, "loss": 1.2544, "step": 8563 }, { "epoch": 2.5507548539622853, "grad_norm": 0.2600075602531433, "learning_rate": 1.7454424242974327e-05, "loss": 1.2629, "step": 8564 }, { "epoch": 2.5510527001619536, "grad_norm": 0.224159374833107, "learning_rate": 1.7453781257275643e-05, "loss": 1.258, "step": 8565 }, { "epoch": 2.5513505463616224, "grad_norm": 0.22335480153560638, "learning_rate": 1.745313820222785e-05, "loss": 1.2405, "step": 8566 }, { "epoch": 2.551648392561291, "grad_norm": 0.22238238155841827, "learning_rate": 1.7452495077836936e-05, "loss": 1.2426, "step": 8567 }, { "epoch": 2.55194623876096, "grad_norm": 0.2267833650112152, "learning_rate": 1.745185188410888e-05, "loss": 1.2595, "step": 8568 }, { "epoch": 2.5522440849606287, "grad_norm": 0.23567399382591248, "learning_rate": 1.7451208621049668e-05, "loss": 1.2339, "step": 8569 }, { "epoch": 2.552541931160297, "grad_norm": 0.2311100959777832, "learning_rate": 1.7450565288665284e-05, "loss": 1.2591, "step": 8570 }, { "epoch": 2.552839777359966, "grad_norm": 0.22517523169517517, "learning_rate": 1.7449921886961716e-05, "loss": 1.2495, "step": 8571 }, { "epoch": 2.5531376235596346, "grad_norm": 0.2333524376153946, "learning_rate": 1.7449278415944947e-05, "loss": 1.2469, "step": 8572 }, { "epoch": 2.553435469759303, "grad_norm": 0.23362566530704498, "learning_rate": 1.7448634875620967e-05, "loss": 1.2544, "step": 8573 }, { "epoch": 2.5537333159589717, "grad_norm": 0.23605845868587494, "learning_rate": 1.7447991265995764e-05, "loss": 1.2485, "step": 8574 }, { "epoch": 2.5540311621586405, "grad_norm": 0.23068703711032867, "learning_rate": 1.7447347587075317e-05, "loss": 1.2562, "step": 8575 }, { "epoch": 2.554329008358309, "grad_norm": 0.23618610203266144, "learning_rate": 1.7446703838865624e-05, "loss": 1.2611, "step": 8576 }, { "epoch": 2.5546268545579776, "grad_norm": 0.22635234892368317, "learning_rate": 1.7446060021372674e-05, "loss": 1.2531, "step": 8577 }, { "epoch": 2.5549247007576463, "grad_norm": 0.22549006342887878, "learning_rate": 1.744541613460245e-05, "loss": 1.2445, "step": 8578 }, { "epoch": 2.5552225469573147, "grad_norm": 0.22893446683883667, "learning_rate": 1.7444772178560955e-05, "loss": 1.2486, "step": 8579 }, { "epoch": 2.5555203931569834, "grad_norm": 0.22884921729564667, "learning_rate": 1.7444128153254164e-05, "loss": 1.2346, "step": 8580 }, { "epoch": 2.555818239356652, "grad_norm": 0.2412279099225998, "learning_rate": 1.744348405868808e-05, "loss": 1.2553, "step": 8581 }, { "epoch": 2.556116085556321, "grad_norm": 0.23234815895557404, "learning_rate": 1.7442839894868698e-05, "loss": 1.2352, "step": 8582 }, { "epoch": 2.5564139317559897, "grad_norm": 0.2420787811279297, "learning_rate": 1.7442195661802004e-05, "loss": 1.2629, "step": 8583 }, { "epoch": 2.556711777955658, "grad_norm": 0.23283688724040985, "learning_rate": 1.744155135949399e-05, "loss": 1.2468, "step": 8584 }, { "epoch": 2.557009624155327, "grad_norm": 0.2487560659646988, "learning_rate": 1.744090698795066e-05, "loss": 1.2376, "step": 8585 }, { "epoch": 2.5573074703549956, "grad_norm": 0.23037666082382202, "learning_rate": 1.7440262547178e-05, "loss": 1.2385, "step": 8586 }, { "epoch": 2.557605316554664, "grad_norm": 0.23633430898189545, "learning_rate": 1.7439618037182013e-05, "loss": 1.2423, "step": 8587 }, { "epoch": 2.5579031627543327, "grad_norm": 0.22492575645446777, "learning_rate": 1.743897345796869e-05, "loss": 1.2616, "step": 8588 }, { "epoch": 2.5582010089540015, "grad_norm": 0.2327558845281601, "learning_rate": 1.7438328809544033e-05, "loss": 1.2409, "step": 8589 }, { "epoch": 2.55849885515367, "grad_norm": 0.23688359558582306, "learning_rate": 1.7437684091914036e-05, "loss": 1.2617, "step": 8590 }, { "epoch": 2.5587967013533386, "grad_norm": 0.22410444915294647, "learning_rate": 1.74370393050847e-05, "loss": 1.2518, "step": 8591 }, { "epoch": 2.5590945475530074, "grad_norm": 0.22046175599098206, "learning_rate": 1.7436394449062016e-05, "loss": 1.2572, "step": 8592 }, { "epoch": 2.5593923937526757, "grad_norm": 0.2666095197200775, "learning_rate": 1.7435749523851996e-05, "loss": 1.2697, "step": 8593 }, { "epoch": 2.5596902399523445, "grad_norm": 0.2578844726085663, "learning_rate": 1.743510452946063e-05, "loss": 1.2382, "step": 8594 }, { "epoch": 2.5599880861520132, "grad_norm": 0.23768317699432373, "learning_rate": 1.7434459465893927e-05, "loss": 1.2316, "step": 8595 }, { "epoch": 2.560285932351682, "grad_norm": 0.35362330079078674, "learning_rate": 1.7433814333157886e-05, "loss": 1.2572, "step": 8596 }, { "epoch": 2.5605837785513508, "grad_norm": 0.31020310521125793, "learning_rate": 1.743316913125851e-05, "loss": 1.269, "step": 8597 }, { "epoch": 2.560881624751019, "grad_norm": 0.28512439131736755, "learning_rate": 1.743252386020179e-05, "loss": 1.238, "step": 8598 }, { "epoch": 2.561179470950688, "grad_norm": 0.4712526500225067, "learning_rate": 1.7431878519993745e-05, "loss": 1.2553, "step": 8599 }, { "epoch": 2.5614773171503566, "grad_norm": 0.23439401388168335, "learning_rate": 1.743123311064038e-05, "loss": 1.2388, "step": 8600 }, { "epoch": 2.561775163350025, "grad_norm": 0.2374618649482727, "learning_rate": 1.7430587632147685e-05, "loss": 1.2579, "step": 8601 }, { "epoch": 2.5620730095496937, "grad_norm": 0.2444273978471756, "learning_rate": 1.7429942084521676e-05, "loss": 1.2566, "step": 8602 }, { "epoch": 2.5623708557493625, "grad_norm": 0.23217162489891052, "learning_rate": 1.742929646776836e-05, "loss": 1.2568, "step": 8603 }, { "epoch": 2.562668701949031, "grad_norm": 0.23166996240615845, "learning_rate": 1.742865078189374e-05, "loss": 1.243, "step": 8604 }, { "epoch": 2.5629665481486996, "grad_norm": 0.22938452661037445, "learning_rate": 1.7428005026903823e-05, "loss": 1.2373, "step": 8605 }, { "epoch": 2.5632643943483684, "grad_norm": 0.2335512489080429, "learning_rate": 1.742735920280462e-05, "loss": 1.2368, "step": 8606 }, { "epoch": 2.563562240548037, "grad_norm": 0.23666122555732727, "learning_rate": 1.7426713309602132e-05, "loss": 1.2343, "step": 8607 }, { "epoch": 2.5638600867477055, "grad_norm": 0.23143525421619415, "learning_rate": 1.742606734730238e-05, "loss": 1.2336, "step": 8608 }, { "epoch": 2.5641579329473743, "grad_norm": 0.23421306908130646, "learning_rate": 1.7425421315911368e-05, "loss": 1.2564, "step": 8609 }, { "epoch": 2.564455779147043, "grad_norm": 0.2414148598909378, "learning_rate": 1.7424775215435106e-05, "loss": 1.2594, "step": 8610 }, { "epoch": 2.564753625346712, "grad_norm": 0.2428535521030426, "learning_rate": 1.7424129045879605e-05, "loss": 1.2325, "step": 8611 }, { "epoch": 2.56505147154638, "grad_norm": 0.23873963952064514, "learning_rate": 1.742348280725088e-05, "loss": 1.24, "step": 8612 }, { "epoch": 2.565349317746049, "grad_norm": 0.22313565015792847, "learning_rate": 1.742283649955494e-05, "loss": 1.258, "step": 8613 }, { "epoch": 2.5656471639457177, "grad_norm": 0.2256631702184677, "learning_rate": 1.74221901227978e-05, "loss": 1.2456, "step": 8614 }, { "epoch": 2.565945010145386, "grad_norm": 0.22985881567001343, "learning_rate": 1.7421543676985476e-05, "loss": 1.2435, "step": 8615 }, { "epoch": 2.566242856345055, "grad_norm": 0.22781185805797577, "learning_rate": 1.7420897162123976e-05, "loss": 1.2363, "step": 8616 }, { "epoch": 2.5665407025447236, "grad_norm": 0.2470318078994751, "learning_rate": 1.742025057821932e-05, "loss": 1.2529, "step": 8617 }, { "epoch": 2.566838548744392, "grad_norm": 0.22905333340168, "learning_rate": 1.7419603925277524e-05, "loss": 1.2711, "step": 8618 }, { "epoch": 2.5671363949440607, "grad_norm": 0.2343169003725052, "learning_rate": 1.7418957203304604e-05, "loss": 1.2482, "step": 8619 }, { "epoch": 2.5674342411437294, "grad_norm": 0.22915878891944885, "learning_rate": 1.741831041230657e-05, "loss": 1.2481, "step": 8620 }, { "epoch": 2.567732087343398, "grad_norm": 0.22456571459770203, "learning_rate": 1.7417663552289452e-05, "loss": 1.2636, "step": 8621 }, { "epoch": 2.568029933543067, "grad_norm": 0.22265848517417908, "learning_rate": 1.7417016623259263e-05, "loss": 1.2563, "step": 8622 }, { "epoch": 2.5683277797427353, "grad_norm": 0.225847065448761, "learning_rate": 1.741636962522202e-05, "loss": 1.2412, "step": 8623 }, { "epoch": 2.568625625942404, "grad_norm": 0.23029930889606476, "learning_rate": 1.7415722558183738e-05, "loss": 1.24, "step": 8624 }, { "epoch": 2.568923472142073, "grad_norm": 0.2382858693599701, "learning_rate": 1.741507542215045e-05, "loss": 1.2733, "step": 8625 }, { "epoch": 2.569221318341741, "grad_norm": 0.224764883518219, "learning_rate": 1.7414428217128165e-05, "loss": 1.2436, "step": 8626 }, { "epoch": 2.56951916454141, "grad_norm": 0.2427278608083725, "learning_rate": 1.741378094312291e-05, "loss": 1.2535, "step": 8627 }, { "epoch": 2.5698170107410787, "grad_norm": 0.25812211632728577, "learning_rate": 1.741313360014071e-05, "loss": 1.2486, "step": 8628 }, { "epoch": 2.570114856940747, "grad_norm": 0.22946663200855255, "learning_rate": 1.741248618818758e-05, "loss": 1.2525, "step": 8629 }, { "epoch": 2.570412703140416, "grad_norm": 0.23297150433063507, "learning_rate": 1.7411838707269552e-05, "loss": 1.2492, "step": 8630 }, { "epoch": 2.5707105493400846, "grad_norm": 0.23864395916461945, "learning_rate": 1.7411191157392642e-05, "loss": 1.252, "step": 8631 }, { "epoch": 2.571008395539753, "grad_norm": 0.22829514741897583, "learning_rate": 1.7410543538562884e-05, "loss": 1.2591, "step": 8632 }, { "epoch": 2.5713062417394217, "grad_norm": 0.236654132604599, "learning_rate": 1.7409895850786293e-05, "loss": 1.2432, "step": 8633 }, { "epoch": 2.5716040879390905, "grad_norm": 0.23477168381214142, "learning_rate": 1.74092480940689e-05, "loss": 1.2565, "step": 8634 }, { "epoch": 2.5719019341387592, "grad_norm": 0.23336704075336456, "learning_rate": 1.7408600268416733e-05, "loss": 1.2504, "step": 8635 }, { "epoch": 2.572199780338428, "grad_norm": 0.23772408068180084, "learning_rate": 1.7407952373835818e-05, "loss": 1.2576, "step": 8636 }, { "epoch": 2.5724976265380963, "grad_norm": 0.22036735713481903, "learning_rate": 1.740730441033218e-05, "loss": 1.2614, "step": 8637 }, { "epoch": 2.572795472737765, "grad_norm": 0.23383082449436188, "learning_rate": 1.7406656377911854e-05, "loss": 1.2492, "step": 8638 }, { "epoch": 2.573093318937434, "grad_norm": 0.2365119755268097, "learning_rate": 1.7406008276580866e-05, "loss": 1.2564, "step": 8639 }, { "epoch": 2.573391165137102, "grad_norm": 0.254342257976532, "learning_rate": 1.7405360106345242e-05, "loss": 1.2643, "step": 8640 }, { "epoch": 2.573689011336771, "grad_norm": 0.24152617156505585, "learning_rate": 1.740471186721102e-05, "loss": 1.2541, "step": 8641 }, { "epoch": 2.5739868575364397, "grad_norm": 0.2462647706270218, "learning_rate": 1.7404063559184227e-05, "loss": 1.2535, "step": 8642 }, { "epoch": 2.574284703736108, "grad_norm": 0.29389768838882446, "learning_rate": 1.740341518227089e-05, "loss": 1.2555, "step": 8643 }, { "epoch": 2.574582549935777, "grad_norm": 0.23136746883392334, "learning_rate": 1.740276673647705e-05, "loss": 1.2454, "step": 8644 }, { "epoch": 2.5748803961354456, "grad_norm": 0.2376822680234909, "learning_rate": 1.7402118221808733e-05, "loss": 1.2337, "step": 8645 }, { "epoch": 2.575178242335114, "grad_norm": 0.22440697252750397, "learning_rate": 1.740146963827198e-05, "loss": 1.2516, "step": 8646 }, { "epoch": 2.5754760885347827, "grad_norm": 0.2741200029850006, "learning_rate": 1.740082098587282e-05, "loss": 1.2492, "step": 8647 }, { "epoch": 2.5757739347344515, "grad_norm": 0.22482140362262726, "learning_rate": 1.740017226461729e-05, "loss": 1.2261, "step": 8648 }, { "epoch": 2.5760717809341203, "grad_norm": 0.2622748017311096, "learning_rate": 1.7399523474511423e-05, "loss": 1.2265, "step": 8649 }, { "epoch": 2.576369627133789, "grad_norm": 0.23331144452095032, "learning_rate": 1.7398874615561258e-05, "loss": 1.2458, "step": 8650 }, { "epoch": 2.5766674733334574, "grad_norm": 0.295108824968338, "learning_rate": 1.7398225687772834e-05, "loss": 1.2508, "step": 8651 }, { "epoch": 2.576965319533126, "grad_norm": 0.2672525644302368, "learning_rate": 1.7397576691152185e-05, "loss": 1.2467, "step": 8652 }, { "epoch": 2.577263165732795, "grad_norm": 0.24952372908592224, "learning_rate": 1.7396927625705345e-05, "loss": 1.2628, "step": 8653 }, { "epoch": 2.5775610119324632, "grad_norm": 0.28745022416114807, "learning_rate": 1.7396278491438363e-05, "loss": 1.2386, "step": 8654 }, { "epoch": 2.577858858132132, "grad_norm": 0.23667721450328827, "learning_rate": 1.7395629288357275e-05, "loss": 1.2449, "step": 8655 }, { "epoch": 2.5781567043318008, "grad_norm": 0.2344713658094406, "learning_rate": 1.7394980016468113e-05, "loss": 1.2529, "step": 8656 }, { "epoch": 2.578454550531469, "grad_norm": 0.22538915276527405, "learning_rate": 1.739433067577693e-05, "loss": 1.2434, "step": 8657 }, { "epoch": 2.578752396731138, "grad_norm": 0.25074049830436707, "learning_rate": 1.7393681266289758e-05, "loss": 1.2527, "step": 8658 }, { "epoch": 2.5790502429308066, "grad_norm": 0.25742632150650024, "learning_rate": 1.7393031788012643e-05, "loss": 1.2529, "step": 8659 }, { "epoch": 2.579348089130475, "grad_norm": 0.2562355399131775, "learning_rate": 1.7392382240951628e-05, "loss": 1.2493, "step": 8660 }, { "epoch": 2.5796459353301437, "grad_norm": 0.318065345287323, "learning_rate": 1.7391732625112754e-05, "loss": 1.2406, "step": 8661 }, { "epoch": 2.5799437815298125, "grad_norm": 0.2281266450881958, "learning_rate": 1.7391082940502065e-05, "loss": 1.2515, "step": 8662 }, { "epoch": 2.5802416277294813, "grad_norm": 0.26424264907836914, "learning_rate": 1.739043318712561e-05, "loss": 1.2479, "step": 8663 }, { "epoch": 2.58053947392915, "grad_norm": 0.25391125679016113, "learning_rate": 1.7389783364989432e-05, "loss": 1.2616, "step": 8664 }, { "epoch": 2.5808373201288184, "grad_norm": 0.366413414478302, "learning_rate": 1.7389133474099577e-05, "loss": 1.2408, "step": 8665 }, { "epoch": 2.581135166328487, "grad_norm": 0.2793494760990143, "learning_rate": 1.7388483514462088e-05, "loss": 1.2511, "step": 8666 }, { "epoch": 2.581433012528156, "grad_norm": 0.26306232810020447, "learning_rate": 1.7387833486083013e-05, "loss": 1.2346, "step": 8667 }, { "epoch": 2.5817308587278243, "grad_norm": 0.2654370367527008, "learning_rate": 1.7387183388968404e-05, "loss": 1.2446, "step": 8668 }, { "epoch": 2.582028704927493, "grad_norm": 0.2921411991119385, "learning_rate": 1.7386533223124308e-05, "loss": 1.2647, "step": 8669 }, { "epoch": 2.582326551127162, "grad_norm": 0.2798786163330078, "learning_rate": 1.7385882988556774e-05, "loss": 1.2329, "step": 8670 }, { "epoch": 2.58262439732683, "grad_norm": 0.26488715410232544, "learning_rate": 1.7385232685271845e-05, "loss": 1.2483, "step": 8671 }, { "epoch": 2.582922243526499, "grad_norm": 0.23640671372413635, "learning_rate": 1.7384582313275583e-05, "loss": 1.2382, "step": 8672 }, { "epoch": 2.5832200897261677, "grad_norm": 0.34911125898361206, "learning_rate": 1.738393187257403e-05, "loss": 1.259, "step": 8673 }, { "epoch": 2.5835179359258365, "grad_norm": 0.2304478883743286, "learning_rate": 1.738328136317324e-05, "loss": 1.2427, "step": 8674 }, { "epoch": 2.583815782125505, "grad_norm": 0.25745683908462524, "learning_rate": 1.7382630785079267e-05, "loss": 1.2363, "step": 8675 }, { "epoch": 2.5841136283251736, "grad_norm": 0.2522353231906891, "learning_rate": 1.7381980138298165e-05, "loss": 1.2429, "step": 8676 }, { "epoch": 2.5844114745248423, "grad_norm": 0.22577343881130219, "learning_rate": 1.7381329422835986e-05, "loss": 1.2455, "step": 8677 }, { "epoch": 2.584709320724511, "grad_norm": 0.257718563079834, "learning_rate": 1.738067863869878e-05, "loss": 1.2662, "step": 8678 }, { "epoch": 2.5850071669241794, "grad_norm": 0.24879182875156403, "learning_rate": 1.738002778589261e-05, "loss": 1.2385, "step": 8679 }, { "epoch": 2.585305013123848, "grad_norm": 0.2378930002450943, "learning_rate": 1.737937686442352e-05, "loss": 1.2622, "step": 8680 }, { "epoch": 2.585602859323517, "grad_norm": 0.2391463816165924, "learning_rate": 1.7378725874297578e-05, "loss": 1.2576, "step": 8681 }, { "epoch": 2.5859007055231853, "grad_norm": 0.25435328483581543, "learning_rate": 1.7378074815520836e-05, "loss": 1.2336, "step": 8682 }, { "epoch": 2.586198551722854, "grad_norm": 0.21997854113578796, "learning_rate": 1.737742368809935e-05, "loss": 1.2388, "step": 8683 }, { "epoch": 2.586496397922523, "grad_norm": 0.2943100035190582, "learning_rate": 1.737677249203918e-05, "loss": 1.2491, "step": 8684 }, { "epoch": 2.586794244122191, "grad_norm": 0.23588617146015167, "learning_rate": 1.737612122734638e-05, "loss": 1.2534, "step": 8685 }, { "epoch": 2.58709209032186, "grad_norm": 0.23237860202789307, "learning_rate": 1.7375469894027018e-05, "loss": 1.242, "step": 8686 }, { "epoch": 2.5873899365215287, "grad_norm": 0.22945088148117065, "learning_rate": 1.7374818492087146e-05, "loss": 1.2466, "step": 8687 }, { "epoch": 2.5876877827211975, "grad_norm": 0.30818605422973633, "learning_rate": 1.7374167021532828e-05, "loss": 1.2404, "step": 8688 }, { "epoch": 2.5879856289208663, "grad_norm": 0.27361810207366943, "learning_rate": 1.7373515482370125e-05, "loss": 1.2653, "step": 8689 }, { "epoch": 2.5882834751205346, "grad_norm": 0.24611662328243256, "learning_rate": 1.7372863874605103e-05, "loss": 1.2407, "step": 8690 }, { "epoch": 2.5885813213202034, "grad_norm": 0.2687990069389343, "learning_rate": 1.7372212198243815e-05, "loss": 1.2316, "step": 8691 }, { "epoch": 2.588879167519872, "grad_norm": 0.25579991936683655, "learning_rate": 1.7371560453292327e-05, "loss": 1.2482, "step": 8692 }, { "epoch": 2.5891770137195405, "grad_norm": 0.263778418302536, "learning_rate": 1.737090863975671e-05, "loss": 1.2404, "step": 8693 }, { "epoch": 2.5894748599192092, "grad_norm": 0.238523930311203, "learning_rate": 1.737025675764302e-05, "loss": 1.2448, "step": 8694 }, { "epoch": 2.589772706118878, "grad_norm": 0.2523680627346039, "learning_rate": 1.7369604806957326e-05, "loss": 1.2455, "step": 8695 }, { "epoch": 2.5900705523185463, "grad_norm": 0.2315751165151596, "learning_rate": 1.7368952787705694e-05, "loss": 1.2293, "step": 8696 }, { "epoch": 2.590368398518215, "grad_norm": 0.23198474943637848, "learning_rate": 1.736830069989419e-05, "loss": 1.2535, "step": 8697 }, { "epoch": 2.590666244717884, "grad_norm": 0.2466469556093216, "learning_rate": 1.736764854352888e-05, "loss": 1.2425, "step": 8698 }, { "epoch": 2.590964090917552, "grad_norm": 0.23793677985668182, "learning_rate": 1.736699631861583e-05, "loss": 1.2635, "step": 8699 }, { "epoch": 2.591261937117221, "grad_norm": 0.2367255985736847, "learning_rate": 1.7366344025161114e-05, "loss": 1.2564, "step": 8700 }, { "epoch": 2.5915597833168897, "grad_norm": 0.22675922513008118, "learning_rate": 1.7365691663170793e-05, "loss": 1.2563, "step": 8701 }, { "epoch": 2.5918576295165585, "grad_norm": 0.2322254180908203, "learning_rate": 1.7365039232650945e-05, "loss": 1.2597, "step": 8702 }, { "epoch": 2.5921554757162273, "grad_norm": 0.26231998205184937, "learning_rate": 1.7364386733607634e-05, "loss": 1.2647, "step": 8703 }, { "epoch": 2.5924533219158956, "grad_norm": 0.265171080827713, "learning_rate": 1.736373416604693e-05, "loss": 1.2474, "step": 8704 }, { "epoch": 2.5927511681155644, "grad_norm": 0.2530490458011627, "learning_rate": 1.7363081529974906e-05, "loss": 1.2497, "step": 8705 }, { "epoch": 2.593049014315233, "grad_norm": 0.23230291903018951, "learning_rate": 1.736242882539764e-05, "loss": 1.2536, "step": 8706 }, { "epoch": 2.5933468605149015, "grad_norm": 0.35599738359451294, "learning_rate": 1.7361776052321196e-05, "loss": 1.2626, "step": 8707 }, { "epoch": 2.5936447067145703, "grad_norm": 0.3302066922187805, "learning_rate": 1.7361123210751652e-05, "loss": 1.268, "step": 8708 }, { "epoch": 2.593942552914239, "grad_norm": 0.300210565328598, "learning_rate": 1.736047030069508e-05, "loss": 1.2578, "step": 8709 }, { "epoch": 2.5942403991139074, "grad_norm": 0.47248217463493347, "learning_rate": 1.7359817322157556e-05, "loss": 1.267, "step": 8710 }, { "epoch": 2.594538245313576, "grad_norm": 0.23757919669151306, "learning_rate": 1.7359164275145154e-05, "loss": 1.2513, "step": 8711 }, { "epoch": 2.594836091513245, "grad_norm": 0.24626424908638, "learning_rate": 1.7358511159663952e-05, "loss": 1.2544, "step": 8712 }, { "epoch": 2.5951339377129132, "grad_norm": 0.23010654747486115, "learning_rate": 1.735785797572002e-05, "loss": 1.2376, "step": 8713 }, { "epoch": 2.595431783912582, "grad_norm": 0.23664626479148865, "learning_rate": 1.7357204723319447e-05, "loss": 1.2512, "step": 8714 }, { "epoch": 2.5957296301122508, "grad_norm": 0.2520230710506439, "learning_rate": 1.7356551402468303e-05, "loss": 1.2513, "step": 8715 }, { "epoch": 2.5960274763119195, "grad_norm": 0.2515430748462677, "learning_rate": 1.7355898013172666e-05, "loss": 1.247, "step": 8716 }, { "epoch": 2.5963253225115883, "grad_norm": 0.24009092152118683, "learning_rate": 1.7355244555438616e-05, "loss": 1.256, "step": 8717 }, { "epoch": 2.5966231687112566, "grad_norm": 0.23353387415409088, "learning_rate": 1.7354591029272236e-05, "loss": 1.2531, "step": 8718 }, { "epoch": 2.5969210149109254, "grad_norm": 0.24309256672859192, "learning_rate": 1.7353937434679597e-05, "loss": 1.2393, "step": 8719 }, { "epoch": 2.597218861110594, "grad_norm": 0.24077485501766205, "learning_rate": 1.735328377166679e-05, "loss": 1.2395, "step": 8720 }, { "epoch": 2.5975167073102625, "grad_norm": 0.24538271129131317, "learning_rate": 1.7352630040239895e-05, "loss": 1.2602, "step": 8721 }, { "epoch": 2.5978145535099313, "grad_norm": 0.2278101146221161, "learning_rate": 1.735197624040499e-05, "loss": 1.2472, "step": 8722 }, { "epoch": 2.5981123997096, "grad_norm": 0.23970970511436462, "learning_rate": 1.7351322372168162e-05, "loss": 1.2486, "step": 8723 }, { "epoch": 2.5984102459092684, "grad_norm": 0.24316518008708954, "learning_rate": 1.735066843553549e-05, "loss": 1.2553, "step": 8724 }, { "epoch": 2.598708092108937, "grad_norm": 0.2406611293554306, "learning_rate": 1.7350014430513064e-05, "loss": 1.2492, "step": 8725 }, { "epoch": 2.599005938308606, "grad_norm": 0.23540037870407104, "learning_rate": 1.7349360357106967e-05, "loss": 1.2318, "step": 8726 }, { "epoch": 2.5993037845082747, "grad_norm": 0.2322627454996109, "learning_rate": 1.734870621532328e-05, "loss": 1.2515, "step": 8727 }, { "epoch": 2.599601630707943, "grad_norm": 0.24906675517559052, "learning_rate": 1.734805200516809e-05, "loss": 1.2591, "step": 8728 }, { "epoch": 2.599899476907612, "grad_norm": 0.23014332354068756, "learning_rate": 1.734739772664749e-05, "loss": 1.2416, "step": 8729 }, { "epoch": 2.6001973231072806, "grad_norm": 0.23581655323505402, "learning_rate": 1.734674337976756e-05, "loss": 1.2431, "step": 8730 }, { "epoch": 2.6004951693069494, "grad_norm": 0.22533650696277618, "learning_rate": 1.7346088964534395e-05, "loss": 1.2538, "step": 8731 }, { "epoch": 2.6007930155066177, "grad_norm": 0.2528383433818817, "learning_rate": 1.7345434480954074e-05, "loss": 1.2367, "step": 8732 }, { "epoch": 2.6010908617062865, "grad_norm": 0.24218665063381195, "learning_rate": 1.7344779929032695e-05, "loss": 1.2365, "step": 8733 }, { "epoch": 2.6013887079059552, "grad_norm": 0.2333836704492569, "learning_rate": 1.7344125308776348e-05, "loss": 1.2715, "step": 8734 }, { "epoch": 2.6016865541056236, "grad_norm": 0.24962691962718964, "learning_rate": 1.7343470620191112e-05, "loss": 1.2621, "step": 8735 }, { "epoch": 2.6019844003052923, "grad_norm": 0.25837767124176025, "learning_rate": 1.7342815863283092e-05, "loss": 1.2569, "step": 8736 }, { "epoch": 2.602282246504961, "grad_norm": 0.24159511923789978, "learning_rate": 1.7342161038058378e-05, "loss": 1.252, "step": 8737 }, { "epoch": 2.6025800927046294, "grad_norm": 0.256168931722641, "learning_rate": 1.734150614452305e-05, "loss": 1.2546, "step": 8738 }, { "epoch": 2.602877938904298, "grad_norm": 0.24095866084098816, "learning_rate": 1.734085118268322e-05, "loss": 1.2452, "step": 8739 }, { "epoch": 2.603175785103967, "grad_norm": 0.2776990234851837, "learning_rate": 1.7340196152544965e-05, "loss": 1.2586, "step": 8740 }, { "epoch": 2.6034736313036357, "grad_norm": 0.24577198922634125, "learning_rate": 1.7339541054114385e-05, "loss": 1.2543, "step": 8741 }, { "epoch": 2.603771477503304, "grad_norm": 0.259563148021698, "learning_rate": 1.7338885887397577e-05, "loss": 1.2554, "step": 8742 }, { "epoch": 2.604069323702973, "grad_norm": 0.25315192341804504, "learning_rate": 1.7338230652400637e-05, "loss": 1.2538, "step": 8743 }, { "epoch": 2.6043671699026416, "grad_norm": 0.2514294683933258, "learning_rate": 1.7337575349129657e-05, "loss": 1.2538, "step": 8744 }, { "epoch": 2.6046650161023104, "grad_norm": 0.2541000545024872, "learning_rate": 1.7336919977590742e-05, "loss": 1.2407, "step": 8745 }, { "epoch": 2.6049628623019787, "grad_norm": 0.23705552518367767, "learning_rate": 1.7336264537789977e-05, "loss": 1.2447, "step": 8746 }, { "epoch": 2.6052607085016475, "grad_norm": 0.2308826446533203, "learning_rate": 1.733560902973347e-05, "loss": 1.2509, "step": 8747 }, { "epoch": 2.6055585547013163, "grad_norm": 0.23354917764663696, "learning_rate": 1.7334953453427315e-05, "loss": 1.2605, "step": 8748 }, { "epoch": 2.6058564009009846, "grad_norm": 0.2550734281539917, "learning_rate": 1.7334297808877612e-05, "loss": 1.2496, "step": 8749 }, { "epoch": 2.6061542471006534, "grad_norm": 0.25695809721946716, "learning_rate": 1.7333642096090468e-05, "loss": 1.2624, "step": 8750 }, { "epoch": 2.606452093300322, "grad_norm": 0.23154670000076294, "learning_rate": 1.7332986315071977e-05, "loss": 1.274, "step": 8751 }, { "epoch": 2.6067499394999905, "grad_norm": 0.25509071350097656, "learning_rate": 1.7332330465828238e-05, "loss": 1.2524, "step": 8752 }, { "epoch": 2.6070477856996592, "grad_norm": 0.22895443439483643, "learning_rate": 1.7331674548365357e-05, "loss": 1.2541, "step": 8753 }, { "epoch": 2.607345631899328, "grad_norm": 0.23581722378730774, "learning_rate": 1.7331018562689435e-05, "loss": 1.2499, "step": 8754 }, { "epoch": 2.6076434780989968, "grad_norm": 0.24990935623645782, "learning_rate": 1.7330362508806578e-05, "loss": 1.2467, "step": 8755 }, { "epoch": 2.6079413242986655, "grad_norm": 0.29261094331741333, "learning_rate": 1.7329706386722888e-05, "loss": 1.2502, "step": 8756 }, { "epoch": 2.608239170498334, "grad_norm": 0.2690500020980835, "learning_rate": 1.7329050196444467e-05, "loss": 1.245, "step": 8757 }, { "epoch": 2.6085370166980026, "grad_norm": 0.22287563979625702, "learning_rate": 1.7328393937977424e-05, "loss": 1.2421, "step": 8758 }, { "epoch": 2.6088348628976714, "grad_norm": 0.3303421437740326, "learning_rate": 1.732773761132786e-05, "loss": 1.2412, "step": 8759 }, { "epoch": 2.6091327090973397, "grad_norm": 0.2834303379058838, "learning_rate": 1.732708121650189e-05, "loss": 1.2459, "step": 8760 }, { "epoch": 2.6094305552970085, "grad_norm": 0.2834492027759552, "learning_rate": 1.7326424753505612e-05, "loss": 1.2489, "step": 8761 }, { "epoch": 2.6097284014966773, "grad_norm": 0.27727025747299194, "learning_rate": 1.7325768222345137e-05, "loss": 1.2515, "step": 8762 }, { "epoch": 2.6100262476963456, "grad_norm": 0.2752852141857147, "learning_rate": 1.7325111623026575e-05, "loss": 1.2469, "step": 8763 }, { "epoch": 2.6103240938960144, "grad_norm": 0.2830751836299896, "learning_rate": 1.7324454955556032e-05, "loss": 1.2554, "step": 8764 }, { "epoch": 2.610621940095683, "grad_norm": 0.2597208321094513, "learning_rate": 1.732379821993962e-05, "loss": 1.2391, "step": 8765 }, { "epoch": 2.6109197862953515, "grad_norm": 0.3108302652835846, "learning_rate": 1.7323141416183448e-05, "loss": 1.2347, "step": 8766 }, { "epoch": 2.6112176324950203, "grad_norm": 0.22924266755580902, "learning_rate": 1.732248454429363e-05, "loss": 1.2474, "step": 8767 }, { "epoch": 2.611515478694689, "grad_norm": 0.2544800043106079, "learning_rate": 1.732182760427627e-05, "loss": 1.2372, "step": 8768 }, { "epoch": 2.611813324894358, "grad_norm": 0.24528615176677704, "learning_rate": 1.7321170596137486e-05, "loss": 1.2595, "step": 8769 }, { "epoch": 2.6121111710940266, "grad_norm": 0.3880791664123535, "learning_rate": 1.7320513519883392e-05, "loss": 1.2448, "step": 8770 }, { "epoch": 2.612409017293695, "grad_norm": 0.2784728407859802, "learning_rate": 1.7319856375520093e-05, "loss": 1.2508, "step": 8771 }, { "epoch": 2.6127068634933637, "grad_norm": 0.2712165415287018, "learning_rate": 1.7319199163053713e-05, "loss": 1.2603, "step": 8772 }, { "epoch": 2.6130047096930324, "grad_norm": 0.2408665120601654, "learning_rate": 1.7318541882490362e-05, "loss": 1.2371, "step": 8773 }, { "epoch": 2.6133025558927008, "grad_norm": 0.3493192195892334, "learning_rate": 1.7317884533836154e-05, "loss": 1.2572, "step": 8774 }, { "epoch": 2.6136004020923695, "grad_norm": 0.2725376784801483, "learning_rate": 1.7317227117097207e-05, "loss": 1.2561, "step": 8775 }, { "epoch": 2.6138982482920383, "grad_norm": 0.25890398025512695, "learning_rate": 1.7316569632279637e-05, "loss": 1.2454, "step": 8776 }, { "epoch": 2.6141960944917066, "grad_norm": 0.23684841394424438, "learning_rate": 1.731591207938956e-05, "loss": 1.2578, "step": 8777 }, { "epoch": 2.6144939406913754, "grad_norm": 0.34290099143981934, "learning_rate": 1.7315254458433097e-05, "loss": 1.232, "step": 8778 }, { "epoch": 2.614791786891044, "grad_norm": 0.22663018107414246, "learning_rate": 1.731459676941636e-05, "loss": 1.236, "step": 8779 }, { "epoch": 2.6150896330907125, "grad_norm": 0.2505500614643097, "learning_rate": 1.731393901234548e-05, "loss": 1.2405, "step": 8780 }, { "epoch": 2.6153874792903813, "grad_norm": 0.24107787013053894, "learning_rate": 1.7313281187226564e-05, "loss": 1.2511, "step": 8781 }, { "epoch": 2.61568532549005, "grad_norm": 0.2568422853946686, "learning_rate": 1.7312623294065737e-05, "loss": 1.2387, "step": 8782 }, { "epoch": 2.615983171689719, "grad_norm": 0.3152964413166046, "learning_rate": 1.7311965332869122e-05, "loss": 1.2349, "step": 8783 }, { "epoch": 2.6162810178893876, "grad_norm": 0.2565925419330597, "learning_rate": 1.731130730364284e-05, "loss": 1.2345, "step": 8784 }, { "epoch": 2.616578864089056, "grad_norm": 0.25363919138908386, "learning_rate": 1.7310649206393012e-05, "loss": 1.2492, "step": 8785 }, { "epoch": 2.6168767102887247, "grad_norm": 0.22520498931407928, "learning_rate": 1.730999104112576e-05, "loss": 1.2495, "step": 8786 }, { "epoch": 2.6171745564883935, "grad_norm": 0.30125007033348083, "learning_rate": 1.730933280784721e-05, "loss": 1.2478, "step": 8787 }, { "epoch": 2.617472402688062, "grad_norm": 0.24418741464614868, "learning_rate": 1.730867450656348e-05, "loss": 1.2557, "step": 8788 }, { "epoch": 2.6177702488877306, "grad_norm": 0.2495867758989334, "learning_rate": 1.7308016137280705e-05, "loss": 1.243, "step": 8789 }, { "epoch": 2.6180680950873993, "grad_norm": 0.24253544211387634, "learning_rate": 1.7307357700005e-05, "loss": 1.2542, "step": 8790 }, { "epoch": 2.6183659412870677, "grad_norm": 0.2348175346851349, "learning_rate": 1.73066991947425e-05, "loss": 1.2529, "step": 8791 }, { "epoch": 2.6186637874867364, "grad_norm": 0.26520952582359314, "learning_rate": 1.730604062149933e-05, "loss": 1.2565, "step": 8792 }, { "epoch": 2.618961633686405, "grad_norm": 0.23837199807167053, "learning_rate": 1.7305381980281608e-05, "loss": 1.2457, "step": 8793 }, { "epoch": 2.619259479886074, "grad_norm": 0.26091358065605164, "learning_rate": 1.7304723271095473e-05, "loss": 1.2593, "step": 8794 }, { "epoch": 2.6195573260857423, "grad_norm": 0.22857588529586792, "learning_rate": 1.7304064493947047e-05, "loss": 1.2514, "step": 8795 }, { "epoch": 2.619855172285411, "grad_norm": 0.314048707485199, "learning_rate": 1.7303405648842462e-05, "loss": 1.2584, "step": 8796 }, { "epoch": 2.62015301848508, "grad_norm": 0.2389499694108963, "learning_rate": 1.7302746735787847e-05, "loss": 1.2465, "step": 8797 }, { "epoch": 2.6204508646847486, "grad_norm": 0.2749323546886444, "learning_rate": 1.7302087754789334e-05, "loss": 1.2414, "step": 8798 }, { "epoch": 2.620748710884417, "grad_norm": 0.2627394497394562, "learning_rate": 1.7301428705853053e-05, "loss": 1.2636, "step": 8799 }, { "epoch": 2.6210465570840857, "grad_norm": 0.2637195885181427, "learning_rate": 1.730076958898513e-05, "loss": 1.2467, "step": 8800 }, { "epoch": 2.6213444032837545, "grad_norm": 0.2565141022205353, "learning_rate": 1.730011040419171e-05, "loss": 1.2584, "step": 8801 }, { "epoch": 2.621642249483423, "grad_norm": 0.250991553068161, "learning_rate": 1.7299451151478915e-05, "loss": 1.2471, "step": 8802 }, { "epoch": 2.6219400956830916, "grad_norm": 0.344510942697525, "learning_rate": 1.7298791830852886e-05, "loss": 1.2504, "step": 8803 }, { "epoch": 2.6222379418827604, "grad_norm": 0.28531351685523987, "learning_rate": 1.7298132442319752e-05, "loss": 1.2615, "step": 8804 }, { "epoch": 2.6225357880824287, "grad_norm": 0.24256275594234467, "learning_rate": 1.7297472985885647e-05, "loss": 1.2617, "step": 8805 }, { "epoch": 2.6228336342820975, "grad_norm": 0.30849483609199524, "learning_rate": 1.7296813461556712e-05, "loss": 1.2423, "step": 8806 }, { "epoch": 2.6231314804817663, "grad_norm": 0.23779423534870148, "learning_rate": 1.729615386933908e-05, "loss": 1.2356, "step": 8807 }, { "epoch": 2.623429326681435, "grad_norm": 0.27032792568206787, "learning_rate": 1.7295494209238887e-05, "loss": 1.2396, "step": 8808 }, { "epoch": 2.623727172881104, "grad_norm": 0.2340458184480667, "learning_rate": 1.729483448126227e-05, "loss": 1.2555, "step": 8809 }, { "epoch": 2.624025019080772, "grad_norm": 0.3646049499511719, "learning_rate": 1.729417468541537e-05, "loss": 1.2333, "step": 8810 }, { "epoch": 2.624322865280441, "grad_norm": 0.2888542413711548, "learning_rate": 1.7293514821704326e-05, "loss": 1.2538, "step": 8811 }, { "epoch": 2.6246207114801097, "grad_norm": 0.268113911151886, "learning_rate": 1.7292854890135275e-05, "loss": 1.233, "step": 8812 }, { "epoch": 2.624918557679778, "grad_norm": 0.26564687490463257, "learning_rate": 1.7292194890714356e-05, "loss": 1.233, "step": 8813 }, { "epoch": 2.6252164038794468, "grad_norm": 0.2861003875732422, "learning_rate": 1.729153482344771e-05, "loss": 1.2288, "step": 8814 }, { "epoch": 2.6255142500791155, "grad_norm": 0.2574459910392761, "learning_rate": 1.729087468834148e-05, "loss": 1.2522, "step": 8815 }, { "epoch": 2.625812096278784, "grad_norm": 0.26207756996154785, "learning_rate": 1.7290214485401806e-05, "loss": 1.2457, "step": 8816 }, { "epoch": 2.6261099424784526, "grad_norm": 0.24351252615451813, "learning_rate": 1.7289554214634834e-05, "loss": 1.2381, "step": 8817 }, { "epoch": 2.6264077886781214, "grad_norm": 0.3597773611545563, "learning_rate": 1.72888938760467e-05, "loss": 1.2461, "step": 8818 }, { "epoch": 2.6267056348777897, "grad_norm": 0.25255200266838074, "learning_rate": 1.7288233469643555e-05, "loss": 1.2269, "step": 8819 }, { "epoch": 2.6270034810774585, "grad_norm": 0.26288414001464844, "learning_rate": 1.728757299543154e-05, "loss": 1.2437, "step": 8820 }, { "epoch": 2.6273013272771273, "grad_norm": 0.25060439109802246, "learning_rate": 1.7286912453416803e-05, "loss": 1.2679, "step": 8821 }, { "epoch": 2.627599173476796, "grad_norm": 0.2610555589199066, "learning_rate": 1.7286251843605483e-05, "loss": 1.2498, "step": 8822 }, { "epoch": 2.627897019676465, "grad_norm": 0.3638914227485657, "learning_rate": 1.728559116600373e-05, "loss": 1.2395, "step": 8823 }, { "epoch": 2.628194865876133, "grad_norm": 0.23534813523292542, "learning_rate": 1.7284930420617696e-05, "loss": 1.2355, "step": 8824 }, { "epoch": 2.628492712075802, "grad_norm": 0.25642281770706177, "learning_rate": 1.7284269607453522e-05, "loss": 1.2522, "step": 8825 }, { "epoch": 2.6287905582754707, "grad_norm": 0.24854740500450134, "learning_rate": 1.7283608726517354e-05, "loss": 1.2413, "step": 8826 }, { "epoch": 2.629088404475139, "grad_norm": 0.231789693236351, "learning_rate": 1.7282947777815348e-05, "loss": 1.2545, "step": 8827 }, { "epoch": 2.629386250674808, "grad_norm": 0.26283374428749084, "learning_rate": 1.7282286761353648e-05, "loss": 1.2466, "step": 8828 }, { "epoch": 2.6296840968744766, "grad_norm": 0.24622972309589386, "learning_rate": 1.7281625677138408e-05, "loss": 1.2502, "step": 8829 }, { "epoch": 2.629981943074145, "grad_norm": 0.2555600702762604, "learning_rate": 1.7280964525175773e-05, "loss": 1.242, "step": 8830 }, { "epoch": 2.6302797892738137, "grad_norm": 0.25393983721733093, "learning_rate": 1.72803033054719e-05, "loss": 1.2529, "step": 8831 }, { "epoch": 2.6305776354734824, "grad_norm": 0.22881856560707092, "learning_rate": 1.727964201803294e-05, "loss": 1.2709, "step": 8832 }, { "epoch": 2.6308754816731508, "grad_norm": 0.31835678219795227, "learning_rate": 1.7278980662865044e-05, "loss": 1.2738, "step": 8833 }, { "epoch": 2.6311733278728195, "grad_norm": 0.2397090345621109, "learning_rate": 1.7278319239974363e-05, "loss": 1.2488, "step": 8834 }, { "epoch": 2.6314711740724883, "grad_norm": 0.26797693967819214, "learning_rate": 1.7277657749367055e-05, "loss": 1.247, "step": 8835 }, { "epoch": 2.631769020272157, "grad_norm": 0.2677474617958069, "learning_rate": 1.7276996191049274e-05, "loss": 1.2328, "step": 8836 }, { "epoch": 2.632066866471826, "grad_norm": 0.2518165111541748, "learning_rate": 1.727633456502717e-05, "loss": 1.26, "step": 8837 }, { "epoch": 2.632364712671494, "grad_norm": 0.27545467019081116, "learning_rate": 1.7275672871306907e-05, "loss": 1.2531, "step": 8838 }, { "epoch": 2.632662558871163, "grad_norm": 0.23853330314159393, "learning_rate": 1.7275011109894634e-05, "loss": 1.2507, "step": 8839 }, { "epoch": 2.6329604050708317, "grad_norm": 0.24203594028949738, "learning_rate": 1.7274349280796513e-05, "loss": 1.2516, "step": 8840 }, { "epoch": 2.6332582512705, "grad_norm": 0.24164514243602753, "learning_rate": 1.72736873840187e-05, "loss": 1.2553, "step": 8841 }, { "epoch": 2.633556097470169, "grad_norm": 0.25497618317604065, "learning_rate": 1.727302541956735e-05, "loss": 1.2582, "step": 8842 }, { "epoch": 2.6338539436698376, "grad_norm": 0.27957555651664734, "learning_rate": 1.7272363387448625e-05, "loss": 1.247, "step": 8843 }, { "epoch": 2.634151789869506, "grad_norm": 0.24557183682918549, "learning_rate": 1.727170128766868e-05, "loss": 1.2357, "step": 8844 }, { "epoch": 2.6344496360691747, "grad_norm": 0.2701668441295624, "learning_rate": 1.7271039120233685e-05, "loss": 1.2467, "step": 8845 }, { "epoch": 2.6347474822688435, "grad_norm": 0.2438381314277649, "learning_rate": 1.727037688514979e-05, "loss": 1.2555, "step": 8846 }, { "epoch": 2.635045328468512, "grad_norm": 0.37517014145851135, "learning_rate": 1.7269714582423165e-05, "loss": 1.2504, "step": 8847 }, { "epoch": 2.6353431746681806, "grad_norm": 0.24342043697834015, "learning_rate": 1.7269052212059966e-05, "loss": 1.2584, "step": 8848 }, { "epoch": 2.6356410208678493, "grad_norm": 0.2818973958492279, "learning_rate": 1.726838977406636e-05, "loss": 1.2392, "step": 8849 }, { "epoch": 2.635938867067518, "grad_norm": 0.23356172442436218, "learning_rate": 1.7267727268448503e-05, "loss": 1.2671, "step": 8850 }, { "epoch": 2.636236713267187, "grad_norm": 0.27586445212364197, "learning_rate": 1.726706469521257e-05, "loss": 1.2602, "step": 8851 }, { "epoch": 2.636534559466855, "grad_norm": 0.2238815575838089, "learning_rate": 1.7266402054364712e-05, "loss": 1.2456, "step": 8852 }, { "epoch": 2.636832405666524, "grad_norm": 0.23642008006572723, "learning_rate": 1.7265739345911105e-05, "loss": 1.2401, "step": 8853 }, { "epoch": 2.6371302518661928, "grad_norm": 0.24406154453754425, "learning_rate": 1.726507656985791e-05, "loss": 1.2396, "step": 8854 }, { "epoch": 2.637428098065861, "grad_norm": 0.2447628676891327, "learning_rate": 1.72644137262113e-05, "loss": 1.2439, "step": 8855 }, { "epoch": 2.63772594426553, "grad_norm": 0.23084242641925812, "learning_rate": 1.726375081497743e-05, "loss": 1.2395, "step": 8856 }, { "epoch": 2.6380237904651986, "grad_norm": 0.21892322599887848, "learning_rate": 1.7263087836162477e-05, "loss": 1.2503, "step": 8857 }, { "epoch": 2.638321636664867, "grad_norm": 0.24469958245754242, "learning_rate": 1.7262424789772607e-05, "loss": 1.2472, "step": 8858 }, { "epoch": 2.6386194828645357, "grad_norm": 0.24429583549499512, "learning_rate": 1.726176167581399e-05, "loss": 1.2565, "step": 8859 }, { "epoch": 2.6389173290642045, "grad_norm": 0.2654368281364441, "learning_rate": 1.7261098494292786e-05, "loss": 1.2473, "step": 8860 }, { "epoch": 2.6392151752638733, "grad_norm": 0.22381658852100372, "learning_rate": 1.7260435245215182e-05, "loss": 1.2494, "step": 8861 }, { "epoch": 2.6395130214635416, "grad_norm": 0.2904224395751953, "learning_rate": 1.7259771928587334e-05, "loss": 1.2409, "step": 8862 }, { "epoch": 2.6398108676632104, "grad_norm": 0.23257623612880707, "learning_rate": 1.7259108544415425e-05, "loss": 1.242, "step": 8863 }, { "epoch": 2.640108713862879, "grad_norm": 0.3013313114643097, "learning_rate": 1.725844509270562e-05, "loss": 1.2302, "step": 8864 }, { "epoch": 2.640406560062548, "grad_norm": 0.27417224645614624, "learning_rate": 1.725778157346409e-05, "loss": 1.2538, "step": 8865 }, { "epoch": 2.6407044062622163, "grad_norm": 0.2881911098957062, "learning_rate": 1.7257117986697013e-05, "loss": 1.2465, "step": 8866 }, { "epoch": 2.641002252461885, "grad_norm": 0.27448558807373047, "learning_rate": 1.7256454332410556e-05, "loss": 1.2466, "step": 8867 }, { "epoch": 2.641300098661554, "grad_norm": 0.26696836948394775, "learning_rate": 1.7255790610610908e-05, "loss": 1.2378, "step": 8868 }, { "epoch": 2.641597944861222, "grad_norm": 0.25289347767829895, "learning_rate": 1.7255126821304228e-05, "loss": 1.2508, "step": 8869 }, { "epoch": 2.641895791060891, "grad_norm": 0.27484360337257385, "learning_rate": 1.72544629644967e-05, "loss": 1.2532, "step": 8870 }, { "epoch": 2.6421936372605597, "grad_norm": 0.2266952246427536, "learning_rate": 1.7253799040194503e-05, "loss": 1.2331, "step": 8871 }, { "epoch": 2.642491483460228, "grad_norm": 0.4980355203151703, "learning_rate": 1.7253135048403808e-05, "loss": 1.2608, "step": 8872 }, { "epoch": 2.6427893296598968, "grad_norm": 0.28572672605514526, "learning_rate": 1.7252470989130794e-05, "loss": 1.2465, "step": 8873 }, { "epoch": 2.6430871758595655, "grad_norm": 0.28275179862976074, "learning_rate": 1.725180686238164e-05, "loss": 1.2432, "step": 8874 }, { "epoch": 2.6433850220592343, "grad_norm": 0.2620451748371124, "learning_rate": 1.725114266816253e-05, "loss": 1.2261, "step": 8875 }, { "epoch": 2.643682868258903, "grad_norm": 0.23029804229736328, "learning_rate": 1.7250478406479632e-05, "loss": 1.2427, "step": 8876 }, { "epoch": 2.6439807144585714, "grad_norm": 0.2825107276439667, "learning_rate": 1.724981407733914e-05, "loss": 1.2483, "step": 8877 }, { "epoch": 2.64427856065824, "grad_norm": 0.30828404426574707, "learning_rate": 1.7249149680747225e-05, "loss": 1.2614, "step": 8878 }, { "epoch": 2.644576406857909, "grad_norm": 0.23624587059020996, "learning_rate": 1.724848521671007e-05, "loss": 1.2612, "step": 8879 }, { "epoch": 2.6448742530575773, "grad_norm": 0.24183116853237152, "learning_rate": 1.724782068523386e-05, "loss": 1.2401, "step": 8880 }, { "epoch": 2.645172099257246, "grad_norm": 0.2445949912071228, "learning_rate": 1.7247156086324776e-05, "loss": 1.256, "step": 8881 }, { "epoch": 2.645469945456915, "grad_norm": 0.25342297554016113, "learning_rate": 1.7246491419989002e-05, "loss": 1.2516, "step": 8882 }, { "epoch": 2.645767791656583, "grad_norm": 0.24959760904312134, "learning_rate": 1.7245826686232722e-05, "loss": 1.2689, "step": 8883 }, { "epoch": 2.646065637856252, "grad_norm": 0.23694045841693878, "learning_rate": 1.724516188506212e-05, "loss": 1.2302, "step": 8884 }, { "epoch": 2.6463634840559207, "grad_norm": 0.3361573815345764, "learning_rate": 1.724449701648338e-05, "loss": 1.2655, "step": 8885 }, { "epoch": 2.646661330255589, "grad_norm": 0.2407407909631729, "learning_rate": 1.724383208050269e-05, "loss": 1.2471, "step": 8886 }, { "epoch": 2.646959176455258, "grad_norm": 0.24118772149085999, "learning_rate": 1.7243167077126234e-05, "loss": 1.2424, "step": 8887 }, { "epoch": 2.6472570226549266, "grad_norm": 0.23885789513587952, "learning_rate": 1.7242502006360203e-05, "loss": 1.2703, "step": 8888 }, { "epoch": 2.6475548688545953, "grad_norm": 0.24546580016613007, "learning_rate": 1.7241836868210783e-05, "loss": 1.2654, "step": 8889 }, { "epoch": 2.647852715054264, "grad_norm": 0.22894099354743958, "learning_rate": 1.7241171662684162e-05, "loss": 1.2448, "step": 8890 }, { "epoch": 2.6481505612539324, "grad_norm": 0.2342892438173294, "learning_rate": 1.724050638978653e-05, "loss": 1.2566, "step": 8891 }, { "epoch": 2.648448407453601, "grad_norm": 0.23162709176540375, "learning_rate": 1.7239841049524074e-05, "loss": 1.2571, "step": 8892 }, { "epoch": 2.64874625365327, "grad_norm": 0.2768057584762573, "learning_rate": 1.7239175641902985e-05, "loss": 1.2423, "step": 8893 }, { "epoch": 2.6490440998529383, "grad_norm": 0.23433153331279755, "learning_rate": 1.7238510166929457e-05, "loss": 1.2486, "step": 8894 }, { "epoch": 2.649341946052607, "grad_norm": 0.24096284806728363, "learning_rate": 1.7237844624609678e-05, "loss": 1.246, "step": 8895 }, { "epoch": 2.649639792252276, "grad_norm": 0.24480442702770233, "learning_rate": 1.723717901494984e-05, "loss": 1.2442, "step": 8896 }, { "epoch": 2.649937638451944, "grad_norm": 0.23331323266029358, "learning_rate": 1.7236513337956136e-05, "loss": 1.2463, "step": 8897 }, { "epoch": 2.650235484651613, "grad_norm": 0.23452609777450562, "learning_rate": 1.7235847593634764e-05, "loss": 1.2486, "step": 8898 }, { "epoch": 2.6505333308512817, "grad_norm": 0.2568226158618927, "learning_rate": 1.7235181781991915e-05, "loss": 1.2689, "step": 8899 }, { "epoch": 2.65083117705095, "grad_norm": 0.2641594111919403, "learning_rate": 1.7234515903033782e-05, "loss": 1.2318, "step": 8900 }, { "epoch": 2.651129023250619, "grad_norm": 0.231359601020813, "learning_rate": 1.723384995676656e-05, "loss": 1.2387, "step": 8901 }, { "epoch": 2.6514268694502876, "grad_norm": 0.23017573356628418, "learning_rate": 1.723318394319645e-05, "loss": 1.2469, "step": 8902 }, { "epoch": 2.6517247156499564, "grad_norm": 0.2576947808265686, "learning_rate": 1.7232517862329642e-05, "loss": 1.2415, "step": 8903 }, { "epoch": 2.652022561849625, "grad_norm": 0.22903068363666534, "learning_rate": 1.7231851714172336e-05, "loss": 1.2454, "step": 8904 }, { "epoch": 2.6523204080492935, "grad_norm": 0.24105381965637207, "learning_rate": 1.723118549873073e-05, "loss": 1.2361, "step": 8905 }, { "epoch": 2.6526182542489622, "grad_norm": 0.2395734190940857, "learning_rate": 1.723051921601102e-05, "loss": 1.2416, "step": 8906 }, { "epoch": 2.652916100448631, "grad_norm": 0.24813932180404663, "learning_rate": 1.722985286601941e-05, "loss": 1.2449, "step": 8907 }, { "epoch": 2.6532139466482993, "grad_norm": 0.2432156652212143, "learning_rate": 1.7229186448762093e-05, "loss": 1.2563, "step": 8908 }, { "epoch": 2.653511792847968, "grad_norm": 0.2334570735692978, "learning_rate": 1.722851996424528e-05, "loss": 1.241, "step": 8909 }, { "epoch": 2.653809639047637, "grad_norm": 0.23906110227108002, "learning_rate": 1.7227853412475158e-05, "loss": 1.2623, "step": 8910 }, { "epoch": 2.654107485247305, "grad_norm": 0.23201993107795715, "learning_rate": 1.722718679345794e-05, "loss": 1.2486, "step": 8911 }, { "epoch": 2.654405331446974, "grad_norm": 0.2461710125207901, "learning_rate": 1.722652010719982e-05, "loss": 1.2365, "step": 8912 }, { "epoch": 2.6547031776466428, "grad_norm": 0.23649927973747253, "learning_rate": 1.7225853353707e-05, "loss": 1.2453, "step": 8913 }, { "epoch": 2.655001023846311, "grad_norm": 0.2550906538963318, "learning_rate": 1.7225186532985698e-05, "loss": 1.2586, "step": 8914 }, { "epoch": 2.65529887004598, "grad_norm": 0.23077456653118134, "learning_rate": 1.7224519645042104e-05, "loss": 1.2569, "step": 8915 }, { "epoch": 2.6555967162456486, "grad_norm": 0.2596879303455353, "learning_rate": 1.7223852689882425e-05, "loss": 1.2433, "step": 8916 }, { "epoch": 2.6558945624453174, "grad_norm": 0.23304714262485504, "learning_rate": 1.722318566751287e-05, "loss": 1.2546, "step": 8917 }, { "epoch": 2.656192408644986, "grad_norm": 0.25453877449035645, "learning_rate": 1.722251857793964e-05, "loss": 1.251, "step": 8918 }, { "epoch": 2.6564902548446545, "grad_norm": 0.24530337750911713, "learning_rate": 1.7221851421168943e-05, "loss": 1.2359, "step": 8919 }, { "epoch": 2.6567881010443233, "grad_norm": 0.36431068181991577, "learning_rate": 1.7221184197206993e-05, "loss": 1.2488, "step": 8920 }, { "epoch": 2.657085947243992, "grad_norm": 0.31601202487945557, "learning_rate": 1.7220516906059986e-05, "loss": 1.2416, "step": 8921 }, { "epoch": 2.6573837934436604, "grad_norm": 0.24828962981700897, "learning_rate": 1.721984954773414e-05, "loss": 1.2479, "step": 8922 }, { "epoch": 2.657681639643329, "grad_norm": 0.36653435230255127, "learning_rate": 1.721918212223566e-05, "loss": 1.2519, "step": 8923 }, { "epoch": 2.657979485842998, "grad_norm": 0.26021063327789307, "learning_rate": 1.7218514629570756e-05, "loss": 1.2496, "step": 8924 }, { "epoch": 2.6582773320426663, "grad_norm": 0.2584244906902313, "learning_rate": 1.721784706974564e-05, "loss": 1.2466, "step": 8925 }, { "epoch": 2.658575178242335, "grad_norm": 0.22850880026817322, "learning_rate": 1.7217179442766515e-05, "loss": 1.2398, "step": 8926 }, { "epoch": 2.658873024442004, "grad_norm": 0.28881415724754333, "learning_rate": 1.7216511748639605e-05, "loss": 1.2431, "step": 8927 }, { "epoch": 2.6591708706416726, "grad_norm": 0.2384619563817978, "learning_rate": 1.7215843987371114e-05, "loss": 1.2348, "step": 8928 }, { "epoch": 2.659468716841341, "grad_norm": 0.26697084307670593, "learning_rate": 1.7215176158967256e-05, "loss": 1.2305, "step": 8929 }, { "epoch": 2.6597665630410097, "grad_norm": 0.2338797152042389, "learning_rate": 1.7214508263434244e-05, "loss": 1.2421, "step": 8930 }, { "epoch": 2.6600644092406784, "grad_norm": 0.34127315878868103, "learning_rate": 1.7213840300778297e-05, "loss": 1.2468, "step": 8931 }, { "epoch": 2.660362255440347, "grad_norm": 0.26202157139778137, "learning_rate": 1.7213172271005623e-05, "loss": 1.2385, "step": 8932 }, { "epoch": 2.6606601016400155, "grad_norm": 0.3033076524734497, "learning_rate": 1.721250417412244e-05, "loss": 1.236, "step": 8933 }, { "epoch": 2.6609579478396843, "grad_norm": 0.24419964849948883, "learning_rate": 1.7211836010134965e-05, "loss": 1.2542, "step": 8934 }, { "epoch": 2.661255794039353, "grad_norm": 0.49213454127311707, "learning_rate": 1.7211167779049414e-05, "loss": 1.2475, "step": 8935 }, { "epoch": 2.6615536402390214, "grad_norm": 0.23509784042835236, "learning_rate": 1.7210499480872003e-05, "loss": 1.2434, "step": 8936 }, { "epoch": 2.66185148643869, "grad_norm": 0.24729567766189575, "learning_rate": 1.720983111560895e-05, "loss": 1.2399, "step": 8937 }, { "epoch": 2.662149332638359, "grad_norm": 0.24425852298736572, "learning_rate": 1.7209162683266473e-05, "loss": 1.2512, "step": 8938 }, { "epoch": 2.6624471788380273, "grad_norm": 0.23785199224948883, "learning_rate": 1.720849418385079e-05, "loss": 1.2525, "step": 8939 }, { "epoch": 2.662745025037696, "grad_norm": 0.23505166172981262, "learning_rate": 1.7207825617368125e-05, "loss": 1.2473, "step": 8940 }, { "epoch": 2.663042871237365, "grad_norm": 0.23657561838626862, "learning_rate": 1.72071569838247e-05, "loss": 1.254, "step": 8941 }, { "epoch": 2.6633407174370336, "grad_norm": 0.23401376605033875, "learning_rate": 1.7206488283226726e-05, "loss": 1.2789, "step": 8942 }, { "epoch": 2.6636385636367024, "grad_norm": 0.2471046894788742, "learning_rate": 1.720581951558043e-05, "loss": 1.2377, "step": 8943 }, { "epoch": 2.6639364098363707, "grad_norm": 0.2282712161540985, "learning_rate": 1.7205150680892035e-05, "loss": 1.2615, "step": 8944 }, { "epoch": 2.6642342560360395, "grad_norm": 0.2226599007844925, "learning_rate": 1.7204481779167762e-05, "loss": 1.2393, "step": 8945 }, { "epoch": 2.6645321022357082, "grad_norm": 0.23276494443416595, "learning_rate": 1.7203812810413838e-05, "loss": 1.2498, "step": 8946 }, { "epoch": 2.6648299484353766, "grad_norm": 0.2419513314962387, "learning_rate": 1.7203143774636485e-05, "loss": 1.2484, "step": 8947 }, { "epoch": 2.6651277946350453, "grad_norm": 0.22819313406944275, "learning_rate": 1.7202474671841925e-05, "loss": 1.2448, "step": 8948 }, { "epoch": 2.665425640834714, "grad_norm": 0.24116648733615875, "learning_rate": 1.7201805502036386e-05, "loss": 1.2422, "step": 8949 }, { "epoch": 2.6657234870343824, "grad_norm": 0.25838974118232727, "learning_rate": 1.7201136265226093e-05, "loss": 1.2455, "step": 8950 }, { "epoch": 2.666021333234051, "grad_norm": 0.22826877236366272, "learning_rate": 1.7200466961417272e-05, "loss": 1.2466, "step": 8951 }, { "epoch": 2.66631917943372, "grad_norm": 0.2496095895767212, "learning_rate": 1.719979759061615e-05, "loss": 1.2475, "step": 8952 }, { "epoch": 2.6666170256333883, "grad_norm": 0.24863339960575104, "learning_rate": 1.7199128152828958e-05, "loss": 1.2442, "step": 8953 }, { "epoch": 2.666914871833057, "grad_norm": 0.2691372334957123, "learning_rate": 1.719845864806192e-05, "loss": 1.247, "step": 8954 }, { "epoch": 2.667212718032726, "grad_norm": 0.23048639297485352, "learning_rate": 1.7197789076321267e-05, "loss": 1.2415, "step": 8955 }, { "epoch": 2.6675105642323946, "grad_norm": 0.24400363862514496, "learning_rate": 1.7197119437613227e-05, "loss": 1.255, "step": 8956 }, { "epoch": 2.6678084104320634, "grad_norm": 0.2377277910709381, "learning_rate": 1.7196449731944037e-05, "loss": 1.2548, "step": 8957 }, { "epoch": 2.6681062566317317, "grad_norm": 0.23374474048614502, "learning_rate": 1.719577995931992e-05, "loss": 1.2587, "step": 8958 }, { "epoch": 2.6684041028314005, "grad_norm": 0.28326719999313354, "learning_rate": 1.7195110119747106e-05, "loss": 1.243, "step": 8959 }, { "epoch": 2.6687019490310693, "grad_norm": 0.2740752696990967, "learning_rate": 1.7194440213231836e-05, "loss": 1.2416, "step": 8960 }, { "epoch": 2.6689997952307376, "grad_norm": 0.23700512945652008, "learning_rate": 1.7193770239780336e-05, "loss": 1.246, "step": 8961 }, { "epoch": 2.6692976414304064, "grad_norm": 0.23480559885501862, "learning_rate": 1.7193100199398843e-05, "loss": 1.2547, "step": 8962 }, { "epoch": 2.669595487630075, "grad_norm": 0.2860928177833557, "learning_rate": 1.7192430092093586e-05, "loss": 1.2521, "step": 8963 }, { "epoch": 2.6698933338297435, "grad_norm": 0.25524553656578064, "learning_rate": 1.7191759917870805e-05, "loss": 1.2596, "step": 8964 }, { "epoch": 2.6701911800294122, "grad_norm": 0.25950267910957336, "learning_rate": 1.719108967673673e-05, "loss": 1.2354, "step": 8965 }, { "epoch": 2.670489026229081, "grad_norm": 0.22618107497692108, "learning_rate": 1.7190419368697605e-05, "loss": 1.2401, "step": 8966 }, { "epoch": 2.6707868724287493, "grad_norm": 0.24542158842086792, "learning_rate": 1.718974899375966e-05, "loss": 1.2799, "step": 8967 }, { "epoch": 2.671084718628418, "grad_norm": 0.24449419975280762, "learning_rate": 1.7189078551929133e-05, "loss": 1.2433, "step": 8968 }, { "epoch": 2.671382564828087, "grad_norm": 0.26330330967903137, "learning_rate": 1.7188408043212258e-05, "loss": 1.253, "step": 8969 }, { "epoch": 2.6716804110277557, "grad_norm": 0.244807630777359, "learning_rate": 1.7187737467615285e-05, "loss": 1.2448, "step": 8970 }, { "epoch": 2.6719782572274244, "grad_norm": 0.23932670056819916, "learning_rate": 1.718706682514444e-05, "loss": 1.2532, "step": 8971 }, { "epoch": 2.6722761034270928, "grad_norm": 0.27525243163108826, "learning_rate": 1.718639611580597e-05, "loss": 1.2414, "step": 8972 }, { "epoch": 2.6725739496267615, "grad_norm": 0.26601141691207886, "learning_rate": 1.7185725339606116e-05, "loss": 1.2491, "step": 8973 }, { "epoch": 2.6728717958264303, "grad_norm": 0.2720913589000702, "learning_rate": 1.718505449655111e-05, "loss": 1.2546, "step": 8974 }, { "epoch": 2.6731696420260986, "grad_norm": 0.31385520100593567, "learning_rate": 1.71843835866472e-05, "loss": 1.2641, "step": 8975 }, { "epoch": 2.6734674882257674, "grad_norm": 0.23295383155345917, "learning_rate": 1.7183712609900635e-05, "loss": 1.2493, "step": 8976 }, { "epoch": 2.673765334425436, "grad_norm": 0.262129545211792, "learning_rate": 1.7183041566317643e-05, "loss": 1.2486, "step": 8977 }, { "epoch": 2.6740631806251045, "grad_norm": 0.22411711513996124, "learning_rate": 1.7182370455904477e-05, "loss": 1.243, "step": 8978 }, { "epoch": 2.6743610268247733, "grad_norm": 0.31297796964645386, "learning_rate": 1.7181699278667378e-05, "loss": 1.2644, "step": 8979 }, { "epoch": 2.674658873024442, "grad_norm": 0.23683494329452515, "learning_rate": 1.7181028034612596e-05, "loss": 1.2426, "step": 8980 }, { "epoch": 2.6749567192241104, "grad_norm": 0.2417498379945755, "learning_rate": 1.7180356723746365e-05, "loss": 1.2402, "step": 8981 }, { "epoch": 2.675254565423779, "grad_norm": 0.2354615032672882, "learning_rate": 1.7179685346074938e-05, "loss": 1.2343, "step": 8982 }, { "epoch": 2.675552411623448, "grad_norm": 0.2346847802400589, "learning_rate": 1.717901390160456e-05, "loss": 1.2688, "step": 8983 }, { "epoch": 2.6758502578231167, "grad_norm": 0.22196222841739655, "learning_rate": 1.717834239034148e-05, "loss": 1.2431, "step": 8984 }, { "epoch": 2.6761481040227855, "grad_norm": 0.2591729164123535, "learning_rate": 1.7177670812291946e-05, "loss": 1.2365, "step": 8985 }, { "epoch": 2.676445950222454, "grad_norm": 0.23349237442016602, "learning_rate": 1.7176999167462204e-05, "loss": 1.2561, "step": 8986 }, { "epoch": 2.6767437964221226, "grad_norm": 0.25721341371536255, "learning_rate": 1.7176327455858503e-05, "loss": 1.2503, "step": 8987 }, { "epoch": 2.6770416426217913, "grad_norm": 0.22645686566829681, "learning_rate": 1.717565567748709e-05, "loss": 1.2461, "step": 8988 }, { "epoch": 2.6773394888214597, "grad_norm": 0.2434178739786148, "learning_rate": 1.717498383235422e-05, "loss": 1.2475, "step": 8989 }, { "epoch": 2.6776373350211284, "grad_norm": 0.23041358590126038, "learning_rate": 1.7174311920466143e-05, "loss": 1.2682, "step": 8990 }, { "epoch": 2.677935181220797, "grad_norm": 0.2594456076622009, "learning_rate": 1.7173639941829105e-05, "loss": 1.2538, "step": 8991 }, { "epoch": 2.6782330274204655, "grad_norm": 0.23500095307826996, "learning_rate": 1.7172967896449367e-05, "loss": 1.2457, "step": 8992 }, { "epoch": 2.6785308736201343, "grad_norm": 0.24464409053325653, "learning_rate": 1.7172295784333174e-05, "loss": 1.2386, "step": 8993 }, { "epoch": 2.678828719819803, "grad_norm": 0.2371082305908203, "learning_rate": 1.7171623605486784e-05, "loss": 1.2359, "step": 8994 }, { "epoch": 2.679126566019472, "grad_norm": 0.43126392364501953, "learning_rate": 1.7170951359916447e-05, "loss": 1.2472, "step": 8995 }, { "epoch": 2.67942441221914, "grad_norm": 0.3418256938457489, "learning_rate": 1.717027904762842e-05, "loss": 1.2415, "step": 8996 }, { "epoch": 2.679722258418809, "grad_norm": 0.33027663826942444, "learning_rate": 1.716960666862896e-05, "loss": 1.2323, "step": 8997 }, { "epoch": 2.6800201046184777, "grad_norm": 0.5020977258682251, "learning_rate": 1.7168934222924317e-05, "loss": 1.2485, "step": 8998 }, { "epoch": 2.6803179508181465, "grad_norm": 0.2648580074310303, "learning_rate": 1.7168261710520753e-05, "loss": 1.2492, "step": 8999 }, { "epoch": 2.680615797017815, "grad_norm": 0.2582843601703644, "learning_rate": 1.716758913142452e-05, "loss": 1.2548, "step": 9000 }, { "epoch": 2.680615797017815, "eval_loss": 1.33614182472229, "eval_runtime": 19.8208, "eval_samples_per_second": 87.484, "eval_steps_per_second": 5.499, "step": 9000 }, { "epoch": 2.6809136432174836, "grad_norm": 0.25006046891212463, "learning_rate": 1.7166916485641882e-05, "loss": 1.2476, "step": 9001 }, { "epoch": 2.6812114894171524, "grad_norm": 0.23376592993736267, "learning_rate": 1.716624377317909e-05, "loss": 1.2347, "step": 9002 }, { "epoch": 2.6815093356168207, "grad_norm": 0.3001675009727478, "learning_rate": 1.716557099404241e-05, "loss": 1.2519, "step": 9003 }, { "epoch": 2.6818071818164895, "grad_norm": 0.2258872091770172, "learning_rate": 1.7164898148238094e-05, "loss": 1.2471, "step": 9004 }, { "epoch": 2.6821050280161582, "grad_norm": 0.24445126950740814, "learning_rate": 1.7164225235772406e-05, "loss": 1.2362, "step": 9005 }, { "epoch": 2.6824028742158266, "grad_norm": 0.25726720690727234, "learning_rate": 1.7163552256651608e-05, "loss": 1.2454, "step": 9006 }, { "epoch": 2.6827007204154953, "grad_norm": 0.23210914433002472, "learning_rate": 1.716287921088196e-05, "loss": 1.2582, "step": 9007 }, { "epoch": 2.682998566615164, "grad_norm": 0.29994168877601624, "learning_rate": 1.7162206098469724e-05, "loss": 1.2275, "step": 9008 }, { "epoch": 2.683296412814833, "grad_norm": 0.24527543783187866, "learning_rate": 1.716153291942116e-05, "loss": 1.2275, "step": 9009 }, { "epoch": 2.6835942590145017, "grad_norm": 0.2653655707836151, "learning_rate": 1.7160859673742537e-05, "loss": 1.2394, "step": 9010 }, { "epoch": 2.68389210521417, "grad_norm": 0.2707829177379608, "learning_rate": 1.7160186361440113e-05, "loss": 1.2532, "step": 9011 }, { "epoch": 2.6841899514138388, "grad_norm": 0.23000174760818481, "learning_rate": 1.7159512982520153e-05, "loss": 1.2502, "step": 9012 }, { "epoch": 2.6844877976135075, "grad_norm": 0.23766934871673584, "learning_rate": 1.7158839536988926e-05, "loss": 1.2514, "step": 9013 }, { "epoch": 2.684785643813176, "grad_norm": 0.2337920069694519, "learning_rate": 1.7158166024852697e-05, "loss": 1.2418, "step": 9014 }, { "epoch": 2.6850834900128446, "grad_norm": 0.22401787340641022, "learning_rate": 1.715749244611773e-05, "loss": 1.2556, "step": 9015 }, { "epoch": 2.6853813362125134, "grad_norm": 0.254700243473053, "learning_rate": 1.7156818800790292e-05, "loss": 1.2355, "step": 9016 }, { "epoch": 2.6856791824121817, "grad_norm": 0.24101154506206512, "learning_rate": 1.715614508887665e-05, "loss": 1.2473, "step": 9017 }, { "epoch": 2.6859770286118505, "grad_norm": 0.24811016023159027, "learning_rate": 1.7155471310383073e-05, "loss": 1.243, "step": 9018 }, { "epoch": 2.6862748748115193, "grad_norm": 0.23110364377498627, "learning_rate": 1.7154797465315834e-05, "loss": 1.2407, "step": 9019 }, { "epoch": 2.6865727210111876, "grad_norm": 0.22973401844501495, "learning_rate": 1.7154123553681194e-05, "loss": 1.2461, "step": 9020 }, { "epoch": 2.6868705672108564, "grad_norm": 0.23058158159255981, "learning_rate": 1.7153449575485428e-05, "loss": 1.2422, "step": 9021 }, { "epoch": 2.687168413410525, "grad_norm": 0.25269100069999695, "learning_rate": 1.715277553073481e-05, "loss": 1.256, "step": 9022 }, { "epoch": 2.687466259610194, "grad_norm": 0.24133670330047607, "learning_rate": 1.7152101419435602e-05, "loss": 1.2458, "step": 9023 }, { "epoch": 2.6877641058098627, "grad_norm": 0.24933025240898132, "learning_rate": 1.715142724159408e-05, "loss": 1.243, "step": 9024 }, { "epoch": 2.688061952009531, "grad_norm": 0.24056760966777802, "learning_rate": 1.7150752997216524e-05, "loss": 1.2355, "step": 9025 }, { "epoch": 2.6883597982092, "grad_norm": 0.2292553037405014, "learning_rate": 1.7150078686309198e-05, "loss": 1.2432, "step": 9026 }, { "epoch": 2.6886576444088686, "grad_norm": 0.2316311001777649, "learning_rate": 1.714940430887838e-05, "loss": 1.2593, "step": 9027 }, { "epoch": 2.688955490608537, "grad_norm": 0.23243388533592224, "learning_rate": 1.7148729864930337e-05, "loss": 1.2449, "step": 9028 }, { "epoch": 2.6892533368082057, "grad_norm": 0.23164501786231995, "learning_rate": 1.7148055354471355e-05, "loss": 1.2512, "step": 9029 }, { "epoch": 2.6895511830078744, "grad_norm": 0.2510435879230499, "learning_rate": 1.7147380777507703e-05, "loss": 1.2529, "step": 9030 }, { "epoch": 2.6898490292075428, "grad_norm": 0.2775104343891144, "learning_rate": 1.714670613404566e-05, "loss": 1.2607, "step": 9031 }, { "epoch": 2.6901468754072115, "grad_norm": 0.2796226143836975, "learning_rate": 1.7146031424091497e-05, "loss": 1.2476, "step": 9032 }, { "epoch": 2.6904447216068803, "grad_norm": 0.2853524088859558, "learning_rate": 1.7145356647651498e-05, "loss": 1.2699, "step": 9033 }, { "epoch": 2.6907425678065486, "grad_norm": 0.2359253168106079, "learning_rate": 1.714468180473194e-05, "loss": 1.233, "step": 9034 }, { "epoch": 2.6910404140062174, "grad_norm": 0.42600083351135254, "learning_rate": 1.7144006895339098e-05, "loss": 1.2383, "step": 9035 }, { "epoch": 2.691338260205886, "grad_norm": 0.28889214992523193, "learning_rate": 1.7143331919479252e-05, "loss": 1.2344, "step": 9036 }, { "epoch": 2.691636106405555, "grad_norm": 0.2893359959125519, "learning_rate": 1.7142656877158684e-05, "loss": 1.2479, "step": 9037 }, { "epoch": 2.6919339526052237, "grad_norm": 0.2502742409706116, "learning_rate": 1.714198176838368e-05, "loss": 1.2388, "step": 9038 }, { "epoch": 2.692231798804892, "grad_norm": 0.31276842951774597, "learning_rate": 1.7141306593160508e-05, "loss": 1.2458, "step": 9039 }, { "epoch": 2.692529645004561, "grad_norm": 0.22691497206687927, "learning_rate": 1.714063135149546e-05, "loss": 1.2313, "step": 9040 }, { "epoch": 2.6928274912042296, "grad_norm": 0.23439256846904755, "learning_rate": 1.7139956043394814e-05, "loss": 1.2554, "step": 9041 }, { "epoch": 2.693125337403898, "grad_norm": 0.24257373809814453, "learning_rate": 1.7139280668864855e-05, "loss": 1.254, "step": 9042 }, { "epoch": 2.6934231836035667, "grad_norm": 0.23966045677661896, "learning_rate": 1.713860522791187e-05, "loss": 1.2383, "step": 9043 }, { "epoch": 2.6937210298032355, "grad_norm": 0.24259738624095917, "learning_rate": 1.7137929720542136e-05, "loss": 1.2466, "step": 9044 }, { "epoch": 2.694018876002904, "grad_norm": 0.2348119467496872, "learning_rate": 1.7137254146761936e-05, "loss": 1.2496, "step": 9045 }, { "epoch": 2.6943167222025726, "grad_norm": 0.24347592890262604, "learning_rate": 1.7136578506577562e-05, "loss": 1.2474, "step": 9046 }, { "epoch": 2.6946145684022413, "grad_norm": 0.2700434625148773, "learning_rate": 1.7135902799995302e-05, "loss": 1.2574, "step": 9047 }, { "epoch": 2.6949124146019097, "grad_norm": 0.28286752104759216, "learning_rate": 1.713522702702144e-05, "loss": 1.2407, "step": 9048 }, { "epoch": 2.6952102608015784, "grad_norm": 0.35509929060935974, "learning_rate": 1.713455118766226e-05, "loss": 1.2573, "step": 9049 }, { "epoch": 2.695508107001247, "grad_norm": 0.6191769242286682, "learning_rate": 1.7133875281924054e-05, "loss": 1.236, "step": 9050 }, { "epoch": 2.695805953200916, "grad_norm": 0.2674236595630646, "learning_rate": 1.7133199309813106e-05, "loss": 1.2502, "step": 9051 }, { "epoch": 2.6961037994005848, "grad_norm": 0.6483727693557739, "learning_rate": 1.7132523271335714e-05, "loss": 1.2507, "step": 9052 }, { "epoch": 2.696401645600253, "grad_norm": 0.3155398666858673, "learning_rate": 1.7131847166498156e-05, "loss": 1.2408, "step": 9053 }, { "epoch": 2.696699491799922, "grad_norm": 0.27703872323036194, "learning_rate": 1.713117099530673e-05, "loss": 1.2381, "step": 9054 }, { "epoch": 2.6969973379995906, "grad_norm": 0.24894583225250244, "learning_rate": 1.7130494757767724e-05, "loss": 1.2517, "step": 9055 }, { "epoch": 2.697295184199259, "grad_norm": 0.24170449376106262, "learning_rate": 1.7129818453887432e-05, "loss": 1.2401, "step": 9056 }, { "epoch": 2.6975930303989277, "grad_norm": 0.2369874268770218, "learning_rate": 1.7129142083672147e-05, "loss": 1.2461, "step": 9057 }, { "epoch": 2.6978908765985965, "grad_norm": 0.2657559812068939, "learning_rate": 1.712846564712816e-05, "loss": 1.2578, "step": 9058 }, { "epoch": 2.698188722798265, "grad_norm": 0.25176188349723816, "learning_rate": 1.712778914426176e-05, "loss": 1.2268, "step": 9059 }, { "epoch": 2.6984865689979336, "grad_norm": 0.22299958765506744, "learning_rate": 1.712711257507925e-05, "loss": 1.2458, "step": 9060 }, { "epoch": 2.6987844151976024, "grad_norm": 0.2352006584405899, "learning_rate": 1.712643593958692e-05, "loss": 1.2495, "step": 9061 }, { "epoch": 2.699082261397271, "grad_norm": 0.22989533841609955, "learning_rate": 1.7125759237791065e-05, "loss": 1.2413, "step": 9062 }, { "epoch": 2.6993801075969395, "grad_norm": 0.23323993384838104, "learning_rate": 1.712508246969798e-05, "loss": 1.2525, "step": 9063 }, { "epoch": 2.6996779537966082, "grad_norm": 0.23476077616214752, "learning_rate": 1.7124405635313964e-05, "loss": 1.2624, "step": 9064 }, { "epoch": 2.699975799996277, "grad_norm": 0.23075537383556366, "learning_rate": 1.7123728734645316e-05, "loss": 1.2441, "step": 9065 }, { "epoch": 2.700273646195946, "grad_norm": 0.236238032579422, "learning_rate": 1.712305176769833e-05, "loss": 1.2417, "step": 9066 }, { "epoch": 2.700571492395614, "grad_norm": 0.2502848505973816, "learning_rate": 1.7122374734479304e-05, "loss": 1.2418, "step": 9067 }, { "epoch": 2.700869338595283, "grad_norm": 0.235133096575737, "learning_rate": 1.712169763499454e-05, "loss": 1.2432, "step": 9068 }, { "epoch": 2.7011671847949517, "grad_norm": 0.23175573348999023, "learning_rate": 1.7121020469250336e-05, "loss": 1.2393, "step": 9069 }, { "epoch": 2.70146503099462, "grad_norm": 0.22638940811157227, "learning_rate": 1.7120343237252994e-05, "loss": 1.2297, "step": 9070 }, { "epoch": 2.7017628771942888, "grad_norm": 0.23068824410438538, "learning_rate": 1.7119665939008808e-05, "loss": 1.2597, "step": 9071 }, { "epoch": 2.7020607233939575, "grad_norm": 0.23865114152431488, "learning_rate": 1.7118988574524095e-05, "loss": 1.2633, "step": 9072 }, { "epoch": 2.702358569593626, "grad_norm": 0.22494173049926758, "learning_rate": 1.711831114380514e-05, "loss": 1.2515, "step": 9073 }, { "epoch": 2.7026564157932946, "grad_norm": 0.2372240573167801, "learning_rate": 1.7117633646858252e-05, "loss": 1.2282, "step": 9074 }, { "epoch": 2.7029542619929634, "grad_norm": 0.238014817237854, "learning_rate": 1.7116956083689737e-05, "loss": 1.2564, "step": 9075 }, { "epoch": 2.703252108192632, "grad_norm": 0.2317420244216919, "learning_rate": 1.7116278454305898e-05, "loss": 1.2509, "step": 9076 }, { "epoch": 2.703549954392301, "grad_norm": 0.23609192669391632, "learning_rate": 1.711560075871304e-05, "loss": 1.2327, "step": 9077 }, { "epoch": 2.7038478005919693, "grad_norm": 0.2346174120903015, "learning_rate": 1.7114922996917465e-05, "loss": 1.2454, "step": 9078 }, { "epoch": 2.704145646791638, "grad_norm": 0.2298157811164856, "learning_rate": 1.711424516892548e-05, "loss": 1.258, "step": 9079 }, { "epoch": 2.704443492991307, "grad_norm": 0.22545742988586426, "learning_rate": 1.711356727474339e-05, "loss": 1.2577, "step": 9080 }, { "epoch": 2.704741339190975, "grad_norm": 0.23808005452156067, "learning_rate": 1.7112889314377508e-05, "loss": 1.2388, "step": 9081 }, { "epoch": 2.705039185390644, "grad_norm": 0.21936507523059845, "learning_rate": 1.711221128783414e-05, "loss": 1.2303, "step": 9082 }, { "epoch": 2.7053370315903127, "grad_norm": 0.23477523028850555, "learning_rate": 1.7111533195119586e-05, "loss": 1.2479, "step": 9083 }, { "epoch": 2.705634877789981, "grad_norm": 0.2425217479467392, "learning_rate": 1.711085503624017e-05, "loss": 1.244, "step": 9084 }, { "epoch": 2.70593272398965, "grad_norm": 0.2296759933233261, "learning_rate": 1.7110176811202185e-05, "loss": 1.2626, "step": 9085 }, { "epoch": 2.7062305701893186, "grad_norm": 0.22694678604602814, "learning_rate": 1.710949852001195e-05, "loss": 1.2469, "step": 9086 }, { "epoch": 2.706528416388987, "grad_norm": 0.225328266620636, "learning_rate": 1.7108820162675777e-05, "loss": 1.2362, "step": 9087 }, { "epoch": 2.7068262625886557, "grad_norm": 0.24670012295246124, "learning_rate": 1.7108141739199973e-05, "loss": 1.2357, "step": 9088 }, { "epoch": 2.7071241087883244, "grad_norm": 0.23392459750175476, "learning_rate": 1.7107463249590852e-05, "loss": 1.241, "step": 9089 }, { "epoch": 2.707421954987993, "grad_norm": 0.2339889109134674, "learning_rate": 1.7106784693854726e-05, "loss": 1.238, "step": 9090 }, { "epoch": 2.707719801187662, "grad_norm": 0.23321010172367096, "learning_rate": 1.710610607199791e-05, "loss": 1.2428, "step": 9091 }, { "epoch": 2.7080176473873303, "grad_norm": 0.2305080145597458, "learning_rate": 1.7105427384026715e-05, "loss": 1.2368, "step": 9092 }, { "epoch": 2.708315493586999, "grad_norm": 0.23064188659191132, "learning_rate": 1.7104748629947456e-05, "loss": 1.2597, "step": 9093 }, { "epoch": 2.708613339786668, "grad_norm": 0.2329948991537094, "learning_rate": 1.710406980976645e-05, "loss": 1.2711, "step": 9094 }, { "epoch": 2.708911185986336, "grad_norm": 0.24623043835163116, "learning_rate": 1.7103390923490013e-05, "loss": 1.2457, "step": 9095 }, { "epoch": 2.709209032186005, "grad_norm": 0.23980486392974854, "learning_rate": 1.7102711971124458e-05, "loss": 1.2447, "step": 9096 }, { "epoch": 2.7095068783856737, "grad_norm": 0.23110947012901306, "learning_rate": 1.7102032952676103e-05, "loss": 1.2432, "step": 9097 }, { "epoch": 2.709804724585342, "grad_norm": 0.23680636286735535, "learning_rate": 1.7101353868151268e-05, "loss": 1.2545, "step": 9098 }, { "epoch": 2.710102570785011, "grad_norm": 0.2491663545370102, "learning_rate": 1.7100674717556273e-05, "loss": 1.239, "step": 9099 }, { "epoch": 2.7104004169846796, "grad_norm": 0.22969883680343628, "learning_rate": 1.7099995500897425e-05, "loss": 1.2437, "step": 9100 }, { "epoch": 2.710698263184348, "grad_norm": 0.23873046040534973, "learning_rate": 1.709931621818106e-05, "loss": 1.2533, "step": 9101 }, { "epoch": 2.7109961093840167, "grad_norm": 0.24019987881183624, "learning_rate": 1.7098636869413483e-05, "loss": 1.2535, "step": 9102 }, { "epoch": 2.7112939555836855, "grad_norm": 0.23425638675689697, "learning_rate": 1.709795745460102e-05, "loss": 1.2585, "step": 9103 }, { "epoch": 2.7115918017833542, "grad_norm": 0.22539055347442627, "learning_rate": 1.7097277973749998e-05, "loss": 1.2401, "step": 9104 }, { "epoch": 2.711889647983023, "grad_norm": 0.2362004667520523, "learning_rate": 1.7096598426866732e-05, "loss": 1.2391, "step": 9105 }, { "epoch": 2.7121874941826913, "grad_norm": 0.22755102813243866, "learning_rate": 1.7095918813957547e-05, "loss": 1.2386, "step": 9106 }, { "epoch": 2.71248534038236, "grad_norm": 0.2261369824409485, "learning_rate": 1.7095239135028767e-05, "loss": 1.2402, "step": 9107 }, { "epoch": 2.712783186582029, "grad_norm": 0.23672804236412048, "learning_rate": 1.709455939008671e-05, "loss": 1.2347, "step": 9108 }, { "epoch": 2.713081032781697, "grad_norm": 0.23127327859401703, "learning_rate": 1.7093879579137705e-05, "loss": 1.2493, "step": 9109 }, { "epoch": 2.713378878981366, "grad_norm": 0.2461974322795868, "learning_rate": 1.709319970218808e-05, "loss": 1.248, "step": 9110 }, { "epoch": 2.7136767251810348, "grad_norm": 0.24692919850349426, "learning_rate": 1.7092519759244153e-05, "loss": 1.2595, "step": 9111 }, { "epoch": 2.713974571380703, "grad_norm": 0.23184597492218018, "learning_rate": 1.7091839750312255e-05, "loss": 1.2519, "step": 9112 }, { "epoch": 2.714272417580372, "grad_norm": 0.2404884696006775, "learning_rate": 1.7091159675398713e-05, "loss": 1.2429, "step": 9113 }, { "epoch": 2.7145702637800406, "grad_norm": 0.2422194927930832, "learning_rate": 1.7090479534509853e-05, "loss": 1.2648, "step": 9114 }, { "epoch": 2.714868109979709, "grad_norm": 0.23007488250732422, "learning_rate": 1.7089799327652003e-05, "loss": 1.2389, "step": 9115 }, { "epoch": 2.7151659561793777, "grad_norm": 0.23888500034809113, "learning_rate": 1.7089119054831486e-05, "loss": 1.2513, "step": 9116 }, { "epoch": 2.7154638023790465, "grad_norm": 0.22726529836654663, "learning_rate": 1.708843871605464e-05, "loss": 1.2396, "step": 9117 }, { "epoch": 2.7157616485787153, "grad_norm": 0.2422054260969162, "learning_rate": 1.7087758311327794e-05, "loss": 1.2355, "step": 9118 }, { "epoch": 2.716059494778384, "grad_norm": 0.23579560220241547, "learning_rate": 1.7087077840657274e-05, "loss": 1.2475, "step": 9119 }, { "epoch": 2.7163573409780524, "grad_norm": 0.22729834914207458, "learning_rate": 1.7086397304049413e-05, "loss": 1.2398, "step": 9120 }, { "epoch": 2.716655187177721, "grad_norm": 0.23567086458206177, "learning_rate": 1.708571670151054e-05, "loss": 1.2448, "step": 9121 }, { "epoch": 2.71695303337739, "grad_norm": 0.22253277897834778, "learning_rate": 1.7085036033046996e-05, "loss": 1.2379, "step": 9122 }, { "epoch": 2.7172508795770582, "grad_norm": 0.22736045718193054, "learning_rate": 1.7084355298665104e-05, "loss": 1.2387, "step": 9123 }, { "epoch": 2.717548725776727, "grad_norm": 0.23885032534599304, "learning_rate": 1.70836744983712e-05, "loss": 1.2433, "step": 9124 }, { "epoch": 2.717846571976396, "grad_norm": 0.24466484785079956, "learning_rate": 1.7082993632171622e-05, "loss": 1.2455, "step": 9125 }, { "epoch": 2.718144418176064, "grad_norm": 0.25239646434783936, "learning_rate": 1.7082312700072697e-05, "loss": 1.2287, "step": 9126 }, { "epoch": 2.718442264375733, "grad_norm": 0.23017901182174683, "learning_rate": 1.708163170208077e-05, "loss": 1.2397, "step": 9127 }, { "epoch": 2.7187401105754017, "grad_norm": 0.2403314709663391, "learning_rate": 1.708095063820217e-05, "loss": 1.2339, "step": 9128 }, { "epoch": 2.7190379567750704, "grad_norm": 0.2271287739276886, "learning_rate": 1.708026950844323e-05, "loss": 1.2424, "step": 9129 }, { "epoch": 2.7193358029747388, "grad_norm": 0.2517382502555847, "learning_rate": 1.70795883128103e-05, "loss": 1.2648, "step": 9130 }, { "epoch": 2.7196336491744075, "grad_norm": 0.2596951723098755, "learning_rate": 1.707890705130971e-05, "loss": 1.2476, "step": 9131 }, { "epoch": 2.7199314953740763, "grad_norm": 0.24782678484916687, "learning_rate": 1.7078225723947798e-05, "loss": 1.2248, "step": 9132 }, { "epoch": 2.720229341573745, "grad_norm": 0.2564922571182251, "learning_rate": 1.7077544330730903e-05, "loss": 1.2654, "step": 9133 }, { "epoch": 2.7205271877734134, "grad_norm": 0.2629776895046234, "learning_rate": 1.707686287166537e-05, "loss": 1.2564, "step": 9134 }, { "epoch": 2.720825033973082, "grad_norm": 0.2627808451652527, "learning_rate": 1.7076181346757527e-05, "loss": 1.2343, "step": 9135 }, { "epoch": 2.721122880172751, "grad_norm": 0.26616424322128296, "learning_rate": 1.7075499756013728e-05, "loss": 1.2442, "step": 9136 }, { "epoch": 2.7214207263724193, "grad_norm": 0.3545578122138977, "learning_rate": 1.7074818099440306e-05, "loss": 1.2486, "step": 9137 }, { "epoch": 2.721718572572088, "grad_norm": 0.24880005419254303, "learning_rate": 1.707413637704361e-05, "loss": 1.2425, "step": 9138 }, { "epoch": 2.722016418771757, "grad_norm": 0.26369282603263855, "learning_rate": 1.7073454588829976e-05, "loss": 1.2397, "step": 9139 }, { "epoch": 2.722314264971425, "grad_norm": 0.23869788646697998, "learning_rate": 1.707277273480575e-05, "loss": 1.2683, "step": 9140 }, { "epoch": 2.722612111171094, "grad_norm": 0.2874651849269867, "learning_rate": 1.7072090814977275e-05, "loss": 1.2305, "step": 9141 }, { "epoch": 2.7229099573707627, "grad_norm": 0.25917285680770874, "learning_rate": 1.7071408829350896e-05, "loss": 1.2485, "step": 9142 }, { "epoch": 2.7232078035704315, "grad_norm": 0.2596162259578705, "learning_rate": 1.707072677793296e-05, "loss": 1.2544, "step": 9143 }, { "epoch": 2.7235056497701002, "grad_norm": 0.26855939626693726, "learning_rate": 1.7070044660729814e-05, "loss": 1.2483, "step": 9144 }, { "epoch": 2.7238034959697686, "grad_norm": 0.23040041327476501, "learning_rate": 1.7069362477747798e-05, "loss": 1.2545, "step": 9145 }, { "epoch": 2.7241013421694373, "grad_norm": 0.2614309787750244, "learning_rate": 1.7068680228993263e-05, "loss": 1.2396, "step": 9146 }, { "epoch": 2.724399188369106, "grad_norm": 0.23362092673778534, "learning_rate": 1.7067997914472557e-05, "loss": 1.2332, "step": 9147 }, { "epoch": 2.7246970345687744, "grad_norm": 0.320008248090744, "learning_rate": 1.7067315534192024e-05, "loss": 1.2546, "step": 9148 }, { "epoch": 2.724994880768443, "grad_norm": 0.25644442439079285, "learning_rate": 1.7066633088158017e-05, "loss": 1.2362, "step": 9149 }, { "epoch": 2.725292726968112, "grad_norm": 0.24778707325458527, "learning_rate": 1.7065950576376886e-05, "loss": 1.2299, "step": 9150 }, { "epoch": 2.7255905731677803, "grad_norm": 0.2621813118457794, "learning_rate": 1.706526799885498e-05, "loss": 1.2734, "step": 9151 }, { "epoch": 2.725888419367449, "grad_norm": 0.2463354915380478, "learning_rate": 1.706458535559865e-05, "loss": 1.2475, "step": 9152 }, { "epoch": 2.726186265567118, "grad_norm": 0.26674890518188477, "learning_rate": 1.7063902646614242e-05, "loss": 1.2425, "step": 9153 }, { "epoch": 2.726484111766786, "grad_norm": 0.2545463442802429, "learning_rate": 1.7063219871908118e-05, "loss": 1.2567, "step": 9154 }, { "epoch": 2.726781957966455, "grad_norm": 0.2650394141674042, "learning_rate": 1.706253703148662e-05, "loss": 1.2505, "step": 9155 }, { "epoch": 2.7270798041661237, "grad_norm": 0.23900756239891052, "learning_rate": 1.7061854125356107e-05, "loss": 1.2443, "step": 9156 }, { "epoch": 2.7273776503657925, "grad_norm": 0.22633816301822662, "learning_rate": 1.7061171153522932e-05, "loss": 1.2559, "step": 9157 }, { "epoch": 2.7276754965654613, "grad_norm": 0.269744873046875, "learning_rate": 1.7060488115993448e-05, "loss": 1.2538, "step": 9158 }, { "epoch": 2.7279733427651296, "grad_norm": 0.3422834277153015, "learning_rate": 1.7059805012774008e-05, "loss": 1.267, "step": 9159 }, { "epoch": 2.7282711889647984, "grad_norm": 0.28951409459114075, "learning_rate": 1.7059121843870975e-05, "loss": 1.2645, "step": 9160 }, { "epoch": 2.728569035164467, "grad_norm": 0.25171276926994324, "learning_rate": 1.7058438609290697e-05, "loss": 1.2359, "step": 9161 }, { "epoch": 2.7288668813641355, "grad_norm": 0.3104507029056549, "learning_rate": 1.7057755309039535e-05, "loss": 1.2523, "step": 9162 }, { "epoch": 2.7291647275638042, "grad_norm": 0.2615697383880615, "learning_rate": 1.7057071943123845e-05, "loss": 1.2636, "step": 9163 }, { "epoch": 2.729462573763473, "grad_norm": 0.264263391494751, "learning_rate": 1.7056388511549985e-05, "loss": 1.2327, "step": 9164 }, { "epoch": 2.7297604199631413, "grad_norm": 0.30043691396713257, "learning_rate": 1.7055705014324313e-05, "loss": 1.2422, "step": 9165 }, { "epoch": 2.73005826616281, "grad_norm": 0.25919532775878906, "learning_rate": 1.7055021451453188e-05, "loss": 1.2371, "step": 9166 }, { "epoch": 2.730356112362479, "grad_norm": 0.4882142245769501, "learning_rate": 1.7054337822942976e-05, "loss": 1.2396, "step": 9167 }, { "epoch": 2.730653958562147, "grad_norm": 0.302798867225647, "learning_rate": 1.7053654128800026e-05, "loss": 1.2582, "step": 9168 }, { "epoch": 2.730951804761816, "grad_norm": 0.2920604348182678, "learning_rate": 1.705297036903071e-05, "loss": 1.2511, "step": 9169 }, { "epoch": 2.7312496509614848, "grad_norm": 0.2750656306743622, "learning_rate": 1.705228654364138e-05, "loss": 1.2484, "step": 9170 }, { "epoch": 2.7315474971611535, "grad_norm": 0.3999350666999817, "learning_rate": 1.7051602652638405e-05, "loss": 1.2464, "step": 9171 }, { "epoch": 2.7318453433608223, "grad_norm": 0.2648412883281708, "learning_rate": 1.7050918696028147e-05, "loss": 1.2611, "step": 9172 }, { "epoch": 2.7321431895604906, "grad_norm": 0.23810319602489471, "learning_rate": 1.7050234673816967e-05, "loss": 1.2693, "step": 9173 }, { "epoch": 2.7324410357601594, "grad_norm": 0.24768781661987305, "learning_rate": 1.7049550586011234e-05, "loss": 1.2419, "step": 9174 }, { "epoch": 2.732738881959828, "grad_norm": 0.23804834485054016, "learning_rate": 1.7048866432617303e-05, "loss": 1.2585, "step": 9175 }, { "epoch": 2.7330367281594965, "grad_norm": 0.23359544575214386, "learning_rate": 1.7048182213641548e-05, "loss": 1.2363, "step": 9176 }, { "epoch": 2.7333345743591653, "grad_norm": 0.24701635539531708, "learning_rate": 1.7047497929090332e-05, "loss": 1.2334, "step": 9177 }, { "epoch": 2.733632420558834, "grad_norm": 0.43969398736953735, "learning_rate": 1.704681357897002e-05, "loss": 1.246, "step": 9178 }, { "epoch": 2.7339302667585024, "grad_norm": 0.27062731981277466, "learning_rate": 1.7046129163286985e-05, "loss": 1.257, "step": 9179 }, { "epoch": 2.734228112958171, "grad_norm": 0.23545852303504944, "learning_rate": 1.704544468204759e-05, "loss": 1.2603, "step": 9180 }, { "epoch": 2.73452595915784, "grad_norm": 0.23939058184623718, "learning_rate": 1.7044760135258203e-05, "loss": 1.2456, "step": 9181 }, { "epoch": 2.7348238053575082, "grad_norm": 0.25682955980300903, "learning_rate": 1.7044075522925192e-05, "loss": 1.2545, "step": 9182 }, { "epoch": 2.735121651557177, "grad_norm": 0.24427543580532074, "learning_rate": 1.704339084505493e-05, "loss": 1.2584, "step": 9183 }, { "epoch": 2.735419497756846, "grad_norm": 0.33743566274642944, "learning_rate": 1.7042706101653784e-05, "loss": 1.2453, "step": 9184 }, { "epoch": 2.7357173439565146, "grad_norm": 0.2339986264705658, "learning_rate": 1.704202129272813e-05, "loss": 1.2458, "step": 9185 }, { "epoch": 2.7360151901561833, "grad_norm": 0.24717910587787628, "learning_rate": 1.704133641828433e-05, "loss": 1.2488, "step": 9186 }, { "epoch": 2.7363130363558517, "grad_norm": 0.2506367564201355, "learning_rate": 1.7040651478328765e-05, "loss": 1.2379, "step": 9187 }, { "epoch": 2.7366108825555204, "grad_norm": 0.24618619680404663, "learning_rate": 1.7039966472867805e-05, "loss": 1.2304, "step": 9188 }, { "epoch": 2.736908728755189, "grad_norm": 0.24516215920448303, "learning_rate": 1.7039281401907822e-05, "loss": 1.2407, "step": 9189 }, { "epoch": 2.7372065749548575, "grad_norm": 0.2300100177526474, "learning_rate": 1.7038596265455188e-05, "loss": 1.2432, "step": 9190 }, { "epoch": 2.7375044211545263, "grad_norm": 0.25353389978408813, "learning_rate": 1.7037911063516285e-05, "loss": 1.2352, "step": 9191 }, { "epoch": 2.737802267354195, "grad_norm": 0.23846863210201263, "learning_rate": 1.703722579609748e-05, "loss": 1.2391, "step": 9192 }, { "epoch": 2.7381001135538634, "grad_norm": 0.2458648383617401, "learning_rate": 1.703654046320515e-05, "loss": 1.2521, "step": 9193 }, { "epoch": 2.738397959753532, "grad_norm": 0.2297109216451645, "learning_rate": 1.7035855064845672e-05, "loss": 1.2378, "step": 9194 }, { "epoch": 2.738695805953201, "grad_norm": 0.23073622584342957, "learning_rate": 1.7035169601025426e-05, "loss": 1.2488, "step": 9195 }, { "epoch": 2.7389936521528697, "grad_norm": 0.24436400830745697, "learning_rate": 1.7034484071750786e-05, "loss": 1.2352, "step": 9196 }, { "epoch": 2.739291498352538, "grad_norm": 0.24284270405769348, "learning_rate": 1.703379847702813e-05, "loss": 1.2321, "step": 9197 }, { "epoch": 2.739589344552207, "grad_norm": 0.27496808767318726, "learning_rate": 1.703311281686384e-05, "loss": 1.2525, "step": 9198 }, { "epoch": 2.7398871907518756, "grad_norm": 0.2907407581806183, "learning_rate": 1.703242709126429e-05, "loss": 1.2437, "step": 9199 }, { "epoch": 2.7401850369515444, "grad_norm": 0.2722211182117462, "learning_rate": 1.7031741300235863e-05, "loss": 1.262, "step": 9200 }, { "epoch": 2.7404828831512127, "grad_norm": 0.41545718908309937, "learning_rate": 1.7031055443784943e-05, "loss": 1.2439, "step": 9201 }, { "epoch": 2.7407807293508815, "grad_norm": 0.2844906151294708, "learning_rate": 1.7030369521917908e-05, "loss": 1.2572, "step": 9202 }, { "epoch": 2.7410785755505502, "grad_norm": 0.28576919436454773, "learning_rate": 1.7029683534641136e-05, "loss": 1.2475, "step": 9203 }, { "epoch": 2.7413764217502186, "grad_norm": 0.30909910798072815, "learning_rate": 1.7028997481961016e-05, "loss": 1.2406, "step": 9204 }, { "epoch": 2.7416742679498873, "grad_norm": 0.2515838146209717, "learning_rate": 1.7028311363883925e-05, "loss": 1.2579, "step": 9205 }, { "epoch": 2.741972114149556, "grad_norm": 0.2506393492221832, "learning_rate": 1.7027625180416247e-05, "loss": 1.2601, "step": 9206 }, { "epoch": 2.7422699603492244, "grad_norm": 0.2576870024204254, "learning_rate": 1.7026938931564374e-05, "loss": 1.2521, "step": 9207 }, { "epoch": 2.742567806548893, "grad_norm": 0.24623283743858337, "learning_rate": 1.7026252617334683e-05, "loss": 1.2455, "step": 9208 }, { "epoch": 2.742865652748562, "grad_norm": 0.30400317907333374, "learning_rate": 1.702556623773356e-05, "loss": 1.2555, "step": 9209 }, { "epoch": 2.7431634989482307, "grad_norm": 0.2484186589717865, "learning_rate": 1.7024879792767395e-05, "loss": 1.2636, "step": 9210 }, { "epoch": 2.7434613451478995, "grad_norm": 0.2822059392929077, "learning_rate": 1.702419328244257e-05, "loss": 1.2549, "step": 9211 }, { "epoch": 2.743759191347568, "grad_norm": 0.24002540111541748, "learning_rate": 1.7023506706765477e-05, "loss": 1.2465, "step": 9212 }, { "epoch": 2.7440570375472366, "grad_norm": 0.33280149102211, "learning_rate": 1.70228200657425e-05, "loss": 1.248, "step": 9213 }, { "epoch": 2.7443548837469054, "grad_norm": 0.22955799102783203, "learning_rate": 1.7022133359380028e-05, "loss": 1.2651, "step": 9214 }, { "epoch": 2.7446527299465737, "grad_norm": 0.2753639817237854, "learning_rate": 1.702144658768445e-05, "loss": 1.2535, "step": 9215 }, { "epoch": 2.7449505761462425, "grad_norm": 0.25297975540161133, "learning_rate": 1.7020759750662156e-05, "loss": 1.2441, "step": 9216 }, { "epoch": 2.7452484223459113, "grad_norm": 0.31673431396484375, "learning_rate": 1.702007284831954e-05, "loss": 1.2441, "step": 9217 }, { "epoch": 2.7455462685455796, "grad_norm": 0.24094048142433167, "learning_rate": 1.7019385880662985e-05, "loss": 1.2575, "step": 9218 }, { "epoch": 2.7458441147452484, "grad_norm": 0.2683247923851013, "learning_rate": 1.7018698847698893e-05, "loss": 1.268, "step": 9219 }, { "epoch": 2.746141960944917, "grad_norm": 0.2574305236339569, "learning_rate": 1.7018011749433646e-05, "loss": 1.2405, "step": 9220 }, { "epoch": 2.7464398071445855, "grad_norm": 0.28552350401878357, "learning_rate": 1.701732458587364e-05, "loss": 1.2457, "step": 9221 }, { "epoch": 2.7467376533442542, "grad_norm": 0.2695595920085907, "learning_rate": 1.701663735702527e-05, "loss": 1.2521, "step": 9222 }, { "epoch": 2.747035499543923, "grad_norm": 0.27349793910980225, "learning_rate": 1.7015950062894928e-05, "loss": 1.2538, "step": 9223 }, { "epoch": 2.747333345743592, "grad_norm": 0.27242761850357056, "learning_rate": 1.701526270348901e-05, "loss": 1.2426, "step": 9224 }, { "epoch": 2.7476311919432606, "grad_norm": 0.2456509917974472, "learning_rate": 1.701457527881391e-05, "loss": 1.2544, "step": 9225 }, { "epoch": 2.747929038142929, "grad_norm": 0.27910783886909485, "learning_rate": 1.7013887788876025e-05, "loss": 1.2508, "step": 9226 }, { "epoch": 2.7482268843425977, "grad_norm": 0.26685217022895813, "learning_rate": 1.7013200233681752e-05, "loss": 1.2436, "step": 9227 }, { "epoch": 2.7485247305422664, "grad_norm": 0.2567109763622284, "learning_rate": 1.7012512613237488e-05, "loss": 1.2398, "step": 9228 }, { "epoch": 2.7488225767419348, "grad_norm": 0.24262216687202454, "learning_rate": 1.701182492754962e-05, "loss": 1.2368, "step": 9229 }, { "epoch": 2.7491204229416035, "grad_norm": 0.2572483420372009, "learning_rate": 1.7011137176624562e-05, "loss": 1.2506, "step": 9230 }, { "epoch": 2.7494182691412723, "grad_norm": 0.24260063469409943, "learning_rate": 1.7010449360468704e-05, "loss": 1.257, "step": 9231 }, { "epoch": 2.7497161153409406, "grad_norm": 0.24273715913295746, "learning_rate": 1.700976147908845e-05, "loss": 1.2426, "step": 9232 }, { "epoch": 2.7500139615406094, "grad_norm": 0.2752387225627899, "learning_rate": 1.7009073532490195e-05, "loss": 1.2475, "step": 9233 }, { "epoch": 2.750311807740278, "grad_norm": 0.22555620968341827, "learning_rate": 1.700838552068034e-05, "loss": 1.2351, "step": 9234 }, { "epoch": 2.7506096539399465, "grad_norm": 0.26723402738571167, "learning_rate": 1.7007697443665292e-05, "loss": 1.2399, "step": 9235 }, { "epoch": 2.7509075001396153, "grad_norm": 0.27106043696403503, "learning_rate": 1.7007009301451446e-05, "loss": 1.2398, "step": 9236 }, { "epoch": 2.751205346339284, "grad_norm": 0.2399388551712036, "learning_rate": 1.7006321094045205e-05, "loss": 1.2508, "step": 9237 }, { "epoch": 2.751503192538953, "grad_norm": 0.2277744561433792, "learning_rate": 1.700563282145298e-05, "loss": 1.2556, "step": 9238 }, { "epoch": 2.7518010387386216, "grad_norm": 0.23171816766262054, "learning_rate": 1.7004944483681164e-05, "loss": 1.2339, "step": 9239 }, { "epoch": 2.75209888493829, "grad_norm": 0.24868685007095337, "learning_rate": 1.7004256080736167e-05, "loss": 1.255, "step": 9240 }, { "epoch": 2.7523967311379587, "grad_norm": 0.2417503297328949, "learning_rate": 1.7003567612624393e-05, "loss": 1.2489, "step": 9241 }, { "epoch": 2.7526945773376275, "grad_norm": 0.27626389265060425, "learning_rate": 1.7002879079352247e-05, "loss": 1.247, "step": 9242 }, { "epoch": 2.752992423537296, "grad_norm": 0.30410346388816833, "learning_rate": 1.7002190480926138e-05, "loss": 1.237, "step": 9243 }, { "epoch": 2.7532902697369646, "grad_norm": 0.2708747982978821, "learning_rate": 1.7001501817352468e-05, "loss": 1.2487, "step": 9244 }, { "epoch": 2.7535881159366333, "grad_norm": 0.2584022581577301, "learning_rate": 1.7000813088637645e-05, "loss": 1.2526, "step": 9245 }, { "epoch": 2.7538859621363017, "grad_norm": 0.4966140687465668, "learning_rate": 1.7000124294788078e-05, "loss": 1.2393, "step": 9246 }, { "epoch": 2.7541838083359704, "grad_norm": 0.32315561175346375, "learning_rate": 1.6999435435810175e-05, "loss": 1.2388, "step": 9247 }, { "epoch": 2.754481654535639, "grad_norm": 0.2769085168838501, "learning_rate": 1.699874651171035e-05, "loss": 1.2641, "step": 9248 }, { "epoch": 2.7547795007353075, "grad_norm": 0.2551030218601227, "learning_rate": 1.6998057522495002e-05, "loss": 1.2456, "step": 9249 }, { "epoch": 2.7550773469349763, "grad_norm": 0.24251501262187958, "learning_rate": 1.699736846817055e-05, "loss": 1.2481, "step": 9250 }, { "epoch": 2.755375193134645, "grad_norm": 0.3502722382545471, "learning_rate": 1.6996679348743402e-05, "loss": 1.236, "step": 9251 }, { "epoch": 2.755673039334314, "grad_norm": 0.24152779579162598, "learning_rate": 1.6995990164219973e-05, "loss": 1.2549, "step": 9252 }, { "epoch": 2.7559708855339826, "grad_norm": 0.23586568236351013, "learning_rate": 1.699530091460667e-05, "loss": 1.2281, "step": 9253 }, { "epoch": 2.756268731733651, "grad_norm": 0.25303852558135986, "learning_rate": 1.6994611599909907e-05, "loss": 1.255, "step": 9254 }, { "epoch": 2.7565665779333197, "grad_norm": 0.26098665595054626, "learning_rate": 1.6993922220136098e-05, "loss": 1.2457, "step": 9255 }, { "epoch": 2.7568644241329885, "grad_norm": 0.23625260591506958, "learning_rate": 1.6993232775291658e-05, "loss": 1.2349, "step": 9256 }, { "epoch": 2.757162270332657, "grad_norm": 0.28086790442466736, "learning_rate": 1.6992543265382996e-05, "loss": 1.245, "step": 9257 }, { "epoch": 2.7574601165323256, "grad_norm": 0.2951943576335907, "learning_rate": 1.6991853690416535e-05, "loss": 1.254, "step": 9258 }, { "epoch": 2.7577579627319944, "grad_norm": 0.2557200789451599, "learning_rate": 1.6991164050398686e-05, "loss": 1.2471, "step": 9259 }, { "epoch": 2.7580558089316627, "grad_norm": 0.2863323390483856, "learning_rate": 1.6990474345335866e-05, "loss": 1.2658, "step": 9260 }, { "epoch": 2.7583536551313315, "grad_norm": 0.25151994824409485, "learning_rate": 1.6989784575234495e-05, "loss": 1.2448, "step": 9261 }, { "epoch": 2.7586515013310002, "grad_norm": 0.2653193771839142, "learning_rate": 1.6989094740100987e-05, "loss": 1.2325, "step": 9262 }, { "epoch": 2.758949347530669, "grad_norm": 0.30489224195480347, "learning_rate": 1.6988404839941763e-05, "loss": 1.2447, "step": 9263 }, { "epoch": 2.7592471937303373, "grad_norm": 0.2491319179534912, "learning_rate": 1.6987714874763236e-05, "loss": 1.2418, "step": 9264 }, { "epoch": 2.759545039930006, "grad_norm": 0.28793221712112427, "learning_rate": 1.698702484457183e-05, "loss": 1.2545, "step": 9265 }, { "epoch": 2.759842886129675, "grad_norm": 0.2668450176715851, "learning_rate": 1.6986334749373965e-05, "loss": 1.2442, "step": 9266 }, { "epoch": 2.7601407323293436, "grad_norm": 0.23432527482509613, "learning_rate": 1.698564458917606e-05, "loss": 1.23, "step": 9267 }, { "epoch": 2.760438578529012, "grad_norm": 0.3287263512611389, "learning_rate": 1.6984954363984537e-05, "loss": 1.2459, "step": 9268 }, { "epoch": 2.7607364247286807, "grad_norm": 0.24850362539291382, "learning_rate": 1.698426407380582e-05, "loss": 1.2465, "step": 9269 }, { "epoch": 2.7610342709283495, "grad_norm": 0.2640020251274109, "learning_rate": 1.6983573718646328e-05, "loss": 1.2357, "step": 9270 }, { "epoch": 2.761332117128018, "grad_norm": 0.27306121587753296, "learning_rate": 1.6982883298512483e-05, "loss": 1.2486, "step": 9271 }, { "epoch": 2.7616299633276866, "grad_norm": 0.24212612211704254, "learning_rate": 1.6982192813410713e-05, "loss": 1.2317, "step": 9272 }, { "epoch": 2.7619278095273554, "grad_norm": 0.2920074164867401, "learning_rate": 1.6981502263347438e-05, "loss": 1.2299, "step": 9273 }, { "epoch": 2.7622256557270237, "grad_norm": 0.25239890813827515, "learning_rate": 1.6980811648329086e-05, "loss": 1.2516, "step": 9274 }, { "epoch": 2.7625235019266925, "grad_norm": 0.2624033987522125, "learning_rate": 1.698012096836208e-05, "loss": 1.2487, "step": 9275 }, { "epoch": 2.7628213481263613, "grad_norm": 0.2670723497867584, "learning_rate": 1.697943022345285e-05, "loss": 1.2631, "step": 9276 }, { "epoch": 2.76311919432603, "grad_norm": 0.24647146463394165, "learning_rate": 1.6978739413607815e-05, "loss": 1.2384, "step": 9277 }, { "epoch": 2.763417040525699, "grad_norm": 0.27723342180252075, "learning_rate": 1.697804853883341e-05, "loss": 1.255, "step": 9278 }, { "epoch": 2.763714886725367, "grad_norm": 0.2784757614135742, "learning_rate": 1.6977357599136057e-05, "loss": 1.2453, "step": 9279 }, { "epoch": 2.764012732925036, "grad_norm": 0.27276426553726196, "learning_rate": 1.697666659452219e-05, "loss": 1.2494, "step": 9280 }, { "epoch": 2.7643105791247047, "grad_norm": 0.2571217715740204, "learning_rate": 1.6975975524998234e-05, "loss": 1.2457, "step": 9281 }, { "epoch": 2.764608425324373, "grad_norm": 0.23558710515499115, "learning_rate": 1.6975284390570622e-05, "loss": 1.2638, "step": 9282 }, { "epoch": 2.764906271524042, "grad_norm": 0.2826679050922394, "learning_rate": 1.6974593191245778e-05, "loss": 1.2606, "step": 9283 }, { "epoch": 2.7652041177237106, "grad_norm": 0.2285085916519165, "learning_rate": 1.697390192703014e-05, "loss": 1.2543, "step": 9284 }, { "epoch": 2.765501963923379, "grad_norm": 0.2481565624475479, "learning_rate": 1.6973210597930135e-05, "loss": 1.2418, "step": 9285 }, { "epoch": 2.7657998101230477, "grad_norm": 0.24361467361450195, "learning_rate": 1.6972519203952194e-05, "loss": 1.2509, "step": 9286 }, { "epoch": 2.7660976563227164, "grad_norm": 0.24368710815906525, "learning_rate": 1.6971827745102754e-05, "loss": 1.2552, "step": 9287 }, { "epoch": 2.7663955025223848, "grad_norm": 0.258975625038147, "learning_rate": 1.697113622138825e-05, "loss": 1.2506, "step": 9288 }, { "epoch": 2.7666933487220535, "grad_norm": 0.23841984570026398, "learning_rate": 1.6970444632815106e-05, "loss": 1.2537, "step": 9289 }, { "epoch": 2.7669911949217223, "grad_norm": 0.24847932159900665, "learning_rate": 1.6969752979389763e-05, "loss": 1.2323, "step": 9290 }, { "epoch": 2.767289041121391, "grad_norm": 0.24395668506622314, "learning_rate": 1.6969061261118658e-05, "loss": 1.2449, "step": 9291 }, { "epoch": 2.76758688732106, "grad_norm": 0.29188939929008484, "learning_rate": 1.6968369478008224e-05, "loss": 1.2464, "step": 9292 }, { "epoch": 2.767884733520728, "grad_norm": 0.24827846884727478, "learning_rate": 1.69676776300649e-05, "loss": 1.2365, "step": 9293 }, { "epoch": 2.768182579720397, "grad_norm": 0.27385246753692627, "learning_rate": 1.6966985717295114e-05, "loss": 1.235, "step": 9294 }, { "epoch": 2.7684804259200657, "grad_norm": 0.30806779861450195, "learning_rate": 1.6966293739705316e-05, "loss": 1.2468, "step": 9295 }, { "epoch": 2.768778272119734, "grad_norm": 0.23335260152816772, "learning_rate": 1.6965601697301935e-05, "loss": 1.2356, "step": 9296 }, { "epoch": 2.769076118319403, "grad_norm": 0.23949144780635834, "learning_rate": 1.6964909590091414e-05, "loss": 1.2563, "step": 9297 }, { "epoch": 2.7693739645190716, "grad_norm": 0.24809135496616364, "learning_rate": 1.696421741808019e-05, "loss": 1.2463, "step": 9298 }, { "epoch": 2.76967181071874, "grad_norm": 0.29287445545196533, "learning_rate": 1.6963525181274706e-05, "loss": 1.2589, "step": 9299 }, { "epoch": 2.7699696569184087, "grad_norm": 0.23110057413578033, "learning_rate": 1.6962832879681396e-05, "loss": 1.2331, "step": 9300 }, { "epoch": 2.7702675031180775, "grad_norm": 0.26416778564453125, "learning_rate": 1.6962140513306707e-05, "loss": 1.2619, "step": 9301 }, { "epoch": 2.770565349317746, "grad_norm": 0.27834850549697876, "learning_rate": 1.696144808215708e-05, "loss": 1.2324, "step": 9302 }, { "epoch": 2.7708631955174146, "grad_norm": 0.2294827103614807, "learning_rate": 1.696075558623896e-05, "loss": 1.2399, "step": 9303 }, { "epoch": 2.7711610417170833, "grad_norm": 0.24087277054786682, "learning_rate": 1.6960063025558778e-05, "loss": 1.2491, "step": 9304 }, { "epoch": 2.771458887916752, "grad_norm": 0.2239975780248642, "learning_rate": 1.6959370400122993e-05, "loss": 1.231, "step": 9305 }, { "epoch": 2.771756734116421, "grad_norm": 0.24244676530361176, "learning_rate": 1.6958677709938037e-05, "loss": 1.2539, "step": 9306 }, { "epoch": 2.772054580316089, "grad_norm": 0.24967491626739502, "learning_rate": 1.695798495501036e-05, "loss": 1.2625, "step": 9307 }, { "epoch": 2.772352426515758, "grad_norm": 0.23692266643047333, "learning_rate": 1.695729213534641e-05, "loss": 1.234, "step": 9308 }, { "epoch": 2.7726502727154267, "grad_norm": 0.24569763243198395, "learning_rate": 1.6956599250952627e-05, "loss": 1.2449, "step": 9309 }, { "epoch": 2.772948118915095, "grad_norm": 0.24894258379936218, "learning_rate": 1.6955906301835465e-05, "loss": 1.2534, "step": 9310 }, { "epoch": 2.773245965114764, "grad_norm": 0.23010030388832092, "learning_rate": 1.6955213288001362e-05, "loss": 1.2241, "step": 9311 }, { "epoch": 2.7735438113144326, "grad_norm": 0.23309226334095, "learning_rate": 1.695452020945677e-05, "loss": 1.2369, "step": 9312 }, { "epoch": 2.773841657514101, "grad_norm": 0.23277804255485535, "learning_rate": 1.6953827066208138e-05, "loss": 1.2417, "step": 9313 }, { "epoch": 2.7741395037137697, "grad_norm": 0.24053636193275452, "learning_rate": 1.6953133858261916e-05, "loss": 1.2355, "step": 9314 }, { "epoch": 2.7744373499134385, "grad_norm": 0.2710404396057129, "learning_rate": 1.6952440585624553e-05, "loss": 1.2484, "step": 9315 }, { "epoch": 2.7747351961131073, "grad_norm": 0.27087676525115967, "learning_rate": 1.6951747248302495e-05, "loss": 1.2536, "step": 9316 }, { "epoch": 2.7750330423127756, "grad_norm": 0.24413885176181793, "learning_rate": 1.6951053846302198e-05, "loss": 1.2407, "step": 9317 }, { "epoch": 2.7753308885124444, "grad_norm": 0.5456312298774719, "learning_rate": 1.695036037963011e-05, "loss": 1.2433, "step": 9318 }, { "epoch": 2.775628734712113, "grad_norm": 0.3109140694141388, "learning_rate": 1.6949666848292683e-05, "loss": 1.2629, "step": 9319 }, { "epoch": 2.775926580911782, "grad_norm": 0.2805221974849701, "learning_rate": 1.6948973252296376e-05, "loss": 1.2384, "step": 9320 }, { "epoch": 2.7762244271114502, "grad_norm": 0.26506713032722473, "learning_rate": 1.694827959164763e-05, "loss": 1.2481, "step": 9321 }, { "epoch": 2.776522273311119, "grad_norm": 0.24361567199230194, "learning_rate": 1.694758586635291e-05, "loss": 1.2376, "step": 9322 }, { "epoch": 2.7768201195107878, "grad_norm": 0.24067051708698273, "learning_rate": 1.6946892076418665e-05, "loss": 1.2508, "step": 9323 }, { "epoch": 2.777117965710456, "grad_norm": 0.27331697940826416, "learning_rate": 1.6946198221851348e-05, "loss": 1.2426, "step": 9324 }, { "epoch": 2.777415811910125, "grad_norm": 0.2714940011501312, "learning_rate": 1.6945504302657418e-05, "loss": 1.2588, "step": 9325 }, { "epoch": 2.7777136581097936, "grad_norm": 0.2522963583469391, "learning_rate": 1.6944810318843332e-05, "loss": 1.2494, "step": 9326 }, { "epoch": 2.778011504309462, "grad_norm": 0.2336743324995041, "learning_rate": 1.6944116270415546e-05, "loss": 1.2546, "step": 9327 }, { "epoch": 2.7783093505091307, "grad_norm": 0.23527158796787262, "learning_rate": 1.6943422157380515e-05, "loss": 1.2401, "step": 9328 }, { "epoch": 2.7786071967087995, "grad_norm": 0.24809874594211578, "learning_rate": 1.6942727979744697e-05, "loss": 1.2405, "step": 9329 }, { "epoch": 2.7789050429084683, "grad_norm": 0.24884772300720215, "learning_rate": 1.694203373751455e-05, "loss": 1.2594, "step": 9330 }, { "epoch": 2.7792028891081366, "grad_norm": 0.2442944049835205, "learning_rate": 1.6941339430696545e-05, "loss": 1.2556, "step": 9331 }, { "epoch": 2.7795007353078054, "grad_norm": 0.22943776845932007, "learning_rate": 1.6940645059297122e-05, "loss": 1.2518, "step": 9332 }, { "epoch": 2.779798581507474, "grad_norm": 0.245171919465065, "learning_rate": 1.6939950623322757e-05, "loss": 1.231, "step": 9333 }, { "epoch": 2.780096427707143, "grad_norm": 0.23680929839611053, "learning_rate": 1.6939256122779904e-05, "loss": 1.2571, "step": 9334 }, { "epoch": 2.7803942739068113, "grad_norm": 0.2431255429983139, "learning_rate": 1.6938561557675024e-05, "loss": 1.2583, "step": 9335 }, { "epoch": 2.78069212010648, "grad_norm": 0.22574183344841003, "learning_rate": 1.6937866928014582e-05, "loss": 1.2372, "step": 9336 }, { "epoch": 2.780989966306149, "grad_norm": 0.25322556495666504, "learning_rate": 1.693717223380504e-05, "loss": 1.2282, "step": 9337 }, { "epoch": 2.781287812505817, "grad_norm": 0.23538853228092194, "learning_rate": 1.6936477475052862e-05, "loss": 1.2342, "step": 9338 }, { "epoch": 2.781585658705486, "grad_norm": 0.23108044266700745, "learning_rate": 1.6935782651764506e-05, "loss": 1.2423, "step": 9339 }, { "epoch": 2.7818835049051547, "grad_norm": 0.2336004227399826, "learning_rate": 1.6935087763946446e-05, "loss": 1.2539, "step": 9340 }, { "epoch": 2.782181351104823, "grad_norm": 0.23251007497310638, "learning_rate": 1.6934392811605144e-05, "loss": 1.2472, "step": 9341 }, { "epoch": 2.782479197304492, "grad_norm": 0.23549939692020416, "learning_rate": 1.6933697794747062e-05, "loss": 1.2491, "step": 9342 }, { "epoch": 2.7827770435041606, "grad_norm": 0.23823584616184235, "learning_rate": 1.6933002713378667e-05, "loss": 1.2329, "step": 9343 }, { "epoch": 2.7830748897038293, "grad_norm": 0.22635841369628906, "learning_rate": 1.6932307567506433e-05, "loss": 1.2418, "step": 9344 }, { "epoch": 2.783372735903498, "grad_norm": 0.23029498755931854, "learning_rate": 1.6931612357136817e-05, "loss": 1.2368, "step": 9345 }, { "epoch": 2.7836705821031664, "grad_norm": 0.23168355226516724, "learning_rate": 1.6930917082276295e-05, "loss": 1.2332, "step": 9346 }, { "epoch": 2.783968428302835, "grad_norm": 0.2235199213027954, "learning_rate": 1.6930221742931334e-05, "loss": 1.2566, "step": 9347 }, { "epoch": 2.784266274502504, "grad_norm": 0.22563199698925018, "learning_rate": 1.69295263391084e-05, "loss": 1.2341, "step": 9348 }, { "epoch": 2.7845641207021723, "grad_norm": 0.23209375143051147, "learning_rate": 1.6928830870813965e-05, "loss": 1.26, "step": 9349 }, { "epoch": 2.784861966901841, "grad_norm": 0.252802312374115, "learning_rate": 1.6928135338054502e-05, "loss": 1.2449, "step": 9350 }, { "epoch": 2.78515981310151, "grad_norm": 0.22846385836601257, "learning_rate": 1.6927439740836483e-05, "loss": 1.2584, "step": 9351 }, { "epoch": 2.785457659301178, "grad_norm": 0.23619228601455688, "learning_rate": 1.6926744079166375e-05, "loss": 1.2397, "step": 9352 }, { "epoch": 2.785755505500847, "grad_norm": 0.2411826252937317, "learning_rate": 1.692604835305065e-05, "loss": 1.2514, "step": 9353 }, { "epoch": 2.7860533517005157, "grad_norm": 0.237585186958313, "learning_rate": 1.6925352562495784e-05, "loss": 1.2458, "step": 9354 }, { "epoch": 2.786351197900184, "grad_norm": 0.23259398341178894, "learning_rate": 1.6924656707508254e-05, "loss": 1.243, "step": 9355 }, { "epoch": 2.786649044099853, "grad_norm": 0.252854585647583, "learning_rate": 1.6923960788094524e-05, "loss": 1.2571, "step": 9356 }, { "epoch": 2.7869468902995216, "grad_norm": 0.2337467074394226, "learning_rate": 1.692326480426108e-05, "loss": 1.2497, "step": 9357 }, { "epoch": 2.7872447364991904, "grad_norm": 0.25307777523994446, "learning_rate": 1.692256875601439e-05, "loss": 1.2414, "step": 9358 }, { "epoch": 2.787542582698859, "grad_norm": 0.23664496839046478, "learning_rate": 1.692187264336093e-05, "loss": 1.2317, "step": 9359 }, { "epoch": 2.7878404288985275, "grad_norm": 0.28514930605888367, "learning_rate": 1.692117646630718e-05, "loss": 1.2497, "step": 9360 }, { "epoch": 2.7881382750981962, "grad_norm": 0.23014235496520996, "learning_rate": 1.6920480224859618e-05, "loss": 1.2328, "step": 9361 }, { "epoch": 2.788436121297865, "grad_norm": 0.2634677290916443, "learning_rate": 1.6919783919024717e-05, "loss": 1.2427, "step": 9362 }, { "epoch": 2.7887339674975333, "grad_norm": 0.23677754402160645, "learning_rate": 1.691908754880896e-05, "loss": 1.2506, "step": 9363 }, { "epoch": 2.789031813697202, "grad_norm": 0.3485448658466339, "learning_rate": 1.691839111421882e-05, "loss": 1.2555, "step": 9364 }, { "epoch": 2.789329659896871, "grad_norm": 0.24845841526985168, "learning_rate": 1.6917694615260785e-05, "loss": 1.2627, "step": 9365 }, { "epoch": 2.789627506096539, "grad_norm": 0.2844274938106537, "learning_rate": 1.6916998051941326e-05, "loss": 1.2422, "step": 9366 }, { "epoch": 2.789925352296208, "grad_norm": 0.2269563376903534, "learning_rate": 1.6916301424266933e-05, "loss": 1.243, "step": 9367 }, { "epoch": 2.7902231984958767, "grad_norm": 0.29760172963142395, "learning_rate": 1.691560473224408e-05, "loss": 1.2218, "step": 9368 }, { "epoch": 2.790521044695545, "grad_norm": 0.24669700860977173, "learning_rate": 1.691490797587925e-05, "loss": 1.255, "step": 9369 }, { "epoch": 2.790818890895214, "grad_norm": 0.28839683532714844, "learning_rate": 1.691421115517893e-05, "loss": 1.2516, "step": 9370 }, { "epoch": 2.7911167370948826, "grad_norm": 0.23452536761760712, "learning_rate": 1.69135142701496e-05, "loss": 1.2457, "step": 9371 }, { "epoch": 2.7914145832945514, "grad_norm": 0.45230355858802795, "learning_rate": 1.6912817320797742e-05, "loss": 1.2415, "step": 9372 }, { "epoch": 2.79171242949422, "grad_norm": 0.2951183617115021, "learning_rate": 1.6912120307129845e-05, "loss": 1.2297, "step": 9373 }, { "epoch": 2.7920102756938885, "grad_norm": 0.29169294238090515, "learning_rate": 1.691142322915239e-05, "loss": 1.2703, "step": 9374 }, { "epoch": 2.7923081218935573, "grad_norm": 0.2725238800048828, "learning_rate": 1.6910726086871863e-05, "loss": 1.2458, "step": 9375 }, { "epoch": 2.792605968093226, "grad_norm": 0.28612467646598816, "learning_rate": 1.6910028880294748e-05, "loss": 1.257, "step": 9376 }, { "epoch": 2.7929038142928944, "grad_norm": 0.28999632596969604, "learning_rate": 1.6909331609427536e-05, "loss": 1.2396, "step": 9377 }, { "epoch": 2.793201660492563, "grad_norm": 0.23493905365467072, "learning_rate": 1.6908634274276718e-05, "loss": 1.2394, "step": 9378 }, { "epoch": 2.793499506692232, "grad_norm": 0.2562492787837982, "learning_rate": 1.6907936874848774e-05, "loss": 1.2443, "step": 9379 }, { "epoch": 2.7937973528919002, "grad_norm": 0.2693139612674713, "learning_rate": 1.6907239411150192e-05, "loss": 1.2583, "step": 9380 }, { "epoch": 2.794095199091569, "grad_norm": 0.2592118978500366, "learning_rate": 1.6906541883187468e-05, "loss": 1.2534, "step": 9381 }, { "epoch": 2.7943930452912378, "grad_norm": 0.2605198621749878, "learning_rate": 1.6905844290967087e-05, "loss": 1.2462, "step": 9382 }, { "epoch": 2.7946908914909065, "grad_norm": 0.28104788064956665, "learning_rate": 1.6905146634495543e-05, "loss": 1.2401, "step": 9383 }, { "epoch": 2.794988737690575, "grad_norm": 0.24958474934101105, "learning_rate": 1.6904448913779325e-05, "loss": 1.2553, "step": 9384 }, { "epoch": 2.7952865838902436, "grad_norm": 0.27100545167922974, "learning_rate": 1.6903751128824917e-05, "loss": 1.2469, "step": 9385 }, { "epoch": 2.7955844300899124, "grad_norm": 0.2590443193912506, "learning_rate": 1.6903053279638826e-05, "loss": 1.2494, "step": 9386 }, { "epoch": 2.795882276289581, "grad_norm": 0.262961208820343, "learning_rate": 1.6902355366227535e-05, "loss": 1.2557, "step": 9387 }, { "epoch": 2.7961801224892495, "grad_norm": 0.31127819418907166, "learning_rate": 1.6901657388597534e-05, "loss": 1.2347, "step": 9388 }, { "epoch": 2.7964779686889183, "grad_norm": 0.2874647378921509, "learning_rate": 1.6900959346755327e-05, "loss": 1.2609, "step": 9389 }, { "epoch": 2.796775814888587, "grad_norm": 0.29039305448532104, "learning_rate": 1.6900261240707402e-05, "loss": 1.2289, "step": 9390 }, { "epoch": 2.7970736610882554, "grad_norm": 0.24280205368995667, "learning_rate": 1.6899563070460255e-05, "loss": 1.2159, "step": 9391 }, { "epoch": 2.797371507287924, "grad_norm": 0.32225850224494934, "learning_rate": 1.6898864836020384e-05, "loss": 1.2248, "step": 9392 }, { "epoch": 2.797669353487593, "grad_norm": 0.23303130269050598, "learning_rate": 1.689816653739428e-05, "loss": 1.2548, "step": 9393 }, { "epoch": 2.7979671996872613, "grad_norm": 0.256865918636322, "learning_rate": 1.6897468174588447e-05, "loss": 1.2558, "step": 9394 }, { "epoch": 2.79826504588693, "grad_norm": 0.24131427705287933, "learning_rate": 1.6896769747609378e-05, "loss": 1.2383, "step": 9395 }, { "epoch": 2.798562892086599, "grad_norm": 0.23806166648864746, "learning_rate": 1.689607125646357e-05, "loss": 1.2578, "step": 9396 }, { "epoch": 2.7988607382862676, "grad_norm": 0.2850431501865387, "learning_rate": 1.6895372701157527e-05, "loss": 1.2362, "step": 9397 }, { "epoch": 2.7991585844859364, "grad_norm": 0.2668682932853699, "learning_rate": 1.6894674081697746e-05, "loss": 1.2443, "step": 9398 }, { "epoch": 2.7994564306856047, "grad_norm": 0.25615400075912476, "learning_rate": 1.6893975398090723e-05, "loss": 1.2526, "step": 9399 }, { "epoch": 2.7997542768852735, "grad_norm": 0.22435817122459412, "learning_rate": 1.689327665034296e-05, "loss": 1.2431, "step": 9400 }, { "epoch": 2.8000521230849422, "grad_norm": 0.2976968288421631, "learning_rate": 1.6892577838460962e-05, "loss": 1.2483, "step": 9401 }, { "epoch": 2.8003499692846106, "grad_norm": 0.24760279059410095, "learning_rate": 1.689187896245123e-05, "loss": 1.2275, "step": 9402 }, { "epoch": 2.8006478154842793, "grad_norm": 0.3072332441806793, "learning_rate": 1.689118002232026e-05, "loss": 1.2455, "step": 9403 }, { "epoch": 2.800945661683948, "grad_norm": 0.2671337127685547, "learning_rate": 1.6890481018074557e-05, "loss": 1.2407, "step": 9404 }, { "epoch": 2.8012435078836164, "grad_norm": 0.30636167526245117, "learning_rate": 1.6889781949720632e-05, "loss": 1.251, "step": 9405 }, { "epoch": 2.801541354083285, "grad_norm": 0.2866518199443817, "learning_rate": 1.6889082817264982e-05, "loss": 1.2559, "step": 9406 }, { "epoch": 2.801839200282954, "grad_norm": 0.2870074212551117, "learning_rate": 1.6888383620714112e-05, "loss": 1.2558, "step": 9407 }, { "epoch": 2.8021370464826223, "grad_norm": 0.26505526900291443, "learning_rate": 1.688768436007453e-05, "loss": 1.2554, "step": 9408 }, { "epoch": 2.802434892682291, "grad_norm": 0.2759658396244049, "learning_rate": 1.688698503535274e-05, "loss": 1.2309, "step": 9409 }, { "epoch": 2.80273273888196, "grad_norm": 0.24766267836093903, "learning_rate": 1.6886285646555248e-05, "loss": 1.2385, "step": 9410 }, { "epoch": 2.8030305850816286, "grad_norm": 0.2621782124042511, "learning_rate": 1.6885586193688562e-05, "loss": 1.2417, "step": 9411 }, { "epoch": 2.8033284312812974, "grad_norm": 0.24561628699302673, "learning_rate": 1.6884886676759193e-05, "loss": 1.2317, "step": 9412 }, { "epoch": 2.8036262774809657, "grad_norm": 0.24416610598564148, "learning_rate": 1.6884187095773644e-05, "loss": 1.243, "step": 9413 }, { "epoch": 2.8039241236806345, "grad_norm": 0.24400001764297485, "learning_rate": 1.688348745073842e-05, "loss": 1.2522, "step": 9414 }, { "epoch": 2.8042219698803033, "grad_norm": 0.24346937239170074, "learning_rate": 1.6882787741660044e-05, "loss": 1.2334, "step": 9415 }, { "epoch": 2.8045198160799716, "grad_norm": 0.22662796080112457, "learning_rate": 1.6882087968545014e-05, "loss": 1.2643, "step": 9416 }, { "epoch": 2.8048176622796404, "grad_norm": 0.2588282823562622, "learning_rate": 1.6881388131399846e-05, "loss": 1.2569, "step": 9417 }, { "epoch": 2.805115508479309, "grad_norm": 0.2453807145357132, "learning_rate": 1.688068823023105e-05, "loss": 1.2205, "step": 9418 }, { "epoch": 2.8054133546789775, "grad_norm": 0.2423257827758789, "learning_rate": 1.687998826504514e-05, "loss": 1.2279, "step": 9419 }, { "epoch": 2.8057112008786462, "grad_norm": 0.24520215392112732, "learning_rate": 1.6879288235848616e-05, "loss": 1.2503, "step": 9420 }, { "epoch": 2.806009047078315, "grad_norm": 0.23110315203666687, "learning_rate": 1.687858814264801e-05, "loss": 1.2406, "step": 9421 }, { "epoch": 2.8063068932779833, "grad_norm": 0.23562389612197876, "learning_rate": 1.6877887985449824e-05, "loss": 1.2335, "step": 9422 }, { "epoch": 2.806604739477652, "grad_norm": 0.23645806312561035, "learning_rate": 1.6877187764260575e-05, "loss": 1.2421, "step": 9423 }, { "epoch": 2.806902585677321, "grad_norm": 0.2370063066482544, "learning_rate": 1.6876487479086776e-05, "loss": 1.2576, "step": 9424 }, { "epoch": 2.8072004318769896, "grad_norm": 0.2854611575603485, "learning_rate": 1.6875787129934945e-05, "loss": 1.2474, "step": 9425 }, { "epoch": 2.8074982780766584, "grad_norm": 0.2618495523929596, "learning_rate": 1.6875086716811598e-05, "loss": 1.245, "step": 9426 }, { "epoch": 2.8077961242763267, "grad_norm": 0.24259112775325775, "learning_rate": 1.6874386239723247e-05, "loss": 1.2327, "step": 9427 }, { "epoch": 2.8080939704759955, "grad_norm": 0.23218576610088348, "learning_rate": 1.6873685698676417e-05, "loss": 1.2481, "step": 9428 }, { "epoch": 2.8083918166756643, "grad_norm": 0.29377731680870056, "learning_rate": 1.6872985093677617e-05, "loss": 1.2475, "step": 9429 }, { "epoch": 2.8086896628753326, "grad_norm": 0.29341045022010803, "learning_rate": 1.6872284424733373e-05, "loss": 1.2406, "step": 9430 }, { "epoch": 2.8089875090750014, "grad_norm": 0.2425510734319687, "learning_rate": 1.6871583691850202e-05, "loss": 1.2419, "step": 9431 }, { "epoch": 2.80928535527467, "grad_norm": 0.2322937250137329, "learning_rate": 1.6870882895034618e-05, "loss": 1.2328, "step": 9432 }, { "epoch": 2.8095832014743385, "grad_norm": 0.2481030374765396, "learning_rate": 1.687018203429315e-05, "loss": 1.2613, "step": 9433 }, { "epoch": 2.8098810476740073, "grad_norm": 0.2392769157886505, "learning_rate": 1.6869481109632308e-05, "loss": 1.2473, "step": 9434 }, { "epoch": 2.810178893873676, "grad_norm": 0.23280951380729675, "learning_rate": 1.6868780121058622e-05, "loss": 1.2301, "step": 9435 }, { "epoch": 2.8104767400733444, "grad_norm": 0.226578027009964, "learning_rate": 1.6868079068578614e-05, "loss": 1.2368, "step": 9436 }, { "epoch": 2.810774586273013, "grad_norm": 0.23894715309143066, "learning_rate": 1.6867377952198797e-05, "loss": 1.2347, "step": 9437 }, { "epoch": 2.811072432472682, "grad_norm": 0.26301658153533936, "learning_rate": 1.6866676771925706e-05, "loss": 1.2508, "step": 9438 }, { "epoch": 2.8113702786723507, "grad_norm": 0.274747371673584, "learning_rate": 1.686597552776586e-05, "loss": 1.2224, "step": 9439 }, { "epoch": 2.8116681248720194, "grad_norm": 0.26488688588142395, "learning_rate": 1.686527421972578e-05, "loss": 1.2567, "step": 9440 }, { "epoch": 2.8119659710716878, "grad_norm": 0.24709691107273102, "learning_rate": 1.6864572847811995e-05, "loss": 1.2406, "step": 9441 }, { "epoch": 2.8122638172713565, "grad_norm": 0.25358718633651733, "learning_rate": 1.686387141203103e-05, "loss": 1.2619, "step": 9442 }, { "epoch": 2.8125616634710253, "grad_norm": 0.450490802526474, "learning_rate": 1.686316991238941e-05, "loss": 1.2303, "step": 9443 }, { "epoch": 2.8128595096706936, "grad_norm": 0.3292935788631439, "learning_rate": 1.686246834889366e-05, "loss": 1.2535, "step": 9444 }, { "epoch": 2.8131573558703624, "grad_norm": 0.2989583909511566, "learning_rate": 1.686176672155031e-05, "loss": 1.2321, "step": 9445 }, { "epoch": 2.813455202070031, "grad_norm": 0.2783900797367096, "learning_rate": 1.686106503036589e-05, "loss": 1.2499, "step": 9446 }, { "epoch": 2.8137530482696995, "grad_norm": 0.5152345895767212, "learning_rate": 1.6860363275346922e-05, "loss": 1.2637, "step": 9447 }, { "epoch": 2.8140508944693683, "grad_norm": 0.2280631810426712, "learning_rate": 1.685966145649994e-05, "loss": 1.2355, "step": 9448 }, { "epoch": 2.814348740669037, "grad_norm": 0.2405257523059845, "learning_rate": 1.685895957383147e-05, "loss": 1.2655, "step": 9449 }, { "epoch": 2.814646586868706, "grad_norm": 0.23572790622711182, "learning_rate": 1.685825762734805e-05, "loss": 1.2391, "step": 9450 }, { "epoch": 2.814944433068374, "grad_norm": 0.7869846224784851, "learning_rate": 1.68575556170562e-05, "loss": 1.2549, "step": 9451 }, { "epoch": 2.815242279268043, "grad_norm": 0.23556984961032867, "learning_rate": 1.685685354296246e-05, "loss": 1.2379, "step": 9452 }, { "epoch": 2.8155401254677117, "grad_norm": 0.24267776310443878, "learning_rate": 1.6856151405073357e-05, "loss": 1.2398, "step": 9453 }, { "epoch": 2.8158379716673805, "grad_norm": 0.24611473083496094, "learning_rate": 1.6855449203395425e-05, "loss": 1.2371, "step": 9454 }, { "epoch": 2.816135817867049, "grad_norm": 0.23646798729896545, "learning_rate": 1.6854746937935197e-05, "loss": 1.262, "step": 9455 }, { "epoch": 2.8164336640667176, "grad_norm": 0.24050743877887726, "learning_rate": 1.6854044608699206e-05, "loss": 1.256, "step": 9456 }, { "epoch": 2.8167315102663864, "grad_norm": 0.2404586374759674, "learning_rate": 1.6853342215693994e-05, "loss": 1.2494, "step": 9457 }, { "epoch": 2.8170293564660547, "grad_norm": 0.23830142617225647, "learning_rate": 1.6852639758926086e-05, "loss": 1.2421, "step": 9458 }, { "epoch": 2.8173272026657235, "grad_norm": 0.2322382777929306, "learning_rate": 1.685193723840202e-05, "loss": 1.2447, "step": 9459 }, { "epoch": 2.8176250488653922, "grad_norm": 0.23787865042686462, "learning_rate": 1.6851234654128334e-05, "loss": 1.2381, "step": 9460 }, { "epoch": 2.8179228950650606, "grad_norm": 0.2354193925857544, "learning_rate": 1.6850532006111568e-05, "loss": 1.242, "step": 9461 }, { "epoch": 2.8182207412647293, "grad_norm": 0.23611146211624146, "learning_rate": 1.684982929435825e-05, "loss": 1.2408, "step": 9462 }, { "epoch": 2.818518587464398, "grad_norm": 0.23927001655101776, "learning_rate": 1.6849126518874923e-05, "loss": 1.2625, "step": 9463 }, { "epoch": 2.818816433664067, "grad_norm": 0.22399203479290009, "learning_rate": 1.684842367966813e-05, "loss": 1.2658, "step": 9464 }, { "epoch": 2.8191142798637356, "grad_norm": 0.24528461694717407, "learning_rate": 1.6847720776744404e-05, "loss": 1.2563, "step": 9465 }, { "epoch": 2.819412126063404, "grad_norm": 0.23209093511104584, "learning_rate": 1.684701781011029e-05, "loss": 1.2479, "step": 9466 }, { "epoch": 2.8197099722630727, "grad_norm": 0.2589212656021118, "learning_rate": 1.684631477977232e-05, "loss": 1.2366, "step": 9467 }, { "epoch": 2.8200078184627415, "grad_norm": 0.22971342504024506, "learning_rate": 1.6845611685737044e-05, "loss": 1.2532, "step": 9468 }, { "epoch": 2.82030566466241, "grad_norm": 0.2506147623062134, "learning_rate": 1.6844908528011e-05, "loss": 1.236, "step": 9469 }, { "epoch": 2.8206035108620786, "grad_norm": 0.2360735684633255, "learning_rate": 1.6844205306600727e-05, "loss": 1.2402, "step": 9470 }, { "epoch": 2.8209013570617474, "grad_norm": 0.23734350502490997, "learning_rate": 1.6843502021512774e-05, "loss": 1.2565, "step": 9471 }, { "epoch": 2.8211992032614157, "grad_norm": 0.2421860247850418, "learning_rate": 1.6842798672753677e-05, "loss": 1.2343, "step": 9472 }, { "epoch": 2.8214970494610845, "grad_norm": 0.23866495490074158, "learning_rate": 1.6842095260329988e-05, "loss": 1.2541, "step": 9473 }, { "epoch": 2.8217948956607533, "grad_norm": 0.26501211524009705, "learning_rate": 1.6841391784248246e-05, "loss": 1.2583, "step": 9474 }, { "epoch": 2.8220927418604216, "grad_norm": 0.22950270771980286, "learning_rate": 1.684068824451499e-05, "loss": 1.2452, "step": 9475 }, { "epoch": 2.8223905880600904, "grad_norm": 0.24660466611385345, "learning_rate": 1.683998464113678e-05, "loss": 1.2573, "step": 9476 }, { "epoch": 2.822688434259759, "grad_norm": 0.23101870715618134, "learning_rate": 1.6839280974120153e-05, "loss": 1.2362, "step": 9477 }, { "epoch": 2.822986280459428, "grad_norm": 0.29405537247657776, "learning_rate": 1.6838577243471657e-05, "loss": 1.2485, "step": 9478 }, { "epoch": 2.8232841266590967, "grad_norm": 0.3243562877178192, "learning_rate": 1.683787344919784e-05, "loss": 1.2392, "step": 9479 }, { "epoch": 2.823581972858765, "grad_norm": 0.23325003683567047, "learning_rate": 1.6837169591305254e-05, "loss": 1.252, "step": 9480 }, { "epoch": 2.8238798190584338, "grad_norm": 0.2814958691596985, "learning_rate": 1.6836465669800442e-05, "loss": 1.2445, "step": 9481 }, { "epoch": 2.8241776652581025, "grad_norm": 0.2902512550354004, "learning_rate": 1.6835761684689954e-05, "loss": 1.2623, "step": 9482 }, { "epoch": 2.824475511457771, "grad_norm": 0.24863779544830322, "learning_rate": 1.683505763598034e-05, "loss": 1.2478, "step": 9483 }, { "epoch": 2.8247733576574396, "grad_norm": 0.63090580701828, "learning_rate": 1.6834353523678154e-05, "loss": 1.2607, "step": 9484 }, { "epoch": 2.8250712038571084, "grad_norm": 0.2958478629589081, "learning_rate": 1.683364934778994e-05, "loss": 1.2441, "step": 9485 }, { "epoch": 2.8253690500567767, "grad_norm": 0.2794075608253479, "learning_rate": 1.683294510832226e-05, "loss": 1.2454, "step": 9486 }, { "epoch": 2.8256668962564455, "grad_norm": 0.24498534202575684, "learning_rate": 1.6832240805281654e-05, "loss": 1.2481, "step": 9487 }, { "epoch": 2.8259647424561143, "grad_norm": 0.23207266628742218, "learning_rate": 1.6831536438674685e-05, "loss": 1.2408, "step": 9488 }, { "epoch": 2.8262625886557826, "grad_norm": 0.24805179238319397, "learning_rate": 1.68308320085079e-05, "loss": 1.2397, "step": 9489 }, { "epoch": 2.8265604348554514, "grad_norm": 0.24835854768753052, "learning_rate": 1.6830127514787856e-05, "loss": 1.2409, "step": 9490 }, { "epoch": 2.82685828105512, "grad_norm": 0.25730326771736145, "learning_rate": 1.6829422957521106e-05, "loss": 1.2664, "step": 9491 }, { "epoch": 2.827156127254789, "grad_norm": 0.2531678378582001, "learning_rate": 1.6828718336714204e-05, "loss": 1.2582, "step": 9492 }, { "epoch": 2.8274539734544577, "grad_norm": 0.2367067188024521, "learning_rate": 1.682801365237371e-05, "loss": 1.2395, "step": 9493 }, { "epoch": 2.827751819654126, "grad_norm": 0.2317972034215927, "learning_rate": 1.6827308904506175e-05, "loss": 1.2383, "step": 9494 }, { "epoch": 2.828049665853795, "grad_norm": 0.2436864972114563, "learning_rate": 1.6826604093118164e-05, "loss": 1.2677, "step": 9495 }, { "epoch": 2.8283475120534636, "grad_norm": 0.24664410948753357, "learning_rate": 1.6825899218216224e-05, "loss": 1.2388, "step": 9496 }, { "epoch": 2.828645358253132, "grad_norm": 0.23872210085391998, "learning_rate": 1.682519427980692e-05, "loss": 1.258, "step": 9497 }, { "epoch": 2.8289432044528007, "grad_norm": 0.23003363609313965, "learning_rate": 1.6824489277896807e-05, "loss": 1.2481, "step": 9498 }, { "epoch": 2.8292410506524694, "grad_norm": 0.22335799038410187, "learning_rate": 1.6823784212492448e-05, "loss": 1.2492, "step": 9499 }, { "epoch": 2.8295388968521378, "grad_norm": 0.23637713491916656, "learning_rate": 1.68230790836004e-05, "loss": 1.2483, "step": 9500 }, { "epoch": 2.8295388968521378, "eval_loss": 1.3350661993026733, "eval_runtime": 20.7853, "eval_samples_per_second": 83.424, "eval_steps_per_second": 5.244, "step": 9500 }, { "epoch": 2.8298367430518065, "grad_norm": 0.23937050998210907, "learning_rate": 1.6822373891227223e-05, "loss": 1.2397, "step": 9501 }, { "epoch": 2.8301345892514753, "grad_norm": 0.23899784684181213, "learning_rate": 1.6821668635379486e-05, "loss": 1.2504, "step": 9502 }, { "epoch": 2.8304324354511436, "grad_norm": 0.2262616753578186, "learning_rate": 1.6820963316063733e-05, "loss": 1.2707, "step": 9503 }, { "epoch": 2.8307302816508124, "grad_norm": 0.23564161360263824, "learning_rate": 1.6820257933286544e-05, "loss": 1.2389, "step": 9504 }, { "epoch": 2.831028127850481, "grad_norm": 0.22219909727573395, "learning_rate": 1.6819552487054474e-05, "loss": 1.2446, "step": 9505 }, { "epoch": 2.83132597405015, "grad_norm": 0.23048289120197296, "learning_rate": 1.6818846977374087e-05, "loss": 1.228, "step": 9506 }, { "epoch": 2.8316238202498187, "grad_norm": 0.24245646595954895, "learning_rate": 1.681814140425195e-05, "loss": 1.2585, "step": 9507 }, { "epoch": 2.831921666449487, "grad_norm": 0.23912543058395386, "learning_rate": 1.6817435767694622e-05, "loss": 1.2476, "step": 9508 }, { "epoch": 2.832219512649156, "grad_norm": 0.23558421432971954, "learning_rate": 1.681673006770867e-05, "loss": 1.2515, "step": 9509 }, { "epoch": 2.8325173588488246, "grad_norm": 0.23728139698505402, "learning_rate": 1.681602430430066e-05, "loss": 1.2538, "step": 9510 }, { "epoch": 2.832815205048493, "grad_norm": 0.24715609848499298, "learning_rate": 1.6815318477477166e-05, "loss": 1.2464, "step": 9511 }, { "epoch": 2.8331130512481617, "grad_norm": 0.2315111607313156, "learning_rate": 1.6814612587244743e-05, "loss": 1.24, "step": 9512 }, { "epoch": 2.8334108974478305, "grad_norm": 0.22887147963047028, "learning_rate": 1.6813906633609964e-05, "loss": 1.2469, "step": 9513 }, { "epoch": 2.833708743647499, "grad_norm": 0.23694492876529694, "learning_rate": 1.6813200616579396e-05, "loss": 1.2523, "step": 9514 }, { "epoch": 2.8340065898471676, "grad_norm": 0.2390718162059784, "learning_rate": 1.681249453615961e-05, "loss": 1.2363, "step": 9515 }, { "epoch": 2.8343044360468364, "grad_norm": 0.23013268411159515, "learning_rate": 1.6811788392357175e-05, "loss": 1.2423, "step": 9516 }, { "epoch": 2.834602282246505, "grad_norm": 0.23531004786491394, "learning_rate": 1.6811082185178658e-05, "loss": 1.2487, "step": 9517 }, { "epoch": 2.8349001284461735, "grad_norm": 0.24330763518810272, "learning_rate": 1.681037591463063e-05, "loss": 1.2301, "step": 9518 }, { "epoch": 2.8351979746458422, "grad_norm": 0.24473343789577484, "learning_rate": 1.6809669580719664e-05, "loss": 1.2649, "step": 9519 }, { "epoch": 2.835495820845511, "grad_norm": 0.22644692659378052, "learning_rate": 1.680896318345233e-05, "loss": 1.2358, "step": 9520 }, { "epoch": 2.8357936670451798, "grad_norm": 0.23970761895179749, "learning_rate": 1.6808256722835202e-05, "loss": 1.2454, "step": 9521 }, { "epoch": 2.836091513244848, "grad_norm": 0.23349694907665253, "learning_rate": 1.680755019887485e-05, "loss": 1.2553, "step": 9522 }, { "epoch": 2.836389359444517, "grad_norm": 0.2522037923336029, "learning_rate": 1.680684361157785e-05, "loss": 1.2487, "step": 9523 }, { "epoch": 2.8366872056441856, "grad_norm": 0.24215124547481537, "learning_rate": 1.6806136960950778e-05, "loss": 1.2278, "step": 9524 }, { "epoch": 2.836985051843854, "grad_norm": 0.23304861783981323, "learning_rate": 1.6805430247000203e-05, "loss": 1.2435, "step": 9525 }, { "epoch": 2.8372828980435227, "grad_norm": 0.2447548359632492, "learning_rate": 1.6804723469732704e-05, "loss": 1.2606, "step": 9526 }, { "epoch": 2.8375807442431915, "grad_norm": 0.23060426115989685, "learning_rate": 1.6804016629154854e-05, "loss": 1.2403, "step": 9527 }, { "epoch": 2.83787859044286, "grad_norm": 0.22644715011119843, "learning_rate": 1.6803309725273235e-05, "loss": 1.2252, "step": 9528 }, { "epoch": 2.8381764366425286, "grad_norm": 0.2479182928800583, "learning_rate": 1.6802602758094416e-05, "loss": 1.2555, "step": 9529 }, { "epoch": 2.8384742828421974, "grad_norm": 0.24726583063602448, "learning_rate": 1.680189572762498e-05, "loss": 1.24, "step": 9530 }, { "epoch": 2.838772129041866, "grad_norm": 0.24094510078430176, "learning_rate": 1.6801188633871507e-05, "loss": 1.254, "step": 9531 }, { "epoch": 2.839069975241535, "grad_norm": 0.23302826285362244, "learning_rate": 1.680048147684057e-05, "loss": 1.2621, "step": 9532 }, { "epoch": 2.8393678214412033, "grad_norm": 0.22267597913742065, "learning_rate": 1.679977425653875e-05, "loss": 1.2357, "step": 9533 }, { "epoch": 2.839665667640872, "grad_norm": 0.23312880098819733, "learning_rate": 1.6799066972972628e-05, "loss": 1.2624, "step": 9534 }, { "epoch": 2.839963513840541, "grad_norm": 0.22978711128234863, "learning_rate": 1.6798359626148787e-05, "loss": 1.2534, "step": 9535 }, { "epoch": 2.840261360040209, "grad_norm": 0.24274848401546478, "learning_rate": 1.67976522160738e-05, "loss": 1.2629, "step": 9536 }, { "epoch": 2.840559206239878, "grad_norm": 0.23926518857479095, "learning_rate": 1.679694474275426e-05, "loss": 1.2362, "step": 9537 }, { "epoch": 2.8408570524395467, "grad_norm": 0.23229800164699554, "learning_rate": 1.679623720619674e-05, "loss": 1.244, "step": 9538 }, { "epoch": 2.841154898639215, "grad_norm": 0.22292126715183258, "learning_rate": 1.6795529606407822e-05, "loss": 1.2394, "step": 9539 }, { "epoch": 2.8414527448388838, "grad_norm": 0.2399204522371292, "learning_rate": 1.6794821943394097e-05, "loss": 1.2399, "step": 9540 }, { "epoch": 2.8417505910385525, "grad_norm": 0.23259685933589935, "learning_rate": 1.6794114217162145e-05, "loss": 1.2324, "step": 9541 }, { "epoch": 2.842048437238221, "grad_norm": 0.22789466381072998, "learning_rate": 1.6793406427718554e-05, "loss": 1.2245, "step": 9542 }, { "epoch": 2.8423462834378896, "grad_norm": 0.23625433444976807, "learning_rate": 1.6792698575069906e-05, "loss": 1.2465, "step": 9543 }, { "epoch": 2.8426441296375584, "grad_norm": 0.23433490097522736, "learning_rate": 1.6791990659222782e-05, "loss": 1.235, "step": 9544 }, { "epoch": 2.842941975837227, "grad_norm": 0.24511510133743286, "learning_rate": 1.6791282680183778e-05, "loss": 1.242, "step": 9545 }, { "epoch": 2.843239822036896, "grad_norm": 0.23765471577644348, "learning_rate": 1.6790574637959473e-05, "loss": 1.2391, "step": 9546 }, { "epoch": 2.8435376682365643, "grad_norm": 0.23262514173984528, "learning_rate": 1.6789866532556462e-05, "loss": 1.2363, "step": 9547 }, { "epoch": 2.843835514436233, "grad_norm": 0.23182271420955658, "learning_rate": 1.6789158363981327e-05, "loss": 1.2435, "step": 9548 }, { "epoch": 2.844133360635902, "grad_norm": 0.247736856341362, "learning_rate": 1.6788450132240658e-05, "loss": 1.2321, "step": 9549 }, { "epoch": 2.84443120683557, "grad_norm": 0.24930934607982635, "learning_rate": 1.678774183734104e-05, "loss": 1.2445, "step": 9550 }, { "epoch": 2.844729053035239, "grad_norm": 0.24114583432674408, "learning_rate": 1.6787033479289077e-05, "loss": 1.2331, "step": 9551 }, { "epoch": 2.8450268992349077, "grad_norm": 0.22924873232841492, "learning_rate": 1.6786325058091346e-05, "loss": 1.238, "step": 9552 }, { "epoch": 2.845324745434576, "grad_norm": 0.25451821088790894, "learning_rate": 1.6785616573754444e-05, "loss": 1.2356, "step": 9553 }, { "epoch": 2.845622591634245, "grad_norm": 0.23745770752429962, "learning_rate": 1.6784908026284958e-05, "loss": 1.2539, "step": 9554 }, { "epoch": 2.8459204378339136, "grad_norm": 0.25693124532699585, "learning_rate": 1.6784199415689488e-05, "loss": 1.2456, "step": 9555 }, { "epoch": 2.846218284033582, "grad_norm": 0.2471061795949936, "learning_rate": 1.678349074197462e-05, "loss": 1.2238, "step": 9556 }, { "epoch": 2.8465161302332507, "grad_norm": 0.2600265443325043, "learning_rate": 1.678278200514695e-05, "loss": 1.2429, "step": 9557 }, { "epoch": 2.8468139764329194, "grad_norm": 0.23116445541381836, "learning_rate": 1.678207320521307e-05, "loss": 1.2502, "step": 9558 }, { "epoch": 2.847111822632588, "grad_norm": 0.2419513314962387, "learning_rate": 1.678136434217958e-05, "loss": 1.2478, "step": 9559 }, { "epoch": 2.847409668832257, "grad_norm": 0.23644323647022247, "learning_rate": 1.6780655416053067e-05, "loss": 1.2479, "step": 9560 }, { "epoch": 2.8477075150319253, "grad_norm": 0.23632511496543884, "learning_rate": 1.6779946426840134e-05, "loss": 1.2474, "step": 9561 }, { "epoch": 2.848005361231594, "grad_norm": 0.22760628163814545, "learning_rate": 1.677923737454737e-05, "loss": 1.2512, "step": 9562 }, { "epoch": 2.848303207431263, "grad_norm": 0.2346828281879425, "learning_rate": 1.677852825918138e-05, "loss": 1.2428, "step": 9563 }, { "epoch": 2.848601053630931, "grad_norm": 0.2310081124305725, "learning_rate": 1.6777819080748758e-05, "loss": 1.2424, "step": 9564 }, { "epoch": 2.8488988998306, "grad_norm": 0.23505978286266327, "learning_rate": 1.67771098392561e-05, "loss": 1.2535, "step": 9565 }, { "epoch": 2.8491967460302687, "grad_norm": 0.22905951738357544, "learning_rate": 1.677640053471001e-05, "loss": 1.2335, "step": 9566 }, { "epoch": 2.849494592229937, "grad_norm": 0.23601317405700684, "learning_rate": 1.677569116711708e-05, "loss": 1.2329, "step": 9567 }, { "epoch": 2.849792438429606, "grad_norm": 0.2386091947555542, "learning_rate": 1.677498173648392e-05, "loss": 1.2542, "step": 9568 }, { "epoch": 2.8500902846292746, "grad_norm": 0.24330885708332062, "learning_rate": 1.6774272242817122e-05, "loss": 1.2576, "step": 9569 }, { "epoch": 2.850388130828943, "grad_norm": 0.25018224120140076, "learning_rate": 1.6773562686123285e-05, "loss": 1.2348, "step": 9570 }, { "epoch": 2.8506859770286117, "grad_norm": 0.24150174856185913, "learning_rate": 1.677285306640902e-05, "loss": 1.2293, "step": 9571 }, { "epoch": 2.8509838232282805, "grad_norm": 0.31612199544906616, "learning_rate": 1.677214338368092e-05, "loss": 1.2493, "step": 9572 }, { "epoch": 2.8512816694279493, "grad_norm": 0.2653801441192627, "learning_rate": 1.67714336379456e-05, "loss": 1.25, "step": 9573 }, { "epoch": 2.851579515627618, "grad_norm": 0.24127154052257538, "learning_rate": 1.6770723829209648e-05, "loss": 1.2442, "step": 9574 }, { "epoch": 2.8518773618272864, "grad_norm": 0.2629128694534302, "learning_rate": 1.6770013957479677e-05, "loss": 1.2571, "step": 9575 }, { "epoch": 2.852175208026955, "grad_norm": 0.23995302617549896, "learning_rate": 1.6769304022762292e-05, "loss": 1.2426, "step": 9576 }, { "epoch": 2.852473054226624, "grad_norm": 0.228716641664505, "learning_rate": 1.6768594025064095e-05, "loss": 1.2414, "step": 9577 }, { "epoch": 2.8527709004262922, "grad_norm": 0.23739491403102875, "learning_rate": 1.6767883964391692e-05, "loss": 1.2336, "step": 9578 }, { "epoch": 2.853068746625961, "grad_norm": 0.26312458515167236, "learning_rate": 1.676717384075169e-05, "loss": 1.2566, "step": 9579 }, { "epoch": 2.8533665928256298, "grad_norm": 0.24080444872379303, "learning_rate": 1.67664636541507e-05, "loss": 1.2411, "step": 9580 }, { "epoch": 2.853664439025298, "grad_norm": 0.2274615615606308, "learning_rate": 1.6765753404595322e-05, "loss": 1.2481, "step": 9581 }, { "epoch": 2.853962285224967, "grad_norm": 0.2274451106786728, "learning_rate": 1.6765043092092167e-05, "loss": 1.2453, "step": 9582 }, { "epoch": 2.8542601314246356, "grad_norm": 0.23121044039726257, "learning_rate": 1.676433271664785e-05, "loss": 1.2562, "step": 9583 }, { "epoch": 2.8545579776243044, "grad_norm": 0.23248712718486786, "learning_rate": 1.6763622278268968e-05, "loss": 1.2601, "step": 9584 }, { "epoch": 2.8548558238239727, "grad_norm": 0.24338187277317047, "learning_rate": 1.676291177696214e-05, "loss": 1.2459, "step": 9585 }, { "epoch": 2.8551536700236415, "grad_norm": 0.2362908124923706, "learning_rate": 1.6762201212733974e-05, "loss": 1.2413, "step": 9586 }, { "epoch": 2.8554515162233103, "grad_norm": 0.3114035427570343, "learning_rate": 1.676149058559108e-05, "loss": 1.2266, "step": 9587 }, { "epoch": 2.855749362422979, "grad_norm": 0.30946192145347595, "learning_rate": 1.676077989554007e-05, "loss": 1.249, "step": 9588 }, { "epoch": 2.8560472086226474, "grad_norm": 0.2479553520679474, "learning_rate": 1.6760069142587562e-05, "loss": 1.2431, "step": 9589 }, { "epoch": 2.856345054822316, "grad_norm": 0.5116506218910217, "learning_rate": 1.6759358326740157e-05, "loss": 1.2438, "step": 9590 }, { "epoch": 2.856642901021985, "grad_norm": 0.3225812017917633, "learning_rate": 1.675864744800448e-05, "loss": 1.2424, "step": 9591 }, { "epoch": 2.8569407472216533, "grad_norm": 0.2772776782512665, "learning_rate": 1.6757936506387134e-05, "loss": 1.2536, "step": 9592 }, { "epoch": 2.857238593421322, "grad_norm": 0.238128662109375, "learning_rate": 1.6757225501894744e-05, "loss": 1.2472, "step": 9593 }, { "epoch": 2.857536439620991, "grad_norm": 0.23039375245571136, "learning_rate": 1.6756514434533915e-05, "loss": 1.2291, "step": 9594 }, { "epoch": 2.857834285820659, "grad_norm": 0.3229292035102844, "learning_rate": 1.6755803304311272e-05, "loss": 1.2391, "step": 9595 }, { "epoch": 2.858132132020328, "grad_norm": 0.25468021631240845, "learning_rate": 1.675509211123343e-05, "loss": 1.2454, "step": 9596 }, { "epoch": 2.8584299782199967, "grad_norm": 0.22903305292129517, "learning_rate": 1.6754380855306998e-05, "loss": 1.2355, "step": 9597 }, { "epoch": 2.8587278244196654, "grad_norm": 0.23688064515590668, "learning_rate": 1.67536695365386e-05, "loss": 1.2426, "step": 9598 }, { "epoch": 2.859025670619334, "grad_norm": 0.2441447377204895, "learning_rate": 1.6752958154934854e-05, "loss": 1.2289, "step": 9599 }, { "epoch": 2.8593235168190025, "grad_norm": 0.2363729327917099, "learning_rate": 1.6752246710502377e-05, "loss": 1.2306, "step": 9600 }, { "epoch": 2.8596213630186713, "grad_norm": 0.25469234585762024, "learning_rate": 1.6751535203247785e-05, "loss": 1.2572, "step": 9601 }, { "epoch": 2.85991920921834, "grad_norm": 0.23895514011383057, "learning_rate": 1.6750823633177703e-05, "loss": 1.2399, "step": 9602 }, { "epoch": 2.8602170554180084, "grad_norm": 0.2803517282009125, "learning_rate": 1.6750112000298752e-05, "loss": 1.2436, "step": 9603 }, { "epoch": 2.860514901617677, "grad_norm": 0.2815166115760803, "learning_rate": 1.674940030461755e-05, "loss": 1.235, "step": 9604 }, { "epoch": 2.860812747817346, "grad_norm": 0.2441072165966034, "learning_rate": 1.6748688546140717e-05, "loss": 1.2388, "step": 9605 }, { "epoch": 2.8611105940170143, "grad_norm": 0.25432687997817993, "learning_rate": 1.674797672487488e-05, "loss": 1.2639, "step": 9606 }, { "epoch": 2.861408440216683, "grad_norm": 0.26335468888282776, "learning_rate": 1.674726484082666e-05, "loss": 1.2202, "step": 9607 }, { "epoch": 2.861706286416352, "grad_norm": 0.2342562973499298, "learning_rate": 1.674655289400267e-05, "loss": 1.2534, "step": 9608 }, { "epoch": 2.86200413261602, "grad_norm": 0.294867604970932, "learning_rate": 1.674584088440955e-05, "loss": 1.2438, "step": 9609 }, { "epoch": 2.862301978815689, "grad_norm": 0.23600003123283386, "learning_rate": 1.674512881205392e-05, "loss": 1.2456, "step": 9610 }, { "epoch": 2.8625998250153577, "grad_norm": 0.2641909718513489, "learning_rate": 1.6744416676942398e-05, "loss": 1.2253, "step": 9611 }, { "epoch": 2.8628976712150265, "grad_norm": 0.24446533620357513, "learning_rate": 1.6743704479081616e-05, "loss": 1.2466, "step": 9612 }, { "epoch": 2.8631955174146952, "grad_norm": 0.25604259967803955, "learning_rate": 1.6742992218478197e-05, "loss": 1.2358, "step": 9613 }, { "epoch": 2.8634933636143636, "grad_norm": 0.25585129857063293, "learning_rate": 1.674227989513877e-05, "loss": 1.2427, "step": 9614 }, { "epoch": 2.8637912098140323, "grad_norm": 0.23947425186634064, "learning_rate": 1.674156750906996e-05, "loss": 1.2284, "step": 9615 }, { "epoch": 2.864089056013701, "grad_norm": 0.25628355145454407, "learning_rate": 1.6740855060278396e-05, "loss": 1.2357, "step": 9616 }, { "epoch": 2.8643869022133694, "grad_norm": 0.23210285604000092, "learning_rate": 1.674014254877071e-05, "loss": 1.2336, "step": 9617 }, { "epoch": 2.864684748413038, "grad_norm": 0.29729771614074707, "learning_rate": 1.6739429974553527e-05, "loss": 1.2457, "step": 9618 }, { "epoch": 2.864982594612707, "grad_norm": 0.24117979407310486, "learning_rate": 1.6738717337633478e-05, "loss": 1.2576, "step": 9619 }, { "epoch": 2.8652804408123753, "grad_norm": 0.25205516815185547, "learning_rate": 1.6738004638017192e-05, "loss": 1.2615, "step": 9620 }, { "epoch": 2.865578287012044, "grad_norm": 0.2526441514492035, "learning_rate": 1.6737291875711303e-05, "loss": 1.2409, "step": 9621 }, { "epoch": 2.865876133211713, "grad_norm": 0.26580315828323364, "learning_rate": 1.6736579050722438e-05, "loss": 1.2576, "step": 9622 }, { "epoch": 2.866173979411381, "grad_norm": 0.26286303997039795, "learning_rate": 1.6735866163057234e-05, "loss": 1.2408, "step": 9623 }, { "epoch": 2.86647182561105, "grad_norm": 0.25381582975387573, "learning_rate": 1.673515321272232e-05, "loss": 1.2359, "step": 9624 }, { "epoch": 2.8667696718107187, "grad_norm": 0.2711793780326843, "learning_rate": 1.6734440199724328e-05, "loss": 1.2412, "step": 9625 }, { "epoch": 2.8670675180103875, "grad_norm": 0.23535223305225372, "learning_rate": 1.6733727124069896e-05, "loss": 1.2361, "step": 9626 }, { "epoch": 2.8673653642100563, "grad_norm": 0.3429569900035858, "learning_rate": 1.6733013985765658e-05, "loss": 1.2486, "step": 9627 }, { "epoch": 2.8676632104097246, "grad_norm": 0.2517103850841522, "learning_rate": 1.6732300784818244e-05, "loss": 1.2309, "step": 9628 }, { "epoch": 2.8679610566093934, "grad_norm": 0.27148956060409546, "learning_rate": 1.6731587521234296e-05, "loss": 1.2542, "step": 9629 }, { "epoch": 2.868258902809062, "grad_norm": 0.2414814680814743, "learning_rate": 1.6730874195020447e-05, "loss": 1.2581, "step": 9630 }, { "epoch": 2.8685567490087305, "grad_norm": 0.35265201330184937, "learning_rate": 1.673016080618333e-05, "loss": 1.2499, "step": 9631 }, { "epoch": 2.8688545952083992, "grad_norm": 0.22770635783672333, "learning_rate": 1.672944735472959e-05, "loss": 1.2165, "step": 9632 }, { "epoch": 2.869152441408068, "grad_norm": 0.26756811141967773, "learning_rate": 1.672873384066586e-05, "loss": 1.2399, "step": 9633 }, { "epoch": 2.8694502876077363, "grad_norm": 0.24693620204925537, "learning_rate": 1.672802026399878e-05, "loss": 1.2505, "step": 9634 }, { "epoch": 2.869748133807405, "grad_norm": 0.36966007947921753, "learning_rate": 1.672730662473499e-05, "loss": 1.2275, "step": 9635 }, { "epoch": 2.870045980007074, "grad_norm": 0.2499130368232727, "learning_rate": 1.6726592922881124e-05, "loss": 1.2419, "step": 9636 }, { "epoch": 2.870343826206742, "grad_norm": 0.27664774656295776, "learning_rate": 1.6725879158443826e-05, "loss": 1.2279, "step": 9637 }, { "epoch": 2.870641672406411, "grad_norm": 0.2373204231262207, "learning_rate": 1.6725165331429743e-05, "loss": 1.2387, "step": 9638 }, { "epoch": 2.8709395186060798, "grad_norm": 0.3052253723144531, "learning_rate": 1.6724451441845502e-05, "loss": 1.2499, "step": 9639 }, { "epoch": 2.8712373648057485, "grad_norm": 0.2708706259727478, "learning_rate": 1.672373748969776e-05, "loss": 1.2603, "step": 9640 }, { "epoch": 2.8715352110054173, "grad_norm": 0.24858544766902924, "learning_rate": 1.672302347499315e-05, "loss": 1.2466, "step": 9641 }, { "epoch": 2.8718330572050856, "grad_norm": 0.298337459564209, "learning_rate": 1.6722309397738322e-05, "loss": 1.2356, "step": 9642 }, { "epoch": 2.8721309034047544, "grad_norm": 0.22833894193172455, "learning_rate": 1.672159525793991e-05, "loss": 1.2413, "step": 9643 }, { "epoch": 2.872428749604423, "grad_norm": 0.23819833993911743, "learning_rate": 1.6720881055604565e-05, "loss": 1.2289, "step": 9644 }, { "epoch": 2.8727265958040915, "grad_norm": 0.24357788264751434, "learning_rate": 1.6720166790738934e-05, "loss": 1.241, "step": 9645 }, { "epoch": 2.8730244420037603, "grad_norm": 0.26105859875679016, "learning_rate": 1.671945246334966e-05, "loss": 1.2417, "step": 9646 }, { "epoch": 2.873322288203429, "grad_norm": 0.22648964822292328, "learning_rate": 1.671873807344338e-05, "loss": 1.2446, "step": 9647 }, { "epoch": 2.8736201344030974, "grad_norm": 0.26460540294647217, "learning_rate": 1.6718023621026757e-05, "loss": 1.2576, "step": 9648 }, { "epoch": 2.873917980602766, "grad_norm": 0.25131869316101074, "learning_rate": 1.6717309106106426e-05, "loss": 1.2362, "step": 9649 }, { "epoch": 2.874215826802435, "grad_norm": 0.42317044734954834, "learning_rate": 1.671659452868904e-05, "loss": 1.2244, "step": 9650 }, { "epoch": 2.8745136730021037, "grad_norm": 0.2991759777069092, "learning_rate": 1.671587988878125e-05, "loss": 1.2495, "step": 9651 }, { "epoch": 2.874811519201772, "grad_norm": 0.3073268532752991, "learning_rate": 1.671516518638969e-05, "loss": 1.23, "step": 9652 }, { "epoch": 2.875109365401441, "grad_norm": 0.23857441544532776, "learning_rate": 1.671445042152103e-05, "loss": 1.2352, "step": 9653 }, { "epoch": 2.8754072116011096, "grad_norm": 0.5188261270523071, "learning_rate": 1.671373559418191e-05, "loss": 1.2468, "step": 9654 }, { "epoch": 2.8757050578007783, "grad_norm": 0.26395002007484436, "learning_rate": 1.671302070437898e-05, "loss": 1.2334, "step": 9655 }, { "epoch": 2.8760029040004467, "grad_norm": 0.24812112748622894, "learning_rate": 1.6712305752118894e-05, "loss": 1.2381, "step": 9656 }, { "epoch": 2.8763007502001154, "grad_norm": 0.24590472877025604, "learning_rate": 1.6711590737408297e-05, "loss": 1.2276, "step": 9657 }, { "epoch": 2.876598596399784, "grad_norm": 0.23740769922733307, "learning_rate": 1.6710875660253852e-05, "loss": 1.2413, "step": 9658 }, { "epoch": 2.8768964425994525, "grad_norm": 0.23263880610466003, "learning_rate": 1.6710160520662207e-05, "loss": 1.2511, "step": 9659 }, { "epoch": 2.8771942887991213, "grad_norm": 0.24080054461956024, "learning_rate": 1.6709445318640015e-05, "loss": 1.2656, "step": 9660 }, { "epoch": 2.87749213499879, "grad_norm": 0.2370612770318985, "learning_rate": 1.670873005419393e-05, "loss": 1.2597, "step": 9661 }, { "epoch": 2.8777899811984584, "grad_norm": 0.2382216602563858, "learning_rate": 1.6708014727330605e-05, "loss": 1.2182, "step": 9662 }, { "epoch": 2.878087827398127, "grad_norm": 0.23752722144126892, "learning_rate": 1.67072993380567e-05, "loss": 1.2448, "step": 9663 }, { "epoch": 2.878385673597796, "grad_norm": 0.23380430042743683, "learning_rate": 1.6706583886378868e-05, "loss": 1.2295, "step": 9664 }, { "epoch": 2.8786835197974647, "grad_norm": 0.2371366024017334, "learning_rate": 1.6705868372303765e-05, "loss": 1.2584, "step": 9665 }, { "epoch": 2.8789813659971335, "grad_norm": 0.238836407661438, "learning_rate": 1.670515279583805e-05, "loss": 1.2522, "step": 9666 }, { "epoch": 2.879279212196802, "grad_norm": 0.23018185794353485, "learning_rate": 1.670443715698838e-05, "loss": 1.2475, "step": 9667 }, { "epoch": 2.8795770583964706, "grad_norm": 0.23417013883590698, "learning_rate": 1.670372145576141e-05, "loss": 1.2362, "step": 9668 }, { "epoch": 2.8798749045961394, "grad_norm": 0.24360010027885437, "learning_rate": 1.6703005692163804e-05, "loss": 1.2639, "step": 9669 }, { "epoch": 2.8801727507958077, "grad_norm": 0.2241598516702652, "learning_rate": 1.670228986620222e-05, "loss": 1.244, "step": 9670 }, { "epoch": 2.8804705969954765, "grad_norm": 0.23797936737537384, "learning_rate": 1.6701573977883314e-05, "loss": 1.2357, "step": 9671 }, { "epoch": 2.8807684431951452, "grad_norm": 0.23201867938041687, "learning_rate": 1.6700858027213755e-05, "loss": 1.2564, "step": 9672 }, { "epoch": 2.8810662893948136, "grad_norm": 0.22328731417655945, "learning_rate": 1.6700142014200197e-05, "loss": 1.2547, "step": 9673 }, { "epoch": 2.8813641355944823, "grad_norm": 0.23917587101459503, "learning_rate": 1.66994259388493e-05, "loss": 1.2463, "step": 9674 }, { "epoch": 2.881661981794151, "grad_norm": 0.2283514440059662, "learning_rate": 1.669870980116773e-05, "loss": 1.2639, "step": 9675 }, { "epoch": 2.8819598279938194, "grad_norm": 0.23538215458393097, "learning_rate": 1.6697993601162152e-05, "loss": 1.234, "step": 9676 }, { "epoch": 2.882257674193488, "grad_norm": 0.23896881937980652, "learning_rate": 1.6697277338839227e-05, "loss": 1.2432, "step": 9677 }, { "epoch": 2.882555520393157, "grad_norm": 0.23404815793037415, "learning_rate": 1.6696561014205615e-05, "loss": 1.237, "step": 9678 }, { "epoch": 2.8828533665928258, "grad_norm": 0.22328928112983704, "learning_rate": 1.669584462726799e-05, "loss": 1.2493, "step": 9679 }, { "epoch": 2.8831512127924945, "grad_norm": 0.22693878412246704, "learning_rate": 1.669512817803301e-05, "loss": 1.2638, "step": 9680 }, { "epoch": 2.883449058992163, "grad_norm": 0.23578110337257385, "learning_rate": 1.6694411666507343e-05, "loss": 1.2557, "step": 9681 }, { "epoch": 2.8837469051918316, "grad_norm": 0.23133975267410278, "learning_rate": 1.669369509269765e-05, "loss": 1.2467, "step": 9682 }, { "epoch": 2.8840447513915004, "grad_norm": 0.2348644733428955, "learning_rate": 1.6692978456610607e-05, "loss": 1.2256, "step": 9683 }, { "epoch": 2.8843425975911687, "grad_norm": 0.22899694740772247, "learning_rate": 1.669226175825288e-05, "loss": 1.2205, "step": 9684 }, { "epoch": 2.8846404437908375, "grad_norm": 0.23254863917827606, "learning_rate": 1.669154499763113e-05, "loss": 1.2274, "step": 9685 }, { "epoch": 2.8849382899905063, "grad_norm": 0.24034923315048218, "learning_rate": 1.669082817475203e-05, "loss": 1.2576, "step": 9686 }, { "epoch": 2.8852361361901746, "grad_norm": 0.23759359121322632, "learning_rate": 1.6690111289622254e-05, "loss": 1.2472, "step": 9687 }, { "epoch": 2.8855339823898434, "grad_norm": 0.23265060782432556, "learning_rate": 1.668939434224846e-05, "loss": 1.2261, "step": 9688 }, { "epoch": 2.885831828589512, "grad_norm": 0.2382369488477707, "learning_rate": 1.668867733263733e-05, "loss": 1.2542, "step": 9689 }, { "epoch": 2.8861296747891805, "grad_norm": 0.22997981309890747, "learning_rate": 1.668796026079553e-05, "loss": 1.2596, "step": 9690 }, { "epoch": 2.8864275209888492, "grad_norm": 0.23344625532627106, "learning_rate": 1.668724312672973e-05, "loss": 1.2658, "step": 9691 }, { "epoch": 2.886725367188518, "grad_norm": 0.23847095668315887, "learning_rate": 1.668652593044661e-05, "loss": 1.2479, "step": 9692 }, { "epoch": 2.887023213388187, "grad_norm": 0.2525794506072998, "learning_rate": 1.6685808671952827e-05, "loss": 1.2473, "step": 9693 }, { "epoch": 2.8873210595878556, "grad_norm": 0.24680624902248383, "learning_rate": 1.6685091351255072e-05, "loss": 1.2422, "step": 9694 }, { "epoch": 2.887618905787524, "grad_norm": 0.24009333550930023, "learning_rate": 1.6684373968360007e-05, "loss": 1.2432, "step": 9695 }, { "epoch": 2.8879167519871927, "grad_norm": 0.23311027884483337, "learning_rate": 1.6683656523274316e-05, "loss": 1.2285, "step": 9696 }, { "epoch": 2.8882145981868614, "grad_norm": 0.24567042291164398, "learning_rate": 1.6682939016004663e-05, "loss": 1.2203, "step": 9697 }, { "epoch": 2.8885124443865298, "grad_norm": 0.25748467445373535, "learning_rate": 1.668222144655773e-05, "loss": 1.2479, "step": 9698 }, { "epoch": 2.8888102905861985, "grad_norm": 0.26577097177505493, "learning_rate": 1.6681503814940195e-05, "loss": 1.2561, "step": 9699 }, { "epoch": 2.8891081367858673, "grad_norm": 0.24358929693698883, "learning_rate": 1.668078612115873e-05, "loss": 1.2498, "step": 9700 }, { "epoch": 2.8894059829855356, "grad_norm": 0.24860145151615143, "learning_rate": 1.6680068365220013e-05, "loss": 1.2476, "step": 9701 }, { "epoch": 2.8897038291852044, "grad_norm": 0.23227480053901672, "learning_rate": 1.6679350547130725e-05, "loss": 1.2411, "step": 9702 }, { "epoch": 2.890001675384873, "grad_norm": 0.24423043429851532, "learning_rate": 1.667863266689754e-05, "loss": 1.2468, "step": 9703 }, { "epoch": 2.8902995215845415, "grad_norm": 0.23363593220710754, "learning_rate": 1.6677914724527145e-05, "loss": 1.2563, "step": 9704 }, { "epoch": 2.8905973677842103, "grad_norm": 0.23444494605064392, "learning_rate": 1.667719672002621e-05, "loss": 1.2416, "step": 9705 }, { "epoch": 2.890895213983879, "grad_norm": 0.23438218235969543, "learning_rate": 1.667647865340142e-05, "loss": 1.2299, "step": 9706 }, { "epoch": 2.891193060183548, "grad_norm": 0.22477249801158905, "learning_rate": 1.667576052465946e-05, "loss": 1.2562, "step": 9707 }, { "epoch": 2.8914909063832166, "grad_norm": 0.22039125859737396, "learning_rate": 1.6675042333807004e-05, "loss": 1.2288, "step": 9708 }, { "epoch": 2.891788752582885, "grad_norm": 0.23163697123527527, "learning_rate": 1.6674324080850738e-05, "loss": 1.2629, "step": 9709 }, { "epoch": 2.8920865987825537, "grad_norm": 0.23662646114826202, "learning_rate": 1.667360576579734e-05, "loss": 1.239, "step": 9710 }, { "epoch": 2.8923844449822225, "grad_norm": 0.23235507309436798, "learning_rate": 1.6672887388653497e-05, "loss": 1.2331, "step": 9711 }, { "epoch": 2.892682291181891, "grad_norm": 0.23247478902339935, "learning_rate": 1.6672168949425897e-05, "loss": 1.2475, "step": 9712 }, { "epoch": 2.8929801373815596, "grad_norm": 0.22635920345783234, "learning_rate": 1.6671450448121217e-05, "loss": 1.2385, "step": 9713 }, { "epoch": 2.8932779835812283, "grad_norm": 0.2372814565896988, "learning_rate": 1.667073188474614e-05, "loss": 1.2609, "step": 9714 }, { "epoch": 2.8935758297808967, "grad_norm": 0.2527531087398529, "learning_rate": 1.6670013259307363e-05, "loss": 1.2417, "step": 9715 }, { "epoch": 2.8938736759805654, "grad_norm": 0.24779640138149261, "learning_rate": 1.666929457181156e-05, "loss": 1.256, "step": 9716 }, { "epoch": 2.894171522180234, "grad_norm": 0.23665443062782288, "learning_rate": 1.6668575822265426e-05, "loss": 1.2405, "step": 9717 }, { "epoch": 2.894469368379903, "grad_norm": 0.2807767689228058, "learning_rate": 1.666785701067564e-05, "loss": 1.2478, "step": 9718 }, { "epoch": 2.8947672145795713, "grad_norm": 0.25625476241111755, "learning_rate": 1.6667138137048896e-05, "loss": 1.2354, "step": 9719 }, { "epoch": 2.89506506077924, "grad_norm": 0.22842606902122498, "learning_rate": 1.6666419201391884e-05, "loss": 1.2347, "step": 9720 }, { "epoch": 2.895362906978909, "grad_norm": 0.271775484085083, "learning_rate": 1.6665700203711287e-05, "loss": 1.2421, "step": 9721 }, { "epoch": 2.8956607531785776, "grad_norm": 0.24809524416923523, "learning_rate": 1.6664981144013794e-05, "loss": 1.2492, "step": 9722 }, { "epoch": 2.895958599378246, "grad_norm": 0.2469061017036438, "learning_rate": 1.6664262022306103e-05, "loss": 1.2309, "step": 9723 }, { "epoch": 2.8962564455779147, "grad_norm": 0.33535274863243103, "learning_rate": 1.6663542838594895e-05, "loss": 1.2476, "step": 9724 }, { "epoch": 2.8965542917775835, "grad_norm": 0.30277758836746216, "learning_rate": 1.6662823592886866e-05, "loss": 1.242, "step": 9725 }, { "epoch": 2.896852137977252, "grad_norm": 0.23677143454551697, "learning_rate": 1.6662104285188714e-05, "loss": 1.2373, "step": 9726 }, { "epoch": 2.8971499841769206, "grad_norm": 0.26896932721138, "learning_rate": 1.666138491550712e-05, "loss": 1.2305, "step": 9727 }, { "epoch": 2.8974478303765894, "grad_norm": 0.245836541056633, "learning_rate": 1.6660665483848783e-05, "loss": 1.2537, "step": 9728 }, { "epoch": 2.8977456765762577, "grad_norm": 0.2385386824607849, "learning_rate": 1.6659945990220393e-05, "loss": 1.2649, "step": 9729 }, { "epoch": 2.8980435227759265, "grad_norm": 0.24647222459316254, "learning_rate": 1.665922643462865e-05, "loss": 1.2528, "step": 9730 }, { "epoch": 2.8983413689755952, "grad_norm": 0.23912402987480164, "learning_rate": 1.6658506817080246e-05, "loss": 1.2577, "step": 9731 }, { "epoch": 2.898639215175264, "grad_norm": 0.2441381812095642, "learning_rate": 1.665778713758187e-05, "loss": 1.2274, "step": 9732 }, { "epoch": 2.898937061374933, "grad_norm": 0.2690194845199585, "learning_rate": 1.6657067396140226e-05, "loss": 1.2369, "step": 9733 }, { "epoch": 2.899234907574601, "grad_norm": 0.2437698096036911, "learning_rate": 1.665634759276201e-05, "loss": 1.2469, "step": 9734 }, { "epoch": 2.89953275377427, "grad_norm": 0.24350915849208832, "learning_rate": 1.6655627727453912e-05, "loss": 1.2442, "step": 9735 }, { "epoch": 2.8998305999739387, "grad_norm": 0.27060574293136597, "learning_rate": 1.6654907800222638e-05, "loss": 1.2428, "step": 9736 }, { "epoch": 2.900128446173607, "grad_norm": 0.2574555575847626, "learning_rate": 1.665418781107488e-05, "loss": 1.2533, "step": 9737 }, { "epoch": 2.9004262923732758, "grad_norm": 0.23176690936088562, "learning_rate": 1.665346776001734e-05, "loss": 1.2595, "step": 9738 }, { "epoch": 2.9007241385729445, "grad_norm": 0.2270326465368271, "learning_rate": 1.6652747647056714e-05, "loss": 1.2263, "step": 9739 }, { "epoch": 2.901021984772613, "grad_norm": 0.2572081685066223, "learning_rate": 1.665202747219971e-05, "loss": 1.2456, "step": 9740 }, { "epoch": 2.9013198309722816, "grad_norm": 0.24711395800113678, "learning_rate": 1.6651307235453015e-05, "loss": 1.2354, "step": 9741 }, { "epoch": 2.9016176771719504, "grad_norm": 0.23309578001499176, "learning_rate": 1.665058693682334e-05, "loss": 1.244, "step": 9742 }, { "epoch": 2.9019155233716187, "grad_norm": 0.25636541843414307, "learning_rate": 1.6649866576317387e-05, "loss": 1.2501, "step": 9743 }, { "epoch": 2.9022133695712875, "grad_norm": 0.23264876008033752, "learning_rate": 1.6649146153941854e-05, "loss": 1.2478, "step": 9744 }, { "epoch": 2.9025112157709563, "grad_norm": 0.2976943552494049, "learning_rate": 1.6648425669703442e-05, "loss": 1.2496, "step": 9745 }, { "epoch": 2.902809061970625, "grad_norm": 0.31915852427482605, "learning_rate": 1.664770512360886e-05, "loss": 1.2558, "step": 9746 }, { "epoch": 2.903106908170294, "grad_norm": 0.2434307187795639, "learning_rate": 1.6646984515664806e-05, "loss": 1.2483, "step": 9747 }, { "epoch": 2.903404754369962, "grad_norm": 0.7460813522338867, "learning_rate": 1.6646263845877993e-05, "loss": 1.23, "step": 9748 }, { "epoch": 2.903702600569631, "grad_norm": 0.30449822545051575, "learning_rate": 1.6645543114255115e-05, "loss": 1.2504, "step": 9749 }, { "epoch": 2.9040004467692997, "grad_norm": 0.30233049392700195, "learning_rate": 1.664482232080289e-05, "loss": 1.2721, "step": 9750 }, { "epoch": 2.904298292968968, "grad_norm": 0.2563141882419586, "learning_rate": 1.664410146552801e-05, "loss": 1.2322, "step": 9751 }, { "epoch": 2.904596139168637, "grad_norm": 0.24350601434707642, "learning_rate": 1.664338054843719e-05, "loss": 1.2413, "step": 9752 }, { "epoch": 2.9048939853683056, "grad_norm": 0.23631763458251953, "learning_rate": 1.664265956953714e-05, "loss": 1.2587, "step": 9753 }, { "epoch": 2.905191831567974, "grad_norm": 0.26359960436820984, "learning_rate": 1.6641938528834566e-05, "loss": 1.2554, "step": 9754 }, { "epoch": 2.9054896777676427, "grad_norm": 0.2576713562011719, "learning_rate": 1.6641217426336167e-05, "loss": 1.2564, "step": 9755 }, { "epoch": 2.9057875239673114, "grad_norm": 0.253892183303833, "learning_rate": 1.6640496262048663e-05, "loss": 1.2217, "step": 9756 }, { "epoch": 2.9060853701669798, "grad_norm": 0.23795241117477417, "learning_rate": 1.6639775035978766e-05, "loss": 1.2464, "step": 9757 }, { "epoch": 2.9063832163666485, "grad_norm": 0.22954751551151276, "learning_rate": 1.6639053748133176e-05, "loss": 1.2469, "step": 9758 }, { "epoch": 2.9066810625663173, "grad_norm": 0.24584701657295227, "learning_rate": 1.6638332398518607e-05, "loss": 1.2331, "step": 9759 }, { "epoch": 2.906978908765986, "grad_norm": 0.2480352371931076, "learning_rate": 1.6637610987141774e-05, "loss": 1.2469, "step": 9760 }, { "epoch": 2.907276754965655, "grad_norm": 0.24403853714466095, "learning_rate": 1.6636889514009387e-05, "loss": 1.2408, "step": 9761 }, { "epoch": 2.907574601165323, "grad_norm": 0.24723169207572937, "learning_rate": 1.6636167979128157e-05, "loss": 1.2423, "step": 9762 }, { "epoch": 2.907872447364992, "grad_norm": 0.2441592812538147, "learning_rate": 1.66354463825048e-05, "loss": 1.2566, "step": 9763 }, { "epoch": 2.9081702935646607, "grad_norm": 0.2384587973356247, "learning_rate": 1.6634724724146028e-05, "loss": 1.2588, "step": 9764 }, { "epoch": 2.908468139764329, "grad_norm": 0.2357000708580017, "learning_rate": 1.6634003004058553e-05, "loss": 1.2239, "step": 9765 }, { "epoch": 2.908765985963998, "grad_norm": 0.2468337118625641, "learning_rate": 1.6633281222249092e-05, "loss": 1.2347, "step": 9766 }, { "epoch": 2.9090638321636666, "grad_norm": 0.22989989817142487, "learning_rate": 1.6632559378724364e-05, "loss": 1.2435, "step": 9767 }, { "epoch": 2.909361678363335, "grad_norm": 0.23285478353500366, "learning_rate": 1.6631837473491075e-05, "loss": 1.2466, "step": 9768 }, { "epoch": 2.9096595245630037, "grad_norm": 0.23242908716201782, "learning_rate": 1.6631115506555953e-05, "loss": 1.2317, "step": 9769 }, { "epoch": 2.9099573707626725, "grad_norm": 0.2319924533367157, "learning_rate": 1.663039347792571e-05, "loss": 1.2345, "step": 9770 }, { "epoch": 2.910255216962341, "grad_norm": 0.23280765116214752, "learning_rate": 1.6629671387607062e-05, "loss": 1.2381, "step": 9771 }, { "epoch": 2.9105530631620096, "grad_norm": 0.23628489673137665, "learning_rate": 1.662894923560673e-05, "loss": 1.2329, "step": 9772 }, { "epoch": 2.9108509093616783, "grad_norm": 0.23635146021842957, "learning_rate": 1.662822702193143e-05, "loss": 1.2503, "step": 9773 }, { "epoch": 2.911148755561347, "grad_norm": 0.2271934151649475, "learning_rate": 1.6627504746587885e-05, "loss": 1.246, "step": 9774 }, { "epoch": 2.911446601761016, "grad_norm": 0.2384863942861557, "learning_rate": 1.662678240958281e-05, "loss": 1.2476, "step": 9775 }, { "epoch": 2.911744447960684, "grad_norm": 0.23095139861106873, "learning_rate": 1.6626060010922927e-05, "loss": 1.2491, "step": 9776 }, { "epoch": 2.912042294160353, "grad_norm": 0.23577161133289337, "learning_rate": 1.6625337550614963e-05, "loss": 1.2349, "step": 9777 }, { "epoch": 2.9123401403600218, "grad_norm": 0.2283676713705063, "learning_rate": 1.6624615028665636e-05, "loss": 1.2447, "step": 9778 }, { "epoch": 2.91263798655969, "grad_norm": 0.22701942920684814, "learning_rate": 1.6623892445081665e-05, "loss": 1.2444, "step": 9779 }, { "epoch": 2.912935832759359, "grad_norm": 0.22096702456474304, "learning_rate": 1.662316979986978e-05, "loss": 1.2516, "step": 9780 }, { "epoch": 2.9132336789590276, "grad_norm": 0.2332075834274292, "learning_rate": 1.6622447093036697e-05, "loss": 1.251, "step": 9781 }, { "epoch": 2.913531525158696, "grad_norm": 0.23211286962032318, "learning_rate": 1.662172432458914e-05, "loss": 1.2372, "step": 9782 }, { "epoch": 2.9138293713583647, "grad_norm": 0.22454820573329926, "learning_rate": 1.662100149453384e-05, "loss": 1.2394, "step": 9783 }, { "epoch": 2.9141272175580335, "grad_norm": 0.23275113105773926, "learning_rate": 1.6620278602877517e-05, "loss": 1.2469, "step": 9784 }, { "epoch": 2.9144250637577023, "grad_norm": 0.23850785195827484, "learning_rate": 1.6619555649626894e-05, "loss": 1.2473, "step": 9785 }, { "epoch": 2.9147229099573706, "grad_norm": 0.23938463628292084, "learning_rate": 1.661883263478871e-05, "loss": 1.2342, "step": 9786 }, { "epoch": 2.9150207561570394, "grad_norm": 0.22498579323291779, "learning_rate": 1.6618109558369676e-05, "loss": 1.2506, "step": 9787 }, { "epoch": 2.915318602356708, "grad_norm": 0.23149646818637848, "learning_rate": 1.6617386420376532e-05, "loss": 1.2331, "step": 9788 }, { "epoch": 2.915616448556377, "grad_norm": 0.2308996319770813, "learning_rate": 1.6616663220815996e-05, "loss": 1.2339, "step": 9789 }, { "epoch": 2.9159142947560452, "grad_norm": 0.23279112577438354, "learning_rate": 1.6615939959694805e-05, "loss": 1.2537, "step": 9790 }, { "epoch": 2.916212140955714, "grad_norm": 0.22437404096126556, "learning_rate": 1.6615216637019683e-05, "loss": 1.2215, "step": 9791 }, { "epoch": 2.916509987155383, "grad_norm": 0.22464318573474884, "learning_rate": 1.6614493252797365e-05, "loss": 1.2589, "step": 9792 }, { "epoch": 2.916807833355051, "grad_norm": 0.2366705983877182, "learning_rate": 1.661376980703457e-05, "loss": 1.2355, "step": 9793 }, { "epoch": 2.91710567955472, "grad_norm": 0.22409525513648987, "learning_rate": 1.661304629973804e-05, "loss": 1.2458, "step": 9794 }, { "epoch": 2.9174035257543887, "grad_norm": 0.23578611016273499, "learning_rate": 1.6612322730914505e-05, "loss": 1.2289, "step": 9795 }, { "epoch": 2.917701371954057, "grad_norm": 0.24363459646701813, "learning_rate": 1.661159910057069e-05, "loss": 1.2426, "step": 9796 }, { "epoch": 2.9179992181537258, "grad_norm": 0.2294311672449112, "learning_rate": 1.6610875408713335e-05, "loss": 1.2305, "step": 9797 }, { "epoch": 2.9182970643533945, "grad_norm": 0.2467951625585556, "learning_rate": 1.6610151655349173e-05, "loss": 1.2366, "step": 9798 }, { "epoch": 2.9185949105530633, "grad_norm": 0.22517724335193634, "learning_rate": 1.660942784048493e-05, "loss": 1.2428, "step": 9799 }, { "epoch": 2.918892756752732, "grad_norm": 0.2336469143629074, "learning_rate": 1.660870396412735e-05, "loss": 1.2372, "step": 9800 }, { "epoch": 2.9191906029524004, "grad_norm": 0.22752323746681213, "learning_rate": 1.660798002628316e-05, "loss": 1.2208, "step": 9801 }, { "epoch": 2.919488449152069, "grad_norm": 0.22840507328510284, "learning_rate": 1.66072560269591e-05, "loss": 1.247, "step": 9802 }, { "epoch": 2.919786295351738, "grad_norm": 0.2431611567735672, "learning_rate": 1.6606531966161906e-05, "loss": 1.2537, "step": 9803 }, { "epoch": 2.9200841415514063, "grad_norm": 0.24127916991710663, "learning_rate": 1.660580784389831e-05, "loss": 1.2346, "step": 9804 }, { "epoch": 2.920381987751075, "grad_norm": 0.21927498281002045, "learning_rate": 1.6605083660175055e-05, "loss": 1.2371, "step": 9805 }, { "epoch": 2.920679833950744, "grad_norm": 0.23330777883529663, "learning_rate": 1.6604359414998877e-05, "loss": 1.258, "step": 9806 }, { "epoch": 2.920977680150412, "grad_norm": 0.23680800199508667, "learning_rate": 1.660363510837651e-05, "loss": 1.2515, "step": 9807 }, { "epoch": 2.921275526350081, "grad_norm": 0.23028503358364105, "learning_rate": 1.66029107403147e-05, "loss": 1.2444, "step": 9808 }, { "epoch": 2.9215733725497497, "grad_norm": 0.24483390152454376, "learning_rate": 1.660218631082018e-05, "loss": 1.2477, "step": 9809 }, { "epoch": 2.921871218749418, "grad_norm": 0.23751649260520935, "learning_rate": 1.6601461819899694e-05, "loss": 1.2474, "step": 9810 }, { "epoch": 2.922169064949087, "grad_norm": 0.22798416018486023, "learning_rate": 1.660073726755998e-05, "loss": 1.2276, "step": 9811 }, { "epoch": 2.9224669111487556, "grad_norm": 0.23340913653373718, "learning_rate": 1.660001265380778e-05, "loss": 1.2574, "step": 9812 }, { "epoch": 2.9227647573484243, "grad_norm": 0.22620655596256256, "learning_rate": 1.659928797864984e-05, "loss": 1.2226, "step": 9813 }, { "epoch": 2.923062603548093, "grad_norm": 0.23551420867443085, "learning_rate": 1.6598563242092895e-05, "loss": 1.2439, "step": 9814 }, { "epoch": 2.9233604497477614, "grad_norm": 0.24146650731563568, "learning_rate": 1.6597838444143697e-05, "loss": 1.2468, "step": 9815 }, { "epoch": 2.92365829594743, "grad_norm": 0.24275468289852142, "learning_rate": 1.659711358480898e-05, "loss": 1.2258, "step": 9816 }, { "epoch": 2.923956142147099, "grad_norm": 0.2402866780757904, "learning_rate": 1.6596388664095487e-05, "loss": 1.2566, "step": 9817 }, { "epoch": 2.9242539883467673, "grad_norm": 0.23758667707443237, "learning_rate": 1.6595663682009973e-05, "loss": 1.2603, "step": 9818 }, { "epoch": 2.924551834546436, "grad_norm": 0.23961032927036285, "learning_rate": 1.659493863855918e-05, "loss": 1.2449, "step": 9819 }, { "epoch": 2.924849680746105, "grad_norm": 0.23793166875839233, "learning_rate": 1.6594213533749846e-05, "loss": 1.2412, "step": 9820 }, { "epoch": 2.925147526945773, "grad_norm": 0.22784459590911865, "learning_rate": 1.659348836758872e-05, "loss": 1.2201, "step": 9821 }, { "epoch": 2.925445373145442, "grad_norm": 0.23748725652694702, "learning_rate": 1.6592763140082556e-05, "loss": 1.2613, "step": 9822 }, { "epoch": 2.9257432193451107, "grad_norm": 0.2378886193037033, "learning_rate": 1.6592037851238097e-05, "loss": 1.232, "step": 9823 }, { "epoch": 2.926041065544779, "grad_norm": 0.2596132159233093, "learning_rate": 1.659131250106209e-05, "loss": 1.2399, "step": 9824 }, { "epoch": 2.926338911744448, "grad_norm": 0.23729299008846283, "learning_rate": 1.6590587089561287e-05, "loss": 1.2499, "step": 9825 }, { "epoch": 2.9266367579441166, "grad_norm": 0.2374809831380844, "learning_rate": 1.6589861616742434e-05, "loss": 1.2425, "step": 9826 }, { "epoch": 2.9269346041437854, "grad_norm": 0.23805709183216095, "learning_rate": 1.6589136082612277e-05, "loss": 1.2432, "step": 9827 }, { "epoch": 2.927232450343454, "grad_norm": 0.24178627133369446, "learning_rate": 1.6588410487177572e-05, "loss": 1.2409, "step": 9828 }, { "epoch": 2.9275302965431225, "grad_norm": 0.24564237892627716, "learning_rate": 1.6587684830445074e-05, "loss": 1.2289, "step": 9829 }, { "epoch": 2.9278281427427912, "grad_norm": 0.22558070719242096, "learning_rate": 1.6586959112421523e-05, "loss": 1.2531, "step": 9830 }, { "epoch": 2.92812598894246, "grad_norm": 0.23690807819366455, "learning_rate": 1.6586233333113678e-05, "loss": 1.2414, "step": 9831 }, { "epoch": 2.9284238351421283, "grad_norm": 0.22581611573696136, "learning_rate": 1.658550749252829e-05, "loss": 1.2254, "step": 9832 }, { "epoch": 2.928721681341797, "grad_norm": 0.2420835644006729, "learning_rate": 1.6584781590672116e-05, "loss": 1.2655, "step": 9833 }, { "epoch": 2.929019527541466, "grad_norm": 0.23327945172786713, "learning_rate": 1.6584055627551904e-05, "loss": 1.2352, "step": 9834 }, { "epoch": 2.929317373741134, "grad_norm": 0.2318052500486374, "learning_rate": 1.658332960317441e-05, "loss": 1.238, "step": 9835 }, { "epoch": 2.929615219940803, "grad_norm": 0.23198439180850983, "learning_rate": 1.6582603517546388e-05, "loss": 1.2388, "step": 9836 }, { "epoch": 2.9299130661404718, "grad_norm": 0.22870133817195892, "learning_rate": 1.6581877370674596e-05, "loss": 1.2363, "step": 9837 }, { "epoch": 2.93021091234014, "grad_norm": 0.22588759660720825, "learning_rate": 1.6581151162565788e-05, "loss": 1.2404, "step": 9838 }, { "epoch": 2.930508758539809, "grad_norm": 0.23626545071601868, "learning_rate": 1.6580424893226723e-05, "loss": 1.2428, "step": 9839 }, { "epoch": 2.9308066047394776, "grad_norm": 0.23212206363677979, "learning_rate": 1.6579698562664157e-05, "loss": 1.2289, "step": 9840 }, { "epoch": 2.9311044509391464, "grad_norm": 0.2351151555776596, "learning_rate": 1.6578972170884843e-05, "loss": 1.2396, "step": 9841 }, { "epoch": 2.931402297138815, "grad_norm": 0.24143719673156738, "learning_rate": 1.657824571789555e-05, "loss": 1.2353, "step": 9842 }, { "epoch": 2.9317001433384835, "grad_norm": 0.23602788150310516, "learning_rate": 1.6577519203703025e-05, "loss": 1.2361, "step": 9843 }, { "epoch": 2.9319979895381523, "grad_norm": 0.23900477588176727, "learning_rate": 1.6576792628314033e-05, "loss": 1.2599, "step": 9844 }, { "epoch": 2.932295835737821, "grad_norm": 0.22072771191596985, "learning_rate": 1.6576065991735336e-05, "loss": 1.248, "step": 9845 }, { "epoch": 2.9325936819374894, "grad_norm": 0.22451645135879517, "learning_rate": 1.657533929397369e-05, "loss": 1.2394, "step": 9846 }, { "epoch": 2.932891528137158, "grad_norm": 0.23405350744724274, "learning_rate": 1.6574612535035857e-05, "loss": 1.2491, "step": 9847 }, { "epoch": 2.933189374336827, "grad_norm": 0.23247672617435455, "learning_rate": 1.6573885714928604e-05, "loss": 1.2508, "step": 9848 }, { "epoch": 2.9334872205364952, "grad_norm": 0.23231729865074158, "learning_rate": 1.6573158833658688e-05, "loss": 1.2512, "step": 9849 }, { "epoch": 2.933785066736164, "grad_norm": 0.23208829760551453, "learning_rate": 1.657243189123287e-05, "loss": 1.2394, "step": 9850 }, { "epoch": 2.934082912935833, "grad_norm": 0.22789599001407623, "learning_rate": 1.657170488765792e-05, "loss": 1.222, "step": 9851 }, { "epoch": 2.9343807591355016, "grad_norm": 0.22650061547756195, "learning_rate": 1.65709778229406e-05, "loss": 1.2518, "step": 9852 }, { "epoch": 2.93467860533517, "grad_norm": 0.22586821019649506, "learning_rate": 1.6570250697087668e-05, "loss": 1.2375, "step": 9853 }, { "epoch": 2.9349764515348387, "grad_norm": 0.2344573438167572, "learning_rate": 1.6569523510105898e-05, "loss": 1.2535, "step": 9854 }, { "epoch": 2.9352742977345074, "grad_norm": 0.2413981705904007, "learning_rate": 1.6568796262002048e-05, "loss": 1.2379, "step": 9855 }, { "epoch": 2.935572143934176, "grad_norm": 0.2279159277677536, "learning_rate": 1.656806895278289e-05, "loss": 1.2452, "step": 9856 }, { "epoch": 2.9358699901338445, "grad_norm": 0.23538503050804138, "learning_rate": 1.656734158245519e-05, "loss": 1.2397, "step": 9857 }, { "epoch": 2.9361678363335133, "grad_norm": 0.22665321826934814, "learning_rate": 1.656661415102571e-05, "loss": 1.2455, "step": 9858 }, { "epoch": 2.936465682533182, "grad_norm": 0.23031972348690033, "learning_rate": 1.6565886658501226e-05, "loss": 1.2423, "step": 9859 }, { "epoch": 2.9367635287328504, "grad_norm": 0.22960679233074188, "learning_rate": 1.65651591048885e-05, "loss": 1.2505, "step": 9860 }, { "epoch": 2.937061374932519, "grad_norm": 0.2389422059059143, "learning_rate": 1.6564431490194306e-05, "loss": 1.234, "step": 9861 }, { "epoch": 2.937359221132188, "grad_norm": 0.23131327331066132, "learning_rate": 1.6563703814425408e-05, "loss": 1.25, "step": 9862 }, { "epoch": 2.9376570673318563, "grad_norm": 0.23465389013290405, "learning_rate": 1.6562976077588582e-05, "loss": 1.2497, "step": 9863 }, { "epoch": 2.937954913531525, "grad_norm": 0.2397429347038269, "learning_rate": 1.6562248279690594e-05, "loss": 1.2384, "step": 9864 }, { "epoch": 2.938252759731194, "grad_norm": 0.2289528101682663, "learning_rate": 1.656152042073822e-05, "loss": 1.241, "step": 9865 }, { "epoch": 2.9385506059308626, "grad_norm": 0.22949634492397308, "learning_rate": 1.656079250073823e-05, "loss": 1.2297, "step": 9866 }, { "epoch": 2.9388484521305314, "grad_norm": 0.22586174309253693, "learning_rate": 1.6560064519697393e-05, "loss": 1.2611, "step": 9867 }, { "epoch": 2.9391462983301997, "grad_norm": 0.22224444150924683, "learning_rate": 1.6559336477622486e-05, "loss": 1.2322, "step": 9868 }, { "epoch": 2.9394441445298685, "grad_norm": 0.22912226617336273, "learning_rate": 1.655860837452028e-05, "loss": 1.2504, "step": 9869 }, { "epoch": 2.9397419907295372, "grad_norm": 0.2326570451259613, "learning_rate": 1.655788021039755e-05, "loss": 1.247, "step": 9870 }, { "epoch": 2.9400398369292056, "grad_norm": 0.2364460974931717, "learning_rate": 1.6557151985261074e-05, "loss": 1.2426, "step": 9871 }, { "epoch": 2.9403376831288743, "grad_norm": 0.22613415122032166, "learning_rate": 1.6556423699117626e-05, "loss": 1.2329, "step": 9872 }, { "epoch": 2.940635529328543, "grad_norm": 0.23236924409866333, "learning_rate": 1.655569535197398e-05, "loss": 1.2511, "step": 9873 }, { "epoch": 2.9409333755282114, "grad_norm": 0.24027229845523834, "learning_rate": 1.6554966943836914e-05, "loss": 1.2489, "step": 9874 }, { "epoch": 2.94123122172788, "grad_norm": 0.23759745061397552, "learning_rate": 1.6554238474713204e-05, "loss": 1.2559, "step": 9875 }, { "epoch": 2.941529067927549, "grad_norm": 0.23697206377983093, "learning_rate": 1.6553509944609626e-05, "loss": 1.2519, "step": 9876 }, { "epoch": 2.9418269141272173, "grad_norm": 0.23689374327659607, "learning_rate": 1.655278135353296e-05, "loss": 1.253, "step": 9877 }, { "epoch": 2.942124760326886, "grad_norm": 0.22923915088176727, "learning_rate": 1.6552052701489985e-05, "loss": 1.2367, "step": 9878 }, { "epoch": 2.942422606526555, "grad_norm": 0.226065993309021, "learning_rate": 1.655132398848748e-05, "loss": 1.2276, "step": 9879 }, { "epoch": 2.9427204527262236, "grad_norm": 0.23301160335540771, "learning_rate": 1.6550595214532226e-05, "loss": 1.2392, "step": 9880 }, { "epoch": 2.9430182989258924, "grad_norm": 0.22971084713935852, "learning_rate": 1.6549866379631005e-05, "loss": 1.2288, "step": 9881 }, { "epoch": 2.9433161451255607, "grad_norm": 0.22882910072803497, "learning_rate": 1.6549137483790588e-05, "loss": 1.2432, "step": 9882 }, { "epoch": 2.9436139913252295, "grad_norm": 0.2380112111568451, "learning_rate": 1.6548408527017768e-05, "loss": 1.2654, "step": 9883 }, { "epoch": 2.9439118375248983, "grad_norm": 0.2446528822183609, "learning_rate": 1.6547679509319322e-05, "loss": 1.2594, "step": 9884 }, { "epoch": 2.9442096837245666, "grad_norm": 0.22983220219612122, "learning_rate": 1.6546950430702036e-05, "loss": 1.2399, "step": 9885 }, { "epoch": 2.9445075299242354, "grad_norm": 0.23347225785255432, "learning_rate": 1.6546221291172687e-05, "loss": 1.2367, "step": 9886 }, { "epoch": 2.944805376123904, "grad_norm": 0.24225640296936035, "learning_rate": 1.6545492090738067e-05, "loss": 1.2312, "step": 9887 }, { "epoch": 2.9451032223235725, "grad_norm": 0.23472385108470917, "learning_rate": 1.6544762829404956e-05, "loss": 1.2397, "step": 9888 }, { "epoch": 2.9454010685232412, "grad_norm": 0.22931373119354248, "learning_rate": 1.6544033507180133e-05, "loss": 1.2454, "step": 9889 }, { "epoch": 2.94569891472291, "grad_norm": 0.23852002620697021, "learning_rate": 1.6543304124070397e-05, "loss": 1.2512, "step": 9890 }, { "epoch": 2.9459967609225783, "grad_norm": 0.24039921164512634, "learning_rate": 1.6542574680082525e-05, "loss": 1.2504, "step": 9891 }, { "epoch": 2.946294607122247, "grad_norm": 0.24208903312683105, "learning_rate": 1.6541845175223305e-05, "loss": 1.247, "step": 9892 }, { "epoch": 2.946592453321916, "grad_norm": 0.23167331516742706, "learning_rate": 1.654111560949952e-05, "loss": 1.2448, "step": 9893 }, { "epoch": 2.9468902995215847, "grad_norm": 0.2375795543193817, "learning_rate": 1.6540385982917968e-05, "loss": 1.2339, "step": 9894 }, { "epoch": 2.9471881457212534, "grad_norm": 0.21858654916286469, "learning_rate": 1.653965629548543e-05, "loss": 1.233, "step": 9895 }, { "epoch": 2.9474859919209218, "grad_norm": 0.24020157754421234, "learning_rate": 1.6538926547208695e-05, "loss": 1.2513, "step": 9896 }, { "epoch": 2.9477838381205905, "grad_norm": 0.2464299201965332, "learning_rate": 1.653819673809456e-05, "loss": 1.2628, "step": 9897 }, { "epoch": 2.9480816843202593, "grad_norm": 0.23754477500915527, "learning_rate": 1.65374668681498e-05, "loss": 1.2486, "step": 9898 }, { "epoch": 2.9483795305199276, "grad_norm": 0.2259082794189453, "learning_rate": 1.653673693738122e-05, "loss": 1.2404, "step": 9899 }, { "epoch": 2.9486773767195964, "grad_norm": 0.2358454018831253, "learning_rate": 1.653600694579561e-05, "loss": 1.2525, "step": 9900 }, { "epoch": 2.948975222919265, "grad_norm": 0.22510115802288055, "learning_rate": 1.6535276893399753e-05, "loss": 1.2303, "step": 9901 }, { "epoch": 2.9492730691189335, "grad_norm": 0.22290019690990448, "learning_rate": 1.6534546780200447e-05, "loss": 1.2473, "step": 9902 }, { "epoch": 2.9495709153186023, "grad_norm": 0.2295389622449875, "learning_rate": 1.6533816606204483e-05, "loss": 1.237, "step": 9903 }, { "epoch": 2.949868761518271, "grad_norm": 0.24453958868980408, "learning_rate": 1.6533086371418656e-05, "loss": 1.2511, "step": 9904 }, { "epoch": 2.95016660771794, "grad_norm": 0.22952069342136383, "learning_rate": 1.653235607584976e-05, "loss": 1.2452, "step": 9905 }, { "epoch": 2.950464453917608, "grad_norm": 0.2298763394355774, "learning_rate": 1.653162571950459e-05, "loss": 1.2374, "step": 9906 }, { "epoch": 2.950762300117277, "grad_norm": 0.23416483402252197, "learning_rate": 1.6530895302389936e-05, "loss": 1.2336, "step": 9907 }, { "epoch": 2.9510601463169457, "grad_norm": 0.2344265580177307, "learning_rate": 1.6530164824512606e-05, "loss": 1.2541, "step": 9908 }, { "epoch": 2.9513579925166145, "grad_norm": 0.23769165575504303, "learning_rate": 1.652943428587938e-05, "loss": 1.2336, "step": 9909 }, { "epoch": 2.951655838716283, "grad_norm": 0.22906984388828278, "learning_rate": 1.652870368649707e-05, "loss": 1.2537, "step": 9910 }, { "epoch": 2.9519536849159516, "grad_norm": 0.2274005115032196, "learning_rate": 1.652797302637246e-05, "loss": 1.2478, "step": 9911 }, { "epoch": 2.9522515311156203, "grad_norm": 0.22509224712848663, "learning_rate": 1.6527242305512358e-05, "loss": 1.235, "step": 9912 }, { "epoch": 2.9525493773152887, "grad_norm": 0.22659392654895782, "learning_rate": 1.6526511523923558e-05, "loss": 1.2266, "step": 9913 }, { "epoch": 2.9528472235149574, "grad_norm": 0.22773674130439758, "learning_rate": 1.6525780681612863e-05, "loss": 1.2315, "step": 9914 }, { "epoch": 2.953145069714626, "grad_norm": 0.23700878024101257, "learning_rate": 1.6525049778587067e-05, "loss": 1.2478, "step": 9915 }, { "epoch": 2.9534429159142945, "grad_norm": 0.22286926209926605, "learning_rate": 1.6524318814852973e-05, "loss": 1.24, "step": 9916 }, { "epoch": 2.9537407621139633, "grad_norm": 0.2216832935810089, "learning_rate": 1.6523587790417384e-05, "loss": 1.2419, "step": 9917 }, { "epoch": 2.954038608313632, "grad_norm": 0.23151254653930664, "learning_rate": 1.6522856705287097e-05, "loss": 1.222, "step": 9918 }, { "epoch": 2.954336454513301, "grad_norm": 0.23313990235328674, "learning_rate": 1.6522125559468918e-05, "loss": 1.2469, "step": 9919 }, { "epoch": 2.954634300712969, "grad_norm": 0.2344045490026474, "learning_rate": 1.6521394352969645e-05, "loss": 1.2443, "step": 9920 }, { "epoch": 2.954932146912638, "grad_norm": 0.22582167387008667, "learning_rate": 1.6520663085796087e-05, "loss": 1.246, "step": 9921 }, { "epoch": 2.9552299931123067, "grad_norm": 0.23233038187026978, "learning_rate": 1.651993175795504e-05, "loss": 1.2395, "step": 9922 }, { "epoch": 2.9555278393119755, "grad_norm": 0.2369987815618515, "learning_rate": 1.6519200369453314e-05, "loss": 1.2418, "step": 9923 }, { "epoch": 2.955825685511644, "grad_norm": 0.23401440680027008, "learning_rate": 1.6518468920297713e-05, "loss": 1.2275, "step": 9924 }, { "epoch": 2.9561235317113126, "grad_norm": 0.22939825057983398, "learning_rate": 1.6517737410495044e-05, "loss": 1.2331, "step": 9925 }, { "epoch": 2.9564213779109814, "grad_norm": 0.2362087219953537, "learning_rate": 1.6517005840052107e-05, "loss": 1.2517, "step": 9926 }, { "epoch": 2.9567192241106497, "grad_norm": 0.2244790494441986, "learning_rate": 1.6516274208975712e-05, "loss": 1.2336, "step": 9927 }, { "epoch": 2.9570170703103185, "grad_norm": 0.2383660078048706, "learning_rate": 1.6515542517272666e-05, "loss": 1.2298, "step": 9928 }, { "epoch": 2.9573149165099872, "grad_norm": 0.226767897605896, "learning_rate": 1.6514810764949777e-05, "loss": 1.228, "step": 9929 }, { "epoch": 2.9576127627096556, "grad_norm": 0.23022127151489258, "learning_rate": 1.6514078952013853e-05, "loss": 1.2577, "step": 9930 }, { "epoch": 2.9579106089093243, "grad_norm": 0.23604728281497955, "learning_rate": 1.6513347078471706e-05, "loss": 1.2478, "step": 9931 }, { "epoch": 2.958208455108993, "grad_norm": 0.22337941825389862, "learning_rate": 1.6512615144330134e-05, "loss": 1.2392, "step": 9932 }, { "epoch": 2.958506301308662, "grad_norm": 0.2234209179878235, "learning_rate": 1.651188314959596e-05, "loss": 1.246, "step": 9933 }, { "epoch": 2.9588041475083307, "grad_norm": 0.22856219112873077, "learning_rate": 1.6511151094275985e-05, "loss": 1.2547, "step": 9934 }, { "epoch": 2.959101993707999, "grad_norm": 0.2265620231628418, "learning_rate": 1.6510418978377027e-05, "loss": 1.2484, "step": 9935 }, { "epoch": 2.9593998399076678, "grad_norm": 0.22914516925811768, "learning_rate": 1.6509686801905892e-05, "loss": 1.2492, "step": 9936 }, { "epoch": 2.9596976861073365, "grad_norm": 0.23201251029968262, "learning_rate": 1.6508954564869398e-05, "loss": 1.2441, "step": 9937 }, { "epoch": 2.959995532307005, "grad_norm": 0.23081888258457184, "learning_rate": 1.650822226727435e-05, "loss": 1.2398, "step": 9938 }, { "epoch": 2.9602933785066736, "grad_norm": 0.23917965590953827, "learning_rate": 1.6507489909127564e-05, "loss": 1.2339, "step": 9939 }, { "epoch": 2.9605912247063424, "grad_norm": 0.23118039965629578, "learning_rate": 1.650675749043586e-05, "loss": 1.2522, "step": 9940 }, { "epoch": 2.9608890709060107, "grad_norm": 0.23159939050674438, "learning_rate": 1.6506025011206045e-05, "loss": 1.2628, "step": 9941 }, { "epoch": 2.9611869171056795, "grad_norm": 0.2369726151227951, "learning_rate": 1.6505292471444936e-05, "loss": 1.2414, "step": 9942 }, { "epoch": 2.9614847633053483, "grad_norm": 0.2323664128780365, "learning_rate": 1.6504559871159348e-05, "loss": 1.2513, "step": 9943 }, { "epoch": 2.9617826095050166, "grad_norm": 0.23310616612434387, "learning_rate": 1.65038272103561e-05, "loss": 1.2316, "step": 9944 }, { "epoch": 2.9620804557046854, "grad_norm": 0.2253895252943039, "learning_rate": 1.6503094489042005e-05, "loss": 1.2237, "step": 9945 }, { "epoch": 2.962378301904354, "grad_norm": 0.228556826710701, "learning_rate": 1.650236170722388e-05, "loss": 1.2499, "step": 9946 }, { "epoch": 2.962676148104023, "grad_norm": 0.2337954193353653, "learning_rate": 1.6501628864908546e-05, "loss": 1.242, "step": 9947 }, { "epoch": 2.9629739943036917, "grad_norm": 0.23642444610595703, "learning_rate": 1.650089596210282e-05, "loss": 1.2356, "step": 9948 }, { "epoch": 2.96327184050336, "grad_norm": 0.23544715344905853, "learning_rate": 1.6500162998813517e-05, "loss": 1.2414, "step": 9949 }, { "epoch": 2.963569686703029, "grad_norm": 0.239651620388031, "learning_rate": 1.6499429975047462e-05, "loss": 1.228, "step": 9950 }, { "epoch": 2.9638675329026976, "grad_norm": 0.23488640785217285, "learning_rate": 1.6498696890811473e-05, "loss": 1.2401, "step": 9951 }, { "epoch": 2.964165379102366, "grad_norm": 0.23645220696926117, "learning_rate": 1.6497963746112372e-05, "loss": 1.2485, "step": 9952 }, { "epoch": 2.9644632253020347, "grad_norm": 0.24776875972747803, "learning_rate": 1.6497230540956976e-05, "loss": 1.2586, "step": 9953 }, { "epoch": 2.9647610715017034, "grad_norm": 0.2415783405303955, "learning_rate": 1.6496497275352107e-05, "loss": 1.2551, "step": 9954 }, { "epoch": 2.9650589177013718, "grad_norm": 0.2303195595741272, "learning_rate": 1.6495763949304594e-05, "loss": 1.2505, "step": 9955 }, { "epoch": 2.9653567639010405, "grad_norm": 0.23608770966529846, "learning_rate": 1.649503056282125e-05, "loss": 1.2421, "step": 9956 }, { "epoch": 2.9656546101007093, "grad_norm": 0.2326173037290573, "learning_rate": 1.6494297115908907e-05, "loss": 1.2416, "step": 9957 }, { "epoch": 2.9659524563003776, "grad_norm": 0.2311028689146042, "learning_rate": 1.6493563608574385e-05, "loss": 1.2425, "step": 9958 }, { "epoch": 2.9662503025000464, "grad_norm": 0.23269657790660858, "learning_rate": 1.6492830040824506e-05, "loss": 1.2367, "step": 9959 }, { "epoch": 2.966548148699715, "grad_norm": 0.24212895333766937, "learning_rate": 1.64920964126661e-05, "loss": 1.2466, "step": 9960 }, { "epoch": 2.966845994899384, "grad_norm": 0.22782926261425018, "learning_rate": 1.6491362724105985e-05, "loss": 1.2348, "step": 9961 }, { "epoch": 2.9671438410990527, "grad_norm": 0.24124827980995178, "learning_rate": 1.6490628975150998e-05, "loss": 1.2454, "step": 9962 }, { "epoch": 2.967441687298721, "grad_norm": 0.2469920665025711, "learning_rate": 1.6489895165807962e-05, "loss": 1.2412, "step": 9963 }, { "epoch": 2.96773953349839, "grad_norm": 0.24608944356441498, "learning_rate": 1.64891612960837e-05, "loss": 1.2487, "step": 9964 }, { "epoch": 2.9680373796980586, "grad_norm": 0.24406841397285461, "learning_rate": 1.648842736598504e-05, "loss": 1.2477, "step": 9965 }, { "epoch": 2.968335225897727, "grad_norm": 0.2579127848148346, "learning_rate": 1.6487693375518815e-05, "loss": 1.2541, "step": 9966 }, { "epoch": 2.9686330720973957, "grad_norm": 0.24070177972316742, "learning_rate": 1.648695932469185e-05, "loss": 1.2532, "step": 9967 }, { "epoch": 2.9689309182970645, "grad_norm": 0.24999187886714935, "learning_rate": 1.6486225213510975e-05, "loss": 1.2431, "step": 9968 }, { "epoch": 2.969228764496733, "grad_norm": 0.2467721551656723, "learning_rate": 1.6485491041983027e-05, "loss": 1.2389, "step": 9969 }, { "epoch": 2.9695266106964016, "grad_norm": 0.2413964718580246, "learning_rate": 1.6484756810114825e-05, "loss": 1.2456, "step": 9970 }, { "epoch": 2.9698244568960703, "grad_norm": 0.25925734639167786, "learning_rate": 1.6484022517913207e-05, "loss": 1.2325, "step": 9971 }, { "epoch": 2.970122303095739, "grad_norm": 0.2431621253490448, "learning_rate": 1.6483288165385008e-05, "loss": 1.2285, "step": 9972 }, { "epoch": 2.9704201492954074, "grad_norm": 0.2343003749847412, "learning_rate": 1.648255375253705e-05, "loss": 1.2302, "step": 9973 }, { "epoch": 2.970717995495076, "grad_norm": 0.23708860576152802, "learning_rate": 1.6481819279376176e-05, "loss": 1.2539, "step": 9974 }, { "epoch": 2.971015841694745, "grad_norm": 0.23648568987846375, "learning_rate": 1.6481084745909214e-05, "loss": 1.2507, "step": 9975 }, { "epoch": 2.9713136878944137, "grad_norm": 0.23991839587688446, "learning_rate": 1.6480350152143e-05, "loss": 1.2349, "step": 9976 }, { "epoch": 2.971611534094082, "grad_norm": 0.24438275396823883, "learning_rate": 1.647961549808437e-05, "loss": 1.2539, "step": 9977 }, { "epoch": 2.971909380293751, "grad_norm": 0.24863120913505554, "learning_rate": 1.6478880783740153e-05, "loss": 1.2439, "step": 9978 }, { "epoch": 2.9722072264934196, "grad_norm": 0.23804743587970734, "learning_rate": 1.6478146009117195e-05, "loss": 1.2279, "step": 9979 }, { "epoch": 2.972505072693088, "grad_norm": 0.2433329075574875, "learning_rate": 1.6477411174222323e-05, "loss": 1.2343, "step": 9980 }, { "epoch": 2.9728029188927567, "grad_norm": 0.24504166841506958, "learning_rate": 1.6476676279062376e-05, "loss": 1.2495, "step": 9981 }, { "epoch": 2.9731007650924255, "grad_norm": 0.23672258853912354, "learning_rate": 1.6475941323644192e-05, "loss": 1.248, "step": 9982 }, { "epoch": 2.973398611292094, "grad_norm": 0.24193841218948364, "learning_rate": 1.647520630797461e-05, "loss": 1.2482, "step": 9983 }, { "epoch": 2.9736964574917626, "grad_norm": 0.22066807746887207, "learning_rate": 1.6474471232060468e-05, "loss": 1.2481, "step": 9984 }, { "epoch": 2.9739943036914314, "grad_norm": 0.29701849818229675, "learning_rate": 1.6473736095908605e-05, "loss": 1.2438, "step": 9985 }, { "epoch": 2.9742921498911, "grad_norm": 0.2828117907047272, "learning_rate": 1.6473000899525862e-05, "loss": 1.2446, "step": 9986 }, { "epoch": 2.974589996090769, "grad_norm": 0.26809558272361755, "learning_rate": 1.6472265642919077e-05, "loss": 1.2423, "step": 9987 }, { "epoch": 2.9748878422904372, "grad_norm": 0.28550103306770325, "learning_rate": 1.647153032609509e-05, "loss": 1.2449, "step": 9988 }, { "epoch": 2.975185688490106, "grad_norm": 0.22642813622951508, "learning_rate": 1.6470794949060748e-05, "loss": 1.2275, "step": 9989 }, { "epoch": 2.975483534689775, "grad_norm": 0.2423151582479477, "learning_rate": 1.6470059511822882e-05, "loss": 1.2458, "step": 9990 }, { "epoch": 2.975781380889443, "grad_norm": 0.23246872425079346, "learning_rate": 1.6469324014388343e-05, "loss": 1.2529, "step": 9991 }, { "epoch": 2.976079227089112, "grad_norm": 0.2409932166337967, "learning_rate": 1.6468588456763974e-05, "loss": 1.2341, "step": 9992 }, { "epoch": 2.9763770732887807, "grad_norm": 0.2303859293460846, "learning_rate": 1.6467852838956618e-05, "loss": 1.2365, "step": 9993 }, { "epoch": 2.976674919488449, "grad_norm": 0.2380605936050415, "learning_rate": 1.6467117160973116e-05, "loss": 1.2535, "step": 9994 }, { "epoch": 2.9769727656881178, "grad_norm": 0.24125492572784424, "learning_rate": 1.646638142282031e-05, "loss": 1.2511, "step": 9995 }, { "epoch": 2.9772706118877865, "grad_norm": 0.23002773523330688, "learning_rate": 1.6465645624505056e-05, "loss": 1.2329, "step": 9996 }, { "epoch": 2.977568458087455, "grad_norm": 0.23827829957008362, "learning_rate": 1.6464909766034192e-05, "loss": 1.2401, "step": 9997 }, { "epoch": 2.9778663042871236, "grad_norm": 0.26324141025543213, "learning_rate": 1.6464173847414563e-05, "loss": 1.241, "step": 9998 }, { "epoch": 2.9781641504867924, "grad_norm": 0.28758561611175537, "learning_rate": 1.646343786865302e-05, "loss": 1.2317, "step": 9999 }, { "epoch": 2.978461996686461, "grad_norm": 0.23155654966831207, "learning_rate": 1.646270182975641e-05, "loss": 1.2442, "step": 10000 }, { "epoch": 2.978461996686461, "eval_loss": 1.3382065296173096, "eval_runtime": 21.0972, "eval_samples_per_second": 82.191, "eval_steps_per_second": 5.167, "step": 10000 }, { "epoch": 2.97875984288613, "grad_norm": 0.4102727770805359, "learning_rate": 1.6461965730731577e-05, "loss": 1.2258, "step": 10001 }, { "epoch": 2.9790576890857983, "grad_norm": 0.30318817496299744, "learning_rate": 1.646122957158537e-05, "loss": 1.2344, "step": 10002 }, { "epoch": 2.979355535285467, "grad_norm": 0.3035779893398285, "learning_rate": 1.6460493352324652e-05, "loss": 1.2306, "step": 10003 }, { "epoch": 2.979653381485136, "grad_norm": 0.24000269174575806, "learning_rate": 1.6459757072956252e-05, "loss": 1.2533, "step": 10004 }, { "epoch": 2.979951227684804, "grad_norm": 0.25043413043022156, "learning_rate": 1.6459020733487033e-05, "loss": 1.244, "step": 10005 }, { "epoch": 2.980249073884473, "grad_norm": 0.29275935888290405, "learning_rate": 1.645828433392384e-05, "loss": 1.2406, "step": 10006 }, { "epoch": 2.9805469200841417, "grad_norm": 0.2531569302082062, "learning_rate": 1.645754787427353e-05, "loss": 1.2479, "step": 10007 }, { "epoch": 2.98084476628381, "grad_norm": 0.24172085523605347, "learning_rate": 1.6456811354542948e-05, "loss": 1.2395, "step": 10008 }, { "epoch": 2.981142612483479, "grad_norm": 0.23907482624053955, "learning_rate": 1.6456074774738955e-05, "loss": 1.2268, "step": 10009 }, { "epoch": 2.9814404586831476, "grad_norm": 0.251799076795578, "learning_rate": 1.6455338134868396e-05, "loss": 1.2493, "step": 10010 }, { "epoch": 2.981738304882816, "grad_norm": 0.2671225666999817, "learning_rate": 1.645460143493813e-05, "loss": 1.2318, "step": 10011 }, { "epoch": 2.9820361510824847, "grad_norm": 0.2631953954696655, "learning_rate": 1.645386467495501e-05, "loss": 1.2437, "step": 10012 }, { "epoch": 2.9823339972821534, "grad_norm": 0.22674435377120972, "learning_rate": 1.645312785492589e-05, "loss": 1.2618, "step": 10013 }, { "epoch": 2.982631843481822, "grad_norm": 0.23676465451717377, "learning_rate": 1.645239097485762e-05, "loss": 1.2477, "step": 10014 }, { "epoch": 2.982929689681491, "grad_norm": 0.23759299516677856, "learning_rate": 1.6451654034757066e-05, "loss": 1.2551, "step": 10015 }, { "epoch": 2.9832275358811593, "grad_norm": 0.25936782360076904, "learning_rate": 1.6450917034631076e-05, "loss": 1.2643, "step": 10016 }, { "epoch": 2.983525382080828, "grad_norm": 0.23249895870685577, "learning_rate": 1.6450179974486516e-05, "loss": 1.252, "step": 10017 }, { "epoch": 2.983823228280497, "grad_norm": 0.2421620935201645, "learning_rate": 1.6449442854330235e-05, "loss": 1.2437, "step": 10018 }, { "epoch": 2.984121074480165, "grad_norm": 0.22764573991298676, "learning_rate": 1.644870567416909e-05, "loss": 1.2379, "step": 10019 }, { "epoch": 2.984418920679834, "grad_norm": 0.25051426887512207, "learning_rate": 1.644796843400995e-05, "loss": 1.2374, "step": 10020 }, { "epoch": 2.9847167668795027, "grad_norm": 0.2495751678943634, "learning_rate": 1.644723113385966e-05, "loss": 1.2337, "step": 10021 }, { "epoch": 2.985014613079171, "grad_norm": 0.24617891013622284, "learning_rate": 1.6446493773725095e-05, "loss": 1.2517, "step": 10022 }, { "epoch": 2.98531245927884, "grad_norm": 0.2377837598323822, "learning_rate": 1.6445756353613105e-05, "loss": 1.264, "step": 10023 }, { "epoch": 2.9856103054785086, "grad_norm": 0.23834186792373657, "learning_rate": 1.6445018873530552e-05, "loss": 1.2531, "step": 10024 }, { "epoch": 2.985908151678177, "grad_norm": 0.23706509172916412, "learning_rate": 1.64442813334843e-05, "loss": 1.2448, "step": 10025 }, { "epoch": 2.9862059978778457, "grad_norm": 0.25559601187705994, "learning_rate": 1.644354373348121e-05, "loss": 1.2515, "step": 10026 }, { "epoch": 2.9865038440775145, "grad_norm": 0.23977580666542053, "learning_rate": 1.644280607352815e-05, "loss": 1.2304, "step": 10027 }, { "epoch": 2.9868016902771832, "grad_norm": 0.22511330246925354, "learning_rate": 1.644206835363197e-05, "loss": 1.2362, "step": 10028 }, { "epoch": 2.987099536476852, "grad_norm": 0.2313072681427002, "learning_rate": 1.6441330573799546e-05, "loss": 1.2339, "step": 10029 }, { "epoch": 2.9873973826765203, "grad_norm": 0.23325109481811523, "learning_rate": 1.644059273403774e-05, "loss": 1.2507, "step": 10030 }, { "epoch": 2.987695228876189, "grad_norm": 0.23916980624198914, "learning_rate": 1.6439854834353412e-05, "loss": 1.2473, "step": 10031 }, { "epoch": 2.987993075075858, "grad_norm": 0.23768900334835052, "learning_rate": 1.6439116874753426e-05, "loss": 1.2488, "step": 10032 }, { "epoch": 2.988290921275526, "grad_norm": 0.2517053186893463, "learning_rate": 1.6438378855244655e-05, "loss": 1.2495, "step": 10033 }, { "epoch": 2.988588767475195, "grad_norm": 0.23658375442028046, "learning_rate": 1.6437640775833963e-05, "loss": 1.2394, "step": 10034 }, { "epoch": 2.9888866136748637, "grad_norm": 0.23645053803920746, "learning_rate": 1.6436902636528215e-05, "loss": 1.2206, "step": 10035 }, { "epoch": 2.989184459874532, "grad_norm": 0.2451772540807724, "learning_rate": 1.643616443733428e-05, "loss": 1.2519, "step": 10036 }, { "epoch": 2.989482306074201, "grad_norm": 0.22844362258911133, "learning_rate": 1.6435426178259025e-05, "loss": 1.2361, "step": 10037 }, { "epoch": 2.9897801522738696, "grad_norm": 0.24487826228141785, "learning_rate": 1.643468785930932e-05, "loss": 1.2463, "step": 10038 }, { "epoch": 2.9900779984735384, "grad_norm": 0.24516311287879944, "learning_rate": 1.6433949480492032e-05, "loss": 1.2239, "step": 10039 }, { "epoch": 2.9903758446732067, "grad_norm": 0.2422989457845688, "learning_rate": 1.6433211041814036e-05, "loss": 1.2561, "step": 10040 }, { "epoch": 2.9906736908728755, "grad_norm": 0.22691424190998077, "learning_rate": 1.6432472543282195e-05, "loss": 1.2112, "step": 10041 }, { "epoch": 2.9909715370725443, "grad_norm": 0.22487223148345947, "learning_rate": 1.6431733984903386e-05, "loss": 1.2274, "step": 10042 }, { "epoch": 2.991269383272213, "grad_norm": 0.22842620313167572, "learning_rate": 1.6430995366684478e-05, "loss": 1.23, "step": 10043 }, { "epoch": 2.9915672294718814, "grad_norm": 0.2280990481376648, "learning_rate": 1.6430256688632345e-05, "loss": 1.2465, "step": 10044 }, { "epoch": 2.99186507567155, "grad_norm": 0.24010218679904938, "learning_rate": 1.642951795075386e-05, "loss": 1.2387, "step": 10045 }, { "epoch": 2.992162921871219, "grad_norm": 0.23180556297302246, "learning_rate": 1.6428779153055886e-05, "loss": 1.2317, "step": 10046 }, { "epoch": 2.9924607680708872, "grad_norm": 0.24207353591918945, "learning_rate": 1.6428040295545308e-05, "loss": 1.245, "step": 10047 }, { "epoch": 2.992758614270556, "grad_norm": 0.23159845173358917, "learning_rate": 1.6427301378229e-05, "loss": 1.2436, "step": 10048 }, { "epoch": 2.9930564604702248, "grad_norm": 0.24207107722759247, "learning_rate": 1.642656240111383e-05, "loss": 1.2449, "step": 10049 }, { "epoch": 2.993354306669893, "grad_norm": 0.22876659035682678, "learning_rate": 1.642582336420668e-05, "loss": 1.2268, "step": 10050 }, { "epoch": 2.993652152869562, "grad_norm": 0.23025888204574585, "learning_rate": 1.642508426751442e-05, "loss": 1.2499, "step": 10051 }, { "epoch": 2.9939499990692306, "grad_norm": 0.2378217875957489, "learning_rate": 1.642434511104393e-05, "loss": 1.2434, "step": 10052 }, { "epoch": 2.9942478452688994, "grad_norm": 0.24848827719688416, "learning_rate": 1.642360589480209e-05, "loss": 1.2405, "step": 10053 }, { "epoch": 2.994545691468568, "grad_norm": 0.26267918944358826, "learning_rate": 1.642286661879577e-05, "loss": 1.2495, "step": 10054 }, { "epoch": 2.9948435376682365, "grad_norm": 0.22974130511283875, "learning_rate": 1.642212728303185e-05, "loss": 1.2466, "step": 10055 }, { "epoch": 2.9951413838679053, "grad_norm": 0.39148205518722534, "learning_rate": 1.6421387887517215e-05, "loss": 1.2371, "step": 10056 }, { "epoch": 2.995439230067574, "grad_norm": 0.2892146706581116, "learning_rate": 1.6420648432258743e-05, "loss": 1.2605, "step": 10057 }, { "epoch": 2.9957370762672424, "grad_norm": 0.284047394990921, "learning_rate": 1.6419908917263305e-05, "loss": 1.2328, "step": 10058 }, { "epoch": 2.996034922466911, "grad_norm": 0.23381011188030243, "learning_rate": 1.641916934253779e-05, "loss": 1.2447, "step": 10059 }, { "epoch": 2.99633276866658, "grad_norm": 0.34840264916419983, "learning_rate": 1.6418429708089076e-05, "loss": 1.2442, "step": 10060 }, { "epoch": 2.9966306148662483, "grad_norm": 0.2386302649974823, "learning_rate": 1.6417690013924046e-05, "loss": 1.242, "step": 10061 }, { "epoch": 2.996928461065917, "grad_norm": 0.26766037940979004, "learning_rate": 1.641695026004958e-05, "loss": 1.2283, "step": 10062 }, { "epoch": 2.997226307265586, "grad_norm": 0.23906347155570984, "learning_rate": 1.6416210446472555e-05, "loss": 1.2335, "step": 10063 }, { "epoch": 2.997524153465254, "grad_norm": 0.26001253724098206, "learning_rate": 1.6415470573199867e-05, "loss": 1.2677, "step": 10064 }, { "epoch": 2.997821999664923, "grad_norm": 0.24827906489372253, "learning_rate": 1.641473064023839e-05, "loss": 1.2384, "step": 10065 }, { "epoch": 2.9981198458645917, "grad_norm": 0.24536694586277008, "learning_rate": 1.6413990647595016e-05, "loss": 1.2615, "step": 10066 }, { "epoch": 2.9984176920642605, "grad_norm": 0.2591598927974701, "learning_rate": 1.6413250595276623e-05, "loss": 1.2352, "step": 10067 }, { "epoch": 2.9987155382639292, "grad_norm": 0.25052574276924133, "learning_rate": 1.6412510483290098e-05, "loss": 1.2383, "step": 10068 }, { "epoch": 2.9990133844635976, "grad_norm": 0.24174103140830994, "learning_rate": 1.6411770311642326e-05, "loss": 1.2475, "step": 10069 }, { "epoch": 2.9993112306632663, "grad_norm": 0.23268310725688934, "learning_rate": 1.6411030080340195e-05, "loss": 1.2196, "step": 10070 }, { "epoch": 2.999609076862935, "grad_norm": 0.2571922540664673, "learning_rate": 1.6410289789390598e-05, "loss": 1.2401, "step": 10071 }, { "epoch": 2.9999069230626034, "grad_norm": 0.24435901641845703, "learning_rate": 1.6409549438800407e-05, "loss": 1.238, "step": 10072 }, { "epoch": 3.000204769262272, "grad_norm": 0.38965585827827454, "learning_rate": 1.6408809028576526e-05, "loss": 1.2529, "step": 10073 }, { "epoch": 3.000502615461941, "grad_norm": 0.32282426953315735, "learning_rate": 1.6408068558725835e-05, "loss": 1.233, "step": 10074 }, { "epoch": 3.0008004616616093, "grad_norm": 0.28637224435806274, "learning_rate": 1.6407328029255225e-05, "loss": 1.2493, "step": 10075 }, { "epoch": 3.001098307861278, "grad_norm": 0.24057908356189728, "learning_rate": 1.640658744017159e-05, "loss": 1.2342, "step": 10076 }, { "epoch": 3.001396154060947, "grad_norm": 0.43580493330955505, "learning_rate": 1.6405846791481813e-05, "loss": 1.2304, "step": 10077 }, { "epoch": 3.0016940002606156, "grad_norm": 0.2577560544013977, "learning_rate": 1.640510608319279e-05, "loss": 1.225, "step": 10078 }, { "epoch": 3.001991846460284, "grad_norm": 0.2544635534286499, "learning_rate": 1.6404365315311412e-05, "loss": 1.2572, "step": 10079 }, { "epoch": 3.0022896926599527, "grad_norm": 0.24499472975730896, "learning_rate": 1.640362448784457e-05, "loss": 1.2353, "step": 10080 }, { "epoch": 3.0025875388596215, "grad_norm": 0.231914684176445, "learning_rate": 1.6402883600799153e-05, "loss": 1.2213, "step": 10081 }, { "epoch": 3.00288538505929, "grad_norm": 0.2565773129463196, "learning_rate": 1.640214265418206e-05, "loss": 1.2523, "step": 10082 }, { "epoch": 3.0031832312589586, "grad_norm": 0.2541150450706482, "learning_rate": 1.6401401648000182e-05, "loss": 1.2475, "step": 10083 }, { "epoch": 3.0034810774586274, "grad_norm": 0.2428370863199234, "learning_rate": 1.640066058226042e-05, "loss": 1.2408, "step": 10084 }, { "epoch": 3.003778923658296, "grad_norm": 0.22767463326454163, "learning_rate": 1.6399919456969654e-05, "loss": 1.2381, "step": 10085 }, { "epoch": 3.0040767698579645, "grad_norm": 0.24175630509853363, "learning_rate": 1.6399178272134793e-05, "loss": 1.2309, "step": 10086 }, { "epoch": 3.0043746160576332, "grad_norm": 0.233686164021492, "learning_rate": 1.6398437027762725e-05, "loss": 1.2292, "step": 10087 }, { "epoch": 3.004672462257302, "grad_norm": 0.236064612865448, "learning_rate": 1.639769572386035e-05, "loss": 1.232, "step": 10088 }, { "epoch": 3.0049703084569703, "grad_norm": 0.23852574825286865, "learning_rate": 1.6396954360434566e-05, "loss": 1.2288, "step": 10089 }, { "epoch": 3.005268154656639, "grad_norm": 0.23065021634101868, "learning_rate": 1.6396212937492265e-05, "loss": 1.2365, "step": 10090 }, { "epoch": 3.005566000856308, "grad_norm": 0.23177464306354523, "learning_rate": 1.639547145504035e-05, "loss": 1.2563, "step": 10091 }, { "epoch": 3.0058638470559766, "grad_norm": 0.24095739424228668, "learning_rate": 1.6394729913085722e-05, "loss": 1.227, "step": 10092 }, { "epoch": 3.006161693255645, "grad_norm": 0.2505260705947876, "learning_rate": 1.6393988311635273e-05, "loss": 1.2523, "step": 10093 }, { "epoch": 3.0064595394553137, "grad_norm": 0.2472001165151596, "learning_rate": 1.6393246650695907e-05, "loss": 1.2516, "step": 10094 }, { "epoch": 3.0067573856549825, "grad_norm": 0.23386529088020325, "learning_rate": 1.6392504930274528e-05, "loss": 1.2361, "step": 10095 }, { "epoch": 3.007055231854651, "grad_norm": 0.23205740749835968, "learning_rate": 1.639176315037803e-05, "loss": 1.244, "step": 10096 }, { "epoch": 3.0073530780543196, "grad_norm": 0.24119286239147186, "learning_rate": 1.6391021311013314e-05, "loss": 1.2468, "step": 10097 }, { "epoch": 3.0076509242539884, "grad_norm": 0.24198414385318756, "learning_rate": 1.639027941218729e-05, "loss": 1.2408, "step": 10098 }, { "epoch": 3.007948770453657, "grad_norm": 0.2378912717103958, "learning_rate": 1.6389537453906854e-05, "loss": 1.2351, "step": 10099 }, { "epoch": 3.0082466166533255, "grad_norm": 0.22698108851909637, "learning_rate": 1.638879543617891e-05, "loss": 1.229, "step": 10100 }, { "epoch": 3.0085444628529943, "grad_norm": 0.23869843780994415, "learning_rate": 1.6388053359010362e-05, "loss": 1.2456, "step": 10101 }, { "epoch": 3.008842309052663, "grad_norm": 0.22983010113239288, "learning_rate": 1.638731122240812e-05, "loss": 1.2401, "step": 10102 }, { "epoch": 3.0091401552523314, "grad_norm": 0.23344950377941132, "learning_rate": 1.6386569026379074e-05, "loss": 1.2348, "step": 10103 }, { "epoch": 3.009438001452, "grad_norm": 0.24137677252292633, "learning_rate": 1.6385826770930147e-05, "loss": 1.2311, "step": 10104 }, { "epoch": 3.009735847651669, "grad_norm": 0.2299661934375763, "learning_rate": 1.638508445606823e-05, "loss": 1.2294, "step": 10105 }, { "epoch": 3.0100336938513377, "grad_norm": 0.24330636858940125, "learning_rate": 1.638434208180024e-05, "loss": 1.2447, "step": 10106 }, { "epoch": 3.010331540051006, "grad_norm": 0.23655346035957336, "learning_rate": 1.6383599648133078e-05, "loss": 1.2402, "step": 10107 }, { "epoch": 3.0106293862506748, "grad_norm": 0.24257723987102509, "learning_rate": 1.6382857155073658e-05, "loss": 1.233, "step": 10108 }, { "epoch": 3.0109272324503435, "grad_norm": 0.23627245426177979, "learning_rate": 1.638211460262888e-05, "loss": 1.2493, "step": 10109 }, { "epoch": 3.0112250786500123, "grad_norm": 0.25192874670028687, "learning_rate": 1.6381371990805656e-05, "loss": 1.2466, "step": 10110 }, { "epoch": 3.0115229248496806, "grad_norm": 0.24103054404258728, "learning_rate": 1.6380629319610894e-05, "loss": 1.2347, "step": 10111 }, { "epoch": 3.0118207710493494, "grad_norm": 0.24143441021442413, "learning_rate": 1.6379886589051506e-05, "loss": 1.2557, "step": 10112 }, { "epoch": 3.012118617249018, "grad_norm": 0.2334558069705963, "learning_rate": 1.63791437991344e-05, "loss": 1.2257, "step": 10113 }, { "epoch": 3.0124164634486865, "grad_norm": 0.23910120129585266, "learning_rate": 1.6378400949866493e-05, "loss": 1.2467, "step": 10114 }, { "epoch": 3.0127143096483553, "grad_norm": 0.24178540706634521, "learning_rate": 1.6377658041254688e-05, "loss": 1.2319, "step": 10115 }, { "epoch": 3.013012155848024, "grad_norm": 0.2362511307001114, "learning_rate": 1.6376915073305904e-05, "loss": 1.237, "step": 10116 }, { "epoch": 3.013310002047693, "grad_norm": 0.23667585849761963, "learning_rate": 1.6376172046027043e-05, "loss": 1.2372, "step": 10117 }, { "epoch": 3.013607848247361, "grad_norm": 0.2358350157737732, "learning_rate": 1.637542895942503e-05, "loss": 1.2308, "step": 10118 }, { "epoch": 3.01390569444703, "grad_norm": 0.2408997267484665, "learning_rate": 1.6374685813506773e-05, "loss": 1.2306, "step": 10119 }, { "epoch": 3.0142035406466987, "grad_norm": 0.23724953830242157, "learning_rate": 1.637394260827919e-05, "loss": 1.2438, "step": 10120 }, { "epoch": 3.014501386846367, "grad_norm": 0.2508860230445862, "learning_rate": 1.6373199343749187e-05, "loss": 1.2254, "step": 10121 }, { "epoch": 3.014799233046036, "grad_norm": 0.24599908292293549, "learning_rate": 1.6372456019923685e-05, "loss": 1.2498, "step": 10122 }, { "epoch": 3.0150970792457046, "grad_norm": 0.24873362481594086, "learning_rate": 1.6371712636809603e-05, "loss": 1.2336, "step": 10123 }, { "epoch": 3.0153949254453734, "grad_norm": 0.24091993272304535, "learning_rate": 1.637096919441385e-05, "loss": 1.2315, "step": 10124 }, { "epoch": 3.0156927716450417, "grad_norm": 0.247350811958313, "learning_rate": 1.6370225692743348e-05, "loss": 1.2399, "step": 10125 }, { "epoch": 3.0159906178447105, "grad_norm": 0.2587589621543884, "learning_rate": 1.6369482131805015e-05, "loss": 1.2417, "step": 10126 }, { "epoch": 3.0162884640443792, "grad_norm": 0.24647526443004608, "learning_rate": 1.6368738511605764e-05, "loss": 1.2353, "step": 10127 }, { "epoch": 3.0165863102440476, "grad_norm": 0.23632420599460602, "learning_rate": 1.636799483215252e-05, "loss": 1.2449, "step": 10128 }, { "epoch": 3.0168841564437163, "grad_norm": 0.24009020626544952, "learning_rate": 1.6367251093452197e-05, "loss": 1.2358, "step": 10129 }, { "epoch": 3.017182002643385, "grad_norm": 0.2408895343542099, "learning_rate": 1.6366507295511715e-05, "loss": 1.2273, "step": 10130 }, { "epoch": 3.017479848843054, "grad_norm": 0.2386654168367386, "learning_rate": 1.6365763438338e-05, "loss": 1.2413, "step": 10131 }, { "epoch": 3.017777695042722, "grad_norm": 0.23126019537448883, "learning_rate": 1.6365019521937964e-05, "loss": 1.2347, "step": 10132 }, { "epoch": 3.018075541242391, "grad_norm": 0.23686255514621735, "learning_rate": 1.6364275546318535e-05, "loss": 1.2486, "step": 10133 }, { "epoch": 3.0183733874420597, "grad_norm": 0.23731625080108643, "learning_rate": 1.6363531511486634e-05, "loss": 1.2391, "step": 10134 }, { "epoch": 3.018671233641728, "grad_norm": 0.2382894605398178, "learning_rate": 1.636278741744918e-05, "loss": 1.245, "step": 10135 }, { "epoch": 3.018969079841397, "grad_norm": 0.23662148416042328, "learning_rate": 1.6362043264213098e-05, "loss": 1.2647, "step": 10136 }, { "epoch": 3.0192669260410656, "grad_norm": 0.23939202725887299, "learning_rate": 1.636129905178531e-05, "loss": 1.2407, "step": 10137 }, { "epoch": 3.0195647722407344, "grad_norm": 0.2415553778409958, "learning_rate": 1.6360554780172745e-05, "loss": 1.2289, "step": 10138 }, { "epoch": 3.0198626184404027, "grad_norm": 0.25028204917907715, "learning_rate": 1.635981044938232e-05, "loss": 1.2224, "step": 10139 }, { "epoch": 3.0201604646400715, "grad_norm": 0.26298055052757263, "learning_rate": 1.6359066059420968e-05, "loss": 1.2614, "step": 10140 }, { "epoch": 3.0204583108397403, "grad_norm": 0.24997131526470184, "learning_rate": 1.635832161029561e-05, "loss": 1.2273, "step": 10141 }, { "epoch": 3.0207561570394086, "grad_norm": 0.28070592880249023, "learning_rate": 1.6357577102013173e-05, "loss": 1.2347, "step": 10142 }, { "epoch": 3.0210540032390774, "grad_norm": 0.2561105191707611, "learning_rate": 1.6356832534580585e-05, "loss": 1.2505, "step": 10143 }, { "epoch": 3.021351849438746, "grad_norm": 0.2369934618473053, "learning_rate": 1.6356087908004773e-05, "loss": 1.2311, "step": 10144 }, { "epoch": 3.021649695638415, "grad_norm": 0.2471408247947693, "learning_rate": 1.6355343222292664e-05, "loss": 1.2433, "step": 10145 }, { "epoch": 3.0219475418380832, "grad_norm": 0.2605624496936798, "learning_rate": 1.6354598477451187e-05, "loss": 1.2439, "step": 10146 }, { "epoch": 3.022245388037752, "grad_norm": 0.2423917055130005, "learning_rate": 1.635385367348727e-05, "loss": 1.2386, "step": 10147 }, { "epoch": 3.0225432342374208, "grad_norm": 0.2808777689933777, "learning_rate": 1.6353108810407845e-05, "loss": 1.2583, "step": 10148 }, { "epoch": 3.022841080437089, "grad_norm": 0.2584322988986969, "learning_rate": 1.6352363888219838e-05, "loss": 1.2342, "step": 10149 }, { "epoch": 3.023138926636758, "grad_norm": 0.25437480211257935, "learning_rate": 1.6351618906930188e-05, "loss": 1.2257, "step": 10150 }, { "epoch": 3.0234367728364266, "grad_norm": 0.3657657504081726, "learning_rate": 1.6350873866545814e-05, "loss": 1.2374, "step": 10151 }, { "epoch": 3.0237346190360954, "grad_norm": 0.2797352075576782, "learning_rate": 1.6350128767073655e-05, "loss": 1.2313, "step": 10152 }, { "epoch": 3.0240324652357637, "grad_norm": 0.30090221762657166, "learning_rate": 1.6349383608520646e-05, "loss": 1.2509, "step": 10153 }, { "epoch": 3.0243303114354325, "grad_norm": 0.30294233560562134, "learning_rate": 1.6348638390893717e-05, "loss": 1.2392, "step": 10154 }, { "epoch": 3.0246281576351013, "grad_norm": 0.2502883970737457, "learning_rate": 1.6347893114199795e-05, "loss": 1.2271, "step": 10155 }, { "epoch": 3.0249260038347696, "grad_norm": 0.23533587157726288, "learning_rate": 1.6347147778445823e-05, "loss": 1.2297, "step": 10156 }, { "epoch": 3.0252238500344384, "grad_norm": 0.25665155053138733, "learning_rate": 1.6346402383638734e-05, "loss": 1.2412, "step": 10157 }, { "epoch": 3.025521696234107, "grad_norm": 0.25076520442962646, "learning_rate": 1.6345656929785462e-05, "loss": 1.2345, "step": 10158 }, { "epoch": 3.025819542433776, "grad_norm": 0.2445412129163742, "learning_rate": 1.634491141689294e-05, "loss": 1.2363, "step": 10159 }, { "epoch": 3.0261173886334443, "grad_norm": 0.2960551679134369, "learning_rate": 1.6344165844968104e-05, "loss": 1.2396, "step": 10160 }, { "epoch": 3.026415234833113, "grad_norm": 0.2761836349964142, "learning_rate": 1.63434202140179e-05, "loss": 1.2329, "step": 10161 }, { "epoch": 3.026713081032782, "grad_norm": 0.28803741931915283, "learning_rate": 1.634267452404925e-05, "loss": 1.2384, "step": 10162 }, { "epoch": 3.02701092723245, "grad_norm": 0.2996866703033447, "learning_rate": 1.6341928775069106e-05, "loss": 1.2492, "step": 10163 }, { "epoch": 3.027308773432119, "grad_norm": 0.2283649444580078, "learning_rate": 1.6341182967084397e-05, "loss": 1.243, "step": 10164 }, { "epoch": 3.0276066196317877, "grad_norm": 0.24121533334255219, "learning_rate": 1.6340437100102067e-05, "loss": 1.2613, "step": 10165 }, { "epoch": 3.0279044658314564, "grad_norm": 0.23092056810855865, "learning_rate": 1.6339691174129053e-05, "loss": 1.2385, "step": 10166 }, { "epoch": 3.0282023120311248, "grad_norm": 0.25815844535827637, "learning_rate": 1.6338945189172297e-05, "loss": 1.2383, "step": 10167 }, { "epoch": 3.0285001582307935, "grad_norm": 0.2433355301618576, "learning_rate": 1.6338199145238737e-05, "loss": 1.2436, "step": 10168 }, { "epoch": 3.0287980044304623, "grad_norm": 0.24668675661087036, "learning_rate": 1.6337453042335315e-05, "loss": 1.2438, "step": 10169 }, { "epoch": 3.0290958506301306, "grad_norm": 0.28404033184051514, "learning_rate": 1.6336706880468972e-05, "loss": 1.2417, "step": 10170 }, { "epoch": 3.0293936968297994, "grad_norm": 0.2578742206096649, "learning_rate": 1.6335960659646655e-05, "loss": 1.2413, "step": 10171 }, { "epoch": 3.029691543029468, "grad_norm": 0.23545892536640167, "learning_rate": 1.63352143798753e-05, "loss": 1.2427, "step": 10172 }, { "epoch": 3.029989389229137, "grad_norm": 0.2703379988670349, "learning_rate": 1.6334468041161854e-05, "loss": 1.2585, "step": 10173 }, { "epoch": 3.0302872354288053, "grad_norm": 0.24328207969665527, "learning_rate": 1.633372164351326e-05, "loss": 1.2272, "step": 10174 }, { "epoch": 3.030585081628474, "grad_norm": 0.252121239900589, "learning_rate": 1.6332975186936464e-05, "loss": 1.2368, "step": 10175 }, { "epoch": 3.030882927828143, "grad_norm": 0.23458684980869293, "learning_rate": 1.633222867143841e-05, "loss": 1.2332, "step": 10176 }, { "epoch": 3.0311807740278116, "grad_norm": 0.26045963168144226, "learning_rate": 1.633148209702604e-05, "loss": 1.2394, "step": 10177 }, { "epoch": 3.03147862022748, "grad_norm": 0.2555118203163147, "learning_rate": 1.6330735463706305e-05, "loss": 1.2362, "step": 10178 }, { "epoch": 3.0317764664271487, "grad_norm": 0.23761750757694244, "learning_rate": 1.6329988771486148e-05, "loss": 1.2389, "step": 10179 }, { "epoch": 3.0320743126268175, "grad_norm": 0.24389159679412842, "learning_rate": 1.632924202037252e-05, "loss": 1.2347, "step": 10180 }, { "epoch": 3.032372158826486, "grad_norm": 0.24950288236141205, "learning_rate": 1.6328495210372363e-05, "loss": 1.2641, "step": 10181 }, { "epoch": 3.0326700050261546, "grad_norm": 0.24158208072185516, "learning_rate": 1.6327748341492633e-05, "loss": 1.247, "step": 10182 }, { "epoch": 3.0329678512258234, "grad_norm": 0.47347182035446167, "learning_rate": 1.632700141374027e-05, "loss": 1.2471, "step": 10183 }, { "epoch": 3.033265697425492, "grad_norm": 0.41325369477272034, "learning_rate": 1.6326254427122236e-05, "loss": 1.2486, "step": 10184 }, { "epoch": 3.0335635436251605, "grad_norm": 0.32898035645484924, "learning_rate": 1.6325507381645464e-05, "loss": 1.2249, "step": 10185 }, { "epoch": 3.0338613898248292, "grad_norm": 0.27398014068603516, "learning_rate": 1.6324760277316917e-05, "loss": 1.2395, "step": 10186 }, { "epoch": 3.034159236024498, "grad_norm": 0.3908424377441406, "learning_rate": 1.632401311414354e-05, "loss": 1.2287, "step": 10187 }, { "epoch": 3.0344570822241663, "grad_norm": 0.24559953808784485, "learning_rate": 1.632326589213229e-05, "loss": 1.2402, "step": 10188 }, { "epoch": 3.034754928423835, "grad_norm": 0.25984713435173035, "learning_rate": 1.632251861129011e-05, "loss": 1.2438, "step": 10189 }, { "epoch": 3.035052774623504, "grad_norm": 0.25673434138298035, "learning_rate": 1.632177127162396e-05, "loss": 1.2219, "step": 10190 }, { "epoch": 3.0353506208231726, "grad_norm": 0.23939383029937744, "learning_rate": 1.6321023873140798e-05, "loss": 1.2314, "step": 10191 }, { "epoch": 3.035648467022841, "grad_norm": 0.24382399022579193, "learning_rate": 1.6320276415847564e-05, "loss": 1.2501, "step": 10192 }, { "epoch": 3.0359463132225097, "grad_norm": 0.23208534717559814, "learning_rate": 1.631952889975122e-05, "loss": 1.2307, "step": 10193 }, { "epoch": 3.0362441594221785, "grad_norm": 0.24375663697719574, "learning_rate": 1.6318781324858723e-05, "loss": 1.235, "step": 10194 }, { "epoch": 3.036542005621847, "grad_norm": 0.2524307370185852, "learning_rate": 1.6318033691177024e-05, "loss": 1.2519, "step": 10195 }, { "epoch": 3.0368398518215156, "grad_norm": 0.25700730085372925, "learning_rate": 1.631728599871308e-05, "loss": 1.2216, "step": 10196 }, { "epoch": 3.0371376980211844, "grad_norm": 0.23987676203250885, "learning_rate": 1.631653824747385e-05, "loss": 1.2316, "step": 10197 }, { "epoch": 3.037435544220853, "grad_norm": 0.23080593347549438, "learning_rate": 1.6315790437466286e-05, "loss": 1.2343, "step": 10198 }, { "epoch": 3.0377333904205215, "grad_norm": 0.24482198059558868, "learning_rate": 1.631504256869735e-05, "loss": 1.2266, "step": 10199 }, { "epoch": 3.0380312366201903, "grad_norm": 0.2541637122631073, "learning_rate": 1.6314294641174e-05, "loss": 1.2489, "step": 10200 }, { "epoch": 3.038329082819859, "grad_norm": 0.23120158910751343, "learning_rate": 1.631354665490319e-05, "loss": 1.243, "step": 10201 }, { "epoch": 3.0386269290195274, "grad_norm": 0.2334720492362976, "learning_rate": 1.6312798609891883e-05, "loss": 1.2342, "step": 10202 }, { "epoch": 3.038924775219196, "grad_norm": 0.24195466935634613, "learning_rate": 1.631205050614704e-05, "loss": 1.2298, "step": 10203 }, { "epoch": 3.039222621418865, "grad_norm": 0.24499407410621643, "learning_rate": 1.6311302343675615e-05, "loss": 1.2444, "step": 10204 }, { "epoch": 3.0395204676185337, "grad_norm": 0.2399866133928299, "learning_rate": 1.631055412248458e-05, "loss": 1.2526, "step": 10205 }, { "epoch": 3.039818313818202, "grad_norm": 0.2301747053861618, "learning_rate": 1.6309805842580882e-05, "loss": 1.2284, "step": 10206 }, { "epoch": 3.0401161600178708, "grad_norm": 0.2331252545118332, "learning_rate": 1.6309057503971497e-05, "loss": 1.211, "step": 10207 }, { "epoch": 3.0404140062175395, "grad_norm": 0.23599116504192352, "learning_rate": 1.6308309106663375e-05, "loss": 1.2338, "step": 10208 }, { "epoch": 3.040711852417208, "grad_norm": 0.23467372357845306, "learning_rate": 1.6307560650663487e-05, "loss": 1.228, "step": 10209 }, { "epoch": 3.0410096986168766, "grad_norm": 0.2402782142162323, "learning_rate": 1.6306812135978794e-05, "loss": 1.2308, "step": 10210 }, { "epoch": 3.0413075448165454, "grad_norm": 0.24587568640708923, "learning_rate": 1.6306063562616263e-05, "loss": 1.2382, "step": 10211 }, { "epoch": 3.041605391016214, "grad_norm": 0.22987627983093262, "learning_rate": 1.6305314930582857e-05, "loss": 1.2438, "step": 10212 }, { "epoch": 3.0419032372158825, "grad_norm": 0.23999477922916412, "learning_rate": 1.6304566239885535e-05, "loss": 1.2467, "step": 10213 }, { "epoch": 3.0422010834155513, "grad_norm": 0.23999232053756714, "learning_rate": 1.6303817490531272e-05, "loss": 1.251, "step": 10214 }, { "epoch": 3.04249892961522, "grad_norm": 0.2424769252538681, "learning_rate": 1.630306868252703e-05, "loss": 1.2478, "step": 10215 }, { "epoch": 3.0427967758148884, "grad_norm": 0.22676314413547516, "learning_rate": 1.6302319815879773e-05, "loss": 1.2393, "step": 10216 }, { "epoch": 3.043094622014557, "grad_norm": 0.23452128469944, "learning_rate": 1.6301570890596473e-05, "loss": 1.2498, "step": 10217 }, { "epoch": 3.043392468214226, "grad_norm": 0.23384414613246918, "learning_rate": 1.63008219066841e-05, "loss": 1.2365, "step": 10218 }, { "epoch": 3.0436903144138947, "grad_norm": 0.24246850609779358, "learning_rate": 1.6300072864149613e-05, "loss": 1.2443, "step": 10219 }, { "epoch": 3.043988160613563, "grad_norm": 0.2528890371322632, "learning_rate": 1.629932376299999e-05, "loss": 1.2387, "step": 10220 }, { "epoch": 3.044286006813232, "grad_norm": 0.24717077612876892, "learning_rate": 1.62985746032422e-05, "loss": 1.253, "step": 10221 }, { "epoch": 3.0445838530129006, "grad_norm": 0.25595220923423767, "learning_rate": 1.6297825384883206e-05, "loss": 1.2348, "step": 10222 }, { "epoch": 3.044881699212569, "grad_norm": 0.22884708642959595, "learning_rate": 1.6297076107929983e-05, "loss": 1.2235, "step": 10223 }, { "epoch": 3.0451795454122377, "grad_norm": 0.2743631899356842, "learning_rate": 1.6296326772389507e-05, "loss": 1.2154, "step": 10224 }, { "epoch": 3.0454773916119064, "grad_norm": 0.22574536502361298, "learning_rate": 1.629557737826874e-05, "loss": 1.2348, "step": 10225 }, { "epoch": 3.045775237811575, "grad_norm": 0.23804114758968353, "learning_rate": 1.6294827925574663e-05, "loss": 1.2326, "step": 10226 }, { "epoch": 3.0460730840112435, "grad_norm": 0.24106575548648834, "learning_rate": 1.6294078414314244e-05, "loss": 1.2226, "step": 10227 }, { "epoch": 3.0463709302109123, "grad_norm": 0.24153001606464386, "learning_rate": 1.6293328844494456e-05, "loss": 1.2385, "step": 10228 }, { "epoch": 3.046668776410581, "grad_norm": 0.2359992265701294, "learning_rate": 1.6292579216122276e-05, "loss": 1.2453, "step": 10229 }, { "epoch": 3.0469666226102494, "grad_norm": 0.2369110882282257, "learning_rate": 1.6291829529204676e-05, "loss": 1.2548, "step": 10230 }, { "epoch": 3.047264468809918, "grad_norm": 0.27357247471809387, "learning_rate": 1.6291079783748632e-05, "loss": 1.2468, "step": 10231 }, { "epoch": 3.047562315009587, "grad_norm": 0.3440316617488861, "learning_rate": 1.629032997976112e-05, "loss": 1.2342, "step": 10232 }, { "epoch": 3.0478601612092557, "grad_norm": 0.2712406814098358, "learning_rate": 1.6289580117249115e-05, "loss": 1.2188, "step": 10233 }, { "epoch": 3.048158007408924, "grad_norm": 0.27531933784484863, "learning_rate": 1.6288830196219595e-05, "loss": 1.2502, "step": 10234 }, { "epoch": 3.048455853608593, "grad_norm": 0.34455469250679016, "learning_rate": 1.6288080216679535e-05, "loss": 1.2447, "step": 10235 }, { "epoch": 3.0487536998082616, "grad_norm": 0.2437841147184372, "learning_rate": 1.6287330178635916e-05, "loss": 1.2342, "step": 10236 }, { "epoch": 3.04905154600793, "grad_norm": 0.2584114074707031, "learning_rate": 1.628658008209571e-05, "loss": 1.2418, "step": 10237 }, { "epoch": 3.0493493922075987, "grad_norm": 0.23838254809379578, "learning_rate": 1.6285829927065907e-05, "loss": 1.2346, "step": 10238 }, { "epoch": 3.0496472384072675, "grad_norm": 0.2907816469669342, "learning_rate": 1.6285079713553474e-05, "loss": 1.2458, "step": 10239 }, { "epoch": 3.0499450846069363, "grad_norm": 0.2896682620048523, "learning_rate": 1.62843294415654e-05, "loss": 1.2405, "step": 10240 }, { "epoch": 3.0502429308066046, "grad_norm": 0.24636253714561462, "learning_rate": 1.628357911110866e-05, "loss": 1.251, "step": 10241 }, { "epoch": 3.0505407770062734, "grad_norm": 0.26349523663520813, "learning_rate": 1.6282828722190234e-05, "loss": 1.2449, "step": 10242 }, { "epoch": 3.050838623205942, "grad_norm": 0.23846521973609924, "learning_rate": 1.628207827481711e-05, "loss": 1.2286, "step": 10243 }, { "epoch": 3.051136469405611, "grad_norm": 0.2381698191165924, "learning_rate": 1.6281327768996266e-05, "loss": 1.2255, "step": 10244 }, { "epoch": 3.0514343156052792, "grad_norm": 0.24267499148845673, "learning_rate": 1.6280577204734682e-05, "loss": 1.2217, "step": 10245 }, { "epoch": 3.051732161804948, "grad_norm": 0.2777910530567169, "learning_rate": 1.6279826582039348e-05, "loss": 1.2409, "step": 10246 }, { "epoch": 3.0520300080046168, "grad_norm": 0.32253384590148926, "learning_rate": 1.627907590091724e-05, "loss": 1.2316, "step": 10247 }, { "epoch": 3.052327854204285, "grad_norm": 0.2844820022583008, "learning_rate": 1.627832516137535e-05, "loss": 1.2502, "step": 10248 }, { "epoch": 3.052625700403954, "grad_norm": 0.5343673825263977, "learning_rate": 1.627757436342066e-05, "loss": 1.2533, "step": 10249 }, { "epoch": 3.0529235466036226, "grad_norm": 0.30444401502609253, "learning_rate": 1.6276823507060152e-05, "loss": 1.231, "step": 10250 }, { "epoch": 3.0532213928032914, "grad_norm": 0.2842743992805481, "learning_rate": 1.627607259230081e-05, "loss": 1.238, "step": 10251 }, { "epoch": 3.0535192390029597, "grad_norm": 0.2443382441997528, "learning_rate": 1.627532161914963e-05, "loss": 1.2513, "step": 10252 }, { "epoch": 3.0538170852026285, "grad_norm": 0.25539764761924744, "learning_rate": 1.6274570587613592e-05, "loss": 1.2262, "step": 10253 }, { "epoch": 3.0541149314022973, "grad_norm": 0.2958544194698334, "learning_rate": 1.6273819497699682e-05, "loss": 1.2302, "step": 10254 }, { "epoch": 3.0544127776019656, "grad_norm": 0.24095271527767181, "learning_rate": 1.6273068349414898e-05, "loss": 1.2275, "step": 10255 }, { "epoch": 3.0547106238016344, "grad_norm": 0.24353492259979248, "learning_rate": 1.6272317142766217e-05, "loss": 1.2339, "step": 10256 }, { "epoch": 3.055008470001303, "grad_norm": 0.25018948316574097, "learning_rate": 1.6271565877760632e-05, "loss": 1.2237, "step": 10257 }, { "epoch": 3.055306316200972, "grad_norm": 0.247747004032135, "learning_rate": 1.6270814554405133e-05, "loss": 1.2347, "step": 10258 }, { "epoch": 3.0556041624006403, "grad_norm": 0.25322288274765015, "learning_rate": 1.627006317270672e-05, "loss": 1.2277, "step": 10259 }, { "epoch": 3.055902008600309, "grad_norm": 0.27417194843292236, "learning_rate": 1.6269311732672363e-05, "loss": 1.2407, "step": 10260 }, { "epoch": 3.056199854799978, "grad_norm": 0.23231066763401031, "learning_rate": 1.626856023430907e-05, "loss": 1.2221, "step": 10261 }, { "epoch": 3.056497700999646, "grad_norm": 0.24387730658054352, "learning_rate": 1.626780867762383e-05, "loss": 1.2457, "step": 10262 }, { "epoch": 3.056795547199315, "grad_norm": 0.26461726427078247, "learning_rate": 1.6267057062623627e-05, "loss": 1.2462, "step": 10263 }, { "epoch": 3.0570933933989837, "grad_norm": 0.22684389352798462, "learning_rate": 1.6266305389315463e-05, "loss": 1.2299, "step": 10264 }, { "epoch": 3.0573912395986524, "grad_norm": 0.2952120900154114, "learning_rate": 1.626555365770633e-05, "loss": 1.2412, "step": 10265 }, { "epoch": 3.0576890857983208, "grad_norm": 0.25628674030303955, "learning_rate": 1.6264801867803218e-05, "loss": 1.2338, "step": 10266 }, { "epoch": 3.0579869319979895, "grad_norm": 0.2316766083240509, "learning_rate": 1.6264050019613125e-05, "loss": 1.2487, "step": 10267 }, { "epoch": 3.0582847781976583, "grad_norm": 0.25355660915374756, "learning_rate": 1.6263298113143044e-05, "loss": 1.2368, "step": 10268 }, { "epoch": 3.0585826243973266, "grad_norm": 0.26539427042007446, "learning_rate": 1.6262546148399977e-05, "loss": 1.2378, "step": 10269 }, { "epoch": 3.0588804705969954, "grad_norm": 0.23979556560516357, "learning_rate": 1.626179412539091e-05, "loss": 1.2353, "step": 10270 }, { "epoch": 3.059178316796664, "grad_norm": 0.24969260394573212, "learning_rate": 1.6261042044122845e-05, "loss": 1.2402, "step": 10271 }, { "epoch": 3.059476162996333, "grad_norm": 0.2942097783088684, "learning_rate": 1.626028990460278e-05, "loss": 1.2314, "step": 10272 }, { "epoch": 3.0597740091960013, "grad_norm": 0.24863791465759277, "learning_rate": 1.6259537706837712e-05, "loss": 1.2532, "step": 10273 }, { "epoch": 3.06007185539567, "grad_norm": 0.230848029255867, "learning_rate": 1.6258785450834638e-05, "loss": 1.215, "step": 10274 }, { "epoch": 3.060369701595339, "grad_norm": 0.25193142890930176, "learning_rate": 1.6258033136600556e-05, "loss": 1.2307, "step": 10275 }, { "epoch": 3.060667547795007, "grad_norm": 0.23902875185012817, "learning_rate": 1.6257280764142472e-05, "loss": 1.2429, "step": 10276 }, { "epoch": 3.060965393994676, "grad_norm": 0.26887065172195435, "learning_rate": 1.625652833346738e-05, "loss": 1.2439, "step": 10277 }, { "epoch": 3.0612632401943447, "grad_norm": 0.24252690374851227, "learning_rate": 1.6255775844582284e-05, "loss": 1.2458, "step": 10278 }, { "epoch": 3.0615610863940135, "grad_norm": 0.2371646612882614, "learning_rate": 1.6255023297494182e-05, "loss": 1.2358, "step": 10279 }, { "epoch": 3.061858932593682, "grad_norm": 0.2452089786529541, "learning_rate": 1.6254270692210076e-05, "loss": 1.2297, "step": 10280 }, { "epoch": 3.0621567787933506, "grad_norm": 0.24095968902111053, "learning_rate": 1.6253518028736967e-05, "loss": 1.2482, "step": 10281 }, { "epoch": 3.0624546249930193, "grad_norm": 0.31817057728767395, "learning_rate": 1.625276530708186e-05, "loss": 1.2335, "step": 10282 }, { "epoch": 3.0627524711926877, "grad_norm": 0.30245932936668396, "learning_rate": 1.625201252725176e-05, "loss": 1.2211, "step": 10283 }, { "epoch": 3.0630503173923564, "grad_norm": 0.2332882583141327, "learning_rate": 1.625125968925367e-05, "loss": 1.2401, "step": 10284 }, { "epoch": 3.063348163592025, "grad_norm": 0.31895169615745544, "learning_rate": 1.625050679309459e-05, "loss": 1.2324, "step": 10285 }, { "epoch": 3.063646009791694, "grad_norm": 0.3017706871032715, "learning_rate": 1.624975383878153e-05, "loss": 1.2357, "step": 10286 }, { "epoch": 3.0639438559913623, "grad_norm": 0.2633976638317108, "learning_rate": 1.624900082632149e-05, "loss": 1.2568, "step": 10287 }, { "epoch": 3.064241702191031, "grad_norm": 0.2533052861690521, "learning_rate": 1.624824775572148e-05, "loss": 1.2254, "step": 10288 }, { "epoch": 3.0645395483907, "grad_norm": 0.25451239943504333, "learning_rate": 1.624749462698851e-05, "loss": 1.2414, "step": 10289 }, { "epoch": 3.064837394590368, "grad_norm": 0.23318979144096375, "learning_rate": 1.6246741440129575e-05, "loss": 1.2284, "step": 10290 }, { "epoch": 3.065135240790037, "grad_norm": 0.2820476293563843, "learning_rate": 1.6245988195151696e-05, "loss": 1.2481, "step": 10291 }, { "epoch": 3.0654330869897057, "grad_norm": 0.27426910400390625, "learning_rate": 1.624523489206187e-05, "loss": 1.2402, "step": 10292 }, { "epoch": 3.0657309331893745, "grad_norm": 0.22696912288665771, "learning_rate": 1.6244481530867117e-05, "loss": 1.2286, "step": 10293 }, { "epoch": 3.066028779389043, "grad_norm": 0.2459140568971634, "learning_rate": 1.6243728111574437e-05, "loss": 1.2324, "step": 10294 }, { "epoch": 3.0663266255887116, "grad_norm": 0.24593108892440796, "learning_rate": 1.6242974634190846e-05, "loss": 1.2344, "step": 10295 }, { "epoch": 3.0666244717883804, "grad_norm": 0.24092045426368713, "learning_rate": 1.6242221098723346e-05, "loss": 1.2329, "step": 10296 }, { "epoch": 3.0669223179880487, "grad_norm": 0.24796484410762787, "learning_rate": 1.6241467505178957e-05, "loss": 1.2413, "step": 10297 }, { "epoch": 3.0672201641877175, "grad_norm": 0.2608395516872406, "learning_rate": 1.6240713853564683e-05, "loss": 1.218, "step": 10298 }, { "epoch": 3.0675180103873863, "grad_norm": 0.2577863335609436, "learning_rate": 1.623996014388754e-05, "loss": 1.2273, "step": 10299 }, { "epoch": 3.067815856587055, "grad_norm": 0.2353697419166565, "learning_rate": 1.6239206376154543e-05, "loss": 1.2224, "step": 10300 }, { "epoch": 3.0681137027867234, "grad_norm": 0.2622057795524597, "learning_rate": 1.6238452550372698e-05, "loss": 1.2271, "step": 10301 }, { "epoch": 3.068411548986392, "grad_norm": 0.2638491094112396, "learning_rate": 1.6237698666549023e-05, "loss": 1.2537, "step": 10302 }, { "epoch": 3.068709395186061, "grad_norm": 0.24457314610481262, "learning_rate": 1.623694472469053e-05, "loss": 1.2341, "step": 10303 }, { "epoch": 3.0690072413857292, "grad_norm": 0.276833176612854, "learning_rate": 1.6236190724804238e-05, "loss": 1.2308, "step": 10304 }, { "epoch": 3.069305087585398, "grad_norm": 0.3370343744754791, "learning_rate": 1.6235436666897153e-05, "loss": 1.2445, "step": 10305 }, { "epoch": 3.0696029337850668, "grad_norm": 0.39063307642936707, "learning_rate": 1.62346825509763e-05, "loss": 1.2447, "step": 10306 }, { "epoch": 3.0699007799847355, "grad_norm": 0.3648836612701416, "learning_rate": 1.623392837704869e-05, "loss": 1.2614, "step": 10307 }, { "epoch": 3.070198626184404, "grad_norm": 0.24658054113388062, "learning_rate": 1.6233174145121346e-05, "loss": 1.2472, "step": 10308 }, { "epoch": 3.0704964723840726, "grad_norm": 0.6062577962875366, "learning_rate": 1.6232419855201275e-05, "loss": 1.2417, "step": 10309 }, { "epoch": 3.0707943185837414, "grad_norm": 0.32666686177253723, "learning_rate": 1.6231665507295503e-05, "loss": 1.2496, "step": 10310 }, { "epoch": 3.07109216478341, "grad_norm": 0.26822617650032043, "learning_rate": 1.6230911101411048e-05, "loss": 1.2204, "step": 10311 }, { "epoch": 3.0713900109830785, "grad_norm": 0.2530498802661896, "learning_rate": 1.6230156637554925e-05, "loss": 1.2329, "step": 10312 }, { "epoch": 3.0716878571827473, "grad_norm": 0.23866677284240723, "learning_rate": 1.6229402115734157e-05, "loss": 1.2285, "step": 10313 }, { "epoch": 3.071985703382416, "grad_norm": 0.2549847364425659, "learning_rate": 1.6228647535955758e-05, "loss": 1.2365, "step": 10314 }, { "epoch": 3.0722835495820844, "grad_norm": 0.2633138597011566, "learning_rate": 1.6227892898226754e-05, "loss": 1.2539, "step": 10315 }, { "epoch": 3.072581395781753, "grad_norm": 0.25155434012413025, "learning_rate": 1.6227138202554167e-05, "loss": 1.2258, "step": 10316 }, { "epoch": 3.072879241981422, "grad_norm": 0.23175309598445892, "learning_rate": 1.6226383448945014e-05, "loss": 1.2388, "step": 10317 }, { "epoch": 3.0731770881810907, "grad_norm": 0.23968221247196198, "learning_rate": 1.6225628637406322e-05, "loss": 1.2301, "step": 10318 }, { "epoch": 3.073474934380759, "grad_norm": 0.24314630031585693, "learning_rate": 1.622487376794511e-05, "loss": 1.2632, "step": 10319 }, { "epoch": 3.073772780580428, "grad_norm": 0.2491050362586975, "learning_rate": 1.62241188405684e-05, "loss": 1.2327, "step": 10320 }, { "epoch": 3.0740706267800966, "grad_norm": 0.2475767433643341, "learning_rate": 1.622336385528322e-05, "loss": 1.2517, "step": 10321 }, { "epoch": 3.074368472979765, "grad_norm": 0.24345384538173676, "learning_rate": 1.6222608812096594e-05, "loss": 1.2258, "step": 10322 }, { "epoch": 3.0746663191794337, "grad_norm": 0.25330597162246704, "learning_rate": 1.6221853711015546e-05, "loss": 1.2298, "step": 10323 }, { "epoch": 3.0749641653791024, "grad_norm": 0.24096539616584778, "learning_rate": 1.62210985520471e-05, "loss": 1.2502, "step": 10324 }, { "epoch": 3.075262011578771, "grad_norm": 0.2532306909561157, "learning_rate": 1.6220343335198278e-05, "loss": 1.2268, "step": 10325 }, { "epoch": 3.0755598577784395, "grad_norm": 0.25411057472229004, "learning_rate": 1.6219588060476116e-05, "loss": 1.2301, "step": 10326 }, { "epoch": 3.0758577039781083, "grad_norm": 0.24418389797210693, "learning_rate": 1.6218832727887635e-05, "loss": 1.2322, "step": 10327 }, { "epoch": 3.076155550177777, "grad_norm": 0.2418355941772461, "learning_rate": 1.621807733743986e-05, "loss": 1.2527, "step": 10328 }, { "epoch": 3.0764533963774454, "grad_norm": 0.24275583028793335, "learning_rate": 1.6217321889139828e-05, "loss": 1.2129, "step": 10329 }, { "epoch": 3.076751242577114, "grad_norm": 0.23158468306064606, "learning_rate": 1.621656638299456e-05, "loss": 1.2479, "step": 10330 }, { "epoch": 3.077049088776783, "grad_norm": 0.2408546507358551, "learning_rate": 1.6215810819011087e-05, "loss": 1.2292, "step": 10331 }, { "epoch": 3.0773469349764517, "grad_norm": 0.2383338361978531, "learning_rate": 1.621505519719644e-05, "loss": 1.2286, "step": 10332 }, { "epoch": 3.07764478117612, "grad_norm": 0.23233291506767273, "learning_rate": 1.6214299517557648e-05, "loss": 1.2297, "step": 10333 }, { "epoch": 3.077942627375789, "grad_norm": 0.24617774784564972, "learning_rate": 1.6213543780101743e-05, "loss": 1.2401, "step": 10334 }, { "epoch": 3.0782404735754576, "grad_norm": 0.23213894665241241, "learning_rate": 1.621278798483575e-05, "loss": 1.2369, "step": 10335 }, { "epoch": 3.078538319775126, "grad_norm": 0.23981331288814545, "learning_rate": 1.621203213176671e-05, "loss": 1.2322, "step": 10336 }, { "epoch": 3.0788361659747947, "grad_norm": 0.23576320707798004, "learning_rate": 1.6211276220901655e-05, "loss": 1.2377, "step": 10337 }, { "epoch": 3.0791340121744635, "grad_norm": 0.24305744469165802, "learning_rate": 1.621052025224761e-05, "loss": 1.2433, "step": 10338 }, { "epoch": 3.0794318583741322, "grad_norm": 0.23215025663375854, "learning_rate": 1.6209764225811615e-05, "loss": 1.2458, "step": 10339 }, { "epoch": 3.0797297045738006, "grad_norm": 0.23149891197681427, "learning_rate": 1.62090081416007e-05, "loss": 1.2241, "step": 10340 }, { "epoch": 3.0800275507734693, "grad_norm": 0.23072408139705658, "learning_rate": 1.6208251999621902e-05, "loss": 1.2215, "step": 10341 }, { "epoch": 3.080325396973138, "grad_norm": 0.22406615316867828, "learning_rate": 1.6207495799882255e-05, "loss": 1.2484, "step": 10342 }, { "epoch": 3.0806232431728064, "grad_norm": 0.253277987241745, "learning_rate": 1.6206739542388795e-05, "loss": 1.2602, "step": 10343 }, { "epoch": 3.080921089372475, "grad_norm": 0.2351500242948532, "learning_rate": 1.6205983227148562e-05, "loss": 1.248, "step": 10344 }, { "epoch": 3.081218935572144, "grad_norm": 0.2387404590845108, "learning_rate": 1.6205226854168583e-05, "loss": 1.2587, "step": 10345 }, { "epoch": 3.0815167817718128, "grad_norm": 0.24336200952529907, "learning_rate": 1.6204470423455902e-05, "loss": 1.2345, "step": 10346 }, { "epoch": 3.081814627971481, "grad_norm": 0.23612387478351593, "learning_rate": 1.620371393501756e-05, "loss": 1.2376, "step": 10347 }, { "epoch": 3.08211247417115, "grad_norm": 0.24053697288036346, "learning_rate": 1.6202957388860588e-05, "loss": 1.2375, "step": 10348 }, { "epoch": 3.0824103203708186, "grad_norm": 0.22427183389663696, "learning_rate": 1.6202200784992025e-05, "loss": 1.2409, "step": 10349 }, { "epoch": 3.082708166570487, "grad_norm": 0.22489367425441742, "learning_rate": 1.620144412341892e-05, "loss": 1.2421, "step": 10350 }, { "epoch": 3.0830060127701557, "grad_norm": 0.23169724643230438, "learning_rate": 1.62006874041483e-05, "loss": 1.2574, "step": 10351 }, { "epoch": 3.0833038589698245, "grad_norm": 0.22526168823242188, "learning_rate": 1.6199930627187215e-05, "loss": 1.2298, "step": 10352 }, { "epoch": 3.0836017051694933, "grad_norm": 0.2435971051454544, "learning_rate": 1.61991737925427e-05, "loss": 1.2262, "step": 10353 }, { "epoch": 3.0838995513691616, "grad_norm": 0.23552916944026947, "learning_rate": 1.61984169002218e-05, "loss": 1.2548, "step": 10354 }, { "epoch": 3.0841973975688304, "grad_norm": 0.23748822510242462, "learning_rate": 1.6197659950231556e-05, "loss": 1.239, "step": 10355 }, { "epoch": 3.084495243768499, "grad_norm": 0.24067635834217072, "learning_rate": 1.6196902942579012e-05, "loss": 1.2358, "step": 10356 }, { "epoch": 3.0847930899681675, "grad_norm": 0.23165231943130493, "learning_rate": 1.619614587727121e-05, "loss": 1.2315, "step": 10357 }, { "epoch": 3.0850909361678363, "grad_norm": 0.23566889762878418, "learning_rate": 1.619538875431519e-05, "loss": 1.2312, "step": 10358 }, { "epoch": 3.085388782367505, "grad_norm": 0.26848578453063965, "learning_rate": 1.6194631573718e-05, "loss": 1.2412, "step": 10359 }, { "epoch": 3.085686628567174, "grad_norm": 0.24758121371269226, "learning_rate": 1.6193874335486687e-05, "loss": 1.2355, "step": 10360 }, { "epoch": 3.085984474766842, "grad_norm": 0.2501175105571747, "learning_rate": 1.6193117039628293e-05, "loss": 1.2436, "step": 10361 }, { "epoch": 3.086282320966511, "grad_norm": 0.2576286494731903, "learning_rate": 1.6192359686149863e-05, "loss": 1.229, "step": 10362 }, { "epoch": 3.0865801671661797, "grad_norm": 0.24462206661701202, "learning_rate": 1.6191602275058444e-05, "loss": 1.2467, "step": 10363 }, { "epoch": 3.0868780133658484, "grad_norm": 0.3026152551174164, "learning_rate": 1.619084480636109e-05, "loss": 1.2202, "step": 10364 }, { "epoch": 3.0871758595655168, "grad_norm": 0.2340417504310608, "learning_rate": 1.6190087280064834e-05, "loss": 1.2348, "step": 10365 }, { "epoch": 3.0874737057651855, "grad_norm": 0.3344682455062866, "learning_rate": 1.6189329696176735e-05, "loss": 1.2428, "step": 10366 }, { "epoch": 3.0877715519648543, "grad_norm": 0.28237810730934143, "learning_rate": 1.6188572054703837e-05, "loss": 1.2158, "step": 10367 }, { "epoch": 3.0880693981645226, "grad_norm": 0.27612945437431335, "learning_rate": 1.6187814355653193e-05, "loss": 1.2359, "step": 10368 }, { "epoch": 3.0883672443641914, "grad_norm": 0.2473704218864441, "learning_rate": 1.6187056599031844e-05, "loss": 1.2402, "step": 10369 }, { "epoch": 3.08866509056386, "grad_norm": 0.3062663972377777, "learning_rate": 1.6186298784846854e-05, "loss": 1.2367, "step": 10370 }, { "epoch": 3.0889629367635285, "grad_norm": 0.2383635938167572, "learning_rate": 1.618554091310526e-05, "loss": 1.245, "step": 10371 }, { "epoch": 3.0892607829631973, "grad_norm": 0.2796862721443176, "learning_rate": 1.618478298381412e-05, "loss": 1.2474, "step": 10372 }, { "epoch": 3.089558629162866, "grad_norm": 0.26445525884628296, "learning_rate": 1.6184024996980485e-05, "loss": 1.237, "step": 10373 }, { "epoch": 3.089856475362535, "grad_norm": 0.27199339866638184, "learning_rate": 1.6183266952611405e-05, "loss": 1.2605, "step": 10374 }, { "epoch": 3.090154321562203, "grad_norm": 0.30469194054603577, "learning_rate": 1.6182508850713937e-05, "loss": 1.2297, "step": 10375 }, { "epoch": 3.090452167761872, "grad_norm": 0.24834252893924713, "learning_rate": 1.618175069129513e-05, "loss": 1.2371, "step": 10376 }, { "epoch": 3.0907500139615407, "grad_norm": 0.2820184528827667, "learning_rate": 1.618099247436204e-05, "loss": 1.2239, "step": 10377 }, { "epoch": 3.0910478601612095, "grad_norm": 0.2574171721935272, "learning_rate": 1.618023419992172e-05, "loss": 1.2436, "step": 10378 }, { "epoch": 3.091345706360878, "grad_norm": 0.2358943074941635, "learning_rate": 1.6179475867981225e-05, "loss": 1.2441, "step": 10379 }, { "epoch": 3.0916435525605466, "grad_norm": 0.3953135013580322, "learning_rate": 1.6178717478547613e-05, "loss": 1.2473, "step": 10380 }, { "epoch": 3.0919413987602153, "grad_norm": 0.24281415343284607, "learning_rate": 1.6177959031627937e-05, "loss": 1.249, "step": 10381 }, { "epoch": 3.0922392449598837, "grad_norm": 0.29101502895355225, "learning_rate": 1.6177200527229256e-05, "loss": 1.2495, "step": 10382 }, { "epoch": 3.0925370911595524, "grad_norm": 0.2608293294906616, "learning_rate": 1.6176441965358624e-05, "loss": 1.2364, "step": 10383 }, { "epoch": 3.092834937359221, "grad_norm": 0.273247092962265, "learning_rate": 1.61756833460231e-05, "loss": 1.2452, "step": 10384 }, { "epoch": 3.09313278355889, "grad_norm": 0.3250589966773987, "learning_rate": 1.6174924669229746e-05, "loss": 1.238, "step": 10385 }, { "epoch": 3.0934306297585583, "grad_norm": 0.24804839491844177, "learning_rate": 1.6174165934985612e-05, "loss": 1.2258, "step": 10386 }, { "epoch": 3.093728475958227, "grad_norm": 0.2775920331478119, "learning_rate": 1.6173407143297767e-05, "loss": 1.2304, "step": 10387 }, { "epoch": 3.094026322157896, "grad_norm": 0.2390071302652359, "learning_rate": 1.6172648294173265e-05, "loss": 1.2396, "step": 10388 }, { "epoch": 3.094324168357564, "grad_norm": 0.2723798453807831, "learning_rate": 1.6171889387619163e-05, "loss": 1.2346, "step": 10389 }, { "epoch": 3.094622014557233, "grad_norm": 0.2721167802810669, "learning_rate": 1.617113042364253e-05, "loss": 1.2442, "step": 10390 }, { "epoch": 3.0949198607569017, "grad_norm": 0.2453288584947586, "learning_rate": 1.6170371402250418e-05, "loss": 1.235, "step": 10391 }, { "epoch": 3.0952177069565705, "grad_norm": 0.2587590217590332, "learning_rate": 1.61696123234499e-05, "loss": 1.2282, "step": 10392 }, { "epoch": 3.095515553156239, "grad_norm": 0.25568780303001404, "learning_rate": 1.616885318724803e-05, "loss": 1.2463, "step": 10393 }, { "epoch": 3.0958133993559076, "grad_norm": 0.26561030745506287, "learning_rate": 1.6168093993651873e-05, "loss": 1.2367, "step": 10394 }, { "epoch": 3.0961112455555764, "grad_norm": 0.2805534899234772, "learning_rate": 1.6167334742668493e-05, "loss": 1.2284, "step": 10395 }, { "epoch": 3.0964090917552447, "grad_norm": 0.24491840600967407, "learning_rate": 1.6166575434304953e-05, "loss": 1.2498, "step": 10396 }, { "epoch": 3.0967069379549135, "grad_norm": 0.2690185308456421, "learning_rate": 1.616581606856832e-05, "loss": 1.2286, "step": 10397 }, { "epoch": 3.0970047841545822, "grad_norm": 0.23889923095703125, "learning_rate": 1.6165056645465657e-05, "loss": 1.2396, "step": 10398 }, { "epoch": 3.097302630354251, "grad_norm": 0.23897404968738556, "learning_rate": 1.616429716500403e-05, "loss": 1.2348, "step": 10399 }, { "epoch": 3.0976004765539193, "grad_norm": 0.2513929009437561, "learning_rate": 1.6163537627190506e-05, "loss": 1.229, "step": 10400 }, { "epoch": 3.097898322753588, "grad_norm": 0.2689685523509979, "learning_rate": 1.6162778032032147e-05, "loss": 1.2385, "step": 10401 }, { "epoch": 3.098196168953257, "grad_norm": 0.2547231912612915, "learning_rate": 1.6162018379536027e-05, "loss": 1.2441, "step": 10402 }, { "epoch": 3.098494015152925, "grad_norm": 0.26805955171585083, "learning_rate": 1.6161258669709208e-05, "loss": 1.2324, "step": 10403 }, { "epoch": 3.098791861352594, "grad_norm": 0.22966517508029938, "learning_rate": 1.6160498902558762e-05, "loss": 1.227, "step": 10404 }, { "epoch": 3.0990897075522628, "grad_norm": 0.2782258987426758, "learning_rate": 1.615973907809176e-05, "loss": 1.2354, "step": 10405 }, { "epoch": 3.0993875537519315, "grad_norm": 0.23317642509937286, "learning_rate": 1.6158979196315266e-05, "loss": 1.2362, "step": 10406 }, { "epoch": 3.0996853999516, "grad_norm": 0.2576300799846649, "learning_rate": 1.6158219257236346e-05, "loss": 1.2264, "step": 10407 }, { "epoch": 3.0999832461512686, "grad_norm": 0.23924222588539124, "learning_rate": 1.6157459260862082e-05, "loss": 1.2451, "step": 10408 }, { "epoch": 3.1002810923509374, "grad_norm": 0.26835858821868896, "learning_rate": 1.615669920719954e-05, "loss": 1.2473, "step": 10409 }, { "epoch": 3.1005789385506057, "grad_norm": 0.2458593100309372, "learning_rate": 1.615593909625579e-05, "loss": 1.2437, "step": 10410 }, { "epoch": 3.1008767847502745, "grad_norm": 0.22977596521377563, "learning_rate": 1.6155178928037904e-05, "loss": 1.2337, "step": 10411 }, { "epoch": 3.1011746309499433, "grad_norm": 0.24524131417274475, "learning_rate": 1.6154418702552953e-05, "loss": 1.225, "step": 10412 }, { "epoch": 3.101472477149612, "grad_norm": 0.25615182518959045, "learning_rate": 1.6153658419808014e-05, "loss": 1.2275, "step": 10413 }, { "epoch": 3.1017703233492804, "grad_norm": 0.23454606533050537, "learning_rate": 1.615289807981016e-05, "loss": 1.237, "step": 10414 }, { "epoch": 3.102068169548949, "grad_norm": 0.28615736961364746, "learning_rate": 1.615213768256646e-05, "loss": 1.2139, "step": 10415 }, { "epoch": 3.102366015748618, "grad_norm": 0.2971478998661041, "learning_rate": 1.6151377228083994e-05, "loss": 1.2346, "step": 10416 }, { "epoch": 3.1026638619482863, "grad_norm": 0.23978576064109802, "learning_rate": 1.6150616716369832e-05, "loss": 1.2468, "step": 10417 }, { "epoch": 3.102961708147955, "grad_norm": 0.2721368372440338, "learning_rate": 1.614985614743106e-05, "loss": 1.2357, "step": 10418 }, { "epoch": 3.103259554347624, "grad_norm": 0.23517760634422302, "learning_rate": 1.6149095521274746e-05, "loss": 1.2485, "step": 10419 }, { "epoch": 3.1035574005472926, "grad_norm": 0.31406596302986145, "learning_rate": 1.6148334837907965e-05, "loss": 1.2359, "step": 10420 }, { "epoch": 3.103855246746961, "grad_norm": 0.31216081976890564, "learning_rate": 1.61475740973378e-05, "loss": 1.2379, "step": 10421 }, { "epoch": 3.1041530929466297, "grad_norm": 0.2626805901527405, "learning_rate": 1.614681329957133e-05, "loss": 1.2457, "step": 10422 }, { "epoch": 3.1044509391462984, "grad_norm": 0.4179708659648895, "learning_rate": 1.6146052444615624e-05, "loss": 1.2328, "step": 10423 }, { "epoch": 3.1047487853459668, "grad_norm": 0.3220103085041046, "learning_rate": 1.614529153247777e-05, "loss": 1.2401, "step": 10424 }, { "epoch": 3.1050466315456355, "grad_norm": 0.25905534625053406, "learning_rate": 1.614453056316484e-05, "loss": 1.2439, "step": 10425 }, { "epoch": 3.1053444777453043, "grad_norm": 0.3609839081764221, "learning_rate": 1.6143769536683926e-05, "loss": 1.2387, "step": 10426 }, { "epoch": 3.105642323944973, "grad_norm": 0.26535311341285706, "learning_rate": 1.6143008453042094e-05, "loss": 1.2566, "step": 10427 }, { "epoch": 3.1059401701446414, "grad_norm": 0.26972994208335876, "learning_rate": 1.6142247312246432e-05, "loss": 1.2366, "step": 10428 }, { "epoch": 3.10623801634431, "grad_norm": 0.2537599802017212, "learning_rate": 1.6141486114304026e-05, "loss": 1.23, "step": 10429 }, { "epoch": 3.106535862543979, "grad_norm": 0.2508467137813568, "learning_rate": 1.6140724859221946e-05, "loss": 1.2542, "step": 10430 }, { "epoch": 3.1068337087436477, "grad_norm": 0.2471814900636673, "learning_rate": 1.6139963547007288e-05, "loss": 1.2352, "step": 10431 }, { "epoch": 3.107131554943316, "grad_norm": 0.25865426659584045, "learning_rate": 1.6139202177667128e-05, "loss": 1.2445, "step": 10432 }, { "epoch": 3.107429401142985, "grad_norm": 0.25801217555999756, "learning_rate": 1.6138440751208554e-05, "loss": 1.236, "step": 10433 }, { "epoch": 3.1077272473426536, "grad_norm": 0.23547405004501343, "learning_rate": 1.6137679267638642e-05, "loss": 1.2303, "step": 10434 }, { "epoch": 3.108025093542322, "grad_norm": 0.23247171938419342, "learning_rate": 1.613691772696448e-05, "loss": 1.2418, "step": 10435 }, { "epoch": 3.1083229397419907, "grad_norm": 0.24636529386043549, "learning_rate": 1.613615612919316e-05, "loss": 1.2471, "step": 10436 }, { "epoch": 3.1086207859416595, "grad_norm": 0.25281500816345215, "learning_rate": 1.6135394474331764e-05, "loss": 1.2402, "step": 10437 }, { "epoch": 3.108918632141328, "grad_norm": 0.24236537516117096, "learning_rate": 1.6134632762387373e-05, "loss": 1.24, "step": 10438 }, { "epoch": 3.1092164783409966, "grad_norm": 0.23605842888355255, "learning_rate": 1.6133870993367077e-05, "loss": 1.2196, "step": 10439 }, { "epoch": 3.1095143245406653, "grad_norm": 0.23824907839298248, "learning_rate": 1.6133109167277973e-05, "loss": 1.2444, "step": 10440 }, { "epoch": 3.109812170740334, "grad_norm": 0.24137970805168152, "learning_rate": 1.6132347284127133e-05, "loss": 1.2342, "step": 10441 }, { "epoch": 3.1101100169400024, "grad_norm": 0.25432923436164856, "learning_rate": 1.6131585343921654e-05, "loss": 1.2226, "step": 10442 }, { "epoch": 3.110407863139671, "grad_norm": 0.23455332219600677, "learning_rate": 1.6130823346668628e-05, "loss": 1.2453, "step": 10443 }, { "epoch": 3.11070570933934, "grad_norm": 0.3594229519367218, "learning_rate": 1.6130061292375133e-05, "loss": 1.2408, "step": 10444 }, { "epoch": 3.1110035555390088, "grad_norm": 0.3436545729637146, "learning_rate": 1.6129299181048273e-05, "loss": 1.2425, "step": 10445 }, { "epoch": 3.111301401738677, "grad_norm": 0.2786564230918884, "learning_rate": 1.612853701269513e-05, "loss": 1.2225, "step": 10446 }, { "epoch": 3.111599247938346, "grad_norm": 0.45427578687667847, "learning_rate": 1.6127774787322797e-05, "loss": 1.2307, "step": 10447 }, { "epoch": 3.1118970941380146, "grad_norm": 0.2831638753414154, "learning_rate": 1.6127012504938366e-05, "loss": 1.2417, "step": 10448 }, { "epoch": 3.112194940337683, "grad_norm": 0.3059787154197693, "learning_rate": 1.6126250165548932e-05, "loss": 1.2278, "step": 10449 }, { "epoch": 3.1124927865373517, "grad_norm": 0.28319501876831055, "learning_rate": 1.612548776916158e-05, "loss": 1.244, "step": 10450 }, { "epoch": 3.1127906327370205, "grad_norm": 0.2845174968242645, "learning_rate": 1.612472531578341e-05, "loss": 1.2483, "step": 10451 }, { "epoch": 3.1130884789366893, "grad_norm": 0.30333369970321655, "learning_rate": 1.6123962805421515e-05, "loss": 1.2265, "step": 10452 }, { "epoch": 3.1133863251363576, "grad_norm": 0.24139992892742157, "learning_rate": 1.612320023808299e-05, "loss": 1.241, "step": 10453 }, { "epoch": 3.1136841713360264, "grad_norm": 0.36457669734954834, "learning_rate": 1.6122437613774925e-05, "loss": 1.2475, "step": 10454 }, { "epoch": 3.113982017535695, "grad_norm": 0.26496487855911255, "learning_rate": 1.612167493250442e-05, "loss": 1.2339, "step": 10455 }, { "epoch": 3.1142798637353635, "grad_norm": 0.28735244274139404, "learning_rate": 1.6120912194278566e-05, "loss": 1.2275, "step": 10456 }, { "epoch": 3.1145777099350322, "grad_norm": 0.2546387016773224, "learning_rate": 1.6120149399104465e-05, "loss": 1.2378, "step": 10457 }, { "epoch": 3.114875556134701, "grad_norm": 0.3327142596244812, "learning_rate": 1.6119386546989214e-05, "loss": 1.2443, "step": 10458 }, { "epoch": 3.11517340233437, "grad_norm": 0.30269357562065125, "learning_rate": 1.6118623637939904e-05, "loss": 1.2258, "step": 10459 }, { "epoch": 3.115471248534038, "grad_norm": 0.29396873712539673, "learning_rate": 1.6117860671963642e-05, "loss": 1.2415, "step": 10460 }, { "epoch": 3.115769094733707, "grad_norm": 0.3337930142879486, "learning_rate": 1.6117097649067517e-05, "loss": 1.2348, "step": 10461 }, { "epoch": 3.1160669409333757, "grad_norm": 0.23203662037849426, "learning_rate": 1.6116334569258633e-05, "loss": 1.2292, "step": 10462 }, { "epoch": 3.116364787133044, "grad_norm": 0.2526288628578186, "learning_rate": 1.6115571432544093e-05, "loss": 1.2363, "step": 10463 }, { "epoch": 3.1166626333327128, "grad_norm": 0.26672297716140747, "learning_rate": 1.611480823893099e-05, "loss": 1.2188, "step": 10464 }, { "epoch": 3.1169604795323815, "grad_norm": 0.2600806653499603, "learning_rate": 1.611404498842643e-05, "loss": 1.2358, "step": 10465 }, { "epoch": 3.1172583257320503, "grad_norm": 0.2883581221103668, "learning_rate": 1.611328168103751e-05, "loss": 1.2414, "step": 10466 }, { "epoch": 3.1175561719317186, "grad_norm": 0.2521454393863678, "learning_rate": 1.611251831677134e-05, "loss": 1.2409, "step": 10467 }, { "epoch": 3.1178540181313874, "grad_norm": 0.28055715560913086, "learning_rate": 1.611175489563501e-05, "loss": 1.2257, "step": 10468 }, { "epoch": 3.118151864331056, "grad_norm": 0.22997577488422394, "learning_rate": 1.6110991417635633e-05, "loss": 1.2432, "step": 10469 }, { "epoch": 3.1184497105307245, "grad_norm": 0.3932369649410248, "learning_rate": 1.6110227882780307e-05, "loss": 1.2493, "step": 10470 }, { "epoch": 3.1187475567303933, "grad_norm": 0.30058619379997253, "learning_rate": 1.6109464291076137e-05, "loss": 1.2401, "step": 10471 }, { "epoch": 3.119045402930062, "grad_norm": 0.28633445501327515, "learning_rate": 1.610870064253023e-05, "loss": 1.2464, "step": 10472 }, { "epoch": 3.119343249129731, "grad_norm": 0.2538171708583832, "learning_rate": 1.6107936937149684e-05, "loss": 1.2413, "step": 10473 }, { "epoch": 3.119641095329399, "grad_norm": 0.37864917516708374, "learning_rate": 1.6107173174941614e-05, "loss": 1.2419, "step": 10474 }, { "epoch": 3.119938941529068, "grad_norm": 0.2942824363708496, "learning_rate": 1.6106409355913117e-05, "loss": 1.2283, "step": 10475 }, { "epoch": 3.1202367877287367, "grad_norm": 0.29793575406074524, "learning_rate": 1.6105645480071305e-05, "loss": 1.2412, "step": 10476 }, { "epoch": 3.120534633928405, "grad_norm": 0.2526813745498657, "learning_rate": 1.6104881547423286e-05, "loss": 1.2465, "step": 10477 }, { "epoch": 3.120832480128074, "grad_norm": 0.3530627489089966, "learning_rate": 1.610411755797616e-05, "loss": 1.2215, "step": 10478 }, { "epoch": 3.1211303263277426, "grad_norm": 0.26530155539512634, "learning_rate": 1.6103353511737046e-05, "loss": 1.2516, "step": 10479 }, { "epoch": 3.1214281725274113, "grad_norm": 0.2525983154773712, "learning_rate": 1.6102589408713042e-05, "loss": 1.2424, "step": 10480 }, { "epoch": 3.1217260187270797, "grad_norm": 0.2890133261680603, "learning_rate": 1.6101825248911264e-05, "loss": 1.2417, "step": 10481 }, { "epoch": 3.1220238649267484, "grad_norm": 0.2439301609992981, "learning_rate": 1.6101061032338817e-05, "loss": 1.2248, "step": 10482 }, { "epoch": 3.122321711126417, "grad_norm": 0.372954398393631, "learning_rate": 1.6100296759002817e-05, "loss": 1.232, "step": 10483 }, { "epoch": 3.1226195573260855, "grad_norm": 0.2565677762031555, "learning_rate": 1.6099532428910367e-05, "loss": 1.2246, "step": 10484 }, { "epoch": 3.1229174035257543, "grad_norm": 0.29265743494033813, "learning_rate": 1.6098768042068587e-05, "loss": 1.2416, "step": 10485 }, { "epoch": 3.123215249725423, "grad_norm": 0.25104451179504395, "learning_rate": 1.6098003598484582e-05, "loss": 1.2654, "step": 10486 }, { "epoch": 3.123513095925092, "grad_norm": 0.43743887543678284, "learning_rate": 1.6097239098165466e-05, "loss": 1.2483, "step": 10487 }, { "epoch": 3.12381094212476, "grad_norm": 0.24702873826026917, "learning_rate": 1.609647454111835e-05, "loss": 1.2509, "step": 10488 }, { "epoch": 3.124108788324429, "grad_norm": 0.2761531174182892, "learning_rate": 1.6095709927350357e-05, "loss": 1.2474, "step": 10489 }, { "epoch": 3.1244066345240977, "grad_norm": 0.2544757127761841, "learning_rate": 1.609494525686859e-05, "loss": 1.2211, "step": 10490 }, { "epoch": 3.124704480723766, "grad_norm": 0.2518884539604187, "learning_rate": 1.6094180529680166e-05, "loss": 1.2381, "step": 10491 }, { "epoch": 3.125002326923435, "grad_norm": 0.27676719427108765, "learning_rate": 1.60934157457922e-05, "loss": 1.2485, "step": 10492 }, { "epoch": 3.1253001731231036, "grad_norm": 0.24191485345363617, "learning_rate": 1.609265090521181e-05, "loss": 1.2344, "step": 10493 }, { "epoch": 3.1255980193227724, "grad_norm": 0.23960517346858978, "learning_rate": 1.6091886007946114e-05, "loss": 1.2341, "step": 10494 }, { "epoch": 3.1258958655224407, "grad_norm": 0.25541895627975464, "learning_rate": 1.609112105400222e-05, "loss": 1.2313, "step": 10495 }, { "epoch": 3.1261937117221095, "grad_norm": 0.23986327648162842, "learning_rate": 1.609035604338725e-05, "loss": 1.247, "step": 10496 }, { "epoch": 3.1264915579217782, "grad_norm": 0.2870851755142212, "learning_rate": 1.6089590976108326e-05, "loss": 1.2471, "step": 10497 }, { "epoch": 3.126789404121447, "grad_norm": 0.26950156688690186, "learning_rate": 1.6088825852172556e-05, "loss": 1.2406, "step": 10498 }, { "epoch": 3.1270872503211153, "grad_norm": 0.2400730550289154, "learning_rate": 1.6088060671587067e-05, "loss": 1.2593, "step": 10499 }, { "epoch": 3.127385096520784, "grad_norm": 0.23325183987617493, "learning_rate": 1.6087295434358977e-05, "loss": 1.2426, "step": 10500 }, { "epoch": 3.127385096520784, "eval_loss": 1.3344223499298096, "eval_runtime": 21.0599, "eval_samples_per_second": 82.337, "eval_steps_per_second": 5.176, "step": 10500 }, { "epoch": 3.127682942720453, "grad_norm": 0.24747851490974426, "learning_rate": 1.60865301404954e-05, "loss": 1.2233, "step": 10501 }, { "epoch": 3.127980788920121, "grad_norm": 0.24357332289218903, "learning_rate": 1.6085764790003465e-05, "loss": 1.2412, "step": 10502 }, { "epoch": 3.12827863511979, "grad_norm": 0.24626581370830536, "learning_rate": 1.6084999382890287e-05, "loss": 1.2391, "step": 10503 }, { "epoch": 3.1285764813194588, "grad_norm": 0.23055605590343475, "learning_rate": 1.6084233919162988e-05, "loss": 1.2247, "step": 10504 }, { "epoch": 3.128874327519127, "grad_norm": 0.2400212287902832, "learning_rate": 1.6083468398828687e-05, "loss": 1.2394, "step": 10505 }, { "epoch": 3.129172173718796, "grad_norm": 0.24657058715820312, "learning_rate": 1.608270282189451e-05, "loss": 1.2261, "step": 10506 }, { "epoch": 3.1294700199184646, "grad_norm": 0.23674990236759186, "learning_rate": 1.6081937188367582e-05, "loss": 1.2436, "step": 10507 }, { "epoch": 3.1297678661181334, "grad_norm": 0.24882350862026215, "learning_rate": 1.6081171498255024e-05, "loss": 1.2608, "step": 10508 }, { "epoch": 3.1300657123178017, "grad_norm": 0.2526969909667969, "learning_rate": 1.608040575156396e-05, "loss": 1.2457, "step": 10509 }, { "epoch": 3.1303635585174705, "grad_norm": 0.2874167859554291, "learning_rate": 1.607963994830151e-05, "loss": 1.2234, "step": 10510 }, { "epoch": 3.1306614047171393, "grad_norm": 0.23955944180488586, "learning_rate": 1.607887408847481e-05, "loss": 1.2525, "step": 10511 }, { "epoch": 3.130959250916808, "grad_norm": 0.4108958840370178, "learning_rate": 1.607810817209097e-05, "loss": 1.2191, "step": 10512 }, { "epoch": 3.1312570971164764, "grad_norm": 0.3459058403968811, "learning_rate": 1.6077342199157125e-05, "loss": 1.244, "step": 10513 }, { "epoch": 3.131554943316145, "grad_norm": 0.3005227744579315, "learning_rate": 1.6076576169680404e-05, "loss": 1.2411, "step": 10514 }, { "epoch": 3.131852789515814, "grad_norm": 0.40811359882354736, "learning_rate": 1.6075810083667933e-05, "loss": 1.2507, "step": 10515 }, { "epoch": 3.1321506357154822, "grad_norm": 0.24512659013271332, "learning_rate": 1.607504394112683e-05, "loss": 1.2354, "step": 10516 }, { "epoch": 3.132448481915151, "grad_norm": 0.26784753799438477, "learning_rate": 1.6074277742064237e-05, "loss": 1.2302, "step": 10517 }, { "epoch": 3.13274632811482, "grad_norm": 0.25369390845298767, "learning_rate": 1.6073511486487276e-05, "loss": 1.246, "step": 10518 }, { "epoch": 3.1330441743144886, "grad_norm": 0.27158766984939575, "learning_rate": 1.6072745174403073e-05, "loss": 1.2461, "step": 10519 }, { "epoch": 3.133342020514157, "grad_norm": 0.2626797556877136, "learning_rate": 1.6071978805818765e-05, "loss": 1.2269, "step": 10520 }, { "epoch": 3.1336398667138257, "grad_norm": 0.26225483417510986, "learning_rate": 1.6071212380741475e-05, "loss": 1.2436, "step": 10521 }, { "epoch": 3.1339377129134944, "grad_norm": 0.30332276225090027, "learning_rate": 1.607044589917834e-05, "loss": 1.2383, "step": 10522 }, { "epoch": 3.1342355591131628, "grad_norm": 0.25240257382392883, "learning_rate": 1.6069679361136484e-05, "loss": 1.2286, "step": 10523 }, { "epoch": 3.1345334053128315, "grad_norm": 0.2635928690433502, "learning_rate": 1.6068912766623043e-05, "loss": 1.245, "step": 10524 }, { "epoch": 3.1348312515125003, "grad_norm": 0.25835949182510376, "learning_rate": 1.6068146115645156e-05, "loss": 1.2267, "step": 10525 }, { "epoch": 3.135129097712169, "grad_norm": 0.23610354959964752, "learning_rate": 1.6067379408209945e-05, "loss": 1.2332, "step": 10526 }, { "epoch": 3.1354269439118374, "grad_norm": 0.23394012451171875, "learning_rate": 1.6066612644324545e-05, "loss": 1.2418, "step": 10527 }, { "epoch": 3.135724790111506, "grad_norm": 0.23350240290164948, "learning_rate": 1.6065845823996095e-05, "loss": 1.2325, "step": 10528 }, { "epoch": 3.136022636311175, "grad_norm": 0.2493225485086441, "learning_rate": 1.6065078947231727e-05, "loss": 1.2402, "step": 10529 }, { "epoch": 3.1363204825108433, "grad_norm": 0.24471496045589447, "learning_rate": 1.6064312014038576e-05, "loss": 1.2323, "step": 10530 }, { "epoch": 3.136618328710512, "grad_norm": 0.23390144109725952, "learning_rate": 1.606354502442377e-05, "loss": 1.2367, "step": 10531 }, { "epoch": 3.136916174910181, "grad_norm": 0.24722252786159515, "learning_rate": 1.606277797839446e-05, "loss": 1.2428, "step": 10532 }, { "epoch": 3.1372140211098496, "grad_norm": 0.24313843250274658, "learning_rate": 1.6062010875957774e-05, "loss": 1.2283, "step": 10533 }, { "epoch": 3.137511867309518, "grad_norm": 0.23428331315517426, "learning_rate": 1.606124371712085e-05, "loss": 1.2348, "step": 10534 }, { "epoch": 3.1378097135091867, "grad_norm": 0.2426571398973465, "learning_rate": 1.6060476501890824e-05, "loss": 1.2467, "step": 10535 }, { "epoch": 3.1381075597088555, "grad_norm": 0.2308879941701889, "learning_rate": 1.6059709230274837e-05, "loss": 1.2375, "step": 10536 }, { "epoch": 3.138405405908524, "grad_norm": 0.2395615577697754, "learning_rate": 1.605894190228002e-05, "loss": 1.2537, "step": 10537 }, { "epoch": 3.1387032521081926, "grad_norm": 0.2675611674785614, "learning_rate": 1.6058174517913525e-05, "loss": 1.2334, "step": 10538 }, { "epoch": 3.1390010983078613, "grad_norm": 0.3454257547855377, "learning_rate": 1.605740707718248e-05, "loss": 1.257, "step": 10539 }, { "epoch": 3.13929894450753, "grad_norm": 0.2523891031742096, "learning_rate": 1.6056639580094032e-05, "loss": 1.2475, "step": 10540 }, { "epoch": 3.1395967907071984, "grad_norm": 0.36959531903266907, "learning_rate": 1.605587202665532e-05, "loss": 1.2261, "step": 10541 }, { "epoch": 3.139894636906867, "grad_norm": 0.34983065724372864, "learning_rate": 1.6055104416873485e-05, "loss": 1.2399, "step": 10542 }, { "epoch": 3.140192483106536, "grad_norm": 0.30975499749183655, "learning_rate": 1.6054336750755667e-05, "loss": 1.2525, "step": 10543 }, { "epoch": 3.1404903293062043, "grad_norm": 0.49515029788017273, "learning_rate": 1.605356902830901e-05, "loss": 1.2196, "step": 10544 }, { "epoch": 3.140788175505873, "grad_norm": 0.28890347480773926, "learning_rate": 1.6052801249540656e-05, "loss": 1.2392, "step": 10545 }, { "epoch": 3.141086021705542, "grad_norm": 0.2818622291088104, "learning_rate": 1.6052033414457753e-05, "loss": 1.2316, "step": 10546 }, { "epoch": 3.1413838679052106, "grad_norm": 0.2801220715045929, "learning_rate": 1.6051265523067437e-05, "loss": 1.2478, "step": 10547 }, { "epoch": 3.141681714104879, "grad_norm": 0.28101953864097595, "learning_rate": 1.6050497575376857e-05, "loss": 1.238, "step": 10548 }, { "epoch": 3.1419795603045477, "grad_norm": 0.24674144387245178, "learning_rate": 1.6049729571393155e-05, "loss": 1.2242, "step": 10549 }, { "epoch": 3.1422774065042165, "grad_norm": 0.2553524672985077, "learning_rate": 1.6048961511123484e-05, "loss": 1.2394, "step": 10550 }, { "epoch": 3.142575252703885, "grad_norm": 0.2494676113128662, "learning_rate": 1.6048193394574978e-05, "loss": 1.2347, "step": 10551 }, { "epoch": 3.1428730989035536, "grad_norm": 0.26495182514190674, "learning_rate": 1.6047425221754794e-05, "loss": 1.2294, "step": 10552 }, { "epoch": 3.1431709451032224, "grad_norm": 0.24896548688411713, "learning_rate": 1.6046656992670074e-05, "loss": 1.2491, "step": 10553 }, { "epoch": 3.143468791302891, "grad_norm": 0.24919770658016205, "learning_rate": 1.6045888707327967e-05, "loss": 1.2173, "step": 10554 }, { "epoch": 3.1437666375025595, "grad_norm": 0.24065832793712616, "learning_rate": 1.6045120365735618e-05, "loss": 1.2394, "step": 10555 }, { "epoch": 3.1440644837022282, "grad_norm": 0.2379603236913681, "learning_rate": 1.6044351967900183e-05, "loss": 1.237, "step": 10556 }, { "epoch": 3.144362329901897, "grad_norm": 0.2357507199048996, "learning_rate": 1.60435835138288e-05, "loss": 1.2368, "step": 10557 }, { "epoch": 3.1446601761015653, "grad_norm": 0.23212207853794098, "learning_rate": 1.6042815003528627e-05, "loss": 1.2521, "step": 10558 }, { "epoch": 3.144958022301234, "grad_norm": 0.24604001641273499, "learning_rate": 1.6042046437006814e-05, "loss": 1.2277, "step": 10559 }, { "epoch": 3.145255868500903, "grad_norm": 0.23578615486621857, "learning_rate": 1.6041277814270508e-05, "loss": 1.2409, "step": 10560 }, { "epoch": 3.1455537147005717, "grad_norm": 0.27423205971717834, "learning_rate": 1.604050913532686e-05, "loss": 1.24, "step": 10561 }, { "epoch": 3.14585156090024, "grad_norm": 0.29923346638679504, "learning_rate": 1.6039740400183024e-05, "loss": 1.2299, "step": 10562 }, { "epoch": 3.1461494070999088, "grad_norm": 0.24445225298404694, "learning_rate": 1.6038971608846155e-05, "loss": 1.222, "step": 10563 }, { "epoch": 3.1464472532995775, "grad_norm": 0.46557796001434326, "learning_rate": 1.6038202761323398e-05, "loss": 1.2244, "step": 10564 }, { "epoch": 3.1467450994992463, "grad_norm": 0.33012205362319946, "learning_rate": 1.6037433857621913e-05, "loss": 1.2332, "step": 10565 }, { "epoch": 3.1470429456989146, "grad_norm": 0.2876785099506378, "learning_rate": 1.6036664897748852e-05, "loss": 1.2284, "step": 10566 }, { "epoch": 3.1473407918985834, "grad_norm": 0.24542902410030365, "learning_rate": 1.6035895881711367e-05, "loss": 1.2457, "step": 10567 }, { "epoch": 3.147638638098252, "grad_norm": 0.32998529076576233, "learning_rate": 1.6035126809516614e-05, "loss": 1.2433, "step": 10568 }, { "epoch": 3.1479364842979205, "grad_norm": 0.23939268290996552, "learning_rate": 1.603435768117175e-05, "loss": 1.227, "step": 10569 }, { "epoch": 3.1482343304975893, "grad_norm": 0.24985511600971222, "learning_rate": 1.6033588496683927e-05, "loss": 1.2424, "step": 10570 }, { "epoch": 3.148532176697258, "grad_norm": 0.26416710019111633, "learning_rate": 1.603281925606031e-05, "loss": 1.2471, "step": 10571 }, { "epoch": 3.1488300228969264, "grad_norm": 0.24465322494506836, "learning_rate": 1.6032049959308044e-05, "loss": 1.2423, "step": 10572 }, { "epoch": 3.149127869096595, "grad_norm": 0.28843262791633606, "learning_rate": 1.6031280606434298e-05, "loss": 1.2557, "step": 10573 }, { "epoch": 3.149425715296264, "grad_norm": 0.2554221451282501, "learning_rate": 1.603051119744622e-05, "loss": 1.2416, "step": 10574 }, { "epoch": 3.1497235614959327, "grad_norm": 0.2663499414920807, "learning_rate": 1.6029741732350973e-05, "loss": 1.2289, "step": 10575 }, { "epoch": 3.150021407695601, "grad_norm": 0.24723687767982483, "learning_rate": 1.602897221115572e-05, "loss": 1.2591, "step": 10576 }, { "epoch": 3.15031925389527, "grad_norm": 0.2523896396160126, "learning_rate": 1.6028202633867615e-05, "loss": 1.2279, "step": 10577 }, { "epoch": 3.1506171000949386, "grad_norm": 0.23409320414066315, "learning_rate": 1.602743300049382e-05, "loss": 1.2491, "step": 10578 }, { "epoch": 3.1509149462946073, "grad_norm": 0.2615102231502533, "learning_rate": 1.6026663311041492e-05, "loss": 1.2392, "step": 10579 }, { "epoch": 3.1512127924942757, "grad_norm": 0.24396833777427673, "learning_rate": 1.60258935655178e-05, "loss": 1.2382, "step": 10580 }, { "epoch": 3.1515106386939444, "grad_norm": 0.24034400284290314, "learning_rate": 1.6025123763929894e-05, "loss": 1.2151, "step": 10581 }, { "epoch": 3.151808484893613, "grad_norm": 0.24459712207317352, "learning_rate": 1.602435390628495e-05, "loss": 1.2251, "step": 10582 }, { "epoch": 3.1521063310932815, "grad_norm": 0.26458051800727844, "learning_rate": 1.6023583992590118e-05, "loss": 1.2436, "step": 10583 }, { "epoch": 3.1524041772929503, "grad_norm": 0.2689766585826874, "learning_rate": 1.6022814022852573e-05, "loss": 1.2425, "step": 10584 }, { "epoch": 3.152702023492619, "grad_norm": 0.2262040078639984, "learning_rate": 1.6022043997079468e-05, "loss": 1.2391, "step": 10585 }, { "epoch": 3.152999869692288, "grad_norm": 0.263418585062027, "learning_rate": 1.6021273915277973e-05, "loss": 1.2237, "step": 10586 }, { "epoch": 3.153297715891956, "grad_norm": 0.29484131932258606, "learning_rate": 1.6020503777455254e-05, "loss": 1.2376, "step": 10587 }, { "epoch": 3.153595562091625, "grad_norm": 0.25313621759414673, "learning_rate": 1.601973358361847e-05, "loss": 1.2397, "step": 10588 }, { "epoch": 3.1538934082912937, "grad_norm": 0.2395484745502472, "learning_rate": 1.6018963333774792e-05, "loss": 1.239, "step": 10589 }, { "epoch": 3.154191254490962, "grad_norm": 0.2361753135919571, "learning_rate": 1.6018193027931385e-05, "loss": 1.2323, "step": 10590 }, { "epoch": 3.154489100690631, "grad_norm": 0.2424056977033615, "learning_rate": 1.6017422666095417e-05, "loss": 1.2449, "step": 10591 }, { "epoch": 3.1547869468902996, "grad_norm": 0.2801514267921448, "learning_rate": 1.6016652248274055e-05, "loss": 1.2446, "step": 10592 }, { "epoch": 3.1550847930899684, "grad_norm": 0.358261376619339, "learning_rate": 1.6015881774474463e-05, "loss": 1.2503, "step": 10593 }, { "epoch": 3.1553826392896367, "grad_norm": 0.3260704576969147, "learning_rate": 1.6015111244703812e-05, "loss": 1.2271, "step": 10594 }, { "epoch": 3.1556804854893055, "grad_norm": 0.25165343284606934, "learning_rate": 1.6014340658969274e-05, "loss": 1.244, "step": 10595 }, { "epoch": 3.1559783316889742, "grad_norm": 0.41584092378616333, "learning_rate": 1.601357001727802e-05, "loss": 1.2474, "step": 10596 }, { "epoch": 3.1562761778886426, "grad_norm": 0.26856347918510437, "learning_rate": 1.6012799319637208e-05, "loss": 1.2424, "step": 10597 }, { "epoch": 3.1565740240883113, "grad_norm": 0.28092923760414124, "learning_rate": 1.6012028566054018e-05, "loss": 1.2372, "step": 10598 }, { "epoch": 3.15687187028798, "grad_norm": 0.2309153825044632, "learning_rate": 1.6011257756535618e-05, "loss": 1.2288, "step": 10599 }, { "epoch": 3.157169716487649, "grad_norm": 0.5773800611495972, "learning_rate": 1.6010486891089184e-05, "loss": 1.2306, "step": 10600 }, { "epoch": 3.157467562687317, "grad_norm": 0.3086642622947693, "learning_rate": 1.6009715969721883e-05, "loss": 1.2397, "step": 10601 }, { "epoch": 3.157765408886986, "grad_norm": 0.2767390012741089, "learning_rate": 1.600894499244089e-05, "loss": 1.2353, "step": 10602 }, { "epoch": 3.1580632550866548, "grad_norm": 0.2537829875946045, "learning_rate": 1.6008173959253378e-05, "loss": 1.219, "step": 10603 }, { "epoch": 3.158361101286323, "grad_norm": 0.23458589613437653, "learning_rate": 1.600740287016652e-05, "loss": 1.2382, "step": 10604 }, { "epoch": 3.158658947485992, "grad_norm": 0.2574675977230072, "learning_rate": 1.600663172518749e-05, "loss": 1.2451, "step": 10605 }, { "epoch": 3.1589567936856606, "grad_norm": 0.23827262222766876, "learning_rate": 1.600586052432346e-05, "loss": 1.2456, "step": 10606 }, { "epoch": 3.1592546398853294, "grad_norm": 0.2524144649505615, "learning_rate": 1.600508926758161e-05, "loss": 1.2392, "step": 10607 }, { "epoch": 3.1595524860849977, "grad_norm": 0.22839228808879852, "learning_rate": 1.6004317954969114e-05, "loss": 1.2532, "step": 10608 }, { "epoch": 3.1598503322846665, "grad_norm": 0.23768730461597443, "learning_rate": 1.600354658649315e-05, "loss": 1.2378, "step": 10609 }, { "epoch": 3.1601481784843353, "grad_norm": 0.2440677136182785, "learning_rate": 1.600277516216089e-05, "loss": 1.2295, "step": 10610 }, { "epoch": 3.1604460246840036, "grad_norm": 0.24208486080169678, "learning_rate": 1.6002003681979513e-05, "loss": 1.2439, "step": 10611 }, { "epoch": 3.1607438708836724, "grad_norm": 0.2367459386587143, "learning_rate": 1.60012321459562e-05, "loss": 1.2267, "step": 10612 }, { "epoch": 3.161041717083341, "grad_norm": 0.22994917631149292, "learning_rate": 1.6000460554098126e-05, "loss": 1.23, "step": 10613 }, { "epoch": 3.16133956328301, "grad_norm": 0.23039232194423676, "learning_rate": 1.599968890641247e-05, "loss": 1.2238, "step": 10614 }, { "epoch": 3.1616374094826782, "grad_norm": 0.24741627275943756, "learning_rate": 1.5998917202906414e-05, "loss": 1.2337, "step": 10615 }, { "epoch": 3.161935255682347, "grad_norm": 0.24553634226322174, "learning_rate": 1.599814544358713e-05, "loss": 1.2307, "step": 10616 }, { "epoch": 3.162233101882016, "grad_norm": 0.2533511519432068, "learning_rate": 1.599737362846181e-05, "loss": 1.2557, "step": 10617 }, { "epoch": 3.162530948081684, "grad_norm": 0.2352992445230484, "learning_rate": 1.599660175753763e-05, "loss": 1.2315, "step": 10618 }, { "epoch": 3.162828794281353, "grad_norm": 0.23641620576381683, "learning_rate": 1.5995829830821766e-05, "loss": 1.2307, "step": 10619 }, { "epoch": 3.1631266404810217, "grad_norm": 0.23822817206382751, "learning_rate": 1.599505784832141e-05, "loss": 1.2334, "step": 10620 }, { "epoch": 3.1634244866806904, "grad_norm": 0.25682422518730164, "learning_rate": 1.5994285810043733e-05, "loss": 1.2388, "step": 10621 }, { "epoch": 3.1637223328803588, "grad_norm": 0.23904050886631012, "learning_rate": 1.5993513715995925e-05, "loss": 1.2381, "step": 10622 }, { "epoch": 3.1640201790800275, "grad_norm": 0.23546943068504333, "learning_rate": 1.599274156618517e-05, "loss": 1.243, "step": 10623 }, { "epoch": 3.1643180252796963, "grad_norm": 0.24305890500545502, "learning_rate": 1.5991969360618647e-05, "loss": 1.2377, "step": 10624 }, { "epoch": 3.1646158714793646, "grad_norm": 0.23653298616409302, "learning_rate": 1.5991197099303546e-05, "loss": 1.2373, "step": 10625 }, { "epoch": 3.1649137176790334, "grad_norm": 0.2401796281337738, "learning_rate": 1.599042478224705e-05, "loss": 1.2428, "step": 10626 }, { "epoch": 3.165211563878702, "grad_norm": 0.23065708577632904, "learning_rate": 1.5989652409456346e-05, "loss": 1.239, "step": 10627 }, { "epoch": 3.165509410078371, "grad_norm": 0.2518618702888489, "learning_rate": 1.5988879980938615e-05, "loss": 1.2256, "step": 10628 }, { "epoch": 3.1658072562780393, "grad_norm": 0.24840891361236572, "learning_rate": 1.5988107496701045e-05, "loss": 1.2454, "step": 10629 }, { "epoch": 3.166105102477708, "grad_norm": 0.23902809619903564, "learning_rate": 1.598733495675083e-05, "loss": 1.239, "step": 10630 }, { "epoch": 3.166402948677377, "grad_norm": 0.2358437478542328, "learning_rate": 1.5986562361095153e-05, "loss": 1.2446, "step": 10631 }, { "epoch": 3.1667007948770456, "grad_norm": 0.2648789584636688, "learning_rate": 1.5985789709741197e-05, "loss": 1.2413, "step": 10632 }, { "epoch": 3.166998641076714, "grad_norm": 0.2546963393688202, "learning_rate": 1.598501700269616e-05, "loss": 1.2379, "step": 10633 }, { "epoch": 3.1672964872763827, "grad_norm": 0.2382189929485321, "learning_rate": 1.5984244239967225e-05, "loss": 1.2242, "step": 10634 }, { "epoch": 3.1675943334760515, "grad_norm": 0.25194206833839417, "learning_rate": 1.5983471421561584e-05, "loss": 1.2239, "step": 10635 }, { "epoch": 3.16789217967572, "grad_norm": 0.24583874642848969, "learning_rate": 1.5982698547486423e-05, "loss": 1.24, "step": 10636 }, { "epoch": 3.1681900258753886, "grad_norm": 0.23640339076519012, "learning_rate": 1.598192561774894e-05, "loss": 1.2345, "step": 10637 }, { "epoch": 3.1684878720750573, "grad_norm": 0.24777191877365112, "learning_rate": 1.5981152632356324e-05, "loss": 1.2611, "step": 10638 }, { "epoch": 3.1687857182747257, "grad_norm": 0.24595105648040771, "learning_rate": 1.598037959131576e-05, "loss": 1.2318, "step": 10639 }, { "epoch": 3.1690835644743944, "grad_norm": 0.23297661542892456, "learning_rate": 1.597960649463445e-05, "loss": 1.2389, "step": 10640 }, { "epoch": 3.169381410674063, "grad_norm": 0.23024040460586548, "learning_rate": 1.597883334231958e-05, "loss": 1.2273, "step": 10641 }, { "epoch": 3.169679256873732, "grad_norm": 0.24169017374515533, "learning_rate": 1.5978060134378348e-05, "loss": 1.2303, "step": 10642 }, { "epoch": 3.1699771030734003, "grad_norm": 0.2444298267364502, "learning_rate": 1.5977286870817943e-05, "loss": 1.2454, "step": 10643 }, { "epoch": 3.170274949273069, "grad_norm": 0.24222426116466522, "learning_rate": 1.5976513551645563e-05, "loss": 1.2487, "step": 10644 }, { "epoch": 3.170572795472738, "grad_norm": 0.2733427882194519, "learning_rate": 1.59757401768684e-05, "loss": 1.2317, "step": 10645 }, { "epoch": 3.1708706416724066, "grad_norm": 0.24904723465442657, "learning_rate": 1.5974966746493656e-05, "loss": 1.2417, "step": 10646 }, { "epoch": 3.171168487872075, "grad_norm": 0.28826475143432617, "learning_rate": 1.5974193260528518e-05, "loss": 1.2374, "step": 10647 }, { "epoch": 3.1714663340717437, "grad_norm": 0.28618383407592773, "learning_rate": 1.5973419718980187e-05, "loss": 1.2489, "step": 10648 }, { "epoch": 3.1717641802714125, "grad_norm": 0.2855173349380493, "learning_rate": 1.597264612185586e-05, "loss": 1.2383, "step": 10649 }, { "epoch": 3.172062026471081, "grad_norm": 0.27851954102516174, "learning_rate": 1.5971872469162732e-05, "loss": 1.2414, "step": 10650 }, { "epoch": 3.1723598726707496, "grad_norm": 0.24966329336166382, "learning_rate": 1.5971098760908007e-05, "loss": 1.2286, "step": 10651 }, { "epoch": 3.1726577188704184, "grad_norm": 0.25896716117858887, "learning_rate": 1.5970324997098875e-05, "loss": 1.2321, "step": 10652 }, { "epoch": 3.172955565070087, "grad_norm": 0.24297773838043213, "learning_rate": 1.5969551177742542e-05, "loss": 1.2283, "step": 10653 }, { "epoch": 3.1732534112697555, "grad_norm": 0.2746482193470001, "learning_rate": 1.5968777302846204e-05, "loss": 1.2415, "step": 10654 }, { "epoch": 3.1735512574694242, "grad_norm": 0.2508457899093628, "learning_rate": 1.596800337241706e-05, "loss": 1.2466, "step": 10655 }, { "epoch": 3.173849103669093, "grad_norm": 0.259013295173645, "learning_rate": 1.5967229386462314e-05, "loss": 1.235, "step": 10656 }, { "epoch": 3.1741469498687613, "grad_norm": 0.2628565728664398, "learning_rate": 1.5966455344989168e-05, "loss": 1.2392, "step": 10657 }, { "epoch": 3.17444479606843, "grad_norm": 0.23000118136405945, "learning_rate": 1.596568124800482e-05, "loss": 1.2404, "step": 10658 }, { "epoch": 3.174742642268099, "grad_norm": 0.3210986256599426, "learning_rate": 1.596490709551647e-05, "loss": 1.2302, "step": 10659 }, { "epoch": 3.1750404884677677, "grad_norm": 0.2435050904750824, "learning_rate": 1.5964132887531327e-05, "loss": 1.2289, "step": 10660 }, { "epoch": 3.175338334667436, "grad_norm": 0.26220378279685974, "learning_rate": 1.596335862405659e-05, "loss": 1.2306, "step": 10661 }, { "epoch": 3.1756361808671048, "grad_norm": 0.23695498704910278, "learning_rate": 1.5962584305099463e-05, "loss": 1.2269, "step": 10662 }, { "epoch": 3.1759340270667735, "grad_norm": 0.26109692454338074, "learning_rate": 1.5961809930667152e-05, "loss": 1.2385, "step": 10663 }, { "epoch": 3.176231873266442, "grad_norm": 0.24443170428276062, "learning_rate": 1.596103550076686e-05, "loss": 1.2492, "step": 10664 }, { "epoch": 3.1765297194661106, "grad_norm": 0.2502957880496979, "learning_rate": 1.5960261015405795e-05, "loss": 1.2291, "step": 10665 }, { "epoch": 3.1768275656657794, "grad_norm": 0.2379729002714157, "learning_rate": 1.5959486474591158e-05, "loss": 1.2359, "step": 10666 }, { "epoch": 3.177125411865448, "grad_norm": 0.276965469121933, "learning_rate": 1.5958711878330158e-05, "loss": 1.2421, "step": 10667 }, { "epoch": 3.1774232580651165, "grad_norm": 0.24670004844665527, "learning_rate": 1.595793722663e-05, "loss": 1.246, "step": 10668 }, { "epoch": 3.1777211042647853, "grad_norm": 0.2542455792427063, "learning_rate": 1.5957162519497897e-05, "loss": 1.2483, "step": 10669 }, { "epoch": 3.178018950464454, "grad_norm": 0.23900531232357025, "learning_rate": 1.595638775694105e-05, "loss": 1.2383, "step": 10670 }, { "epoch": 3.1783167966641224, "grad_norm": 0.2516971230506897, "learning_rate": 1.5955612938966667e-05, "loss": 1.2575, "step": 10671 }, { "epoch": 3.178614642863791, "grad_norm": 0.25015169382095337, "learning_rate": 1.5954838065581962e-05, "loss": 1.228, "step": 10672 }, { "epoch": 3.17891248906346, "grad_norm": 0.23840080201625824, "learning_rate": 1.5954063136794143e-05, "loss": 1.2269, "step": 10673 }, { "epoch": 3.1792103352631287, "grad_norm": 0.2403010129928589, "learning_rate": 1.5953288152610416e-05, "loss": 1.2285, "step": 10674 }, { "epoch": 3.179508181462797, "grad_norm": 0.23212282359600067, "learning_rate": 1.5952513113037996e-05, "loss": 1.2315, "step": 10675 }, { "epoch": 3.179806027662466, "grad_norm": 0.25795066356658936, "learning_rate": 1.595173801808409e-05, "loss": 1.2421, "step": 10676 }, { "epoch": 3.1801038738621346, "grad_norm": 0.2992171347141266, "learning_rate": 1.5950962867755916e-05, "loss": 1.2389, "step": 10677 }, { "epoch": 3.180401720061803, "grad_norm": 0.2883983254432678, "learning_rate": 1.595018766206068e-05, "loss": 1.2252, "step": 10678 }, { "epoch": 3.1806995662614717, "grad_norm": 0.22906909883022308, "learning_rate": 1.5949412401005594e-05, "loss": 1.2426, "step": 10679 }, { "epoch": 3.1809974124611404, "grad_norm": 0.24934309720993042, "learning_rate": 1.5948637084597873e-05, "loss": 1.2319, "step": 10680 }, { "epoch": 3.181295258660809, "grad_norm": 0.33420678973197937, "learning_rate": 1.594786171284473e-05, "loss": 1.2409, "step": 10681 }, { "epoch": 3.1815931048604775, "grad_norm": 0.31714025139808655, "learning_rate": 1.594708628575338e-05, "loss": 1.2292, "step": 10682 }, { "epoch": 3.1818909510601463, "grad_norm": 0.25144240260124207, "learning_rate": 1.5946310803331035e-05, "loss": 1.2403, "step": 10683 }, { "epoch": 3.182188797259815, "grad_norm": 0.4745495319366455, "learning_rate": 1.594553526558491e-05, "loss": 1.2337, "step": 10684 }, { "epoch": 3.182486643459484, "grad_norm": 0.35296693444252014, "learning_rate": 1.5944759672522227e-05, "loss": 1.2524, "step": 10685 }, { "epoch": 3.182784489659152, "grad_norm": 0.3100326657295227, "learning_rate": 1.5943984024150195e-05, "loss": 1.2409, "step": 10686 }, { "epoch": 3.183082335858821, "grad_norm": 0.3078671991825104, "learning_rate": 1.5943208320476032e-05, "loss": 1.2203, "step": 10687 }, { "epoch": 3.1833801820584897, "grad_norm": 0.2713058888912201, "learning_rate": 1.5942432561506956e-05, "loss": 1.2566, "step": 10688 }, { "epoch": 3.183678028258158, "grad_norm": 0.24255773425102234, "learning_rate": 1.5941656747250186e-05, "loss": 1.2302, "step": 10689 }, { "epoch": 3.183975874457827, "grad_norm": 0.2739361822605133, "learning_rate": 1.5940880877712935e-05, "loss": 1.2347, "step": 10690 }, { "epoch": 3.1842737206574956, "grad_norm": 0.24218833446502686, "learning_rate": 1.5940104952902427e-05, "loss": 1.2394, "step": 10691 }, { "epoch": 3.184571566857164, "grad_norm": 0.295357882976532, "learning_rate": 1.593932897282588e-05, "loss": 1.2416, "step": 10692 }, { "epoch": 3.1848694130568327, "grad_norm": 0.23636461794376373, "learning_rate": 1.5938552937490512e-05, "loss": 1.2442, "step": 10693 }, { "epoch": 3.1851672592565015, "grad_norm": 0.25766557455062866, "learning_rate": 1.593777684690354e-05, "loss": 1.2243, "step": 10694 }, { "epoch": 3.1854651054561702, "grad_norm": 0.23682431876659393, "learning_rate": 1.5937000701072193e-05, "loss": 1.2317, "step": 10695 }, { "epoch": 3.1857629516558386, "grad_norm": 0.28741541504859924, "learning_rate": 1.593622450000369e-05, "loss": 1.2417, "step": 10696 }, { "epoch": 3.1860607978555073, "grad_norm": 0.24891297519207, "learning_rate": 1.5935448243705244e-05, "loss": 1.2336, "step": 10697 }, { "epoch": 3.186358644055176, "grad_norm": 0.27397966384887695, "learning_rate": 1.5934671932184088e-05, "loss": 1.2407, "step": 10698 }, { "epoch": 3.186656490254845, "grad_norm": 0.24041998386383057, "learning_rate": 1.5933895565447438e-05, "loss": 1.2267, "step": 10699 }, { "epoch": 3.186954336454513, "grad_norm": 0.44697391986846924, "learning_rate": 1.593311914350252e-05, "loss": 1.2516, "step": 10700 }, { "epoch": 3.187252182654182, "grad_norm": 0.28662437200546265, "learning_rate": 1.593234266635656e-05, "loss": 1.2548, "step": 10701 }, { "epoch": 3.1875500288538507, "grad_norm": 0.29262790083885193, "learning_rate": 1.5931566134016776e-05, "loss": 1.2208, "step": 10702 }, { "epoch": 3.187847875053519, "grad_norm": 0.2651198208332062, "learning_rate": 1.5930789546490397e-05, "loss": 1.2393, "step": 10703 }, { "epoch": 3.188145721253188, "grad_norm": 0.3279496729373932, "learning_rate": 1.5930012903784647e-05, "loss": 1.2232, "step": 10704 }, { "epoch": 3.1884435674528566, "grad_norm": 0.32991090416908264, "learning_rate": 1.5929236205906752e-05, "loss": 1.2306, "step": 10705 }, { "epoch": 3.188741413652525, "grad_norm": 0.2659478783607483, "learning_rate": 1.5928459452863942e-05, "loss": 1.2562, "step": 10706 }, { "epoch": 3.1890392598521937, "grad_norm": 0.29939010739326477, "learning_rate": 1.5927682644663438e-05, "loss": 1.2342, "step": 10707 }, { "epoch": 3.1893371060518625, "grad_norm": 0.256277859210968, "learning_rate": 1.592690578131247e-05, "loss": 1.2411, "step": 10708 }, { "epoch": 3.1896349522515313, "grad_norm": 0.3427721858024597, "learning_rate": 1.5926128862818267e-05, "loss": 1.2419, "step": 10709 }, { "epoch": 3.1899327984511996, "grad_norm": 0.24358530342578888, "learning_rate": 1.5925351889188053e-05, "loss": 1.2348, "step": 10710 }, { "epoch": 3.1902306446508684, "grad_norm": 0.27404895424842834, "learning_rate": 1.5924574860429064e-05, "loss": 1.2291, "step": 10711 }, { "epoch": 3.190528490850537, "grad_norm": 0.24499614536762238, "learning_rate": 1.592379777654852e-05, "loss": 1.2249, "step": 10712 }, { "epoch": 3.190826337050206, "grad_norm": 0.25218456983566284, "learning_rate": 1.592302063755366e-05, "loss": 1.2261, "step": 10713 }, { "epoch": 3.1911241832498742, "grad_norm": 0.2580556273460388, "learning_rate": 1.592224344345171e-05, "loss": 1.244, "step": 10714 }, { "epoch": 3.191422029449543, "grad_norm": 0.25089213252067566, "learning_rate": 1.59214661942499e-05, "loss": 1.2316, "step": 10715 }, { "epoch": 3.191719875649212, "grad_norm": 0.2475767582654953, "learning_rate": 1.5920688889955463e-05, "loss": 1.2315, "step": 10716 }, { "epoch": 3.19201772184888, "grad_norm": 0.2405940592288971, "learning_rate": 1.5919911530575634e-05, "loss": 1.2361, "step": 10717 }, { "epoch": 3.192315568048549, "grad_norm": 0.23215091228485107, "learning_rate": 1.591913411611764e-05, "loss": 1.2113, "step": 10718 }, { "epoch": 3.1926134142482177, "grad_norm": 0.22682683169841766, "learning_rate": 1.591835664658872e-05, "loss": 1.2321, "step": 10719 }, { "epoch": 3.1929112604478864, "grad_norm": 0.23340408504009247, "learning_rate": 1.5917579121996096e-05, "loss": 1.235, "step": 10720 }, { "epoch": 3.1932091066475548, "grad_norm": 0.22769591212272644, "learning_rate": 1.5916801542347013e-05, "loss": 1.224, "step": 10721 }, { "epoch": 3.1935069528472235, "grad_norm": 0.24305878579616547, "learning_rate": 1.5916023907648703e-05, "loss": 1.2164, "step": 10722 }, { "epoch": 3.1938047990468923, "grad_norm": 0.24259690940380096, "learning_rate": 1.5915246217908403e-05, "loss": 1.2328, "step": 10723 }, { "epoch": 3.1941026452465606, "grad_norm": 0.2590729296207428, "learning_rate": 1.5914468473133343e-05, "loss": 1.2271, "step": 10724 }, { "epoch": 3.1944004914462294, "grad_norm": 0.36325037479400635, "learning_rate": 1.591369067333076e-05, "loss": 1.2415, "step": 10725 }, { "epoch": 3.194698337645898, "grad_norm": 0.2736909091472626, "learning_rate": 1.5912912818507897e-05, "loss": 1.2337, "step": 10726 }, { "epoch": 3.194996183845567, "grad_norm": 0.26318758726119995, "learning_rate": 1.5912134908671985e-05, "loss": 1.2378, "step": 10727 }, { "epoch": 3.1952940300452353, "grad_norm": 0.3223790228366852, "learning_rate": 1.5911356943830264e-05, "loss": 1.2276, "step": 10728 }, { "epoch": 3.195591876244904, "grad_norm": 0.23630988597869873, "learning_rate": 1.591057892398997e-05, "loss": 1.2438, "step": 10729 }, { "epoch": 3.195889722444573, "grad_norm": 0.2934418320655823, "learning_rate": 1.590980084915834e-05, "loss": 1.222, "step": 10730 }, { "epoch": 3.196187568644241, "grad_norm": 0.2269815057516098, "learning_rate": 1.590902271934262e-05, "loss": 1.2147, "step": 10731 }, { "epoch": 3.19648541484391, "grad_norm": 0.3533363938331604, "learning_rate": 1.590824453455004e-05, "loss": 1.2396, "step": 10732 }, { "epoch": 3.1967832610435787, "grad_norm": 0.3048786222934723, "learning_rate": 1.5907466294787852e-05, "loss": 1.2195, "step": 10733 }, { "epoch": 3.1970811072432475, "grad_norm": 0.2923799455165863, "learning_rate": 1.590668800006329e-05, "loss": 1.2354, "step": 10734 }, { "epoch": 3.197378953442916, "grad_norm": 0.4716499149799347, "learning_rate": 1.590590965038359e-05, "loss": 1.233, "step": 10735 }, { "epoch": 3.1976767996425846, "grad_norm": 0.308133602142334, "learning_rate": 1.5905131245756004e-05, "loss": 1.2179, "step": 10736 }, { "epoch": 3.1979746458422533, "grad_norm": 0.29060718417167664, "learning_rate": 1.590435278618777e-05, "loss": 1.2341, "step": 10737 }, { "epoch": 3.1982724920419217, "grad_norm": 0.3171718120574951, "learning_rate": 1.5903574271686126e-05, "loss": 1.2442, "step": 10738 }, { "epoch": 3.1985703382415904, "grad_norm": 0.25217702984809875, "learning_rate": 1.590279570225832e-05, "loss": 1.2133, "step": 10739 }, { "epoch": 3.198868184441259, "grad_norm": 0.28292983770370483, "learning_rate": 1.5902017077911596e-05, "loss": 1.2306, "step": 10740 }, { "epoch": 3.199166030640928, "grad_norm": 0.26874682307243347, "learning_rate": 1.5901238398653197e-05, "loss": 1.2335, "step": 10741 }, { "epoch": 3.1994638768405963, "grad_norm": 0.3263053894042969, "learning_rate": 1.590045966449037e-05, "loss": 1.2385, "step": 10742 }, { "epoch": 3.199761723040265, "grad_norm": 0.23493176698684692, "learning_rate": 1.5899680875430355e-05, "loss": 1.2344, "step": 10743 }, { "epoch": 3.200059569239934, "grad_norm": 0.25662684440612793, "learning_rate": 1.5898902031480403e-05, "loss": 1.236, "step": 10744 }, { "epoch": 3.200357415439602, "grad_norm": 0.2345069795846939, "learning_rate": 1.5898123132647757e-05, "loss": 1.2223, "step": 10745 }, { "epoch": 3.200655261639271, "grad_norm": 0.2831743061542511, "learning_rate": 1.5897344178939666e-05, "loss": 1.2452, "step": 10746 }, { "epoch": 3.2009531078389397, "grad_norm": 0.24404355883598328, "learning_rate": 1.5896565170363375e-05, "loss": 1.2286, "step": 10747 }, { "epoch": 3.2012509540386085, "grad_norm": 0.30020880699157715, "learning_rate": 1.5895786106926135e-05, "loss": 1.2309, "step": 10748 }, { "epoch": 3.201548800238277, "grad_norm": 0.23508435487747192, "learning_rate": 1.5895006988635195e-05, "loss": 1.2301, "step": 10749 }, { "epoch": 3.2018466464379456, "grad_norm": 0.38590341806411743, "learning_rate": 1.5894227815497797e-05, "loss": 1.2362, "step": 10750 }, { "epoch": 3.2021444926376144, "grad_norm": 0.3294226825237274, "learning_rate": 1.5893448587521196e-05, "loss": 1.2446, "step": 10751 }, { "epoch": 3.202442338837283, "grad_norm": 0.29960083961486816, "learning_rate": 1.589266930471264e-05, "loss": 1.2398, "step": 10752 }, { "epoch": 3.2027401850369515, "grad_norm": 0.3394383490085602, "learning_rate": 1.589188996707938e-05, "loss": 1.2325, "step": 10753 }, { "epoch": 3.2030380312366202, "grad_norm": 0.23774844408035278, "learning_rate": 1.5891110574628664e-05, "loss": 1.2313, "step": 10754 }, { "epoch": 3.203335877436289, "grad_norm": 0.2551165223121643, "learning_rate": 1.589033112736775e-05, "loss": 1.2399, "step": 10755 }, { "epoch": 3.2036337236359573, "grad_norm": 0.24792854487895966, "learning_rate": 1.5889551625303883e-05, "loss": 1.2388, "step": 10756 }, { "epoch": 3.203931569835626, "grad_norm": 0.3705975413322449, "learning_rate": 1.588877206844432e-05, "loss": 1.227, "step": 10757 }, { "epoch": 3.204229416035295, "grad_norm": 0.29245322942733765, "learning_rate": 1.5887992456796313e-05, "loss": 1.2415, "step": 10758 }, { "epoch": 3.204527262234963, "grad_norm": 0.3218289315700531, "learning_rate": 1.5887212790367113e-05, "loss": 1.2421, "step": 10759 }, { "epoch": 3.204825108434632, "grad_norm": 0.4285733699798584, "learning_rate": 1.5886433069163975e-05, "loss": 1.249, "step": 10760 }, { "epoch": 3.2051229546343007, "grad_norm": 0.25649911165237427, "learning_rate": 1.5885653293194157e-05, "loss": 1.226, "step": 10761 }, { "epoch": 3.2054208008339695, "grad_norm": 0.26349279284477234, "learning_rate": 1.588487346246491e-05, "loss": 1.2504, "step": 10762 }, { "epoch": 3.205718647033638, "grad_norm": 0.23419275879859924, "learning_rate": 1.588409357698349e-05, "loss": 1.2246, "step": 10763 }, { "epoch": 3.2060164932333066, "grad_norm": 0.26687389612197876, "learning_rate": 1.588331363675715e-05, "loss": 1.2408, "step": 10764 }, { "epoch": 3.2063143394329754, "grad_norm": 0.24406246840953827, "learning_rate": 1.5882533641793154e-05, "loss": 1.2235, "step": 10765 }, { "epoch": 3.206612185632644, "grad_norm": 0.28717195987701416, "learning_rate": 1.5881753592098753e-05, "loss": 1.2321, "step": 10766 }, { "epoch": 3.2069100318323125, "grad_norm": 0.23939520120620728, "learning_rate": 1.5880973487681207e-05, "loss": 1.2357, "step": 10767 }, { "epoch": 3.2072078780319813, "grad_norm": 0.2434358447790146, "learning_rate": 1.588019332854777e-05, "loss": 1.2345, "step": 10768 }, { "epoch": 3.20750572423165, "grad_norm": 0.23970021307468414, "learning_rate": 1.587941311470571e-05, "loss": 1.2256, "step": 10769 }, { "epoch": 3.2078035704313184, "grad_norm": 0.25342997908592224, "learning_rate": 1.5878632846162277e-05, "loss": 1.2514, "step": 10770 }, { "epoch": 3.208101416630987, "grad_norm": 0.2785789966583252, "learning_rate": 1.5877852522924733e-05, "loss": 1.245, "step": 10771 }, { "epoch": 3.208399262830656, "grad_norm": 0.22904382646083832, "learning_rate": 1.587707214500034e-05, "loss": 1.2308, "step": 10772 }, { "epoch": 3.2086971090303242, "grad_norm": 0.25558245182037354, "learning_rate": 1.5876291712396353e-05, "loss": 1.2583, "step": 10773 }, { "epoch": 3.208994955229993, "grad_norm": 0.2670227587223053, "learning_rate": 1.587551122512004e-05, "loss": 1.221, "step": 10774 }, { "epoch": 3.209292801429662, "grad_norm": 0.24412156641483307, "learning_rate": 1.587473068317866e-05, "loss": 1.2382, "step": 10775 }, { "epoch": 3.2095906476293306, "grad_norm": 0.24204950034618378, "learning_rate": 1.5873950086579473e-05, "loss": 1.2446, "step": 10776 }, { "epoch": 3.209888493828999, "grad_norm": 0.2510155737400055, "learning_rate": 1.5873169435329745e-05, "loss": 1.2272, "step": 10777 }, { "epoch": 3.2101863400286677, "grad_norm": 0.2400822639465332, "learning_rate": 1.5872388729436735e-05, "loss": 1.2434, "step": 10778 }, { "epoch": 3.2104841862283364, "grad_norm": 0.2394513189792633, "learning_rate": 1.5871607968907712e-05, "loss": 1.2273, "step": 10779 }, { "epoch": 3.210782032428005, "grad_norm": 0.2357054054737091, "learning_rate": 1.5870827153749932e-05, "loss": 1.226, "step": 10780 }, { "epoch": 3.2110798786276735, "grad_norm": 0.23843006789684296, "learning_rate": 1.5870046283970667e-05, "loss": 1.2487, "step": 10781 }, { "epoch": 3.2113777248273423, "grad_norm": 0.23922370374202728, "learning_rate": 1.5869265359577184e-05, "loss": 1.225, "step": 10782 }, { "epoch": 3.211675571027011, "grad_norm": 0.23820941150188446, "learning_rate": 1.586848438057674e-05, "loss": 1.2506, "step": 10783 }, { "epoch": 3.2119734172266794, "grad_norm": 0.23231257498264313, "learning_rate": 1.5867703346976607e-05, "loss": 1.2477, "step": 10784 }, { "epoch": 3.212271263426348, "grad_norm": 0.2385302484035492, "learning_rate": 1.5866922258784048e-05, "loss": 1.2381, "step": 10785 }, { "epoch": 3.212569109626017, "grad_norm": 0.28267809748649597, "learning_rate": 1.586614111600633e-05, "loss": 1.2455, "step": 10786 }, { "epoch": 3.2128669558256857, "grad_norm": 0.30639517307281494, "learning_rate": 1.5865359918650728e-05, "loss": 1.2284, "step": 10787 }, { "epoch": 3.213164802025354, "grad_norm": 0.2467852681875229, "learning_rate": 1.5864578666724505e-05, "loss": 1.2389, "step": 10788 }, { "epoch": 3.213462648225023, "grad_norm": 0.2759745121002197, "learning_rate": 1.586379736023493e-05, "loss": 1.2245, "step": 10789 }, { "epoch": 3.2137604944246916, "grad_norm": 0.2421119511127472, "learning_rate": 1.586301599918927e-05, "loss": 1.2296, "step": 10790 }, { "epoch": 3.21405834062436, "grad_norm": 0.24279387295246124, "learning_rate": 1.58622345835948e-05, "loss": 1.2397, "step": 10791 }, { "epoch": 3.2143561868240287, "grad_norm": 0.2437242716550827, "learning_rate": 1.586145311345878e-05, "loss": 1.241, "step": 10792 }, { "epoch": 3.2146540330236975, "grad_norm": 0.35955995321273804, "learning_rate": 1.586067158878849e-05, "loss": 1.2486, "step": 10793 }, { "epoch": 3.2149518792233662, "grad_norm": 0.35823380947113037, "learning_rate": 1.5859890009591204e-05, "loss": 1.2346, "step": 10794 }, { "epoch": 3.2152497254230346, "grad_norm": 0.2452254593372345, "learning_rate": 1.5859108375874184e-05, "loss": 1.2518, "step": 10795 }, { "epoch": 3.2155475716227033, "grad_norm": 0.533737301826477, "learning_rate": 1.585832668764471e-05, "loss": 1.2364, "step": 10796 }, { "epoch": 3.215845417822372, "grad_norm": 0.36993083357810974, "learning_rate": 1.585754494491005e-05, "loss": 1.2328, "step": 10797 }, { "epoch": 3.2161432640220404, "grad_norm": 0.3465012311935425, "learning_rate": 1.5856763147677476e-05, "loss": 1.2261, "step": 10798 }, { "epoch": 3.216441110221709, "grad_norm": 0.24372775852680206, "learning_rate": 1.5855981295954267e-05, "loss": 1.2389, "step": 10799 }, { "epoch": 3.216738956421378, "grad_norm": 0.49915915727615356, "learning_rate": 1.5855199389747693e-05, "loss": 1.2295, "step": 10800 }, { "epoch": 3.2170368026210467, "grad_norm": 0.272409588098526, "learning_rate": 1.5854417429065032e-05, "loss": 1.2399, "step": 10801 }, { "epoch": 3.217334648820715, "grad_norm": 0.27181148529052734, "learning_rate": 1.585363541391356e-05, "loss": 1.2167, "step": 10802 }, { "epoch": 3.217632495020384, "grad_norm": 0.2770851254463196, "learning_rate": 1.5852853344300546e-05, "loss": 1.2477, "step": 10803 }, { "epoch": 3.2179303412200526, "grad_norm": 0.23387804627418518, "learning_rate": 1.585207122023327e-05, "loss": 1.2409, "step": 10804 }, { "epoch": 3.218228187419721, "grad_norm": 0.37050560116767883, "learning_rate": 1.5851289041719017e-05, "loss": 1.2347, "step": 10805 }, { "epoch": 3.2185260336193897, "grad_norm": 0.24400877952575684, "learning_rate": 1.585050680876505e-05, "loss": 1.2295, "step": 10806 }, { "epoch": 3.2188238798190585, "grad_norm": 0.2682911455631256, "learning_rate": 1.5849724521378656e-05, "loss": 1.2426, "step": 10807 }, { "epoch": 3.2191217260187273, "grad_norm": 0.26314225792884827, "learning_rate": 1.5848942179567114e-05, "loss": 1.2317, "step": 10808 }, { "epoch": 3.2194195722183956, "grad_norm": 0.24134303629398346, "learning_rate": 1.5848159783337692e-05, "loss": 1.2355, "step": 10809 }, { "epoch": 3.2197174184180644, "grad_norm": 0.2847636044025421, "learning_rate": 1.5847377332697683e-05, "loss": 1.2257, "step": 10810 }, { "epoch": 3.220015264617733, "grad_norm": 0.24767592549324036, "learning_rate": 1.584659482765436e-05, "loss": 1.2428, "step": 10811 }, { "epoch": 3.2203131108174015, "grad_norm": 0.2538882791996002, "learning_rate": 1.5845812268215005e-05, "loss": 1.239, "step": 10812 }, { "epoch": 3.2206109570170702, "grad_norm": 0.2436237782239914, "learning_rate": 1.5845029654386895e-05, "loss": 1.2261, "step": 10813 }, { "epoch": 3.220908803216739, "grad_norm": 0.24263125658035278, "learning_rate": 1.5844246986177322e-05, "loss": 1.2439, "step": 10814 }, { "epoch": 3.2212066494164078, "grad_norm": 0.2770474851131439, "learning_rate": 1.5843464263593553e-05, "loss": 1.2418, "step": 10815 }, { "epoch": 3.221504495616076, "grad_norm": 0.258004367351532, "learning_rate": 1.584268148664288e-05, "loss": 1.2383, "step": 10816 }, { "epoch": 3.221802341815745, "grad_norm": 0.23440416157245636, "learning_rate": 1.5841898655332582e-05, "loss": 1.2377, "step": 10817 }, { "epoch": 3.2221001880154136, "grad_norm": 0.24034483730793, "learning_rate": 1.584111576966995e-05, "loss": 1.2303, "step": 10818 }, { "epoch": 3.2223980342150824, "grad_norm": 0.24852849543094635, "learning_rate": 1.5840332829662255e-05, "loss": 1.2275, "step": 10819 }, { "epoch": 3.2226958804147507, "grad_norm": 0.2526701092720032, "learning_rate": 1.583954983531679e-05, "loss": 1.2381, "step": 10820 }, { "epoch": 3.2229937266144195, "grad_norm": 0.25460922718048096, "learning_rate": 1.5838766786640842e-05, "loss": 1.2287, "step": 10821 }, { "epoch": 3.2232915728140883, "grad_norm": 0.25999483466148376, "learning_rate": 1.5837983683641688e-05, "loss": 1.223, "step": 10822 }, { "epoch": 3.2235894190137566, "grad_norm": 0.23224976658821106, "learning_rate": 1.583720052632662e-05, "loss": 1.2314, "step": 10823 }, { "epoch": 3.2238872652134254, "grad_norm": 0.270042359828949, "learning_rate": 1.583641731470292e-05, "loss": 1.2612, "step": 10824 }, { "epoch": 3.224185111413094, "grad_norm": 0.23013587296009064, "learning_rate": 1.583563404877788e-05, "loss": 1.2248, "step": 10825 }, { "epoch": 3.2244829576127625, "grad_norm": 0.26332661509513855, "learning_rate": 1.5834850728558787e-05, "loss": 1.2432, "step": 10826 }, { "epoch": 3.2247808038124313, "grad_norm": 0.28763964772224426, "learning_rate": 1.5834067354052926e-05, "loss": 1.243, "step": 10827 }, { "epoch": 3.2250786500121, "grad_norm": 0.23542463779449463, "learning_rate": 1.5833283925267587e-05, "loss": 1.2295, "step": 10828 }, { "epoch": 3.225376496211769, "grad_norm": 0.3327104151248932, "learning_rate": 1.583250044221006e-05, "loss": 1.2452, "step": 10829 }, { "epoch": 3.225674342411437, "grad_norm": 0.3312518000602722, "learning_rate": 1.5831716904887625e-05, "loss": 1.2219, "step": 10830 }, { "epoch": 3.225972188611106, "grad_norm": 0.24583038687705994, "learning_rate": 1.5830933313307585e-05, "loss": 1.2252, "step": 10831 }, { "epoch": 3.2262700348107747, "grad_norm": 0.37174612283706665, "learning_rate": 1.583014966747723e-05, "loss": 1.2595, "step": 10832 }, { "epoch": 3.2265678810104435, "grad_norm": 0.2938191592693329, "learning_rate": 1.582936596740384e-05, "loss": 1.2381, "step": 10833 }, { "epoch": 3.226865727210112, "grad_norm": 0.257517546415329, "learning_rate": 1.5828582213094713e-05, "loss": 1.2322, "step": 10834 }, { "epoch": 3.2271635734097806, "grad_norm": 0.25996196269989014, "learning_rate": 1.582779840455714e-05, "loss": 1.2467, "step": 10835 }, { "epoch": 3.2274614196094493, "grad_norm": 0.23953726887702942, "learning_rate": 1.5827014541798415e-05, "loss": 1.2305, "step": 10836 }, { "epoch": 3.2277592658091177, "grad_norm": 0.23734961450099945, "learning_rate": 1.582623062482583e-05, "loss": 1.2248, "step": 10837 }, { "epoch": 3.2280571120087864, "grad_norm": 0.26539120078086853, "learning_rate": 1.582544665364668e-05, "loss": 1.2496, "step": 10838 }, { "epoch": 3.228354958208455, "grad_norm": 0.2751491367816925, "learning_rate": 1.5824662628268258e-05, "loss": 1.247, "step": 10839 }, { "epoch": 3.2286528044081235, "grad_norm": 0.24858997762203217, "learning_rate": 1.5823878548697856e-05, "loss": 1.2419, "step": 10840 }, { "epoch": 3.2289506506077923, "grad_norm": 0.28213387727737427, "learning_rate": 1.5823094414942774e-05, "loss": 1.2231, "step": 10841 }, { "epoch": 3.229248496807461, "grad_norm": 0.23853297531604767, "learning_rate": 1.58223102270103e-05, "loss": 1.2393, "step": 10842 }, { "epoch": 3.22954634300713, "grad_norm": 0.25614315271377563, "learning_rate": 1.582152598490774e-05, "loss": 1.2393, "step": 10843 }, { "epoch": 3.229844189206798, "grad_norm": 0.2433617264032364, "learning_rate": 1.582074168864238e-05, "loss": 1.2434, "step": 10844 }, { "epoch": 3.230142035406467, "grad_norm": 0.2417619228363037, "learning_rate": 1.581995733822152e-05, "loss": 1.2409, "step": 10845 }, { "epoch": 3.2304398816061357, "grad_norm": 0.2536410689353943, "learning_rate": 1.5819172933652464e-05, "loss": 1.2197, "step": 10846 }, { "epoch": 3.2307377278058045, "grad_norm": 0.23557448387145996, "learning_rate": 1.58183884749425e-05, "loss": 1.2417, "step": 10847 }, { "epoch": 3.231035574005473, "grad_norm": 0.2389982044696808, "learning_rate": 1.5817603962098938e-05, "loss": 1.2517, "step": 10848 }, { "epoch": 3.2313334202051416, "grad_norm": 0.26373356580734253, "learning_rate": 1.5816819395129072e-05, "loss": 1.2488, "step": 10849 }, { "epoch": 3.2316312664048104, "grad_norm": 0.2483980804681778, "learning_rate": 1.5816034774040193e-05, "loss": 1.2431, "step": 10850 }, { "epoch": 3.2319291126044787, "grad_norm": 0.237098827958107, "learning_rate": 1.5815250098839615e-05, "loss": 1.2162, "step": 10851 }, { "epoch": 3.2322269588041475, "grad_norm": 0.2552669048309326, "learning_rate": 1.581446536953463e-05, "loss": 1.2449, "step": 10852 }, { "epoch": 3.2325248050038162, "grad_norm": 0.24945805966854095, "learning_rate": 1.581368058613254e-05, "loss": 1.2326, "step": 10853 }, { "epoch": 3.232822651203485, "grad_norm": 0.2533094882965088, "learning_rate": 1.581289574864065e-05, "loss": 1.2341, "step": 10854 }, { "epoch": 3.2331204974031533, "grad_norm": 0.2475655972957611, "learning_rate": 1.5812110857066257e-05, "loss": 1.241, "step": 10855 }, { "epoch": 3.233418343602822, "grad_norm": 0.25987479090690613, "learning_rate": 1.5811325911416668e-05, "loss": 1.2436, "step": 10856 }, { "epoch": 3.233716189802491, "grad_norm": 0.3800262212753296, "learning_rate": 1.5810540911699183e-05, "loss": 1.2387, "step": 10857 }, { "epoch": 3.234014036002159, "grad_norm": 0.34478265047073364, "learning_rate": 1.5809755857921104e-05, "loss": 1.245, "step": 10858 }, { "epoch": 3.234311882201828, "grad_norm": 0.2621350884437561, "learning_rate": 1.5808970750089744e-05, "loss": 1.2449, "step": 10859 }, { "epoch": 3.2346097284014967, "grad_norm": 0.7660012245178223, "learning_rate": 1.5808185588212396e-05, "loss": 1.2498, "step": 10860 }, { "epoch": 3.2349075746011655, "grad_norm": 0.26017045974731445, "learning_rate": 1.580740037229637e-05, "loss": 1.2325, "step": 10861 }, { "epoch": 3.235205420800834, "grad_norm": 0.2521427869796753, "learning_rate": 1.5806615102348976e-05, "loss": 1.2429, "step": 10862 }, { "epoch": 3.2355032670005026, "grad_norm": 0.2565779983997345, "learning_rate": 1.5805829778377518e-05, "loss": 1.2556, "step": 10863 }, { "epoch": 3.2358011132001714, "grad_norm": 0.23941807448863983, "learning_rate": 1.5805044400389295e-05, "loss": 1.2361, "step": 10864 }, { "epoch": 3.2360989593998397, "grad_norm": 0.24612638354301453, "learning_rate": 1.5804258968391622e-05, "loss": 1.2244, "step": 10865 }, { "epoch": 3.2363968055995085, "grad_norm": 0.24015018343925476, "learning_rate": 1.5803473482391804e-05, "loss": 1.2153, "step": 10866 }, { "epoch": 3.2366946517991773, "grad_norm": 0.2549809515476227, "learning_rate": 1.580268794239715e-05, "loss": 1.2317, "step": 10867 }, { "epoch": 3.236992497998846, "grad_norm": 0.24323932826519012, "learning_rate": 1.5801902348414966e-05, "loss": 1.2456, "step": 10868 }, { "epoch": 3.2372903441985144, "grad_norm": 0.23269499838352203, "learning_rate": 1.5801116700452565e-05, "loss": 1.2372, "step": 10869 }, { "epoch": 3.237588190398183, "grad_norm": 0.24246612191200256, "learning_rate": 1.5800330998517252e-05, "loss": 1.2365, "step": 10870 }, { "epoch": 3.237886036597852, "grad_norm": 0.24057693779468536, "learning_rate": 1.579954524261634e-05, "loss": 1.234, "step": 10871 }, { "epoch": 3.2381838827975202, "grad_norm": 0.23094598948955536, "learning_rate": 1.579875943275714e-05, "loss": 1.2459, "step": 10872 }, { "epoch": 3.238481728997189, "grad_norm": 0.23706576228141785, "learning_rate": 1.5797973568946965e-05, "loss": 1.2446, "step": 10873 }, { "epoch": 3.2387795751968578, "grad_norm": 0.23647253215312958, "learning_rate": 1.579718765119312e-05, "loss": 1.2426, "step": 10874 }, { "epoch": 3.2390774213965265, "grad_norm": 0.23313447833061218, "learning_rate": 1.579640167950292e-05, "loss": 1.2452, "step": 10875 }, { "epoch": 3.239375267596195, "grad_norm": 0.23108598589897156, "learning_rate": 1.579561565388368e-05, "loss": 1.2196, "step": 10876 }, { "epoch": 3.2396731137958636, "grad_norm": 0.23014722764492035, "learning_rate": 1.5794829574342717e-05, "loss": 1.2129, "step": 10877 }, { "epoch": 3.2399709599955324, "grad_norm": 0.23379579186439514, "learning_rate": 1.5794043440887333e-05, "loss": 1.2343, "step": 10878 }, { "epoch": 3.2402688061952007, "grad_norm": 0.24715496599674225, "learning_rate": 1.5793257253524848e-05, "loss": 1.2487, "step": 10879 }, { "epoch": 3.2405666523948695, "grad_norm": 0.24083739519119263, "learning_rate": 1.579247101226258e-05, "loss": 1.2565, "step": 10880 }, { "epoch": 3.2408644985945383, "grad_norm": 0.23865579068660736, "learning_rate": 1.579168471710784e-05, "loss": 1.2424, "step": 10881 }, { "epoch": 3.241162344794207, "grad_norm": 0.24451987445354462, "learning_rate": 1.5790898368067945e-05, "loss": 1.2374, "step": 10882 }, { "epoch": 3.2414601909938754, "grad_norm": 0.24056999385356903, "learning_rate": 1.5790111965150208e-05, "loss": 1.2217, "step": 10883 }, { "epoch": 3.241758037193544, "grad_norm": 0.2411208599805832, "learning_rate": 1.578932550836195e-05, "loss": 1.2213, "step": 10884 }, { "epoch": 3.242055883393213, "grad_norm": 0.24081043899059296, "learning_rate": 1.5788538997710487e-05, "loss": 1.2354, "step": 10885 }, { "epoch": 3.2423537295928817, "grad_norm": 0.238377645611763, "learning_rate": 1.5787752433203136e-05, "loss": 1.2285, "step": 10886 }, { "epoch": 3.24265157579255, "grad_norm": 0.23264189064502716, "learning_rate": 1.5786965814847214e-05, "loss": 1.2424, "step": 10887 }, { "epoch": 3.242949421992219, "grad_norm": 0.23121654987335205, "learning_rate": 1.578617914265004e-05, "loss": 1.2161, "step": 10888 }, { "epoch": 3.2432472681918876, "grad_norm": 0.2341800332069397, "learning_rate": 1.5785392416618933e-05, "loss": 1.2286, "step": 10889 }, { "epoch": 3.243545114391556, "grad_norm": 0.2328624576330185, "learning_rate": 1.5784605636761218e-05, "loss": 1.2207, "step": 10890 }, { "epoch": 3.2438429605912247, "grad_norm": 0.24926938116550446, "learning_rate": 1.5783818803084208e-05, "loss": 1.2162, "step": 10891 }, { "epoch": 3.2441408067908934, "grad_norm": 0.23768393695354462, "learning_rate": 1.5783031915595222e-05, "loss": 1.2464, "step": 10892 }, { "epoch": 3.2444386529905618, "grad_norm": 0.2344312220811844, "learning_rate": 1.578224497430159e-05, "loss": 1.2441, "step": 10893 }, { "epoch": 3.2447364991902305, "grad_norm": 0.2264043241739273, "learning_rate": 1.578145797921063e-05, "loss": 1.2413, "step": 10894 }, { "epoch": 3.2450343453898993, "grad_norm": 0.23375320434570312, "learning_rate": 1.5780670930329656e-05, "loss": 1.2417, "step": 10895 }, { "epoch": 3.245332191589568, "grad_norm": 0.23985905945301056, "learning_rate": 1.5779883827666004e-05, "loss": 1.24, "step": 10896 }, { "epoch": 3.2456300377892364, "grad_norm": 0.23048816621303558, "learning_rate": 1.577909667122699e-05, "loss": 1.23, "step": 10897 }, { "epoch": 3.245927883988905, "grad_norm": 0.23868009448051453, "learning_rate": 1.5778309461019937e-05, "loss": 1.2265, "step": 10898 }, { "epoch": 3.246225730188574, "grad_norm": 0.2428135722875595, "learning_rate": 1.5777522197052172e-05, "loss": 1.2438, "step": 10899 }, { "epoch": 3.2465235763882427, "grad_norm": 0.24055194854736328, "learning_rate": 1.5776734879331015e-05, "loss": 1.2302, "step": 10900 }, { "epoch": 3.246821422587911, "grad_norm": 0.2537178099155426, "learning_rate": 1.5775947507863795e-05, "loss": 1.2404, "step": 10901 }, { "epoch": 3.24711926878758, "grad_norm": 0.2449008971452713, "learning_rate": 1.577516008265784e-05, "loss": 1.235, "step": 10902 }, { "epoch": 3.2474171149872486, "grad_norm": 0.23761901259422302, "learning_rate": 1.577437260372047e-05, "loss": 1.2363, "step": 10903 }, { "epoch": 3.247714961186917, "grad_norm": 0.24395373463630676, "learning_rate": 1.5773585071059013e-05, "loss": 1.239, "step": 10904 }, { "epoch": 3.2480128073865857, "grad_norm": 0.24250420928001404, "learning_rate": 1.5772797484680798e-05, "loss": 1.2315, "step": 10905 }, { "epoch": 3.2483106535862545, "grad_norm": 0.23782804608345032, "learning_rate": 1.5772009844593156e-05, "loss": 1.2471, "step": 10906 }, { "epoch": 3.248608499785923, "grad_norm": 0.2457427680492401, "learning_rate": 1.5771222150803407e-05, "loss": 1.2249, "step": 10907 }, { "epoch": 3.2489063459855916, "grad_norm": 0.23378737270832062, "learning_rate": 1.5770434403318885e-05, "loss": 1.2415, "step": 10908 }, { "epoch": 3.2492041921852604, "grad_norm": 0.23457074165344238, "learning_rate": 1.5769646602146918e-05, "loss": 1.2458, "step": 10909 }, { "epoch": 3.249502038384929, "grad_norm": 0.24822695553302765, "learning_rate": 1.5768858747294837e-05, "loss": 1.2439, "step": 10910 }, { "epoch": 3.2497998845845975, "grad_norm": 0.23484498262405396, "learning_rate": 1.5768070838769972e-05, "loss": 1.2398, "step": 10911 }, { "epoch": 3.2500977307842662, "grad_norm": 0.24210688471794128, "learning_rate": 1.5767282876579647e-05, "loss": 1.2429, "step": 10912 }, { "epoch": 3.250395576983935, "grad_norm": 0.22929832339286804, "learning_rate": 1.57664948607312e-05, "loss": 1.2198, "step": 10913 }, { "epoch": 3.2506934231836038, "grad_norm": 0.24054791033267975, "learning_rate": 1.5765706791231965e-05, "loss": 1.2377, "step": 10914 }, { "epoch": 3.250991269383272, "grad_norm": 0.23274177312850952, "learning_rate": 1.5764918668089266e-05, "loss": 1.2368, "step": 10915 }, { "epoch": 3.251289115582941, "grad_norm": 0.24070748686790466, "learning_rate": 1.5764130491310442e-05, "loss": 1.2292, "step": 10916 }, { "epoch": 3.2515869617826096, "grad_norm": 0.23932212591171265, "learning_rate": 1.5763342260902824e-05, "loss": 1.2513, "step": 10917 }, { "epoch": 3.251884807982278, "grad_norm": 0.24381308257579803, "learning_rate": 1.5762553976873745e-05, "loss": 1.2236, "step": 10918 }, { "epoch": 3.2521826541819467, "grad_norm": 0.23351305723190308, "learning_rate": 1.5761765639230537e-05, "loss": 1.2491, "step": 10919 }, { "epoch": 3.2524805003816155, "grad_norm": 0.25579774379730225, "learning_rate": 1.576097724798054e-05, "loss": 1.2563, "step": 10920 }, { "epoch": 3.252778346581284, "grad_norm": 0.23283495008945465, "learning_rate": 1.5760188803131086e-05, "loss": 1.226, "step": 10921 }, { "epoch": 3.2530761927809526, "grad_norm": 0.23861253261566162, "learning_rate": 1.575940030468951e-05, "loss": 1.2386, "step": 10922 }, { "epoch": 3.2533740389806214, "grad_norm": 0.25366294384002686, "learning_rate": 1.575861175266315e-05, "loss": 1.244, "step": 10923 }, { "epoch": 3.25367188518029, "grad_norm": 0.2527261972427368, "learning_rate": 1.5757823147059343e-05, "loss": 1.2304, "step": 10924 }, { "epoch": 3.2539697313799585, "grad_norm": 0.2507016360759735, "learning_rate": 1.575703448788542e-05, "loss": 1.2283, "step": 10925 }, { "epoch": 3.2542675775796273, "grad_norm": 0.2321532815694809, "learning_rate": 1.5756245775148723e-05, "loss": 1.2341, "step": 10926 }, { "epoch": 3.254565423779296, "grad_norm": 0.24228094518184662, "learning_rate": 1.5755457008856598e-05, "loss": 1.2465, "step": 10927 }, { "epoch": 3.254863269978965, "grad_norm": 0.24019403755664825, "learning_rate": 1.575466818901637e-05, "loss": 1.248, "step": 10928 }, { "epoch": 3.255161116178633, "grad_norm": 0.2458336353302002, "learning_rate": 1.5753879315635384e-05, "loss": 1.254, "step": 10929 }, { "epoch": 3.255458962378302, "grad_norm": 0.2647082507610321, "learning_rate": 1.575309038872098e-05, "loss": 1.2229, "step": 10930 }, { "epoch": 3.2557568085779707, "grad_norm": 0.24335896968841553, "learning_rate": 1.57523014082805e-05, "loss": 1.2437, "step": 10931 }, { "epoch": 3.256054654777639, "grad_norm": 0.24166379868984222, "learning_rate": 1.5751512374321277e-05, "loss": 1.2387, "step": 10932 }, { "epoch": 3.2563525009773078, "grad_norm": 0.27155637741088867, "learning_rate": 1.5750723286850663e-05, "loss": 1.2316, "step": 10933 }, { "epoch": 3.2566503471769765, "grad_norm": 0.24128098785877228, "learning_rate": 1.574993414587599e-05, "loss": 1.2489, "step": 10934 }, { "epoch": 3.2569481933766453, "grad_norm": 0.31059321761131287, "learning_rate": 1.5749144951404606e-05, "loss": 1.2562, "step": 10935 }, { "epoch": 3.2572460395763136, "grad_norm": 0.24159196019172668, "learning_rate": 1.574835570344385e-05, "loss": 1.2447, "step": 10936 }, { "epoch": 3.2575438857759824, "grad_norm": 0.25864771008491516, "learning_rate": 1.574756640200107e-05, "loss": 1.2353, "step": 10937 }, { "epoch": 3.257841731975651, "grad_norm": 0.2590962052345276, "learning_rate": 1.5746777047083607e-05, "loss": 1.229, "step": 10938 }, { "epoch": 3.25813957817532, "grad_norm": 0.23139087855815887, "learning_rate": 1.57459876386988e-05, "loss": 1.234, "step": 10939 }, { "epoch": 3.2584374243749883, "grad_norm": 0.23297728598117828, "learning_rate": 1.5745198176854e-05, "loss": 1.2444, "step": 10940 }, { "epoch": 3.258735270574657, "grad_norm": 0.2693626880645752, "learning_rate": 1.5744408661556547e-05, "loss": 1.2328, "step": 10941 }, { "epoch": 3.259033116774326, "grad_norm": 0.25418147444725037, "learning_rate": 1.5743619092813793e-05, "loss": 1.2418, "step": 10942 }, { "epoch": 3.259330962973994, "grad_norm": 0.2374105155467987, "learning_rate": 1.5742829470633075e-05, "loss": 1.2263, "step": 10943 }, { "epoch": 3.259628809173663, "grad_norm": 0.24964316189289093, "learning_rate": 1.5742039795021752e-05, "loss": 1.2382, "step": 10944 }, { "epoch": 3.2599266553733317, "grad_norm": 0.24822159111499786, "learning_rate": 1.5741250065987158e-05, "loss": 1.2376, "step": 10945 }, { "epoch": 3.260224501573, "grad_norm": 0.2457820326089859, "learning_rate": 1.5740460283536652e-05, "loss": 1.2223, "step": 10946 }, { "epoch": 3.260522347772669, "grad_norm": 0.24911081790924072, "learning_rate": 1.5739670447677572e-05, "loss": 1.245, "step": 10947 }, { "epoch": 3.2608201939723376, "grad_norm": 0.23777316510677338, "learning_rate": 1.5738880558417277e-05, "loss": 1.2512, "step": 10948 }, { "epoch": 3.2611180401720063, "grad_norm": 0.24039356410503387, "learning_rate": 1.5738090615763107e-05, "loss": 1.2284, "step": 10949 }, { "epoch": 3.2614158863716747, "grad_norm": 0.2394830584526062, "learning_rate": 1.5737300619722412e-05, "loss": 1.2471, "step": 10950 }, { "epoch": 3.2617137325713434, "grad_norm": 0.2711692452430725, "learning_rate": 1.573651057030255e-05, "loss": 1.234, "step": 10951 }, { "epoch": 3.262011578771012, "grad_norm": 0.23467116057872772, "learning_rate": 1.573572046751086e-05, "loss": 1.2339, "step": 10952 }, { "epoch": 3.262309424970681, "grad_norm": 0.2356289029121399, "learning_rate": 1.5734930311354705e-05, "loss": 1.2448, "step": 10953 }, { "epoch": 3.2626072711703493, "grad_norm": 0.2496243566274643, "learning_rate": 1.573414010184143e-05, "loss": 1.2604, "step": 10954 }, { "epoch": 3.262905117370018, "grad_norm": 0.23672722280025482, "learning_rate": 1.573334983897839e-05, "loss": 1.2503, "step": 10955 }, { "epoch": 3.263202963569687, "grad_norm": 0.2826528251171112, "learning_rate": 1.5732559522772926e-05, "loss": 1.2452, "step": 10956 }, { "epoch": 3.263500809769355, "grad_norm": 0.23860949277877808, "learning_rate": 1.573176915323241e-05, "loss": 1.2421, "step": 10957 }, { "epoch": 3.263798655969024, "grad_norm": 0.2497885525226593, "learning_rate": 1.5730978730364183e-05, "loss": 1.2378, "step": 10958 }, { "epoch": 3.2640965021686927, "grad_norm": 0.24569043517112732, "learning_rate": 1.57301882541756e-05, "loss": 1.2436, "step": 10959 }, { "epoch": 3.264394348368361, "grad_norm": 0.23629222810268402, "learning_rate": 1.572939772467402e-05, "loss": 1.2269, "step": 10960 }, { "epoch": 3.26469219456803, "grad_norm": 0.31353241205215454, "learning_rate": 1.5728607141866797e-05, "loss": 1.238, "step": 10961 }, { "epoch": 3.2649900407676986, "grad_norm": 0.2657424211502075, "learning_rate": 1.572781650576128e-05, "loss": 1.2364, "step": 10962 }, { "epoch": 3.2652878869673674, "grad_norm": 0.2878069281578064, "learning_rate": 1.5727025816364834e-05, "loss": 1.2491, "step": 10963 }, { "epoch": 3.2655857331670357, "grad_norm": 0.24171844124794006, "learning_rate": 1.5726235073684807e-05, "loss": 1.2184, "step": 10964 }, { "epoch": 3.2658835793667045, "grad_norm": 0.28105929493904114, "learning_rate": 1.5725444277728565e-05, "loss": 1.2454, "step": 10965 }, { "epoch": 3.2661814255663733, "grad_norm": 0.2551528215408325, "learning_rate": 1.572465342850346e-05, "loss": 1.2454, "step": 10966 }, { "epoch": 3.266479271766042, "grad_norm": 0.24841690063476562, "learning_rate": 1.572386252601685e-05, "loss": 1.2285, "step": 10967 }, { "epoch": 3.2667771179657104, "grad_norm": 0.269426554441452, "learning_rate": 1.5723071570276095e-05, "loss": 1.2326, "step": 10968 }, { "epoch": 3.267074964165379, "grad_norm": 0.23246163129806519, "learning_rate": 1.572228056128855e-05, "loss": 1.2284, "step": 10969 }, { "epoch": 3.267372810365048, "grad_norm": 0.32379814982414246, "learning_rate": 1.5721489499061582e-05, "loss": 1.2327, "step": 10970 }, { "epoch": 3.2676706565647162, "grad_norm": 0.25725942850112915, "learning_rate": 1.572069838360254e-05, "loss": 1.2391, "step": 10971 }, { "epoch": 3.267968502764385, "grad_norm": 0.25456273555755615, "learning_rate": 1.5719907214918802e-05, "loss": 1.2305, "step": 10972 }, { "epoch": 3.2682663489640538, "grad_norm": 0.24083547294139862, "learning_rate": 1.571911599301771e-05, "loss": 1.2487, "step": 10973 }, { "epoch": 3.268564195163722, "grad_norm": 0.244639053940773, "learning_rate": 1.571832471790663e-05, "loss": 1.2312, "step": 10974 }, { "epoch": 3.268862041363391, "grad_norm": 0.23552767932415009, "learning_rate": 1.571753338959294e-05, "loss": 1.2385, "step": 10975 }, { "epoch": 3.2691598875630596, "grad_norm": 0.23945331573486328, "learning_rate": 1.571674200808398e-05, "loss": 1.237, "step": 10976 }, { "epoch": 3.2694577337627284, "grad_norm": 0.23800155520439148, "learning_rate": 1.5715950573387126e-05, "loss": 1.2353, "step": 10977 }, { "epoch": 3.2697555799623967, "grad_norm": 0.2597010135650635, "learning_rate": 1.5715159085509734e-05, "loss": 1.2584, "step": 10978 }, { "epoch": 3.2700534261620655, "grad_norm": 0.24727323651313782, "learning_rate": 1.5714367544459178e-05, "loss": 1.2429, "step": 10979 }, { "epoch": 3.2703512723617343, "grad_norm": 0.2544100880622864, "learning_rate": 1.5713575950242814e-05, "loss": 1.2546, "step": 10980 }, { "epoch": 3.270649118561403, "grad_norm": 0.2520626485347748, "learning_rate": 1.571278430286801e-05, "loss": 1.2395, "step": 10981 }, { "epoch": 3.2709469647610714, "grad_norm": 0.3480736017227173, "learning_rate": 1.571199260234213e-05, "loss": 1.2472, "step": 10982 }, { "epoch": 3.27124481096074, "grad_norm": 0.266797810792923, "learning_rate": 1.571120084867254e-05, "loss": 1.2414, "step": 10983 }, { "epoch": 3.271542657160409, "grad_norm": 0.26604586839675903, "learning_rate": 1.571040904186661e-05, "loss": 1.2224, "step": 10984 }, { "epoch": 3.2718405033600773, "grad_norm": 0.26607516407966614, "learning_rate": 1.57096171819317e-05, "loss": 1.238, "step": 10985 }, { "epoch": 3.272138349559746, "grad_norm": 0.236188605427742, "learning_rate": 1.5708825268875182e-05, "loss": 1.2227, "step": 10986 }, { "epoch": 3.272436195759415, "grad_norm": 0.2337832897901535, "learning_rate": 1.5708033302704425e-05, "loss": 1.2269, "step": 10987 }, { "epoch": 3.272734041959083, "grad_norm": 0.23533278703689575, "learning_rate": 1.5707241283426792e-05, "loss": 1.2442, "step": 10988 }, { "epoch": 3.273031888158752, "grad_norm": 0.23811085522174835, "learning_rate": 1.570644921104966e-05, "loss": 1.2373, "step": 10989 }, { "epoch": 3.2733297343584207, "grad_norm": 0.2450307458639145, "learning_rate": 1.570565708558039e-05, "loss": 1.2455, "step": 10990 }, { "epoch": 3.2736275805580894, "grad_norm": 0.26522406935691833, "learning_rate": 1.5704864907026357e-05, "loss": 1.2318, "step": 10991 }, { "epoch": 3.2739254267577578, "grad_norm": 0.25072145462036133, "learning_rate": 1.5704072675394932e-05, "loss": 1.2399, "step": 10992 }, { "epoch": 3.2742232729574265, "grad_norm": 0.25735607743263245, "learning_rate": 1.570328039069348e-05, "loss": 1.2441, "step": 10993 }, { "epoch": 3.2745211191570953, "grad_norm": 0.30827564001083374, "learning_rate": 1.5702488052929376e-05, "loss": 1.2461, "step": 10994 }, { "epoch": 3.274818965356764, "grad_norm": 0.2914036214351654, "learning_rate": 1.5701695662109994e-05, "loss": 1.2289, "step": 10995 }, { "epoch": 3.2751168115564324, "grad_norm": 0.2513253092765808, "learning_rate": 1.5700903218242703e-05, "loss": 1.2206, "step": 10996 }, { "epoch": 3.275414657756101, "grad_norm": 0.42634081840515137, "learning_rate": 1.5700110721334877e-05, "loss": 1.2496, "step": 10997 }, { "epoch": 3.27571250395577, "grad_norm": 0.31619492173194885, "learning_rate": 1.569931817139389e-05, "loss": 1.2364, "step": 10998 }, { "epoch": 3.2760103501554383, "grad_norm": 0.2968463897705078, "learning_rate": 1.5698525568427118e-05, "loss": 1.233, "step": 10999 }, { "epoch": 3.276308196355107, "grad_norm": 0.24828587472438812, "learning_rate": 1.569773291244193e-05, "loss": 1.2265, "step": 11000 }, { "epoch": 3.276308196355107, "eval_loss": 1.3360927104949951, "eval_runtime": 20.0927, "eval_samples_per_second": 86.3, "eval_steps_per_second": 5.425, "step": 11000 }, { "epoch": 3.276606042554776, "grad_norm": 0.3992263674736023, "learning_rate": 1.5696940203445704e-05, "loss": 1.2192, "step": 11001 }, { "epoch": 3.2769038887544446, "grad_norm": 0.2677016258239746, "learning_rate": 1.5696147441445812e-05, "loss": 1.2429, "step": 11002 }, { "epoch": 3.277201734954113, "grad_norm": 0.28629764914512634, "learning_rate": 1.5695354626449633e-05, "loss": 1.2413, "step": 11003 }, { "epoch": 3.2774995811537817, "grad_norm": 0.25016868114471436, "learning_rate": 1.569456175846454e-05, "loss": 1.2173, "step": 11004 }, { "epoch": 3.2777974273534505, "grad_norm": 0.3337778151035309, "learning_rate": 1.569376883749792e-05, "loss": 1.2294, "step": 11005 }, { "epoch": 3.2780952735531192, "grad_norm": 0.24379444122314453, "learning_rate": 1.5692975863557136e-05, "loss": 1.2319, "step": 11006 }, { "epoch": 3.2783931197527876, "grad_norm": 0.24937567114830017, "learning_rate": 1.5692182836649573e-05, "loss": 1.2347, "step": 11007 }, { "epoch": 3.2786909659524563, "grad_norm": 0.2599264979362488, "learning_rate": 1.5691389756782607e-05, "loss": 1.2317, "step": 11008 }, { "epoch": 3.278988812152125, "grad_norm": 0.24208100140094757, "learning_rate": 1.569059662396362e-05, "loss": 1.2197, "step": 11009 }, { "epoch": 3.2792866583517934, "grad_norm": 0.3027574419975281, "learning_rate": 1.568980343819999e-05, "loss": 1.245, "step": 11010 }, { "epoch": 3.279584504551462, "grad_norm": 0.24216900765895844, "learning_rate": 1.5689010199499094e-05, "loss": 1.2268, "step": 11011 }, { "epoch": 3.279882350751131, "grad_norm": 0.27850988507270813, "learning_rate": 1.5688216907868318e-05, "loss": 1.2227, "step": 11012 }, { "epoch": 3.2801801969507993, "grad_norm": 0.24350112676620483, "learning_rate": 1.5687423563315034e-05, "loss": 1.233, "step": 11013 }, { "epoch": 3.280478043150468, "grad_norm": 0.2829402983188629, "learning_rate": 1.568663016584663e-05, "loss": 1.2286, "step": 11014 }, { "epoch": 3.280775889350137, "grad_norm": 0.2563793957233429, "learning_rate": 1.5685836715470485e-05, "loss": 1.2421, "step": 11015 }, { "epoch": 3.2810737355498056, "grad_norm": 0.24953509867191315, "learning_rate": 1.5685043212193985e-05, "loss": 1.2567, "step": 11016 }, { "epoch": 3.281371581749474, "grad_norm": 0.2540299892425537, "learning_rate": 1.568424965602451e-05, "loss": 1.2401, "step": 11017 }, { "epoch": 3.2816694279491427, "grad_norm": 0.24344053864479065, "learning_rate": 1.568345604696944e-05, "loss": 1.2303, "step": 11018 }, { "epoch": 3.2819672741488115, "grad_norm": 0.35500892996788025, "learning_rate": 1.568266238503616e-05, "loss": 1.2503, "step": 11019 }, { "epoch": 3.2822651203484803, "grad_norm": 0.23837926983833313, "learning_rate": 1.568186867023206e-05, "loss": 1.2318, "step": 11020 }, { "epoch": 3.2825629665481486, "grad_norm": 0.25181010365486145, "learning_rate": 1.5681074902564516e-05, "loss": 1.2267, "step": 11021 }, { "epoch": 3.2828608127478174, "grad_norm": 0.25234049558639526, "learning_rate": 1.568028108204092e-05, "loss": 1.2371, "step": 11022 }, { "epoch": 3.283158658947486, "grad_norm": 0.26462599635124207, "learning_rate": 1.5679487208668653e-05, "loss": 1.2413, "step": 11023 }, { "epoch": 3.2834565051471545, "grad_norm": 0.286300390958786, "learning_rate": 1.5678693282455103e-05, "loss": 1.2372, "step": 11024 }, { "epoch": 3.2837543513468233, "grad_norm": 0.282059907913208, "learning_rate": 1.5677899303407657e-05, "loss": 1.241, "step": 11025 }, { "epoch": 3.284052197546492, "grad_norm": 0.274752140045166, "learning_rate": 1.5677105271533704e-05, "loss": 1.2253, "step": 11026 }, { "epoch": 3.2843500437461604, "grad_norm": 0.27365416288375854, "learning_rate": 1.5676311186840626e-05, "loss": 1.2475, "step": 11027 }, { "epoch": 3.284647889945829, "grad_norm": 0.2936675548553467, "learning_rate": 1.567551704933582e-05, "loss": 1.2123, "step": 11028 }, { "epoch": 3.284945736145498, "grad_norm": 0.28637778759002686, "learning_rate": 1.5674722859026664e-05, "loss": 1.2391, "step": 11029 }, { "epoch": 3.2852435823451667, "grad_norm": 0.2901567816734314, "learning_rate": 1.5673928615920552e-05, "loss": 1.2343, "step": 11030 }, { "epoch": 3.285541428544835, "grad_norm": 0.26433366537094116, "learning_rate": 1.5673134320024874e-05, "loss": 1.2494, "step": 11031 }, { "epoch": 3.2858392747445038, "grad_norm": 0.2561134099960327, "learning_rate": 1.567233997134702e-05, "loss": 1.2368, "step": 11032 }, { "epoch": 3.2861371209441725, "grad_norm": 0.25221318006515503, "learning_rate": 1.5671545569894382e-05, "loss": 1.2269, "step": 11033 }, { "epoch": 3.2864349671438413, "grad_norm": 0.2477511763572693, "learning_rate": 1.5670751115674345e-05, "loss": 1.2328, "step": 11034 }, { "epoch": 3.2867328133435096, "grad_norm": 0.2927154004573822, "learning_rate": 1.566995660869431e-05, "loss": 1.2292, "step": 11035 }, { "epoch": 3.2870306595431784, "grad_norm": 0.24017727375030518, "learning_rate": 1.5669162048961654e-05, "loss": 1.2336, "step": 11036 }, { "epoch": 3.287328505742847, "grad_norm": 0.24952299892902374, "learning_rate": 1.566836743648379e-05, "loss": 1.2286, "step": 11037 }, { "epoch": 3.2876263519425155, "grad_norm": 0.24183036386966705, "learning_rate": 1.5667572771268092e-05, "loss": 1.2458, "step": 11038 }, { "epoch": 3.2879241981421843, "grad_norm": 0.25494590401649475, "learning_rate": 1.5666778053321964e-05, "loss": 1.2482, "step": 11039 }, { "epoch": 3.288222044341853, "grad_norm": 0.2713030278682709, "learning_rate": 1.5665983282652803e-05, "loss": 1.2168, "step": 11040 }, { "epoch": 3.2885198905415214, "grad_norm": 0.24221836030483246, "learning_rate": 1.5665188459267994e-05, "loss": 1.2398, "step": 11041 }, { "epoch": 3.28881773674119, "grad_norm": 0.27836811542510986, "learning_rate": 1.5664393583174933e-05, "loss": 1.2482, "step": 11042 }, { "epoch": 3.289115582940859, "grad_norm": 0.2528473436832428, "learning_rate": 1.5663598654381024e-05, "loss": 1.2275, "step": 11043 }, { "epoch": 3.2894134291405277, "grad_norm": 0.2529313564300537, "learning_rate": 1.5662803672893653e-05, "loss": 1.2234, "step": 11044 }, { "epoch": 3.289711275340196, "grad_norm": 0.23660457134246826, "learning_rate": 1.566200863872022e-05, "loss": 1.2442, "step": 11045 }, { "epoch": 3.290009121539865, "grad_norm": 0.3444722890853882, "learning_rate": 1.5661213551868126e-05, "loss": 1.2257, "step": 11046 }, { "epoch": 3.2903069677395336, "grad_norm": 0.2877964675426483, "learning_rate": 1.5660418412344762e-05, "loss": 1.2298, "step": 11047 }, { "epoch": 3.2906048139392023, "grad_norm": 0.3301161527633667, "learning_rate": 1.565962322015753e-05, "loss": 1.2434, "step": 11048 }, { "epoch": 3.2909026601388707, "grad_norm": 0.2551535665988922, "learning_rate": 1.5658827975313828e-05, "loss": 1.2393, "step": 11049 }, { "epoch": 3.2912005063385394, "grad_norm": 0.426665335893631, "learning_rate": 1.5658032677821052e-05, "loss": 1.2372, "step": 11050 }, { "epoch": 3.291498352538208, "grad_norm": 0.28886961936950684, "learning_rate": 1.5657237327686606e-05, "loss": 1.233, "step": 11051 }, { "epoch": 3.2917961987378765, "grad_norm": 0.2779581546783447, "learning_rate": 1.5656441924917888e-05, "loss": 1.2304, "step": 11052 }, { "epoch": 3.2920940449375453, "grad_norm": 0.24516701698303223, "learning_rate": 1.5655646469522294e-05, "loss": 1.2297, "step": 11053 }, { "epoch": 3.292391891137214, "grad_norm": 0.2536821663379669, "learning_rate": 1.565485096150723e-05, "loss": 1.2441, "step": 11054 }, { "epoch": 3.292689737336883, "grad_norm": 0.2568303644657135, "learning_rate": 1.5654055400880097e-05, "loss": 1.227, "step": 11055 }, { "epoch": 3.292987583536551, "grad_norm": 0.24783992767333984, "learning_rate": 1.5653259787648293e-05, "loss": 1.2229, "step": 11056 }, { "epoch": 3.29328542973622, "grad_norm": 0.23471763730049133, "learning_rate": 1.5652464121819226e-05, "loss": 1.2214, "step": 11057 }, { "epoch": 3.2935832759358887, "grad_norm": 0.24588479101657867, "learning_rate": 1.5651668403400292e-05, "loss": 1.2408, "step": 11058 }, { "epoch": 3.293881122135557, "grad_norm": 0.2398025542497635, "learning_rate": 1.56508726323989e-05, "loss": 1.2487, "step": 11059 }, { "epoch": 3.294178968335226, "grad_norm": 0.25474634766578674, "learning_rate": 1.5650076808822453e-05, "loss": 1.2292, "step": 11060 }, { "epoch": 3.2944768145348946, "grad_norm": 0.2357475310564041, "learning_rate": 1.5649280932678354e-05, "loss": 1.242, "step": 11061 }, { "epoch": 3.2947746607345634, "grad_norm": 0.2470824271440506, "learning_rate": 1.5648485003974004e-05, "loss": 1.2502, "step": 11062 }, { "epoch": 3.2950725069342317, "grad_norm": 0.25632765889167786, "learning_rate": 1.5647689022716813e-05, "loss": 1.2338, "step": 11063 }, { "epoch": 3.2953703531339005, "grad_norm": 0.24734733998775482, "learning_rate": 1.5646892988914187e-05, "loss": 1.2301, "step": 11064 }, { "epoch": 3.2956681993335692, "grad_norm": 0.24102622270584106, "learning_rate": 1.5646096902573533e-05, "loss": 1.2313, "step": 11065 }, { "epoch": 3.2959660455332376, "grad_norm": 0.23595453798770905, "learning_rate": 1.5645300763702253e-05, "loss": 1.246, "step": 11066 }, { "epoch": 3.2962638917329063, "grad_norm": 0.24554964900016785, "learning_rate": 1.564450457230776e-05, "loss": 1.2552, "step": 11067 }, { "epoch": 3.296561737932575, "grad_norm": 0.23637695610523224, "learning_rate": 1.5643708328397455e-05, "loss": 1.253, "step": 11068 }, { "epoch": 3.296859584132244, "grad_norm": 0.2340025156736374, "learning_rate": 1.5642912031978748e-05, "loss": 1.228, "step": 11069 }, { "epoch": 3.297157430331912, "grad_norm": 0.2444021850824356, "learning_rate": 1.564211568305905e-05, "loss": 1.2328, "step": 11070 }, { "epoch": 3.297455276531581, "grad_norm": 0.24834030866622925, "learning_rate": 1.564131928164577e-05, "loss": 1.2504, "step": 11071 }, { "epoch": 3.2977531227312498, "grad_norm": 0.2617681920528412, "learning_rate": 1.564052282774632e-05, "loss": 1.2386, "step": 11072 }, { "epoch": 3.2980509689309185, "grad_norm": 0.24876558780670166, "learning_rate": 1.56397263213681e-05, "loss": 1.2429, "step": 11073 }, { "epoch": 3.298348815130587, "grad_norm": 0.2376701533794403, "learning_rate": 1.5638929762518537e-05, "loss": 1.2425, "step": 11074 }, { "epoch": 3.2986466613302556, "grad_norm": 0.2451256811618805, "learning_rate": 1.5638133151205026e-05, "loss": 1.2413, "step": 11075 }, { "epoch": 3.2989445075299244, "grad_norm": 0.3038402497768402, "learning_rate": 1.5637336487434988e-05, "loss": 1.2606, "step": 11076 }, { "epoch": 3.2992423537295927, "grad_norm": 0.3035128712654114, "learning_rate": 1.563653977121583e-05, "loss": 1.212, "step": 11077 }, { "epoch": 3.2995401999292615, "grad_norm": 0.243771031498909, "learning_rate": 1.5635743002554968e-05, "loss": 1.2518, "step": 11078 }, { "epoch": 3.2998380461289303, "grad_norm": 0.3867659568786621, "learning_rate": 1.5634946181459815e-05, "loss": 1.2285, "step": 11079 }, { "epoch": 3.3001358923285986, "grad_norm": 0.3225020468235016, "learning_rate": 1.5634149307937782e-05, "loss": 1.24, "step": 11080 }, { "epoch": 3.3004337385282674, "grad_norm": 0.253110408782959, "learning_rate": 1.5633352381996284e-05, "loss": 1.2529, "step": 11081 }, { "epoch": 3.300731584727936, "grad_norm": 0.3336648941040039, "learning_rate": 1.5632555403642736e-05, "loss": 1.2554, "step": 11082 }, { "epoch": 3.301029430927605, "grad_norm": 0.2612948715686798, "learning_rate": 1.5631758372884558e-05, "loss": 1.253, "step": 11083 }, { "epoch": 3.3013272771272733, "grad_norm": 0.26156461238861084, "learning_rate": 1.5630961289729158e-05, "loss": 1.2434, "step": 11084 }, { "epoch": 3.301625123326942, "grad_norm": 0.27161288261413574, "learning_rate": 1.563016415418395e-05, "loss": 1.2323, "step": 11085 }, { "epoch": 3.301922969526611, "grad_norm": 0.2873471975326538, "learning_rate": 1.5629366966256362e-05, "loss": 1.2315, "step": 11086 }, { "epoch": 3.3022208157262796, "grad_norm": 0.2846321165561676, "learning_rate": 1.56285697259538e-05, "loss": 1.224, "step": 11087 }, { "epoch": 3.302518661925948, "grad_norm": 0.265316367149353, "learning_rate": 1.5627772433283685e-05, "loss": 1.2408, "step": 11088 }, { "epoch": 3.3028165081256167, "grad_norm": 0.280942440032959, "learning_rate": 1.5626975088253436e-05, "loss": 1.2403, "step": 11089 }, { "epoch": 3.3031143543252854, "grad_norm": 0.25459396839141846, "learning_rate": 1.562617769087047e-05, "loss": 1.2249, "step": 11090 }, { "epoch": 3.3034122005249538, "grad_norm": 0.33199477195739746, "learning_rate": 1.5625380241142206e-05, "loss": 1.2198, "step": 11091 }, { "epoch": 3.3037100467246225, "grad_norm": 0.2421032041311264, "learning_rate": 1.5624582739076067e-05, "loss": 1.2216, "step": 11092 }, { "epoch": 3.3040078929242913, "grad_norm": 0.5085775852203369, "learning_rate": 1.5623785184679468e-05, "loss": 1.2399, "step": 11093 }, { "epoch": 3.3043057391239596, "grad_norm": 0.32602718472480774, "learning_rate": 1.5622987577959827e-05, "loss": 1.2397, "step": 11094 }, { "epoch": 3.3046035853236284, "grad_norm": 0.3089933395385742, "learning_rate": 1.5622189918924575e-05, "loss": 1.2216, "step": 11095 }, { "epoch": 3.304901431523297, "grad_norm": 0.2592551112174988, "learning_rate": 1.5621392207581125e-05, "loss": 1.2513, "step": 11096 }, { "epoch": 3.305199277722966, "grad_norm": 0.3399773836135864, "learning_rate": 1.56205944439369e-05, "loss": 1.2415, "step": 11097 }, { "epoch": 3.3054971239226343, "grad_norm": 0.2785807251930237, "learning_rate": 1.5619796627999326e-05, "loss": 1.2077, "step": 11098 }, { "epoch": 3.305794970122303, "grad_norm": 0.2429707944393158, "learning_rate": 1.561899875977582e-05, "loss": 1.2334, "step": 11099 }, { "epoch": 3.306092816321972, "grad_norm": 0.25270265340805054, "learning_rate": 1.5618200839273813e-05, "loss": 1.2291, "step": 11100 }, { "epoch": 3.3063906625216406, "grad_norm": 0.2738122045993805, "learning_rate": 1.561740286650072e-05, "loss": 1.2364, "step": 11101 }, { "epoch": 3.306688508721309, "grad_norm": 0.23685142397880554, "learning_rate": 1.5616604841463973e-05, "loss": 1.2413, "step": 11102 }, { "epoch": 3.3069863549209777, "grad_norm": 0.3383182883262634, "learning_rate": 1.5615806764170987e-05, "loss": 1.2378, "step": 11103 }, { "epoch": 3.3072842011206465, "grad_norm": 0.26384952664375305, "learning_rate": 1.5615008634629197e-05, "loss": 1.2336, "step": 11104 }, { "epoch": 3.307582047320315, "grad_norm": 0.2673135995864868, "learning_rate": 1.5614210452846027e-05, "loss": 1.2495, "step": 11105 }, { "epoch": 3.3078798935199836, "grad_norm": 0.2639603614807129, "learning_rate": 1.56134122188289e-05, "loss": 1.2412, "step": 11106 }, { "epoch": 3.3081777397196523, "grad_norm": 0.2544959485530853, "learning_rate": 1.5612613932585243e-05, "loss": 1.233, "step": 11107 }, { "epoch": 3.3084755859193207, "grad_norm": 0.24230916798114777, "learning_rate": 1.5611815594122485e-05, "loss": 1.2295, "step": 11108 }, { "epoch": 3.3087734321189894, "grad_norm": 0.24680808186531067, "learning_rate": 1.5611017203448054e-05, "loss": 1.2267, "step": 11109 }, { "epoch": 3.309071278318658, "grad_norm": 0.2686762511730194, "learning_rate": 1.5610218760569377e-05, "loss": 1.2474, "step": 11110 }, { "epoch": 3.309369124518327, "grad_norm": 0.25730791687965393, "learning_rate": 1.560942026549388e-05, "loss": 1.2473, "step": 11111 }, { "epoch": 3.3096669707179953, "grad_norm": 0.2545897960662842, "learning_rate": 1.5608621718228996e-05, "loss": 1.2294, "step": 11112 }, { "epoch": 3.309964816917664, "grad_norm": 0.24847790598869324, "learning_rate": 1.560782311878215e-05, "loss": 1.2383, "step": 11113 }, { "epoch": 3.310262663117333, "grad_norm": 0.24648483097553253, "learning_rate": 1.5607024467160782e-05, "loss": 1.2471, "step": 11114 }, { "epoch": 3.3105605093170016, "grad_norm": 0.25031670928001404, "learning_rate": 1.560622576337231e-05, "loss": 1.2207, "step": 11115 }, { "epoch": 3.31085835551667, "grad_norm": 0.2567838430404663, "learning_rate": 1.5605427007424175e-05, "loss": 1.2409, "step": 11116 }, { "epoch": 3.3111562017163387, "grad_norm": 0.24459747970104218, "learning_rate": 1.5604628199323803e-05, "loss": 1.2244, "step": 11117 }, { "epoch": 3.3114540479160075, "grad_norm": 0.260470986366272, "learning_rate": 1.5603829339078626e-05, "loss": 1.2329, "step": 11118 }, { "epoch": 3.311751894115676, "grad_norm": 0.24614861607551575, "learning_rate": 1.5603030426696078e-05, "loss": 1.2427, "step": 11119 }, { "epoch": 3.3120497403153446, "grad_norm": 0.24856600165367126, "learning_rate": 1.5602231462183595e-05, "loss": 1.2446, "step": 11120 }, { "epoch": 3.3123475865150134, "grad_norm": 0.2849280536174774, "learning_rate": 1.5601432445548604e-05, "loss": 1.2428, "step": 11121 }, { "epoch": 3.312645432714682, "grad_norm": 0.25962498784065247, "learning_rate": 1.5600633376798546e-05, "loss": 1.2337, "step": 11122 }, { "epoch": 3.3129432789143505, "grad_norm": 0.2747041583061218, "learning_rate": 1.559983425594085e-05, "loss": 1.2329, "step": 11123 }, { "epoch": 3.3132411251140192, "grad_norm": 0.2400052845478058, "learning_rate": 1.559903508298295e-05, "loss": 1.2384, "step": 11124 }, { "epoch": 3.313538971313688, "grad_norm": 0.2882014513015747, "learning_rate": 1.5598235857932288e-05, "loss": 1.2385, "step": 11125 }, { "epoch": 3.3138368175133563, "grad_norm": 0.23737503588199615, "learning_rate": 1.559743658079629e-05, "loss": 1.2276, "step": 11126 }, { "epoch": 3.314134663713025, "grad_norm": 0.281827837228775, "learning_rate": 1.5596637251582406e-05, "loss": 1.2281, "step": 11127 }, { "epoch": 3.314432509912694, "grad_norm": 0.23551490902900696, "learning_rate": 1.5595837870298064e-05, "loss": 1.2313, "step": 11128 }, { "epoch": 3.3147303561123627, "grad_norm": 0.2924799621105194, "learning_rate": 1.5595038436950697e-05, "loss": 1.2305, "step": 11129 }, { "epoch": 3.315028202312031, "grad_norm": 0.24237075448036194, "learning_rate": 1.5594238951547754e-05, "loss": 1.224, "step": 11130 }, { "epoch": 3.3153260485116998, "grad_norm": 0.3355448544025421, "learning_rate": 1.5593439414096666e-05, "loss": 1.2228, "step": 11131 }, { "epoch": 3.3156238947113685, "grad_norm": 0.2854815125465393, "learning_rate": 1.5592639824604874e-05, "loss": 1.2352, "step": 11132 }, { "epoch": 3.315921740911037, "grad_norm": 0.24154211580753326, "learning_rate": 1.5591840183079817e-05, "loss": 1.2457, "step": 11133 }, { "epoch": 3.3162195871107056, "grad_norm": 0.24469321966171265, "learning_rate": 1.5591040489528936e-05, "loss": 1.2235, "step": 11134 }, { "epoch": 3.3165174333103744, "grad_norm": 0.24999083578586578, "learning_rate": 1.5590240743959667e-05, "loss": 1.2596, "step": 11135 }, { "epoch": 3.316815279510043, "grad_norm": 0.24260610342025757, "learning_rate": 1.5589440946379456e-05, "loss": 1.234, "step": 11136 }, { "epoch": 3.3171131257097115, "grad_norm": 0.2558267116546631, "learning_rate": 1.558864109679574e-05, "loss": 1.241, "step": 11137 }, { "epoch": 3.3174109719093803, "grad_norm": 0.27146750688552856, "learning_rate": 1.5587841195215963e-05, "loss": 1.2618, "step": 11138 }, { "epoch": 3.317708818109049, "grad_norm": 0.2366420477628708, "learning_rate": 1.558704124164757e-05, "loss": 1.2395, "step": 11139 }, { "epoch": 3.318006664308718, "grad_norm": 0.25245681405067444, "learning_rate": 1.5586241236097995e-05, "loss": 1.2289, "step": 11140 }, { "epoch": 3.318304510508386, "grad_norm": 0.2500971555709839, "learning_rate": 1.5585441178574688e-05, "loss": 1.2464, "step": 11141 }, { "epoch": 3.318602356708055, "grad_norm": 0.2692393958568573, "learning_rate": 1.5584641069085097e-05, "loss": 1.2469, "step": 11142 }, { "epoch": 3.3189002029077237, "grad_norm": 0.41690102219581604, "learning_rate": 1.5583840907636655e-05, "loss": 1.2207, "step": 11143 }, { "epoch": 3.319198049107392, "grad_norm": 0.2596820592880249, "learning_rate": 1.5583040694236812e-05, "loss": 1.2374, "step": 11144 }, { "epoch": 3.319495895307061, "grad_norm": 0.2902927100658417, "learning_rate": 1.5582240428893013e-05, "loss": 1.2273, "step": 11145 }, { "epoch": 3.3197937415067296, "grad_norm": 0.2572629153728485, "learning_rate": 1.5581440111612707e-05, "loss": 1.2404, "step": 11146 }, { "epoch": 3.320091587706398, "grad_norm": 0.33107370138168335, "learning_rate": 1.5580639742403332e-05, "loss": 1.2392, "step": 11147 }, { "epoch": 3.3203894339060667, "grad_norm": 0.27607280015945435, "learning_rate": 1.5579839321272342e-05, "loss": 1.2241, "step": 11148 }, { "epoch": 3.3206872801057354, "grad_norm": 0.28077617287635803, "learning_rate": 1.5579038848227184e-05, "loss": 1.2272, "step": 11149 }, { "epoch": 3.320985126305404, "grad_norm": 0.2819717824459076, "learning_rate": 1.5578238323275298e-05, "loss": 1.2274, "step": 11150 }, { "epoch": 3.3212829725050725, "grad_norm": 0.24944089353084564, "learning_rate": 1.5577437746424138e-05, "loss": 1.2516, "step": 11151 }, { "epoch": 3.3215808187047413, "grad_norm": 0.3945283591747284, "learning_rate": 1.557663711768115e-05, "loss": 1.2347, "step": 11152 }, { "epoch": 3.32187866490441, "grad_norm": 0.36648494005203247, "learning_rate": 1.5575836437053787e-05, "loss": 1.2292, "step": 11153 }, { "epoch": 3.322176511104079, "grad_norm": 0.30352574586868286, "learning_rate": 1.5575035704549498e-05, "loss": 1.2408, "step": 11154 }, { "epoch": 3.322474357303747, "grad_norm": 0.30668723583221436, "learning_rate": 1.5574234920175727e-05, "loss": 1.2316, "step": 11155 }, { "epoch": 3.322772203503416, "grad_norm": 0.33290109038352966, "learning_rate": 1.5573434083939927e-05, "loss": 1.2292, "step": 11156 }, { "epoch": 3.3230700497030847, "grad_norm": 0.2717600166797638, "learning_rate": 1.557263319584955e-05, "loss": 1.2307, "step": 11157 }, { "epoch": 3.323367895902753, "grad_norm": 0.2959630489349365, "learning_rate": 1.5571832255912048e-05, "loss": 1.2367, "step": 11158 }, { "epoch": 3.323665742102422, "grad_norm": 0.25313127040863037, "learning_rate": 1.5571031264134873e-05, "loss": 1.2327, "step": 11159 }, { "epoch": 3.3239635883020906, "grad_norm": 0.2804490029811859, "learning_rate": 1.5570230220525476e-05, "loss": 1.2655, "step": 11160 }, { "epoch": 3.324261434501759, "grad_norm": 0.334187388420105, "learning_rate": 1.556942912509131e-05, "loss": 1.237, "step": 11161 }, { "epoch": 3.3245592807014277, "grad_norm": 0.24870112538337708, "learning_rate": 1.5568627977839828e-05, "loss": 1.2391, "step": 11162 }, { "epoch": 3.3248571269010965, "grad_norm": 0.27060747146606445, "learning_rate": 1.5567826778778485e-05, "loss": 1.2543, "step": 11163 }, { "epoch": 3.3251549731007652, "grad_norm": 0.2730172872543335, "learning_rate": 1.5567025527914735e-05, "loss": 1.2662, "step": 11164 }, { "epoch": 3.3254528193004336, "grad_norm": 0.2469976395368576, "learning_rate": 1.556622422525603e-05, "loss": 1.242, "step": 11165 }, { "epoch": 3.3257506655001023, "grad_norm": 0.336348295211792, "learning_rate": 1.556542287080983e-05, "loss": 1.2396, "step": 11166 }, { "epoch": 3.326048511699771, "grad_norm": 0.24766014516353607, "learning_rate": 1.5564621464583586e-05, "loss": 1.2395, "step": 11167 }, { "epoch": 3.32634635789944, "grad_norm": 0.26880988478660583, "learning_rate": 1.556382000658476e-05, "loss": 1.2395, "step": 11168 }, { "epoch": 3.326644204099108, "grad_norm": 0.2785128355026245, "learning_rate": 1.5563018496820805e-05, "loss": 1.2221, "step": 11169 }, { "epoch": 3.326942050298777, "grad_norm": 0.2513678967952728, "learning_rate": 1.5562216935299176e-05, "loss": 1.2359, "step": 11170 }, { "epoch": 3.3272398964984458, "grad_norm": 0.39331939816474915, "learning_rate": 1.556141532202733e-05, "loss": 1.2293, "step": 11171 }, { "epoch": 3.327537742698114, "grad_norm": 0.2812885046005249, "learning_rate": 1.556061365701273e-05, "loss": 1.2153, "step": 11172 }, { "epoch": 3.327835588897783, "grad_norm": 0.30336180329322815, "learning_rate": 1.5559811940262838e-05, "loss": 1.2361, "step": 11173 }, { "epoch": 3.3281334350974516, "grad_norm": 0.23909083008766174, "learning_rate": 1.5559010171785104e-05, "loss": 1.2316, "step": 11174 }, { "epoch": 3.32843128129712, "grad_norm": 0.28223928809165955, "learning_rate": 1.5558208351586986e-05, "loss": 1.2449, "step": 11175 }, { "epoch": 3.3287291274967887, "grad_norm": 0.24867713451385498, "learning_rate": 1.555740647967596e-05, "loss": 1.238, "step": 11176 }, { "epoch": 3.3290269736964575, "grad_norm": 0.2665850818157196, "learning_rate": 1.5556604556059465e-05, "loss": 1.2258, "step": 11177 }, { "epoch": 3.3293248198961263, "grad_norm": 0.28365078568458557, "learning_rate": 1.555580258074498e-05, "loss": 1.2372, "step": 11178 }, { "epoch": 3.3296226660957946, "grad_norm": 0.37532925605773926, "learning_rate": 1.5555000553739955e-05, "loss": 1.2222, "step": 11179 }, { "epoch": 3.3299205122954634, "grad_norm": 0.22920817136764526, "learning_rate": 1.5554198475051858e-05, "loss": 1.2338, "step": 11180 }, { "epoch": 3.330218358495132, "grad_norm": 0.28767210245132446, "learning_rate": 1.555339634468815e-05, "loss": 1.2309, "step": 11181 }, { "epoch": 3.330516204694801, "grad_norm": 0.24251440167427063, "learning_rate": 1.5552594162656294e-05, "loss": 1.241, "step": 11182 }, { "epoch": 3.3308140508944692, "grad_norm": 0.35734620690345764, "learning_rate": 1.5551791928963752e-05, "loss": 1.2458, "step": 11183 }, { "epoch": 3.331111897094138, "grad_norm": 0.2381419837474823, "learning_rate": 1.5550989643617992e-05, "loss": 1.2279, "step": 11184 }, { "epoch": 3.331409743293807, "grad_norm": 0.27754923701286316, "learning_rate": 1.555018730662647e-05, "loss": 1.2333, "step": 11185 }, { "epoch": 3.331707589493475, "grad_norm": 0.24203741550445557, "learning_rate": 1.554938491799666e-05, "loss": 1.2378, "step": 11186 }, { "epoch": 3.332005435693144, "grad_norm": 0.2660702168941498, "learning_rate": 1.554858247773602e-05, "loss": 1.2315, "step": 11187 }, { "epoch": 3.3323032818928127, "grad_norm": 0.2345641702413559, "learning_rate": 1.554777998585202e-05, "loss": 1.2462, "step": 11188 }, { "epoch": 3.3326011280924814, "grad_norm": 0.2346842736005783, "learning_rate": 1.554697744235213e-05, "loss": 1.2465, "step": 11189 }, { "epoch": 3.3328989742921498, "grad_norm": 0.22912979125976562, "learning_rate": 1.554617484724381e-05, "loss": 1.2554, "step": 11190 }, { "epoch": 3.3331968204918185, "grad_norm": 0.2334700971841812, "learning_rate": 1.554537220053453e-05, "loss": 1.2369, "step": 11191 }, { "epoch": 3.3334946666914873, "grad_norm": 0.22997841238975525, "learning_rate": 1.5544569502231756e-05, "loss": 1.2362, "step": 11192 }, { "epoch": 3.333792512891156, "grad_norm": 0.25977134704589844, "learning_rate": 1.5543766752342957e-05, "loss": 1.2335, "step": 11193 }, { "epoch": 3.3340903590908244, "grad_norm": 0.2689334452152252, "learning_rate": 1.5542963950875604e-05, "loss": 1.2376, "step": 11194 }, { "epoch": 3.334388205290493, "grad_norm": 0.2830061912536621, "learning_rate": 1.5542161097837162e-05, "loss": 1.2258, "step": 11195 }, { "epoch": 3.334686051490162, "grad_norm": 0.2689288258552551, "learning_rate": 1.5541358193235105e-05, "loss": 1.2205, "step": 11196 }, { "epoch": 3.3349838976898303, "grad_norm": 0.23667997121810913, "learning_rate": 1.55405552370769e-05, "loss": 1.2447, "step": 11197 }, { "epoch": 3.335281743889499, "grad_norm": 0.2667357623577118, "learning_rate": 1.5539752229370022e-05, "loss": 1.2398, "step": 11198 }, { "epoch": 3.335579590089168, "grad_norm": 0.2455081343650818, "learning_rate": 1.5538949170121938e-05, "loss": 1.2282, "step": 11199 }, { "epoch": 3.335877436288836, "grad_norm": 0.29290878772735596, "learning_rate": 1.553814605934012e-05, "loss": 1.229, "step": 11200 }, { "epoch": 3.336175282488505, "grad_norm": 0.2486369013786316, "learning_rate": 1.5537342897032038e-05, "loss": 1.2478, "step": 11201 }, { "epoch": 3.3364731286881737, "grad_norm": 0.25021225214004517, "learning_rate": 1.553653968320517e-05, "loss": 1.2316, "step": 11202 }, { "epoch": 3.3367709748878425, "grad_norm": 0.24372710287570953, "learning_rate": 1.5535736417866984e-05, "loss": 1.2361, "step": 11203 }, { "epoch": 3.337068821087511, "grad_norm": 0.24520167708396912, "learning_rate": 1.5534933101024955e-05, "loss": 1.2297, "step": 11204 }, { "epoch": 3.3373666672871796, "grad_norm": 0.25759097933769226, "learning_rate": 1.553412973268656e-05, "loss": 1.2516, "step": 11205 }, { "epoch": 3.3376645134868483, "grad_norm": 0.2549898624420166, "learning_rate": 1.5533326312859266e-05, "loss": 1.244, "step": 11206 }, { "epoch": 3.337962359686517, "grad_norm": 0.23916058242321014, "learning_rate": 1.5532522841550558e-05, "loss": 1.2476, "step": 11207 }, { "epoch": 3.3382602058861854, "grad_norm": 0.23708297312259674, "learning_rate": 1.5531719318767905e-05, "loss": 1.2468, "step": 11208 }, { "epoch": 3.338558052085854, "grad_norm": 0.26817384362220764, "learning_rate": 1.5530915744518784e-05, "loss": 1.2376, "step": 11209 }, { "epoch": 3.338855898285523, "grad_norm": 0.2532375454902649, "learning_rate": 1.553011211881067e-05, "loss": 1.2511, "step": 11210 }, { "epoch": 3.3391537444851913, "grad_norm": 0.2590024769306183, "learning_rate": 1.5529308441651045e-05, "loss": 1.2445, "step": 11211 }, { "epoch": 3.33945159068486, "grad_norm": 0.3769739270210266, "learning_rate": 1.5528504713047378e-05, "loss": 1.2351, "step": 11212 }, { "epoch": 3.339749436884529, "grad_norm": 0.3174033761024475, "learning_rate": 1.5527700933007154e-05, "loss": 1.221, "step": 11213 }, { "epoch": 3.340047283084197, "grad_norm": 0.2384985238313675, "learning_rate": 1.552689710153785e-05, "loss": 1.2173, "step": 11214 }, { "epoch": 3.340345129283866, "grad_norm": 0.29194796085357666, "learning_rate": 1.552609321864694e-05, "loss": 1.2261, "step": 11215 }, { "epoch": 3.3406429754835347, "grad_norm": 0.2940313220024109, "learning_rate": 1.552528928434191e-05, "loss": 1.2108, "step": 11216 }, { "epoch": 3.3409408216832035, "grad_norm": 0.30959200859069824, "learning_rate": 1.5524485298630237e-05, "loss": 1.2483, "step": 11217 }, { "epoch": 3.341238667882872, "grad_norm": 0.28486740589141846, "learning_rate": 1.5523681261519396e-05, "loss": 1.2348, "step": 11218 }, { "epoch": 3.3415365140825406, "grad_norm": 0.2476719468832016, "learning_rate": 1.5522877173016878e-05, "loss": 1.2333, "step": 11219 }, { "epoch": 3.3418343602822094, "grad_norm": 0.2547983229160309, "learning_rate": 1.5522073033130153e-05, "loss": 1.2469, "step": 11220 }, { "epoch": 3.342132206481878, "grad_norm": 0.2380474954843521, "learning_rate": 1.552126884186671e-05, "loss": 1.2356, "step": 11221 }, { "epoch": 3.3424300526815465, "grad_norm": 0.2959021031856537, "learning_rate": 1.552046459923403e-05, "loss": 1.2423, "step": 11222 }, { "epoch": 3.3427278988812152, "grad_norm": 0.32568472623825073, "learning_rate": 1.5519660305239595e-05, "loss": 1.2313, "step": 11223 }, { "epoch": 3.343025745080884, "grad_norm": 0.2493116706609726, "learning_rate": 1.5518855959890887e-05, "loss": 1.2325, "step": 11224 }, { "epoch": 3.3433235912805523, "grad_norm": 0.2777004837989807, "learning_rate": 1.551805156319539e-05, "loss": 1.24, "step": 11225 }, { "epoch": 3.343621437480221, "grad_norm": 0.2908431589603424, "learning_rate": 1.551724711516059e-05, "loss": 1.2271, "step": 11226 }, { "epoch": 3.34391928367989, "grad_norm": 0.23702867329120636, "learning_rate": 1.5516442615793967e-05, "loss": 1.2398, "step": 11227 }, { "epoch": 3.344217129879558, "grad_norm": 0.2778693735599518, "learning_rate": 1.551563806510301e-05, "loss": 1.2381, "step": 11228 }, { "epoch": 3.344514976079227, "grad_norm": 0.23637819290161133, "learning_rate": 1.5514833463095206e-05, "loss": 1.2358, "step": 11229 }, { "epoch": 3.3448128222788958, "grad_norm": 0.2453201860189438, "learning_rate": 1.5514028809778033e-05, "loss": 1.2341, "step": 11230 }, { "epoch": 3.3451106684785645, "grad_norm": 0.24085794389247894, "learning_rate": 1.551322410515899e-05, "loss": 1.2464, "step": 11231 }, { "epoch": 3.345408514678233, "grad_norm": 0.2635227143764496, "learning_rate": 1.5512419349245548e-05, "loss": 1.2532, "step": 11232 }, { "epoch": 3.3457063608779016, "grad_norm": 0.23879170417785645, "learning_rate": 1.5511614542045206e-05, "loss": 1.2441, "step": 11233 }, { "epoch": 3.3460042070775704, "grad_norm": 0.2861533463001251, "learning_rate": 1.551080968356545e-05, "loss": 1.2367, "step": 11234 }, { "epoch": 3.346302053277239, "grad_norm": 0.3119814395904541, "learning_rate": 1.5510004773813766e-05, "loss": 1.2341, "step": 11235 }, { "epoch": 3.3465998994769075, "grad_norm": 0.2288614809513092, "learning_rate": 1.5509199812797645e-05, "loss": 1.2325, "step": 11236 }, { "epoch": 3.3468977456765763, "grad_norm": 0.3121853470802307, "learning_rate": 1.5508394800524573e-05, "loss": 1.2375, "step": 11237 }, { "epoch": 3.347195591876245, "grad_norm": 0.2740115821361542, "learning_rate": 1.5507589737002043e-05, "loss": 1.2252, "step": 11238 }, { "epoch": 3.3474934380759134, "grad_norm": 0.2465236335992813, "learning_rate": 1.5506784622237543e-05, "loss": 1.2334, "step": 11239 }, { "epoch": 3.347791284275582, "grad_norm": 0.27302634716033936, "learning_rate": 1.5505979456238565e-05, "loss": 1.2426, "step": 11240 }, { "epoch": 3.348089130475251, "grad_norm": 0.23760247230529785, "learning_rate": 1.55051742390126e-05, "loss": 1.2418, "step": 11241 }, { "epoch": 3.3483869766749192, "grad_norm": 0.24832670390605927, "learning_rate": 1.5504368970567142e-05, "loss": 1.2388, "step": 11242 }, { "epoch": 3.348684822874588, "grad_norm": 0.2424217015504837, "learning_rate": 1.5503563650909675e-05, "loss": 1.2448, "step": 11243 }, { "epoch": 3.348982669074257, "grad_norm": 0.24038386344909668, "learning_rate": 1.5502758280047702e-05, "loss": 1.218, "step": 11244 }, { "epoch": 3.3492805152739256, "grad_norm": 0.23946547508239746, "learning_rate": 1.550195285798871e-05, "loss": 1.2427, "step": 11245 }, { "epoch": 3.349578361473594, "grad_norm": 0.26257964968681335, "learning_rate": 1.550114738474019e-05, "loss": 1.2502, "step": 11246 }, { "epoch": 3.3498762076732627, "grad_norm": 0.26204416155815125, "learning_rate": 1.5500341860309643e-05, "loss": 1.2304, "step": 11247 }, { "epoch": 3.3501740538729314, "grad_norm": 0.2664135694503784, "learning_rate": 1.5499536284704563e-05, "loss": 1.2185, "step": 11248 }, { "epoch": 3.3504719000726, "grad_norm": 0.2504350244998932, "learning_rate": 1.5498730657932442e-05, "loss": 1.2452, "step": 11249 }, { "epoch": 3.3507697462722685, "grad_norm": 0.2880156934261322, "learning_rate": 1.549792498000077e-05, "loss": 1.2469, "step": 11250 }, { "epoch": 3.3510675924719373, "grad_norm": 0.2322516292333603, "learning_rate": 1.549711925091705e-05, "loss": 1.2407, "step": 11251 }, { "epoch": 3.351365438671606, "grad_norm": 0.3223528265953064, "learning_rate": 1.5496313470688784e-05, "loss": 1.2483, "step": 11252 }, { "epoch": 3.3516632848712744, "grad_norm": 0.28237348794937134, "learning_rate": 1.5495507639323453e-05, "loss": 1.2348, "step": 11253 }, { "epoch": 3.351961131070943, "grad_norm": 0.3122625946998596, "learning_rate": 1.549470175682857e-05, "loss": 1.2409, "step": 11254 }, { "epoch": 3.352258977270612, "grad_norm": 0.49274566769599915, "learning_rate": 1.5493895823211623e-05, "loss": 1.2217, "step": 11255 }, { "epoch": 3.3525568234702807, "grad_norm": 0.35787808895111084, "learning_rate": 1.5493089838480116e-05, "loss": 1.2284, "step": 11256 }, { "epoch": 3.352854669669949, "grad_norm": 0.3012523949146271, "learning_rate": 1.5492283802641544e-05, "loss": 1.2401, "step": 11257 }, { "epoch": 3.353152515869618, "grad_norm": 0.4172825813293457, "learning_rate": 1.5491477715703405e-05, "loss": 1.2134, "step": 11258 }, { "epoch": 3.3534503620692866, "grad_norm": 0.2507089376449585, "learning_rate": 1.5490671577673205e-05, "loss": 1.2302, "step": 11259 }, { "epoch": 3.3537482082689554, "grad_norm": 0.25309303402900696, "learning_rate": 1.5489865388558438e-05, "loss": 1.2361, "step": 11260 }, { "epoch": 3.3540460544686237, "grad_norm": 0.23591768741607666, "learning_rate": 1.5489059148366608e-05, "loss": 1.2225, "step": 11261 }, { "epoch": 3.3543439006682925, "grad_norm": 0.3089366555213928, "learning_rate": 1.5488252857105217e-05, "loss": 1.2405, "step": 11262 }, { "epoch": 3.3546417468679612, "grad_norm": 0.258203387260437, "learning_rate": 1.5487446514781762e-05, "loss": 1.2332, "step": 11263 }, { "epoch": 3.3549395930676296, "grad_norm": 0.28117161989212036, "learning_rate": 1.548664012140375e-05, "loss": 1.2348, "step": 11264 }, { "epoch": 3.3552374392672983, "grad_norm": 0.30442148447036743, "learning_rate": 1.5485833676978683e-05, "loss": 1.2227, "step": 11265 }, { "epoch": 3.355535285466967, "grad_norm": 0.2500152289867401, "learning_rate": 1.548502718151406e-05, "loss": 1.2292, "step": 11266 }, { "epoch": 3.3558331316666354, "grad_norm": 0.26201489567756653, "learning_rate": 1.548422063501739e-05, "loss": 1.2374, "step": 11267 }, { "epoch": 3.356130977866304, "grad_norm": 0.2322075515985489, "learning_rate": 1.5483414037496173e-05, "loss": 1.2387, "step": 11268 }, { "epoch": 3.356428824065973, "grad_norm": 0.2669526934623718, "learning_rate": 1.5482607388957915e-05, "loss": 1.2364, "step": 11269 }, { "epoch": 3.3567266702656418, "grad_norm": 0.32109588384628296, "learning_rate": 1.548180068941012e-05, "loss": 1.2123, "step": 11270 }, { "epoch": 3.35702451646531, "grad_norm": 0.2350834608078003, "learning_rate": 1.5480993938860294e-05, "loss": 1.2312, "step": 11271 }, { "epoch": 3.357322362664979, "grad_norm": 0.25055015087127686, "learning_rate": 1.5480187137315942e-05, "loss": 1.2467, "step": 11272 }, { "epoch": 3.3576202088646476, "grad_norm": 0.23985609412193298, "learning_rate": 1.5479380284784574e-05, "loss": 1.2331, "step": 11273 }, { "epoch": 3.3579180550643164, "grad_norm": 0.2938431203365326, "learning_rate": 1.5478573381273694e-05, "loss": 1.2447, "step": 11274 }, { "epoch": 3.3582159012639847, "grad_norm": 0.30418601632118225, "learning_rate": 1.547776642679081e-05, "loss": 1.2456, "step": 11275 }, { "epoch": 3.3585137474636535, "grad_norm": 0.2474595159292221, "learning_rate": 1.5476959421343426e-05, "loss": 1.2425, "step": 11276 }, { "epoch": 3.3588115936633223, "grad_norm": 0.2823903560638428, "learning_rate": 1.5476152364939058e-05, "loss": 1.2244, "step": 11277 }, { "epoch": 3.3591094398629906, "grad_norm": 0.30574876070022583, "learning_rate": 1.547534525758521e-05, "loss": 1.2133, "step": 11278 }, { "epoch": 3.3594072860626594, "grad_norm": 0.24360063672065735, "learning_rate": 1.547453809928939e-05, "loss": 1.2337, "step": 11279 }, { "epoch": 3.359705132262328, "grad_norm": 0.2866424322128296, "learning_rate": 1.547373089005911e-05, "loss": 1.2511, "step": 11280 }, { "epoch": 3.3600029784619965, "grad_norm": 0.2539839744567871, "learning_rate": 1.547292362990188e-05, "loss": 1.239, "step": 11281 }, { "epoch": 3.3603008246616652, "grad_norm": 0.2697642743587494, "learning_rate": 1.547211631882521e-05, "loss": 1.2164, "step": 11282 }, { "epoch": 3.360598670861334, "grad_norm": 0.30866938829421997, "learning_rate": 1.5471308956836614e-05, "loss": 1.2295, "step": 11283 }, { "epoch": 3.360896517061003, "grad_norm": 0.24037256836891174, "learning_rate": 1.54705015439436e-05, "loss": 1.234, "step": 11284 }, { "epoch": 3.361194363260671, "grad_norm": 0.27809154987335205, "learning_rate": 1.546969408015368e-05, "loss": 1.2264, "step": 11285 }, { "epoch": 3.36149220946034, "grad_norm": 0.2547796666622162, "learning_rate": 1.546888656547437e-05, "loss": 1.253, "step": 11286 }, { "epoch": 3.3617900556600087, "grad_norm": 0.29331886768341064, "learning_rate": 1.5468078999913177e-05, "loss": 1.2242, "step": 11287 }, { "epoch": 3.3620879018596774, "grad_norm": 0.308010995388031, "learning_rate": 1.5467271383477617e-05, "loss": 1.2261, "step": 11288 }, { "epoch": 3.3623857480593458, "grad_norm": 0.24176310002803802, "learning_rate": 1.546646371617521e-05, "loss": 1.2268, "step": 11289 }, { "epoch": 3.3626835942590145, "grad_norm": 0.30735498666763306, "learning_rate": 1.5465655998013463e-05, "loss": 1.247, "step": 11290 }, { "epoch": 3.3629814404586833, "grad_norm": 0.2666889429092407, "learning_rate": 1.5464848228999893e-05, "loss": 1.2399, "step": 11291 }, { "epoch": 3.3632792866583516, "grad_norm": 0.3406980037689209, "learning_rate": 1.546404040914202e-05, "loss": 1.2472, "step": 11292 }, { "epoch": 3.3635771328580204, "grad_norm": 0.3658931255340576, "learning_rate": 1.546323253844735e-05, "loss": 1.2063, "step": 11293 }, { "epoch": 3.363874979057689, "grad_norm": 0.28469905257225037, "learning_rate": 1.5462424616923408e-05, "loss": 1.2346, "step": 11294 }, { "epoch": 3.3641728252573575, "grad_norm": 0.3957582712173462, "learning_rate": 1.546161664457771e-05, "loss": 1.2271, "step": 11295 }, { "epoch": 3.3644706714570263, "grad_norm": 0.24683406949043274, "learning_rate": 1.5460808621417768e-05, "loss": 1.2269, "step": 11296 }, { "epoch": 3.364768517656695, "grad_norm": 0.2529073655605316, "learning_rate": 1.5460000547451103e-05, "loss": 1.2387, "step": 11297 }, { "epoch": 3.365066363856364, "grad_norm": 0.2449730634689331, "learning_rate": 1.5459192422685234e-05, "loss": 1.2303, "step": 11298 }, { "epoch": 3.365364210056032, "grad_norm": 0.3057553768157959, "learning_rate": 1.545838424712768e-05, "loss": 1.2147, "step": 11299 }, { "epoch": 3.365662056255701, "grad_norm": 0.2699509859085083, "learning_rate": 1.545757602078596e-05, "loss": 1.2329, "step": 11300 }, { "epoch": 3.3659599024553697, "grad_norm": 0.22986018657684326, "learning_rate": 1.545676774366759e-05, "loss": 1.219, "step": 11301 }, { "epoch": 3.3662577486550385, "grad_norm": 0.24623237550258636, "learning_rate": 1.545595941578009e-05, "loss": 1.2344, "step": 11302 }, { "epoch": 3.366555594854707, "grad_norm": 0.2394254505634308, "learning_rate": 1.545515103713099e-05, "loss": 1.2522, "step": 11303 }, { "epoch": 3.3668534410543756, "grad_norm": 0.24272018671035767, "learning_rate": 1.54543426077278e-05, "loss": 1.2167, "step": 11304 }, { "epoch": 3.3671512872540443, "grad_norm": 0.23567035794258118, "learning_rate": 1.545353412757805e-05, "loss": 1.2314, "step": 11305 }, { "epoch": 3.3674491334537127, "grad_norm": 0.26490622758865356, "learning_rate": 1.5452725596689253e-05, "loss": 1.2422, "step": 11306 }, { "epoch": 3.3677469796533814, "grad_norm": 0.26032236218452454, "learning_rate": 1.545191701506894e-05, "loss": 1.2271, "step": 11307 }, { "epoch": 3.36804482585305, "grad_norm": 0.2538129687309265, "learning_rate": 1.5451108382724628e-05, "loss": 1.2273, "step": 11308 }, { "epoch": 3.3683426720527185, "grad_norm": 0.30988457798957825, "learning_rate": 1.5450299699663842e-05, "loss": 1.217, "step": 11309 }, { "epoch": 3.3686405182523873, "grad_norm": 0.30218836665153503, "learning_rate": 1.544949096589411e-05, "loss": 1.2261, "step": 11310 }, { "epoch": 3.368938364452056, "grad_norm": 0.2780192792415619, "learning_rate": 1.5448682181422955e-05, "loss": 1.223, "step": 11311 }, { "epoch": 3.369236210651725, "grad_norm": 0.23283803462982178, "learning_rate": 1.5447873346257895e-05, "loss": 1.2239, "step": 11312 }, { "epoch": 3.369534056851393, "grad_norm": 0.2457301914691925, "learning_rate": 1.5447064460406464e-05, "loss": 1.2263, "step": 11313 }, { "epoch": 3.369831903051062, "grad_norm": 0.2556935250759125, "learning_rate": 1.5446255523876178e-05, "loss": 1.2382, "step": 11314 }, { "epoch": 3.3701297492507307, "grad_norm": 0.24787387251853943, "learning_rate": 1.5445446536674575e-05, "loss": 1.2383, "step": 11315 }, { "epoch": 3.3704275954503995, "grad_norm": 0.23617060482501984, "learning_rate": 1.5444637498809177e-05, "loss": 1.25, "step": 11316 }, { "epoch": 3.370725441650068, "grad_norm": 0.32861047983169556, "learning_rate": 1.5443828410287506e-05, "loss": 1.2309, "step": 11317 }, { "epoch": 3.3710232878497366, "grad_norm": 0.34847843647003174, "learning_rate": 1.5443019271117096e-05, "loss": 1.2357, "step": 11318 }, { "epoch": 3.3713211340494054, "grad_norm": 0.25416529178619385, "learning_rate": 1.544221008130547e-05, "loss": 1.2434, "step": 11319 }, { "epoch": 3.3716189802490737, "grad_norm": 0.8670878410339355, "learning_rate": 1.5441400840860165e-05, "loss": 1.249, "step": 11320 }, { "epoch": 3.3719168264487425, "grad_norm": 0.26365143060684204, "learning_rate": 1.54405915497887e-05, "loss": 1.2437, "step": 11321 }, { "epoch": 3.3722146726484112, "grad_norm": 0.24683503806591034, "learning_rate": 1.543978220809861e-05, "loss": 1.2421, "step": 11322 }, { "epoch": 3.37251251884808, "grad_norm": 0.24612995982170105, "learning_rate": 1.5438972815797427e-05, "loss": 1.2395, "step": 11323 }, { "epoch": 3.3728103650477483, "grad_norm": 0.24689802527427673, "learning_rate": 1.5438163372892675e-05, "loss": 1.2375, "step": 11324 }, { "epoch": 3.373108211247417, "grad_norm": 0.22813276946544647, "learning_rate": 1.5437353879391893e-05, "loss": 1.2193, "step": 11325 }, { "epoch": 3.373406057447086, "grad_norm": 0.23651349544525146, "learning_rate": 1.5436544335302604e-05, "loss": 1.2258, "step": 11326 }, { "epoch": 3.3737039036467547, "grad_norm": 0.246408149600029, "learning_rate": 1.5435734740632343e-05, "loss": 1.2265, "step": 11327 }, { "epoch": 3.374001749846423, "grad_norm": 0.24524538218975067, "learning_rate": 1.5434925095388648e-05, "loss": 1.2395, "step": 11328 }, { "epoch": 3.3742995960460918, "grad_norm": 0.23395466804504395, "learning_rate": 1.5434115399579048e-05, "loss": 1.2354, "step": 11329 }, { "epoch": 3.3745974422457605, "grad_norm": 0.22607681155204773, "learning_rate": 1.543330565321107e-05, "loss": 1.24, "step": 11330 }, { "epoch": 3.374895288445429, "grad_norm": 0.23405821621418, "learning_rate": 1.5432495856292255e-05, "loss": 1.2485, "step": 11331 }, { "epoch": 3.3751931346450976, "grad_norm": 0.2404523640871048, "learning_rate": 1.5431686008830137e-05, "loss": 1.2362, "step": 11332 }, { "epoch": 3.3754909808447664, "grad_norm": 0.23444102704524994, "learning_rate": 1.543087611083225e-05, "loss": 1.243, "step": 11333 }, { "epoch": 3.3757888270444347, "grad_norm": 0.23104414343833923, "learning_rate": 1.543006616230613e-05, "loss": 1.2444, "step": 11334 }, { "epoch": 3.3760866732441035, "grad_norm": 0.23779796063899994, "learning_rate": 1.5429256163259307e-05, "loss": 1.2441, "step": 11335 }, { "epoch": 3.3763845194437723, "grad_norm": 0.24722295999526978, "learning_rate": 1.542844611369932e-05, "loss": 1.2246, "step": 11336 }, { "epoch": 3.376682365643441, "grad_norm": 0.24286231398582458, "learning_rate": 1.5427636013633717e-05, "loss": 1.222, "step": 11337 }, { "epoch": 3.3769802118431094, "grad_norm": 0.24265117943286896, "learning_rate": 1.5426825863070013e-05, "loss": 1.2288, "step": 11338 }, { "epoch": 3.377278058042778, "grad_norm": 0.2378610223531723, "learning_rate": 1.5426015662015763e-05, "loss": 1.2452, "step": 11339 }, { "epoch": 3.377575904242447, "grad_norm": 0.2299157977104187, "learning_rate": 1.5425205410478498e-05, "loss": 1.2269, "step": 11340 }, { "epoch": 3.3778737504421157, "grad_norm": 0.23765970766544342, "learning_rate": 1.542439510846576e-05, "loss": 1.2476, "step": 11341 }, { "epoch": 3.378171596641784, "grad_norm": 0.24288013577461243, "learning_rate": 1.5423584755985086e-05, "loss": 1.2212, "step": 11342 }, { "epoch": 3.378469442841453, "grad_norm": 0.23006445169448853, "learning_rate": 1.5422774353044013e-05, "loss": 1.2203, "step": 11343 }, { "epoch": 3.3787672890411216, "grad_norm": 0.23436981439590454, "learning_rate": 1.5421963899650086e-05, "loss": 1.219, "step": 11344 }, { "epoch": 3.37906513524079, "grad_norm": 0.2244638353586197, "learning_rate": 1.542115339581084e-05, "loss": 1.212, "step": 11345 }, { "epoch": 3.3793629814404587, "grad_norm": 0.24347656965255737, "learning_rate": 1.5420342841533823e-05, "loss": 1.2491, "step": 11346 }, { "epoch": 3.3796608276401274, "grad_norm": 0.23737263679504395, "learning_rate": 1.5419532236826568e-05, "loss": 1.2413, "step": 11347 }, { "epoch": 3.3799586738397958, "grad_norm": 0.2330455631017685, "learning_rate": 1.5418721581696623e-05, "loss": 1.2216, "step": 11348 }, { "epoch": 3.3802565200394645, "grad_norm": 0.24291853606700897, "learning_rate": 1.5417910876151525e-05, "loss": 1.2412, "step": 11349 }, { "epoch": 3.3805543662391333, "grad_norm": 0.23848694562911987, "learning_rate": 1.5417100120198822e-05, "loss": 1.2299, "step": 11350 }, { "epoch": 3.380852212438802, "grad_norm": 0.242288738489151, "learning_rate": 1.5416289313846053e-05, "loss": 1.2331, "step": 11351 }, { "epoch": 3.3811500586384704, "grad_norm": 0.23414702713489532, "learning_rate": 1.5415478457100766e-05, "loss": 1.2484, "step": 11352 }, { "epoch": 3.381447904838139, "grad_norm": 0.23552188277244568, "learning_rate": 1.54146675499705e-05, "loss": 1.2364, "step": 11353 }, { "epoch": 3.381745751037808, "grad_norm": 0.2976562976837158, "learning_rate": 1.5413856592462804e-05, "loss": 1.2362, "step": 11354 }, { "epoch": 3.3820435972374767, "grad_norm": 0.22300995886325836, "learning_rate": 1.541304558458522e-05, "loss": 1.2343, "step": 11355 }, { "epoch": 3.382341443437145, "grad_norm": 0.2366066724061966, "learning_rate": 1.5412234526345292e-05, "loss": 1.248, "step": 11356 }, { "epoch": 3.382639289636814, "grad_norm": 0.2326957881450653, "learning_rate": 1.5411423417750574e-05, "loss": 1.219, "step": 11357 }, { "epoch": 3.3829371358364826, "grad_norm": 0.23947864770889282, "learning_rate": 1.54106122588086e-05, "loss": 1.2503, "step": 11358 }, { "epoch": 3.383234982036151, "grad_norm": 0.24725420773029327, "learning_rate": 1.540980104952693e-05, "loss": 1.2324, "step": 11359 }, { "epoch": 3.3835328282358197, "grad_norm": 0.24381734430789948, "learning_rate": 1.5408989789913102e-05, "loss": 1.2052, "step": 11360 }, { "epoch": 3.3838306744354885, "grad_norm": 0.24043962359428406, "learning_rate": 1.540817847997467e-05, "loss": 1.2338, "step": 11361 }, { "epoch": 3.384128520635157, "grad_norm": 0.23623792827129364, "learning_rate": 1.540736711971918e-05, "loss": 1.2289, "step": 11362 }, { "epoch": 3.3844263668348256, "grad_norm": 0.23731105029582977, "learning_rate": 1.5406555709154177e-05, "loss": 1.2413, "step": 11363 }, { "epoch": 3.3847242130344943, "grad_norm": 0.23703479766845703, "learning_rate": 1.5405744248287215e-05, "loss": 1.2469, "step": 11364 }, { "epoch": 3.385022059234163, "grad_norm": 0.23037002980709076, "learning_rate": 1.5404932737125845e-05, "loss": 1.2363, "step": 11365 }, { "epoch": 3.3853199054338314, "grad_norm": 0.2389373779296875, "learning_rate": 1.5404121175677613e-05, "loss": 1.2546, "step": 11366 }, { "epoch": 3.3856177516335, "grad_norm": 0.23546260595321655, "learning_rate": 1.5403309563950067e-05, "loss": 1.2158, "step": 11367 }, { "epoch": 3.385915597833169, "grad_norm": 0.23392999172210693, "learning_rate": 1.5402497901950768e-05, "loss": 1.2238, "step": 11368 }, { "epoch": 3.3862134440328377, "grad_norm": 0.23209136724472046, "learning_rate": 1.5401686189687262e-05, "loss": 1.2254, "step": 11369 }, { "epoch": 3.386511290232506, "grad_norm": 0.23876014351844788, "learning_rate": 1.54008744271671e-05, "loss": 1.2698, "step": 11370 }, { "epoch": 3.386809136432175, "grad_norm": 0.2305627465248108, "learning_rate": 1.5400062614397836e-05, "loss": 1.2351, "step": 11371 }, { "epoch": 3.3871069826318436, "grad_norm": 0.23279821872711182, "learning_rate": 1.5399250751387024e-05, "loss": 1.2307, "step": 11372 }, { "epoch": 3.387404828831512, "grad_norm": 0.2321329116821289, "learning_rate": 1.539843883814221e-05, "loss": 1.2479, "step": 11373 }, { "epoch": 3.3877026750311807, "grad_norm": 0.23694854974746704, "learning_rate": 1.539762687467096e-05, "loss": 1.2331, "step": 11374 }, { "epoch": 3.3880005212308495, "grad_norm": 0.23521921038627625, "learning_rate": 1.5396814860980818e-05, "loss": 1.2391, "step": 11375 }, { "epoch": 3.388298367430518, "grad_norm": 0.23892724514007568, "learning_rate": 1.5396002797079347e-05, "loss": 1.2316, "step": 11376 }, { "epoch": 3.3885962136301866, "grad_norm": 0.22810769081115723, "learning_rate": 1.53951906829741e-05, "loss": 1.2344, "step": 11377 }, { "epoch": 3.3888940598298554, "grad_norm": 0.22269442677497864, "learning_rate": 1.539437851867263e-05, "loss": 1.2383, "step": 11378 }, { "epoch": 3.389191906029524, "grad_norm": 0.22351641952991486, "learning_rate": 1.5393566304182496e-05, "loss": 1.2262, "step": 11379 }, { "epoch": 3.3894897522291925, "grad_norm": 0.2398180514574051, "learning_rate": 1.5392754039511248e-05, "loss": 1.2449, "step": 11380 }, { "epoch": 3.3897875984288612, "grad_norm": 0.244129478931427, "learning_rate": 1.5391941724666454e-05, "loss": 1.2418, "step": 11381 }, { "epoch": 3.39008544462853, "grad_norm": 0.2408350110054016, "learning_rate": 1.5391129359655663e-05, "loss": 1.2466, "step": 11382 }, { "epoch": 3.390383290828199, "grad_norm": 0.24110910296440125, "learning_rate": 1.539031694448644e-05, "loss": 1.2369, "step": 11383 }, { "epoch": 3.390681137027867, "grad_norm": 0.2331976592540741, "learning_rate": 1.5389504479166338e-05, "loss": 1.2155, "step": 11384 }, { "epoch": 3.390978983227536, "grad_norm": 0.2415946125984192, "learning_rate": 1.5388691963702922e-05, "loss": 1.2357, "step": 11385 }, { "epoch": 3.3912768294272047, "grad_norm": 0.22565360367298126, "learning_rate": 1.5387879398103742e-05, "loss": 1.2278, "step": 11386 }, { "epoch": 3.391574675626873, "grad_norm": 0.23874175548553467, "learning_rate": 1.538706678237637e-05, "loss": 1.2379, "step": 11387 }, { "epoch": 3.3918725218265418, "grad_norm": 0.22943367063999176, "learning_rate": 1.5386254116528355e-05, "loss": 1.2483, "step": 11388 }, { "epoch": 3.3921703680262105, "grad_norm": 0.23683691024780273, "learning_rate": 1.5385441400567267e-05, "loss": 1.2391, "step": 11389 }, { "epoch": 3.3924682142258793, "grad_norm": 0.23681248724460602, "learning_rate": 1.5384628634500663e-05, "loss": 1.2433, "step": 11390 }, { "epoch": 3.3927660604255476, "grad_norm": 0.2290094941854477, "learning_rate": 1.5383815818336106e-05, "loss": 1.2506, "step": 11391 }, { "epoch": 3.3930639066252164, "grad_norm": 0.2265474796295166, "learning_rate": 1.5383002952081154e-05, "loss": 1.2464, "step": 11392 }, { "epoch": 3.393361752824885, "grad_norm": 0.22690005600452423, "learning_rate": 1.5382190035743377e-05, "loss": 1.2203, "step": 11393 }, { "epoch": 3.393659599024554, "grad_norm": 0.23712964355945587, "learning_rate": 1.5381377069330333e-05, "loss": 1.2277, "step": 11394 }, { "epoch": 3.3939574452242223, "grad_norm": 0.23558643460273743, "learning_rate": 1.5380564052849592e-05, "loss": 1.2146, "step": 11395 }, { "epoch": 3.394255291423891, "grad_norm": 0.24280938506126404, "learning_rate": 1.5379750986308716e-05, "loss": 1.2386, "step": 11396 }, { "epoch": 3.39455313762356, "grad_norm": 0.24029718339443207, "learning_rate": 1.537893786971526e-05, "loss": 1.2279, "step": 11397 }, { "epoch": 3.394850983823228, "grad_norm": 0.24037472903728485, "learning_rate": 1.53781247030768e-05, "loss": 1.2399, "step": 11398 }, { "epoch": 3.395148830022897, "grad_norm": 0.2389010637998581, "learning_rate": 1.53773114864009e-05, "loss": 1.2228, "step": 11399 }, { "epoch": 3.3954466762225657, "grad_norm": 0.25520673394203186, "learning_rate": 1.537649821969512e-05, "loss": 1.2429, "step": 11400 }, { "epoch": 3.395744522422234, "grad_norm": 0.23565858602523804, "learning_rate": 1.5375684902967038e-05, "loss": 1.2234, "step": 11401 }, { "epoch": 3.396042368621903, "grad_norm": 0.24352875351905823, "learning_rate": 1.537487153622421e-05, "loss": 1.2457, "step": 11402 }, { "epoch": 3.3963402148215716, "grad_norm": 0.23773491382598877, "learning_rate": 1.5374058119474208e-05, "loss": 1.2342, "step": 11403 }, { "epoch": 3.3966380610212403, "grad_norm": 0.23633608222007751, "learning_rate": 1.5373244652724596e-05, "loss": 1.2565, "step": 11404 }, { "epoch": 3.3969359072209087, "grad_norm": 0.23755548894405365, "learning_rate": 1.537243113598295e-05, "loss": 1.2298, "step": 11405 }, { "epoch": 3.3972337534205774, "grad_norm": 0.23748286068439484, "learning_rate": 1.5371617569256834e-05, "loss": 1.2288, "step": 11406 }, { "epoch": 3.397531599620246, "grad_norm": 0.23521433770656586, "learning_rate": 1.537080395255382e-05, "loss": 1.2534, "step": 11407 }, { "epoch": 3.397829445819915, "grad_norm": 0.24321109056472778, "learning_rate": 1.5369990285881473e-05, "loss": 1.2291, "step": 11408 }, { "epoch": 3.3981272920195833, "grad_norm": 0.23622368276119232, "learning_rate": 1.5369176569247367e-05, "loss": 1.2367, "step": 11409 }, { "epoch": 3.398425138219252, "grad_norm": 0.2309383749961853, "learning_rate": 1.536836280265907e-05, "loss": 1.2377, "step": 11410 }, { "epoch": 3.398722984418921, "grad_norm": 0.24700772762298584, "learning_rate": 1.5367548986124156e-05, "loss": 1.2167, "step": 11411 }, { "epoch": 3.399020830618589, "grad_norm": 0.2414867877960205, "learning_rate": 1.53667351196502e-05, "loss": 1.2547, "step": 11412 }, { "epoch": 3.399318676818258, "grad_norm": 0.23084893822669983, "learning_rate": 1.5365921203244765e-05, "loss": 1.2276, "step": 11413 }, { "epoch": 3.3996165230179267, "grad_norm": 0.23509570956230164, "learning_rate": 1.536510723691543e-05, "loss": 1.2423, "step": 11414 }, { "epoch": 3.399914369217595, "grad_norm": 0.24819272756576538, "learning_rate": 1.5364293220669764e-05, "loss": 1.2291, "step": 11415 }, { "epoch": 3.400212215417264, "grad_norm": 0.23980839550495148, "learning_rate": 1.5363479154515342e-05, "loss": 1.2319, "step": 11416 }, { "epoch": 3.4005100616169326, "grad_norm": 0.2304224967956543, "learning_rate": 1.5362665038459743e-05, "loss": 1.2384, "step": 11417 }, { "epoch": 3.4008079078166014, "grad_norm": 0.23365645110607147, "learning_rate": 1.536185087251054e-05, "loss": 1.2207, "step": 11418 }, { "epoch": 3.4011057540162697, "grad_norm": 0.24747350811958313, "learning_rate": 1.53610366566753e-05, "loss": 1.2359, "step": 11419 }, { "epoch": 3.4014036002159385, "grad_norm": 0.23782464861869812, "learning_rate": 1.5360222390961602e-05, "loss": 1.2469, "step": 11420 }, { "epoch": 3.4017014464156072, "grad_norm": 0.2449568659067154, "learning_rate": 1.5359408075377028e-05, "loss": 1.2256, "step": 11421 }, { "epoch": 3.401999292615276, "grad_norm": 0.23566211760044098, "learning_rate": 1.5358593709929148e-05, "loss": 1.229, "step": 11422 }, { "epoch": 3.4022971388149443, "grad_norm": 0.23816730082035065, "learning_rate": 1.535777929462554e-05, "loss": 1.2198, "step": 11423 }, { "epoch": 3.402594985014613, "grad_norm": 0.23726990818977356, "learning_rate": 1.5356964829473785e-05, "loss": 1.2452, "step": 11424 }, { "epoch": 3.402892831214282, "grad_norm": 0.24003499746322632, "learning_rate": 1.535615031448145e-05, "loss": 1.2421, "step": 11425 }, { "epoch": 3.40319067741395, "grad_norm": 0.24958519637584686, "learning_rate": 1.535533574965613e-05, "loss": 1.235, "step": 11426 }, { "epoch": 3.403488523613619, "grad_norm": 0.23630942404270172, "learning_rate": 1.5354521135005387e-05, "loss": 1.2159, "step": 11427 }, { "epoch": 3.4037863698132877, "grad_norm": 0.23265543580055237, "learning_rate": 1.535370647053681e-05, "loss": 1.2638, "step": 11428 }, { "epoch": 3.404084216012956, "grad_norm": 0.23566044867038727, "learning_rate": 1.5352891756257977e-05, "loss": 1.2366, "step": 11429 }, { "epoch": 3.404382062212625, "grad_norm": 0.23651911318302155, "learning_rate": 1.5352076992176464e-05, "loss": 1.233, "step": 11430 }, { "epoch": 3.4046799084122936, "grad_norm": 0.24107149243354797, "learning_rate": 1.5351262178299855e-05, "loss": 1.2396, "step": 11431 }, { "epoch": 3.4049777546119624, "grad_norm": 0.237289160490036, "learning_rate": 1.535044731463573e-05, "loss": 1.2406, "step": 11432 }, { "epoch": 3.4052756008116307, "grad_norm": 0.244338721036911, "learning_rate": 1.534963240119167e-05, "loss": 1.2387, "step": 11433 }, { "epoch": 3.4055734470112995, "grad_norm": 0.2455170452594757, "learning_rate": 1.534881743797526e-05, "loss": 1.2468, "step": 11434 }, { "epoch": 3.4058712932109683, "grad_norm": 0.23031951487064362, "learning_rate": 1.534800242499408e-05, "loss": 1.2308, "step": 11435 }, { "epoch": 3.406169139410637, "grad_norm": 0.24803990125656128, "learning_rate": 1.5347187362255712e-05, "loss": 1.2142, "step": 11436 }, { "epoch": 3.4064669856103054, "grad_norm": 0.24458499252796173, "learning_rate": 1.534637224976774e-05, "loss": 1.242, "step": 11437 }, { "epoch": 3.406764831809974, "grad_norm": 0.24179518222808838, "learning_rate": 1.5345557087537745e-05, "loss": 1.2345, "step": 11438 }, { "epoch": 3.407062678009643, "grad_norm": 0.24180927872657776, "learning_rate": 1.5344741875573314e-05, "loss": 1.2373, "step": 11439 }, { "epoch": 3.4073605242093112, "grad_norm": 0.2570483088493347, "learning_rate": 1.5343926613882035e-05, "loss": 1.2379, "step": 11440 }, { "epoch": 3.40765837040898, "grad_norm": 0.24013374745845795, "learning_rate": 1.5343111302471487e-05, "loss": 1.252, "step": 11441 }, { "epoch": 3.407956216608649, "grad_norm": 0.25222188234329224, "learning_rate": 1.5342295941349256e-05, "loss": 1.2317, "step": 11442 }, { "epoch": 3.408254062808317, "grad_norm": 0.2308976799249649, "learning_rate": 1.5341480530522933e-05, "loss": 1.2043, "step": 11443 }, { "epoch": 3.408551909007986, "grad_norm": 0.23158913850784302, "learning_rate": 1.5340665070000102e-05, "loss": 1.2322, "step": 11444 }, { "epoch": 3.4088497552076547, "grad_norm": 0.23615525662899017, "learning_rate": 1.5339849559788346e-05, "loss": 1.2337, "step": 11445 }, { "epoch": 3.4091476014073234, "grad_norm": 0.23380263149738312, "learning_rate": 1.5339033999895262e-05, "loss": 1.2208, "step": 11446 }, { "epoch": 3.4094454476069918, "grad_norm": 0.2484705001115799, "learning_rate": 1.5338218390328426e-05, "loss": 1.2289, "step": 11447 }, { "epoch": 3.4097432938066605, "grad_norm": 0.2381022572517395, "learning_rate": 1.5337402731095433e-05, "loss": 1.2303, "step": 11448 }, { "epoch": 3.4100411400063293, "grad_norm": 0.2324039340019226, "learning_rate": 1.5336587022203874e-05, "loss": 1.2167, "step": 11449 }, { "epoch": 3.410338986205998, "grad_norm": 0.23961563408374786, "learning_rate": 1.533577126366133e-05, "loss": 1.2483, "step": 11450 }, { "epoch": 3.4106368324056664, "grad_norm": 0.2381473183631897, "learning_rate": 1.5334955455475398e-05, "loss": 1.2287, "step": 11451 }, { "epoch": 3.410934678605335, "grad_norm": 0.24186542630195618, "learning_rate": 1.5334139597653667e-05, "loss": 1.2295, "step": 11452 }, { "epoch": 3.411232524805004, "grad_norm": 0.23231279850006104, "learning_rate": 1.5333323690203727e-05, "loss": 1.2287, "step": 11453 }, { "epoch": 3.4115303710046723, "grad_norm": 0.23343810439109802, "learning_rate": 1.5332507733133167e-05, "loss": 1.2377, "step": 11454 }, { "epoch": 3.411828217204341, "grad_norm": 0.24753369390964508, "learning_rate": 1.5331691726449584e-05, "loss": 1.2308, "step": 11455 }, { "epoch": 3.41212606340401, "grad_norm": 0.24245429039001465, "learning_rate": 1.5330875670160563e-05, "loss": 1.2388, "step": 11456 }, { "epoch": 3.4124239096036786, "grad_norm": 0.2326730191707611, "learning_rate": 1.53300595642737e-05, "loss": 1.2193, "step": 11457 }, { "epoch": 3.412721755803347, "grad_norm": 0.23411613702774048, "learning_rate": 1.532924340879659e-05, "loss": 1.2267, "step": 11458 }, { "epoch": 3.4130196020030157, "grad_norm": 0.2385409027338028, "learning_rate": 1.5328427203736822e-05, "loss": 1.24, "step": 11459 }, { "epoch": 3.4133174482026845, "grad_norm": 0.24209468066692352, "learning_rate": 1.5327610949101996e-05, "loss": 1.2404, "step": 11460 }, { "epoch": 3.4136152944023532, "grad_norm": 0.23912429809570312, "learning_rate": 1.53267946448997e-05, "loss": 1.2348, "step": 11461 }, { "epoch": 3.4139131406020216, "grad_norm": 0.24100053310394287, "learning_rate": 1.5325978291137528e-05, "loss": 1.2275, "step": 11462 }, { "epoch": 3.4142109868016903, "grad_norm": 0.2500961124897003, "learning_rate": 1.5325161887823083e-05, "loss": 1.2382, "step": 11463 }, { "epoch": 3.414508833001359, "grad_norm": 0.2547939717769623, "learning_rate": 1.5324345434963953e-05, "loss": 1.2172, "step": 11464 }, { "epoch": 3.4148066792010274, "grad_norm": 0.23142468929290771, "learning_rate": 1.532352893256774e-05, "loss": 1.2428, "step": 11465 }, { "epoch": 3.415104525400696, "grad_norm": 0.234122633934021, "learning_rate": 1.532271238064204e-05, "loss": 1.2307, "step": 11466 }, { "epoch": 3.415402371600365, "grad_norm": 0.22871531546115875, "learning_rate": 1.5321895779194444e-05, "loss": 1.2294, "step": 11467 }, { "epoch": 3.4157002178000333, "grad_norm": 0.231284499168396, "learning_rate": 1.5321079128232556e-05, "loss": 1.2294, "step": 11468 }, { "epoch": 3.415998063999702, "grad_norm": 0.2404448390007019, "learning_rate": 1.5320262427763967e-05, "loss": 1.2443, "step": 11469 }, { "epoch": 3.416295910199371, "grad_norm": 0.22571872174739838, "learning_rate": 1.5319445677796287e-05, "loss": 1.23, "step": 11470 }, { "epoch": 3.4165937563990396, "grad_norm": 0.23221392929553986, "learning_rate": 1.5318628878337106e-05, "loss": 1.2288, "step": 11471 }, { "epoch": 3.416891602598708, "grad_norm": 0.24114064872264862, "learning_rate": 1.5317812029394022e-05, "loss": 1.2246, "step": 11472 }, { "epoch": 3.4171894487983767, "grad_norm": 0.2522605359554291, "learning_rate": 1.5316995130974647e-05, "loss": 1.2435, "step": 11473 }, { "epoch": 3.4174872949980455, "grad_norm": 0.23176567256450653, "learning_rate": 1.5316178183086562e-05, "loss": 1.2514, "step": 11474 }, { "epoch": 3.4177851411977143, "grad_norm": 0.23099017143249512, "learning_rate": 1.5315361185737384e-05, "loss": 1.241, "step": 11475 }, { "epoch": 3.4180829873973826, "grad_norm": 0.2418859750032425, "learning_rate": 1.531454413893471e-05, "loss": 1.2458, "step": 11476 }, { "epoch": 3.4183808335970514, "grad_norm": 0.23647300899028778, "learning_rate": 1.531372704268614e-05, "loss": 1.247, "step": 11477 }, { "epoch": 3.41867867979672, "grad_norm": 0.23838262259960175, "learning_rate": 1.5312909896999277e-05, "loss": 1.2409, "step": 11478 }, { "epoch": 3.4189765259963885, "grad_norm": 0.23677779734134674, "learning_rate": 1.531209270188172e-05, "loss": 1.2419, "step": 11479 }, { "epoch": 3.4192743721960572, "grad_norm": 0.23330330848693848, "learning_rate": 1.531127545734108e-05, "loss": 1.2541, "step": 11480 }, { "epoch": 3.419572218395726, "grad_norm": 0.24141250550746918, "learning_rate": 1.5310458163384955e-05, "loss": 1.2394, "step": 11481 }, { "epoch": 3.4198700645953943, "grad_norm": 0.23776668310165405, "learning_rate": 1.5309640820020947e-05, "loss": 1.2474, "step": 11482 }, { "epoch": 3.420167910795063, "grad_norm": 0.2449045479297638, "learning_rate": 1.5308823427256664e-05, "loss": 1.2289, "step": 11483 }, { "epoch": 3.420465756994732, "grad_norm": 0.24275699257850647, "learning_rate": 1.530800598509971e-05, "loss": 1.2282, "step": 11484 }, { "epoch": 3.4207636031944006, "grad_norm": 0.24288229644298553, "learning_rate": 1.5307188493557698e-05, "loss": 1.2323, "step": 11485 }, { "epoch": 3.421061449394069, "grad_norm": 0.23896589875221252, "learning_rate": 1.5306370952638217e-05, "loss": 1.2299, "step": 11486 }, { "epoch": 3.4213592955937377, "grad_norm": 0.24213965237140656, "learning_rate": 1.5305553362348887e-05, "loss": 1.2364, "step": 11487 }, { "epoch": 3.4216571417934065, "grad_norm": 0.2553258240222931, "learning_rate": 1.530473572269731e-05, "loss": 1.2463, "step": 11488 }, { "epoch": 3.4219549879930753, "grad_norm": 0.2358098179101944, "learning_rate": 1.5303918033691095e-05, "loss": 1.2205, "step": 11489 }, { "epoch": 3.4222528341927436, "grad_norm": 0.24891728162765503, "learning_rate": 1.530310029533785e-05, "loss": 1.2447, "step": 11490 }, { "epoch": 3.4225506803924124, "grad_norm": 0.25674664974212646, "learning_rate": 1.530228250764518e-05, "loss": 1.2315, "step": 11491 }, { "epoch": 3.422848526592081, "grad_norm": 0.2444644570350647, "learning_rate": 1.5301464670620695e-05, "loss": 1.2335, "step": 11492 }, { "epoch": 3.4231463727917495, "grad_norm": 0.24277029931545258, "learning_rate": 1.5300646784272003e-05, "loss": 1.24, "step": 11493 }, { "epoch": 3.4234442189914183, "grad_norm": 0.23423664271831512, "learning_rate": 1.5299828848606716e-05, "loss": 1.245, "step": 11494 }, { "epoch": 3.423742065191087, "grad_norm": 0.2517566382884979, "learning_rate": 1.5299010863632443e-05, "loss": 1.2186, "step": 11495 }, { "epoch": 3.4240399113907554, "grad_norm": 0.24025209248065948, "learning_rate": 1.5298192829356796e-05, "loss": 1.2335, "step": 11496 }, { "epoch": 3.424337757590424, "grad_norm": 0.2340984046459198, "learning_rate": 1.5297374745787383e-05, "loss": 1.2333, "step": 11497 }, { "epoch": 3.424635603790093, "grad_norm": 0.25047650933265686, "learning_rate": 1.5296556612931816e-05, "loss": 1.2257, "step": 11498 }, { "epoch": 3.4249334499897617, "grad_norm": 0.23872359097003937, "learning_rate": 1.5295738430797705e-05, "loss": 1.23, "step": 11499 }, { "epoch": 3.42523129618943, "grad_norm": 0.23748847842216492, "learning_rate": 1.5294920199392667e-05, "loss": 1.2255, "step": 11500 }, { "epoch": 3.42523129618943, "eval_loss": 1.3356280326843262, "eval_runtime": 20.2227, "eval_samples_per_second": 85.745, "eval_steps_per_second": 5.39, "step": 11500 }, { "epoch": 3.425529142389099, "grad_norm": 0.2422969937324524, "learning_rate": 1.5294101918724314e-05, "loss": 1.2451, "step": 11501 }, { "epoch": 3.4258269885887676, "grad_norm": 0.23358918726444244, "learning_rate": 1.5293283588800257e-05, "loss": 1.2222, "step": 11502 }, { "epoch": 3.4261248347884363, "grad_norm": 0.22912141680717468, "learning_rate": 1.529246520962811e-05, "loss": 1.2167, "step": 11503 }, { "epoch": 3.4264226809881047, "grad_norm": 0.22926951944828033, "learning_rate": 1.5291646781215486e-05, "loss": 1.239, "step": 11504 }, { "epoch": 3.4267205271877734, "grad_norm": 0.23395386338233948, "learning_rate": 1.529082830357e-05, "loss": 1.2256, "step": 11505 }, { "epoch": 3.427018373387442, "grad_norm": 0.22575154900550842, "learning_rate": 1.529000977669927e-05, "loss": 1.2189, "step": 11506 }, { "epoch": 3.4273162195871105, "grad_norm": 0.24313803017139435, "learning_rate": 1.5289191200610912e-05, "loss": 1.2381, "step": 11507 }, { "epoch": 3.4276140657867793, "grad_norm": 0.23815204203128815, "learning_rate": 1.5288372575312534e-05, "loss": 1.2288, "step": 11508 }, { "epoch": 3.427911911986448, "grad_norm": 0.23456953465938568, "learning_rate": 1.528755390081176e-05, "loss": 1.2363, "step": 11509 }, { "epoch": 3.4282097581861164, "grad_norm": 0.23735493421554565, "learning_rate": 1.52867351771162e-05, "loss": 1.2338, "step": 11510 }, { "epoch": 3.428507604385785, "grad_norm": 0.24212393164634705, "learning_rate": 1.5285916404233487e-05, "loss": 1.2308, "step": 11511 }, { "epoch": 3.428805450585454, "grad_norm": 0.25003373622894287, "learning_rate": 1.528509758217122e-05, "loss": 1.2551, "step": 11512 }, { "epoch": 3.4291032967851227, "grad_norm": 0.24068664014339447, "learning_rate": 1.5284278710937025e-05, "loss": 1.225, "step": 11513 }, { "epoch": 3.429401142984791, "grad_norm": 0.24042995274066925, "learning_rate": 1.528345979053852e-05, "loss": 1.2416, "step": 11514 }, { "epoch": 3.42969898918446, "grad_norm": 0.2419779896736145, "learning_rate": 1.5282640820983328e-05, "loss": 1.2367, "step": 11515 }, { "epoch": 3.4299968353841286, "grad_norm": 0.23886997997760773, "learning_rate": 1.528182180227906e-05, "loss": 1.2427, "step": 11516 }, { "epoch": 3.4302946815837974, "grad_norm": 0.23846594989299774, "learning_rate": 1.5281002734433344e-05, "loss": 1.2256, "step": 11517 }, { "epoch": 3.4305925277834657, "grad_norm": 0.23304830491542816, "learning_rate": 1.5280183617453805e-05, "loss": 1.238, "step": 11518 }, { "epoch": 3.4308903739831345, "grad_norm": 0.24864642322063446, "learning_rate": 1.5279364451348048e-05, "loss": 1.2303, "step": 11519 }, { "epoch": 3.4311882201828032, "grad_norm": 0.2348104864358902, "learning_rate": 1.5278545236123705e-05, "loss": 1.2352, "step": 11520 }, { "epoch": 3.4314860663824716, "grad_norm": 0.24465292692184448, "learning_rate": 1.5277725971788398e-05, "loss": 1.2217, "step": 11521 }, { "epoch": 3.4317839125821403, "grad_norm": 0.2340582311153412, "learning_rate": 1.5276906658349747e-05, "loss": 1.2267, "step": 11522 }, { "epoch": 3.432081758781809, "grad_norm": 0.23585832118988037, "learning_rate": 1.5276087295815373e-05, "loss": 1.2317, "step": 11523 }, { "epoch": 3.432379604981478, "grad_norm": 0.24549613893032074, "learning_rate": 1.5275267884192905e-05, "loss": 1.2412, "step": 11524 }, { "epoch": 3.432677451181146, "grad_norm": 0.23276914656162262, "learning_rate": 1.527444842348996e-05, "loss": 1.2521, "step": 11525 }, { "epoch": 3.432975297380815, "grad_norm": 0.2454558163881302, "learning_rate": 1.5273628913714165e-05, "loss": 1.2326, "step": 11526 }, { "epoch": 3.4332731435804837, "grad_norm": 0.257112979888916, "learning_rate": 1.5272809354873146e-05, "loss": 1.2418, "step": 11527 }, { "epoch": 3.4335709897801525, "grad_norm": 0.23777510225772858, "learning_rate": 1.527198974697453e-05, "loss": 1.2475, "step": 11528 }, { "epoch": 3.433868835979821, "grad_norm": 0.23546954989433289, "learning_rate": 1.5271170090025936e-05, "loss": 1.2408, "step": 11529 }, { "epoch": 3.4341666821794896, "grad_norm": 0.23517552018165588, "learning_rate": 1.5270350384034993e-05, "loss": 1.2193, "step": 11530 }, { "epoch": 3.4344645283791584, "grad_norm": 0.23490594327449799, "learning_rate": 1.5269530629009332e-05, "loss": 1.2424, "step": 11531 }, { "epoch": 3.4347623745788267, "grad_norm": 0.24757042527198792, "learning_rate": 1.5268710824956574e-05, "loss": 1.217, "step": 11532 }, { "epoch": 3.4350602207784955, "grad_norm": 0.23664523661136627, "learning_rate": 1.5267890971884346e-05, "loss": 1.2264, "step": 11533 }, { "epoch": 3.4353580669781643, "grad_norm": 0.24112482368946075, "learning_rate": 1.526707106980028e-05, "loss": 1.2415, "step": 11534 }, { "epoch": 3.4356559131778326, "grad_norm": 0.251738578081131, "learning_rate": 1.5266251118712005e-05, "loss": 1.2343, "step": 11535 }, { "epoch": 3.4359537593775014, "grad_norm": 0.2540231943130493, "learning_rate": 1.5265431118627147e-05, "loss": 1.2307, "step": 11536 }, { "epoch": 3.43625160557717, "grad_norm": 0.24106860160827637, "learning_rate": 1.526461106955333e-05, "loss": 1.2267, "step": 11537 }, { "epoch": 3.436549451776839, "grad_norm": 0.24626649916172028, "learning_rate": 1.5263790971498195e-05, "loss": 1.2342, "step": 11538 }, { "epoch": 3.4368472979765072, "grad_norm": 0.24112309515476227, "learning_rate": 1.5262970824469362e-05, "loss": 1.2251, "step": 11539 }, { "epoch": 3.437145144176176, "grad_norm": 0.23126092553138733, "learning_rate": 1.5262150628474467e-05, "loss": 1.2328, "step": 11540 }, { "epoch": 3.4374429903758448, "grad_norm": 0.25916430354118347, "learning_rate": 1.5261330383521138e-05, "loss": 1.235, "step": 11541 }, { "epoch": 3.4377408365755135, "grad_norm": 0.23983551561832428, "learning_rate": 1.5260510089617012e-05, "loss": 1.2208, "step": 11542 }, { "epoch": 3.438038682775182, "grad_norm": 0.24710910022258759, "learning_rate": 1.5259689746769714e-05, "loss": 1.2284, "step": 11543 }, { "epoch": 3.4383365289748506, "grad_norm": 0.2789791524410248, "learning_rate": 1.525886935498688e-05, "loss": 1.2285, "step": 11544 }, { "epoch": 3.4386343751745194, "grad_norm": 0.25198033452033997, "learning_rate": 1.525804891427614e-05, "loss": 1.2438, "step": 11545 }, { "epoch": 3.4389322213741877, "grad_norm": 0.24183997511863708, "learning_rate": 1.5257228424645132e-05, "loss": 1.2136, "step": 11546 }, { "epoch": 3.4392300675738565, "grad_norm": 0.2604411542415619, "learning_rate": 1.5256407886101486e-05, "loss": 1.248, "step": 11547 }, { "epoch": 3.4395279137735253, "grad_norm": 0.24355940520763397, "learning_rate": 1.5255587298652841e-05, "loss": 1.2382, "step": 11548 }, { "epoch": 3.4398257599731936, "grad_norm": 0.23050110042095184, "learning_rate": 1.5254766662306825e-05, "loss": 1.2311, "step": 11549 }, { "epoch": 3.4401236061728624, "grad_norm": 0.2449619323015213, "learning_rate": 1.5253945977071076e-05, "loss": 1.232, "step": 11550 }, { "epoch": 3.440421452372531, "grad_norm": 0.2661532759666443, "learning_rate": 1.5253125242953228e-05, "loss": 1.2517, "step": 11551 }, { "epoch": 3.4407192985722, "grad_norm": 0.24256959557533264, "learning_rate": 1.5252304459960922e-05, "loss": 1.2168, "step": 11552 }, { "epoch": 3.4410171447718683, "grad_norm": 0.2672572433948517, "learning_rate": 1.5251483628101791e-05, "loss": 1.2303, "step": 11553 }, { "epoch": 3.441314990971537, "grad_norm": 0.27686724066734314, "learning_rate": 1.525066274738347e-05, "loss": 1.2264, "step": 11554 }, { "epoch": 3.441612837171206, "grad_norm": 0.2409094274044037, "learning_rate": 1.5249841817813602e-05, "loss": 1.228, "step": 11555 }, { "epoch": 3.4419106833708746, "grad_norm": 0.2512090802192688, "learning_rate": 1.5249020839399819e-05, "loss": 1.2488, "step": 11556 }, { "epoch": 3.442208529570543, "grad_norm": 0.23591990768909454, "learning_rate": 1.524819981214976e-05, "loss": 1.2239, "step": 11557 }, { "epoch": 3.4425063757702117, "grad_norm": 0.26415589451789856, "learning_rate": 1.5247378736071068e-05, "loss": 1.2398, "step": 11558 }, { "epoch": 3.4428042219698805, "grad_norm": 0.27485665678977966, "learning_rate": 1.524655761117138e-05, "loss": 1.2146, "step": 11559 }, { "epoch": 3.443102068169549, "grad_norm": 0.2404964715242386, "learning_rate": 1.5245736437458333e-05, "loss": 1.2233, "step": 11560 }, { "epoch": 3.4433999143692176, "grad_norm": 0.36868318915367126, "learning_rate": 1.5244915214939574e-05, "loss": 1.2349, "step": 11561 }, { "epoch": 3.4436977605688863, "grad_norm": 0.371759831905365, "learning_rate": 1.5244093943622733e-05, "loss": 1.2466, "step": 11562 }, { "epoch": 3.4439956067685547, "grad_norm": 0.23560090363025665, "learning_rate": 1.5243272623515459e-05, "loss": 1.2324, "step": 11563 }, { "epoch": 3.4442934529682234, "grad_norm": 0.38808372616767883, "learning_rate": 1.5242451254625393e-05, "loss": 1.2547, "step": 11564 }, { "epoch": 3.444591299167892, "grad_norm": 0.24690575897693634, "learning_rate": 1.5241629836960173e-05, "loss": 1.2413, "step": 11565 }, { "epoch": 3.444889145367561, "grad_norm": 0.2622869908809662, "learning_rate": 1.5240808370527444e-05, "loss": 1.2251, "step": 11566 }, { "epoch": 3.4451869915672293, "grad_norm": 0.2426728904247284, "learning_rate": 1.5239986855334849e-05, "loss": 1.2129, "step": 11567 }, { "epoch": 3.445484837766898, "grad_norm": 0.23665250837802887, "learning_rate": 1.5239165291390033e-05, "loss": 1.2536, "step": 11568 }, { "epoch": 3.445782683966567, "grad_norm": 0.2624794840812683, "learning_rate": 1.5238343678700633e-05, "loss": 1.2273, "step": 11569 }, { "epoch": 3.4460805301662356, "grad_norm": 0.2386084645986557, "learning_rate": 1.52375220172743e-05, "loss": 1.2418, "step": 11570 }, { "epoch": 3.446378376365904, "grad_norm": 0.23174983263015747, "learning_rate": 1.5236700307118674e-05, "loss": 1.241, "step": 11571 }, { "epoch": 3.4466762225655727, "grad_norm": 0.2384009212255478, "learning_rate": 1.523587854824141e-05, "loss": 1.23, "step": 11572 }, { "epoch": 3.4469740687652415, "grad_norm": 0.23641443252563477, "learning_rate": 1.5235056740650138e-05, "loss": 1.2256, "step": 11573 }, { "epoch": 3.44727191496491, "grad_norm": 0.23755528032779694, "learning_rate": 1.5234234884352513e-05, "loss": 1.2521, "step": 11574 }, { "epoch": 3.4475697611645786, "grad_norm": 0.23581965267658234, "learning_rate": 1.5233412979356184e-05, "loss": 1.2449, "step": 11575 }, { "epoch": 3.4478676073642474, "grad_norm": 0.23634174466133118, "learning_rate": 1.5232591025668792e-05, "loss": 1.2538, "step": 11576 }, { "epoch": 3.4481654535639157, "grad_norm": 0.2442152500152588, "learning_rate": 1.5231769023297989e-05, "loss": 1.2541, "step": 11577 }, { "epoch": 3.4484632997635845, "grad_norm": 0.23592181503772736, "learning_rate": 1.5230946972251416e-05, "loss": 1.2374, "step": 11578 }, { "epoch": 3.4487611459632532, "grad_norm": 0.23963795602321625, "learning_rate": 1.523012487253673e-05, "loss": 1.2251, "step": 11579 }, { "epoch": 3.449058992162922, "grad_norm": 0.26141953468322754, "learning_rate": 1.5229302724161574e-05, "loss": 1.2459, "step": 11580 }, { "epoch": 3.4493568383625903, "grad_norm": 0.2843151390552521, "learning_rate": 1.5228480527133598e-05, "loss": 1.2247, "step": 11581 }, { "epoch": 3.449654684562259, "grad_norm": 0.25682297348976135, "learning_rate": 1.5227658281460453e-05, "loss": 1.2454, "step": 11582 }, { "epoch": 3.449952530761928, "grad_norm": 0.4024375081062317, "learning_rate": 1.5226835987149788e-05, "loss": 1.2345, "step": 11583 }, { "epoch": 3.4502503769615966, "grad_norm": 0.29868048429489136, "learning_rate": 1.5226013644209255e-05, "loss": 1.2346, "step": 11584 }, { "epoch": 3.450548223161265, "grad_norm": 0.28260061144828796, "learning_rate": 1.5225191252646503e-05, "loss": 1.244, "step": 11585 }, { "epoch": 3.4508460693609337, "grad_norm": 0.3465733230113983, "learning_rate": 1.5224368812469185e-05, "loss": 1.2389, "step": 11586 }, { "epoch": 3.4511439155606025, "grad_norm": 0.2428768426179886, "learning_rate": 1.522354632368495e-05, "loss": 1.2565, "step": 11587 }, { "epoch": 3.451441761760271, "grad_norm": 0.26560989022254944, "learning_rate": 1.5222723786301454e-05, "loss": 1.2241, "step": 11588 }, { "epoch": 3.4517396079599396, "grad_norm": 0.2622128129005432, "learning_rate": 1.5221901200326352e-05, "loss": 1.2381, "step": 11589 }, { "epoch": 3.4520374541596084, "grad_norm": 0.24717475473880768, "learning_rate": 1.5221078565767289e-05, "loss": 1.2509, "step": 11590 }, { "epoch": 3.452335300359277, "grad_norm": 0.2863736152648926, "learning_rate": 1.5220255882631922e-05, "loss": 1.2513, "step": 11591 }, { "epoch": 3.4526331465589455, "grad_norm": 0.23726223409175873, "learning_rate": 1.5219433150927909e-05, "loss": 1.2436, "step": 11592 }, { "epoch": 3.4529309927586143, "grad_norm": 0.2420269399881363, "learning_rate": 1.5218610370662903e-05, "loss": 1.242, "step": 11593 }, { "epoch": 3.453228838958283, "grad_norm": 0.242299884557724, "learning_rate": 1.5217787541844557e-05, "loss": 1.2291, "step": 11594 }, { "epoch": 3.453526685157952, "grad_norm": 0.23246005177497864, "learning_rate": 1.5216964664480527e-05, "loss": 1.2306, "step": 11595 }, { "epoch": 3.45382453135762, "grad_norm": 0.26450425386428833, "learning_rate": 1.5216141738578471e-05, "loss": 1.2305, "step": 11596 }, { "epoch": 3.454122377557289, "grad_norm": 0.2860598564147949, "learning_rate": 1.5215318764146046e-05, "loss": 1.232, "step": 11597 }, { "epoch": 3.4544202237569577, "grad_norm": 0.2404812127351761, "learning_rate": 1.5214495741190898e-05, "loss": 1.2405, "step": 11598 }, { "epoch": 3.454718069956626, "grad_norm": 0.2578551769256592, "learning_rate": 1.5213672669720701e-05, "loss": 1.232, "step": 11599 }, { "epoch": 3.4550159161562948, "grad_norm": 0.24088731408119202, "learning_rate": 1.5212849549743106e-05, "loss": 1.224, "step": 11600 }, { "epoch": 3.4553137623559635, "grad_norm": 0.24067500233650208, "learning_rate": 1.5212026381265763e-05, "loss": 1.2374, "step": 11601 }, { "epoch": 3.455611608555632, "grad_norm": 0.23492807149887085, "learning_rate": 1.5211203164296342e-05, "loss": 1.2335, "step": 11602 }, { "epoch": 3.4559094547553006, "grad_norm": 0.22992326319217682, "learning_rate": 1.5210379898842499e-05, "loss": 1.2359, "step": 11603 }, { "epoch": 3.4562073009549694, "grad_norm": 0.25816991925239563, "learning_rate": 1.5209556584911889e-05, "loss": 1.2474, "step": 11604 }, { "epoch": 3.456505147154638, "grad_norm": 0.2460036426782608, "learning_rate": 1.5208733222512178e-05, "loss": 1.2224, "step": 11605 }, { "epoch": 3.4568029933543065, "grad_norm": 0.23283728957176208, "learning_rate": 1.5207909811651024e-05, "loss": 1.238, "step": 11606 }, { "epoch": 3.4571008395539753, "grad_norm": 0.23499785363674164, "learning_rate": 1.5207086352336088e-05, "loss": 1.223, "step": 11607 }, { "epoch": 3.457398685753644, "grad_norm": 0.24944666028022766, "learning_rate": 1.520626284457503e-05, "loss": 1.2271, "step": 11608 }, { "epoch": 3.457696531953313, "grad_norm": 0.25068676471710205, "learning_rate": 1.5205439288375511e-05, "loss": 1.2504, "step": 11609 }, { "epoch": 3.457994378152981, "grad_norm": 0.2367619276046753, "learning_rate": 1.5204615683745198e-05, "loss": 1.2555, "step": 11610 }, { "epoch": 3.45829222435265, "grad_norm": 0.2698817849159241, "learning_rate": 1.520379203069175e-05, "loss": 1.2342, "step": 11611 }, { "epoch": 3.4585900705523187, "grad_norm": 0.2815074920654297, "learning_rate": 1.5202968329222832e-05, "loss": 1.2323, "step": 11612 }, { "epoch": 3.458887916751987, "grad_norm": 0.24128301441669464, "learning_rate": 1.5202144579346108e-05, "loss": 1.261, "step": 11613 }, { "epoch": 3.459185762951656, "grad_norm": 0.3304024338722229, "learning_rate": 1.520132078106924e-05, "loss": 1.2242, "step": 11614 }, { "epoch": 3.4594836091513246, "grad_norm": 0.2661862075328827, "learning_rate": 1.5200496934399893e-05, "loss": 1.2535, "step": 11615 }, { "epoch": 3.459781455350993, "grad_norm": 0.3135344684123993, "learning_rate": 1.5199673039345728e-05, "loss": 1.2326, "step": 11616 }, { "epoch": 3.4600793015506617, "grad_norm": 0.30401116609573364, "learning_rate": 1.5198849095914421e-05, "loss": 1.2376, "step": 11617 }, { "epoch": 3.4603771477503305, "grad_norm": 0.25784486532211304, "learning_rate": 1.519802510411363e-05, "loss": 1.2329, "step": 11618 }, { "epoch": 3.4606749939499992, "grad_norm": 0.28522834181785583, "learning_rate": 1.5197201063951022e-05, "loss": 1.2152, "step": 11619 }, { "epoch": 3.4609728401496676, "grad_norm": 0.24161286652088165, "learning_rate": 1.5196376975434265e-05, "loss": 1.2241, "step": 11620 }, { "epoch": 3.4612706863493363, "grad_norm": 0.2395504266023636, "learning_rate": 1.5195552838571026e-05, "loss": 1.2233, "step": 11621 }, { "epoch": 3.461568532549005, "grad_norm": 0.2447492629289627, "learning_rate": 1.5194728653368973e-05, "loss": 1.2439, "step": 11622 }, { "epoch": 3.461866378748674, "grad_norm": 0.23938801884651184, "learning_rate": 1.5193904419835775e-05, "loss": 1.2441, "step": 11623 }, { "epoch": 3.462164224948342, "grad_norm": 0.24809573590755463, "learning_rate": 1.5193080137979098e-05, "loss": 1.2217, "step": 11624 }, { "epoch": 3.462462071148011, "grad_norm": 0.24161048233509064, "learning_rate": 1.5192255807806611e-05, "loss": 1.2405, "step": 11625 }, { "epoch": 3.4627599173476797, "grad_norm": 0.23102059960365295, "learning_rate": 1.5191431429325986e-05, "loss": 1.2351, "step": 11626 }, { "epoch": 3.463057763547348, "grad_norm": 0.2383132129907608, "learning_rate": 1.5190607002544893e-05, "loss": 1.2361, "step": 11627 }, { "epoch": 3.463355609747017, "grad_norm": 0.23085835576057434, "learning_rate": 1.5189782527471e-05, "loss": 1.2264, "step": 11628 }, { "epoch": 3.4636534559466856, "grad_norm": 0.23786355555057526, "learning_rate": 1.5188958004111977e-05, "loss": 1.2265, "step": 11629 }, { "epoch": 3.463951302146354, "grad_norm": 0.24842850863933563, "learning_rate": 1.5188133432475502e-05, "loss": 1.2315, "step": 11630 }, { "epoch": 3.4642491483460227, "grad_norm": 0.3102658987045288, "learning_rate": 1.518730881256924e-05, "loss": 1.2201, "step": 11631 }, { "epoch": 3.4645469945456915, "grad_norm": 0.26763206720352173, "learning_rate": 1.5186484144400861e-05, "loss": 1.2232, "step": 11632 }, { "epoch": 3.4648448407453603, "grad_norm": 0.2530723810195923, "learning_rate": 1.5185659427978046e-05, "loss": 1.2433, "step": 11633 }, { "epoch": 3.4651426869450286, "grad_norm": 0.3082621097564697, "learning_rate": 1.5184834663308464e-05, "loss": 1.2397, "step": 11634 }, { "epoch": 3.4654405331446974, "grad_norm": 0.23491385579109192, "learning_rate": 1.5184009850399788e-05, "loss": 1.2241, "step": 11635 }, { "epoch": 3.465738379344366, "grad_norm": 0.2701856791973114, "learning_rate": 1.518318498925969e-05, "loss": 1.2237, "step": 11636 }, { "epoch": 3.466036225544035, "grad_norm": 0.3279270529747009, "learning_rate": 1.518236007989585e-05, "loss": 1.2419, "step": 11637 }, { "epoch": 3.4663340717437032, "grad_norm": 0.2563593089580536, "learning_rate": 1.5181535122315936e-05, "loss": 1.2361, "step": 11638 }, { "epoch": 3.466631917943372, "grad_norm": 0.25081443786621094, "learning_rate": 1.5180710116527628e-05, "loss": 1.2381, "step": 11639 }, { "epoch": 3.4669297641430408, "grad_norm": 0.2737971246242523, "learning_rate": 1.51798850625386e-05, "loss": 1.229, "step": 11640 }, { "epoch": 3.467227610342709, "grad_norm": 0.255681574344635, "learning_rate": 1.5179059960356531e-05, "loss": 1.2343, "step": 11641 }, { "epoch": 3.467525456542378, "grad_norm": 0.2361997365951538, "learning_rate": 1.5178234809989094e-05, "loss": 1.2458, "step": 11642 }, { "epoch": 3.4678233027420466, "grad_norm": 0.2407403141260147, "learning_rate": 1.5177409611443968e-05, "loss": 1.2343, "step": 11643 }, { "epoch": 3.4681211489417154, "grad_norm": 0.2502894997596741, "learning_rate": 1.5176584364728829e-05, "loss": 1.2303, "step": 11644 }, { "epoch": 3.4684189951413837, "grad_norm": 0.3361647427082062, "learning_rate": 1.5175759069851357e-05, "loss": 1.2341, "step": 11645 }, { "epoch": 3.4687168413410525, "grad_norm": 0.35799121856689453, "learning_rate": 1.5174933726819229e-05, "loss": 1.226, "step": 11646 }, { "epoch": 3.4690146875407213, "grad_norm": 0.24811288714408875, "learning_rate": 1.5174108335640127e-05, "loss": 1.242, "step": 11647 }, { "epoch": 3.4693125337403896, "grad_norm": 0.27803266048431396, "learning_rate": 1.5173282896321727e-05, "loss": 1.2269, "step": 11648 }, { "epoch": 3.4696103799400584, "grad_norm": 0.29134753346443176, "learning_rate": 1.5172457408871708e-05, "loss": 1.2242, "step": 11649 }, { "epoch": 3.469908226139727, "grad_norm": 0.24610275030136108, "learning_rate": 1.517163187329775e-05, "loss": 1.2519, "step": 11650 }, { "epoch": 3.470206072339396, "grad_norm": 0.2515765130519867, "learning_rate": 1.5170806289607538e-05, "loss": 1.2518, "step": 11651 }, { "epoch": 3.4705039185390643, "grad_norm": 0.2578960359096527, "learning_rate": 1.5169980657808752e-05, "loss": 1.2435, "step": 11652 }, { "epoch": 3.470801764738733, "grad_norm": 0.2373909205198288, "learning_rate": 1.5169154977909068e-05, "loss": 1.215, "step": 11653 }, { "epoch": 3.471099610938402, "grad_norm": 0.2524226903915405, "learning_rate": 1.5168329249916176e-05, "loss": 1.2442, "step": 11654 }, { "epoch": 3.47139745713807, "grad_norm": 0.25795549154281616, "learning_rate": 1.5167503473837754e-05, "loss": 1.2246, "step": 11655 }, { "epoch": 3.471695303337739, "grad_norm": 0.2714008390903473, "learning_rate": 1.5166677649681484e-05, "loss": 1.2288, "step": 11656 }, { "epoch": 3.4719931495374077, "grad_norm": 0.3128831088542938, "learning_rate": 1.5165851777455051e-05, "loss": 1.2381, "step": 11657 }, { "epoch": 3.4722909957370764, "grad_norm": 0.23760241270065308, "learning_rate": 1.5165025857166143e-05, "loss": 1.2522, "step": 11658 }, { "epoch": 3.4725888419367448, "grad_norm": 0.3532728850841522, "learning_rate": 1.5164199888822438e-05, "loss": 1.2178, "step": 11659 }, { "epoch": 3.4728866881364135, "grad_norm": 0.2692665457725525, "learning_rate": 1.5163373872431622e-05, "loss": 1.2357, "step": 11660 }, { "epoch": 3.4731845343360823, "grad_norm": 0.2848125696182251, "learning_rate": 1.516254780800138e-05, "loss": 1.2348, "step": 11661 }, { "epoch": 3.473482380535751, "grad_norm": 0.27238327264785767, "learning_rate": 1.51617216955394e-05, "loss": 1.2196, "step": 11662 }, { "epoch": 3.4737802267354194, "grad_norm": 0.25026488304138184, "learning_rate": 1.5160895535053364e-05, "loss": 1.2267, "step": 11663 }, { "epoch": 3.474078072935088, "grad_norm": 0.2846026122570038, "learning_rate": 1.5160069326550965e-05, "loss": 1.2197, "step": 11664 }, { "epoch": 3.474375919134757, "grad_norm": 0.23811158537864685, "learning_rate": 1.5159243070039887e-05, "loss": 1.227, "step": 11665 }, { "epoch": 3.4746737653344253, "grad_norm": 0.29034659266471863, "learning_rate": 1.5158416765527811e-05, "loss": 1.2392, "step": 11666 }, { "epoch": 3.474971611534094, "grad_norm": 0.24458420276641846, "learning_rate": 1.5157590413022433e-05, "loss": 1.2241, "step": 11667 }, { "epoch": 3.475269457733763, "grad_norm": 0.2439926713705063, "learning_rate": 1.5156764012531438e-05, "loss": 1.2295, "step": 11668 }, { "epoch": 3.475567303933431, "grad_norm": 0.25224968791007996, "learning_rate": 1.5155937564062517e-05, "loss": 1.2599, "step": 11669 }, { "epoch": 3.4758651501331, "grad_norm": 0.23347537219524384, "learning_rate": 1.5155111067623357e-05, "loss": 1.2452, "step": 11670 }, { "epoch": 3.4761629963327687, "grad_norm": 0.2441583126783371, "learning_rate": 1.5154284523221648e-05, "loss": 1.2431, "step": 11671 }, { "epoch": 3.4764608425324375, "grad_norm": 0.2398219257593155, "learning_rate": 1.5153457930865081e-05, "loss": 1.233, "step": 11672 }, { "epoch": 3.476758688732106, "grad_norm": 0.3126453161239624, "learning_rate": 1.5152631290561343e-05, "loss": 1.2401, "step": 11673 }, { "epoch": 3.4770565349317746, "grad_norm": 0.3580472469329834, "learning_rate": 1.5151804602318133e-05, "loss": 1.2424, "step": 11674 }, { "epoch": 3.4773543811314434, "grad_norm": 0.23871783912181854, "learning_rate": 1.515097786614313e-05, "loss": 1.2557, "step": 11675 }, { "epoch": 3.477652227331112, "grad_norm": 0.527107834815979, "learning_rate": 1.5150151082044039e-05, "loss": 1.2302, "step": 11676 }, { "epoch": 3.4779500735307805, "grad_norm": 0.34208452701568604, "learning_rate": 1.5149324250028544e-05, "loss": 1.2393, "step": 11677 }, { "epoch": 3.4782479197304492, "grad_norm": 0.311907559633255, "learning_rate": 1.514849737010434e-05, "loss": 1.2422, "step": 11678 }, { "epoch": 3.478545765930118, "grad_norm": 0.23271065950393677, "learning_rate": 1.5147670442279121e-05, "loss": 1.2326, "step": 11679 }, { "epoch": 3.4788436121297863, "grad_norm": 0.26043426990509033, "learning_rate": 1.5146843466560577e-05, "loss": 1.2335, "step": 11680 }, { "epoch": 3.479141458329455, "grad_norm": 0.2789304554462433, "learning_rate": 1.5146016442956409e-05, "loss": 1.2601, "step": 11681 }, { "epoch": 3.479439304529124, "grad_norm": 0.2576315999031067, "learning_rate": 1.5145189371474302e-05, "loss": 1.2326, "step": 11682 }, { "epoch": 3.479737150728792, "grad_norm": 0.24909977614879608, "learning_rate": 1.514436225212196e-05, "loss": 1.2432, "step": 11683 }, { "epoch": 3.480034996928461, "grad_norm": 0.25181618332862854, "learning_rate": 1.5143535084907075e-05, "loss": 1.2398, "step": 11684 }, { "epoch": 3.4803328431281297, "grad_norm": 0.25857406854629517, "learning_rate": 1.5142707869837342e-05, "loss": 1.2421, "step": 11685 }, { "epoch": 3.4806306893277985, "grad_norm": 0.2688182592391968, "learning_rate": 1.5141880606920459e-05, "loss": 1.2292, "step": 11686 }, { "epoch": 3.480928535527467, "grad_norm": 0.23363617062568665, "learning_rate": 1.514105329616412e-05, "loss": 1.2202, "step": 11687 }, { "epoch": 3.4812263817271356, "grad_norm": 0.40400955080986023, "learning_rate": 1.5140225937576027e-05, "loss": 1.2373, "step": 11688 }, { "epoch": 3.4815242279268044, "grad_norm": 0.26861774921417236, "learning_rate": 1.5139398531163872e-05, "loss": 1.2375, "step": 11689 }, { "epoch": 3.481822074126473, "grad_norm": 0.28978976607322693, "learning_rate": 1.5138571076935357e-05, "loss": 1.2426, "step": 11690 }, { "epoch": 3.4821199203261415, "grad_norm": 0.2498270720243454, "learning_rate": 1.5137743574898178e-05, "loss": 1.2452, "step": 11691 }, { "epoch": 3.4824177665258103, "grad_norm": 0.23749759793281555, "learning_rate": 1.5136916025060035e-05, "loss": 1.2412, "step": 11692 }, { "epoch": 3.482715612725479, "grad_norm": 0.30182361602783203, "learning_rate": 1.513608842742863e-05, "loss": 1.2483, "step": 11693 }, { "epoch": 3.4830134589251474, "grad_norm": 0.25271162390708923, "learning_rate": 1.5135260782011659e-05, "loss": 1.2293, "step": 11694 }, { "epoch": 3.483311305124816, "grad_norm": 0.24954035878181458, "learning_rate": 1.5134433088816826e-05, "loss": 1.2422, "step": 11695 }, { "epoch": 3.483609151324485, "grad_norm": 0.2597573697566986, "learning_rate": 1.5133605347851827e-05, "loss": 1.2339, "step": 11696 }, { "epoch": 3.4839069975241532, "grad_norm": 0.2629152238368988, "learning_rate": 1.5132777559124367e-05, "loss": 1.2402, "step": 11697 }, { "epoch": 3.484204843723822, "grad_norm": 0.2538357675075531, "learning_rate": 1.5131949722642145e-05, "loss": 1.2282, "step": 11698 }, { "epoch": 3.4845026899234908, "grad_norm": 0.3156806230545044, "learning_rate": 1.5131121838412868e-05, "loss": 1.2467, "step": 11699 }, { "epoch": 3.4848005361231595, "grad_norm": 0.26059478521347046, "learning_rate": 1.5130293906444232e-05, "loss": 1.2275, "step": 11700 }, { "epoch": 3.485098382322828, "grad_norm": 0.26958325505256653, "learning_rate": 1.5129465926743944e-05, "loss": 1.2389, "step": 11701 }, { "epoch": 3.4853962285224966, "grad_norm": 0.24079011380672455, "learning_rate": 1.5128637899319709e-05, "loss": 1.238, "step": 11702 }, { "epoch": 3.4856940747221654, "grad_norm": 0.2870723009109497, "learning_rate": 1.5127809824179223e-05, "loss": 1.2369, "step": 11703 }, { "epoch": 3.485991920921834, "grad_norm": 0.266473650932312, "learning_rate": 1.5126981701330196e-05, "loss": 1.2322, "step": 11704 }, { "epoch": 3.4862897671215025, "grad_norm": 0.2880406677722931, "learning_rate": 1.512615353078034e-05, "loss": 1.2347, "step": 11705 }, { "epoch": 3.4865876133211713, "grad_norm": 0.26380014419555664, "learning_rate": 1.5125325312537347e-05, "loss": 1.2561, "step": 11706 }, { "epoch": 3.48688545952084, "grad_norm": 0.24268145859241486, "learning_rate": 1.5124497046608927e-05, "loss": 1.229, "step": 11707 }, { "epoch": 3.4871833057205084, "grad_norm": 0.2717461585998535, "learning_rate": 1.512366873300279e-05, "loss": 1.2397, "step": 11708 }, { "epoch": 3.487481151920177, "grad_norm": 0.2414597123861313, "learning_rate": 1.512284037172664e-05, "loss": 1.2317, "step": 11709 }, { "epoch": 3.487778998119846, "grad_norm": 0.2472236007452011, "learning_rate": 1.5122011962788184e-05, "loss": 1.2339, "step": 11710 }, { "epoch": 3.4880768443195147, "grad_norm": 0.24366678297519684, "learning_rate": 1.5121183506195128e-05, "loss": 1.2265, "step": 11711 }, { "epoch": 3.488374690519183, "grad_norm": 0.24428726732730865, "learning_rate": 1.5120355001955184e-05, "loss": 1.2233, "step": 11712 }, { "epoch": 3.488672536718852, "grad_norm": 0.24292029440402985, "learning_rate": 1.5119526450076055e-05, "loss": 1.244, "step": 11713 }, { "epoch": 3.4889703829185206, "grad_norm": 0.23800034821033478, "learning_rate": 1.511869785056545e-05, "loss": 1.2374, "step": 11714 }, { "epoch": 3.489268229118189, "grad_norm": 0.23138365149497986, "learning_rate": 1.5117869203431083e-05, "loss": 1.2376, "step": 11715 }, { "epoch": 3.4895660753178577, "grad_norm": 0.25169605016708374, "learning_rate": 1.5117040508680663e-05, "loss": 1.2436, "step": 11716 }, { "epoch": 3.4898639215175264, "grad_norm": 0.23849761486053467, "learning_rate": 1.5116211766321896e-05, "loss": 1.2522, "step": 11717 }, { "epoch": 3.490161767717195, "grad_norm": 0.24596084654331207, "learning_rate": 1.5115382976362493e-05, "loss": 1.2316, "step": 11718 }, { "epoch": 3.4904596139168635, "grad_norm": 0.23545923829078674, "learning_rate": 1.5114554138810174e-05, "loss": 1.2379, "step": 11719 }, { "epoch": 3.4907574601165323, "grad_norm": 0.24625824391841888, "learning_rate": 1.5113725253672635e-05, "loss": 1.2308, "step": 11720 }, { "epoch": 3.491055306316201, "grad_norm": 0.2673510015010834, "learning_rate": 1.5112896320957597e-05, "loss": 1.2244, "step": 11721 }, { "epoch": 3.4913531525158694, "grad_norm": 0.2335682362318039, "learning_rate": 1.5112067340672774e-05, "loss": 1.2224, "step": 11722 }, { "epoch": 3.491650998715538, "grad_norm": 0.285336434841156, "learning_rate": 1.5111238312825875e-05, "loss": 1.2361, "step": 11723 }, { "epoch": 3.491948844915207, "grad_norm": 0.2684919834136963, "learning_rate": 1.5110409237424612e-05, "loss": 1.2173, "step": 11724 }, { "epoch": 3.4922466911148757, "grad_norm": 0.23368461430072784, "learning_rate": 1.5109580114476705e-05, "loss": 1.2332, "step": 11725 }, { "epoch": 3.492544537314544, "grad_norm": 0.2434931993484497, "learning_rate": 1.5108750943989863e-05, "loss": 1.227, "step": 11726 }, { "epoch": 3.492842383514213, "grad_norm": 0.252808541059494, "learning_rate": 1.5107921725971797e-05, "loss": 1.218, "step": 11727 }, { "epoch": 3.4931402297138816, "grad_norm": 0.2509145736694336, "learning_rate": 1.5107092460430228e-05, "loss": 1.2318, "step": 11728 }, { "epoch": 3.4934380759135504, "grad_norm": 0.30690038204193115, "learning_rate": 1.5106263147372872e-05, "loss": 1.2296, "step": 11729 }, { "epoch": 3.4937359221132187, "grad_norm": 0.2524191439151764, "learning_rate": 1.5105433786807442e-05, "loss": 1.2347, "step": 11730 }, { "epoch": 3.4940337683128875, "grad_norm": 0.2589205503463745, "learning_rate": 1.5104604378741653e-05, "loss": 1.2335, "step": 11731 }, { "epoch": 3.4943316145125562, "grad_norm": 0.2606979012489319, "learning_rate": 1.5103774923183223e-05, "loss": 1.2253, "step": 11732 }, { "epoch": 3.4946294607122246, "grad_norm": 0.22792330384254456, "learning_rate": 1.5102945420139872e-05, "loss": 1.226, "step": 11733 }, { "epoch": 3.4949273069118933, "grad_norm": 0.2698785364627838, "learning_rate": 1.5102115869619315e-05, "loss": 1.2297, "step": 11734 }, { "epoch": 3.495225153111562, "grad_norm": 0.24608469009399414, "learning_rate": 1.5101286271629269e-05, "loss": 1.2346, "step": 11735 }, { "epoch": 3.4955229993112304, "grad_norm": 0.32868117094039917, "learning_rate": 1.5100456626177455e-05, "loss": 1.2437, "step": 11736 }, { "epoch": 3.495820845510899, "grad_norm": 0.28755104541778564, "learning_rate": 1.5099626933271589e-05, "loss": 1.2284, "step": 11737 }, { "epoch": 3.496118691710568, "grad_norm": 0.26792627573013306, "learning_rate": 1.5098797192919393e-05, "loss": 1.2304, "step": 11738 }, { "epoch": 3.4964165379102368, "grad_norm": 0.3042823374271393, "learning_rate": 1.5097967405128584e-05, "loss": 1.2291, "step": 11739 }, { "epoch": 3.496714384109905, "grad_norm": 0.2533160448074341, "learning_rate": 1.5097137569906885e-05, "loss": 1.2305, "step": 11740 }, { "epoch": 3.497012230309574, "grad_norm": 0.25064122676849365, "learning_rate": 1.5096307687262019e-05, "loss": 1.2438, "step": 11741 }, { "epoch": 3.4973100765092426, "grad_norm": 0.2505095601081848, "learning_rate": 1.50954777572017e-05, "loss": 1.2337, "step": 11742 }, { "epoch": 3.4976079227089114, "grad_norm": 0.23554089665412903, "learning_rate": 1.5094647779733659e-05, "loss": 1.236, "step": 11743 }, { "epoch": 3.4979057689085797, "grad_norm": 0.2355387657880783, "learning_rate": 1.5093817754865608e-05, "loss": 1.2295, "step": 11744 }, { "epoch": 3.4982036151082485, "grad_norm": 0.23906509578227997, "learning_rate": 1.509298768260527e-05, "loss": 1.2422, "step": 11745 }, { "epoch": 3.4985014613079173, "grad_norm": 0.2412572056055069, "learning_rate": 1.5092157562960377e-05, "loss": 1.2483, "step": 11746 }, { "epoch": 3.4987993075075856, "grad_norm": 0.2579009234905243, "learning_rate": 1.5091327395938646e-05, "loss": 1.2283, "step": 11747 }, { "epoch": 3.4990971537072544, "grad_norm": 0.2576175928115845, "learning_rate": 1.5090497181547803e-05, "loss": 1.2408, "step": 11748 }, { "epoch": 3.499394999906923, "grad_norm": 0.24004359543323517, "learning_rate": 1.5089666919795569e-05, "loss": 1.2273, "step": 11749 }, { "epoch": 3.4996928461065915, "grad_norm": 0.2498658001422882, "learning_rate": 1.5088836610689673e-05, "loss": 1.2422, "step": 11750 }, { "epoch": 3.4999906923062603, "grad_norm": 0.24315737187862396, "learning_rate": 1.5088006254237837e-05, "loss": 1.24, "step": 11751 }, { "epoch": 3.500288538505929, "grad_norm": 0.24262316524982452, "learning_rate": 1.5087175850447788e-05, "loss": 1.2516, "step": 11752 }, { "epoch": 3.500586384705598, "grad_norm": 0.2583785653114319, "learning_rate": 1.5086345399327252e-05, "loss": 1.2269, "step": 11753 }, { "epoch": 3.5008842309052666, "grad_norm": 0.2627702057361603, "learning_rate": 1.5085514900883953e-05, "loss": 1.2366, "step": 11754 }, { "epoch": 3.501182077104935, "grad_norm": 0.251803457736969, "learning_rate": 1.508468435512562e-05, "loss": 1.2325, "step": 11755 }, { "epoch": 3.5014799233046037, "grad_norm": 0.36371490359306335, "learning_rate": 1.508385376205998e-05, "loss": 1.2297, "step": 11756 }, { "epoch": 3.5017777695042724, "grad_norm": 0.3003302812576294, "learning_rate": 1.5083023121694762e-05, "loss": 1.2261, "step": 11757 }, { "epoch": 3.5020756157039408, "grad_norm": 0.2752167880535126, "learning_rate": 1.5082192434037693e-05, "loss": 1.2243, "step": 11758 }, { "epoch": 3.5023734619036095, "grad_norm": 0.3372878134250641, "learning_rate": 1.50813616990965e-05, "loss": 1.2072, "step": 11759 }, { "epoch": 3.5026713081032783, "grad_norm": 0.2392565906047821, "learning_rate": 1.5080530916878914e-05, "loss": 1.2414, "step": 11760 }, { "epoch": 3.5029691543029466, "grad_norm": 0.28664082288742065, "learning_rate": 1.5079700087392666e-05, "loss": 1.2423, "step": 11761 }, { "epoch": 3.5032670005026154, "grad_norm": 0.2399989664554596, "learning_rate": 1.5078869210645484e-05, "loss": 1.243, "step": 11762 }, { "epoch": 3.503564846702284, "grad_norm": 0.43447959423065186, "learning_rate": 1.5078038286645097e-05, "loss": 1.2458, "step": 11763 }, { "epoch": 3.5038626929019525, "grad_norm": 0.35170891880989075, "learning_rate": 1.5077207315399236e-05, "loss": 1.2263, "step": 11764 }, { "epoch": 3.5041605391016213, "grad_norm": 0.2994493544101715, "learning_rate": 1.5076376296915634e-05, "loss": 1.2373, "step": 11765 }, { "epoch": 3.50445838530129, "grad_norm": 0.24334172904491425, "learning_rate": 1.5075545231202022e-05, "loss": 1.2362, "step": 11766 }, { "epoch": 3.504756231500959, "grad_norm": 0.3043341040611267, "learning_rate": 1.5074714118266137e-05, "loss": 1.2485, "step": 11767 }, { "epoch": 3.5050540777006276, "grad_norm": 0.2556568384170532, "learning_rate": 1.5073882958115701e-05, "loss": 1.2582, "step": 11768 }, { "epoch": 3.505351923900296, "grad_norm": 0.25257766246795654, "learning_rate": 1.5073051750758456e-05, "loss": 1.2367, "step": 11769 }, { "epoch": 3.5056497700999647, "grad_norm": 0.23973365128040314, "learning_rate": 1.5072220496202132e-05, "loss": 1.2611, "step": 11770 }, { "epoch": 3.5059476162996335, "grad_norm": 0.2681906819343567, "learning_rate": 1.5071389194454464e-05, "loss": 1.2361, "step": 11771 }, { "epoch": 3.506245462499302, "grad_norm": 0.24991819262504578, "learning_rate": 1.5070557845523184e-05, "loss": 1.229, "step": 11772 }, { "epoch": 3.5065433086989706, "grad_norm": 0.25751763582229614, "learning_rate": 1.5069726449416029e-05, "loss": 1.2138, "step": 11773 }, { "epoch": 3.5068411548986393, "grad_norm": 0.25750166177749634, "learning_rate": 1.5068895006140733e-05, "loss": 1.2452, "step": 11774 }, { "epoch": 3.5071390010983077, "grad_norm": 0.24973426759243011, "learning_rate": 1.5068063515705033e-05, "loss": 1.2355, "step": 11775 }, { "epoch": 3.5074368472979764, "grad_norm": 0.25527223944664, "learning_rate": 1.5067231978116663e-05, "loss": 1.223, "step": 11776 }, { "epoch": 3.507734693497645, "grad_norm": 0.23963607847690582, "learning_rate": 1.5066400393383367e-05, "loss": 1.2361, "step": 11777 }, { "epoch": 3.5080325396973135, "grad_norm": 0.2547277510166168, "learning_rate": 1.506556876151287e-05, "loss": 1.2407, "step": 11778 }, { "epoch": 3.5083303858969823, "grad_norm": 0.266491174697876, "learning_rate": 1.5064737082512916e-05, "loss": 1.2274, "step": 11779 }, { "epoch": 3.508628232096651, "grad_norm": 0.28425759077072144, "learning_rate": 1.5063905356391242e-05, "loss": 1.2477, "step": 11780 }, { "epoch": 3.50892607829632, "grad_norm": 0.2597352862358093, "learning_rate": 1.5063073583155587e-05, "loss": 1.2271, "step": 11781 }, { "epoch": 3.5092239244959886, "grad_norm": 0.38104113936424255, "learning_rate": 1.5062241762813688e-05, "loss": 1.2261, "step": 11782 }, { "epoch": 3.509521770695657, "grad_norm": 0.3198084235191345, "learning_rate": 1.5061409895373284e-05, "loss": 1.2273, "step": 11783 }, { "epoch": 3.5098196168953257, "grad_norm": 0.2601906359195709, "learning_rate": 1.5060577980842121e-05, "loss": 1.2441, "step": 11784 }, { "epoch": 3.5101174630949945, "grad_norm": 0.504921555519104, "learning_rate": 1.5059746019227932e-05, "loss": 1.2329, "step": 11785 }, { "epoch": 3.510415309294663, "grad_norm": 0.33166366815567017, "learning_rate": 1.5058914010538454e-05, "loss": 1.2349, "step": 11786 }, { "epoch": 3.5107131554943316, "grad_norm": 0.28050699830055237, "learning_rate": 1.505808195478144e-05, "loss": 1.2389, "step": 11787 }, { "epoch": 3.5110110016940004, "grad_norm": 0.2675134539604187, "learning_rate": 1.5057249851964623e-05, "loss": 1.2242, "step": 11788 }, { "epoch": 3.5113088478936687, "grad_norm": 0.31375494599342346, "learning_rate": 1.5056417702095747e-05, "loss": 1.2162, "step": 11789 }, { "epoch": 3.5116066940933375, "grad_norm": 0.25974899530410767, "learning_rate": 1.5055585505182551e-05, "loss": 1.2269, "step": 11790 }, { "epoch": 3.5119045402930062, "grad_norm": 0.23940208554267883, "learning_rate": 1.5054753261232782e-05, "loss": 1.2511, "step": 11791 }, { "epoch": 3.5122023864926746, "grad_norm": 0.26865193247795105, "learning_rate": 1.505392097025418e-05, "loss": 1.2364, "step": 11792 }, { "epoch": 3.5125002326923433, "grad_norm": 0.2572293281555176, "learning_rate": 1.5053088632254491e-05, "loss": 1.2236, "step": 11793 }, { "epoch": 3.512798078892012, "grad_norm": 0.28694552183151245, "learning_rate": 1.505225624724146e-05, "loss": 1.2473, "step": 11794 }, { "epoch": 3.513095925091681, "grad_norm": 0.24849431216716766, "learning_rate": 1.5051423815222826e-05, "loss": 1.2368, "step": 11795 }, { "epoch": 3.5133937712913497, "grad_norm": 0.24320310354232788, "learning_rate": 1.505059133620634e-05, "loss": 1.2246, "step": 11796 }, { "epoch": 3.513691617491018, "grad_norm": 0.24401208758354187, "learning_rate": 1.5049758810199741e-05, "loss": 1.2251, "step": 11797 }, { "epoch": 3.5139894636906868, "grad_norm": 0.24516740441322327, "learning_rate": 1.504892623721078e-05, "loss": 1.2434, "step": 11798 }, { "epoch": 3.5142873098903555, "grad_norm": 0.23222769796848297, "learning_rate": 1.5048093617247201e-05, "loss": 1.2367, "step": 11799 }, { "epoch": 3.514585156090024, "grad_norm": 0.23739634454250336, "learning_rate": 1.5047260950316753e-05, "loss": 1.2285, "step": 11800 }, { "epoch": 3.5148830022896926, "grad_norm": 0.24186591804027557, "learning_rate": 1.5046428236427179e-05, "loss": 1.2326, "step": 11801 }, { "epoch": 3.5151808484893614, "grad_norm": 0.23322808742523193, "learning_rate": 1.504559547558623e-05, "loss": 1.2325, "step": 11802 }, { "epoch": 3.5154786946890297, "grad_norm": 0.2473326027393341, "learning_rate": 1.5044762667801651e-05, "loss": 1.2401, "step": 11803 }, { "epoch": 3.5157765408886985, "grad_norm": 0.25037267804145813, "learning_rate": 1.5043929813081191e-05, "loss": 1.2384, "step": 11804 }, { "epoch": 3.5160743870883673, "grad_norm": 0.24375185370445251, "learning_rate": 1.5043096911432602e-05, "loss": 1.2349, "step": 11805 }, { "epoch": 3.516372233288036, "grad_norm": 0.24941161274909973, "learning_rate": 1.5042263962863627e-05, "loss": 1.2385, "step": 11806 }, { "epoch": 3.5166700794877044, "grad_norm": 0.2311730533838272, "learning_rate": 1.5041430967382021e-05, "loss": 1.2287, "step": 11807 }, { "epoch": 3.516967925687373, "grad_norm": 0.23631060123443604, "learning_rate": 1.5040597924995535e-05, "loss": 1.2294, "step": 11808 }, { "epoch": 3.517265771887042, "grad_norm": 0.23170427978038788, "learning_rate": 1.5039764835711914e-05, "loss": 1.2253, "step": 11809 }, { "epoch": 3.5175636180867107, "grad_norm": 0.26978152990341187, "learning_rate": 1.5038931699538913e-05, "loss": 1.2459, "step": 11810 }, { "epoch": 3.517861464286379, "grad_norm": 0.23406825959682465, "learning_rate": 1.5038098516484283e-05, "loss": 1.234, "step": 11811 }, { "epoch": 3.518159310486048, "grad_norm": 0.2873719334602356, "learning_rate": 1.5037265286555776e-05, "loss": 1.2322, "step": 11812 }, { "epoch": 3.5184571566857166, "grad_norm": 0.25312504172325134, "learning_rate": 1.503643200976114e-05, "loss": 1.2379, "step": 11813 }, { "epoch": 3.518755002885385, "grad_norm": 0.2473198026418686, "learning_rate": 1.5035598686108132e-05, "loss": 1.2384, "step": 11814 }, { "epoch": 3.5190528490850537, "grad_norm": 0.27882787585258484, "learning_rate": 1.5034765315604506e-05, "loss": 1.2321, "step": 11815 }, { "epoch": 3.5193506952847224, "grad_norm": 0.25915205478668213, "learning_rate": 1.5033931898258013e-05, "loss": 1.2215, "step": 11816 }, { "epoch": 3.5196485414843908, "grad_norm": 0.2492315173149109, "learning_rate": 1.503309843407641e-05, "loss": 1.237, "step": 11817 }, { "epoch": 3.5199463876840595, "grad_norm": 0.24386049807071686, "learning_rate": 1.5032264923067448e-05, "loss": 1.2538, "step": 11818 }, { "epoch": 3.5202442338837283, "grad_norm": 0.2439318299293518, "learning_rate": 1.5031431365238884e-05, "loss": 1.2513, "step": 11819 }, { "epoch": 3.520542080083397, "grad_norm": 0.3076123893260956, "learning_rate": 1.5030597760598471e-05, "loss": 1.2317, "step": 11820 }, { "epoch": 3.520839926283066, "grad_norm": 0.275299996137619, "learning_rate": 1.5029764109153966e-05, "loss": 1.2382, "step": 11821 }, { "epoch": 3.521137772482734, "grad_norm": 0.27564939856529236, "learning_rate": 1.5028930410913125e-05, "loss": 1.2636, "step": 11822 }, { "epoch": 3.521435618682403, "grad_norm": 0.48066186904907227, "learning_rate": 1.502809666588371e-05, "loss": 1.2322, "step": 11823 }, { "epoch": 3.5217334648820717, "grad_norm": 0.3236466646194458, "learning_rate": 1.5027262874073466e-05, "loss": 1.2269, "step": 11824 }, { "epoch": 3.52203131108174, "grad_norm": 0.33280837535858154, "learning_rate": 1.5026429035490164e-05, "loss": 1.2329, "step": 11825 }, { "epoch": 3.522329157281409, "grad_norm": 0.24413232505321503, "learning_rate": 1.5025595150141554e-05, "loss": 1.2316, "step": 11826 }, { "epoch": 3.5226270034810776, "grad_norm": 0.5738617181777954, "learning_rate": 1.5024761218035394e-05, "loss": 1.2373, "step": 11827 }, { "epoch": 3.522924849680746, "grad_norm": 0.2744631767272949, "learning_rate": 1.5023927239179447e-05, "loss": 1.2423, "step": 11828 }, { "epoch": 3.5232226958804147, "grad_norm": 0.25785768032073975, "learning_rate": 1.502309321358147e-05, "loss": 1.2298, "step": 11829 }, { "epoch": 3.5235205420800835, "grad_norm": 0.24500633776187897, "learning_rate": 1.5022259141249222e-05, "loss": 1.2337, "step": 11830 }, { "epoch": 3.523818388279752, "grad_norm": 0.23993581533432007, "learning_rate": 1.5021425022190464e-05, "loss": 1.2275, "step": 11831 }, { "epoch": 3.5241162344794206, "grad_norm": 0.24274177849292755, "learning_rate": 1.5020590856412955e-05, "loss": 1.2303, "step": 11832 }, { "epoch": 3.5244140806790893, "grad_norm": 0.26262083649635315, "learning_rate": 1.501975664392446e-05, "loss": 1.2352, "step": 11833 }, { "epoch": 3.524711926878758, "grad_norm": 0.24239331483840942, "learning_rate": 1.5018922384732735e-05, "loss": 1.2191, "step": 11834 }, { "epoch": 3.525009773078427, "grad_norm": 0.23399677872657776, "learning_rate": 1.501808807884555e-05, "loss": 1.2316, "step": 11835 }, { "epoch": 3.525307619278095, "grad_norm": 0.23567579686641693, "learning_rate": 1.5017253726270658e-05, "loss": 1.2249, "step": 11836 }, { "epoch": 3.525605465477764, "grad_norm": 0.24142025411128998, "learning_rate": 1.5016419327015825e-05, "loss": 1.2236, "step": 11837 }, { "epoch": 3.5259033116774328, "grad_norm": 0.24926820397377014, "learning_rate": 1.5015584881088817e-05, "loss": 1.2257, "step": 11838 }, { "epoch": 3.526201157877101, "grad_norm": 0.23994338512420654, "learning_rate": 1.5014750388497393e-05, "loss": 1.2411, "step": 11839 }, { "epoch": 3.52649900407677, "grad_norm": 0.23732413351535797, "learning_rate": 1.501391584924932e-05, "loss": 1.2199, "step": 11840 }, { "epoch": 3.5267968502764386, "grad_norm": 0.23851914703845978, "learning_rate": 1.5013081263352362e-05, "loss": 1.2279, "step": 11841 }, { "epoch": 3.527094696476107, "grad_norm": 0.2557980716228485, "learning_rate": 1.5012246630814287e-05, "loss": 1.2335, "step": 11842 }, { "epoch": 3.5273925426757757, "grad_norm": 0.2455354630947113, "learning_rate": 1.5011411951642853e-05, "loss": 1.2399, "step": 11843 }, { "epoch": 3.5276903888754445, "grad_norm": 0.23361371457576752, "learning_rate": 1.5010577225845833e-05, "loss": 1.2157, "step": 11844 }, { "epoch": 3.527988235075113, "grad_norm": 0.23348510265350342, "learning_rate": 1.5009742453430987e-05, "loss": 1.206, "step": 11845 }, { "epoch": 3.5282860812747816, "grad_norm": 0.24507753551006317, "learning_rate": 1.5008907634406087e-05, "loss": 1.2339, "step": 11846 }, { "epoch": 3.5285839274744504, "grad_norm": 0.24981550872325897, "learning_rate": 1.5008072768778897e-05, "loss": 1.2443, "step": 11847 }, { "epoch": 3.528881773674119, "grad_norm": 0.24339215457439423, "learning_rate": 1.5007237856557184e-05, "loss": 1.2148, "step": 11848 }, { "epoch": 3.529179619873788, "grad_norm": 0.24039603769779205, "learning_rate": 1.500640289774872e-05, "loss": 1.2368, "step": 11849 }, { "epoch": 3.5294774660734562, "grad_norm": 0.24635085463523865, "learning_rate": 1.5005567892361269e-05, "loss": 1.2332, "step": 11850 }, { "epoch": 3.529775312273125, "grad_norm": 0.24283139407634735, "learning_rate": 1.5004732840402598e-05, "loss": 1.2405, "step": 11851 }, { "epoch": 3.530073158472794, "grad_norm": 0.23929741978645325, "learning_rate": 1.5003897741880484e-05, "loss": 1.233, "step": 11852 }, { "epoch": 3.530371004672462, "grad_norm": 0.23753710091114044, "learning_rate": 1.500306259680269e-05, "loss": 1.2248, "step": 11853 }, { "epoch": 3.530668850872131, "grad_norm": 0.24215765297412872, "learning_rate": 1.500222740517699e-05, "loss": 1.2295, "step": 11854 }, { "epoch": 3.5309666970717997, "grad_norm": 0.23550446331501007, "learning_rate": 1.5001392167011153e-05, "loss": 1.2213, "step": 11855 }, { "epoch": 3.531264543271468, "grad_norm": 0.23802022635936737, "learning_rate": 1.5000556882312948e-05, "loss": 1.2452, "step": 11856 }, { "epoch": 3.5315623894711368, "grad_norm": 0.2376105934381485, "learning_rate": 1.4999721551090148e-05, "loss": 1.2464, "step": 11857 }, { "epoch": 3.5318602356708055, "grad_norm": 0.23863351345062256, "learning_rate": 1.4998886173350527e-05, "loss": 1.2199, "step": 11858 }, { "epoch": 3.532158081870474, "grad_norm": 0.2474067658185959, "learning_rate": 1.4998050749101855e-05, "loss": 1.2257, "step": 11859 }, { "epoch": 3.5324559280701426, "grad_norm": 0.2507241666316986, "learning_rate": 1.4997215278351905e-05, "loss": 1.2277, "step": 11860 }, { "epoch": 3.5327537742698114, "grad_norm": 0.24643084406852722, "learning_rate": 1.4996379761108446e-05, "loss": 1.2503, "step": 11861 }, { "epoch": 3.53305162046948, "grad_norm": 0.24573387205600739, "learning_rate": 1.4995544197379259e-05, "loss": 1.2302, "step": 11862 }, { "epoch": 3.533349466669149, "grad_norm": 0.24578134715557098, "learning_rate": 1.4994708587172115e-05, "loss": 1.2589, "step": 11863 }, { "epoch": 3.5336473128688173, "grad_norm": 0.2325790375471115, "learning_rate": 1.4993872930494785e-05, "loss": 1.2398, "step": 11864 }, { "epoch": 3.533945159068486, "grad_norm": 0.2433117777109146, "learning_rate": 1.4993037227355052e-05, "loss": 1.2145, "step": 11865 }, { "epoch": 3.534243005268155, "grad_norm": 0.23357227444648743, "learning_rate": 1.4992201477760683e-05, "loss": 1.2572, "step": 11866 }, { "epoch": 3.534540851467823, "grad_norm": 0.23796075582504272, "learning_rate": 1.499136568171946e-05, "loss": 1.2398, "step": 11867 }, { "epoch": 3.534838697667492, "grad_norm": 0.23160098493099213, "learning_rate": 1.4990529839239149e-05, "loss": 1.2191, "step": 11868 }, { "epoch": 3.5351365438671607, "grad_norm": 0.23105956614017487, "learning_rate": 1.498969395032754e-05, "loss": 1.2316, "step": 11869 }, { "epoch": 3.535434390066829, "grad_norm": 0.2299727499485016, "learning_rate": 1.49888580149924e-05, "loss": 1.2294, "step": 11870 }, { "epoch": 3.535732236266498, "grad_norm": 0.24049155414104462, "learning_rate": 1.4988022033241511e-05, "loss": 1.2391, "step": 11871 }, { "epoch": 3.5360300824661666, "grad_norm": 0.24359889328479767, "learning_rate": 1.4987186005082653e-05, "loss": 1.2339, "step": 11872 }, { "epoch": 3.5363279286658353, "grad_norm": 0.24411973357200623, "learning_rate": 1.4986349930523599e-05, "loss": 1.2423, "step": 11873 }, { "epoch": 3.5366257748655037, "grad_norm": 0.2380755990743637, "learning_rate": 1.4985513809572127e-05, "loss": 1.2271, "step": 11874 }, { "epoch": 3.5369236210651724, "grad_norm": 0.24348989129066467, "learning_rate": 1.4984677642236021e-05, "loss": 1.2445, "step": 11875 }, { "epoch": 3.537221467264841, "grad_norm": 0.23694832623004913, "learning_rate": 1.4983841428523063e-05, "loss": 1.2291, "step": 11876 }, { "epoch": 3.53751931346451, "grad_norm": 0.25284677743911743, "learning_rate": 1.4983005168441023e-05, "loss": 1.2235, "step": 11877 }, { "epoch": 3.5378171596641783, "grad_norm": 0.23373016715049744, "learning_rate": 1.498216886199769e-05, "loss": 1.2136, "step": 11878 }, { "epoch": 3.538115005863847, "grad_norm": 0.2290879487991333, "learning_rate": 1.4981332509200842e-05, "loss": 1.225, "step": 11879 }, { "epoch": 3.538412852063516, "grad_norm": 0.23518285155296326, "learning_rate": 1.498049611005826e-05, "loss": 1.2389, "step": 11880 }, { "epoch": 3.538710698263184, "grad_norm": 0.24591416120529175, "learning_rate": 1.4979659664577727e-05, "loss": 1.2266, "step": 11881 }, { "epoch": 3.539008544462853, "grad_norm": 0.24626922607421875, "learning_rate": 1.497882317276702e-05, "loss": 1.244, "step": 11882 }, { "epoch": 3.5393063906625217, "grad_norm": 0.24175675213336945, "learning_rate": 1.4977986634633935e-05, "loss": 1.2439, "step": 11883 }, { "epoch": 3.53960423686219, "grad_norm": 0.24137884378433228, "learning_rate": 1.497715005018624e-05, "loss": 1.2444, "step": 11884 }, { "epoch": 3.539902083061859, "grad_norm": 0.2463780641555786, "learning_rate": 1.4976313419431725e-05, "loss": 1.2387, "step": 11885 }, { "epoch": 3.5401999292615276, "grad_norm": 0.24791429936885834, "learning_rate": 1.4975476742378173e-05, "loss": 1.2475, "step": 11886 }, { "epoch": 3.5404977754611964, "grad_norm": 0.24221408367156982, "learning_rate": 1.497464001903337e-05, "loss": 1.2574, "step": 11887 }, { "epoch": 3.540795621660865, "grad_norm": 0.24171023070812225, "learning_rate": 1.49738032494051e-05, "loss": 1.2451, "step": 11888 }, { "epoch": 3.5410934678605335, "grad_norm": 0.2393396496772766, "learning_rate": 1.4972966433501146e-05, "loss": 1.2319, "step": 11889 }, { "epoch": 3.5413913140602022, "grad_norm": 0.23181261122226715, "learning_rate": 1.49721295713293e-05, "loss": 1.233, "step": 11890 }, { "epoch": 3.541689160259871, "grad_norm": 0.23957259953022003, "learning_rate": 1.4971292662897339e-05, "loss": 1.2541, "step": 11891 }, { "epoch": 3.5419870064595393, "grad_norm": 0.2328466922044754, "learning_rate": 1.4970455708213055e-05, "loss": 1.2273, "step": 11892 }, { "epoch": 3.542284852659208, "grad_norm": 0.2393094003200531, "learning_rate": 1.4969618707284235e-05, "loss": 1.2389, "step": 11893 }, { "epoch": 3.542582698858877, "grad_norm": 0.2378576546907425, "learning_rate": 1.4968781660118662e-05, "loss": 1.2226, "step": 11894 }, { "epoch": 3.542880545058545, "grad_norm": 0.2518276870250702, "learning_rate": 1.496794456672413e-05, "loss": 1.2331, "step": 11895 }, { "epoch": 3.543178391258214, "grad_norm": 0.2509482800960541, "learning_rate": 1.4967107427108427e-05, "loss": 1.2249, "step": 11896 }, { "epoch": 3.5434762374578828, "grad_norm": 0.252467542886734, "learning_rate": 1.4966270241279333e-05, "loss": 1.2326, "step": 11897 }, { "epoch": 3.543774083657551, "grad_norm": 0.25453031063079834, "learning_rate": 1.4965433009244646e-05, "loss": 1.2448, "step": 11898 }, { "epoch": 3.54407192985722, "grad_norm": 0.23521898686885834, "learning_rate": 1.4964595731012153e-05, "loss": 1.2322, "step": 11899 }, { "epoch": 3.5443697760568886, "grad_norm": 0.25464823842048645, "learning_rate": 1.4963758406589644e-05, "loss": 1.2248, "step": 11900 }, { "epoch": 3.5446676222565574, "grad_norm": 0.23565281927585602, "learning_rate": 1.4962921035984907e-05, "loss": 1.2396, "step": 11901 }, { "epoch": 3.544965468456226, "grad_norm": 0.26107919216156006, "learning_rate": 1.4962083619205737e-05, "loss": 1.2437, "step": 11902 }, { "epoch": 3.5452633146558945, "grad_norm": 0.2406272143125534, "learning_rate": 1.496124615625992e-05, "loss": 1.2454, "step": 11903 }, { "epoch": 3.5455611608555633, "grad_norm": 0.24669715762138367, "learning_rate": 1.4960408647155254e-05, "loss": 1.2166, "step": 11904 }, { "epoch": 3.545859007055232, "grad_norm": 0.23928460478782654, "learning_rate": 1.4959571091899525e-05, "loss": 1.2254, "step": 11905 }, { "epoch": 3.5461568532549004, "grad_norm": 0.2638623118400574, "learning_rate": 1.4958733490500528e-05, "loss": 1.2453, "step": 11906 }, { "epoch": 3.546454699454569, "grad_norm": 0.2426176816225052, "learning_rate": 1.495789584296606e-05, "loss": 1.2443, "step": 11907 }, { "epoch": 3.546752545654238, "grad_norm": 0.2569550573825836, "learning_rate": 1.4957058149303905e-05, "loss": 1.2404, "step": 11908 }, { "epoch": 3.5470503918539062, "grad_norm": 0.2393750548362732, "learning_rate": 1.4956220409521862e-05, "loss": 1.2319, "step": 11909 }, { "epoch": 3.547348238053575, "grad_norm": 0.3033221960067749, "learning_rate": 1.4955382623627733e-05, "loss": 1.2341, "step": 11910 }, { "epoch": 3.547646084253244, "grad_norm": 0.26827922463417053, "learning_rate": 1.4954544791629299e-05, "loss": 1.2331, "step": 11911 }, { "epoch": 3.547943930452912, "grad_norm": 0.3209385871887207, "learning_rate": 1.4953706913534363e-05, "loss": 1.2247, "step": 11912 }, { "epoch": 3.548241776652581, "grad_norm": 0.32322075963020325, "learning_rate": 1.495286898935072e-05, "loss": 1.212, "step": 11913 }, { "epoch": 3.5485396228522497, "grad_norm": 0.293900728225708, "learning_rate": 1.4952031019086161e-05, "loss": 1.2294, "step": 11914 }, { "epoch": 3.5488374690519184, "grad_norm": 0.3369714319705963, "learning_rate": 1.4951193002748487e-05, "loss": 1.2514, "step": 11915 }, { "epoch": 3.549135315251587, "grad_norm": 0.26007702946662903, "learning_rate": 1.4950354940345494e-05, "loss": 1.252, "step": 11916 }, { "epoch": 3.5494331614512555, "grad_norm": 0.2527429461479187, "learning_rate": 1.4949516831884983e-05, "loss": 1.22, "step": 11917 }, { "epoch": 3.5497310076509243, "grad_norm": 0.29641637206077576, "learning_rate": 1.4948678677374746e-05, "loss": 1.2363, "step": 11918 }, { "epoch": 3.550028853850593, "grad_norm": 0.24928542971611023, "learning_rate": 1.494784047682258e-05, "loss": 1.2495, "step": 11919 }, { "epoch": 3.5503267000502614, "grad_norm": 0.26066356897354126, "learning_rate": 1.494700223023629e-05, "loss": 1.2314, "step": 11920 }, { "epoch": 3.55062454624993, "grad_norm": 0.22851091623306274, "learning_rate": 1.4946163937623668e-05, "loss": 1.2382, "step": 11921 }, { "epoch": 3.550922392449599, "grad_norm": 0.2403339147567749, "learning_rate": 1.4945325598992519e-05, "loss": 1.2143, "step": 11922 }, { "epoch": 3.5512202386492673, "grad_norm": 0.2572495937347412, "learning_rate": 1.494448721435064e-05, "loss": 1.2267, "step": 11923 }, { "epoch": 3.551518084848936, "grad_norm": 0.2491593360900879, "learning_rate": 1.4943648783705833e-05, "loss": 1.2413, "step": 11924 }, { "epoch": 3.551815931048605, "grad_norm": 0.27651259303092957, "learning_rate": 1.4942810307065895e-05, "loss": 1.2252, "step": 11925 }, { "epoch": 3.552113777248273, "grad_norm": 0.23403340578079224, "learning_rate": 1.4941971784438631e-05, "loss": 1.2177, "step": 11926 }, { "epoch": 3.552411623447942, "grad_norm": 0.25173115730285645, "learning_rate": 1.4941133215831842e-05, "loss": 1.2338, "step": 11927 }, { "epoch": 3.5527094696476107, "grad_norm": 0.2646122872829437, "learning_rate": 1.4940294601253327e-05, "loss": 1.2233, "step": 11928 }, { "epoch": 3.5530073158472795, "grad_norm": 0.23748041689395905, "learning_rate": 1.493945594071089e-05, "loss": 1.2511, "step": 11929 }, { "epoch": 3.5533051620469482, "grad_norm": 0.2807159423828125, "learning_rate": 1.4938617234212336e-05, "loss": 1.2441, "step": 11930 }, { "epoch": 3.5536030082466166, "grad_norm": 0.23702220618724823, "learning_rate": 1.4937778481765465e-05, "loss": 1.2379, "step": 11931 }, { "epoch": 3.5539008544462853, "grad_norm": 0.27692580223083496, "learning_rate": 1.4936939683378085e-05, "loss": 1.2328, "step": 11932 }, { "epoch": 3.554198700645954, "grad_norm": 0.24006317555904388, "learning_rate": 1.4936100839057992e-05, "loss": 1.2243, "step": 11933 }, { "epoch": 3.5544965468456224, "grad_norm": 0.2403414100408554, "learning_rate": 1.4935261948813e-05, "loss": 1.2273, "step": 11934 }, { "epoch": 3.554794393045291, "grad_norm": 0.2287728637456894, "learning_rate": 1.4934423012650906e-05, "loss": 1.2339, "step": 11935 }, { "epoch": 3.55509223924496, "grad_norm": 0.24130764603614807, "learning_rate": 1.4933584030579523e-05, "loss": 1.2199, "step": 11936 }, { "epoch": 3.5553900854446283, "grad_norm": 0.2393208146095276, "learning_rate": 1.4932745002606652e-05, "loss": 1.2486, "step": 11937 }, { "epoch": 3.555687931644297, "grad_norm": 0.24160076677799225, "learning_rate": 1.4931905928740097e-05, "loss": 1.2314, "step": 11938 }, { "epoch": 3.555985777843966, "grad_norm": 0.2669520378112793, "learning_rate": 1.4931066808987668e-05, "loss": 1.2264, "step": 11939 }, { "epoch": 3.5562836240436346, "grad_norm": 0.24485118687152863, "learning_rate": 1.4930227643357174e-05, "loss": 1.254, "step": 11940 }, { "epoch": 3.556581470243303, "grad_norm": 0.2909088134765625, "learning_rate": 1.492938843185642e-05, "loss": 1.237, "step": 11941 }, { "epoch": 3.5568793164429717, "grad_norm": 0.2594226002693176, "learning_rate": 1.4928549174493214e-05, "loss": 1.2244, "step": 11942 }, { "epoch": 3.5571771626426405, "grad_norm": 0.26096615195274353, "learning_rate": 1.4927709871275361e-05, "loss": 1.232, "step": 11943 }, { "epoch": 3.5574750088423093, "grad_norm": 0.24414198100566864, "learning_rate": 1.4926870522210676e-05, "loss": 1.2366, "step": 11944 }, { "epoch": 3.5577728550419776, "grad_norm": 0.30891233682632446, "learning_rate": 1.4926031127306965e-05, "loss": 1.2288, "step": 11945 }, { "epoch": 3.5580707012416464, "grad_norm": 0.31807658076286316, "learning_rate": 1.492519168657204e-05, "loss": 1.2247, "step": 11946 }, { "epoch": 3.558368547441315, "grad_norm": 0.24587668478488922, "learning_rate": 1.4924352200013706e-05, "loss": 1.2351, "step": 11947 }, { "epoch": 3.5586663936409835, "grad_norm": 0.3405172526836395, "learning_rate": 1.4923512667639778e-05, "loss": 1.2371, "step": 11948 }, { "epoch": 3.5589642398406522, "grad_norm": 0.280850350856781, "learning_rate": 1.4922673089458066e-05, "loss": 1.238, "step": 11949 }, { "epoch": 3.559262086040321, "grad_norm": 0.2465774565935135, "learning_rate": 1.492183346547638e-05, "loss": 1.2234, "step": 11950 }, { "epoch": 3.5595599322399893, "grad_norm": 0.23266535997390747, "learning_rate": 1.4920993795702535e-05, "loss": 1.2268, "step": 11951 }, { "epoch": 3.559857778439658, "grad_norm": 0.27076929807662964, "learning_rate": 1.4920154080144337e-05, "loss": 1.2398, "step": 11952 }, { "epoch": 3.560155624639327, "grad_norm": 0.25998640060424805, "learning_rate": 1.4919314318809603e-05, "loss": 1.2302, "step": 11953 }, { "epoch": 3.5604534708389957, "grad_norm": 0.254976361989975, "learning_rate": 1.491847451170615e-05, "loss": 1.2367, "step": 11954 }, { "epoch": 3.5607513170386644, "grad_norm": 0.28322747349739075, "learning_rate": 1.4917634658841782e-05, "loss": 1.236, "step": 11955 }, { "epoch": 3.5610491632383328, "grad_norm": 0.24852986633777618, "learning_rate": 1.4916794760224318e-05, "loss": 1.2285, "step": 11956 }, { "epoch": 3.5613470094380015, "grad_norm": 0.23009903728961945, "learning_rate": 1.4915954815861572e-05, "loss": 1.2479, "step": 11957 }, { "epoch": 3.5616448556376703, "grad_norm": 0.2647802233695984, "learning_rate": 1.491511482576136e-05, "loss": 1.2466, "step": 11958 }, { "epoch": 3.5619427018373386, "grad_norm": 0.23196657001972198, "learning_rate": 1.4914274789931498e-05, "loss": 1.2254, "step": 11959 }, { "epoch": 3.5622405480370074, "grad_norm": 0.30095914006233215, "learning_rate": 1.4913434708379796e-05, "loss": 1.2454, "step": 11960 }, { "epoch": 3.562538394236676, "grad_norm": 0.31365150213241577, "learning_rate": 1.4912594581114077e-05, "loss": 1.2291, "step": 11961 }, { "epoch": 3.5628362404363445, "grad_norm": 0.24199581146240234, "learning_rate": 1.4911754408142152e-05, "loss": 1.2305, "step": 11962 }, { "epoch": 3.5631340866360133, "grad_norm": 0.3605521023273468, "learning_rate": 1.4910914189471841e-05, "loss": 1.2339, "step": 11963 }, { "epoch": 3.563431932835682, "grad_norm": 0.31536492705345154, "learning_rate": 1.491007392511096e-05, "loss": 1.2368, "step": 11964 }, { "epoch": 3.5637297790353504, "grad_norm": 0.2886471450328827, "learning_rate": 1.4909233615067326e-05, "loss": 1.2269, "step": 11965 }, { "epoch": 3.564027625235019, "grad_norm": 0.4995361268520355, "learning_rate": 1.4908393259348761e-05, "loss": 1.2336, "step": 11966 }, { "epoch": 3.564325471434688, "grad_norm": 0.30417266488075256, "learning_rate": 1.4907552857963077e-05, "loss": 1.2468, "step": 11967 }, { "epoch": 3.5646233176343567, "grad_norm": 0.3080073297023773, "learning_rate": 1.49067124109181e-05, "loss": 1.2303, "step": 11968 }, { "epoch": 3.5649211638340255, "grad_norm": 0.22961090505123138, "learning_rate": 1.4905871918221643e-05, "loss": 1.2319, "step": 11969 }, { "epoch": 3.565219010033694, "grad_norm": 0.3508792519569397, "learning_rate": 1.490503137988153e-05, "loss": 1.2466, "step": 11970 }, { "epoch": 3.5655168562333626, "grad_norm": 0.24273687601089478, "learning_rate": 1.4904190795905584e-05, "loss": 1.2238, "step": 11971 }, { "epoch": 3.5658147024330313, "grad_norm": 0.25304114818573, "learning_rate": 1.4903350166301618e-05, "loss": 1.217, "step": 11972 }, { "epoch": 3.5661125486326997, "grad_norm": 0.24923963844776154, "learning_rate": 1.4902509491077458e-05, "loss": 1.2345, "step": 11973 }, { "epoch": 3.5664103948323684, "grad_norm": 0.25437408685684204, "learning_rate": 1.490166877024092e-05, "loss": 1.2366, "step": 11974 }, { "epoch": 3.566708241032037, "grad_norm": 0.2828586995601654, "learning_rate": 1.4900828003799839e-05, "loss": 1.2372, "step": 11975 }, { "epoch": 3.5670060872317055, "grad_norm": 0.2456802874803543, "learning_rate": 1.4899987191762022e-05, "loss": 1.2166, "step": 11976 }, { "epoch": 3.5673039334313743, "grad_norm": 0.27970948815345764, "learning_rate": 1.48991463341353e-05, "loss": 1.2434, "step": 11977 }, { "epoch": 3.567601779631043, "grad_norm": 0.2370612472295761, "learning_rate": 1.48983054309275e-05, "loss": 1.2381, "step": 11978 }, { "epoch": 3.5678996258307114, "grad_norm": 0.24851901829242706, "learning_rate": 1.4897464482146435e-05, "loss": 1.2598, "step": 11979 }, { "epoch": 3.56819747203038, "grad_norm": 0.2595925033092499, "learning_rate": 1.4896623487799937e-05, "loss": 1.2323, "step": 11980 }, { "epoch": 3.568495318230049, "grad_norm": 0.2781471610069275, "learning_rate": 1.4895782447895828e-05, "loss": 1.2503, "step": 11981 }, { "epoch": 3.5687931644297177, "grad_norm": 0.2639451026916504, "learning_rate": 1.4894941362441935e-05, "loss": 1.243, "step": 11982 }, { "epoch": 3.5690910106293865, "grad_norm": 0.2645019292831421, "learning_rate": 1.489410023144608e-05, "loss": 1.2386, "step": 11983 }, { "epoch": 3.569388856829055, "grad_norm": 0.25453054904937744, "learning_rate": 1.489325905491609e-05, "loss": 1.2409, "step": 11984 }, { "epoch": 3.5696867030287236, "grad_norm": 0.24270634353160858, "learning_rate": 1.4892417832859792e-05, "loss": 1.2437, "step": 11985 }, { "epoch": 3.5699845492283924, "grad_norm": 0.26533761620521545, "learning_rate": 1.489157656528501e-05, "loss": 1.2184, "step": 11986 }, { "epoch": 3.5702823954280607, "grad_norm": 0.254607230424881, "learning_rate": 1.4890735252199578e-05, "loss": 1.2329, "step": 11987 }, { "epoch": 3.5705802416277295, "grad_norm": 0.27076947689056396, "learning_rate": 1.4889893893611317e-05, "loss": 1.2327, "step": 11988 }, { "epoch": 3.5708780878273982, "grad_norm": 0.25077369809150696, "learning_rate": 1.4889052489528055e-05, "loss": 1.2387, "step": 11989 }, { "epoch": 3.5711759340270666, "grad_norm": 0.2612989842891693, "learning_rate": 1.4888211039957623e-05, "loss": 1.2494, "step": 11990 }, { "epoch": 3.5714737802267353, "grad_norm": 0.2401239573955536, "learning_rate": 1.4887369544907848e-05, "loss": 1.2238, "step": 11991 }, { "epoch": 3.571771626426404, "grad_norm": 0.24555446207523346, "learning_rate": 1.488652800438656e-05, "loss": 1.2347, "step": 11992 }, { "epoch": 3.5720694726260724, "grad_norm": 0.2627737522125244, "learning_rate": 1.4885686418401588e-05, "loss": 1.2241, "step": 11993 }, { "epoch": 3.572367318825741, "grad_norm": 0.2304520308971405, "learning_rate": 1.4884844786960763e-05, "loss": 1.2111, "step": 11994 }, { "epoch": 3.57266516502541, "grad_norm": 0.23122353851795197, "learning_rate": 1.4884003110071918e-05, "loss": 1.2239, "step": 11995 }, { "epoch": 3.5729630112250788, "grad_norm": 0.23823879659175873, "learning_rate": 1.4883161387742877e-05, "loss": 1.2433, "step": 11996 }, { "epoch": 3.5732608574247475, "grad_norm": 0.2689054012298584, "learning_rate": 1.4882319619981473e-05, "loss": 1.2238, "step": 11997 }, { "epoch": 3.573558703624416, "grad_norm": 0.3108246326446533, "learning_rate": 1.4881477806795543e-05, "loss": 1.2272, "step": 11998 }, { "epoch": 3.5738565498240846, "grad_norm": 0.2484017014503479, "learning_rate": 1.4880635948192918e-05, "loss": 1.1908, "step": 11999 }, { "epoch": 3.5741543960237534, "grad_norm": 0.28578975796699524, "learning_rate": 1.4879794044181426e-05, "loss": 1.2269, "step": 12000 }, { "epoch": 3.5741543960237534, "eval_loss": 1.331367015838623, "eval_runtime": 20.7914, "eval_samples_per_second": 83.4, "eval_steps_per_second": 5.243, "step": 12000 }, { "epoch": 3.5744522422234217, "grad_norm": 0.2474474161863327, "learning_rate": 1.4878952094768904e-05, "loss": 1.2152, "step": 12001 }, { "epoch": 3.5747500884230905, "grad_norm": 0.24221831560134888, "learning_rate": 1.4878110099963181e-05, "loss": 1.223, "step": 12002 }, { "epoch": 3.5750479346227593, "grad_norm": 0.24997448921203613, "learning_rate": 1.4877268059772094e-05, "loss": 1.2321, "step": 12003 }, { "epoch": 3.5753457808224276, "grad_norm": 0.24379056692123413, "learning_rate": 1.487642597420348e-05, "loss": 1.2337, "step": 12004 }, { "epoch": 3.5756436270220964, "grad_norm": 0.266418993473053, "learning_rate": 1.4875583843265169e-05, "loss": 1.2307, "step": 12005 }, { "epoch": 3.575941473221765, "grad_norm": 0.304832398891449, "learning_rate": 1.4874741666964999e-05, "loss": 1.2186, "step": 12006 }, { "epoch": 3.576239319421434, "grad_norm": 0.24056296050548553, "learning_rate": 1.48738994453108e-05, "loss": 1.2266, "step": 12007 }, { "epoch": 3.5765371656211022, "grad_norm": 0.2905622720718384, "learning_rate": 1.4873057178310414e-05, "loss": 1.2372, "step": 12008 }, { "epoch": 3.576835011820771, "grad_norm": 0.40158170461654663, "learning_rate": 1.4872214865971677e-05, "loss": 1.2298, "step": 12009 }, { "epoch": 3.57713285802044, "grad_norm": 0.3111315071582794, "learning_rate": 1.4871372508302422e-05, "loss": 1.2395, "step": 12010 }, { "epoch": 3.5774307042201086, "grad_norm": 0.26447662711143494, "learning_rate": 1.487053010531049e-05, "loss": 1.2399, "step": 12011 }, { "epoch": 3.577728550419777, "grad_norm": 0.4011225700378418, "learning_rate": 1.486968765700372e-05, "loss": 1.2257, "step": 12012 }, { "epoch": 3.5780263966194457, "grad_norm": 0.30219927430152893, "learning_rate": 1.4868845163389942e-05, "loss": 1.2365, "step": 12013 }, { "epoch": 3.5783242428191144, "grad_norm": 0.2629011571407318, "learning_rate": 1.4868002624477003e-05, "loss": 1.2261, "step": 12014 }, { "epoch": 3.5786220890187828, "grad_norm": 0.29616492986679077, "learning_rate": 1.4867160040272732e-05, "loss": 1.2259, "step": 12015 }, { "epoch": 3.5789199352184515, "grad_norm": 0.2366276979446411, "learning_rate": 1.4866317410784982e-05, "loss": 1.2312, "step": 12016 }, { "epoch": 3.5792177814181203, "grad_norm": 0.28599026799201965, "learning_rate": 1.4865474736021584e-05, "loss": 1.2182, "step": 12017 }, { "epoch": 3.5795156276177886, "grad_norm": 0.26458537578582764, "learning_rate": 1.4864632015990377e-05, "loss": 1.2507, "step": 12018 }, { "epoch": 3.5798134738174574, "grad_norm": 0.2551570534706116, "learning_rate": 1.486378925069921e-05, "loss": 1.2448, "step": 12019 }, { "epoch": 3.580111320017126, "grad_norm": 0.2608723044395447, "learning_rate": 1.486294644015591e-05, "loss": 1.2223, "step": 12020 }, { "epoch": 3.580409166216795, "grad_norm": 0.2454732060432434, "learning_rate": 1.486210358436833e-05, "loss": 1.2234, "step": 12021 }, { "epoch": 3.5807070124164637, "grad_norm": 0.3151560425758362, "learning_rate": 1.486126068334431e-05, "loss": 1.2203, "step": 12022 }, { "epoch": 3.581004858616132, "grad_norm": 0.25535303354263306, "learning_rate": 1.486041773709169e-05, "loss": 1.24, "step": 12023 }, { "epoch": 3.581302704815801, "grad_norm": 0.2909095287322998, "learning_rate": 1.4859574745618312e-05, "loss": 1.2474, "step": 12024 }, { "epoch": 3.5816005510154696, "grad_norm": 0.2343895584344864, "learning_rate": 1.4858731708932022e-05, "loss": 1.2393, "step": 12025 }, { "epoch": 3.581898397215138, "grad_norm": 0.5344161987304688, "learning_rate": 1.4857888627040662e-05, "loss": 1.2285, "step": 12026 }, { "epoch": 3.5821962434148067, "grad_norm": 0.31793347001075745, "learning_rate": 1.4857045499952075e-05, "loss": 1.2365, "step": 12027 }, { "epoch": 3.5824940896144755, "grad_norm": 0.24410374462604523, "learning_rate": 1.4856202327674107e-05, "loss": 1.2378, "step": 12028 }, { "epoch": 3.582791935814144, "grad_norm": 0.24520458281040192, "learning_rate": 1.4855359110214602e-05, "loss": 1.2227, "step": 12029 }, { "epoch": 3.5830897820138126, "grad_norm": 0.24458833038806915, "learning_rate": 1.4854515847581405e-05, "loss": 1.245, "step": 12030 }, { "epoch": 3.5833876282134813, "grad_norm": 0.24848031997680664, "learning_rate": 1.485367253978236e-05, "loss": 1.2436, "step": 12031 }, { "epoch": 3.5836854744131497, "grad_norm": 0.24862825870513916, "learning_rate": 1.4852829186825318e-05, "loss": 1.2477, "step": 12032 }, { "epoch": 3.5839833206128184, "grad_norm": 0.22774432599544525, "learning_rate": 1.485198578871812e-05, "loss": 1.227, "step": 12033 }, { "epoch": 3.584281166812487, "grad_norm": 0.24232453107833862, "learning_rate": 1.4851142345468614e-05, "loss": 1.2433, "step": 12034 }, { "epoch": 3.584579013012156, "grad_norm": 0.24190890789031982, "learning_rate": 1.485029885708465e-05, "loss": 1.246, "step": 12035 }, { "epoch": 3.5848768592118248, "grad_norm": 0.2451910376548767, "learning_rate": 1.4849455323574078e-05, "loss": 1.2342, "step": 12036 }, { "epoch": 3.585174705411493, "grad_norm": 0.243106871843338, "learning_rate": 1.4848611744944739e-05, "loss": 1.2261, "step": 12037 }, { "epoch": 3.585472551611162, "grad_norm": 0.24494658410549164, "learning_rate": 1.4847768121204482e-05, "loss": 1.246, "step": 12038 }, { "epoch": 3.5857703978108306, "grad_norm": 0.23604218661785126, "learning_rate": 1.484692445236116e-05, "loss": 1.2077, "step": 12039 }, { "epoch": 3.586068244010499, "grad_norm": 0.23879358172416687, "learning_rate": 1.4846080738422626e-05, "loss": 1.2264, "step": 12040 }, { "epoch": 3.5863660902101677, "grad_norm": 0.23756951093673706, "learning_rate": 1.4845236979396722e-05, "loss": 1.2054, "step": 12041 }, { "epoch": 3.5866639364098365, "grad_norm": 0.25303810834884644, "learning_rate": 1.48443931752913e-05, "loss": 1.2401, "step": 12042 }, { "epoch": 3.586961782609505, "grad_norm": 0.24577119946479797, "learning_rate": 1.4843549326114217e-05, "loss": 1.2492, "step": 12043 }, { "epoch": 3.5872596288091736, "grad_norm": 0.24664612114429474, "learning_rate": 1.4842705431873312e-05, "loss": 1.2286, "step": 12044 }, { "epoch": 3.5875574750088424, "grad_norm": 0.23486971855163574, "learning_rate": 1.4841861492576448e-05, "loss": 1.2246, "step": 12045 }, { "epoch": 3.5878553212085107, "grad_norm": 0.24063166975975037, "learning_rate": 1.484101750823147e-05, "loss": 1.2258, "step": 12046 }, { "epoch": 3.5881531674081795, "grad_norm": 0.25894486904144287, "learning_rate": 1.4840173478846236e-05, "loss": 1.2471, "step": 12047 }, { "epoch": 3.5884510136078482, "grad_norm": 0.22805535793304443, "learning_rate": 1.4839329404428593e-05, "loss": 1.2395, "step": 12048 }, { "epoch": 3.588748859807517, "grad_norm": 0.2343277782201767, "learning_rate": 1.4838485284986398e-05, "loss": 1.2377, "step": 12049 }, { "epoch": 3.589046706007186, "grad_norm": 0.24033649265766144, "learning_rate": 1.4837641120527502e-05, "loss": 1.2191, "step": 12050 }, { "epoch": 3.589344552206854, "grad_norm": 0.23253053426742554, "learning_rate": 1.483679691105976e-05, "loss": 1.2205, "step": 12051 }, { "epoch": 3.589642398406523, "grad_norm": 0.23812629282474518, "learning_rate": 1.4835952656591028e-05, "loss": 1.2238, "step": 12052 }, { "epoch": 3.5899402446061917, "grad_norm": 0.23149777948856354, "learning_rate": 1.4835108357129162e-05, "loss": 1.2257, "step": 12053 }, { "epoch": 3.59023809080586, "grad_norm": 0.2525610327720642, "learning_rate": 1.4834264012682012e-05, "loss": 1.2357, "step": 12054 }, { "epoch": 3.5905359370055288, "grad_norm": 0.24097470939159393, "learning_rate": 1.4833419623257437e-05, "loss": 1.2198, "step": 12055 }, { "epoch": 3.5908337832051975, "grad_norm": 0.2267192006111145, "learning_rate": 1.483257518886329e-05, "loss": 1.2163, "step": 12056 }, { "epoch": 3.591131629404866, "grad_norm": 0.24324989318847656, "learning_rate": 1.4831730709507436e-05, "loss": 1.2298, "step": 12057 }, { "epoch": 3.5914294756045346, "grad_norm": 0.25552991032600403, "learning_rate": 1.4830886185197719e-05, "loss": 1.2338, "step": 12058 }, { "epoch": 3.5917273218042034, "grad_norm": 0.24068230390548706, "learning_rate": 1.4830041615942008e-05, "loss": 1.2358, "step": 12059 }, { "epoch": 3.5920251680038717, "grad_norm": 0.2373005896806717, "learning_rate": 1.4829197001748156e-05, "loss": 1.226, "step": 12060 }, { "epoch": 3.5923230142035405, "grad_norm": 0.25316908955574036, "learning_rate": 1.4828352342624021e-05, "loss": 1.2327, "step": 12061 }, { "epoch": 3.5926208604032093, "grad_norm": 0.2392289936542511, "learning_rate": 1.4827507638577459e-05, "loss": 1.2406, "step": 12062 }, { "epoch": 3.592918706602878, "grad_norm": 0.23246882855892181, "learning_rate": 1.4826662889616335e-05, "loss": 1.2367, "step": 12063 }, { "epoch": 3.593216552802547, "grad_norm": 0.2436981499195099, "learning_rate": 1.4825818095748505e-05, "loss": 1.2418, "step": 12064 }, { "epoch": 3.593514399002215, "grad_norm": 0.23481963574886322, "learning_rate": 1.4824973256981831e-05, "loss": 1.2362, "step": 12065 }, { "epoch": 3.593812245201884, "grad_norm": 0.23828721046447754, "learning_rate": 1.4824128373324168e-05, "loss": 1.2354, "step": 12066 }, { "epoch": 3.5941100914015527, "grad_norm": 0.23842285573482513, "learning_rate": 1.4823283444783383e-05, "loss": 1.2503, "step": 12067 }, { "epoch": 3.594407937601221, "grad_norm": 0.23747295141220093, "learning_rate": 1.4822438471367333e-05, "loss": 1.2382, "step": 12068 }, { "epoch": 3.59470578380089, "grad_norm": 0.24500344693660736, "learning_rate": 1.4821593453083883e-05, "loss": 1.229, "step": 12069 }, { "epoch": 3.5950036300005586, "grad_norm": 0.2548135817050934, "learning_rate": 1.4820748389940894e-05, "loss": 1.2261, "step": 12070 }, { "epoch": 3.595301476200227, "grad_norm": 0.2509630024433136, "learning_rate": 1.4819903281946224e-05, "loss": 1.247, "step": 12071 }, { "epoch": 3.5955993223998957, "grad_norm": 0.23769141733646393, "learning_rate": 1.481905812910774e-05, "loss": 1.2391, "step": 12072 }, { "epoch": 3.5958971685995644, "grad_norm": 0.23118427395820618, "learning_rate": 1.4818212931433305e-05, "loss": 1.2214, "step": 12073 }, { "epoch": 3.596195014799233, "grad_norm": 0.2310144454240799, "learning_rate": 1.481736768893078e-05, "loss": 1.2389, "step": 12074 }, { "epoch": 3.5964928609989015, "grad_norm": 0.24245847761631012, "learning_rate": 1.481652240160803e-05, "loss": 1.2408, "step": 12075 }, { "epoch": 3.5967907071985703, "grad_norm": 0.2524879574775696, "learning_rate": 1.4815677069472924e-05, "loss": 1.2236, "step": 12076 }, { "epoch": 3.597088553398239, "grad_norm": 0.23940180242061615, "learning_rate": 1.4814831692533322e-05, "loss": 1.2249, "step": 12077 }, { "epoch": 3.597386399597908, "grad_norm": 0.24941429495811462, "learning_rate": 1.4813986270797091e-05, "loss": 1.2277, "step": 12078 }, { "epoch": 3.597684245797576, "grad_norm": 0.24452979862689972, "learning_rate": 1.4813140804272097e-05, "loss": 1.204, "step": 12079 }, { "epoch": 3.597982091997245, "grad_norm": 0.2458629608154297, "learning_rate": 1.4812295292966201e-05, "loss": 1.2436, "step": 12080 }, { "epoch": 3.5982799381969137, "grad_norm": 0.2509615421295166, "learning_rate": 1.481144973688728e-05, "loss": 1.2353, "step": 12081 }, { "epoch": 3.598577784396582, "grad_norm": 0.26614630222320557, "learning_rate": 1.481060413604319e-05, "loss": 1.2352, "step": 12082 }, { "epoch": 3.598875630596251, "grad_norm": 0.2650357782840729, "learning_rate": 1.4809758490441806e-05, "loss": 1.2298, "step": 12083 }, { "epoch": 3.5991734767959196, "grad_norm": 0.2656654417514801, "learning_rate": 1.4808912800090994e-05, "loss": 1.2305, "step": 12084 }, { "epoch": 3.599471322995588, "grad_norm": 0.2687116265296936, "learning_rate": 1.4808067064998618e-05, "loss": 1.2336, "step": 12085 }, { "epoch": 3.5997691691952567, "grad_norm": 0.24685779213905334, "learning_rate": 1.4807221285172548e-05, "loss": 1.2444, "step": 12086 }, { "epoch": 3.6000670153949255, "grad_norm": 0.2496700882911682, "learning_rate": 1.4806375460620661e-05, "loss": 1.2561, "step": 12087 }, { "epoch": 3.6003648615945942, "grad_norm": 0.25268059968948364, "learning_rate": 1.4805529591350816e-05, "loss": 1.232, "step": 12088 }, { "epoch": 3.600662707794263, "grad_norm": 0.23773519694805145, "learning_rate": 1.480468367737089e-05, "loss": 1.2568, "step": 12089 }, { "epoch": 3.6009605539939313, "grad_norm": 0.2564716637134552, "learning_rate": 1.4803837718688746e-05, "loss": 1.2215, "step": 12090 }, { "epoch": 3.6012584001936, "grad_norm": 0.2356421947479248, "learning_rate": 1.4802991715312262e-05, "loss": 1.2314, "step": 12091 }, { "epoch": 3.601556246393269, "grad_norm": 0.2563536465167999, "learning_rate": 1.4802145667249308e-05, "loss": 1.2069, "step": 12092 }, { "epoch": 3.601854092592937, "grad_norm": 0.25942927598953247, "learning_rate": 1.4801299574507753e-05, "loss": 1.207, "step": 12093 }, { "epoch": 3.602151938792606, "grad_norm": 0.28995251655578613, "learning_rate": 1.480045343709547e-05, "loss": 1.2312, "step": 12094 }, { "epoch": 3.6024497849922748, "grad_norm": 0.2602880299091339, "learning_rate": 1.479960725502033e-05, "loss": 1.2258, "step": 12095 }, { "epoch": 3.602747631191943, "grad_norm": 0.23588363826274872, "learning_rate": 1.4798761028290207e-05, "loss": 1.2302, "step": 12096 }, { "epoch": 3.603045477391612, "grad_norm": 0.270133376121521, "learning_rate": 1.4797914756912974e-05, "loss": 1.2342, "step": 12097 }, { "epoch": 3.6033433235912806, "grad_norm": 0.25792115926742554, "learning_rate": 1.4797068440896506e-05, "loss": 1.2403, "step": 12098 }, { "epoch": 3.603641169790949, "grad_norm": 0.24684999883174896, "learning_rate": 1.4796222080248675e-05, "loss": 1.2292, "step": 12099 }, { "epoch": 3.6039390159906177, "grad_norm": 0.2691803276538849, "learning_rate": 1.4795375674977354e-05, "loss": 1.2436, "step": 12100 }, { "epoch": 3.6042368621902865, "grad_norm": 0.2561197876930237, "learning_rate": 1.4794529225090423e-05, "loss": 1.228, "step": 12101 }, { "epoch": 3.6045347083899553, "grad_norm": 0.2439146190881729, "learning_rate": 1.4793682730595756e-05, "loss": 1.243, "step": 12102 }, { "epoch": 3.604832554589624, "grad_norm": 0.26815205812454224, "learning_rate": 1.479283619150122e-05, "loss": 1.2318, "step": 12103 }, { "epoch": 3.6051304007892924, "grad_norm": 0.2403976321220398, "learning_rate": 1.4791989607814703e-05, "loss": 1.235, "step": 12104 }, { "epoch": 3.605428246988961, "grad_norm": 0.26826152205467224, "learning_rate": 1.4791142979544077e-05, "loss": 1.2315, "step": 12105 }, { "epoch": 3.60572609318863, "grad_norm": 0.3150516450405121, "learning_rate": 1.4790296306697219e-05, "loss": 1.2262, "step": 12106 }, { "epoch": 3.6060239393882982, "grad_norm": 0.23319309949874878, "learning_rate": 1.4789449589282004e-05, "loss": 1.2212, "step": 12107 }, { "epoch": 3.606321785587967, "grad_norm": 0.4288104176521301, "learning_rate": 1.478860282730631e-05, "loss": 1.2559, "step": 12108 }, { "epoch": 3.606619631787636, "grad_norm": 0.34519460797309875, "learning_rate": 1.4787756020778022e-05, "loss": 1.2356, "step": 12109 }, { "epoch": 3.606917477987304, "grad_norm": 0.3091725707054138, "learning_rate": 1.4786909169705008e-05, "loss": 1.221, "step": 12110 }, { "epoch": 3.607215324186973, "grad_norm": 0.27726104855537415, "learning_rate": 1.4786062274095159e-05, "loss": 1.2286, "step": 12111 }, { "epoch": 3.6075131703866417, "grad_norm": 0.33248814940452576, "learning_rate": 1.4785215333956342e-05, "loss": 1.2243, "step": 12112 }, { "epoch": 3.60781101658631, "grad_norm": 0.2328971028327942, "learning_rate": 1.4784368349296446e-05, "loss": 1.2258, "step": 12113 }, { "epoch": 3.6081088627859788, "grad_norm": 0.2641240358352661, "learning_rate": 1.4783521320123347e-05, "loss": 1.2343, "step": 12114 }, { "epoch": 3.6084067089856475, "grad_norm": 0.29334887862205505, "learning_rate": 1.4782674246444924e-05, "loss": 1.2187, "step": 12115 }, { "epoch": 3.6087045551853163, "grad_norm": 0.23718927800655365, "learning_rate": 1.4781827128269062e-05, "loss": 1.2179, "step": 12116 }, { "epoch": 3.609002401384985, "grad_norm": 0.30184462666511536, "learning_rate": 1.4780979965603642e-05, "loss": 1.2395, "step": 12117 }, { "epoch": 3.6093002475846534, "grad_norm": 0.25319600105285645, "learning_rate": 1.4780132758456547e-05, "loss": 1.2373, "step": 12118 }, { "epoch": 3.609598093784322, "grad_norm": 0.25590604543685913, "learning_rate": 1.4779285506835654e-05, "loss": 1.2381, "step": 12119 }, { "epoch": 3.609895939983991, "grad_norm": 0.27177754044532776, "learning_rate": 1.4778438210748851e-05, "loss": 1.2252, "step": 12120 }, { "epoch": 3.6101937861836593, "grad_norm": 0.2445678561925888, "learning_rate": 1.4777590870204015e-05, "loss": 1.23, "step": 12121 }, { "epoch": 3.610491632383328, "grad_norm": 0.3108128309249878, "learning_rate": 1.4776743485209039e-05, "loss": 1.2381, "step": 12122 }, { "epoch": 3.610789478582997, "grad_norm": 0.24306008219718933, "learning_rate": 1.4775896055771801e-05, "loss": 1.248, "step": 12123 }, { "epoch": 3.611087324782665, "grad_norm": 0.2657211124897003, "learning_rate": 1.4775048581900184e-05, "loss": 1.2384, "step": 12124 }, { "epoch": 3.611385170982334, "grad_norm": 0.2527020275592804, "learning_rate": 1.4774201063602079e-05, "loss": 1.2506, "step": 12125 }, { "epoch": 3.6116830171820027, "grad_norm": 0.2403356432914734, "learning_rate": 1.477335350088536e-05, "loss": 1.2186, "step": 12126 }, { "epoch": 3.611980863381671, "grad_norm": 0.2830476760864258, "learning_rate": 1.4772505893757924e-05, "loss": 1.2354, "step": 12127 }, { "epoch": 3.61227870958134, "grad_norm": 0.2860715091228485, "learning_rate": 1.4771658242227655e-05, "loss": 1.24, "step": 12128 }, { "epoch": 3.6125765557810086, "grad_norm": 0.24847184121608734, "learning_rate": 1.4770810546302436e-05, "loss": 1.2118, "step": 12129 }, { "epoch": 3.6128744019806773, "grad_norm": 0.2509472966194153, "learning_rate": 1.4769962805990154e-05, "loss": 1.2169, "step": 12130 }, { "epoch": 3.613172248180346, "grad_norm": 0.27350685000419617, "learning_rate": 1.4769115021298696e-05, "loss": 1.2349, "step": 12131 }, { "epoch": 3.6134700943800144, "grad_norm": 0.27454113960266113, "learning_rate": 1.4768267192235954e-05, "loss": 1.247, "step": 12132 }, { "epoch": 3.613767940579683, "grad_norm": 0.2896553575992584, "learning_rate": 1.4767419318809812e-05, "loss": 1.2283, "step": 12133 }, { "epoch": 3.614065786779352, "grad_norm": 0.2629541754722595, "learning_rate": 1.476657140102816e-05, "loss": 1.2161, "step": 12134 }, { "epoch": 3.6143636329790203, "grad_norm": 0.2812981903553009, "learning_rate": 1.4765723438898886e-05, "loss": 1.2382, "step": 12135 }, { "epoch": 3.614661479178689, "grad_norm": 0.25081151723861694, "learning_rate": 1.476487543242988e-05, "loss": 1.2276, "step": 12136 }, { "epoch": 3.614959325378358, "grad_norm": 0.30545827746391296, "learning_rate": 1.4764027381629034e-05, "loss": 1.2286, "step": 12137 }, { "epoch": 3.615257171578026, "grad_norm": 0.24036957323551178, "learning_rate": 1.4763179286504234e-05, "loss": 1.2148, "step": 12138 }, { "epoch": 3.615555017777695, "grad_norm": 0.3674185276031494, "learning_rate": 1.476233114706337e-05, "loss": 1.2077, "step": 12139 }, { "epoch": 3.6158528639773637, "grad_norm": 0.24986650049686432, "learning_rate": 1.476148296331434e-05, "loss": 1.2393, "step": 12140 }, { "epoch": 3.6161507101770325, "grad_norm": 0.33543214201927185, "learning_rate": 1.4760634735265029e-05, "loss": 1.2292, "step": 12141 }, { "epoch": 3.616448556376701, "grad_norm": 0.24175876379013062, "learning_rate": 1.4759786462923332e-05, "loss": 1.2195, "step": 12142 }, { "epoch": 3.6167464025763696, "grad_norm": 0.34412795305252075, "learning_rate": 1.4758938146297138e-05, "loss": 1.2358, "step": 12143 }, { "epoch": 3.6170442487760384, "grad_norm": 0.2774752676486969, "learning_rate": 1.4758089785394338e-05, "loss": 1.24, "step": 12144 }, { "epoch": 3.617342094975707, "grad_norm": 0.2854416072368622, "learning_rate": 1.4757241380222835e-05, "loss": 1.2433, "step": 12145 }, { "epoch": 3.6176399411753755, "grad_norm": 0.24738280475139618, "learning_rate": 1.4756392930790516e-05, "loss": 1.2334, "step": 12146 }, { "epoch": 3.6179377873750442, "grad_norm": 0.4207460582256317, "learning_rate": 1.4755544437105269e-05, "loss": 1.2238, "step": 12147 }, { "epoch": 3.618235633574713, "grad_norm": 0.32039952278137207, "learning_rate": 1.4754695899174997e-05, "loss": 1.2334, "step": 12148 }, { "epoch": 3.6185334797743813, "grad_norm": 0.31696414947509766, "learning_rate": 1.4753847317007594e-05, "loss": 1.2517, "step": 12149 }, { "epoch": 3.61883132597405, "grad_norm": 0.26194241642951965, "learning_rate": 1.4752998690610951e-05, "loss": 1.2318, "step": 12150 }, { "epoch": 3.619129172173719, "grad_norm": 0.4184989035129547, "learning_rate": 1.4752150019992968e-05, "loss": 1.2323, "step": 12151 }, { "epoch": 3.619427018373387, "grad_norm": 0.32291746139526367, "learning_rate": 1.4751301305161536e-05, "loss": 1.2329, "step": 12152 }, { "epoch": 3.619724864573056, "grad_norm": 0.32019558548927307, "learning_rate": 1.4750452546124556e-05, "loss": 1.237, "step": 12153 }, { "epoch": 3.6200227107727247, "grad_norm": 0.24402610957622528, "learning_rate": 1.474960374288992e-05, "loss": 1.2339, "step": 12154 }, { "epoch": 3.6203205569723935, "grad_norm": 0.3688165247440338, "learning_rate": 1.474875489546553e-05, "loss": 1.2468, "step": 12155 }, { "epoch": 3.6206184031720623, "grad_norm": 0.2501266896724701, "learning_rate": 1.4747906003859278e-05, "loss": 1.2277, "step": 12156 }, { "epoch": 3.6209162493717306, "grad_norm": 0.25592708587646484, "learning_rate": 1.4747057068079067e-05, "loss": 1.2313, "step": 12157 }, { "epoch": 3.6212140955713994, "grad_norm": 0.2680782377719879, "learning_rate": 1.4746208088132794e-05, "loss": 1.2284, "step": 12158 }, { "epoch": 3.621511941771068, "grad_norm": 0.2572602927684784, "learning_rate": 1.474535906402836e-05, "loss": 1.2353, "step": 12159 }, { "epoch": 3.6218097879707365, "grad_norm": 0.30644771456718445, "learning_rate": 1.474450999577366e-05, "loss": 1.2305, "step": 12160 }, { "epoch": 3.6221076341704053, "grad_norm": 0.29997578263282776, "learning_rate": 1.4743660883376593e-05, "loss": 1.237, "step": 12161 }, { "epoch": 3.622405480370074, "grad_norm": 0.25831934809684753, "learning_rate": 1.4742811726845063e-05, "loss": 1.2229, "step": 12162 }, { "epoch": 3.6227033265697424, "grad_norm": 0.3088092803955078, "learning_rate": 1.474196252618697e-05, "loss": 1.2221, "step": 12163 }, { "epoch": 3.623001172769411, "grad_norm": 0.2495061606168747, "learning_rate": 1.4741113281410213e-05, "loss": 1.23, "step": 12164 }, { "epoch": 3.62329901896908, "grad_norm": 0.37403103709220886, "learning_rate": 1.4740263992522695e-05, "loss": 1.2415, "step": 12165 }, { "epoch": 3.6235968651687482, "grad_norm": 0.24580949544906616, "learning_rate": 1.4739414659532316e-05, "loss": 1.2355, "step": 12166 }, { "epoch": 3.623894711368417, "grad_norm": 0.32103511691093445, "learning_rate": 1.4738565282446976e-05, "loss": 1.2178, "step": 12167 }, { "epoch": 3.624192557568086, "grad_norm": 0.23806986212730408, "learning_rate": 1.4737715861274582e-05, "loss": 1.2229, "step": 12168 }, { "epoch": 3.6244904037677546, "grad_norm": 0.2906794250011444, "learning_rate": 1.4736866396023038e-05, "loss": 1.2429, "step": 12169 }, { "epoch": 3.6247882499674233, "grad_norm": 0.26994964480400085, "learning_rate": 1.4736016886700242e-05, "loss": 1.2376, "step": 12170 }, { "epoch": 3.6250860961670917, "grad_norm": 0.26804840564727783, "learning_rate": 1.47351673333141e-05, "loss": 1.2503, "step": 12171 }, { "epoch": 3.6253839423667604, "grad_norm": 0.255969375371933, "learning_rate": 1.4734317735872516e-05, "loss": 1.209, "step": 12172 }, { "epoch": 3.625681788566429, "grad_norm": 0.2911655008792877, "learning_rate": 1.4733468094383397e-05, "loss": 1.2396, "step": 12173 }, { "epoch": 3.6259796347660975, "grad_norm": 0.26125261187553406, "learning_rate": 1.4732618408854644e-05, "loss": 1.2274, "step": 12174 }, { "epoch": 3.6262774809657663, "grad_norm": 0.2519410252571106, "learning_rate": 1.4731768679294165e-05, "loss": 1.2434, "step": 12175 }, { "epoch": 3.626575327165435, "grad_norm": 0.2514265179634094, "learning_rate": 1.4730918905709868e-05, "loss": 1.2303, "step": 12176 }, { "epoch": 3.6268731733651034, "grad_norm": 0.2466183602809906, "learning_rate": 1.4730069088109653e-05, "loss": 1.2454, "step": 12177 }, { "epoch": 3.627171019564772, "grad_norm": 0.2416197657585144, "learning_rate": 1.472921922650143e-05, "loss": 1.2298, "step": 12178 }, { "epoch": 3.627468865764441, "grad_norm": 0.2630484104156494, "learning_rate": 1.4728369320893106e-05, "loss": 1.2205, "step": 12179 }, { "epoch": 3.6277667119641093, "grad_norm": 0.24044835567474365, "learning_rate": 1.4727519371292589e-05, "loss": 1.2371, "step": 12180 }, { "epoch": 3.628064558163778, "grad_norm": 0.2447517216205597, "learning_rate": 1.4726669377707783e-05, "loss": 1.2329, "step": 12181 }, { "epoch": 3.628362404363447, "grad_norm": 0.24271756410598755, "learning_rate": 1.4725819340146603e-05, "loss": 1.2325, "step": 12182 }, { "epoch": 3.6286602505631156, "grad_norm": 0.24534739553928375, "learning_rate": 1.4724969258616954e-05, "loss": 1.2358, "step": 12183 }, { "epoch": 3.6289580967627844, "grad_norm": 0.24850770831108093, "learning_rate": 1.4724119133126746e-05, "loss": 1.2289, "step": 12184 }, { "epoch": 3.6292559429624527, "grad_norm": 0.2543782591819763, "learning_rate": 1.4723268963683883e-05, "loss": 1.2249, "step": 12185 }, { "epoch": 3.6295537891621215, "grad_norm": 0.25927695631980896, "learning_rate": 1.4722418750296283e-05, "loss": 1.232, "step": 12186 }, { "epoch": 3.6298516353617902, "grad_norm": 0.266666054725647, "learning_rate": 1.4721568492971851e-05, "loss": 1.2118, "step": 12187 }, { "epoch": 3.6301494815614586, "grad_norm": 0.23791494965553284, "learning_rate": 1.4720718191718502e-05, "loss": 1.2366, "step": 12188 }, { "epoch": 3.6304473277611273, "grad_norm": 0.2627164125442505, "learning_rate": 1.4719867846544142e-05, "loss": 1.2314, "step": 12189 }, { "epoch": 3.630745173960796, "grad_norm": 0.2487691044807434, "learning_rate": 1.4719017457456688e-05, "loss": 1.2216, "step": 12190 }, { "epoch": 3.6310430201604644, "grad_norm": 0.25168392062187195, "learning_rate": 1.4718167024464045e-05, "loss": 1.2292, "step": 12191 }, { "epoch": 3.631340866360133, "grad_norm": 0.2399231642484665, "learning_rate": 1.471731654757413e-05, "loss": 1.2234, "step": 12192 }, { "epoch": 3.631638712559802, "grad_norm": 0.2516423463821411, "learning_rate": 1.4716466026794858e-05, "loss": 1.2278, "step": 12193 }, { "epoch": 3.6319365587594703, "grad_norm": 0.2878287732601166, "learning_rate": 1.4715615462134138e-05, "loss": 1.2285, "step": 12194 }, { "epoch": 3.632234404959139, "grad_norm": 0.26969072222709656, "learning_rate": 1.4714764853599884e-05, "loss": 1.2355, "step": 12195 }, { "epoch": 3.632532251158808, "grad_norm": 0.2461068034172058, "learning_rate": 1.471391420120001e-05, "loss": 1.2396, "step": 12196 }, { "epoch": 3.6328300973584766, "grad_norm": 0.2353159636259079, "learning_rate": 1.4713063504942434e-05, "loss": 1.2275, "step": 12197 }, { "epoch": 3.6331279435581454, "grad_norm": 0.2578493356704712, "learning_rate": 1.4712212764835063e-05, "loss": 1.2178, "step": 12198 }, { "epoch": 3.6334257897578137, "grad_norm": 0.29053571820259094, "learning_rate": 1.4711361980885821e-05, "loss": 1.229, "step": 12199 }, { "epoch": 3.6337236359574825, "grad_norm": 0.24387332797050476, "learning_rate": 1.4710511153102617e-05, "loss": 1.2324, "step": 12200 }, { "epoch": 3.6340214821571513, "grad_norm": 0.30657774209976196, "learning_rate": 1.4709660281493371e-05, "loss": 1.2402, "step": 12201 }, { "epoch": 3.6343193283568196, "grad_norm": 0.32485294342041016, "learning_rate": 1.4708809366066e-05, "loss": 1.2217, "step": 12202 }, { "epoch": 3.6346171745564884, "grad_norm": 0.2695823013782501, "learning_rate": 1.4707958406828416e-05, "loss": 1.2412, "step": 12203 }, { "epoch": 3.634915020756157, "grad_norm": 0.25589483976364136, "learning_rate": 1.4707107403788539e-05, "loss": 1.2408, "step": 12204 }, { "epoch": 3.6352128669558255, "grad_norm": 0.23745296895503998, "learning_rate": 1.4706256356954287e-05, "loss": 1.2102, "step": 12205 }, { "epoch": 3.6355107131554942, "grad_norm": 0.3855360150337219, "learning_rate": 1.470540526633358e-05, "loss": 1.2274, "step": 12206 }, { "epoch": 3.635808559355163, "grad_norm": 0.3667902946472168, "learning_rate": 1.4704554131934332e-05, "loss": 1.21, "step": 12207 }, { "epoch": 3.6361064055548318, "grad_norm": 0.3271882236003876, "learning_rate": 1.4703702953764465e-05, "loss": 1.2383, "step": 12208 }, { "epoch": 3.6364042517545, "grad_norm": 0.512535572052002, "learning_rate": 1.4702851731831895e-05, "loss": 1.2391, "step": 12209 }, { "epoch": 3.636702097954169, "grad_norm": 0.2458118051290512, "learning_rate": 1.4702000466144548e-05, "loss": 1.2275, "step": 12210 }, { "epoch": 3.6369999441538376, "grad_norm": 0.30778971314430237, "learning_rate": 1.4701149156710338e-05, "loss": 1.2326, "step": 12211 }, { "epoch": 3.6372977903535064, "grad_norm": 0.2693280279636383, "learning_rate": 1.4700297803537184e-05, "loss": 1.2354, "step": 12212 }, { "epoch": 3.6375956365531747, "grad_norm": 0.26226574182510376, "learning_rate": 1.4699446406633016e-05, "loss": 1.251, "step": 12213 }, { "epoch": 3.6378934827528435, "grad_norm": 0.3327712416648865, "learning_rate": 1.4698594966005745e-05, "loss": 1.2227, "step": 12214 }, { "epoch": 3.6381913289525123, "grad_norm": 0.285996675491333, "learning_rate": 1.4697743481663303e-05, "loss": 1.2366, "step": 12215 }, { "epoch": 3.6384891751521806, "grad_norm": 0.26572054624557495, "learning_rate": 1.46968919536136e-05, "loss": 1.251, "step": 12216 }, { "epoch": 3.6387870213518494, "grad_norm": 0.2764107584953308, "learning_rate": 1.4696040381864571e-05, "loss": 1.2466, "step": 12217 }, { "epoch": 3.639084867551518, "grad_norm": 0.2600226104259491, "learning_rate": 1.4695188766424132e-05, "loss": 1.2549, "step": 12218 }, { "epoch": 3.6393827137511865, "grad_norm": 0.2641436755657196, "learning_rate": 1.4694337107300205e-05, "loss": 1.2156, "step": 12219 }, { "epoch": 3.6396805599508553, "grad_norm": 0.26132163405418396, "learning_rate": 1.4693485404500716e-05, "loss": 1.2334, "step": 12220 }, { "epoch": 3.639978406150524, "grad_norm": 0.3215862512588501, "learning_rate": 1.469263365803359e-05, "loss": 1.2395, "step": 12221 }, { "epoch": 3.640276252350193, "grad_norm": 0.26509419083595276, "learning_rate": 1.469178186790675e-05, "loss": 1.2168, "step": 12222 }, { "epoch": 3.6405740985498616, "grad_norm": 0.2737056314945221, "learning_rate": 1.4690930034128124e-05, "loss": 1.2266, "step": 12223 }, { "epoch": 3.64087194474953, "grad_norm": 0.24052157998085022, "learning_rate": 1.4690078156705634e-05, "loss": 1.2235, "step": 12224 }, { "epoch": 3.6411697909491987, "grad_norm": 0.2925179898738861, "learning_rate": 1.4689226235647205e-05, "loss": 1.2265, "step": 12225 }, { "epoch": 3.6414676371488675, "grad_norm": 0.2738029956817627, "learning_rate": 1.4688374270960766e-05, "loss": 1.2355, "step": 12226 }, { "epoch": 3.641765483348536, "grad_norm": 0.30287572741508484, "learning_rate": 1.4687522262654244e-05, "loss": 1.2353, "step": 12227 }, { "epoch": 3.6420633295482046, "grad_norm": 0.7332232594490051, "learning_rate": 1.4686670210735562e-05, "loss": 1.2331, "step": 12228 }, { "epoch": 3.6423611757478733, "grad_norm": 0.3383345901966095, "learning_rate": 1.468581811521265e-05, "loss": 1.2258, "step": 12229 }, { "epoch": 3.6426590219475417, "grad_norm": 0.2796458899974823, "learning_rate": 1.468496597609344e-05, "loss": 1.2537, "step": 12230 }, { "epoch": 3.6429568681472104, "grad_norm": 0.24633510410785675, "learning_rate": 1.4684113793385852e-05, "loss": 1.2301, "step": 12231 }, { "epoch": 3.643254714346879, "grad_norm": 0.24334703385829926, "learning_rate": 1.468326156709782e-05, "loss": 1.2377, "step": 12232 }, { "epoch": 3.6435525605465475, "grad_norm": 0.2508890926837921, "learning_rate": 1.468240929723727e-05, "loss": 1.233, "step": 12233 }, { "epoch": 3.6438504067462163, "grad_norm": 0.2544920742511749, "learning_rate": 1.4681556983812136e-05, "loss": 1.2488, "step": 12234 }, { "epoch": 3.644148252945885, "grad_norm": 0.2417229562997818, "learning_rate": 1.4680704626830342e-05, "loss": 1.2223, "step": 12235 }, { "epoch": 3.644446099145554, "grad_norm": 0.24827317893505096, "learning_rate": 1.4679852226299823e-05, "loss": 1.235, "step": 12236 }, { "epoch": 3.6447439453452226, "grad_norm": 0.24662494659423828, "learning_rate": 1.4678999782228505e-05, "loss": 1.2244, "step": 12237 }, { "epoch": 3.645041791544891, "grad_norm": 0.24618802964687347, "learning_rate": 1.4678147294624323e-05, "loss": 1.2447, "step": 12238 }, { "epoch": 3.6453396377445597, "grad_norm": 0.2501998543739319, "learning_rate": 1.4677294763495207e-05, "loss": 1.2389, "step": 12239 }, { "epoch": 3.6456374839442285, "grad_norm": 0.2450493425130844, "learning_rate": 1.467644218884909e-05, "loss": 1.2482, "step": 12240 }, { "epoch": 3.645935330143897, "grad_norm": 0.23952670395374298, "learning_rate": 1.4675589570693905e-05, "loss": 1.2182, "step": 12241 }, { "epoch": 3.6462331763435656, "grad_norm": 0.2516058087348938, "learning_rate": 1.4674736909037579e-05, "loss": 1.2384, "step": 12242 }, { "epoch": 3.6465310225432344, "grad_norm": 0.25174760818481445, "learning_rate": 1.4673884203888052e-05, "loss": 1.2423, "step": 12243 }, { "epoch": 3.6468288687429027, "grad_norm": 0.2346472293138504, "learning_rate": 1.4673031455253253e-05, "loss": 1.223, "step": 12244 }, { "epoch": 3.6471267149425715, "grad_norm": 0.2519069015979767, "learning_rate": 1.4672178663141117e-05, "loss": 1.2423, "step": 12245 }, { "epoch": 3.6474245611422402, "grad_norm": 0.23978659510612488, "learning_rate": 1.4671325827559576e-05, "loss": 1.2165, "step": 12246 }, { "epoch": 3.6477224073419086, "grad_norm": 0.24777527153491974, "learning_rate": 1.4670472948516572e-05, "loss": 1.2421, "step": 12247 }, { "epoch": 3.6480202535415773, "grad_norm": 0.23103217780590057, "learning_rate": 1.4669620026020035e-05, "loss": 1.2289, "step": 12248 }, { "epoch": 3.648318099741246, "grad_norm": 0.24028585851192474, "learning_rate": 1.46687670600779e-05, "loss": 1.2273, "step": 12249 }, { "epoch": 3.648615945940915, "grad_norm": 0.23895616829395294, "learning_rate": 1.4667914050698102e-05, "loss": 1.2342, "step": 12250 }, { "epoch": 3.6489137921405836, "grad_norm": 0.23453964293003082, "learning_rate": 1.466706099788858e-05, "loss": 1.2413, "step": 12251 }, { "epoch": 3.649211638340252, "grad_norm": 0.2327313870191574, "learning_rate": 1.4666207901657273e-05, "loss": 1.2379, "step": 12252 }, { "epoch": 3.6495094845399207, "grad_norm": 0.23636144399642944, "learning_rate": 1.466535476201211e-05, "loss": 1.2336, "step": 12253 }, { "epoch": 3.6498073307395895, "grad_norm": 0.23853585124015808, "learning_rate": 1.4664501578961034e-05, "loss": 1.2214, "step": 12254 }, { "epoch": 3.650105176939258, "grad_norm": 0.24279792606830597, "learning_rate": 1.4663648352511986e-05, "loss": 1.2277, "step": 12255 }, { "epoch": 3.6504030231389266, "grad_norm": 0.24204349517822266, "learning_rate": 1.4662795082672898e-05, "loss": 1.2235, "step": 12256 }, { "epoch": 3.6507008693385954, "grad_norm": 0.2332702875137329, "learning_rate": 1.466194176945171e-05, "loss": 1.2192, "step": 12257 }, { "epoch": 3.6509987155382637, "grad_norm": 0.24131035804748535, "learning_rate": 1.4661088412856366e-05, "loss": 1.2364, "step": 12258 }, { "epoch": 3.6512965617379325, "grad_norm": 0.23367980122566223, "learning_rate": 1.46602350128948e-05, "loss": 1.2304, "step": 12259 }, { "epoch": 3.6515944079376013, "grad_norm": 0.237742081284523, "learning_rate": 1.4659381569574956e-05, "loss": 1.2367, "step": 12260 }, { "epoch": 3.65189225413727, "grad_norm": 0.2423309087753296, "learning_rate": 1.4658528082904768e-05, "loss": 1.2133, "step": 12261 }, { "epoch": 3.6521901003369384, "grad_norm": 0.242317795753479, "learning_rate": 1.4657674552892183e-05, "loss": 1.215, "step": 12262 }, { "epoch": 3.652487946536607, "grad_norm": 0.24148796498775482, "learning_rate": 1.4656820979545141e-05, "loss": 1.2201, "step": 12263 }, { "epoch": 3.652785792736276, "grad_norm": 0.2296200841665268, "learning_rate": 1.465596736287158e-05, "loss": 1.2349, "step": 12264 }, { "epoch": 3.6530836389359447, "grad_norm": 0.23521791398525238, "learning_rate": 1.4655113702879451e-05, "loss": 1.2336, "step": 12265 }, { "epoch": 3.653381485135613, "grad_norm": 0.23943842947483063, "learning_rate": 1.4654259999576685e-05, "loss": 1.2423, "step": 12266 }, { "epoch": 3.6536793313352818, "grad_norm": 0.232687309384346, "learning_rate": 1.4653406252971229e-05, "loss": 1.2401, "step": 12267 }, { "epoch": 3.6539771775349505, "grad_norm": 0.23358887434005737, "learning_rate": 1.4652552463071028e-05, "loss": 1.2362, "step": 12268 }, { "epoch": 3.654275023734619, "grad_norm": 0.25296834111213684, "learning_rate": 1.4651698629884025e-05, "loss": 1.2351, "step": 12269 }, { "epoch": 3.6545728699342876, "grad_norm": 0.24187542498111725, "learning_rate": 1.4650844753418164e-05, "loss": 1.2437, "step": 12270 }, { "epoch": 3.6548707161339564, "grad_norm": 0.2396983951330185, "learning_rate": 1.4649990833681388e-05, "loss": 1.2362, "step": 12271 }, { "epoch": 3.6551685623336247, "grad_norm": 0.2348756194114685, "learning_rate": 1.4649136870681643e-05, "loss": 1.2469, "step": 12272 }, { "epoch": 3.6554664085332935, "grad_norm": 0.2409428358078003, "learning_rate": 1.4648282864426873e-05, "loss": 1.2216, "step": 12273 }, { "epoch": 3.6557642547329623, "grad_norm": 0.23733539879322052, "learning_rate": 1.4647428814925025e-05, "loss": 1.2382, "step": 12274 }, { "epoch": 3.656062100932631, "grad_norm": 0.23807461559772491, "learning_rate": 1.4646574722184043e-05, "loss": 1.2114, "step": 12275 }, { "epoch": 3.6563599471322994, "grad_norm": 0.24783183634281158, "learning_rate": 1.4645720586211878e-05, "loss": 1.2137, "step": 12276 }, { "epoch": 3.656657793331968, "grad_norm": 0.25191429257392883, "learning_rate": 1.464486640701647e-05, "loss": 1.236, "step": 12277 }, { "epoch": 3.656955639531637, "grad_norm": 0.23169903457164764, "learning_rate": 1.4644012184605771e-05, "loss": 1.2279, "step": 12278 }, { "epoch": 3.6572534857313057, "grad_norm": 0.2517816424369812, "learning_rate": 1.4643157918987727e-05, "loss": 1.2198, "step": 12279 }, { "epoch": 3.657551331930974, "grad_norm": 0.2378106415271759, "learning_rate": 1.4642303610170286e-05, "loss": 1.2342, "step": 12280 }, { "epoch": 3.657849178130643, "grad_norm": 0.24237866699695587, "learning_rate": 1.4641449258161396e-05, "loss": 1.2263, "step": 12281 }, { "epoch": 3.6581470243303116, "grad_norm": 0.24208959937095642, "learning_rate": 1.4640594862969009e-05, "loss": 1.2322, "step": 12282 }, { "epoch": 3.65844487052998, "grad_norm": 0.24164338409900665, "learning_rate": 1.463974042460107e-05, "loss": 1.2359, "step": 12283 }, { "epoch": 3.6587427167296487, "grad_norm": 0.23954281210899353, "learning_rate": 1.4638885943065528e-05, "loss": 1.2104, "step": 12284 }, { "epoch": 3.6590405629293175, "grad_norm": 0.24737069010734558, "learning_rate": 1.4638031418370338e-05, "loss": 1.2475, "step": 12285 }, { "epoch": 3.659338409128986, "grad_norm": 0.26112329959869385, "learning_rate": 1.4637176850523447e-05, "loss": 1.2436, "step": 12286 }, { "epoch": 3.6596362553286546, "grad_norm": 0.236568421125412, "learning_rate": 1.4636322239532806e-05, "loss": 1.2364, "step": 12287 }, { "epoch": 3.6599341015283233, "grad_norm": 0.2685500681400299, "learning_rate": 1.4635467585406366e-05, "loss": 1.2433, "step": 12288 }, { "epoch": 3.660231947727992, "grad_norm": 0.2374984622001648, "learning_rate": 1.4634612888152082e-05, "loss": 1.2268, "step": 12289 }, { "epoch": 3.660529793927661, "grad_norm": 0.2977931797504425, "learning_rate": 1.4633758147777902e-05, "loss": 1.2335, "step": 12290 }, { "epoch": 3.660827640127329, "grad_norm": 0.2528064250946045, "learning_rate": 1.4632903364291776e-05, "loss": 1.2326, "step": 12291 }, { "epoch": 3.661125486326998, "grad_norm": 0.24254608154296875, "learning_rate": 1.4632048537701664e-05, "loss": 1.2328, "step": 12292 }, { "epoch": 3.6614233325266667, "grad_norm": 0.2421228289604187, "learning_rate": 1.4631193668015513e-05, "loss": 1.2288, "step": 12293 }, { "epoch": 3.661721178726335, "grad_norm": 0.2589697241783142, "learning_rate": 1.4630338755241283e-05, "loss": 1.228, "step": 12294 }, { "epoch": 3.662019024926004, "grad_norm": 0.2550235986709595, "learning_rate": 1.4629483799386922e-05, "loss": 1.2228, "step": 12295 }, { "epoch": 3.6623168711256726, "grad_norm": 0.2578994035720825, "learning_rate": 1.4628628800460385e-05, "loss": 1.2402, "step": 12296 }, { "epoch": 3.662614717325341, "grad_norm": 0.2588058412075043, "learning_rate": 1.462777375846963e-05, "loss": 1.2401, "step": 12297 }, { "epoch": 3.6629125635250097, "grad_norm": 0.2608630657196045, "learning_rate": 1.462691867342261e-05, "loss": 1.2467, "step": 12298 }, { "epoch": 3.6632104097246785, "grad_norm": 0.33025142550468445, "learning_rate": 1.4626063545327283e-05, "loss": 1.2279, "step": 12299 }, { "epoch": 3.663508255924347, "grad_norm": 0.24234481155872345, "learning_rate": 1.4625208374191603e-05, "loss": 1.2388, "step": 12300 }, { "epoch": 3.6638061021240156, "grad_norm": 0.3066018223762512, "learning_rate": 1.4624353160023526e-05, "loss": 1.2165, "step": 12301 }, { "epoch": 3.6641039483236844, "grad_norm": 0.24784480035305023, "learning_rate": 1.4623497902831007e-05, "loss": 1.231, "step": 12302 }, { "epoch": 3.664401794523353, "grad_norm": 0.28738903999328613, "learning_rate": 1.4622642602622008e-05, "loss": 1.2083, "step": 12303 }, { "epoch": 3.664699640723022, "grad_norm": 0.2508058249950409, "learning_rate": 1.4621787259404484e-05, "loss": 1.2304, "step": 12304 }, { "epoch": 3.6649974869226902, "grad_norm": 0.28743064403533936, "learning_rate": 1.4620931873186393e-05, "loss": 1.2446, "step": 12305 }, { "epoch": 3.665295333122359, "grad_norm": 0.2851032316684723, "learning_rate": 1.4620076443975697e-05, "loss": 1.2289, "step": 12306 }, { "epoch": 3.6655931793220278, "grad_norm": 0.3177313804626465, "learning_rate": 1.4619220971780348e-05, "loss": 1.24, "step": 12307 }, { "epoch": 3.665891025521696, "grad_norm": 0.26557326316833496, "learning_rate": 1.4618365456608309e-05, "loss": 1.2241, "step": 12308 }, { "epoch": 3.666188871721365, "grad_norm": 0.3448164463043213, "learning_rate": 1.4617509898467539e-05, "loss": 1.2331, "step": 12309 }, { "epoch": 3.6664867179210336, "grad_norm": 0.2601182162761688, "learning_rate": 1.4616654297366e-05, "loss": 1.2301, "step": 12310 }, { "epoch": 3.666784564120702, "grad_norm": 0.24896277487277985, "learning_rate": 1.461579865331165e-05, "loss": 1.2358, "step": 12311 }, { "epoch": 3.6670824103203707, "grad_norm": 0.25055694580078125, "learning_rate": 1.4614942966312452e-05, "loss": 1.2367, "step": 12312 }, { "epoch": 3.6673802565200395, "grad_norm": 0.24847956001758575, "learning_rate": 1.4614087236376364e-05, "loss": 1.2206, "step": 12313 }, { "epoch": 3.667678102719708, "grad_norm": 0.2494431734085083, "learning_rate": 1.4613231463511347e-05, "loss": 1.232, "step": 12314 }, { "epoch": 3.6679759489193766, "grad_norm": 0.2703046202659607, "learning_rate": 1.461237564772537e-05, "loss": 1.2367, "step": 12315 }, { "epoch": 3.6682737951190454, "grad_norm": 0.2518532872200012, "learning_rate": 1.4611519789026392e-05, "loss": 1.2474, "step": 12316 }, { "epoch": 3.668571641318714, "grad_norm": 0.2615206837654114, "learning_rate": 1.461066388742237e-05, "loss": 1.2279, "step": 12317 }, { "epoch": 3.668869487518383, "grad_norm": 0.27220064401626587, "learning_rate": 1.4609807942921274e-05, "loss": 1.2241, "step": 12318 }, { "epoch": 3.6691673337180513, "grad_norm": 0.27002599835395813, "learning_rate": 1.4608951955531066e-05, "loss": 1.2359, "step": 12319 }, { "epoch": 3.66946517991772, "grad_norm": 0.38039320707321167, "learning_rate": 1.4608095925259707e-05, "loss": 1.2285, "step": 12320 }, { "epoch": 3.669763026117389, "grad_norm": 0.24123649299144745, "learning_rate": 1.4607239852115165e-05, "loss": 1.2248, "step": 12321 }, { "epoch": 3.670060872317057, "grad_norm": 0.2824949622154236, "learning_rate": 1.4606383736105406e-05, "loss": 1.2427, "step": 12322 }, { "epoch": 3.670358718516726, "grad_norm": 0.2610337436199188, "learning_rate": 1.4605527577238391e-05, "loss": 1.2311, "step": 12323 }, { "epoch": 3.6706565647163947, "grad_norm": 0.2990787923336029, "learning_rate": 1.4604671375522088e-05, "loss": 1.2419, "step": 12324 }, { "epoch": 3.670954410916063, "grad_norm": 0.2511928975582123, "learning_rate": 1.4603815130964463e-05, "loss": 1.2374, "step": 12325 }, { "epoch": 3.6712522571157318, "grad_norm": 0.2628847360610962, "learning_rate": 1.4602958843573477e-05, "loss": 1.2354, "step": 12326 }, { "epoch": 3.6715501033154005, "grad_norm": 0.24535098671913147, "learning_rate": 1.4602102513357109e-05, "loss": 1.2267, "step": 12327 }, { "epoch": 3.6718479495150693, "grad_norm": 0.3978574872016907, "learning_rate": 1.4601246140323316e-05, "loss": 1.2394, "step": 12328 }, { "epoch": 3.6721457957147376, "grad_norm": 0.27503323554992676, "learning_rate": 1.4600389724480067e-05, "loss": 1.2449, "step": 12329 }, { "epoch": 3.6724436419144064, "grad_norm": 0.2998397648334503, "learning_rate": 1.4599533265835332e-05, "loss": 1.2265, "step": 12330 }, { "epoch": 3.672741488114075, "grad_norm": 0.2379540652036667, "learning_rate": 1.459867676439708e-05, "loss": 1.2417, "step": 12331 }, { "epoch": 3.673039334313744, "grad_norm": 0.447098970413208, "learning_rate": 1.4597820220173278e-05, "loss": 1.2468, "step": 12332 }, { "epoch": 3.6733371805134123, "grad_norm": 0.349273681640625, "learning_rate": 1.4596963633171897e-05, "loss": 1.2266, "step": 12333 }, { "epoch": 3.673635026713081, "grad_norm": 0.3068247437477112, "learning_rate": 1.4596107003400903e-05, "loss": 1.2445, "step": 12334 }, { "epoch": 3.67393287291275, "grad_norm": 0.3136936128139496, "learning_rate": 1.459525033086827e-05, "loss": 1.234, "step": 12335 }, { "epoch": 3.674230719112418, "grad_norm": 0.28681203722953796, "learning_rate": 1.4594393615581965e-05, "loss": 1.2232, "step": 12336 }, { "epoch": 3.674528565312087, "grad_norm": 0.2418070137500763, "learning_rate": 1.4593536857549964e-05, "loss": 1.2275, "step": 12337 }, { "epoch": 3.6748264115117557, "grad_norm": 0.27810901403427124, "learning_rate": 1.4592680056780232e-05, "loss": 1.2253, "step": 12338 }, { "epoch": 3.675124257711424, "grad_norm": 0.23245923221111298, "learning_rate": 1.4591823213280742e-05, "loss": 1.2371, "step": 12339 }, { "epoch": 3.675422103911093, "grad_norm": 0.277251660823822, "learning_rate": 1.4590966327059472e-05, "loss": 1.2284, "step": 12340 }, { "epoch": 3.6757199501107616, "grad_norm": 0.2440660148859024, "learning_rate": 1.4590109398124388e-05, "loss": 1.2366, "step": 12341 }, { "epoch": 3.6760177963104304, "grad_norm": 0.2404196560382843, "learning_rate": 1.4589252426483462e-05, "loss": 1.2589, "step": 12342 }, { "epoch": 3.676315642510099, "grad_norm": 0.2652944326400757, "learning_rate": 1.4588395412144672e-05, "loss": 1.2372, "step": 12343 }, { "epoch": 3.6766134887097675, "grad_norm": 0.25153660774230957, "learning_rate": 1.458753835511599e-05, "loss": 1.2519, "step": 12344 }, { "epoch": 3.6769113349094362, "grad_norm": 0.28372517228126526, "learning_rate": 1.4586681255405384e-05, "loss": 1.2395, "step": 12345 }, { "epoch": 3.677209181109105, "grad_norm": 0.24544914066791534, "learning_rate": 1.4585824113020839e-05, "loss": 1.2404, "step": 12346 }, { "epoch": 3.6775070273087733, "grad_norm": 0.2560138702392578, "learning_rate": 1.4584966927970324e-05, "loss": 1.2235, "step": 12347 }, { "epoch": 3.677804873508442, "grad_norm": 0.28169959783554077, "learning_rate": 1.458410970026181e-05, "loss": 1.2284, "step": 12348 }, { "epoch": 3.678102719708111, "grad_norm": 0.23807823657989502, "learning_rate": 1.4583252429903278e-05, "loss": 1.2555, "step": 12349 }, { "epoch": 3.678400565907779, "grad_norm": 0.2647009789943695, "learning_rate": 1.4582395116902706e-05, "loss": 1.2236, "step": 12350 }, { "epoch": 3.678698412107448, "grad_norm": 0.26797088980674744, "learning_rate": 1.4581537761268065e-05, "loss": 1.2495, "step": 12351 }, { "epoch": 3.6789962583071167, "grad_norm": 0.27212825417518616, "learning_rate": 1.4580680363007333e-05, "loss": 1.2262, "step": 12352 }, { "epoch": 3.679294104506785, "grad_norm": 0.24689751863479614, "learning_rate": 1.457982292212849e-05, "loss": 1.2144, "step": 12353 }, { "epoch": 3.679591950706454, "grad_norm": 0.26500949263572693, "learning_rate": 1.4578965438639511e-05, "loss": 1.2518, "step": 12354 }, { "epoch": 3.6798897969061226, "grad_norm": 0.2519420385360718, "learning_rate": 1.4578107912548374e-05, "loss": 1.2275, "step": 12355 }, { "epoch": 3.6801876431057914, "grad_norm": 0.24441255629062653, "learning_rate": 1.4577250343863059e-05, "loss": 1.224, "step": 12356 }, { "epoch": 3.68048548930546, "grad_norm": 0.2505742907524109, "learning_rate": 1.4576392732591545e-05, "loss": 1.2195, "step": 12357 }, { "epoch": 3.6807833355051285, "grad_norm": 0.2475583851337433, "learning_rate": 1.4575535078741804e-05, "loss": 1.2189, "step": 12358 }, { "epoch": 3.6810811817047973, "grad_norm": 0.2532002329826355, "learning_rate": 1.4574677382321828e-05, "loss": 1.2263, "step": 12359 }, { "epoch": 3.681379027904466, "grad_norm": 0.2748001515865326, "learning_rate": 1.4573819643339587e-05, "loss": 1.2322, "step": 12360 }, { "epoch": 3.6816768741041344, "grad_norm": 0.2338627725839615, "learning_rate": 1.4572961861803061e-05, "loss": 1.2068, "step": 12361 }, { "epoch": 3.681974720303803, "grad_norm": 0.24474157392978668, "learning_rate": 1.4572104037720239e-05, "loss": 1.2205, "step": 12362 }, { "epoch": 3.682272566503472, "grad_norm": 0.24145713448524475, "learning_rate": 1.4571246171099095e-05, "loss": 1.2293, "step": 12363 }, { "epoch": 3.6825704127031402, "grad_norm": 0.25295016169548035, "learning_rate": 1.4570388261947615e-05, "loss": 1.2194, "step": 12364 }, { "epoch": 3.682868258902809, "grad_norm": 0.24577906727790833, "learning_rate": 1.4569530310273776e-05, "loss": 1.2369, "step": 12365 }, { "epoch": 3.6831661051024778, "grad_norm": 0.2546062767505646, "learning_rate": 1.4568672316085564e-05, "loss": 1.2501, "step": 12366 }, { "epoch": 3.683463951302146, "grad_norm": 0.2619376480579376, "learning_rate": 1.4567814279390958e-05, "loss": 1.2387, "step": 12367 }, { "epoch": 3.683761797501815, "grad_norm": 0.2567276358604431, "learning_rate": 1.4566956200197946e-05, "loss": 1.246, "step": 12368 }, { "epoch": 3.6840596437014836, "grad_norm": 0.27727437019348145, "learning_rate": 1.4566098078514508e-05, "loss": 1.2566, "step": 12369 }, { "epoch": 3.6843574899011524, "grad_norm": 0.24857331812381744, "learning_rate": 1.456523991434863e-05, "loss": 1.2351, "step": 12370 }, { "epoch": 3.684655336100821, "grad_norm": 0.3601863980293274, "learning_rate": 1.4564381707708295e-05, "loss": 1.2246, "step": 12371 }, { "epoch": 3.6849531823004895, "grad_norm": 0.28480979800224304, "learning_rate": 1.456352345860149e-05, "loss": 1.2165, "step": 12372 }, { "epoch": 3.6852510285001583, "grad_norm": 0.29959580302238464, "learning_rate": 1.4562665167036191e-05, "loss": 1.238, "step": 12373 }, { "epoch": 3.685548874699827, "grad_norm": 0.27144676446914673, "learning_rate": 1.4561806833020398e-05, "loss": 1.236, "step": 12374 }, { "epoch": 3.6858467208994954, "grad_norm": 0.3229242265224457, "learning_rate": 1.4560948456562085e-05, "loss": 1.2331, "step": 12375 }, { "epoch": 3.686144567099164, "grad_norm": 0.285470575094223, "learning_rate": 1.4560090037669243e-05, "loss": 1.2402, "step": 12376 }, { "epoch": 3.686442413298833, "grad_norm": 0.28538915514945984, "learning_rate": 1.455923157634986e-05, "loss": 1.2298, "step": 12377 }, { "epoch": 3.6867402594985013, "grad_norm": 0.24156533181667328, "learning_rate": 1.455837307261192e-05, "loss": 1.2424, "step": 12378 }, { "epoch": 3.68703810569817, "grad_norm": 0.3212227523326874, "learning_rate": 1.4557514526463407e-05, "loss": 1.2251, "step": 12379 }, { "epoch": 3.687335951897839, "grad_norm": 0.24562905728816986, "learning_rate": 1.4556655937912318e-05, "loss": 1.2378, "step": 12380 }, { "epoch": 3.687633798097507, "grad_norm": 0.28291693329811096, "learning_rate": 1.455579730696664e-05, "loss": 1.2304, "step": 12381 }, { "epoch": 3.687931644297176, "grad_norm": 0.2626647353172302, "learning_rate": 1.4554938633634352e-05, "loss": 1.2395, "step": 12382 }, { "epoch": 3.6882294904968447, "grad_norm": 0.25677576661109924, "learning_rate": 1.455407991792345e-05, "loss": 1.2391, "step": 12383 }, { "epoch": 3.6885273366965134, "grad_norm": 0.29853981733322144, "learning_rate": 1.4553221159841925e-05, "loss": 1.2333, "step": 12384 }, { "epoch": 3.688825182896182, "grad_norm": 0.2698603570461273, "learning_rate": 1.4552362359397761e-05, "loss": 1.2282, "step": 12385 }, { "epoch": 3.6891230290958505, "grad_norm": 0.3009848892688751, "learning_rate": 1.4551503516598953e-05, "loss": 1.2447, "step": 12386 }, { "epoch": 3.6894208752955193, "grad_norm": 0.2589022219181061, "learning_rate": 1.455064463145349e-05, "loss": 1.2391, "step": 12387 }, { "epoch": 3.689718721495188, "grad_norm": 0.3754878640174866, "learning_rate": 1.4549785703969365e-05, "loss": 1.2167, "step": 12388 }, { "epoch": 3.6900165676948564, "grad_norm": 0.34311220049858093, "learning_rate": 1.4548926734154565e-05, "loss": 1.2172, "step": 12389 }, { "epoch": 3.690314413894525, "grad_norm": 0.37802860140800476, "learning_rate": 1.4548067722017085e-05, "loss": 1.2382, "step": 12390 }, { "epoch": 3.690612260094194, "grad_norm": 0.3811923861503601, "learning_rate": 1.4547208667564916e-05, "loss": 1.2268, "step": 12391 }, { "epoch": 3.6909101062938623, "grad_norm": 0.3294679820537567, "learning_rate": 1.4546349570806051e-05, "loss": 1.2384, "step": 12392 }, { "epoch": 3.691207952493531, "grad_norm": 0.3174379765987396, "learning_rate": 1.4545490431748481e-05, "loss": 1.2378, "step": 12393 }, { "epoch": 3.6915057986932, "grad_norm": 0.31269073486328125, "learning_rate": 1.4544631250400203e-05, "loss": 1.2109, "step": 12394 }, { "epoch": 3.6918036448928686, "grad_norm": 0.23892486095428467, "learning_rate": 1.4543772026769209e-05, "loss": 1.2331, "step": 12395 }, { "epoch": 3.692101491092537, "grad_norm": 0.3780063986778259, "learning_rate": 1.4542912760863493e-05, "loss": 1.2436, "step": 12396 }, { "epoch": 3.6923993372922057, "grad_norm": 0.25369495153427124, "learning_rate": 1.4542053452691045e-05, "loss": 1.2426, "step": 12397 }, { "epoch": 3.6926971834918745, "grad_norm": 0.26020002365112305, "learning_rate": 1.454119410225987e-05, "loss": 1.225, "step": 12398 }, { "epoch": 3.6929950296915433, "grad_norm": 0.2710517346858978, "learning_rate": 1.4540334709577955e-05, "loss": 1.226, "step": 12399 }, { "epoch": 3.6932928758912116, "grad_norm": 0.2550024390220642, "learning_rate": 1.4539475274653298e-05, "loss": 1.2211, "step": 12400 }, { "epoch": 3.6935907220908804, "grad_norm": 0.2555781900882721, "learning_rate": 1.4538615797493898e-05, "loss": 1.2244, "step": 12401 }, { "epoch": 3.693888568290549, "grad_norm": 0.29365408420562744, "learning_rate": 1.4537756278107743e-05, "loss": 1.224, "step": 12402 }, { "epoch": 3.6941864144902175, "grad_norm": 0.25374558568000793, "learning_rate": 1.4536896716502841e-05, "loss": 1.2223, "step": 12403 }, { "epoch": 3.6944842606898862, "grad_norm": 0.3000558018684387, "learning_rate": 1.4536037112687182e-05, "loss": 1.2434, "step": 12404 }, { "epoch": 3.694782106889555, "grad_norm": 0.2646952271461487, "learning_rate": 1.4535177466668768e-05, "loss": 1.2552, "step": 12405 }, { "epoch": 3.6950799530892233, "grad_norm": 0.29860109090805054, "learning_rate": 1.4534317778455593e-05, "loss": 1.2244, "step": 12406 }, { "epoch": 3.695377799288892, "grad_norm": 0.24162277579307556, "learning_rate": 1.4533458048055658e-05, "loss": 1.2348, "step": 12407 }, { "epoch": 3.695675645488561, "grad_norm": 0.2900954782962799, "learning_rate": 1.453259827547696e-05, "loss": 1.215, "step": 12408 }, { "epoch": 3.6959734916882296, "grad_norm": 0.24424606561660767, "learning_rate": 1.4531738460727497e-05, "loss": 1.2519, "step": 12409 }, { "epoch": 3.6962713378878984, "grad_norm": 0.2941872179508209, "learning_rate": 1.4530878603815273e-05, "loss": 1.2308, "step": 12410 }, { "epoch": 3.6965691840875667, "grad_norm": 0.2427120804786682, "learning_rate": 1.4530018704748286e-05, "loss": 1.2309, "step": 12411 }, { "epoch": 3.6968670302872355, "grad_norm": 0.25930750370025635, "learning_rate": 1.4529158763534536e-05, "loss": 1.2231, "step": 12412 }, { "epoch": 3.6971648764869043, "grad_norm": 0.24975840747356415, "learning_rate": 1.4528298780182025e-05, "loss": 1.2436, "step": 12413 }, { "epoch": 3.6974627226865726, "grad_norm": 0.24957533180713654, "learning_rate": 1.4527438754698751e-05, "loss": 1.2301, "step": 12414 }, { "epoch": 3.6977605688862414, "grad_norm": 0.2980298399925232, "learning_rate": 1.4526578687092719e-05, "loss": 1.2324, "step": 12415 }, { "epoch": 3.69805841508591, "grad_norm": 0.26207733154296875, "learning_rate": 1.4525718577371928e-05, "loss": 1.2235, "step": 12416 }, { "epoch": 3.6983562612855785, "grad_norm": 0.26013311743736267, "learning_rate": 1.4524858425544383e-05, "loss": 1.2224, "step": 12417 }, { "epoch": 3.6986541074852473, "grad_norm": 0.2547445595264435, "learning_rate": 1.4523998231618088e-05, "loss": 1.2296, "step": 12418 }, { "epoch": 3.698951953684916, "grad_norm": 0.3200279474258423, "learning_rate": 1.4523137995601042e-05, "loss": 1.2445, "step": 12419 }, { "epoch": 3.6992497998845844, "grad_norm": 0.272141695022583, "learning_rate": 1.4522277717501249e-05, "loss": 1.2268, "step": 12420 }, { "epoch": 3.699547646084253, "grad_norm": 0.31334221363067627, "learning_rate": 1.4521417397326717e-05, "loss": 1.2357, "step": 12421 }, { "epoch": 3.699845492283922, "grad_norm": 0.29066202044487, "learning_rate": 1.452055703508545e-05, "loss": 1.2365, "step": 12422 }, { "epoch": 3.7001433384835907, "grad_norm": 0.3803086578845978, "learning_rate": 1.4519696630785448e-05, "loss": 1.2171, "step": 12423 }, { "epoch": 3.7004411846832594, "grad_norm": 0.3314111828804016, "learning_rate": 1.4518836184434715e-05, "loss": 1.2319, "step": 12424 }, { "epoch": 3.7007390308829278, "grad_norm": 0.3040827810764313, "learning_rate": 1.4517975696041263e-05, "loss": 1.2384, "step": 12425 }, { "epoch": 3.7010368770825965, "grad_norm": 0.268032044172287, "learning_rate": 1.4517115165613094e-05, "loss": 1.2363, "step": 12426 }, { "epoch": 3.7013347232822653, "grad_norm": 0.3218487501144409, "learning_rate": 1.4516254593158216e-05, "loss": 1.2402, "step": 12427 }, { "epoch": 3.7016325694819336, "grad_norm": 0.26561835408210754, "learning_rate": 1.4515393978684635e-05, "loss": 1.2289, "step": 12428 }, { "epoch": 3.7019304156816024, "grad_norm": 0.30710723996162415, "learning_rate": 1.4514533322200359e-05, "loss": 1.2352, "step": 12429 }, { "epoch": 3.702228261881271, "grad_norm": 0.2447279989719391, "learning_rate": 1.4513672623713395e-05, "loss": 1.2141, "step": 12430 }, { "epoch": 3.7025261080809395, "grad_norm": 0.4446869194507599, "learning_rate": 1.4512811883231748e-05, "loss": 1.2366, "step": 12431 }, { "epoch": 3.7028239542806083, "grad_norm": 0.3563823103904724, "learning_rate": 1.4511951100763428e-05, "loss": 1.2334, "step": 12432 }, { "epoch": 3.703121800480277, "grad_norm": 0.3404117226600647, "learning_rate": 1.4511090276316444e-05, "loss": 1.2464, "step": 12433 }, { "epoch": 3.7034196466799454, "grad_norm": 0.2750464379787445, "learning_rate": 1.4510229409898806e-05, "loss": 1.2399, "step": 12434 }, { "epoch": 3.703717492879614, "grad_norm": 0.4659253656864166, "learning_rate": 1.4509368501518526e-05, "loss": 1.2372, "step": 12435 }, { "epoch": 3.704015339079283, "grad_norm": 0.2898559868335724, "learning_rate": 1.4508507551183606e-05, "loss": 1.2187, "step": 12436 }, { "epoch": 3.7043131852789517, "grad_norm": 0.3510069251060486, "learning_rate": 1.4507646558902062e-05, "loss": 1.2468, "step": 12437 }, { "epoch": 3.7046110314786205, "grad_norm": 0.26242899894714355, "learning_rate": 1.4506785524681898e-05, "loss": 1.2136, "step": 12438 }, { "epoch": 3.704908877678289, "grad_norm": 0.35440772771835327, "learning_rate": 1.4505924448531137e-05, "loss": 1.2418, "step": 12439 }, { "epoch": 3.7052067238779576, "grad_norm": 0.25245949625968933, "learning_rate": 1.450506333045778e-05, "loss": 1.2328, "step": 12440 }, { "epoch": 3.7055045700776263, "grad_norm": 0.27578699588775635, "learning_rate": 1.4504202170469842e-05, "loss": 1.2221, "step": 12441 }, { "epoch": 3.7058024162772947, "grad_norm": 0.31687596440315247, "learning_rate": 1.450334096857534e-05, "loss": 1.2126, "step": 12442 }, { "epoch": 3.7061002624769634, "grad_norm": 0.26394516229629517, "learning_rate": 1.4502479724782272e-05, "loss": 1.2167, "step": 12443 }, { "epoch": 3.706398108676632, "grad_norm": 0.3684418201446533, "learning_rate": 1.4501618439098666e-05, "loss": 1.2383, "step": 12444 }, { "epoch": 3.7066959548763005, "grad_norm": 0.2711924612522125, "learning_rate": 1.450075711153253e-05, "loss": 1.2403, "step": 12445 }, { "epoch": 3.7069938010759693, "grad_norm": 0.3082464039325714, "learning_rate": 1.449989574209188e-05, "loss": 1.2235, "step": 12446 }, { "epoch": 3.707291647275638, "grad_norm": 0.25182807445526123, "learning_rate": 1.4499034330784725e-05, "loss": 1.2322, "step": 12447 }, { "epoch": 3.7075894934753064, "grad_norm": 0.24612084031105042, "learning_rate": 1.4498172877619083e-05, "loss": 1.2261, "step": 12448 }, { "epoch": 3.707887339674975, "grad_norm": 0.44000059366226196, "learning_rate": 1.4497311382602967e-05, "loss": 1.25, "step": 12449 }, { "epoch": 3.708185185874644, "grad_norm": 0.2966899275779724, "learning_rate": 1.4496449845744393e-05, "loss": 1.2317, "step": 12450 }, { "epoch": 3.7084830320743127, "grad_norm": 0.3248160481452942, "learning_rate": 1.449558826705138e-05, "loss": 1.2416, "step": 12451 }, { "epoch": 3.7087808782739815, "grad_norm": 0.2693547010421753, "learning_rate": 1.4494726646531936e-05, "loss": 1.2145, "step": 12452 }, { "epoch": 3.70907872447365, "grad_norm": 0.6153912544250488, "learning_rate": 1.4493864984194089e-05, "loss": 1.2287, "step": 12453 }, { "epoch": 3.7093765706733186, "grad_norm": 0.3109544813632965, "learning_rate": 1.4493003280045843e-05, "loss": 1.2349, "step": 12454 }, { "epoch": 3.7096744168729874, "grad_norm": 0.26281970739364624, "learning_rate": 1.4492141534095222e-05, "loss": 1.2342, "step": 12455 }, { "epoch": 3.7099722630726557, "grad_norm": 0.26653775572776794, "learning_rate": 1.4491279746350246e-05, "loss": 1.2257, "step": 12456 }, { "epoch": 3.7102701092723245, "grad_norm": 0.23921839892864227, "learning_rate": 1.4490417916818929e-05, "loss": 1.2122, "step": 12457 }, { "epoch": 3.7105679554719933, "grad_norm": 0.24738293886184692, "learning_rate": 1.4489556045509287e-05, "loss": 1.2429, "step": 12458 }, { "epoch": 3.7108658016716616, "grad_norm": 0.24981005489826202, "learning_rate": 1.4488694132429348e-05, "loss": 1.2227, "step": 12459 }, { "epoch": 3.7111636478713304, "grad_norm": 0.2284761369228363, "learning_rate": 1.4487832177587121e-05, "loss": 1.2197, "step": 12460 }, { "epoch": 3.711461494070999, "grad_norm": 0.24517232179641724, "learning_rate": 1.4486970180990629e-05, "loss": 1.2362, "step": 12461 }, { "epoch": 3.711759340270668, "grad_norm": 0.2507041096687317, "learning_rate": 1.4486108142647894e-05, "loss": 1.2249, "step": 12462 }, { "epoch": 3.7120571864703362, "grad_norm": 0.2416991889476776, "learning_rate": 1.4485246062566938e-05, "loss": 1.231, "step": 12463 }, { "epoch": 3.712355032670005, "grad_norm": 0.246464803814888, "learning_rate": 1.4484383940755776e-05, "loss": 1.2286, "step": 12464 }, { "epoch": 3.7126528788696738, "grad_norm": 0.23346826434135437, "learning_rate": 1.4483521777222428e-05, "loss": 1.2312, "step": 12465 }, { "epoch": 3.7129507250693425, "grad_norm": 0.23480015993118286, "learning_rate": 1.4482659571974924e-05, "loss": 1.2303, "step": 12466 }, { "epoch": 3.713248571269011, "grad_norm": 0.23757559061050415, "learning_rate": 1.4481797325021281e-05, "loss": 1.219, "step": 12467 }, { "epoch": 3.7135464174686796, "grad_norm": 0.25187939405441284, "learning_rate": 1.4480935036369519e-05, "loss": 1.2272, "step": 12468 }, { "epoch": 3.7138442636683484, "grad_norm": 0.2528476417064667, "learning_rate": 1.4480072706027661e-05, "loss": 1.2181, "step": 12469 }, { "epoch": 3.7141421098680167, "grad_norm": 0.25194936990737915, "learning_rate": 1.4479210334003737e-05, "loss": 1.2198, "step": 12470 }, { "epoch": 3.7144399560676855, "grad_norm": 0.23710715770721436, "learning_rate": 1.4478347920305761e-05, "loss": 1.2315, "step": 12471 }, { "epoch": 3.7147378022673543, "grad_norm": 0.2685161828994751, "learning_rate": 1.4477485464941766e-05, "loss": 1.2192, "step": 12472 }, { "epoch": 3.7150356484670226, "grad_norm": 0.23555238544940948, "learning_rate": 1.4476622967919766e-05, "loss": 1.226, "step": 12473 }, { "epoch": 3.7153334946666914, "grad_norm": 0.24198657274246216, "learning_rate": 1.4475760429247793e-05, "loss": 1.2354, "step": 12474 }, { "epoch": 3.71563134086636, "grad_norm": 0.2961115539073944, "learning_rate": 1.4474897848933872e-05, "loss": 1.2279, "step": 12475 }, { "epoch": 3.715929187066029, "grad_norm": 0.33702465891838074, "learning_rate": 1.4474035226986025e-05, "loss": 1.2342, "step": 12476 }, { "epoch": 3.7162270332656977, "grad_norm": 0.23954392969608307, "learning_rate": 1.4473172563412277e-05, "loss": 1.2345, "step": 12477 }, { "epoch": 3.716524879465366, "grad_norm": 0.28280097246170044, "learning_rate": 1.4472309858220657e-05, "loss": 1.2304, "step": 12478 }, { "epoch": 3.716822725665035, "grad_norm": 0.24391360580921173, "learning_rate": 1.4471447111419189e-05, "loss": 1.2194, "step": 12479 }, { "epoch": 3.7171205718647036, "grad_norm": 0.28991103172302246, "learning_rate": 1.4470584323015904e-05, "loss": 1.2343, "step": 12480 }, { "epoch": 3.717418418064372, "grad_norm": 0.26414933800697327, "learning_rate": 1.4469721493018827e-05, "loss": 1.2309, "step": 12481 }, { "epoch": 3.7177162642640407, "grad_norm": 0.33833932876586914, "learning_rate": 1.4468858621435984e-05, "loss": 1.2396, "step": 12482 }, { "epoch": 3.7180141104637094, "grad_norm": 0.2547779381275177, "learning_rate": 1.4467995708275404e-05, "loss": 1.221, "step": 12483 }, { "epoch": 3.7183119566633778, "grad_norm": 0.28853312134742737, "learning_rate": 1.4467132753545114e-05, "loss": 1.2413, "step": 12484 }, { "epoch": 3.7186098028630465, "grad_norm": 0.2745019793510437, "learning_rate": 1.4466269757253148e-05, "loss": 1.2235, "step": 12485 }, { "epoch": 3.7189076490627153, "grad_norm": 0.24493719637393951, "learning_rate": 1.446540671940753e-05, "loss": 1.2381, "step": 12486 }, { "epoch": 3.7192054952623836, "grad_norm": 0.29772064089775085, "learning_rate": 1.4464543640016295e-05, "loss": 1.2106, "step": 12487 }, { "epoch": 3.7195033414620524, "grad_norm": 0.2681344747543335, "learning_rate": 1.4463680519087466e-05, "loss": 1.2276, "step": 12488 }, { "epoch": 3.719801187661721, "grad_norm": 0.3461523652076721, "learning_rate": 1.446281735662908e-05, "loss": 1.2286, "step": 12489 }, { "epoch": 3.72009903386139, "grad_norm": 0.24688516557216644, "learning_rate": 1.446195415264916e-05, "loss": 1.2279, "step": 12490 }, { "epoch": 3.7203968800610587, "grad_norm": 0.2979736626148224, "learning_rate": 1.4461090907155746e-05, "loss": 1.2172, "step": 12491 }, { "epoch": 3.720694726260727, "grad_norm": 0.25735822319984436, "learning_rate": 1.4460227620156864e-05, "loss": 1.2184, "step": 12492 }, { "epoch": 3.720992572460396, "grad_norm": 0.31744399666786194, "learning_rate": 1.4459364291660548e-05, "loss": 1.219, "step": 12493 }, { "epoch": 3.7212904186600646, "grad_norm": 0.2509385943412781, "learning_rate": 1.4458500921674828e-05, "loss": 1.2401, "step": 12494 }, { "epoch": 3.721588264859733, "grad_norm": 0.26265573501586914, "learning_rate": 1.4457637510207738e-05, "loss": 1.2403, "step": 12495 }, { "epoch": 3.7218861110594017, "grad_norm": 0.2772224247455597, "learning_rate": 1.445677405726731e-05, "loss": 1.2316, "step": 12496 }, { "epoch": 3.7221839572590705, "grad_norm": 0.26284000277519226, "learning_rate": 1.4455910562861583e-05, "loss": 1.2316, "step": 12497 }, { "epoch": 3.722481803458739, "grad_norm": 0.25751689076423645, "learning_rate": 1.4455047026998584e-05, "loss": 1.2318, "step": 12498 }, { "epoch": 3.7227796496584076, "grad_norm": 0.29437172412872314, "learning_rate": 1.4454183449686349e-05, "loss": 1.2412, "step": 12499 }, { "epoch": 3.7230774958580763, "grad_norm": 0.2750532627105713, "learning_rate": 1.4453319830932917e-05, "loss": 1.2396, "step": 12500 }, { "epoch": 3.7230774958580763, "eval_loss": 1.3298418521881104, "eval_runtime": 23.7313, "eval_samples_per_second": 73.068, "eval_steps_per_second": 4.593, "step": 12500 }, { "epoch": 3.7233753420577447, "grad_norm": 0.25021812319755554, "learning_rate": 1.4452456170746318e-05, "loss": 1.2211, "step": 12501 }, { "epoch": 3.7236731882574134, "grad_norm": 0.25232037901878357, "learning_rate": 1.4451592469134585e-05, "loss": 1.2192, "step": 12502 }, { "epoch": 3.723971034457082, "grad_norm": 0.25040093064308167, "learning_rate": 1.4450728726105762e-05, "loss": 1.2305, "step": 12503 }, { "epoch": 3.724268880656751, "grad_norm": 0.2766180634498596, "learning_rate": 1.4449864941667881e-05, "loss": 1.2185, "step": 12504 }, { "epoch": 3.7245667268564198, "grad_norm": 0.2391878068447113, "learning_rate": 1.4449001115828978e-05, "loss": 1.2395, "step": 12505 }, { "epoch": 3.724864573056088, "grad_norm": 0.24679675698280334, "learning_rate": 1.444813724859709e-05, "loss": 1.2331, "step": 12506 }, { "epoch": 3.725162419255757, "grad_norm": 0.2730211317539215, "learning_rate": 1.4447273339980254e-05, "loss": 1.2278, "step": 12507 }, { "epoch": 3.7254602654554256, "grad_norm": 0.2508825957775116, "learning_rate": 1.4446409389986506e-05, "loss": 1.2191, "step": 12508 }, { "epoch": 3.725758111655094, "grad_norm": 0.39348164200782776, "learning_rate": 1.444554539862389e-05, "loss": 1.2394, "step": 12509 }, { "epoch": 3.7260559578547627, "grad_norm": 0.35554832220077515, "learning_rate": 1.444468136590044e-05, "loss": 1.2251, "step": 12510 }, { "epoch": 3.7263538040544315, "grad_norm": 0.2719554305076599, "learning_rate": 1.4443817291824198e-05, "loss": 1.2342, "step": 12511 }, { "epoch": 3.7266516502541, "grad_norm": 0.28637775778770447, "learning_rate": 1.4442953176403199e-05, "loss": 1.2306, "step": 12512 }, { "epoch": 3.7269494964537686, "grad_norm": 0.2555995583534241, "learning_rate": 1.4442089019645484e-05, "loss": 1.2222, "step": 12513 }, { "epoch": 3.7272473426534374, "grad_norm": 0.24985826015472412, "learning_rate": 1.4441224821559093e-05, "loss": 1.2251, "step": 12514 }, { "epoch": 3.7275451888531057, "grad_norm": 0.26199600100517273, "learning_rate": 1.4440360582152069e-05, "loss": 1.223, "step": 12515 }, { "epoch": 3.7278430350527745, "grad_norm": 0.24520473182201385, "learning_rate": 1.443949630143245e-05, "loss": 1.2128, "step": 12516 }, { "epoch": 3.7281408812524433, "grad_norm": 0.28552722930908203, "learning_rate": 1.443863197940828e-05, "loss": 1.2402, "step": 12517 }, { "epoch": 3.728438727452112, "grad_norm": 0.2536368668079376, "learning_rate": 1.4437767616087595e-05, "loss": 1.2406, "step": 12518 }, { "epoch": 3.728736573651781, "grad_norm": 0.2644293010234833, "learning_rate": 1.4436903211478442e-05, "loss": 1.2252, "step": 12519 }, { "epoch": 3.729034419851449, "grad_norm": 0.24547292292118073, "learning_rate": 1.443603876558886e-05, "loss": 1.2244, "step": 12520 }, { "epoch": 3.729332266051118, "grad_norm": 0.27538245916366577, "learning_rate": 1.4435174278426898e-05, "loss": 1.2258, "step": 12521 }, { "epoch": 3.7296301122507867, "grad_norm": 0.34969258308410645, "learning_rate": 1.4434309750000593e-05, "loss": 1.2248, "step": 12522 }, { "epoch": 3.729927958450455, "grad_norm": 0.3113882839679718, "learning_rate": 1.4433445180317986e-05, "loss": 1.2179, "step": 12523 }, { "epoch": 3.7302258046501238, "grad_norm": 0.26758047938346863, "learning_rate": 1.4432580569387131e-05, "loss": 1.2318, "step": 12524 }, { "epoch": 3.7305236508497925, "grad_norm": 0.285544753074646, "learning_rate": 1.4431715917216063e-05, "loss": 1.2255, "step": 12525 }, { "epoch": 3.730821497049461, "grad_norm": 0.32492971420288086, "learning_rate": 1.4430851223812827e-05, "loss": 1.2467, "step": 12526 }, { "epoch": 3.7311193432491296, "grad_norm": 0.24724063277244568, "learning_rate": 1.4429986489185475e-05, "loss": 1.2159, "step": 12527 }, { "epoch": 3.7314171894487984, "grad_norm": 0.2818030118942261, "learning_rate": 1.4429121713342049e-05, "loss": 1.2311, "step": 12528 }, { "epoch": 3.731715035648467, "grad_norm": 0.24205012619495392, "learning_rate": 1.4428256896290592e-05, "loss": 1.21, "step": 12529 }, { "epoch": 3.7320128818481355, "grad_norm": 0.3191501498222351, "learning_rate": 1.4427392038039152e-05, "loss": 1.2321, "step": 12530 }, { "epoch": 3.7323107280478043, "grad_norm": 0.2558881342411041, "learning_rate": 1.4426527138595773e-05, "loss": 1.2289, "step": 12531 }, { "epoch": 3.732608574247473, "grad_norm": 0.25629472732543945, "learning_rate": 1.4425662197968509e-05, "loss": 1.2254, "step": 12532 }, { "epoch": 3.732906420447142, "grad_norm": 0.26400554180145264, "learning_rate": 1.44247972161654e-05, "loss": 1.2387, "step": 12533 }, { "epoch": 3.73320426664681, "grad_norm": 0.2637544572353363, "learning_rate": 1.44239321931945e-05, "loss": 1.2402, "step": 12534 }, { "epoch": 3.733502112846479, "grad_norm": 0.2510688602924347, "learning_rate": 1.4423067129063852e-05, "loss": 1.2342, "step": 12535 }, { "epoch": 3.7337999590461477, "grad_norm": 0.2486250400543213, "learning_rate": 1.4422202023781506e-05, "loss": 1.2252, "step": 12536 }, { "epoch": 3.734097805245816, "grad_norm": 0.2711244225502014, "learning_rate": 1.442133687735551e-05, "loss": 1.2361, "step": 12537 }, { "epoch": 3.734395651445485, "grad_norm": 0.25015541911125183, "learning_rate": 1.4420471689793914e-05, "loss": 1.2135, "step": 12538 }, { "epoch": 3.7346934976451536, "grad_norm": 0.2576945722103119, "learning_rate": 1.4419606461104767e-05, "loss": 1.2207, "step": 12539 }, { "epoch": 3.734991343844822, "grad_norm": 0.27408546209335327, "learning_rate": 1.4418741191296122e-05, "loss": 1.2217, "step": 12540 }, { "epoch": 3.7352891900444907, "grad_norm": 0.32182052731513977, "learning_rate": 1.441787588037603e-05, "loss": 1.2415, "step": 12541 }, { "epoch": 3.7355870362441594, "grad_norm": 0.24851801991462708, "learning_rate": 1.4417010528352534e-05, "loss": 1.2456, "step": 12542 }, { "epoch": 3.735884882443828, "grad_norm": 0.33332735300064087, "learning_rate": 1.4416145135233688e-05, "loss": 1.2236, "step": 12543 }, { "epoch": 3.736182728643497, "grad_norm": 0.28403955698013306, "learning_rate": 1.4415279701027549e-05, "loss": 1.2269, "step": 12544 }, { "epoch": 3.7364805748431653, "grad_norm": 0.2707551121711731, "learning_rate": 1.4414414225742167e-05, "loss": 1.2546, "step": 12545 }, { "epoch": 3.736778421042834, "grad_norm": 0.29916614294052124, "learning_rate": 1.441354870938559e-05, "loss": 1.218, "step": 12546 }, { "epoch": 3.737076267242503, "grad_norm": 0.24943292140960693, "learning_rate": 1.4412683151965875e-05, "loss": 1.2347, "step": 12547 }, { "epoch": 3.737374113442171, "grad_norm": 0.35270532965660095, "learning_rate": 1.4411817553491074e-05, "loss": 1.2319, "step": 12548 }, { "epoch": 3.73767195964184, "grad_norm": 0.26501792669296265, "learning_rate": 1.4410951913969235e-05, "loss": 1.2242, "step": 12549 }, { "epoch": 3.7379698058415087, "grad_norm": 0.3242575526237488, "learning_rate": 1.4410086233408422e-05, "loss": 1.2172, "step": 12550 }, { "epoch": 3.738267652041177, "grad_norm": 0.3066577911376953, "learning_rate": 1.4409220511816684e-05, "loss": 1.2276, "step": 12551 }, { "epoch": 3.738565498240846, "grad_norm": 0.26140496134757996, "learning_rate": 1.4408354749202072e-05, "loss": 1.2383, "step": 12552 }, { "epoch": 3.7388633444405146, "grad_norm": 0.23789289593696594, "learning_rate": 1.4407488945572645e-05, "loss": 1.2288, "step": 12553 }, { "epoch": 3.739161190640183, "grad_norm": 0.2502937614917755, "learning_rate": 1.4406623100936459e-05, "loss": 1.2389, "step": 12554 }, { "epoch": 3.7394590368398517, "grad_norm": 0.24868594110012054, "learning_rate": 1.4405757215301568e-05, "loss": 1.2307, "step": 12555 }, { "epoch": 3.7397568830395205, "grad_norm": 0.2607230246067047, "learning_rate": 1.4404891288676029e-05, "loss": 1.222, "step": 12556 }, { "epoch": 3.7400547292391892, "grad_norm": 0.30206650495529175, "learning_rate": 1.4404025321067896e-05, "loss": 1.2189, "step": 12557 }, { "epoch": 3.740352575438858, "grad_norm": 0.2802787125110626, "learning_rate": 1.440315931248523e-05, "loss": 1.2539, "step": 12558 }, { "epoch": 3.7406504216385263, "grad_norm": 0.26320210099220276, "learning_rate": 1.4402293262936086e-05, "loss": 1.2384, "step": 12559 }, { "epoch": 3.740948267838195, "grad_norm": 0.24549047648906708, "learning_rate": 1.4401427172428521e-05, "loss": 1.225, "step": 12560 }, { "epoch": 3.741246114037864, "grad_norm": 0.2524031698703766, "learning_rate": 1.4400561040970591e-05, "loss": 1.2515, "step": 12561 }, { "epoch": 3.741543960237532, "grad_norm": 0.23625604808330536, "learning_rate": 1.4399694868570361e-05, "loss": 1.2227, "step": 12562 }, { "epoch": 3.741841806437201, "grad_norm": 0.2702176570892334, "learning_rate": 1.4398828655235886e-05, "loss": 1.2344, "step": 12563 }, { "epoch": 3.7421396526368698, "grad_norm": 0.27187982201576233, "learning_rate": 1.4397962400975222e-05, "loss": 1.237, "step": 12564 }, { "epoch": 3.742437498836538, "grad_norm": 0.338758647441864, "learning_rate": 1.4397096105796435e-05, "loss": 1.2255, "step": 12565 }, { "epoch": 3.742735345036207, "grad_norm": 0.29826152324676514, "learning_rate": 1.4396229769707581e-05, "loss": 1.227, "step": 12566 }, { "epoch": 3.7430331912358756, "grad_norm": 0.29730749130249023, "learning_rate": 1.4395363392716717e-05, "loss": 1.222, "step": 12567 }, { "epoch": 3.743331037435544, "grad_norm": 0.27890071272850037, "learning_rate": 1.4394496974831911e-05, "loss": 1.2301, "step": 12568 }, { "epoch": 3.7436288836352127, "grad_norm": 0.2715962827205658, "learning_rate": 1.439363051606122e-05, "loss": 1.2369, "step": 12569 }, { "epoch": 3.7439267298348815, "grad_norm": 0.3789243996143341, "learning_rate": 1.4392764016412705e-05, "loss": 1.2358, "step": 12570 }, { "epoch": 3.7442245760345503, "grad_norm": 0.3870760500431061, "learning_rate": 1.439189747589443e-05, "loss": 1.2183, "step": 12571 }, { "epoch": 3.744522422234219, "grad_norm": 0.31367406249046326, "learning_rate": 1.4391030894514453e-05, "loss": 1.2339, "step": 12572 }, { "epoch": 3.7448202684338874, "grad_norm": 0.8676227927207947, "learning_rate": 1.4390164272280841e-05, "loss": 1.2247, "step": 12573 }, { "epoch": 3.745118114633556, "grad_norm": 0.29327961802482605, "learning_rate": 1.4389297609201655e-05, "loss": 1.2265, "step": 12574 }, { "epoch": 3.745415960833225, "grad_norm": 0.2783328890800476, "learning_rate": 1.4388430905284963e-05, "loss": 1.2395, "step": 12575 }, { "epoch": 3.7457138070328932, "grad_norm": 0.2528507709503174, "learning_rate": 1.4387564160538821e-05, "loss": 1.2494, "step": 12576 }, { "epoch": 3.746011653232562, "grad_norm": 0.24488899111747742, "learning_rate": 1.4386697374971297e-05, "loss": 1.2313, "step": 12577 }, { "epoch": 3.746309499432231, "grad_norm": 0.24234485626220703, "learning_rate": 1.4385830548590454e-05, "loss": 1.2361, "step": 12578 }, { "epoch": 3.746607345631899, "grad_norm": 0.24857281148433685, "learning_rate": 1.4384963681404358e-05, "loss": 1.2244, "step": 12579 }, { "epoch": 3.746905191831568, "grad_norm": 0.2506444454193115, "learning_rate": 1.4384096773421074e-05, "loss": 1.222, "step": 12580 }, { "epoch": 3.7472030380312367, "grad_norm": 0.24715283513069153, "learning_rate": 1.4383229824648668e-05, "loss": 1.2336, "step": 12581 }, { "epoch": 3.747500884230905, "grad_norm": 0.24358782172203064, "learning_rate": 1.4382362835095208e-05, "loss": 1.215, "step": 12582 }, { "epoch": 3.7477987304305738, "grad_norm": 0.24742436408996582, "learning_rate": 1.4381495804768757e-05, "loss": 1.2207, "step": 12583 }, { "epoch": 3.7480965766302425, "grad_norm": 0.2504050135612488, "learning_rate": 1.4380628733677382e-05, "loss": 1.241, "step": 12584 }, { "epoch": 3.7483944228299113, "grad_norm": 0.24546143412590027, "learning_rate": 1.4379761621829147e-05, "loss": 1.2237, "step": 12585 }, { "epoch": 3.74869226902958, "grad_norm": 0.23761746287345886, "learning_rate": 1.437889446923213e-05, "loss": 1.2434, "step": 12586 }, { "epoch": 3.7489901152292484, "grad_norm": 0.2457745224237442, "learning_rate": 1.437802727589439e-05, "loss": 1.2294, "step": 12587 }, { "epoch": 3.749287961428917, "grad_norm": 0.23757010698318481, "learning_rate": 1.4377160041823996e-05, "loss": 1.2546, "step": 12588 }, { "epoch": 3.749585807628586, "grad_norm": 0.2490760087966919, "learning_rate": 1.4376292767029023e-05, "loss": 1.2185, "step": 12589 }, { "epoch": 3.7498836538282543, "grad_norm": 0.25601309537887573, "learning_rate": 1.437542545151753e-05, "loss": 1.2462, "step": 12590 }, { "epoch": 3.750181500027923, "grad_norm": 0.23233304917812347, "learning_rate": 1.4374558095297593e-05, "loss": 1.2206, "step": 12591 }, { "epoch": 3.750479346227592, "grad_norm": 0.24676714837551117, "learning_rate": 1.4373690698377283e-05, "loss": 1.2526, "step": 12592 }, { "epoch": 3.75077719242726, "grad_norm": 0.2371000498533249, "learning_rate": 1.4372823260764665e-05, "loss": 1.2373, "step": 12593 }, { "epoch": 3.751075038626929, "grad_norm": 0.24489223957061768, "learning_rate": 1.4371955782467813e-05, "loss": 1.2283, "step": 12594 }, { "epoch": 3.7513728848265977, "grad_norm": 0.23973029851913452, "learning_rate": 1.4371088263494797e-05, "loss": 1.2308, "step": 12595 }, { "epoch": 3.7516707310262665, "grad_norm": 0.24719129502773285, "learning_rate": 1.4370220703853688e-05, "loss": 1.2271, "step": 12596 }, { "epoch": 3.751968577225935, "grad_norm": 0.23591454327106476, "learning_rate": 1.436935310355256e-05, "loss": 1.2325, "step": 12597 }, { "epoch": 3.7522664234256036, "grad_norm": 0.23700574040412903, "learning_rate": 1.4368485462599479e-05, "loss": 1.2377, "step": 12598 }, { "epoch": 3.7525642696252723, "grad_norm": 0.2341172993183136, "learning_rate": 1.4367617781002525e-05, "loss": 1.242, "step": 12599 }, { "epoch": 3.752862115824941, "grad_norm": 0.24162684381008148, "learning_rate": 1.4366750058769768e-05, "loss": 1.2362, "step": 12600 }, { "epoch": 3.7531599620246094, "grad_norm": 0.23593008518218994, "learning_rate": 1.4365882295909278e-05, "loss": 1.2418, "step": 12601 }, { "epoch": 3.753457808224278, "grad_norm": 0.24879764020442963, "learning_rate": 1.4365014492429132e-05, "loss": 1.2363, "step": 12602 }, { "epoch": 3.753755654423947, "grad_norm": 0.24039660394191742, "learning_rate": 1.4364146648337403e-05, "loss": 1.2264, "step": 12603 }, { "epoch": 3.7540535006236153, "grad_norm": 0.24104157090187073, "learning_rate": 1.4363278763642164e-05, "loss": 1.2338, "step": 12604 }, { "epoch": 3.754351346823284, "grad_norm": 0.251393586397171, "learning_rate": 1.4362410838351492e-05, "loss": 1.247, "step": 12605 }, { "epoch": 3.754649193022953, "grad_norm": 0.23401515185832977, "learning_rate": 1.4361542872473464e-05, "loss": 1.2235, "step": 12606 }, { "epoch": 3.754947039222621, "grad_norm": 0.23661254346370697, "learning_rate": 1.4360674866016152e-05, "loss": 1.236, "step": 12607 }, { "epoch": 3.75524488542229, "grad_norm": 0.22960497438907623, "learning_rate": 1.4359806818987628e-05, "loss": 1.2149, "step": 12608 }, { "epoch": 3.7555427316219587, "grad_norm": 0.23821663856506348, "learning_rate": 1.4358938731395975e-05, "loss": 1.2205, "step": 12609 }, { "epoch": 3.7558405778216275, "grad_norm": 0.23379720747470856, "learning_rate": 1.4358070603249267e-05, "loss": 1.2346, "step": 12610 }, { "epoch": 3.7561384240212963, "grad_norm": 0.23535868525505066, "learning_rate": 1.4357202434555582e-05, "loss": 1.22, "step": 12611 }, { "epoch": 3.7564362702209646, "grad_norm": 0.2433944195508957, "learning_rate": 1.4356334225322997e-05, "loss": 1.2182, "step": 12612 }, { "epoch": 3.7567341164206334, "grad_norm": 0.2404908686876297, "learning_rate": 1.4355465975559586e-05, "loss": 1.2302, "step": 12613 }, { "epoch": 3.757031962620302, "grad_norm": 0.24126465618610382, "learning_rate": 1.4354597685273432e-05, "loss": 1.2345, "step": 12614 }, { "epoch": 3.7573298088199705, "grad_norm": 0.23302330076694489, "learning_rate": 1.4353729354472611e-05, "loss": 1.2318, "step": 12615 }, { "epoch": 3.7576276550196392, "grad_norm": 0.24231773614883423, "learning_rate": 1.435286098316521e-05, "loss": 1.2303, "step": 12616 }, { "epoch": 3.757925501219308, "grad_norm": 0.2433927059173584, "learning_rate": 1.4351992571359292e-05, "loss": 1.2218, "step": 12617 }, { "epoch": 3.7582233474189763, "grad_norm": 0.24965760111808777, "learning_rate": 1.4351124119062949e-05, "loss": 1.223, "step": 12618 }, { "epoch": 3.758521193618645, "grad_norm": 0.23461966216564178, "learning_rate": 1.4350255626284254e-05, "loss": 1.2291, "step": 12619 }, { "epoch": 3.758819039818314, "grad_norm": 0.23923666775226593, "learning_rate": 1.4349387093031295e-05, "loss": 1.2447, "step": 12620 }, { "epoch": 3.759116886017982, "grad_norm": 0.24339085817337036, "learning_rate": 1.4348518519312147e-05, "loss": 1.2252, "step": 12621 }, { "epoch": 3.759414732217651, "grad_norm": 0.2397315949201584, "learning_rate": 1.4347649905134893e-05, "loss": 1.2248, "step": 12622 }, { "epoch": 3.7597125784173198, "grad_norm": 0.23497259616851807, "learning_rate": 1.4346781250507615e-05, "loss": 1.242, "step": 12623 }, { "epoch": 3.7600104246169885, "grad_norm": 0.24975162744522095, "learning_rate": 1.4345912555438393e-05, "loss": 1.2323, "step": 12624 }, { "epoch": 3.7603082708166573, "grad_norm": 0.23598866164684296, "learning_rate": 1.434504381993531e-05, "loss": 1.2245, "step": 12625 }, { "epoch": 3.7606061170163256, "grad_norm": 0.23897448182106018, "learning_rate": 1.4344175044006445e-05, "loss": 1.2249, "step": 12626 }, { "epoch": 3.7609039632159944, "grad_norm": 0.23851969838142395, "learning_rate": 1.434330622765989e-05, "loss": 1.2364, "step": 12627 }, { "epoch": 3.761201809415663, "grad_norm": 0.23931705951690674, "learning_rate": 1.4342437370903723e-05, "loss": 1.2207, "step": 12628 }, { "epoch": 3.7614996556153315, "grad_norm": 0.24956797063350677, "learning_rate": 1.4341568473746026e-05, "loss": 1.2324, "step": 12629 }, { "epoch": 3.7617975018150003, "grad_norm": 0.2446548491716385, "learning_rate": 1.4340699536194887e-05, "loss": 1.2255, "step": 12630 }, { "epoch": 3.762095348014669, "grad_norm": 0.23512233793735504, "learning_rate": 1.4339830558258385e-05, "loss": 1.2323, "step": 12631 }, { "epoch": 3.7623931942143374, "grad_norm": 0.2424010932445526, "learning_rate": 1.4338961539944611e-05, "loss": 1.2293, "step": 12632 }, { "epoch": 3.762691040414006, "grad_norm": 0.2611819803714752, "learning_rate": 1.4338092481261648e-05, "loss": 1.2458, "step": 12633 }, { "epoch": 3.762988886613675, "grad_norm": 0.23570644855499268, "learning_rate": 1.4337223382217582e-05, "loss": 1.2046, "step": 12634 }, { "epoch": 3.7632867328133432, "grad_norm": 0.24657092988491058, "learning_rate": 1.4336354242820497e-05, "loss": 1.2216, "step": 12635 }, { "epoch": 3.763584579013012, "grad_norm": 0.2414436936378479, "learning_rate": 1.4335485063078478e-05, "loss": 1.2171, "step": 12636 }, { "epoch": 3.763882425212681, "grad_norm": 0.26084816455841064, "learning_rate": 1.4334615842999618e-05, "loss": 1.2376, "step": 12637 }, { "epoch": 3.7641802714123496, "grad_norm": 0.25345945358276367, "learning_rate": 1.4333746582592e-05, "loss": 1.2395, "step": 12638 }, { "epoch": 3.7644781176120183, "grad_norm": 0.24163250625133514, "learning_rate": 1.4332877281863708e-05, "loss": 1.2292, "step": 12639 }, { "epoch": 3.7647759638116867, "grad_norm": 0.24772830307483673, "learning_rate": 1.433200794082284e-05, "loss": 1.22, "step": 12640 }, { "epoch": 3.7650738100113554, "grad_norm": 0.24569171667099, "learning_rate": 1.4331138559477476e-05, "loss": 1.2263, "step": 12641 }, { "epoch": 3.765371656211024, "grad_norm": 0.25932878255844116, "learning_rate": 1.4330269137835706e-05, "loss": 1.2397, "step": 12642 }, { "epoch": 3.7656695024106925, "grad_norm": 0.26829370856285095, "learning_rate": 1.4329399675905618e-05, "loss": 1.22, "step": 12643 }, { "epoch": 3.7659673486103613, "grad_norm": 0.23489348590373993, "learning_rate": 1.4328530173695306e-05, "loss": 1.2226, "step": 12644 }, { "epoch": 3.76626519481003, "grad_norm": 0.24736730754375458, "learning_rate": 1.4327660631212856e-05, "loss": 1.2418, "step": 12645 }, { "epoch": 3.7665630410096984, "grad_norm": 0.23377391695976257, "learning_rate": 1.4326791048466358e-05, "loss": 1.2251, "step": 12646 }, { "epoch": 3.766860887209367, "grad_norm": 0.2361942082643509, "learning_rate": 1.4325921425463904e-05, "loss": 1.221, "step": 12647 }, { "epoch": 3.767158733409036, "grad_norm": 0.24728500843048096, "learning_rate": 1.4325051762213586e-05, "loss": 1.2502, "step": 12648 }, { "epoch": 3.7674565796087043, "grad_norm": 0.25355592370033264, "learning_rate": 1.432418205872349e-05, "loss": 1.2395, "step": 12649 }, { "epoch": 3.767754425808373, "grad_norm": 0.2558923363685608, "learning_rate": 1.4323312315001714e-05, "loss": 1.2358, "step": 12650 }, { "epoch": 3.768052272008042, "grad_norm": 0.3136868178844452, "learning_rate": 1.4322442531056346e-05, "loss": 1.2226, "step": 12651 }, { "epoch": 3.7683501182077106, "grad_norm": 0.26222771406173706, "learning_rate": 1.4321572706895482e-05, "loss": 1.2319, "step": 12652 }, { "epoch": 3.7686479644073794, "grad_norm": 0.2818720042705536, "learning_rate": 1.432070284252721e-05, "loss": 1.2479, "step": 12653 }, { "epoch": 3.7689458106070477, "grad_norm": 0.27556848526000977, "learning_rate": 1.4319832937959626e-05, "loss": 1.2295, "step": 12654 }, { "epoch": 3.7692436568067165, "grad_norm": 0.26749783754348755, "learning_rate": 1.4318962993200821e-05, "loss": 1.2296, "step": 12655 }, { "epoch": 3.7695415030063852, "grad_norm": 0.2664186656475067, "learning_rate": 1.4318093008258892e-05, "loss": 1.2293, "step": 12656 }, { "epoch": 3.7698393492060536, "grad_norm": 0.2746717035770416, "learning_rate": 1.4317222983141934e-05, "loss": 1.2295, "step": 12657 }, { "epoch": 3.7701371954057223, "grad_norm": 0.2837357521057129, "learning_rate": 1.4316352917858038e-05, "loss": 1.2262, "step": 12658 }, { "epoch": 3.770435041605391, "grad_norm": 0.27733665704727173, "learning_rate": 1.43154828124153e-05, "loss": 1.2141, "step": 12659 }, { "epoch": 3.7707328878050594, "grad_norm": 0.2695692777633667, "learning_rate": 1.4314612666821817e-05, "loss": 1.2299, "step": 12660 }, { "epoch": 3.771030734004728, "grad_norm": 0.2996913492679596, "learning_rate": 1.4313742481085684e-05, "loss": 1.2332, "step": 12661 }, { "epoch": 3.771328580204397, "grad_norm": 0.31992167234420776, "learning_rate": 1.4312872255214996e-05, "loss": 1.2174, "step": 12662 }, { "epoch": 3.7716264264040658, "grad_norm": 0.2649223208427429, "learning_rate": 1.431200198921785e-05, "loss": 1.2314, "step": 12663 }, { "epoch": 3.771924272603734, "grad_norm": 0.3016444146633148, "learning_rate": 1.4311131683102347e-05, "loss": 1.2264, "step": 12664 }, { "epoch": 3.772222118803403, "grad_norm": 0.3113537132740021, "learning_rate": 1.4310261336876576e-05, "loss": 1.2258, "step": 12665 }, { "epoch": 3.7725199650030716, "grad_norm": 0.3229823708534241, "learning_rate": 1.4309390950548639e-05, "loss": 1.2267, "step": 12666 }, { "epoch": 3.7728178112027404, "grad_norm": 0.2772013247013092, "learning_rate": 1.4308520524126634e-05, "loss": 1.2367, "step": 12667 }, { "epoch": 3.7731156574024087, "grad_norm": 0.2575990855693817, "learning_rate": 1.430765005761866e-05, "loss": 1.2111, "step": 12668 }, { "epoch": 3.7734135036020775, "grad_norm": 0.2997469902038574, "learning_rate": 1.4306779551032817e-05, "loss": 1.2358, "step": 12669 }, { "epoch": 3.7737113498017463, "grad_norm": 0.28576475381851196, "learning_rate": 1.4305909004377198e-05, "loss": 1.2455, "step": 12670 }, { "epoch": 3.7740091960014146, "grad_norm": 0.2684483528137207, "learning_rate": 1.4305038417659912e-05, "loss": 1.2355, "step": 12671 }, { "epoch": 3.7743070422010834, "grad_norm": 0.23956318199634552, "learning_rate": 1.4304167790889048e-05, "loss": 1.2363, "step": 12672 }, { "epoch": 3.774604888400752, "grad_norm": 0.2986161708831787, "learning_rate": 1.4303297124072713e-05, "loss": 1.227, "step": 12673 }, { "epoch": 3.7749027346004205, "grad_norm": 0.2462140917778015, "learning_rate": 1.4302426417219009e-05, "loss": 1.2394, "step": 12674 }, { "epoch": 3.7752005808000892, "grad_norm": 0.2997017204761505, "learning_rate": 1.430155567033603e-05, "loss": 1.2514, "step": 12675 }, { "epoch": 3.775498426999758, "grad_norm": 0.251280277967453, "learning_rate": 1.4300684883431886e-05, "loss": 1.2448, "step": 12676 }, { "epoch": 3.775796273199427, "grad_norm": 0.27205878496170044, "learning_rate": 1.4299814056514672e-05, "loss": 1.2221, "step": 12677 }, { "epoch": 3.7760941193990956, "grad_norm": 0.29926252365112305, "learning_rate": 1.4298943189592491e-05, "loss": 1.2315, "step": 12678 }, { "epoch": 3.776391965598764, "grad_norm": 0.2541559636592865, "learning_rate": 1.4298072282673448e-05, "loss": 1.2283, "step": 12679 }, { "epoch": 3.7766898117984327, "grad_norm": 0.28829267621040344, "learning_rate": 1.4297201335765643e-05, "loss": 1.2127, "step": 12680 }, { "epoch": 3.7769876579981014, "grad_norm": 0.2551625967025757, "learning_rate": 1.4296330348877183e-05, "loss": 1.233, "step": 12681 }, { "epoch": 3.7772855041977698, "grad_norm": 0.2720828652381897, "learning_rate": 1.429545932201617e-05, "loss": 1.2149, "step": 12682 }, { "epoch": 3.7775833503974385, "grad_norm": 0.30832982063293457, "learning_rate": 1.4294588255190704e-05, "loss": 1.2323, "step": 12683 }, { "epoch": 3.7778811965971073, "grad_norm": 0.26237520575523376, "learning_rate": 1.4293717148408895e-05, "loss": 1.2282, "step": 12684 }, { "epoch": 3.7781790427967756, "grad_norm": 0.30137020349502563, "learning_rate": 1.4292846001678842e-05, "loss": 1.251, "step": 12685 }, { "epoch": 3.7784768889964444, "grad_norm": 0.2443857342004776, "learning_rate": 1.4291974815008656e-05, "loss": 1.2236, "step": 12686 }, { "epoch": 3.778774735196113, "grad_norm": 0.278854638338089, "learning_rate": 1.4291103588406439e-05, "loss": 1.2435, "step": 12687 }, { "epoch": 3.7790725813957815, "grad_norm": 0.27049946784973145, "learning_rate": 1.4290232321880297e-05, "loss": 1.2415, "step": 12688 }, { "epoch": 3.7793704275954503, "grad_norm": 0.28129956126213074, "learning_rate": 1.4289361015438337e-05, "loss": 1.235, "step": 12689 }, { "epoch": 3.779668273795119, "grad_norm": 0.30038005113601685, "learning_rate": 1.4288489669088663e-05, "loss": 1.2378, "step": 12690 }, { "epoch": 3.779966119994788, "grad_norm": 0.2376733124256134, "learning_rate": 1.4287618282839388e-05, "loss": 1.232, "step": 12691 }, { "epoch": 3.7802639661944566, "grad_norm": 0.31067323684692383, "learning_rate": 1.4286746856698614e-05, "loss": 1.2307, "step": 12692 }, { "epoch": 3.780561812394125, "grad_norm": 0.24436017870903015, "learning_rate": 1.4285875390674448e-05, "loss": 1.2438, "step": 12693 }, { "epoch": 3.7808596585937937, "grad_norm": 0.2674926519393921, "learning_rate": 1.4285003884775002e-05, "loss": 1.2166, "step": 12694 }, { "epoch": 3.7811575047934625, "grad_norm": 0.25261390209198, "learning_rate": 1.4284132339008383e-05, "loss": 1.2287, "step": 12695 }, { "epoch": 3.781455350993131, "grad_norm": 0.2543044686317444, "learning_rate": 1.4283260753382696e-05, "loss": 1.2535, "step": 12696 }, { "epoch": 3.7817531971927996, "grad_norm": 0.2796723246574402, "learning_rate": 1.4282389127906053e-05, "loss": 1.2226, "step": 12697 }, { "epoch": 3.7820510433924683, "grad_norm": 0.2589174807071686, "learning_rate": 1.4281517462586567e-05, "loss": 1.2269, "step": 12698 }, { "epoch": 3.7823488895921367, "grad_norm": 0.2836647033691406, "learning_rate": 1.4280645757432343e-05, "loss": 1.2506, "step": 12699 }, { "epoch": 3.7826467357918054, "grad_norm": 0.26077181100845337, "learning_rate": 1.4279774012451493e-05, "loss": 1.2353, "step": 12700 }, { "epoch": 3.782944581991474, "grad_norm": 0.2713318467140198, "learning_rate": 1.4278902227652128e-05, "loss": 1.2497, "step": 12701 }, { "epoch": 3.7832424281911425, "grad_norm": 0.24418582022190094, "learning_rate": 1.4278030403042357e-05, "loss": 1.2176, "step": 12702 }, { "epoch": 3.7835402743908113, "grad_norm": 0.28171828389167786, "learning_rate": 1.4277158538630294e-05, "loss": 1.2286, "step": 12703 }, { "epoch": 3.78383812059048, "grad_norm": 0.26205557584762573, "learning_rate": 1.4276286634424048e-05, "loss": 1.2259, "step": 12704 }, { "epoch": 3.784135966790149, "grad_norm": 0.26702257990837097, "learning_rate": 1.4275414690431735e-05, "loss": 1.2135, "step": 12705 }, { "epoch": 3.7844338129898176, "grad_norm": 0.2754550874233246, "learning_rate": 1.4274542706661465e-05, "loss": 1.2265, "step": 12706 }, { "epoch": 3.784731659189486, "grad_norm": 0.29837870597839355, "learning_rate": 1.427367068312135e-05, "loss": 1.2318, "step": 12707 }, { "epoch": 3.7850295053891547, "grad_norm": 0.26493605971336365, "learning_rate": 1.4272798619819503e-05, "loss": 1.2214, "step": 12708 }, { "epoch": 3.7853273515888235, "grad_norm": 0.3191721439361572, "learning_rate": 1.4271926516764038e-05, "loss": 1.2354, "step": 12709 }, { "epoch": 3.785625197788492, "grad_norm": 0.2518579363822937, "learning_rate": 1.4271054373963073e-05, "loss": 1.2078, "step": 12710 }, { "epoch": 3.7859230439881606, "grad_norm": 0.3005197048187256, "learning_rate": 1.4270182191424718e-05, "loss": 1.2429, "step": 12711 }, { "epoch": 3.7862208901878294, "grad_norm": 0.25633421540260315, "learning_rate": 1.4269309969157087e-05, "loss": 1.2191, "step": 12712 }, { "epoch": 3.7865187363874977, "grad_norm": 0.3384515941143036, "learning_rate": 1.4268437707168299e-05, "loss": 1.2168, "step": 12713 }, { "epoch": 3.7868165825871665, "grad_norm": 0.2397773265838623, "learning_rate": 1.4267565405466464e-05, "loss": 1.2267, "step": 12714 }, { "epoch": 3.7871144287868352, "grad_norm": 0.30711597204208374, "learning_rate": 1.4266693064059705e-05, "loss": 1.2221, "step": 12715 }, { "epoch": 3.7874122749865036, "grad_norm": 0.25729888677597046, "learning_rate": 1.4265820682956131e-05, "loss": 1.2428, "step": 12716 }, { "epoch": 3.7877101211861723, "grad_norm": 0.24500930309295654, "learning_rate": 1.426494826216386e-05, "loss": 1.2185, "step": 12717 }, { "epoch": 3.788007967385841, "grad_norm": 0.279811829328537, "learning_rate": 1.4264075801691014e-05, "loss": 1.2491, "step": 12718 }, { "epoch": 3.78830581358551, "grad_norm": 0.2677135467529297, "learning_rate": 1.4263203301545705e-05, "loss": 1.237, "step": 12719 }, { "epoch": 3.7886036597851787, "grad_norm": 0.29217711091041565, "learning_rate": 1.4262330761736051e-05, "loss": 1.2313, "step": 12720 }, { "epoch": 3.788901505984847, "grad_norm": 0.26018255949020386, "learning_rate": 1.4261458182270173e-05, "loss": 1.2316, "step": 12721 }, { "epoch": 3.7891993521845158, "grad_norm": 0.349682480096817, "learning_rate": 1.426058556315619e-05, "loss": 1.2238, "step": 12722 }, { "epoch": 3.7894971983841845, "grad_norm": 0.24746723473072052, "learning_rate": 1.4259712904402219e-05, "loss": 1.2287, "step": 12723 }, { "epoch": 3.789795044583853, "grad_norm": 0.313815176486969, "learning_rate": 1.4258840206016376e-05, "loss": 1.2183, "step": 12724 }, { "epoch": 3.7900928907835216, "grad_norm": 0.24808746576309204, "learning_rate": 1.4257967468006782e-05, "loss": 1.2433, "step": 12725 }, { "epoch": 3.7903907369831904, "grad_norm": 0.2969103753566742, "learning_rate": 1.4257094690381559e-05, "loss": 1.2373, "step": 12726 }, { "epoch": 3.7906885831828587, "grad_norm": 0.26070764660835266, "learning_rate": 1.4256221873148826e-05, "loss": 1.2149, "step": 12727 }, { "epoch": 3.7909864293825275, "grad_norm": 0.3144305944442749, "learning_rate": 1.4255349016316707e-05, "loss": 1.2255, "step": 12728 }, { "epoch": 3.7912842755821963, "grad_norm": 0.2931560277938843, "learning_rate": 1.4254476119893317e-05, "loss": 1.2357, "step": 12729 }, { "epoch": 3.791582121781865, "grad_norm": 0.3370493948459625, "learning_rate": 1.4253603183886779e-05, "loss": 1.2322, "step": 12730 }, { "epoch": 3.7918799679815334, "grad_norm": 0.23886103928089142, "learning_rate": 1.425273020830522e-05, "loss": 1.2387, "step": 12731 }, { "epoch": 3.792177814181202, "grad_norm": 0.25827065110206604, "learning_rate": 1.4251857193156754e-05, "loss": 1.2214, "step": 12732 }, { "epoch": 3.792475660380871, "grad_norm": 0.24766288697719574, "learning_rate": 1.4250984138449506e-05, "loss": 1.2268, "step": 12733 }, { "epoch": 3.7927735065805397, "grad_norm": 0.23920349776744843, "learning_rate": 1.4250111044191604e-05, "loss": 1.2229, "step": 12734 }, { "epoch": 3.793071352780208, "grad_norm": 0.26184290647506714, "learning_rate": 1.4249237910391164e-05, "loss": 1.2299, "step": 12735 }, { "epoch": 3.793369198979877, "grad_norm": 0.24239376187324524, "learning_rate": 1.4248364737056318e-05, "loss": 1.224, "step": 12736 }, { "epoch": 3.7936670451795456, "grad_norm": 0.24595192074775696, "learning_rate": 1.4247491524195176e-05, "loss": 1.246, "step": 12737 }, { "epoch": 3.793964891379214, "grad_norm": 0.2419978678226471, "learning_rate": 1.4246618271815877e-05, "loss": 1.223, "step": 12738 }, { "epoch": 3.7942627375788827, "grad_norm": 0.2508220970630646, "learning_rate": 1.4245744979926538e-05, "loss": 1.25, "step": 12739 }, { "epoch": 3.7945605837785514, "grad_norm": 0.24817495048046112, "learning_rate": 1.4244871648535288e-05, "loss": 1.2404, "step": 12740 }, { "epoch": 3.7948584299782198, "grad_norm": 0.2514076828956604, "learning_rate": 1.4243998277650248e-05, "loss": 1.2483, "step": 12741 }, { "epoch": 3.7951562761778885, "grad_norm": 0.2461961805820465, "learning_rate": 1.4243124867279545e-05, "loss": 1.2093, "step": 12742 }, { "epoch": 3.7954541223775573, "grad_norm": 0.2558443248271942, "learning_rate": 1.4242251417431306e-05, "loss": 1.2331, "step": 12743 }, { "epoch": 3.795751968577226, "grad_norm": 0.28598901629447937, "learning_rate": 1.4241377928113658e-05, "loss": 1.2371, "step": 12744 }, { "epoch": 3.796049814776895, "grad_norm": 0.2537623941898346, "learning_rate": 1.4240504399334728e-05, "loss": 1.2321, "step": 12745 }, { "epoch": 3.796347660976563, "grad_norm": 0.2908998131752014, "learning_rate": 1.4239630831102641e-05, "loss": 1.2273, "step": 12746 }, { "epoch": 3.796645507176232, "grad_norm": 0.3052311837673187, "learning_rate": 1.4238757223425528e-05, "loss": 1.2196, "step": 12747 }, { "epoch": 3.7969433533759007, "grad_norm": 0.3534977436065674, "learning_rate": 1.423788357631151e-05, "loss": 1.2208, "step": 12748 }, { "epoch": 3.797241199575569, "grad_norm": 0.27182501554489136, "learning_rate": 1.4237009889768725e-05, "loss": 1.2239, "step": 12749 }, { "epoch": 3.797539045775238, "grad_norm": 0.33824798464775085, "learning_rate": 1.4236136163805294e-05, "loss": 1.2348, "step": 12750 }, { "epoch": 3.7978368919749066, "grad_norm": 0.3123975694179535, "learning_rate": 1.423526239842935e-05, "loss": 1.2235, "step": 12751 }, { "epoch": 3.798134738174575, "grad_norm": 0.2852601408958435, "learning_rate": 1.423438859364902e-05, "loss": 1.2294, "step": 12752 }, { "epoch": 3.7984325843742437, "grad_norm": 0.33295968174934387, "learning_rate": 1.423351474947244e-05, "loss": 1.2377, "step": 12753 }, { "epoch": 3.7987304305739125, "grad_norm": 0.2448887676000595, "learning_rate": 1.4232640865907729e-05, "loss": 1.2299, "step": 12754 }, { "epoch": 3.799028276773581, "grad_norm": 0.299250990152359, "learning_rate": 1.4231766942963025e-05, "loss": 1.2408, "step": 12755 }, { "epoch": 3.7993261229732496, "grad_norm": 0.25415459275245667, "learning_rate": 1.423089298064646e-05, "loss": 1.2254, "step": 12756 }, { "epoch": 3.7996239691729183, "grad_norm": 0.2801014482975006, "learning_rate": 1.4230018978966162e-05, "loss": 1.2234, "step": 12757 }, { "epoch": 3.799921815372587, "grad_norm": 0.2553093135356903, "learning_rate": 1.4229144937930263e-05, "loss": 1.2236, "step": 12758 }, { "epoch": 3.800219661572256, "grad_norm": 0.2585800290107727, "learning_rate": 1.4228270857546895e-05, "loss": 1.2099, "step": 12759 }, { "epoch": 3.800517507771924, "grad_norm": 0.29664096236228943, "learning_rate": 1.4227396737824193e-05, "loss": 1.2389, "step": 12760 }, { "epoch": 3.800815353971593, "grad_norm": 0.2602430284023285, "learning_rate": 1.4226522578770285e-05, "loss": 1.2383, "step": 12761 }, { "epoch": 3.8011132001712618, "grad_norm": 0.27732184529304504, "learning_rate": 1.4225648380393306e-05, "loss": 1.2235, "step": 12762 }, { "epoch": 3.80141104637093, "grad_norm": 0.2764049470424652, "learning_rate": 1.4224774142701394e-05, "loss": 1.2356, "step": 12763 }, { "epoch": 3.801708892570599, "grad_norm": 0.24653466045856476, "learning_rate": 1.4223899865702677e-05, "loss": 1.2279, "step": 12764 }, { "epoch": 3.8020067387702676, "grad_norm": 0.25066083669662476, "learning_rate": 1.422302554940529e-05, "loss": 1.2325, "step": 12765 }, { "epoch": 3.802304584969936, "grad_norm": 0.3771968483924866, "learning_rate": 1.4222151193817368e-05, "loss": 1.2464, "step": 12766 }, { "epoch": 3.8026024311696047, "grad_norm": 0.3234902620315552, "learning_rate": 1.4221276798947048e-05, "loss": 1.2205, "step": 12767 }, { "epoch": 3.8029002773692735, "grad_norm": 0.33073753118515015, "learning_rate": 1.4220402364802461e-05, "loss": 1.2168, "step": 12768 }, { "epoch": 3.803198123568942, "grad_norm": 0.4305053651332855, "learning_rate": 1.4219527891391746e-05, "loss": 1.2184, "step": 12769 }, { "epoch": 3.8034959697686106, "grad_norm": 0.25483497977256775, "learning_rate": 1.4218653378723042e-05, "loss": 1.2169, "step": 12770 }, { "epoch": 3.8037938159682794, "grad_norm": 0.32836589217185974, "learning_rate": 1.421777882680448e-05, "loss": 1.2201, "step": 12771 }, { "epoch": 3.804091662167948, "grad_norm": 0.2492336630821228, "learning_rate": 1.4216904235644195e-05, "loss": 1.2349, "step": 12772 }, { "epoch": 3.804389508367617, "grad_norm": 0.3971332609653473, "learning_rate": 1.421602960525033e-05, "loss": 1.2135, "step": 12773 }, { "epoch": 3.8046873545672852, "grad_norm": 0.32474571466445923, "learning_rate": 1.4215154935631021e-05, "loss": 1.2447, "step": 12774 }, { "epoch": 3.804985200766954, "grad_norm": 0.310362309217453, "learning_rate": 1.4214280226794404e-05, "loss": 1.2294, "step": 12775 }, { "epoch": 3.805283046966623, "grad_norm": 0.26335400342941284, "learning_rate": 1.4213405478748616e-05, "loss": 1.2481, "step": 12776 }, { "epoch": 3.805580893166291, "grad_norm": 0.33689048886299133, "learning_rate": 1.42125306915018e-05, "loss": 1.2269, "step": 12777 }, { "epoch": 3.80587873936596, "grad_norm": 0.2633482813835144, "learning_rate": 1.4211655865062091e-05, "loss": 1.2301, "step": 12778 }, { "epoch": 3.8061765855656287, "grad_norm": 0.3629227876663208, "learning_rate": 1.421078099943763e-05, "loss": 1.2232, "step": 12779 }, { "epoch": 3.806474431765297, "grad_norm": 0.24028271436691284, "learning_rate": 1.420990609463656e-05, "loss": 1.2363, "step": 12780 }, { "epoch": 3.8067722779649658, "grad_norm": 0.4227234423160553, "learning_rate": 1.4209031150667013e-05, "loss": 1.2418, "step": 12781 }, { "epoch": 3.8070701241646345, "grad_norm": 0.2809666097164154, "learning_rate": 1.4208156167537132e-05, "loss": 1.2268, "step": 12782 }, { "epoch": 3.807367970364303, "grad_norm": 0.3538129925727844, "learning_rate": 1.4207281145255063e-05, "loss": 1.2254, "step": 12783 }, { "epoch": 3.8076658165639716, "grad_norm": 0.24617306888103485, "learning_rate": 1.4206406083828943e-05, "loss": 1.2284, "step": 12784 }, { "epoch": 3.8079636627636404, "grad_norm": 0.4199250638484955, "learning_rate": 1.4205530983266913e-05, "loss": 1.2411, "step": 12785 }, { "epoch": 3.808261508963309, "grad_norm": 0.24131809175014496, "learning_rate": 1.4204655843577117e-05, "loss": 1.2143, "step": 12786 }, { "epoch": 3.808559355162978, "grad_norm": 0.30456212162971497, "learning_rate": 1.42037806647677e-05, "loss": 1.2339, "step": 12787 }, { "epoch": 3.8088572013626463, "grad_norm": 0.25974634289741516, "learning_rate": 1.4202905446846793e-05, "loss": 1.2316, "step": 12788 }, { "epoch": 3.809155047562315, "grad_norm": 0.318865567445755, "learning_rate": 1.4202030189822551e-05, "loss": 1.2177, "step": 12789 }, { "epoch": 3.809452893761984, "grad_norm": 0.2840079963207245, "learning_rate": 1.420115489370311e-05, "loss": 1.2415, "step": 12790 }, { "epoch": 3.809750739961652, "grad_norm": 0.26356443762779236, "learning_rate": 1.420027955849662e-05, "loss": 1.2234, "step": 12791 }, { "epoch": 3.810048586161321, "grad_norm": 0.27328574657440186, "learning_rate": 1.419940418421122e-05, "loss": 1.2234, "step": 12792 }, { "epoch": 3.8103464323609897, "grad_norm": 0.25236785411834717, "learning_rate": 1.4198528770855056e-05, "loss": 1.2198, "step": 12793 }, { "epoch": 3.810644278560658, "grad_norm": 0.2905680537223816, "learning_rate": 1.4197653318436273e-05, "loss": 1.2455, "step": 12794 }, { "epoch": 3.810942124760327, "grad_norm": 0.25465255975723267, "learning_rate": 1.4196777826963018e-05, "loss": 1.2265, "step": 12795 }, { "epoch": 3.8112399709599956, "grad_norm": 0.27620112895965576, "learning_rate": 1.4195902296443427e-05, "loss": 1.2478, "step": 12796 }, { "epoch": 3.8115378171596643, "grad_norm": 0.2501170337200165, "learning_rate": 1.419502672688566e-05, "loss": 1.21, "step": 12797 }, { "epoch": 3.8118356633593327, "grad_norm": 0.24600648880004883, "learning_rate": 1.4194151118297854e-05, "loss": 1.2288, "step": 12798 }, { "epoch": 3.8121335095590014, "grad_norm": 0.28037095069885254, "learning_rate": 1.4193275470688157e-05, "loss": 1.2301, "step": 12799 }, { "epoch": 3.81243135575867, "grad_norm": 0.2571197748184204, "learning_rate": 1.4192399784064718e-05, "loss": 1.2219, "step": 12800 }, { "epoch": 3.812729201958339, "grad_norm": 0.29590463638305664, "learning_rate": 1.4191524058435682e-05, "loss": 1.2236, "step": 12801 }, { "epoch": 3.8130270481580073, "grad_norm": 0.2541712522506714, "learning_rate": 1.4190648293809198e-05, "loss": 1.2225, "step": 12802 }, { "epoch": 3.813324894357676, "grad_norm": 0.30674511194229126, "learning_rate": 1.4189772490193414e-05, "loss": 1.2229, "step": 12803 }, { "epoch": 3.813622740557345, "grad_norm": 0.24881549179553986, "learning_rate": 1.418889664759648e-05, "loss": 1.2191, "step": 12804 }, { "epoch": 3.813920586757013, "grad_norm": 0.2782246768474579, "learning_rate": 1.418802076602654e-05, "loss": 1.224, "step": 12805 }, { "epoch": 3.814218432956682, "grad_norm": 0.2547534704208374, "learning_rate": 1.4187144845491748e-05, "loss": 1.215, "step": 12806 }, { "epoch": 3.8145162791563507, "grad_norm": 0.2677958309650421, "learning_rate": 1.418626888600025e-05, "loss": 1.22, "step": 12807 }, { "epoch": 3.814814125356019, "grad_norm": 0.25236955285072327, "learning_rate": 1.4185392887560197e-05, "loss": 1.2383, "step": 12808 }, { "epoch": 3.815111971555688, "grad_norm": 0.24296407401561737, "learning_rate": 1.4184516850179739e-05, "loss": 1.2217, "step": 12809 }, { "epoch": 3.8154098177553566, "grad_norm": 0.27396103739738464, "learning_rate": 1.4183640773867026e-05, "loss": 1.2237, "step": 12810 }, { "epoch": 3.8157076639550254, "grad_norm": 0.2571660876274109, "learning_rate": 1.4182764658630215e-05, "loss": 1.2088, "step": 12811 }, { "epoch": 3.816005510154694, "grad_norm": 0.26393911242485046, "learning_rate": 1.4181888504477447e-05, "loss": 1.2457, "step": 12812 }, { "epoch": 3.8163033563543625, "grad_norm": 0.2524780035018921, "learning_rate": 1.4181012311416882e-05, "loss": 1.2366, "step": 12813 }, { "epoch": 3.8166012025540312, "grad_norm": 0.3149135708808899, "learning_rate": 1.4180136079456666e-05, "loss": 1.2309, "step": 12814 }, { "epoch": 3.8168990487537, "grad_norm": 0.26127392053604126, "learning_rate": 1.4179259808604954e-05, "loss": 1.2289, "step": 12815 }, { "epoch": 3.8171968949533683, "grad_norm": 0.29379215836524963, "learning_rate": 1.4178383498869902e-05, "loss": 1.2355, "step": 12816 }, { "epoch": 3.817494741153037, "grad_norm": 0.3243806064128876, "learning_rate": 1.4177507150259656e-05, "loss": 1.2401, "step": 12817 }, { "epoch": 3.817792587352706, "grad_norm": 0.26709645986557007, "learning_rate": 1.4176630762782378e-05, "loss": 1.2387, "step": 12818 }, { "epoch": 3.818090433552374, "grad_norm": 0.274420827627182, "learning_rate": 1.417575433644621e-05, "loss": 1.235, "step": 12819 }, { "epoch": 3.818388279752043, "grad_norm": 0.3666256368160248, "learning_rate": 1.4174877871259319e-05, "loss": 1.2343, "step": 12820 }, { "epoch": 3.8186861259517118, "grad_norm": 0.3476208448410034, "learning_rate": 1.4174001367229853e-05, "loss": 1.2284, "step": 12821 }, { "epoch": 3.81898397215138, "grad_norm": 0.28282949328422546, "learning_rate": 1.4173124824365968e-05, "loss": 1.2367, "step": 12822 }, { "epoch": 3.819281818351049, "grad_norm": 0.26556313037872314, "learning_rate": 1.417224824267582e-05, "loss": 1.2416, "step": 12823 }, { "epoch": 3.8195796645507176, "grad_norm": 0.3180636167526245, "learning_rate": 1.4171371622167562e-05, "loss": 1.2339, "step": 12824 }, { "epoch": 3.8198775107503864, "grad_norm": 0.24711021780967712, "learning_rate": 1.4170494962849349e-05, "loss": 1.2254, "step": 12825 }, { "epoch": 3.820175356950055, "grad_norm": 0.2862757444381714, "learning_rate": 1.4169618264729343e-05, "loss": 1.2259, "step": 12826 }, { "epoch": 3.8204732031497235, "grad_norm": 0.2634437382221222, "learning_rate": 1.4168741527815697e-05, "loss": 1.2319, "step": 12827 }, { "epoch": 3.8207710493493923, "grad_norm": 0.31689709424972534, "learning_rate": 1.416786475211657e-05, "loss": 1.2281, "step": 12828 }, { "epoch": 3.821068895549061, "grad_norm": 0.2715291380882263, "learning_rate": 1.4166987937640117e-05, "loss": 1.2323, "step": 12829 }, { "epoch": 3.8213667417487294, "grad_norm": 0.3118676245212555, "learning_rate": 1.4166111084394495e-05, "loss": 1.2391, "step": 12830 }, { "epoch": 3.821664587948398, "grad_norm": 0.2390519380569458, "learning_rate": 1.4165234192387866e-05, "loss": 1.206, "step": 12831 }, { "epoch": 3.821962434148067, "grad_norm": 0.3161649703979492, "learning_rate": 1.4164357261628388e-05, "loss": 1.2205, "step": 12832 }, { "epoch": 3.8222602803477352, "grad_norm": 0.23863878846168518, "learning_rate": 1.4163480292124214e-05, "loss": 1.2189, "step": 12833 }, { "epoch": 3.822558126547404, "grad_norm": 0.2643231451511383, "learning_rate": 1.4162603283883511e-05, "loss": 1.2417, "step": 12834 }, { "epoch": 3.822855972747073, "grad_norm": 0.23881596326828003, "learning_rate": 1.4161726236914434e-05, "loss": 1.2371, "step": 12835 }, { "epoch": 3.823153818946741, "grad_norm": 0.28563517332077026, "learning_rate": 1.4160849151225147e-05, "loss": 1.2442, "step": 12836 }, { "epoch": 3.82345166514641, "grad_norm": 0.24477483332157135, "learning_rate": 1.4159972026823802e-05, "loss": 1.2296, "step": 12837 }, { "epoch": 3.8237495113460787, "grad_norm": 0.25879228115081787, "learning_rate": 1.415909486371857e-05, "loss": 1.2247, "step": 12838 }, { "epoch": 3.8240473575457474, "grad_norm": 0.26841700077056885, "learning_rate": 1.4158217661917604e-05, "loss": 1.2219, "step": 12839 }, { "epoch": 3.824345203745416, "grad_norm": 0.29669052362442017, "learning_rate": 1.4157340421429071e-05, "loss": 1.2453, "step": 12840 }, { "epoch": 3.8246430499450845, "grad_norm": 0.24732880294322968, "learning_rate": 1.415646314226113e-05, "loss": 1.2406, "step": 12841 }, { "epoch": 3.8249408961447533, "grad_norm": 0.2771657109260559, "learning_rate": 1.4155585824421941e-05, "loss": 1.2316, "step": 12842 }, { "epoch": 3.825238742344422, "grad_norm": 0.2514537572860718, "learning_rate": 1.4154708467919669e-05, "loss": 1.233, "step": 12843 }, { "epoch": 3.8255365885440904, "grad_norm": 0.2608799338340759, "learning_rate": 1.4153831072762476e-05, "loss": 1.2366, "step": 12844 }, { "epoch": 3.825834434743759, "grad_norm": 0.23932795226573944, "learning_rate": 1.4152953638958531e-05, "loss": 1.2475, "step": 12845 }, { "epoch": 3.826132280943428, "grad_norm": 0.24809974431991577, "learning_rate": 1.4152076166515987e-05, "loss": 1.2333, "step": 12846 }, { "epoch": 3.8264301271430963, "grad_norm": 0.24367476999759674, "learning_rate": 1.4151198655443014e-05, "loss": 1.2292, "step": 12847 }, { "epoch": 3.826727973342765, "grad_norm": 0.2372283935546875, "learning_rate": 1.4150321105747777e-05, "loss": 1.2335, "step": 12848 }, { "epoch": 3.827025819542434, "grad_norm": 0.2502457797527313, "learning_rate": 1.414944351743844e-05, "loss": 1.2319, "step": 12849 }, { "epoch": 3.8273236657421026, "grad_norm": 0.25848546624183655, "learning_rate": 1.4148565890523162e-05, "loss": 1.224, "step": 12850 }, { "epoch": 3.827621511941771, "grad_norm": 0.24064145982265472, "learning_rate": 1.4147688225010119e-05, "loss": 1.2133, "step": 12851 }, { "epoch": 3.8279193581414397, "grad_norm": 0.26705843210220337, "learning_rate": 1.414681052090747e-05, "loss": 1.2402, "step": 12852 }, { "epoch": 3.8282172043411085, "grad_norm": 0.24247168004512787, "learning_rate": 1.4145932778223381e-05, "loss": 1.2302, "step": 12853 }, { "epoch": 3.8285150505407772, "grad_norm": 0.26049691438674927, "learning_rate": 1.4145054996966021e-05, "loss": 1.2326, "step": 12854 }, { "epoch": 3.8288128967404456, "grad_norm": 0.30729058384895325, "learning_rate": 1.4144177177143554e-05, "loss": 1.2334, "step": 12855 }, { "epoch": 3.8291107429401143, "grad_norm": 0.2650429904460907, "learning_rate": 1.4143299318764149e-05, "loss": 1.2382, "step": 12856 }, { "epoch": 3.829408589139783, "grad_norm": 0.2934802770614624, "learning_rate": 1.4142421421835972e-05, "loss": 1.2304, "step": 12857 }, { "epoch": 3.8297064353394514, "grad_norm": 0.253642737865448, "learning_rate": 1.4141543486367193e-05, "loss": 1.2304, "step": 12858 }, { "epoch": 3.83000428153912, "grad_norm": 0.3072591722011566, "learning_rate": 1.414066551236598e-05, "loss": 1.237, "step": 12859 }, { "epoch": 3.830302127738789, "grad_norm": 0.3484978675842285, "learning_rate": 1.41397874998405e-05, "loss": 1.2316, "step": 12860 }, { "epoch": 3.8305999739384573, "grad_norm": 0.3204314112663269, "learning_rate": 1.413890944879892e-05, "loss": 1.2345, "step": 12861 }, { "epoch": 3.830897820138126, "grad_norm": 0.3025658130645752, "learning_rate": 1.4138031359249416e-05, "loss": 1.219, "step": 12862 }, { "epoch": 3.831195666337795, "grad_norm": 0.45898616313934326, "learning_rate": 1.413715323120015e-05, "loss": 1.2281, "step": 12863 }, { "epoch": 3.8314935125374636, "grad_norm": 0.2997315526008606, "learning_rate": 1.4136275064659297e-05, "loss": 1.2262, "step": 12864 }, { "epoch": 3.831791358737132, "grad_norm": 0.2999708354473114, "learning_rate": 1.4135396859635028e-05, "loss": 1.2194, "step": 12865 }, { "epoch": 3.8320892049368007, "grad_norm": 0.30110031366348267, "learning_rate": 1.4134518616135507e-05, "loss": 1.2303, "step": 12866 }, { "epoch": 3.8323870511364695, "grad_norm": 0.27862995862960815, "learning_rate": 1.4133640334168912e-05, "loss": 1.2235, "step": 12867 }, { "epoch": 3.8326848973361383, "grad_norm": 0.3227183222770691, "learning_rate": 1.413276201374341e-05, "loss": 1.2088, "step": 12868 }, { "epoch": 3.8329827435358066, "grad_norm": 0.27088144421577454, "learning_rate": 1.413188365486718e-05, "loss": 1.2288, "step": 12869 }, { "epoch": 3.8332805897354754, "grad_norm": 0.30756035447120667, "learning_rate": 1.4131005257548387e-05, "loss": 1.2166, "step": 12870 }, { "epoch": 3.833578435935144, "grad_norm": 0.26481255888938904, "learning_rate": 1.4130126821795202e-05, "loss": 1.2493, "step": 12871 }, { "epoch": 3.8338762821348125, "grad_norm": 0.32878783345222473, "learning_rate": 1.4129248347615804e-05, "loss": 1.2195, "step": 12872 }, { "epoch": 3.8341741283344812, "grad_norm": 0.2628609538078308, "learning_rate": 1.4128369835018365e-05, "loss": 1.2375, "step": 12873 }, { "epoch": 3.83447197453415, "grad_norm": 0.27789103984832764, "learning_rate": 1.4127491284011054e-05, "loss": 1.2126, "step": 12874 }, { "epoch": 3.8347698207338183, "grad_norm": 0.26154825091362, "learning_rate": 1.4126612694602049e-05, "loss": 1.2341, "step": 12875 }, { "epoch": 3.835067666933487, "grad_norm": 0.2882075905799866, "learning_rate": 1.4125734066799526e-05, "loss": 1.2393, "step": 12876 }, { "epoch": 3.835365513133156, "grad_norm": 0.2781543731689453, "learning_rate": 1.4124855400611655e-05, "loss": 1.216, "step": 12877 }, { "epoch": 3.8356633593328247, "grad_norm": 0.28504717350006104, "learning_rate": 1.412397669604661e-05, "loss": 1.2342, "step": 12878 }, { "epoch": 3.8359612055324934, "grad_norm": 0.32166755199432373, "learning_rate": 1.4123097953112574e-05, "loss": 1.2228, "step": 12879 }, { "epoch": 3.8362590517321618, "grad_norm": 0.27417391538619995, "learning_rate": 1.4122219171817717e-05, "loss": 1.2111, "step": 12880 }, { "epoch": 3.8365568979318305, "grad_norm": 0.48461154103279114, "learning_rate": 1.4121340352170217e-05, "loss": 1.2248, "step": 12881 }, { "epoch": 3.8368547441314993, "grad_norm": 0.3839823603630066, "learning_rate": 1.412046149417825e-05, "loss": 1.2247, "step": 12882 }, { "epoch": 3.8371525903311676, "grad_norm": 0.29947030544281006, "learning_rate": 1.4119582597849993e-05, "loss": 1.2255, "step": 12883 }, { "epoch": 3.8374504365308364, "grad_norm": 0.35313907265663147, "learning_rate": 1.4118703663193615e-05, "loss": 1.2265, "step": 12884 }, { "epoch": 3.837748282730505, "grad_norm": 0.27523180842399597, "learning_rate": 1.4117824690217307e-05, "loss": 1.2319, "step": 12885 }, { "epoch": 3.8380461289301735, "grad_norm": 0.31127822399139404, "learning_rate": 1.4116945678929244e-05, "loss": 1.2195, "step": 12886 }, { "epoch": 3.8383439751298423, "grad_norm": 0.2907017767429352, "learning_rate": 1.4116066629337596e-05, "loss": 1.2471, "step": 12887 }, { "epoch": 3.838641821329511, "grad_norm": 0.2989345192909241, "learning_rate": 1.411518754145055e-05, "loss": 1.2197, "step": 12888 }, { "epoch": 3.8389396675291794, "grad_norm": 0.2512325942516327, "learning_rate": 1.411430841527628e-05, "loss": 1.2167, "step": 12889 }, { "epoch": 3.839237513728848, "grad_norm": 0.30058085918426514, "learning_rate": 1.4113429250822965e-05, "loss": 1.2366, "step": 12890 }, { "epoch": 3.839535359928517, "grad_norm": 0.2330739051103592, "learning_rate": 1.411255004809879e-05, "loss": 1.2098, "step": 12891 }, { "epoch": 3.8398332061281857, "grad_norm": 0.2903881072998047, "learning_rate": 1.411167080711193e-05, "loss": 1.2274, "step": 12892 }, { "epoch": 3.8401310523278545, "grad_norm": 0.246946781873703, "learning_rate": 1.411079152787057e-05, "loss": 1.2264, "step": 12893 }, { "epoch": 3.840428898527523, "grad_norm": 0.2761788070201874, "learning_rate": 1.4109912210382884e-05, "loss": 1.2443, "step": 12894 }, { "epoch": 3.8407267447271916, "grad_norm": 0.256935179233551, "learning_rate": 1.410903285465706e-05, "loss": 1.2195, "step": 12895 }, { "epoch": 3.8410245909268603, "grad_norm": 0.2901683449745178, "learning_rate": 1.4108153460701272e-05, "loss": 1.2362, "step": 12896 }, { "epoch": 3.8413224371265287, "grad_norm": 0.287685751914978, "learning_rate": 1.4107274028523708e-05, "loss": 1.2208, "step": 12897 }, { "epoch": 3.8416202833261974, "grad_norm": 0.26763716340065, "learning_rate": 1.4106394558132548e-05, "loss": 1.2564, "step": 12898 }, { "epoch": 3.841918129525866, "grad_norm": 0.2797291874885559, "learning_rate": 1.4105515049535974e-05, "loss": 1.2247, "step": 12899 }, { "epoch": 3.8422159757255345, "grad_norm": 0.25272098183631897, "learning_rate": 1.4104635502742172e-05, "loss": 1.2265, "step": 12900 }, { "epoch": 3.8425138219252033, "grad_norm": 0.2657968997955322, "learning_rate": 1.4103755917759321e-05, "loss": 1.2335, "step": 12901 }, { "epoch": 3.842811668124872, "grad_norm": 0.3597632944583893, "learning_rate": 1.4102876294595602e-05, "loss": 1.2321, "step": 12902 }, { "epoch": 3.8431095143245404, "grad_norm": 0.45975279808044434, "learning_rate": 1.410199663325921e-05, "loss": 1.2072, "step": 12903 }, { "epoch": 3.843407360524209, "grad_norm": 0.26953503489494324, "learning_rate": 1.4101116933758318e-05, "loss": 1.239, "step": 12904 }, { "epoch": 3.843705206723878, "grad_norm": 0.44539040327072144, "learning_rate": 1.4100237196101116e-05, "loss": 1.2255, "step": 12905 }, { "epoch": 3.8440030529235467, "grad_norm": 0.4395925998687744, "learning_rate": 1.4099357420295792e-05, "loss": 1.2385, "step": 12906 }, { "epoch": 3.8443008991232155, "grad_norm": 0.32635074853897095, "learning_rate": 1.409847760635052e-05, "loss": 1.2247, "step": 12907 }, { "epoch": 3.844598745322884, "grad_norm": 1.0792405605316162, "learning_rate": 1.4097597754273497e-05, "loss": 1.2356, "step": 12908 }, { "epoch": 3.8448965915225526, "grad_norm": 0.3065727651119232, "learning_rate": 1.4096717864072904e-05, "loss": 1.2288, "step": 12909 }, { "epoch": 3.8451944377222214, "grad_norm": 0.26505017280578613, "learning_rate": 1.409583793575693e-05, "loss": 1.2308, "step": 12910 }, { "epoch": 3.8454922839218897, "grad_norm": 0.2600894272327423, "learning_rate": 1.409495796933376e-05, "loss": 1.2391, "step": 12911 }, { "epoch": 3.8457901301215585, "grad_norm": 0.2440338134765625, "learning_rate": 1.4094077964811579e-05, "loss": 1.2293, "step": 12912 }, { "epoch": 3.8460879763212272, "grad_norm": 0.23576608300209045, "learning_rate": 1.4093197922198577e-05, "loss": 1.2277, "step": 12913 }, { "epoch": 3.8463858225208956, "grad_norm": 0.24084974825382233, "learning_rate": 1.4092317841502942e-05, "loss": 1.2146, "step": 12914 }, { "epoch": 3.8466836687205643, "grad_norm": 0.25695639848709106, "learning_rate": 1.4091437722732863e-05, "loss": 1.2404, "step": 12915 }, { "epoch": 3.846981514920233, "grad_norm": 0.25280672311782837, "learning_rate": 1.4090557565896526e-05, "loss": 1.2367, "step": 12916 }, { "epoch": 3.847279361119902, "grad_norm": 0.23778603971004486, "learning_rate": 1.4089677371002124e-05, "loss": 1.226, "step": 12917 }, { "epoch": 3.84757720731957, "grad_norm": 0.2415882796049118, "learning_rate": 1.4088797138057839e-05, "loss": 1.2302, "step": 12918 }, { "epoch": 3.847875053519239, "grad_norm": 0.24139143526554108, "learning_rate": 1.4087916867071866e-05, "loss": 1.2223, "step": 12919 }, { "epoch": 3.8481728997189077, "grad_norm": 0.2485382854938507, "learning_rate": 1.4087036558052396e-05, "loss": 1.2245, "step": 12920 }, { "epoch": 3.8484707459185765, "grad_norm": 0.25132110714912415, "learning_rate": 1.4086156211007615e-05, "loss": 1.2221, "step": 12921 }, { "epoch": 3.848768592118245, "grad_norm": 0.24649682641029358, "learning_rate": 1.4085275825945717e-05, "loss": 1.2268, "step": 12922 }, { "epoch": 3.8490664383179136, "grad_norm": 0.24931305646896362, "learning_rate": 1.4084395402874894e-05, "loss": 1.2265, "step": 12923 }, { "epoch": 3.8493642845175824, "grad_norm": 0.25036677718162537, "learning_rate": 1.4083514941803332e-05, "loss": 1.2321, "step": 12924 }, { "epoch": 3.8496621307172507, "grad_norm": 0.2317926436662674, "learning_rate": 1.4082634442739223e-05, "loss": 1.2157, "step": 12925 }, { "epoch": 3.8499599769169195, "grad_norm": 0.24006804823875427, "learning_rate": 1.4081753905690764e-05, "loss": 1.2377, "step": 12926 }, { "epoch": 3.8502578231165883, "grad_norm": 0.242327019572258, "learning_rate": 1.4080873330666149e-05, "loss": 1.2353, "step": 12927 }, { "epoch": 3.8505556693162566, "grad_norm": 0.2397931069135666, "learning_rate": 1.4079992717673563e-05, "loss": 1.2315, "step": 12928 }, { "epoch": 3.8508535155159254, "grad_norm": 0.24000605940818787, "learning_rate": 1.4079112066721205e-05, "loss": 1.2461, "step": 12929 }, { "epoch": 3.851151361715594, "grad_norm": 0.2388916164636612, "learning_rate": 1.4078231377817266e-05, "loss": 1.2346, "step": 12930 }, { "epoch": 3.851449207915263, "grad_norm": 0.22943438589572906, "learning_rate": 1.4077350650969941e-05, "loss": 1.2248, "step": 12931 }, { "epoch": 3.8517470541149317, "grad_norm": 0.24493323266506195, "learning_rate": 1.4076469886187423e-05, "loss": 1.2306, "step": 12932 }, { "epoch": 3.8520449003146, "grad_norm": 0.24133175611495972, "learning_rate": 1.4075589083477907e-05, "loss": 1.2031, "step": 12933 }, { "epoch": 3.852342746514269, "grad_norm": 0.24110007286071777, "learning_rate": 1.407470824284959e-05, "loss": 1.2223, "step": 12934 }, { "epoch": 3.8526405927139376, "grad_norm": 0.22775600850582123, "learning_rate": 1.4073827364310666e-05, "loss": 1.2221, "step": 12935 }, { "epoch": 3.852938438913606, "grad_norm": 0.23863746225833893, "learning_rate": 1.4072946447869326e-05, "loss": 1.2306, "step": 12936 }, { "epoch": 3.8532362851132747, "grad_norm": 0.2448456883430481, "learning_rate": 1.4072065493533773e-05, "loss": 1.2125, "step": 12937 }, { "epoch": 3.8535341313129434, "grad_norm": 0.23308877646923065, "learning_rate": 1.4071184501312199e-05, "loss": 1.2172, "step": 12938 }, { "epoch": 3.8538319775126118, "grad_norm": 0.23921458423137665, "learning_rate": 1.40703034712128e-05, "loss": 1.232, "step": 12939 }, { "epoch": 3.8541298237122805, "grad_norm": 0.2406570017337799, "learning_rate": 1.406942240324378e-05, "loss": 1.2184, "step": 12940 }, { "epoch": 3.8544276699119493, "grad_norm": 0.24633492529392242, "learning_rate": 1.4068541297413327e-05, "loss": 1.2451, "step": 12941 }, { "epoch": 3.8547255161116176, "grad_norm": 0.24022212624549866, "learning_rate": 1.4067660153729646e-05, "loss": 1.2349, "step": 12942 }, { "epoch": 3.8550233623112864, "grad_norm": 0.24970600008964539, "learning_rate": 1.4066778972200926e-05, "loss": 1.2585, "step": 12943 }, { "epoch": 3.855321208510955, "grad_norm": 0.23601877689361572, "learning_rate": 1.4065897752835378e-05, "loss": 1.24, "step": 12944 }, { "epoch": 3.855619054710624, "grad_norm": 0.23589526116847992, "learning_rate": 1.406501649564119e-05, "loss": 1.2319, "step": 12945 }, { "epoch": 3.8559169009102927, "grad_norm": 0.24604082107543945, "learning_rate": 1.4064135200626566e-05, "loss": 1.2163, "step": 12946 }, { "epoch": 3.856214747109961, "grad_norm": 0.24139080941677094, "learning_rate": 1.4063253867799706e-05, "loss": 1.225, "step": 12947 }, { "epoch": 3.85651259330963, "grad_norm": 0.23471637070178986, "learning_rate": 1.4062372497168805e-05, "loss": 1.2327, "step": 12948 }, { "epoch": 3.8568104395092986, "grad_norm": 0.23509523272514343, "learning_rate": 1.406149108874207e-05, "loss": 1.2252, "step": 12949 }, { "epoch": 3.857108285708967, "grad_norm": 0.2365317940711975, "learning_rate": 1.4060609642527696e-05, "loss": 1.2307, "step": 12950 }, { "epoch": 3.8574061319086357, "grad_norm": 0.2595860958099365, "learning_rate": 1.405972815853389e-05, "loss": 1.239, "step": 12951 }, { "epoch": 3.8577039781083045, "grad_norm": 0.24657686054706573, "learning_rate": 1.4058846636768845e-05, "loss": 1.2149, "step": 12952 }, { "epoch": 3.858001824307973, "grad_norm": 0.2447076141834259, "learning_rate": 1.405796507724077e-05, "loss": 1.2235, "step": 12953 }, { "epoch": 3.8582996705076416, "grad_norm": 0.24253875017166138, "learning_rate": 1.4057083479957863e-05, "loss": 1.223, "step": 12954 }, { "epoch": 3.8585975167073103, "grad_norm": 0.23274843394756317, "learning_rate": 1.4056201844928325e-05, "loss": 1.2222, "step": 12955 }, { "epoch": 3.8588953629069787, "grad_norm": 0.23749485611915588, "learning_rate": 1.4055320172160363e-05, "loss": 1.2392, "step": 12956 }, { "epoch": 3.8591932091066474, "grad_norm": 0.23494069278240204, "learning_rate": 1.4054438461662177e-05, "loss": 1.2264, "step": 12957 }, { "epoch": 3.859491055306316, "grad_norm": 0.24517206847667694, "learning_rate": 1.4053556713441972e-05, "loss": 1.2362, "step": 12958 }, { "epoch": 3.859788901505985, "grad_norm": 0.23095130920410156, "learning_rate": 1.4052674927507947e-05, "loss": 1.2294, "step": 12959 }, { "epoch": 3.8600867477056537, "grad_norm": 0.23468315601348877, "learning_rate": 1.4051793103868314e-05, "loss": 1.2399, "step": 12960 }, { "epoch": 3.860384593905322, "grad_norm": 0.2437102496623993, "learning_rate": 1.405091124253127e-05, "loss": 1.2366, "step": 12961 }, { "epoch": 3.860682440104991, "grad_norm": 0.23301923274993896, "learning_rate": 1.4050029343505025e-05, "loss": 1.2254, "step": 12962 }, { "epoch": 3.8609802863046596, "grad_norm": 0.2372865527868271, "learning_rate": 1.404914740679778e-05, "loss": 1.2249, "step": 12963 }, { "epoch": 3.861278132504328, "grad_norm": 0.22793932259082794, "learning_rate": 1.4048265432417745e-05, "loss": 1.2417, "step": 12964 }, { "epoch": 3.8615759787039967, "grad_norm": 0.23911087214946747, "learning_rate": 1.4047383420373121e-05, "loss": 1.2382, "step": 12965 }, { "epoch": 3.8618738249036655, "grad_norm": 0.2349393218755722, "learning_rate": 1.4046501370672114e-05, "loss": 1.2265, "step": 12966 }, { "epoch": 3.862171671103334, "grad_norm": 0.2519095540046692, "learning_rate": 1.4045619283322936e-05, "loss": 1.238, "step": 12967 }, { "epoch": 3.8624695173030026, "grad_norm": 0.24281921982765198, "learning_rate": 1.4044737158333793e-05, "loss": 1.2064, "step": 12968 }, { "epoch": 3.8627673635026714, "grad_norm": 0.24120020866394043, "learning_rate": 1.4043854995712886e-05, "loss": 1.2156, "step": 12969 }, { "epoch": 3.8630652097023397, "grad_norm": 0.23576250672340393, "learning_rate": 1.4042972795468428e-05, "loss": 1.2127, "step": 12970 }, { "epoch": 3.8633630559020085, "grad_norm": 0.24687890708446503, "learning_rate": 1.4042090557608623e-05, "loss": 1.2336, "step": 12971 }, { "epoch": 3.8636609021016772, "grad_norm": 0.2488422989845276, "learning_rate": 1.4041208282141683e-05, "loss": 1.2291, "step": 12972 }, { "epoch": 3.863958748301346, "grad_norm": 0.24812117218971252, "learning_rate": 1.4040325969075814e-05, "loss": 1.2138, "step": 12973 }, { "epoch": 3.8642565945010148, "grad_norm": 0.3074995279312134, "learning_rate": 1.4039443618419228e-05, "loss": 1.2457, "step": 12974 }, { "epoch": 3.864554440700683, "grad_norm": 0.2558381259441376, "learning_rate": 1.4038561230180132e-05, "loss": 1.2468, "step": 12975 }, { "epoch": 3.864852286900352, "grad_norm": 0.26688438653945923, "learning_rate": 1.4037678804366735e-05, "loss": 1.2287, "step": 12976 }, { "epoch": 3.8651501331000206, "grad_norm": 0.24483263492584229, "learning_rate": 1.4036796340987248e-05, "loss": 1.2433, "step": 12977 }, { "epoch": 3.865447979299689, "grad_norm": 0.2557902932167053, "learning_rate": 1.4035913840049882e-05, "loss": 1.2298, "step": 12978 }, { "epoch": 3.8657458254993577, "grad_norm": 0.22995924949645996, "learning_rate": 1.4035031301562845e-05, "loss": 1.2227, "step": 12979 }, { "epoch": 3.8660436716990265, "grad_norm": 0.25961512327194214, "learning_rate": 1.4034148725534351e-05, "loss": 1.2325, "step": 12980 }, { "epoch": 3.866341517898695, "grad_norm": 0.24411143362522125, "learning_rate": 1.4033266111972613e-05, "loss": 1.2315, "step": 12981 }, { "epoch": 3.8666393640983636, "grad_norm": 0.2525519132614136, "learning_rate": 1.4032383460885837e-05, "loss": 1.2345, "step": 12982 }, { "epoch": 3.8669372102980324, "grad_norm": 0.24101418256759644, "learning_rate": 1.4031500772282238e-05, "loss": 1.2236, "step": 12983 }, { "epoch": 3.867235056497701, "grad_norm": 0.26651257276535034, "learning_rate": 1.4030618046170026e-05, "loss": 1.2273, "step": 12984 }, { "epoch": 3.8675329026973695, "grad_norm": 0.2586762011051178, "learning_rate": 1.4029735282557423e-05, "loss": 1.2247, "step": 12985 }, { "epoch": 3.8678307488970383, "grad_norm": 0.27988675236701965, "learning_rate": 1.402885248145263e-05, "loss": 1.2244, "step": 12986 }, { "epoch": 3.868128595096707, "grad_norm": 0.2713130712509155, "learning_rate": 1.4027969642863869e-05, "loss": 1.2288, "step": 12987 }, { "epoch": 3.868426441296376, "grad_norm": 0.3202548623085022, "learning_rate": 1.4027086766799352e-05, "loss": 1.2312, "step": 12988 }, { "epoch": 3.868724287496044, "grad_norm": 0.2576482892036438, "learning_rate": 1.4026203853267285e-05, "loss": 1.2434, "step": 12989 }, { "epoch": 3.869022133695713, "grad_norm": 0.287153035402298, "learning_rate": 1.4025320902275895e-05, "loss": 1.2261, "step": 12990 }, { "epoch": 3.8693199798953817, "grad_norm": 0.299532413482666, "learning_rate": 1.4024437913833391e-05, "loss": 1.2214, "step": 12991 }, { "epoch": 3.86961782609505, "grad_norm": 0.26357489824295044, "learning_rate": 1.402355488794799e-05, "loss": 1.229, "step": 12992 }, { "epoch": 3.869915672294719, "grad_norm": 0.2971954047679901, "learning_rate": 1.4022671824627902e-05, "loss": 1.2239, "step": 12993 }, { "epoch": 3.8702135184943876, "grad_norm": 0.2457713931798935, "learning_rate": 1.402178872388135e-05, "loss": 1.2296, "step": 12994 }, { "epoch": 3.870511364694056, "grad_norm": 0.26199328899383545, "learning_rate": 1.4020905585716546e-05, "loss": 1.2322, "step": 12995 }, { "epoch": 3.8708092108937247, "grad_norm": 0.24583016335964203, "learning_rate": 1.402002241014171e-05, "loss": 1.2309, "step": 12996 }, { "epoch": 3.8711070570933934, "grad_norm": 0.25787654519081116, "learning_rate": 1.4019139197165054e-05, "loss": 1.2374, "step": 12997 }, { "epoch": 3.871404903293062, "grad_norm": 0.26302674412727356, "learning_rate": 1.4018255946794802e-05, "loss": 1.2407, "step": 12998 }, { "epoch": 3.871702749492731, "grad_norm": 0.2593098282814026, "learning_rate": 1.4017372659039166e-05, "loss": 1.2435, "step": 12999 }, { "epoch": 3.8720005956923993, "grad_norm": 0.2577357888221741, "learning_rate": 1.4016489333906365e-05, "loss": 1.2303, "step": 13000 }, { "epoch": 3.8720005956923993, "eval_loss": 1.3259975910186768, "eval_runtime": 22.2707, "eval_samples_per_second": 77.86, "eval_steps_per_second": 4.894, "step": 13000 }, { "epoch": 3.872298441892068, "grad_norm": 0.25295189023017883, "learning_rate": 1.4015605971404618e-05, "loss": 1.231, "step": 13001 }, { "epoch": 3.872596288091737, "grad_norm": 0.2653793692588806, "learning_rate": 1.4014722571542146e-05, "loss": 1.2351, "step": 13002 }, { "epoch": 3.872894134291405, "grad_norm": 0.2781165838241577, "learning_rate": 1.4013839134327163e-05, "loss": 1.2451, "step": 13003 }, { "epoch": 3.873191980491074, "grad_norm": 0.24404959380626678, "learning_rate": 1.4012955659767893e-05, "loss": 1.2195, "step": 13004 }, { "epoch": 3.8734898266907427, "grad_norm": 0.2575218081474304, "learning_rate": 1.4012072147872555e-05, "loss": 1.2222, "step": 13005 }, { "epoch": 3.873787672890411, "grad_norm": 0.2504289746284485, "learning_rate": 1.4011188598649368e-05, "loss": 1.2299, "step": 13006 }, { "epoch": 3.87408551909008, "grad_norm": 0.23770669102668762, "learning_rate": 1.401030501210655e-05, "loss": 1.2146, "step": 13007 }, { "epoch": 3.8743833652897486, "grad_norm": 0.2796214818954468, "learning_rate": 1.4009421388252326e-05, "loss": 1.2219, "step": 13008 }, { "epoch": 3.874681211489417, "grad_norm": 0.2567894756793976, "learning_rate": 1.4008537727094918e-05, "loss": 1.2271, "step": 13009 }, { "epoch": 3.8749790576890857, "grad_norm": 0.29812344908714294, "learning_rate": 1.4007654028642543e-05, "loss": 1.2158, "step": 13010 }, { "epoch": 3.8752769038887545, "grad_norm": 0.2508329451084137, "learning_rate": 1.4006770292903425e-05, "loss": 1.2244, "step": 13011 }, { "epoch": 3.8755747500884232, "grad_norm": 0.32301008701324463, "learning_rate": 1.4005886519885788e-05, "loss": 1.2287, "step": 13012 }, { "epoch": 3.875872596288092, "grad_norm": 0.25789791345596313, "learning_rate": 1.4005002709597848e-05, "loss": 1.2348, "step": 13013 }, { "epoch": 3.8761704424877603, "grad_norm": 0.3174765706062317, "learning_rate": 1.4004118862047836e-05, "loss": 1.2159, "step": 13014 }, { "epoch": 3.876468288687429, "grad_norm": 0.24706552922725677, "learning_rate": 1.400323497724397e-05, "loss": 1.2357, "step": 13015 }, { "epoch": 3.876766134887098, "grad_norm": 0.2730309069156647, "learning_rate": 1.4002351055194477e-05, "loss": 1.2325, "step": 13016 }, { "epoch": 3.877063981086766, "grad_norm": 0.24229326844215393, "learning_rate": 1.4001467095907577e-05, "loss": 1.2452, "step": 13017 }, { "epoch": 3.877361827286435, "grad_norm": 0.24087953567504883, "learning_rate": 1.4000583099391498e-05, "loss": 1.2372, "step": 13018 }, { "epoch": 3.8776596734861037, "grad_norm": 0.2412424385547638, "learning_rate": 1.3999699065654462e-05, "loss": 1.2286, "step": 13019 }, { "epoch": 3.877957519685772, "grad_norm": 0.23994415998458862, "learning_rate": 1.3998814994704696e-05, "loss": 1.2291, "step": 13020 }, { "epoch": 3.878255365885441, "grad_norm": 0.2591041922569275, "learning_rate": 1.3997930886550421e-05, "loss": 1.2072, "step": 13021 }, { "epoch": 3.8785532120851096, "grad_norm": 0.2399747222661972, "learning_rate": 1.3997046741199871e-05, "loss": 1.2435, "step": 13022 }, { "epoch": 3.878851058284778, "grad_norm": 0.23505285382270813, "learning_rate": 1.3996162558661264e-05, "loss": 1.2195, "step": 13023 }, { "epoch": 3.8791489044844467, "grad_norm": 0.2627851963043213, "learning_rate": 1.399527833894283e-05, "loss": 1.2335, "step": 13024 }, { "epoch": 3.8794467506841155, "grad_norm": 0.28813469409942627, "learning_rate": 1.3994394082052791e-05, "loss": 1.239, "step": 13025 }, { "epoch": 3.8797445968837843, "grad_norm": 0.24302732944488525, "learning_rate": 1.3993509787999383e-05, "loss": 1.2214, "step": 13026 }, { "epoch": 3.880042443083453, "grad_norm": 0.2919987440109253, "learning_rate": 1.3992625456790827e-05, "loss": 1.2259, "step": 13027 }, { "epoch": 3.8803402892831214, "grad_norm": 0.25420111417770386, "learning_rate": 1.3991741088435351e-05, "loss": 1.2392, "step": 13028 }, { "epoch": 3.88063813548279, "grad_norm": 0.31524187326431274, "learning_rate": 1.3990856682941184e-05, "loss": 1.2354, "step": 13029 }, { "epoch": 3.880935981682459, "grad_norm": 0.33104392886161804, "learning_rate": 1.3989972240316556e-05, "loss": 1.2406, "step": 13030 }, { "epoch": 3.8812338278821272, "grad_norm": 0.24215851724147797, "learning_rate": 1.398908776056969e-05, "loss": 1.2299, "step": 13031 }, { "epoch": 3.881531674081796, "grad_norm": 0.32851579785346985, "learning_rate": 1.3988203243708822e-05, "loss": 1.2293, "step": 13032 }, { "epoch": 3.8818295202814648, "grad_norm": 0.27357521653175354, "learning_rate": 1.398731868974218e-05, "loss": 1.218, "step": 13033 }, { "epoch": 3.882127366481133, "grad_norm": 0.2925247251987457, "learning_rate": 1.3986434098677992e-05, "loss": 1.223, "step": 13034 }, { "epoch": 3.882425212680802, "grad_norm": 0.31523144245147705, "learning_rate": 1.3985549470524487e-05, "loss": 1.2375, "step": 13035 }, { "epoch": 3.8827230588804706, "grad_norm": 0.25969579815864563, "learning_rate": 1.3984664805289898e-05, "loss": 1.2343, "step": 13036 }, { "epoch": 3.883020905080139, "grad_norm": 0.31095975637435913, "learning_rate": 1.3983780102982454e-05, "loss": 1.2279, "step": 13037 }, { "epoch": 3.8833187512798077, "grad_norm": 0.28102242946624756, "learning_rate": 1.398289536361039e-05, "loss": 1.2278, "step": 13038 }, { "epoch": 3.8836165974794765, "grad_norm": 0.2894362807273865, "learning_rate": 1.3982010587181933e-05, "loss": 1.2371, "step": 13039 }, { "epoch": 3.8839144436791453, "grad_norm": 0.2896195948123932, "learning_rate": 1.3981125773705316e-05, "loss": 1.2161, "step": 13040 }, { "epoch": 3.884212289878814, "grad_norm": 0.2916172444820404, "learning_rate": 1.398024092318877e-05, "loss": 1.2349, "step": 13041 }, { "epoch": 3.8845101360784824, "grad_norm": 0.2779673635959625, "learning_rate": 1.3979356035640532e-05, "loss": 1.211, "step": 13042 }, { "epoch": 3.884807982278151, "grad_norm": 0.29934269189834595, "learning_rate": 1.3978471111068829e-05, "loss": 1.2235, "step": 13043 }, { "epoch": 3.88510582847782, "grad_norm": 0.26979440450668335, "learning_rate": 1.3977586149481901e-05, "loss": 1.2242, "step": 13044 }, { "epoch": 3.8854036746774883, "grad_norm": 0.30681008100509644, "learning_rate": 1.3976701150887975e-05, "loss": 1.2336, "step": 13045 }, { "epoch": 3.885701520877157, "grad_norm": 0.24776335060596466, "learning_rate": 1.397581611529529e-05, "loss": 1.2464, "step": 13046 }, { "epoch": 3.885999367076826, "grad_norm": 0.2861195206642151, "learning_rate": 1.3974931042712079e-05, "loss": 1.2284, "step": 13047 }, { "epoch": 3.886297213276494, "grad_norm": 0.28940054774284363, "learning_rate": 1.3974045933146574e-05, "loss": 1.2224, "step": 13048 }, { "epoch": 3.886595059476163, "grad_norm": 0.2539926767349243, "learning_rate": 1.3973160786607007e-05, "loss": 1.2316, "step": 13049 }, { "epoch": 3.8868929056758317, "grad_norm": 0.2622884213924408, "learning_rate": 1.3972275603101624e-05, "loss": 1.2289, "step": 13050 }, { "epoch": 3.8871907518755004, "grad_norm": 0.2613528370857239, "learning_rate": 1.3971390382638654e-05, "loss": 1.2188, "step": 13051 }, { "epoch": 3.8874885980751688, "grad_norm": 0.2652732729911804, "learning_rate": 1.3970505125226332e-05, "loss": 1.2392, "step": 13052 }, { "epoch": 3.8877864442748375, "grad_norm": 0.26048338413238525, "learning_rate": 1.3969619830872898e-05, "loss": 1.2253, "step": 13053 }, { "epoch": 3.8880842904745063, "grad_norm": 0.25450706481933594, "learning_rate": 1.3968734499586582e-05, "loss": 1.2321, "step": 13054 }, { "epoch": 3.888382136674175, "grad_norm": 0.26873451471328735, "learning_rate": 1.3967849131375631e-05, "loss": 1.2354, "step": 13055 }, { "epoch": 3.8886799828738434, "grad_norm": 0.24118025600910187, "learning_rate": 1.3966963726248276e-05, "loss": 1.2447, "step": 13056 }, { "epoch": 3.888977829073512, "grad_norm": 0.32833707332611084, "learning_rate": 1.3966078284212754e-05, "loss": 1.2256, "step": 13057 }, { "epoch": 3.889275675273181, "grad_norm": 0.242631733417511, "learning_rate": 1.3965192805277306e-05, "loss": 1.2471, "step": 13058 }, { "epoch": 3.8895735214728493, "grad_norm": 0.2761871814727783, "learning_rate": 1.3964307289450166e-05, "loss": 1.2259, "step": 13059 }, { "epoch": 3.889871367672518, "grad_norm": 0.250262975692749, "learning_rate": 1.3963421736739578e-05, "loss": 1.2243, "step": 13060 }, { "epoch": 3.890169213872187, "grad_norm": 0.28379562497138977, "learning_rate": 1.396253614715378e-05, "loss": 1.2481, "step": 13061 }, { "epoch": 3.890467060071855, "grad_norm": 0.23868994414806366, "learning_rate": 1.396165052070101e-05, "loss": 1.2245, "step": 13062 }, { "epoch": 3.890764906271524, "grad_norm": 0.3253426253795624, "learning_rate": 1.3960764857389509e-05, "loss": 1.2375, "step": 13063 }, { "epoch": 3.8910627524711927, "grad_norm": 0.2643393278121948, "learning_rate": 1.3959879157227514e-05, "loss": 1.2149, "step": 13064 }, { "epoch": 3.8913605986708615, "grad_norm": 0.34511491656303406, "learning_rate": 1.3958993420223268e-05, "loss": 1.2409, "step": 13065 }, { "epoch": 3.8916584448705303, "grad_norm": 0.25883370637893677, "learning_rate": 1.3958107646385011e-05, "loss": 1.2357, "step": 13066 }, { "epoch": 3.8919562910701986, "grad_norm": 0.351248174905777, "learning_rate": 1.3957221835720986e-05, "loss": 1.2441, "step": 13067 }, { "epoch": 3.8922541372698674, "grad_norm": 0.25470685958862305, "learning_rate": 1.3956335988239431e-05, "loss": 1.2149, "step": 13068 }, { "epoch": 3.892551983469536, "grad_norm": 0.43083131313323975, "learning_rate": 1.3955450103948592e-05, "loss": 1.2292, "step": 13069 }, { "epoch": 3.8928498296692045, "grad_norm": 0.31342896819114685, "learning_rate": 1.3954564182856709e-05, "loss": 1.2309, "step": 13070 }, { "epoch": 3.8931476758688732, "grad_norm": 0.29841113090515137, "learning_rate": 1.3953678224972025e-05, "loss": 1.2335, "step": 13071 }, { "epoch": 3.893445522068542, "grad_norm": 0.24150791764259338, "learning_rate": 1.395279223030278e-05, "loss": 1.2348, "step": 13072 }, { "epoch": 3.8937433682682103, "grad_norm": 0.3425508439540863, "learning_rate": 1.3951906198857221e-05, "loss": 1.2453, "step": 13073 }, { "epoch": 3.894041214467879, "grad_norm": 0.24360494315624237, "learning_rate": 1.395102013064359e-05, "loss": 1.2332, "step": 13074 }, { "epoch": 3.894339060667548, "grad_norm": 0.27031195163726807, "learning_rate": 1.3950134025670132e-05, "loss": 1.219, "step": 13075 }, { "epoch": 3.894636906867216, "grad_norm": 0.2692050635814667, "learning_rate": 1.394924788394509e-05, "loss": 1.2112, "step": 13076 }, { "epoch": 3.894934753066885, "grad_norm": 0.2483391910791397, "learning_rate": 1.394836170547671e-05, "loss": 1.2255, "step": 13077 }, { "epoch": 3.8952325992665537, "grad_norm": 0.256445050239563, "learning_rate": 1.3947475490273232e-05, "loss": 1.2399, "step": 13078 }, { "epoch": 3.8955304454662225, "grad_norm": 0.2907178997993469, "learning_rate": 1.3946589238342907e-05, "loss": 1.2274, "step": 13079 }, { "epoch": 3.8958282916658913, "grad_norm": 0.29144421219825745, "learning_rate": 1.394570294969398e-05, "loss": 1.2228, "step": 13080 }, { "epoch": 3.8961261378655596, "grad_norm": 0.2566499710083008, "learning_rate": 1.3944816624334693e-05, "loss": 1.2242, "step": 13081 }, { "epoch": 3.8964239840652284, "grad_norm": 0.252852201461792, "learning_rate": 1.3943930262273297e-05, "loss": 1.2238, "step": 13082 }, { "epoch": 3.896721830264897, "grad_norm": 0.28006088733673096, "learning_rate": 1.3943043863518036e-05, "loss": 1.2393, "step": 13083 }, { "epoch": 3.8970196764645655, "grad_norm": 0.2647339701652527, "learning_rate": 1.3942157428077154e-05, "loss": 1.2377, "step": 13084 }, { "epoch": 3.8973175226642343, "grad_norm": 0.2689646780490875, "learning_rate": 1.3941270955958905e-05, "loss": 1.2193, "step": 13085 }, { "epoch": 3.897615368863903, "grad_norm": 0.2609025835990906, "learning_rate": 1.3940384447171531e-05, "loss": 1.2239, "step": 13086 }, { "epoch": 3.8979132150635714, "grad_norm": 0.2561178505420685, "learning_rate": 1.3939497901723287e-05, "loss": 1.2291, "step": 13087 }, { "epoch": 3.89821106126324, "grad_norm": 0.2868281602859497, "learning_rate": 1.393861131962241e-05, "loss": 1.2249, "step": 13088 }, { "epoch": 3.898508907462909, "grad_norm": 0.2537521421909332, "learning_rate": 1.3937724700877157e-05, "loss": 1.2163, "step": 13089 }, { "epoch": 3.8988067536625772, "grad_norm": 0.3083294928073883, "learning_rate": 1.3936838045495775e-05, "loss": 1.2244, "step": 13090 }, { "epoch": 3.899104599862246, "grad_norm": 0.26004940271377563, "learning_rate": 1.3935951353486516e-05, "loss": 1.2341, "step": 13091 }, { "epoch": 3.8994024460619148, "grad_norm": 0.28579092025756836, "learning_rate": 1.3935064624857626e-05, "loss": 1.2276, "step": 13092 }, { "epoch": 3.8997002922615835, "grad_norm": 0.28353142738342285, "learning_rate": 1.3934177859617356e-05, "loss": 1.2194, "step": 13093 }, { "epoch": 3.8999981384612523, "grad_norm": 0.3470597565174103, "learning_rate": 1.3933291057773959e-05, "loss": 1.2221, "step": 13094 }, { "epoch": 3.9002959846609206, "grad_norm": 0.27343669533729553, "learning_rate": 1.3932404219335678e-05, "loss": 1.2356, "step": 13095 }, { "epoch": 3.9005938308605894, "grad_norm": 0.31600189208984375, "learning_rate": 1.3931517344310772e-05, "loss": 1.2453, "step": 13096 }, { "epoch": 3.900891677060258, "grad_norm": 0.24470661580562592, "learning_rate": 1.393063043270749e-05, "loss": 1.2442, "step": 13097 }, { "epoch": 3.9011895232599265, "grad_norm": 0.380693644285202, "learning_rate": 1.3929743484534083e-05, "loss": 1.2367, "step": 13098 }, { "epoch": 3.9014873694595953, "grad_norm": 0.27079856395721436, "learning_rate": 1.3928856499798803e-05, "loss": 1.2318, "step": 13099 }, { "epoch": 3.901785215659264, "grad_norm": 0.31574124097824097, "learning_rate": 1.3927969478509904e-05, "loss": 1.2325, "step": 13100 }, { "epoch": 3.9020830618589324, "grad_norm": 0.28129833936691284, "learning_rate": 1.3927082420675637e-05, "loss": 1.2394, "step": 13101 }, { "epoch": 3.902380908058601, "grad_norm": 0.2591829299926758, "learning_rate": 1.3926195326304254e-05, "loss": 1.229, "step": 13102 }, { "epoch": 3.90267875425827, "grad_norm": 0.4169105291366577, "learning_rate": 1.3925308195404013e-05, "loss": 1.2144, "step": 13103 }, { "epoch": 3.9029766004579383, "grad_norm": 0.32777151465415955, "learning_rate": 1.3924421027983166e-05, "loss": 1.2326, "step": 13104 }, { "epoch": 3.903274446657607, "grad_norm": 0.3248552680015564, "learning_rate": 1.3923533824049962e-05, "loss": 1.2232, "step": 13105 }, { "epoch": 3.903572292857276, "grad_norm": 0.371523916721344, "learning_rate": 1.392264658361266e-05, "loss": 1.2324, "step": 13106 }, { "epoch": 3.9038701390569446, "grad_norm": 0.29270094633102417, "learning_rate": 1.3921759306679516e-05, "loss": 1.2272, "step": 13107 }, { "epoch": 3.9041679852566133, "grad_norm": 0.31862056255340576, "learning_rate": 1.3920871993258782e-05, "loss": 1.2088, "step": 13108 }, { "epoch": 3.9044658314562817, "grad_norm": 0.24687409400939941, "learning_rate": 1.3919984643358715e-05, "loss": 1.2067, "step": 13109 }, { "epoch": 3.9047636776559504, "grad_norm": 0.2599547803401947, "learning_rate": 1.391909725698757e-05, "loss": 1.2238, "step": 13110 }, { "epoch": 3.905061523855619, "grad_norm": 0.26793843507766724, "learning_rate": 1.3918209834153606e-05, "loss": 1.2377, "step": 13111 }, { "epoch": 3.9053593700552875, "grad_norm": 0.23883236944675446, "learning_rate": 1.3917322374865076e-05, "loss": 1.2203, "step": 13112 }, { "epoch": 3.9056572162549563, "grad_norm": 0.23982957005500793, "learning_rate": 1.3916434879130233e-05, "loss": 1.2224, "step": 13113 }, { "epoch": 3.905955062454625, "grad_norm": 0.2505717873573303, "learning_rate": 1.3915547346957348e-05, "loss": 1.2332, "step": 13114 }, { "epoch": 3.9062529086542934, "grad_norm": 0.2481008768081665, "learning_rate": 1.3914659778354664e-05, "loss": 1.226, "step": 13115 }, { "epoch": 3.906550754853962, "grad_norm": 0.23589645326137543, "learning_rate": 1.3913772173330445e-05, "loss": 1.2554, "step": 13116 }, { "epoch": 3.906848601053631, "grad_norm": 0.2520972192287445, "learning_rate": 1.391288453189295e-05, "loss": 1.2257, "step": 13117 }, { "epoch": 3.9071464472532997, "grad_norm": 0.24034655094146729, "learning_rate": 1.3911996854050436e-05, "loss": 1.2379, "step": 13118 }, { "epoch": 3.907444293452968, "grad_norm": 0.26680758595466614, "learning_rate": 1.3911109139811161e-05, "loss": 1.2263, "step": 13119 }, { "epoch": 3.907742139652637, "grad_norm": 0.2632834315299988, "learning_rate": 1.3910221389183384e-05, "loss": 1.2191, "step": 13120 }, { "epoch": 3.9080399858523056, "grad_norm": 0.26333409547805786, "learning_rate": 1.390933360217537e-05, "loss": 1.2396, "step": 13121 }, { "epoch": 3.9083378320519744, "grad_norm": 0.24903489649295807, "learning_rate": 1.3908445778795372e-05, "loss": 1.2389, "step": 13122 }, { "epoch": 3.9086356782516427, "grad_norm": 0.3399023115634918, "learning_rate": 1.3907557919051654e-05, "loss": 1.2339, "step": 13123 }, { "epoch": 3.9089335244513115, "grad_norm": 0.37201112508773804, "learning_rate": 1.3906670022952473e-05, "loss": 1.2178, "step": 13124 }, { "epoch": 3.9092313706509803, "grad_norm": 0.2407924383878708, "learning_rate": 1.3905782090506095e-05, "loss": 1.2169, "step": 13125 }, { "epoch": 3.9095292168506486, "grad_norm": 0.37440383434295654, "learning_rate": 1.3904894121720776e-05, "loss": 1.2331, "step": 13126 }, { "epoch": 3.9098270630503174, "grad_norm": 0.27032122015953064, "learning_rate": 1.390400611660478e-05, "loss": 1.2259, "step": 13127 }, { "epoch": 3.910124909249986, "grad_norm": 0.3098152279853821, "learning_rate": 1.3903118075166371e-05, "loss": 1.2328, "step": 13128 }, { "epoch": 3.9104227554496545, "grad_norm": 0.24604958295822144, "learning_rate": 1.390222999741381e-05, "loss": 1.2314, "step": 13129 }, { "epoch": 3.9107206016493232, "grad_norm": 0.2745722830295563, "learning_rate": 1.3901341883355356e-05, "loss": 1.2192, "step": 13130 }, { "epoch": 3.911018447848992, "grad_norm": 0.300998717546463, "learning_rate": 1.3900453732999273e-05, "loss": 1.2137, "step": 13131 }, { "epoch": 3.9113162940486608, "grad_norm": 0.4329034686088562, "learning_rate": 1.3899565546353828e-05, "loss": 1.2376, "step": 13132 }, { "epoch": 3.9116141402483295, "grad_norm": 0.29281479120254517, "learning_rate": 1.3898677323427283e-05, "loss": 1.2332, "step": 13133 }, { "epoch": 3.911911986447998, "grad_norm": 0.27857616543769836, "learning_rate": 1.38977890642279e-05, "loss": 1.2366, "step": 13134 }, { "epoch": 3.9122098326476666, "grad_norm": 0.3135628402233124, "learning_rate": 1.3896900768763948e-05, "loss": 1.2315, "step": 13135 }, { "epoch": 3.9125076788473354, "grad_norm": 0.2370682656764984, "learning_rate": 1.3896012437043685e-05, "loss": 1.234, "step": 13136 }, { "epoch": 3.9128055250470037, "grad_norm": 0.29846489429473877, "learning_rate": 1.3895124069075378e-05, "loss": 1.2391, "step": 13137 }, { "epoch": 3.9131033712466725, "grad_norm": 0.2559567987918854, "learning_rate": 1.3894235664867298e-05, "loss": 1.2308, "step": 13138 }, { "epoch": 3.9134012174463413, "grad_norm": 0.27251166105270386, "learning_rate": 1.3893347224427703e-05, "loss": 1.2277, "step": 13139 }, { "epoch": 3.9136990636460096, "grad_norm": 0.24662406742572784, "learning_rate": 1.3892458747764863e-05, "loss": 1.2231, "step": 13140 }, { "epoch": 3.9139969098456784, "grad_norm": 0.257978230714798, "learning_rate": 1.3891570234887041e-05, "loss": 1.222, "step": 13141 }, { "epoch": 3.914294756045347, "grad_norm": 0.242611363530159, "learning_rate": 1.3890681685802508e-05, "loss": 1.2354, "step": 13142 }, { "epoch": 3.9145926022450155, "grad_norm": 0.274141401052475, "learning_rate": 1.3889793100519529e-05, "loss": 1.2174, "step": 13143 }, { "epoch": 3.9148904484446843, "grad_norm": 0.27475303411483765, "learning_rate": 1.388890447904637e-05, "loss": 1.2136, "step": 13144 }, { "epoch": 3.915188294644353, "grad_norm": 0.2634553611278534, "learning_rate": 1.38880158213913e-05, "loss": 1.2218, "step": 13145 }, { "epoch": 3.915486140844022, "grad_norm": 0.36685436964035034, "learning_rate": 1.388712712756259e-05, "loss": 1.2461, "step": 13146 }, { "epoch": 3.9157839870436906, "grad_norm": 0.3333728611469269, "learning_rate": 1.3886238397568498e-05, "loss": 1.2325, "step": 13147 }, { "epoch": 3.916081833243359, "grad_norm": 0.33060261607170105, "learning_rate": 1.3885349631417307e-05, "loss": 1.2172, "step": 13148 }, { "epoch": 3.9163796794430277, "grad_norm": 0.2988325357437134, "learning_rate": 1.3884460829117275e-05, "loss": 1.2336, "step": 13149 }, { "epoch": 3.9166775256426964, "grad_norm": 0.3582686185836792, "learning_rate": 1.3883571990676675e-05, "loss": 1.2411, "step": 13150 }, { "epoch": 3.9169753718423648, "grad_norm": 0.298128604888916, "learning_rate": 1.3882683116103777e-05, "loss": 1.2524, "step": 13151 }, { "epoch": 3.9172732180420335, "grad_norm": 0.3053963780403137, "learning_rate": 1.3881794205406852e-05, "loss": 1.2129, "step": 13152 }, { "epoch": 3.9175710642417023, "grad_norm": 0.28138604760169983, "learning_rate": 1.3880905258594166e-05, "loss": 1.2205, "step": 13153 }, { "epoch": 3.9178689104413706, "grad_norm": 0.33124756813049316, "learning_rate": 1.388001627567399e-05, "loss": 1.2279, "step": 13154 }, { "epoch": 3.9181667566410394, "grad_norm": 0.3027435839176178, "learning_rate": 1.3879127256654606e-05, "loss": 1.2409, "step": 13155 }, { "epoch": 3.918464602840708, "grad_norm": 0.27342790365219116, "learning_rate": 1.3878238201544272e-05, "loss": 1.2489, "step": 13156 }, { "epoch": 3.9187624490403765, "grad_norm": 0.35694262385368347, "learning_rate": 1.3877349110351263e-05, "loss": 1.2568, "step": 13157 }, { "epoch": 3.9190602952400453, "grad_norm": 0.28090721368789673, "learning_rate": 1.3876459983083857e-05, "loss": 1.2317, "step": 13158 }, { "epoch": 3.919358141439714, "grad_norm": 0.3718603849411011, "learning_rate": 1.3875570819750319e-05, "loss": 1.2171, "step": 13159 }, { "epoch": 3.919655987639383, "grad_norm": 0.33166900277137756, "learning_rate": 1.3874681620358924e-05, "loss": 1.2233, "step": 13160 }, { "epoch": 3.9199538338390516, "grad_norm": 0.26730549335479736, "learning_rate": 1.3873792384917945e-05, "loss": 1.2252, "step": 13161 }, { "epoch": 3.92025168003872, "grad_norm": 0.2688398063182831, "learning_rate": 1.387290311343566e-05, "loss": 1.2129, "step": 13162 }, { "epoch": 3.9205495262383887, "grad_norm": 0.26750171184539795, "learning_rate": 1.3872013805920337e-05, "loss": 1.2331, "step": 13163 }, { "epoch": 3.9208473724380575, "grad_norm": 0.32285237312316895, "learning_rate": 1.387112446238025e-05, "loss": 1.232, "step": 13164 }, { "epoch": 3.921145218637726, "grad_norm": 0.2518673241138458, "learning_rate": 1.3870235082823675e-05, "loss": 1.2255, "step": 13165 }, { "epoch": 3.9214430648373946, "grad_norm": 0.30436524748802185, "learning_rate": 1.3869345667258887e-05, "loss": 1.2371, "step": 13166 }, { "epoch": 3.9217409110370633, "grad_norm": 0.2510192096233368, "learning_rate": 1.386845621569416e-05, "loss": 1.2338, "step": 13167 }, { "epoch": 3.9220387572367317, "grad_norm": 0.3274816572666168, "learning_rate": 1.3867566728137771e-05, "loss": 1.2375, "step": 13168 }, { "epoch": 3.9223366034364004, "grad_norm": 0.2782514989376068, "learning_rate": 1.3866677204597997e-05, "loss": 1.2405, "step": 13169 }, { "epoch": 3.922634449636069, "grad_norm": 0.2858296036720276, "learning_rate": 1.3865787645083111e-05, "loss": 1.2219, "step": 13170 }, { "epoch": 3.9229322958357375, "grad_norm": 0.25521355867385864, "learning_rate": 1.3864898049601387e-05, "loss": 1.2246, "step": 13171 }, { "epoch": 3.9232301420354063, "grad_norm": 0.26153141260147095, "learning_rate": 1.3864008418161106e-05, "loss": 1.2382, "step": 13172 }, { "epoch": 3.923527988235075, "grad_norm": 0.26130759716033936, "learning_rate": 1.3863118750770543e-05, "loss": 1.2202, "step": 13173 }, { "epoch": 3.923825834434744, "grad_norm": 0.2539772093296051, "learning_rate": 1.3862229047437979e-05, "loss": 1.2287, "step": 13174 }, { "epoch": 3.9241236806344126, "grad_norm": 0.27328330278396606, "learning_rate": 1.3861339308171686e-05, "loss": 1.2357, "step": 13175 }, { "epoch": 3.924421526834081, "grad_norm": 0.2556736469268799, "learning_rate": 1.3860449532979947e-05, "loss": 1.2191, "step": 13176 }, { "epoch": 3.9247193730337497, "grad_norm": 0.27171239256858826, "learning_rate": 1.3859559721871037e-05, "loss": 1.2467, "step": 13177 }, { "epoch": 3.9250172192334185, "grad_norm": 0.24272269010543823, "learning_rate": 1.3858669874853235e-05, "loss": 1.2175, "step": 13178 }, { "epoch": 3.925315065433087, "grad_norm": 0.26099082827568054, "learning_rate": 1.3857779991934823e-05, "loss": 1.2362, "step": 13179 }, { "epoch": 3.9256129116327556, "grad_norm": 0.25095367431640625, "learning_rate": 1.3856890073124077e-05, "loss": 1.2364, "step": 13180 }, { "epoch": 3.9259107578324244, "grad_norm": 0.29382559657096863, "learning_rate": 1.3856000118429278e-05, "loss": 1.2343, "step": 13181 }, { "epoch": 3.9262086040320927, "grad_norm": 0.3270135819911957, "learning_rate": 1.3855110127858706e-05, "loss": 1.2265, "step": 13182 }, { "epoch": 3.9265064502317615, "grad_norm": 0.2749212086200714, "learning_rate": 1.3854220101420644e-05, "loss": 1.2376, "step": 13183 }, { "epoch": 3.9268042964314303, "grad_norm": 0.3886204957962036, "learning_rate": 1.3853330039123365e-05, "loss": 1.2241, "step": 13184 }, { "epoch": 3.927102142631099, "grad_norm": 0.2546490430831909, "learning_rate": 1.385243994097516e-05, "loss": 1.227, "step": 13185 }, { "epoch": 3.9273999888307674, "grad_norm": 0.4590868651866913, "learning_rate": 1.3851549806984306e-05, "loss": 1.2189, "step": 13186 }, { "epoch": 3.927697835030436, "grad_norm": 0.3852839469909668, "learning_rate": 1.3850659637159081e-05, "loss": 1.2176, "step": 13187 }, { "epoch": 3.927995681230105, "grad_norm": 0.4147535562515259, "learning_rate": 1.3849769431507771e-05, "loss": 1.2238, "step": 13188 }, { "epoch": 3.9282935274297737, "grad_norm": 0.40568336844444275, "learning_rate": 1.3848879190038658e-05, "loss": 1.2503, "step": 13189 }, { "epoch": 3.928591373629442, "grad_norm": 0.34892594814300537, "learning_rate": 1.3847988912760025e-05, "loss": 1.2395, "step": 13190 }, { "epoch": 3.9288892198291108, "grad_norm": 0.31116774678230286, "learning_rate": 1.3847098599680153e-05, "loss": 1.2224, "step": 13191 }, { "epoch": 3.9291870660287795, "grad_norm": 0.360795795917511, "learning_rate": 1.3846208250807326e-05, "loss": 1.2294, "step": 13192 }, { "epoch": 3.929484912228448, "grad_norm": 0.2690966725349426, "learning_rate": 1.3845317866149833e-05, "loss": 1.2195, "step": 13193 }, { "epoch": 3.9297827584281166, "grad_norm": 0.4607526957988739, "learning_rate": 1.3844427445715952e-05, "loss": 1.2321, "step": 13194 }, { "epoch": 3.9300806046277854, "grad_norm": 0.2560616135597229, "learning_rate": 1.3843536989513964e-05, "loss": 1.2309, "step": 13195 }, { "epoch": 3.9303784508274537, "grad_norm": 0.29613804817199707, "learning_rate": 1.3842646497552164e-05, "loss": 1.2468, "step": 13196 }, { "epoch": 3.9306762970271225, "grad_norm": 0.28417542576789856, "learning_rate": 1.3841755969838829e-05, "loss": 1.2209, "step": 13197 }, { "epoch": 3.9309741432267913, "grad_norm": 0.34782496094703674, "learning_rate": 1.3840865406382246e-05, "loss": 1.2255, "step": 13198 }, { "epoch": 3.93127198942646, "grad_norm": 0.2893727123737335, "learning_rate": 1.3839974807190703e-05, "loss": 1.2479, "step": 13199 }, { "epoch": 3.931569835626129, "grad_norm": 0.2869235873222351, "learning_rate": 1.3839084172272487e-05, "loss": 1.2209, "step": 13200 }, { "epoch": 3.931867681825797, "grad_norm": 0.25480109453201294, "learning_rate": 1.3838193501635878e-05, "loss": 1.2301, "step": 13201 }, { "epoch": 3.932165528025466, "grad_norm": 0.2833738625049591, "learning_rate": 1.3837302795289169e-05, "loss": 1.226, "step": 13202 }, { "epoch": 3.9324633742251347, "grad_norm": 0.3162032961845398, "learning_rate": 1.3836412053240644e-05, "loss": 1.2398, "step": 13203 }, { "epoch": 3.932761220424803, "grad_norm": 0.3235669434070587, "learning_rate": 1.3835521275498593e-05, "loss": 1.2317, "step": 13204 }, { "epoch": 3.933059066624472, "grad_norm": 0.24424074590206146, "learning_rate": 1.3834630462071301e-05, "loss": 1.23, "step": 13205 }, { "epoch": 3.9333569128241406, "grad_norm": 0.47108832001686096, "learning_rate": 1.3833739612967054e-05, "loss": 1.2338, "step": 13206 }, { "epoch": 3.933654759023809, "grad_norm": 0.29592519998550415, "learning_rate": 1.3832848728194146e-05, "loss": 1.231, "step": 13207 }, { "epoch": 3.9339526052234777, "grad_norm": 0.3220840394496918, "learning_rate": 1.3831957807760862e-05, "loss": 1.2255, "step": 13208 }, { "epoch": 3.9342504514231464, "grad_norm": 0.26471439003944397, "learning_rate": 1.3831066851675492e-05, "loss": 1.2397, "step": 13209 }, { "epoch": 3.9345482976228148, "grad_norm": 0.47105714678764343, "learning_rate": 1.3830175859946327e-05, "loss": 1.2334, "step": 13210 }, { "epoch": 3.9348461438224835, "grad_norm": 0.23992817103862762, "learning_rate": 1.3829284832581653e-05, "loss": 1.2232, "step": 13211 }, { "epoch": 3.9351439900221523, "grad_norm": 0.32074904441833496, "learning_rate": 1.3828393769589764e-05, "loss": 1.2217, "step": 13212 }, { "epoch": 3.935441836221821, "grad_norm": 0.2720768451690674, "learning_rate": 1.3827502670978945e-05, "loss": 1.236, "step": 13213 }, { "epoch": 3.93573968242149, "grad_norm": 0.31859278678894043, "learning_rate": 1.3826611536757493e-05, "loss": 1.2269, "step": 13214 }, { "epoch": 3.936037528621158, "grad_norm": 0.2529468238353729, "learning_rate": 1.3825720366933695e-05, "loss": 1.2435, "step": 13215 }, { "epoch": 3.936335374820827, "grad_norm": 0.28112754225730896, "learning_rate": 1.382482916151584e-05, "loss": 1.2279, "step": 13216 }, { "epoch": 3.9366332210204957, "grad_norm": 0.28580331802368164, "learning_rate": 1.382393792051223e-05, "loss": 1.2195, "step": 13217 }, { "epoch": 3.936931067220164, "grad_norm": 0.23452556133270264, "learning_rate": 1.3823046643931144e-05, "loss": 1.2286, "step": 13218 }, { "epoch": 3.937228913419833, "grad_norm": 0.270723432302475, "learning_rate": 1.3822155331780882e-05, "loss": 1.2187, "step": 13219 }, { "epoch": 3.9375267596195016, "grad_norm": 0.25173231959342957, "learning_rate": 1.3821263984069737e-05, "loss": 1.2416, "step": 13220 }, { "epoch": 3.93782460581917, "grad_norm": 0.2495669424533844, "learning_rate": 1.3820372600805998e-05, "loss": 1.2041, "step": 13221 }, { "epoch": 3.9381224520188387, "grad_norm": 0.2559252083301544, "learning_rate": 1.381948118199796e-05, "loss": 1.227, "step": 13222 }, { "epoch": 3.9384202982185075, "grad_norm": 0.29392632842063904, "learning_rate": 1.3818589727653918e-05, "loss": 1.243, "step": 13223 }, { "epoch": 3.938718144418176, "grad_norm": 0.2594864070415497, "learning_rate": 1.3817698237782165e-05, "loss": 1.225, "step": 13224 }, { "epoch": 3.9390159906178446, "grad_norm": 0.25113070011138916, "learning_rate": 1.3816806712390993e-05, "loss": 1.2268, "step": 13225 }, { "epoch": 3.9393138368175133, "grad_norm": 0.26626530289649963, "learning_rate": 1.3815915151488702e-05, "loss": 1.2248, "step": 13226 }, { "epoch": 3.939611683017182, "grad_norm": 0.24988804757595062, "learning_rate": 1.3815023555083584e-05, "loss": 1.2278, "step": 13227 }, { "epoch": 3.939909529216851, "grad_norm": 0.25075680017471313, "learning_rate": 1.3814131923183934e-05, "loss": 1.2287, "step": 13228 }, { "epoch": 3.940207375416519, "grad_norm": 0.2675560712814331, "learning_rate": 1.3813240255798046e-05, "loss": 1.2194, "step": 13229 }, { "epoch": 3.940505221616188, "grad_norm": 0.2381962090730667, "learning_rate": 1.3812348552934219e-05, "loss": 1.2323, "step": 13230 }, { "epoch": 3.9408030678158568, "grad_norm": 0.24896834790706635, "learning_rate": 1.3811456814600748e-05, "loss": 1.2319, "step": 13231 }, { "epoch": 3.941100914015525, "grad_norm": 0.25786885619163513, "learning_rate": 1.3810565040805928e-05, "loss": 1.227, "step": 13232 }, { "epoch": 3.941398760215194, "grad_norm": 0.2563535273075104, "learning_rate": 1.3809673231558058e-05, "loss": 1.2305, "step": 13233 }, { "epoch": 3.9416966064148626, "grad_norm": 0.2621895968914032, "learning_rate": 1.380878138686544e-05, "loss": 1.238, "step": 13234 }, { "epoch": 3.941994452614531, "grad_norm": 0.30734968185424805, "learning_rate": 1.3807889506736363e-05, "loss": 1.2295, "step": 13235 }, { "epoch": 3.9422922988141997, "grad_norm": 0.24150706827640533, "learning_rate": 1.3806997591179128e-05, "loss": 1.2265, "step": 13236 }, { "epoch": 3.9425901450138685, "grad_norm": 0.35120949149131775, "learning_rate": 1.3806105640202035e-05, "loss": 1.2247, "step": 13237 }, { "epoch": 3.942887991213537, "grad_norm": 0.315340131521225, "learning_rate": 1.3805213653813378e-05, "loss": 1.2356, "step": 13238 }, { "epoch": 3.9431858374132056, "grad_norm": 0.28169921040534973, "learning_rate": 1.3804321632021462e-05, "loss": 1.2115, "step": 13239 }, { "epoch": 3.9434836836128744, "grad_norm": 0.4050716161727905, "learning_rate": 1.3803429574834583e-05, "loss": 1.2306, "step": 13240 }, { "epoch": 3.943781529812543, "grad_norm": 0.2576994001865387, "learning_rate": 1.3802537482261044e-05, "loss": 1.2318, "step": 13241 }, { "epoch": 3.944079376012212, "grad_norm": 0.4222757816314697, "learning_rate": 1.3801645354309138e-05, "loss": 1.2308, "step": 13242 }, { "epoch": 3.9443772222118803, "grad_norm": 0.4433141052722931, "learning_rate": 1.3800753190987173e-05, "loss": 1.2196, "step": 13243 }, { "epoch": 3.944675068411549, "grad_norm": 0.259381502866745, "learning_rate": 1.3799860992303447e-05, "loss": 1.2299, "step": 13244 }, { "epoch": 3.944972914611218, "grad_norm": 0.4008481204509735, "learning_rate": 1.3798968758266256e-05, "loss": 1.2271, "step": 13245 }, { "epoch": 3.945270760810886, "grad_norm": 0.25758200883865356, "learning_rate": 1.3798076488883907e-05, "loss": 1.2356, "step": 13246 }, { "epoch": 3.945568607010555, "grad_norm": 0.3300653100013733, "learning_rate": 1.3797184184164699e-05, "loss": 1.236, "step": 13247 }, { "epoch": 3.9458664532102237, "grad_norm": 0.2681613564491272, "learning_rate": 1.3796291844116935e-05, "loss": 1.2247, "step": 13248 }, { "epoch": 3.946164299409892, "grad_norm": 0.32309770584106445, "learning_rate": 1.3795399468748916e-05, "loss": 1.2465, "step": 13249 }, { "epoch": 3.9464621456095608, "grad_norm": 0.3046136200428009, "learning_rate": 1.3794507058068947e-05, "loss": 1.225, "step": 13250 }, { "epoch": 3.9467599918092295, "grad_norm": 0.3005814254283905, "learning_rate": 1.3793614612085331e-05, "loss": 1.2279, "step": 13251 }, { "epoch": 3.9470578380088983, "grad_norm": 0.3176072835922241, "learning_rate": 1.3792722130806366e-05, "loss": 1.2109, "step": 13252 }, { "epoch": 3.9473556842085666, "grad_norm": 0.30952659249305725, "learning_rate": 1.379182961424036e-05, "loss": 1.2341, "step": 13253 }, { "epoch": 3.9476535304082354, "grad_norm": 0.34675028920173645, "learning_rate": 1.3790937062395615e-05, "loss": 1.2284, "step": 13254 }, { "epoch": 3.947951376607904, "grad_norm": 0.29791009426116943, "learning_rate": 1.3790044475280438e-05, "loss": 1.2338, "step": 13255 }, { "epoch": 3.948249222807573, "grad_norm": 0.37231138348579407, "learning_rate": 1.378915185290313e-05, "loss": 1.2312, "step": 13256 }, { "epoch": 3.9485470690072413, "grad_norm": 0.28895992040634155, "learning_rate": 1.3788259195271998e-05, "loss": 1.2508, "step": 13257 }, { "epoch": 3.94884491520691, "grad_norm": 0.2745595872402191, "learning_rate": 1.378736650239535e-05, "loss": 1.2536, "step": 13258 }, { "epoch": 3.949142761406579, "grad_norm": 0.24945619702339172, "learning_rate": 1.3786473774281484e-05, "loss": 1.2237, "step": 13259 }, { "epoch": 3.949440607606247, "grad_norm": 0.34441810846328735, "learning_rate": 1.3785581010938709e-05, "loss": 1.2449, "step": 13260 }, { "epoch": 3.949738453805916, "grad_norm": 0.28372707962989807, "learning_rate": 1.3784688212375337e-05, "loss": 1.2267, "step": 13261 }, { "epoch": 3.9500363000055847, "grad_norm": 0.2903372049331665, "learning_rate": 1.3783795378599666e-05, "loss": 1.253, "step": 13262 }, { "epoch": 3.950334146205253, "grad_norm": 0.24565307796001434, "learning_rate": 1.3782902509620009e-05, "loss": 1.2213, "step": 13263 }, { "epoch": 3.950631992404922, "grad_norm": 0.3678787052631378, "learning_rate": 1.3782009605444669e-05, "loss": 1.2338, "step": 13264 }, { "epoch": 3.9509298386045906, "grad_norm": 0.24600066244602203, "learning_rate": 1.3781116666081956e-05, "loss": 1.2318, "step": 13265 }, { "epoch": 3.9512276848042593, "grad_norm": 0.2908230721950531, "learning_rate": 1.3780223691540174e-05, "loss": 1.2265, "step": 13266 }, { "epoch": 3.951525531003928, "grad_norm": 0.2514212429523468, "learning_rate": 1.3779330681827637e-05, "loss": 1.2078, "step": 13267 }, { "epoch": 3.9518233772035964, "grad_norm": 0.25792211294174194, "learning_rate": 1.3778437636952654e-05, "loss": 1.2411, "step": 13268 }, { "epoch": 3.952121223403265, "grad_norm": 0.284408837556839, "learning_rate": 1.3777544556923523e-05, "loss": 1.2243, "step": 13269 }, { "epoch": 3.952419069602934, "grad_norm": 0.35277897119522095, "learning_rate": 1.3776651441748565e-05, "loss": 1.2177, "step": 13270 }, { "epoch": 3.9527169158026023, "grad_norm": 0.32351380586624146, "learning_rate": 1.3775758291436083e-05, "loss": 1.2358, "step": 13271 }, { "epoch": 3.953014762002271, "grad_norm": 0.3098355233669281, "learning_rate": 1.3774865105994388e-05, "loss": 1.2275, "step": 13272 }, { "epoch": 3.95331260820194, "grad_norm": 0.28767481446266174, "learning_rate": 1.3773971885431791e-05, "loss": 1.2379, "step": 13273 }, { "epoch": 3.953610454401608, "grad_norm": 0.3367336690425873, "learning_rate": 1.3773078629756604e-05, "loss": 1.2158, "step": 13274 }, { "epoch": 3.953908300601277, "grad_norm": 0.32177621126174927, "learning_rate": 1.3772185338977138e-05, "loss": 1.2046, "step": 13275 }, { "epoch": 3.9542061468009457, "grad_norm": 0.3031023442745209, "learning_rate": 1.3771292013101698e-05, "loss": 1.2189, "step": 13276 }, { "epoch": 3.954503993000614, "grad_norm": 0.2863255739212036, "learning_rate": 1.3770398652138598e-05, "loss": 1.219, "step": 13277 }, { "epoch": 3.954801839200283, "grad_norm": 0.2768959701061249, "learning_rate": 1.3769505256096154e-05, "loss": 1.2212, "step": 13278 }, { "epoch": 3.9550996853999516, "grad_norm": 0.25252997875213623, "learning_rate": 1.3768611824982675e-05, "loss": 1.2127, "step": 13279 }, { "epoch": 3.9553975315996204, "grad_norm": 0.3021676540374756, "learning_rate": 1.3767718358806473e-05, "loss": 1.2216, "step": 13280 }, { "epoch": 3.955695377799289, "grad_norm": 0.2823593020439148, "learning_rate": 1.376682485757586e-05, "loss": 1.2118, "step": 13281 }, { "epoch": 3.9559932239989575, "grad_norm": 0.2506415545940399, "learning_rate": 1.3765931321299153e-05, "loss": 1.2306, "step": 13282 }, { "epoch": 3.9562910701986262, "grad_norm": 0.2492976188659668, "learning_rate": 1.376503774998466e-05, "loss": 1.2232, "step": 13283 }, { "epoch": 3.956588916398295, "grad_norm": 0.24417439103126526, "learning_rate": 1.3764144143640699e-05, "loss": 1.2353, "step": 13284 }, { "epoch": 3.9568867625979633, "grad_norm": 0.2732827067375183, "learning_rate": 1.3763250502275583e-05, "loss": 1.2343, "step": 13285 }, { "epoch": 3.957184608797632, "grad_norm": 0.2529347836971283, "learning_rate": 1.3762356825897622e-05, "loss": 1.2083, "step": 13286 }, { "epoch": 3.957482454997301, "grad_norm": 0.2552180290222168, "learning_rate": 1.3761463114515137e-05, "loss": 1.2236, "step": 13287 }, { "epoch": 3.957780301196969, "grad_norm": 0.3384588956832886, "learning_rate": 1.3760569368136443e-05, "loss": 1.2244, "step": 13288 }, { "epoch": 3.958078147396638, "grad_norm": 0.28780779242515564, "learning_rate": 1.3759675586769844e-05, "loss": 1.2139, "step": 13289 }, { "epoch": 3.9583759935963068, "grad_norm": 0.5732924342155457, "learning_rate": 1.375878177042367e-05, "loss": 1.2133, "step": 13290 }, { "epoch": 3.958673839795975, "grad_norm": 0.4588649570941925, "learning_rate": 1.3757887919106232e-05, "loss": 1.2275, "step": 13291 }, { "epoch": 3.958971685995644, "grad_norm": 0.34352895617485046, "learning_rate": 1.3756994032825846e-05, "loss": 1.2292, "step": 13292 }, { "epoch": 3.9592695321953126, "grad_norm": 0.29472851753234863, "learning_rate": 1.3756100111590826e-05, "loss": 1.2355, "step": 13293 }, { "epoch": 3.9595673783949814, "grad_norm": 0.6480907201766968, "learning_rate": 1.375520615540949e-05, "loss": 1.2303, "step": 13294 }, { "epoch": 3.95986522459465, "grad_norm": 0.33491411805152893, "learning_rate": 1.3754312164290157e-05, "loss": 1.2287, "step": 13295 }, { "epoch": 3.9601630707943185, "grad_norm": 0.3167470395565033, "learning_rate": 1.3753418138241146e-05, "loss": 1.2268, "step": 13296 }, { "epoch": 3.9604609169939873, "grad_norm": 0.28013840317726135, "learning_rate": 1.375252407727077e-05, "loss": 1.2193, "step": 13297 }, { "epoch": 3.960758763193656, "grad_norm": 0.27401334047317505, "learning_rate": 1.3751629981387352e-05, "loss": 1.2147, "step": 13298 }, { "epoch": 3.9610566093933244, "grad_norm": 0.26001015305519104, "learning_rate": 1.3750735850599211e-05, "loss": 1.2221, "step": 13299 }, { "epoch": 3.961354455592993, "grad_norm": 0.3323241174221039, "learning_rate": 1.3749841684914658e-05, "loss": 1.2213, "step": 13300 }, { "epoch": 3.961652301792662, "grad_norm": 0.25518015027046204, "learning_rate": 1.374894748434202e-05, "loss": 1.2397, "step": 13301 }, { "epoch": 3.9619501479923303, "grad_norm": 0.2652130424976349, "learning_rate": 1.3748053248889615e-05, "loss": 1.2185, "step": 13302 }, { "epoch": 3.962247994191999, "grad_norm": 0.2644560635089874, "learning_rate": 1.374715897856576e-05, "loss": 1.2348, "step": 13303 }, { "epoch": 3.962545840391668, "grad_norm": 0.24957582354545593, "learning_rate": 1.374626467337878e-05, "loss": 1.2228, "step": 13304 }, { "epoch": 3.962843686591336, "grad_norm": 0.25798946619033813, "learning_rate": 1.374537033333699e-05, "loss": 1.2235, "step": 13305 }, { "epoch": 3.963141532791005, "grad_norm": 0.24982014298439026, "learning_rate": 1.3744475958448715e-05, "loss": 1.2253, "step": 13306 }, { "epoch": 3.9634393789906737, "grad_norm": 0.2523110806941986, "learning_rate": 1.3743581548722276e-05, "loss": 1.2176, "step": 13307 }, { "epoch": 3.9637372251903424, "grad_norm": 0.25925248861312866, "learning_rate": 1.374268710416599e-05, "loss": 1.2309, "step": 13308 }, { "epoch": 3.964035071390011, "grad_norm": 0.25103744864463806, "learning_rate": 1.3741792624788186e-05, "loss": 1.2435, "step": 13309 }, { "epoch": 3.9643329175896795, "grad_norm": 0.25214672088623047, "learning_rate": 1.374089811059718e-05, "loss": 1.2048, "step": 13310 }, { "epoch": 3.9646307637893483, "grad_norm": 0.2590175271034241, "learning_rate": 1.3740003561601298e-05, "loss": 1.2388, "step": 13311 }, { "epoch": 3.964928609989017, "grad_norm": 0.25053536891937256, "learning_rate": 1.3739108977808862e-05, "loss": 1.2362, "step": 13312 }, { "epoch": 3.9652264561886854, "grad_norm": 0.26403069496154785, "learning_rate": 1.3738214359228192e-05, "loss": 1.241, "step": 13313 }, { "epoch": 3.965524302388354, "grad_norm": 0.2543472349643707, "learning_rate": 1.3737319705867615e-05, "loss": 1.2288, "step": 13314 }, { "epoch": 3.965822148588023, "grad_norm": 0.2844364047050476, "learning_rate": 1.3736425017735453e-05, "loss": 1.2332, "step": 13315 }, { "epoch": 3.9661199947876913, "grad_norm": 0.292111873626709, "learning_rate": 1.3735530294840034e-05, "loss": 1.2458, "step": 13316 }, { "epoch": 3.96641784098736, "grad_norm": 0.26022544503211975, "learning_rate": 1.3734635537189675e-05, "loss": 1.2287, "step": 13317 }, { "epoch": 3.966715687187029, "grad_norm": 0.30310890078544617, "learning_rate": 1.3733740744792708e-05, "loss": 1.2266, "step": 13318 }, { "epoch": 3.9670135333866976, "grad_norm": 0.26686811447143555, "learning_rate": 1.3732845917657453e-05, "loss": 1.2174, "step": 13319 }, { "epoch": 3.967311379586366, "grad_norm": 0.2606055736541748, "learning_rate": 1.3731951055792237e-05, "loss": 1.221, "step": 13320 }, { "epoch": 3.9676092257860347, "grad_norm": 0.2605314254760742, "learning_rate": 1.3731056159205386e-05, "loss": 1.2218, "step": 13321 }, { "epoch": 3.9679070719857035, "grad_norm": 0.2701477110385895, "learning_rate": 1.3730161227905227e-05, "loss": 1.2232, "step": 13322 }, { "epoch": 3.9682049181853722, "grad_norm": 0.2692610025405884, "learning_rate": 1.3729266261900086e-05, "loss": 1.2375, "step": 13323 }, { "epoch": 3.9685027643850406, "grad_norm": 0.3172907829284668, "learning_rate": 1.3728371261198287e-05, "loss": 1.2231, "step": 13324 }, { "epoch": 3.9688006105847093, "grad_norm": 0.2733261287212372, "learning_rate": 1.372747622580816e-05, "loss": 1.2124, "step": 13325 }, { "epoch": 3.969098456784378, "grad_norm": 0.31666746735572815, "learning_rate": 1.3726581155738035e-05, "loss": 1.2273, "step": 13326 }, { "epoch": 3.9693963029840464, "grad_norm": 0.30270305275917053, "learning_rate": 1.3725686050996232e-05, "loss": 1.2254, "step": 13327 }, { "epoch": 3.969694149183715, "grad_norm": 0.2606399655342102, "learning_rate": 1.3724790911591082e-05, "loss": 1.2336, "step": 13328 }, { "epoch": 3.969991995383384, "grad_norm": 0.35708358883857727, "learning_rate": 1.3723895737530919e-05, "loss": 1.2307, "step": 13329 }, { "epoch": 3.9702898415830523, "grad_norm": 0.2547812759876251, "learning_rate": 1.372300052882406e-05, "loss": 1.2354, "step": 13330 }, { "epoch": 3.970587687782721, "grad_norm": 0.2860596179962158, "learning_rate": 1.3722105285478844e-05, "loss": 1.2268, "step": 13331 }, { "epoch": 3.97088553398239, "grad_norm": 0.260704904794693, "learning_rate": 1.3721210007503596e-05, "loss": 1.2144, "step": 13332 }, { "epoch": 3.9711833801820586, "grad_norm": 0.2592894732952118, "learning_rate": 1.372031469490665e-05, "loss": 1.2393, "step": 13333 }, { "epoch": 3.9714812263817274, "grad_norm": 0.3154374957084656, "learning_rate": 1.371941934769633e-05, "loss": 1.2124, "step": 13334 }, { "epoch": 3.9717790725813957, "grad_norm": 0.27829208970069885, "learning_rate": 1.3718523965880967e-05, "loss": 1.2154, "step": 13335 }, { "epoch": 3.9720769187810645, "grad_norm": 0.3362678289413452, "learning_rate": 1.3717628549468893e-05, "loss": 1.2308, "step": 13336 }, { "epoch": 3.9723747649807333, "grad_norm": 0.25007420778274536, "learning_rate": 1.3716733098468441e-05, "loss": 1.2377, "step": 13337 }, { "epoch": 3.9726726111804016, "grad_norm": 0.27721577882766724, "learning_rate": 1.371583761288794e-05, "loss": 1.2244, "step": 13338 }, { "epoch": 3.9729704573800704, "grad_norm": 0.3188413381576538, "learning_rate": 1.371494209273572e-05, "loss": 1.2397, "step": 13339 }, { "epoch": 3.973268303579739, "grad_norm": 0.2549150884151459, "learning_rate": 1.3714046538020116e-05, "loss": 1.2191, "step": 13340 }, { "epoch": 3.9735661497794075, "grad_norm": 0.3306371569633484, "learning_rate": 1.3713150948749459e-05, "loss": 1.2211, "step": 13341 }, { "epoch": 3.9738639959790762, "grad_norm": 0.2630874812602997, "learning_rate": 1.3712255324932079e-05, "loss": 1.2226, "step": 13342 }, { "epoch": 3.974161842178745, "grad_norm": 0.2721898853778839, "learning_rate": 1.3711359666576312e-05, "loss": 1.2326, "step": 13343 }, { "epoch": 3.9744596883784133, "grad_norm": 0.43239787220954895, "learning_rate": 1.371046397369049e-05, "loss": 1.2438, "step": 13344 }, { "epoch": 3.974757534578082, "grad_norm": 0.2552736699581146, "learning_rate": 1.3709568246282945e-05, "loss": 1.2214, "step": 13345 }, { "epoch": 3.975055380777751, "grad_norm": 0.34517979621887207, "learning_rate": 1.3708672484362013e-05, "loss": 1.2266, "step": 13346 }, { "epoch": 3.9753532269774197, "grad_norm": 0.27155017852783203, "learning_rate": 1.3707776687936028e-05, "loss": 1.2294, "step": 13347 }, { "epoch": 3.9756510731770884, "grad_norm": 0.271278977394104, "learning_rate": 1.370688085701332e-05, "loss": 1.2185, "step": 13348 }, { "epoch": 3.9759489193767568, "grad_norm": 0.38098785281181335, "learning_rate": 1.3705984991602229e-05, "loss": 1.2276, "step": 13349 }, { "epoch": 3.9762467655764255, "grad_norm": 0.30669933557510376, "learning_rate": 1.370508909171109e-05, "loss": 1.2249, "step": 13350 }, { "epoch": 3.9765446117760943, "grad_norm": 0.3008941113948822, "learning_rate": 1.3704193157348236e-05, "loss": 1.2295, "step": 13351 }, { "epoch": 3.9768424579757626, "grad_norm": 0.3231416344642639, "learning_rate": 1.3703297188522002e-05, "loss": 1.2201, "step": 13352 }, { "epoch": 3.9771403041754314, "grad_norm": 0.3122359812259674, "learning_rate": 1.3702401185240726e-05, "loss": 1.2365, "step": 13353 }, { "epoch": 3.9774381503751, "grad_norm": 0.28316250443458557, "learning_rate": 1.3701505147512744e-05, "loss": 1.2172, "step": 13354 }, { "epoch": 3.9777359965747685, "grad_norm": 0.31386739015579224, "learning_rate": 1.3700609075346391e-05, "loss": 1.2197, "step": 13355 }, { "epoch": 3.9780338427744373, "grad_norm": 0.2508314549922943, "learning_rate": 1.3699712968750004e-05, "loss": 1.2242, "step": 13356 }, { "epoch": 3.978331688974106, "grad_norm": 0.40898066759109497, "learning_rate": 1.3698816827731926e-05, "loss": 1.2277, "step": 13357 }, { "epoch": 3.9786295351737744, "grad_norm": 0.3118723928928375, "learning_rate": 1.3697920652300487e-05, "loss": 1.2267, "step": 13358 }, { "epoch": 3.978927381373443, "grad_norm": 0.36608466506004333, "learning_rate": 1.3697024442464027e-05, "loss": 1.2314, "step": 13359 }, { "epoch": 3.979225227573112, "grad_norm": 0.24628345668315887, "learning_rate": 1.3696128198230888e-05, "loss": 1.2358, "step": 13360 }, { "epoch": 3.9795230737727807, "grad_norm": 0.5022084712982178, "learning_rate": 1.3695231919609402e-05, "loss": 1.2333, "step": 13361 }, { "epoch": 3.9798209199724495, "grad_norm": 0.30987975001335144, "learning_rate": 1.3694335606607913e-05, "loss": 1.2212, "step": 13362 }, { "epoch": 3.980118766172118, "grad_norm": 0.37688446044921875, "learning_rate": 1.369343925923476e-05, "loss": 1.2416, "step": 13363 }, { "epoch": 3.9804166123717866, "grad_norm": 0.2744654417037964, "learning_rate": 1.3692542877498281e-05, "loss": 1.2261, "step": 13364 }, { "epoch": 3.9807144585714553, "grad_norm": 0.5666722655296326, "learning_rate": 1.3691646461406816e-05, "loss": 1.2173, "step": 13365 }, { "epoch": 3.9810123047711237, "grad_norm": 0.30332040786743164, "learning_rate": 1.3690750010968703e-05, "loss": 1.221, "step": 13366 }, { "epoch": 3.9813101509707924, "grad_norm": 0.35051602125167847, "learning_rate": 1.3689853526192287e-05, "loss": 1.2212, "step": 13367 }, { "epoch": 3.981607997170461, "grad_norm": 0.2537488341331482, "learning_rate": 1.3688957007085906e-05, "loss": 1.2155, "step": 13368 }, { "epoch": 3.9819058433701295, "grad_norm": 0.308860182762146, "learning_rate": 1.3688060453657901e-05, "loss": 1.2237, "step": 13369 }, { "epoch": 3.9822036895697983, "grad_norm": 0.32965436577796936, "learning_rate": 1.3687163865916616e-05, "loss": 1.2222, "step": 13370 }, { "epoch": 3.982501535769467, "grad_norm": 0.252133846282959, "learning_rate": 1.3686267243870385e-05, "loss": 1.2289, "step": 13371 }, { "epoch": 3.9827993819691354, "grad_norm": 0.33121633529663086, "learning_rate": 1.3685370587527562e-05, "loss": 1.2278, "step": 13372 }, { "epoch": 3.983097228168804, "grad_norm": 0.25625285506248474, "learning_rate": 1.368447389689648e-05, "loss": 1.2375, "step": 13373 }, { "epoch": 3.983395074368473, "grad_norm": 0.3637160658836365, "learning_rate": 1.3683577171985487e-05, "loss": 1.2124, "step": 13374 }, { "epoch": 3.9836929205681417, "grad_norm": 0.28041210770606995, "learning_rate": 1.3682680412802924e-05, "loss": 1.2287, "step": 13375 }, { "epoch": 3.9839907667678105, "grad_norm": 0.25311151146888733, "learning_rate": 1.3681783619357134e-05, "loss": 1.2147, "step": 13376 }, { "epoch": 3.984288612967479, "grad_norm": 0.2525341510772705, "learning_rate": 1.368088679165646e-05, "loss": 1.2312, "step": 13377 }, { "epoch": 3.9845864591671476, "grad_norm": 0.24177545309066772, "learning_rate": 1.3679989929709247e-05, "loss": 1.2156, "step": 13378 }, { "epoch": 3.9848843053668164, "grad_norm": 0.2735190689563751, "learning_rate": 1.3679093033523838e-05, "loss": 1.2163, "step": 13379 }, { "epoch": 3.9851821515664847, "grad_norm": 0.2353980392217636, "learning_rate": 1.367819610310858e-05, "loss": 1.2245, "step": 13380 }, { "epoch": 3.9854799977661535, "grad_norm": 0.27483224868774414, "learning_rate": 1.367729913847182e-05, "loss": 1.2108, "step": 13381 }, { "epoch": 3.9857778439658222, "grad_norm": 0.2524372935295105, "learning_rate": 1.3676402139621896e-05, "loss": 1.2127, "step": 13382 }, { "epoch": 3.9860756901654906, "grad_norm": 0.2566528916358948, "learning_rate": 1.367550510656716e-05, "loss": 1.225, "step": 13383 }, { "epoch": 3.9863735363651593, "grad_norm": 0.23759350180625916, "learning_rate": 1.3674608039315954e-05, "loss": 1.2175, "step": 13384 }, { "epoch": 3.986671382564828, "grad_norm": 0.26291152834892273, "learning_rate": 1.3673710937876626e-05, "loss": 1.2137, "step": 13385 }, { "epoch": 3.986969228764497, "grad_norm": 0.2782786786556244, "learning_rate": 1.3672813802257521e-05, "loss": 1.2349, "step": 13386 }, { "epoch": 3.987267074964165, "grad_norm": 0.27542656660079956, "learning_rate": 1.367191663246699e-05, "loss": 1.2218, "step": 13387 }, { "epoch": 3.987564921163834, "grad_norm": 0.26309019327163696, "learning_rate": 1.3671019428513374e-05, "loss": 1.2179, "step": 13388 }, { "epoch": 3.9878627673635028, "grad_norm": 0.2406158298254013, "learning_rate": 1.3670122190405026e-05, "loss": 1.2357, "step": 13389 }, { "epoch": 3.9881606135631715, "grad_norm": 0.26158133149147034, "learning_rate": 1.366922491815029e-05, "loss": 1.2268, "step": 13390 }, { "epoch": 3.98845845976284, "grad_norm": 0.25136712193489075, "learning_rate": 1.3668327611757516e-05, "loss": 1.2362, "step": 13391 }, { "epoch": 3.9887563059625086, "grad_norm": 0.3350224196910858, "learning_rate": 1.3667430271235053e-05, "loss": 1.2249, "step": 13392 }, { "epoch": 3.9890541521621774, "grad_norm": 0.25642406940460205, "learning_rate": 1.3666532896591249e-05, "loss": 1.229, "step": 13393 }, { "epoch": 3.9893519983618457, "grad_norm": 0.27974164485931396, "learning_rate": 1.3665635487834453e-05, "loss": 1.2324, "step": 13394 }, { "epoch": 3.9896498445615145, "grad_norm": 0.3308431804180145, "learning_rate": 1.3664738044973011e-05, "loss": 1.2481, "step": 13395 }, { "epoch": 3.9899476907611833, "grad_norm": 0.3376076817512512, "learning_rate": 1.366384056801528e-05, "loss": 1.2224, "step": 13396 }, { "epoch": 3.9902455369608516, "grad_norm": 0.2756621837615967, "learning_rate": 1.3662943056969605e-05, "loss": 1.2265, "step": 13397 }, { "epoch": 3.9905433831605204, "grad_norm": 0.250488817691803, "learning_rate": 1.3662045511844338e-05, "loss": 1.2201, "step": 13398 }, { "epoch": 3.990841229360189, "grad_norm": 0.3583226799964905, "learning_rate": 1.3661147932647826e-05, "loss": 1.2415, "step": 13399 }, { "epoch": 3.991139075559858, "grad_norm": 0.2988361120223999, "learning_rate": 1.3660250319388427e-05, "loss": 1.2195, "step": 13400 }, { "epoch": 3.9914369217595267, "grad_norm": 0.2791658043861389, "learning_rate": 1.3659352672074484e-05, "loss": 1.23, "step": 13401 }, { "epoch": 3.991734767959195, "grad_norm": 0.2548013925552368, "learning_rate": 1.3658454990714356e-05, "loss": 1.215, "step": 13402 }, { "epoch": 3.992032614158864, "grad_norm": 0.25278958678245544, "learning_rate": 1.365755727531639e-05, "loss": 1.231, "step": 13403 }, { "epoch": 3.9923304603585326, "grad_norm": 0.27782562375068665, "learning_rate": 1.3656659525888942e-05, "loss": 1.2275, "step": 13404 }, { "epoch": 3.992628306558201, "grad_norm": 0.26534757018089294, "learning_rate": 1.365576174244036e-05, "loss": 1.2259, "step": 13405 }, { "epoch": 3.9929261527578697, "grad_norm": 0.3131057322025299, "learning_rate": 1.3654863924979002e-05, "loss": 1.2205, "step": 13406 }, { "epoch": 3.9932239989575384, "grad_norm": 0.2801111936569214, "learning_rate": 1.3653966073513214e-05, "loss": 1.238, "step": 13407 }, { "epoch": 3.9935218451572068, "grad_norm": 0.26753294467926025, "learning_rate": 1.3653068188051359e-05, "loss": 1.235, "step": 13408 }, { "epoch": 3.9938196913568755, "grad_norm": 0.2762524485588074, "learning_rate": 1.3652170268601785e-05, "loss": 1.2376, "step": 13409 }, { "epoch": 3.9941175375565443, "grad_norm": 0.3896852731704712, "learning_rate": 1.3651272315172846e-05, "loss": 1.2169, "step": 13410 }, { "epoch": 3.9944153837562126, "grad_norm": 0.36620697379112244, "learning_rate": 1.36503743277729e-05, "loss": 1.2215, "step": 13411 }, { "epoch": 3.9947132299558814, "grad_norm": 0.35110583901405334, "learning_rate": 1.3649476306410296e-05, "loss": 1.2262, "step": 13412 }, { "epoch": 3.99501107615555, "grad_norm": 0.46130087971687317, "learning_rate": 1.3648578251093395e-05, "loss": 1.222, "step": 13413 }, { "epoch": 3.995308922355219, "grad_norm": 0.26714006066322327, "learning_rate": 1.3647680161830546e-05, "loss": 1.2269, "step": 13414 }, { "epoch": 3.9956067685548877, "grad_norm": 0.346652626991272, "learning_rate": 1.3646782038630114e-05, "loss": 1.2301, "step": 13415 }, { "epoch": 3.995904614754556, "grad_norm": 0.3244880735874176, "learning_rate": 1.3645883881500445e-05, "loss": 1.2105, "step": 13416 }, { "epoch": 3.996202460954225, "grad_norm": 0.2955103814601898, "learning_rate": 1.3644985690449901e-05, "loss": 1.2225, "step": 13417 }, { "epoch": 3.9965003071538936, "grad_norm": 0.3054068088531494, "learning_rate": 1.3644087465486839e-05, "loss": 1.2387, "step": 13418 }, { "epoch": 3.996798153353562, "grad_norm": 0.29540181159973145, "learning_rate": 1.3643189206619613e-05, "loss": 1.2321, "step": 13419 }, { "epoch": 3.9970959995532307, "grad_norm": 0.27125608921051025, "learning_rate": 1.3642290913856582e-05, "loss": 1.2144, "step": 13420 }, { "epoch": 3.9973938457528995, "grad_norm": 0.25907444953918457, "learning_rate": 1.3641392587206103e-05, "loss": 1.2368, "step": 13421 }, { "epoch": 3.997691691952568, "grad_norm": 0.266828328371048, "learning_rate": 1.364049422667654e-05, "loss": 1.2354, "step": 13422 }, { "epoch": 3.9979895381522366, "grad_norm": 0.24839074909687042, "learning_rate": 1.363959583227624e-05, "loss": 1.2263, "step": 13423 }, { "epoch": 3.9982873843519053, "grad_norm": 0.25622907280921936, "learning_rate": 1.3638697404013566e-05, "loss": 1.2206, "step": 13424 }, { "epoch": 3.9985852305515737, "grad_norm": 0.27672067284584045, "learning_rate": 1.363779894189688e-05, "loss": 1.2309, "step": 13425 }, { "epoch": 3.9988830767512424, "grad_norm": 0.29166603088378906, "learning_rate": 1.363690044593454e-05, "loss": 1.2258, "step": 13426 }, { "epoch": 3.999180922950911, "grad_norm": 0.2589173913002014, "learning_rate": 1.3636001916134904e-05, "loss": 1.2542, "step": 13427 }, { "epoch": 3.99947876915058, "grad_norm": 0.26252371072769165, "learning_rate": 1.3635103352506333e-05, "loss": 1.2101, "step": 13428 }, { "epoch": 3.9997766153502488, "grad_norm": 0.2667730152606964, "learning_rate": 1.3634204755057187e-05, "loss": 1.2485, "step": 13429 }, { "epoch": 4.000074461549917, "grad_norm": 0.24984438717365265, "learning_rate": 1.3633306123795824e-05, "loss": 1.2281, "step": 13430 }, { "epoch": 4.000372307749585, "grad_norm": 0.2710225284099579, "learning_rate": 1.3632407458730607e-05, "loss": 1.2259, "step": 13431 }, { "epoch": 4.000670153949255, "grad_norm": 0.24073953926563263, "learning_rate": 1.3631508759869903e-05, "loss": 1.2012, "step": 13432 }, { "epoch": 4.000968000148923, "grad_norm": 0.2353154420852661, "learning_rate": 1.3630610027222062e-05, "loss": 1.241, "step": 13433 }, { "epoch": 4.001265846348592, "grad_norm": 0.23412029445171356, "learning_rate": 1.362971126079545e-05, "loss": 1.2255, "step": 13434 }, { "epoch": 4.0015636925482605, "grad_norm": 0.2648339867591858, "learning_rate": 1.3628812460598431e-05, "loss": 1.2337, "step": 13435 }, { "epoch": 4.001861538747929, "grad_norm": 0.2483452707529068, "learning_rate": 1.3627913626639368e-05, "loss": 1.2289, "step": 13436 }, { "epoch": 4.002159384947598, "grad_norm": 0.27604132890701294, "learning_rate": 1.3627014758926622e-05, "loss": 1.2204, "step": 13437 }, { "epoch": 4.002457231147266, "grad_norm": 0.23962776362895966, "learning_rate": 1.3626115857468553e-05, "loss": 1.2272, "step": 13438 }, { "epoch": 4.002755077346935, "grad_norm": 0.3925142288208008, "learning_rate": 1.3625216922273532e-05, "loss": 1.2256, "step": 13439 }, { "epoch": 4.003052923546604, "grad_norm": 0.47054609656333923, "learning_rate": 1.3624317953349916e-05, "loss": 1.2337, "step": 13440 }, { "epoch": 4.003350769746272, "grad_norm": 0.2619442045688629, "learning_rate": 1.3623418950706068e-05, "loss": 1.2356, "step": 13441 }, { "epoch": 4.003648615945941, "grad_norm": 0.2708479166030884, "learning_rate": 1.3622519914350358e-05, "loss": 1.2059, "step": 13442 }, { "epoch": 4.00394646214561, "grad_norm": 0.2544673979282379, "learning_rate": 1.3621620844291147e-05, "loss": 1.2081, "step": 13443 }, { "epoch": 4.004244308345278, "grad_norm": 0.2590193748474121, "learning_rate": 1.36207217405368e-05, "loss": 1.2295, "step": 13444 }, { "epoch": 4.004542154544946, "grad_norm": 0.24963033199310303, "learning_rate": 1.3619822603095685e-05, "loss": 1.2054, "step": 13445 }, { "epoch": 4.004840000744616, "grad_norm": 0.2631826102733612, "learning_rate": 1.3618923431976163e-05, "loss": 1.2303, "step": 13446 }, { "epoch": 4.005137846944284, "grad_norm": 0.31257209181785583, "learning_rate": 1.3618024227186601e-05, "loss": 1.2159, "step": 13447 }, { "epoch": 4.005435693143953, "grad_norm": 0.27918773889541626, "learning_rate": 1.3617124988735362e-05, "loss": 1.2262, "step": 13448 }, { "epoch": 4.0057335393436215, "grad_norm": 0.2980075180530548, "learning_rate": 1.3616225716630824e-05, "loss": 1.2444, "step": 13449 }, { "epoch": 4.00603138554329, "grad_norm": 0.4892701208591461, "learning_rate": 1.3615326410881342e-05, "loss": 1.2293, "step": 13450 }, { "epoch": 4.006329231742959, "grad_norm": 0.5278096795082092, "learning_rate": 1.3614427071495287e-05, "loss": 1.2284, "step": 13451 }, { "epoch": 4.006627077942627, "grad_norm": 0.3616490662097931, "learning_rate": 1.3613527698481029e-05, "loss": 1.2318, "step": 13452 }, { "epoch": 4.006924924142296, "grad_norm": 0.33913710713386536, "learning_rate": 1.3612628291846927e-05, "loss": 1.2125, "step": 13453 }, { "epoch": 4.007222770341965, "grad_norm": 0.5846142768859863, "learning_rate": 1.361172885160136e-05, "loss": 1.217, "step": 13454 }, { "epoch": 4.007520616541633, "grad_norm": 0.39744895696640015, "learning_rate": 1.361082937775269e-05, "loss": 1.2231, "step": 13455 }, { "epoch": 4.007818462741302, "grad_norm": 0.38165080547332764, "learning_rate": 1.3609929870309288e-05, "loss": 1.2344, "step": 13456 }, { "epoch": 4.008116308940971, "grad_norm": 0.28365325927734375, "learning_rate": 1.3609030329279522e-05, "loss": 1.2136, "step": 13457 }, { "epoch": 4.008414155140639, "grad_norm": 0.5792515873908997, "learning_rate": 1.360813075467176e-05, "loss": 1.2273, "step": 13458 }, { "epoch": 4.0087120013403075, "grad_norm": 0.3754171133041382, "learning_rate": 1.360723114649437e-05, "loss": 1.2409, "step": 13459 }, { "epoch": 4.009009847539977, "grad_norm": 0.36889129877090454, "learning_rate": 1.3606331504755727e-05, "loss": 1.2193, "step": 13460 }, { "epoch": 4.009307693739645, "grad_norm": 0.33019784092903137, "learning_rate": 1.36054318294642e-05, "loss": 1.2184, "step": 13461 }, { "epoch": 4.009605539939314, "grad_norm": 0.3890586495399475, "learning_rate": 1.3604532120628158e-05, "loss": 1.2195, "step": 13462 }, { "epoch": 4.009903386138983, "grad_norm": 0.4135071337223053, "learning_rate": 1.3603632378255972e-05, "loss": 1.2122, "step": 13463 }, { "epoch": 4.010201232338651, "grad_norm": 0.2886429727077484, "learning_rate": 1.360273260235601e-05, "loss": 1.2232, "step": 13464 }, { "epoch": 4.01049907853832, "grad_norm": 0.34025245904922485, "learning_rate": 1.3601832792936646e-05, "loss": 1.2191, "step": 13465 }, { "epoch": 4.010796924737988, "grad_norm": 0.28943249583244324, "learning_rate": 1.3600932950006254e-05, "loss": 1.2253, "step": 13466 }, { "epoch": 4.011094770937657, "grad_norm": 0.3403421938419342, "learning_rate": 1.3600033073573204e-05, "loss": 1.217, "step": 13467 }, { "epoch": 4.011392617137326, "grad_norm": 0.3473210036754608, "learning_rate": 1.3599133163645868e-05, "loss": 1.2377, "step": 13468 }, { "epoch": 4.011690463336994, "grad_norm": 0.33960193395614624, "learning_rate": 1.3598233220232622e-05, "loss": 1.2284, "step": 13469 }, { "epoch": 4.011988309536663, "grad_norm": 0.31858521699905396, "learning_rate": 1.3597333243341832e-05, "loss": 1.2299, "step": 13470 }, { "epoch": 4.012286155736332, "grad_norm": 0.2420596480369568, "learning_rate": 1.3596433232981877e-05, "loss": 1.2345, "step": 13471 }, { "epoch": 4.012584001936, "grad_norm": 0.46303659677505493, "learning_rate": 1.3595533189161128e-05, "loss": 1.2074, "step": 13472 }, { "epoch": 4.0128818481356685, "grad_norm": 0.26272690296173096, "learning_rate": 1.359463311188796e-05, "loss": 1.2163, "step": 13473 }, { "epoch": 4.013179694335338, "grad_norm": 0.3758026957511902, "learning_rate": 1.3593733001170749e-05, "loss": 1.2184, "step": 13474 }, { "epoch": 4.013477540535006, "grad_norm": 0.2465454787015915, "learning_rate": 1.3592832857017865e-05, "loss": 1.2257, "step": 13475 }, { "epoch": 4.013775386734675, "grad_norm": 0.3642311692237854, "learning_rate": 1.3591932679437689e-05, "loss": 1.256, "step": 13476 }, { "epoch": 4.014073232934344, "grad_norm": 0.28112563490867615, "learning_rate": 1.3591032468438588e-05, "loss": 1.2303, "step": 13477 }, { "epoch": 4.014371079134012, "grad_norm": 0.2791341245174408, "learning_rate": 1.3590132224028945e-05, "loss": 1.2212, "step": 13478 }, { "epoch": 4.014668925333681, "grad_norm": 0.35364946722984314, "learning_rate": 1.358923194621713e-05, "loss": 1.2257, "step": 13479 }, { "epoch": 4.0149667715333495, "grad_norm": 0.25044015049934387, "learning_rate": 1.3588331635011527e-05, "loss": 1.2357, "step": 13480 }, { "epoch": 4.015264617733018, "grad_norm": 0.397842675447464, "learning_rate": 1.3587431290420502e-05, "loss": 1.2275, "step": 13481 }, { "epoch": 4.015562463932687, "grad_norm": 0.24611815810203552, "learning_rate": 1.358653091245244e-05, "loss": 1.2335, "step": 13482 }, { "epoch": 4.015860310132355, "grad_norm": 0.35895636677742004, "learning_rate": 1.3585630501115714e-05, "loss": 1.2076, "step": 13483 }, { "epoch": 4.016158156332024, "grad_norm": 0.2622363567352295, "learning_rate": 1.3584730056418702e-05, "loss": 1.2295, "step": 13484 }, { "epoch": 4.016456002531693, "grad_norm": 0.3240194022655487, "learning_rate": 1.3583829578369783e-05, "loss": 1.2347, "step": 13485 }, { "epoch": 4.016753848731361, "grad_norm": 0.2415049523115158, "learning_rate": 1.3582929066977336e-05, "loss": 1.2374, "step": 13486 }, { "epoch": 4.01705169493103, "grad_norm": 0.262705534696579, "learning_rate": 1.3582028522249732e-05, "loss": 1.2085, "step": 13487 }, { "epoch": 4.017349541130699, "grad_norm": 0.2660171687602997, "learning_rate": 1.3581127944195356e-05, "loss": 1.2117, "step": 13488 }, { "epoch": 4.017647387330367, "grad_norm": 0.2507553696632385, "learning_rate": 1.3580227332822584e-05, "loss": 1.2341, "step": 13489 }, { "epoch": 4.017945233530036, "grad_norm": 0.25532859563827515, "learning_rate": 1.3579326688139802e-05, "loss": 1.2244, "step": 13490 }, { "epoch": 4.018243079729705, "grad_norm": 0.2624969184398651, "learning_rate": 1.3578426010155381e-05, "loss": 1.2357, "step": 13491 }, { "epoch": 4.018540925929373, "grad_norm": 0.3329092860221863, "learning_rate": 1.3577525298877705e-05, "loss": 1.2153, "step": 13492 }, { "epoch": 4.018838772129042, "grad_norm": 0.28040000796318054, "learning_rate": 1.3576624554315154e-05, "loss": 1.2216, "step": 13493 }, { "epoch": 4.0191366183287105, "grad_norm": 0.2757415473461151, "learning_rate": 1.3575723776476106e-05, "loss": 1.2297, "step": 13494 }, { "epoch": 4.019434464528379, "grad_norm": 0.30063891410827637, "learning_rate": 1.3574822965368942e-05, "loss": 1.2313, "step": 13495 }, { "epoch": 4.019732310728048, "grad_norm": 0.26259714365005493, "learning_rate": 1.3573922121002045e-05, "loss": 1.2345, "step": 13496 }, { "epoch": 4.020030156927716, "grad_norm": 0.2762293815612793, "learning_rate": 1.3573021243383797e-05, "loss": 1.2207, "step": 13497 }, { "epoch": 4.020328003127385, "grad_norm": 0.2598060667514801, "learning_rate": 1.3572120332522578e-05, "loss": 1.2201, "step": 13498 }, { "epoch": 4.020625849327054, "grad_norm": 0.2517746388912201, "learning_rate": 1.357121938842677e-05, "loss": 1.2226, "step": 13499 }, { "epoch": 4.020923695526722, "grad_norm": 0.24112702906131744, "learning_rate": 1.3570318411104756e-05, "loss": 1.2254, "step": 13500 }, { "epoch": 4.020923695526722, "eval_loss": 1.3276550769805908, "eval_runtime": 21.9183, "eval_samples_per_second": 79.112, "eval_steps_per_second": 4.973, "step": 13500 }, { "epoch": 4.0212215417263915, "grad_norm": 0.2654041647911072, "learning_rate": 1.3569417400564917e-05, "loss": 1.225, "step": 13501 }, { "epoch": 4.02151938792606, "grad_norm": 0.2819170653820038, "learning_rate": 1.3568516356815637e-05, "loss": 1.2155, "step": 13502 }, { "epoch": 4.021817234125728, "grad_norm": 0.3072175979614258, "learning_rate": 1.3567615279865303e-05, "loss": 1.2233, "step": 13503 }, { "epoch": 4.022115080325397, "grad_norm": 0.24807359278202057, "learning_rate": 1.3566714169722292e-05, "loss": 1.2314, "step": 13504 }, { "epoch": 4.022412926525066, "grad_norm": 0.2827318012714386, "learning_rate": 1.3565813026394992e-05, "loss": 1.222, "step": 13505 }, { "epoch": 4.022710772724734, "grad_norm": 0.31187954545021057, "learning_rate": 1.3564911849891785e-05, "loss": 1.2238, "step": 13506 }, { "epoch": 4.023008618924403, "grad_norm": 0.24782270193099976, "learning_rate": 1.3564010640221052e-05, "loss": 1.2349, "step": 13507 }, { "epoch": 4.0233064651240715, "grad_norm": 0.29361018538475037, "learning_rate": 1.3563109397391188e-05, "loss": 1.222, "step": 13508 }, { "epoch": 4.02360431132374, "grad_norm": 0.3015013635158539, "learning_rate": 1.3562208121410568e-05, "loss": 1.2186, "step": 13509 }, { "epoch": 4.023902157523409, "grad_norm": 0.39766520261764526, "learning_rate": 1.3561306812287584e-05, "loss": 1.2016, "step": 13510 }, { "epoch": 4.024200003723077, "grad_norm": 0.24687308073043823, "learning_rate": 1.3560405470030617e-05, "loss": 1.2216, "step": 13511 }, { "epoch": 4.024497849922746, "grad_norm": 0.40833809971809387, "learning_rate": 1.3559504094648055e-05, "loss": 1.2143, "step": 13512 }, { "epoch": 4.024795696122415, "grad_norm": 0.44821810722351074, "learning_rate": 1.3558602686148285e-05, "loss": 1.2134, "step": 13513 }, { "epoch": 4.025093542322083, "grad_norm": 0.48145267367362976, "learning_rate": 1.3557701244539696e-05, "loss": 1.2478, "step": 13514 }, { "epoch": 4.0253913885217525, "grad_norm": 0.2988905906677246, "learning_rate": 1.3556799769830669e-05, "loss": 1.2212, "step": 13515 }, { "epoch": 4.025689234721421, "grad_norm": 0.3648199439048767, "learning_rate": 1.3555898262029593e-05, "loss": 1.2107, "step": 13516 }, { "epoch": 4.025987080921089, "grad_norm": 0.2863132655620575, "learning_rate": 1.355499672114486e-05, "loss": 1.2067, "step": 13517 }, { "epoch": 4.026284927120758, "grad_norm": 0.46540048718452454, "learning_rate": 1.3554095147184849e-05, "loss": 1.2273, "step": 13518 }, { "epoch": 4.026582773320427, "grad_norm": 0.2618623673915863, "learning_rate": 1.3553193540157956e-05, "loss": 1.2197, "step": 13519 }, { "epoch": 4.026880619520095, "grad_norm": 0.7541860342025757, "learning_rate": 1.355229190007257e-05, "loss": 1.2172, "step": 13520 }, { "epoch": 4.027178465719764, "grad_norm": 0.46875596046447754, "learning_rate": 1.3551390226937074e-05, "loss": 1.2315, "step": 13521 }, { "epoch": 4.027476311919433, "grad_norm": 0.30588921904563904, "learning_rate": 1.355048852075986e-05, "loss": 1.2422, "step": 13522 }, { "epoch": 4.027774158119101, "grad_norm": 0.28768113255500793, "learning_rate": 1.3549586781549318e-05, "loss": 1.2218, "step": 13523 }, { "epoch": 4.02807200431877, "grad_norm": 0.284957617521286, "learning_rate": 1.3548685009313834e-05, "loss": 1.2366, "step": 13524 }, { "epoch": 4.028369850518438, "grad_norm": 0.2643536925315857, "learning_rate": 1.3547783204061804e-05, "loss": 1.2182, "step": 13525 }, { "epoch": 4.028667696718107, "grad_norm": 0.29222121834754944, "learning_rate": 1.3546881365801612e-05, "loss": 1.2312, "step": 13526 }, { "epoch": 4.028965542917776, "grad_norm": 0.3241350054740906, "learning_rate": 1.3545979494541656e-05, "loss": 1.2423, "step": 13527 }, { "epoch": 4.029263389117444, "grad_norm": 0.25367748737335205, "learning_rate": 1.3545077590290321e-05, "loss": 1.2229, "step": 13528 }, { "epoch": 4.0295612353171135, "grad_norm": 0.2630714178085327, "learning_rate": 1.3544175653055997e-05, "loss": 1.2149, "step": 13529 }, { "epoch": 4.029859081516782, "grad_norm": 0.2765938937664032, "learning_rate": 1.3543273682847078e-05, "loss": 1.2138, "step": 13530 }, { "epoch": 4.03015692771645, "grad_norm": 0.24845919013023376, "learning_rate": 1.3542371679671961e-05, "loss": 1.2226, "step": 13531 }, { "epoch": 4.030454773916119, "grad_norm": 0.3170214295387268, "learning_rate": 1.354146964353903e-05, "loss": 1.22, "step": 13532 }, { "epoch": 4.030752620115788, "grad_norm": 0.25234171748161316, "learning_rate": 1.3540567574456683e-05, "loss": 1.2161, "step": 13533 }, { "epoch": 4.031050466315456, "grad_norm": 0.2870865762233734, "learning_rate": 1.353966547243331e-05, "loss": 1.2327, "step": 13534 }, { "epoch": 4.031348312515125, "grad_norm": 0.30945733189582825, "learning_rate": 1.3538763337477303e-05, "loss": 1.2288, "step": 13535 }, { "epoch": 4.031646158714794, "grad_norm": 0.2529684007167816, "learning_rate": 1.3537861169597055e-05, "loss": 1.2302, "step": 13536 }, { "epoch": 4.031944004914462, "grad_norm": 0.29196882247924805, "learning_rate": 1.3536958968800963e-05, "loss": 1.2065, "step": 13537 }, { "epoch": 4.032241851114131, "grad_norm": 0.264177143573761, "learning_rate": 1.3536056735097423e-05, "loss": 1.227, "step": 13538 }, { "epoch": 4.0325396973137995, "grad_norm": 0.2778324484825134, "learning_rate": 1.3535154468494822e-05, "loss": 1.2007, "step": 13539 }, { "epoch": 4.032837543513468, "grad_norm": 0.2707468271255493, "learning_rate": 1.3534252169001561e-05, "loss": 1.2514, "step": 13540 }, { "epoch": 4.033135389713137, "grad_norm": 0.2863061726093292, "learning_rate": 1.3533349836626031e-05, "loss": 1.2478, "step": 13541 }, { "epoch": 4.033433235912805, "grad_norm": 0.2589258849620819, "learning_rate": 1.3532447471376628e-05, "loss": 1.2259, "step": 13542 }, { "epoch": 4.0337310821124746, "grad_norm": 0.27595287561416626, "learning_rate": 1.3531545073261749e-05, "loss": 1.238, "step": 13543 }, { "epoch": 4.034028928312143, "grad_norm": 0.2699430286884308, "learning_rate": 1.3530642642289789e-05, "loss": 1.2228, "step": 13544 }, { "epoch": 4.034326774511811, "grad_norm": 0.2584626078605652, "learning_rate": 1.3529740178469144e-05, "loss": 1.2238, "step": 13545 }, { "epoch": 4.03462462071148, "grad_norm": 0.2900390326976776, "learning_rate": 1.3528837681808206e-05, "loss": 1.2167, "step": 13546 }, { "epoch": 4.034922466911149, "grad_norm": 0.25129029154777527, "learning_rate": 1.3527935152315381e-05, "loss": 1.2191, "step": 13547 }, { "epoch": 4.035220313110817, "grad_norm": 0.2600948214530945, "learning_rate": 1.3527032589999058e-05, "loss": 1.2234, "step": 13548 }, { "epoch": 4.035518159310486, "grad_norm": 0.26066386699676514, "learning_rate": 1.3526129994867639e-05, "loss": 1.2198, "step": 13549 }, { "epoch": 4.035816005510155, "grad_norm": 0.25908273458480835, "learning_rate": 1.352522736692952e-05, "loss": 1.2397, "step": 13550 }, { "epoch": 4.036113851709823, "grad_norm": 0.2511611580848694, "learning_rate": 1.3524324706193102e-05, "loss": 1.2204, "step": 13551 }, { "epoch": 4.036411697909492, "grad_norm": 0.2425713688135147, "learning_rate": 1.3523422012666776e-05, "loss": 1.2196, "step": 13552 }, { "epoch": 4.0367095441091605, "grad_norm": 0.2602206766605377, "learning_rate": 1.3522519286358944e-05, "loss": 1.2331, "step": 13553 }, { "epoch": 4.03700739030883, "grad_norm": 0.24712972342967987, "learning_rate": 1.3521616527278006e-05, "loss": 1.2283, "step": 13554 }, { "epoch": 4.037305236508498, "grad_norm": 0.2583823502063751, "learning_rate": 1.3520713735432365e-05, "loss": 1.2206, "step": 13555 }, { "epoch": 4.037603082708166, "grad_norm": 0.2572380304336548, "learning_rate": 1.3519810910830411e-05, "loss": 1.2191, "step": 13556 }, { "epoch": 4.037900928907836, "grad_norm": 0.25522580742836, "learning_rate": 1.3518908053480553e-05, "loss": 1.2156, "step": 13557 }, { "epoch": 4.038198775107504, "grad_norm": 0.261445015668869, "learning_rate": 1.3518005163391185e-05, "loss": 1.2348, "step": 13558 }, { "epoch": 4.038496621307172, "grad_norm": 0.25178179144859314, "learning_rate": 1.351710224057071e-05, "loss": 1.2256, "step": 13559 }, { "epoch": 4.0387944675068415, "grad_norm": 0.2590835988521576, "learning_rate": 1.3516199285027527e-05, "loss": 1.2259, "step": 13560 }, { "epoch": 4.03909231370651, "grad_norm": 0.24784936010837555, "learning_rate": 1.3515296296770041e-05, "loss": 1.2336, "step": 13561 }, { "epoch": 4.039390159906178, "grad_norm": 0.2646029591560364, "learning_rate": 1.351439327580665e-05, "loss": 1.2354, "step": 13562 }, { "epoch": 4.039688006105847, "grad_norm": 0.25064483284950256, "learning_rate": 1.3513490222145756e-05, "loss": 1.2173, "step": 13563 }, { "epoch": 4.039985852305516, "grad_norm": 0.24869340658187866, "learning_rate": 1.3512587135795759e-05, "loss": 1.2298, "step": 13564 }, { "epoch": 4.040283698505184, "grad_norm": 0.27413302659988403, "learning_rate": 1.3511684016765063e-05, "loss": 1.2127, "step": 13565 }, { "epoch": 4.040581544704853, "grad_norm": 0.25885817408561707, "learning_rate": 1.3510780865062072e-05, "loss": 1.2192, "step": 13566 }, { "epoch": 4.0408793909045215, "grad_norm": 0.3105272948741913, "learning_rate": 1.3509877680695187e-05, "loss": 1.2232, "step": 13567 }, { "epoch": 4.041177237104191, "grad_norm": 0.34326842427253723, "learning_rate": 1.3508974463672814e-05, "loss": 1.2114, "step": 13568 }, { "epoch": 4.041475083303859, "grad_norm": 0.33438393473625183, "learning_rate": 1.3508071214003353e-05, "loss": 1.2284, "step": 13569 }, { "epoch": 4.041772929503527, "grad_norm": 0.8982435464859009, "learning_rate": 1.3507167931695206e-05, "loss": 1.2093, "step": 13570 }, { "epoch": 4.042070775703197, "grad_norm": 0.3824581205844879, "learning_rate": 1.3506264616756784e-05, "loss": 1.2331, "step": 13571 }, { "epoch": 4.042368621902865, "grad_norm": 0.2815983295440674, "learning_rate": 1.3505361269196482e-05, "loss": 1.2305, "step": 13572 }, { "epoch": 4.042666468102533, "grad_norm": 0.24957147240638733, "learning_rate": 1.3504457889022713e-05, "loss": 1.2327, "step": 13573 }, { "epoch": 4.0429643143022025, "grad_norm": 0.2587170898914337, "learning_rate": 1.3503554476243878e-05, "loss": 1.227, "step": 13574 }, { "epoch": 4.043262160501871, "grad_norm": 0.2537778913974762, "learning_rate": 1.3502651030868386e-05, "loss": 1.2342, "step": 13575 }, { "epoch": 4.043560006701539, "grad_norm": 0.24664580821990967, "learning_rate": 1.3501747552904636e-05, "loss": 1.2229, "step": 13576 }, { "epoch": 4.043857852901208, "grad_norm": 0.25332629680633545, "learning_rate": 1.3500844042361036e-05, "loss": 1.2416, "step": 13577 }, { "epoch": 4.044155699100877, "grad_norm": 0.24326634407043457, "learning_rate": 1.3499940499245998e-05, "loss": 1.2323, "step": 13578 }, { "epoch": 4.044453545300545, "grad_norm": 0.239749014377594, "learning_rate": 1.3499036923567923e-05, "loss": 1.2184, "step": 13579 }, { "epoch": 4.044751391500214, "grad_norm": 0.24345549941062927, "learning_rate": 1.3498133315335219e-05, "loss": 1.2173, "step": 13580 }, { "epoch": 4.045049237699883, "grad_norm": 0.24426348507404327, "learning_rate": 1.349722967455629e-05, "loss": 1.2177, "step": 13581 }, { "epoch": 4.045347083899552, "grad_norm": 0.24000655114650726, "learning_rate": 1.3496326001239547e-05, "loss": 1.2185, "step": 13582 }, { "epoch": 4.04564493009922, "grad_norm": 0.24266929924488068, "learning_rate": 1.3495422295393399e-05, "loss": 1.2074, "step": 13583 }, { "epoch": 4.045942776298888, "grad_norm": 0.25049179792404175, "learning_rate": 1.349451855702625e-05, "loss": 1.2434, "step": 13584 }, { "epoch": 4.046240622498558, "grad_norm": 0.24727383255958557, "learning_rate": 1.3493614786146512e-05, "loss": 1.2248, "step": 13585 }, { "epoch": 4.046538468698226, "grad_norm": 0.24171684682369232, "learning_rate": 1.3492710982762592e-05, "loss": 1.2278, "step": 13586 }, { "epoch": 4.046836314897894, "grad_norm": 0.24876807630062103, "learning_rate": 1.3491807146882897e-05, "loss": 1.2247, "step": 13587 }, { "epoch": 4.0471341610975635, "grad_norm": 0.24070468544960022, "learning_rate": 1.3490903278515837e-05, "loss": 1.2244, "step": 13588 }, { "epoch": 4.047432007297232, "grad_norm": 0.24079173803329468, "learning_rate": 1.3489999377669823e-05, "loss": 1.2276, "step": 13589 }, { "epoch": 4.0477298534969, "grad_norm": 0.24558314681053162, "learning_rate": 1.3489095444353261e-05, "loss": 1.2237, "step": 13590 }, { "epoch": 4.048027699696569, "grad_norm": 0.24840916693210602, "learning_rate": 1.3488191478574568e-05, "loss": 1.2273, "step": 13591 }, { "epoch": 4.048325545896238, "grad_norm": 0.26182103157043457, "learning_rate": 1.3487287480342152e-05, "loss": 1.2173, "step": 13592 }, { "epoch": 4.048623392095906, "grad_norm": 0.24094253778457642, "learning_rate": 1.348638344966442e-05, "loss": 1.2288, "step": 13593 }, { "epoch": 4.048921238295575, "grad_norm": 0.24000322818756104, "learning_rate": 1.3485479386549785e-05, "loss": 1.2288, "step": 13594 }, { "epoch": 4.049219084495244, "grad_norm": 0.2488178163766861, "learning_rate": 1.3484575291006656e-05, "loss": 1.2378, "step": 13595 }, { "epoch": 4.049516930694913, "grad_norm": 0.24873749911785126, "learning_rate": 1.3483671163043453e-05, "loss": 1.2274, "step": 13596 }, { "epoch": 4.049814776894581, "grad_norm": 0.2439805418252945, "learning_rate": 1.3482767002668578e-05, "loss": 1.2433, "step": 13597 }, { "epoch": 4.0501126230942495, "grad_norm": 0.24247656762599945, "learning_rate": 1.3481862809890447e-05, "loss": 1.2142, "step": 13598 }, { "epoch": 4.050410469293919, "grad_norm": 0.25325271487236023, "learning_rate": 1.3480958584717476e-05, "loss": 1.2266, "step": 13599 }, { "epoch": 4.050708315493587, "grad_norm": 0.2725006341934204, "learning_rate": 1.3480054327158069e-05, "loss": 1.2292, "step": 13600 }, { "epoch": 4.051006161693255, "grad_norm": 0.24800099432468414, "learning_rate": 1.347915003722065e-05, "loss": 1.2427, "step": 13601 }, { "epoch": 4.0513040078929246, "grad_norm": 0.2671442925930023, "learning_rate": 1.3478245714913626e-05, "loss": 1.2367, "step": 13602 }, { "epoch": 4.051601854092593, "grad_norm": 0.2499389946460724, "learning_rate": 1.347734136024541e-05, "loss": 1.2211, "step": 13603 }, { "epoch": 4.051899700292261, "grad_norm": 0.2426474392414093, "learning_rate": 1.3476436973224418e-05, "loss": 1.2198, "step": 13604 }, { "epoch": 4.05219754649193, "grad_norm": 0.2303323596715927, "learning_rate": 1.3475532553859065e-05, "loss": 1.2242, "step": 13605 }, { "epoch": 4.052495392691599, "grad_norm": 0.27949002385139465, "learning_rate": 1.3474628102157765e-05, "loss": 1.2382, "step": 13606 }, { "epoch": 4.052793238891267, "grad_norm": 0.3251260817050934, "learning_rate": 1.3473723618128931e-05, "loss": 1.2418, "step": 13607 }, { "epoch": 4.053091085090936, "grad_norm": 0.2503196597099304, "learning_rate": 1.3472819101780981e-05, "loss": 1.2272, "step": 13608 }, { "epoch": 4.053388931290605, "grad_norm": 0.28937655687332153, "learning_rate": 1.3471914553122329e-05, "loss": 1.2353, "step": 13609 }, { "epoch": 4.053686777490274, "grad_norm": 0.2629067003726959, "learning_rate": 1.3471009972161392e-05, "loss": 1.2341, "step": 13610 }, { "epoch": 4.053984623689942, "grad_norm": 0.2914303243160248, "learning_rate": 1.3470105358906585e-05, "loss": 1.2251, "step": 13611 }, { "epoch": 4.0542824698896105, "grad_norm": 0.28820618987083435, "learning_rate": 1.3469200713366323e-05, "loss": 1.2296, "step": 13612 }, { "epoch": 4.05458031608928, "grad_norm": 0.27909815311431885, "learning_rate": 1.3468296035549026e-05, "loss": 1.2254, "step": 13613 }, { "epoch": 4.054878162288948, "grad_norm": 0.278202086687088, "learning_rate": 1.3467391325463109e-05, "loss": 1.2136, "step": 13614 }, { "epoch": 4.055176008488616, "grad_norm": 0.2600470185279846, "learning_rate": 1.3466486583116989e-05, "loss": 1.225, "step": 13615 }, { "epoch": 4.055473854688286, "grad_norm": 0.23804350197315216, "learning_rate": 1.3465581808519086e-05, "loss": 1.2346, "step": 13616 }, { "epoch": 4.055771700887954, "grad_norm": 0.2773174047470093, "learning_rate": 1.3464677001677815e-05, "loss": 1.2341, "step": 13617 }, { "epoch": 4.056069547087622, "grad_norm": 0.29110240936279297, "learning_rate": 1.3463772162601594e-05, "loss": 1.2143, "step": 13618 }, { "epoch": 4.0563673932872915, "grad_norm": 0.2538909614086151, "learning_rate": 1.3462867291298846e-05, "loss": 1.2311, "step": 13619 }, { "epoch": 4.05666523948696, "grad_norm": 0.3023252487182617, "learning_rate": 1.3461962387777985e-05, "loss": 1.2374, "step": 13620 }, { "epoch": 4.056963085686629, "grad_norm": 0.23743636906147003, "learning_rate": 1.3461057452047433e-05, "loss": 1.2103, "step": 13621 }, { "epoch": 4.057260931886297, "grad_norm": 0.26855871081352234, "learning_rate": 1.3460152484115607e-05, "loss": 1.2317, "step": 13622 }, { "epoch": 4.057558778085966, "grad_norm": 0.2674756646156311, "learning_rate": 1.3459247483990926e-05, "loss": 1.2232, "step": 13623 }, { "epoch": 4.057856624285635, "grad_norm": 0.25979241728782654, "learning_rate": 1.3458342451681816e-05, "loss": 1.2366, "step": 13624 }, { "epoch": 4.058154470485303, "grad_norm": 0.29046401381492615, "learning_rate": 1.3457437387196692e-05, "loss": 1.227, "step": 13625 }, { "epoch": 4.0584523166849715, "grad_norm": 0.27793610095977783, "learning_rate": 1.3456532290543978e-05, "loss": 1.2254, "step": 13626 }, { "epoch": 4.058750162884641, "grad_norm": 0.3121613562107086, "learning_rate": 1.3455627161732092e-05, "loss": 1.2294, "step": 13627 }, { "epoch": 4.059048009084309, "grad_norm": 0.25026342272758484, "learning_rate": 1.3454722000769453e-05, "loss": 1.2257, "step": 13628 }, { "epoch": 4.059345855283977, "grad_norm": 0.28906697034835815, "learning_rate": 1.3453816807664488e-05, "loss": 1.2368, "step": 13629 }, { "epoch": 4.059643701483647, "grad_norm": 0.29712942242622375, "learning_rate": 1.3452911582425616e-05, "loss": 1.2152, "step": 13630 }, { "epoch": 4.059941547683315, "grad_norm": 0.28302890062332153, "learning_rate": 1.345200632506126e-05, "loss": 1.2133, "step": 13631 }, { "epoch": 4.060239393882983, "grad_norm": 0.26594823598861694, "learning_rate": 1.3451101035579841e-05, "loss": 1.2086, "step": 13632 }, { "epoch": 4.0605372400826525, "grad_norm": 0.29638671875, "learning_rate": 1.3450195713989787e-05, "loss": 1.2129, "step": 13633 }, { "epoch": 4.060835086282321, "grad_norm": 0.2815476953983307, "learning_rate": 1.3449290360299512e-05, "loss": 1.2259, "step": 13634 }, { "epoch": 4.06113293248199, "grad_norm": 0.31807461380958557, "learning_rate": 1.3448384974517446e-05, "loss": 1.2243, "step": 13635 }, { "epoch": 4.061430778681658, "grad_norm": 0.26353856921195984, "learning_rate": 1.3447479556652008e-05, "loss": 1.232, "step": 13636 }, { "epoch": 4.061728624881327, "grad_norm": 0.3384415805339813, "learning_rate": 1.3446574106711625e-05, "loss": 1.2311, "step": 13637 }, { "epoch": 4.062026471080996, "grad_norm": 0.2682347595691681, "learning_rate": 1.3445668624704722e-05, "loss": 1.2031, "step": 13638 }, { "epoch": 4.062324317280664, "grad_norm": 0.25412505865097046, "learning_rate": 1.3444763110639722e-05, "loss": 1.2228, "step": 13639 }, { "epoch": 4.062622163480333, "grad_norm": 0.2534717917442322, "learning_rate": 1.3443857564525051e-05, "loss": 1.2201, "step": 13640 }, { "epoch": 4.062920009680002, "grad_norm": 0.2565198540687561, "learning_rate": 1.3442951986369131e-05, "loss": 1.2268, "step": 13641 }, { "epoch": 4.06321785587967, "grad_norm": 0.28886285424232483, "learning_rate": 1.3442046376180388e-05, "loss": 1.2391, "step": 13642 }, { "epoch": 4.063515702079338, "grad_norm": 0.25802603363990784, "learning_rate": 1.3441140733967254e-05, "loss": 1.218, "step": 13643 }, { "epoch": 4.063813548279008, "grad_norm": 0.256026029586792, "learning_rate": 1.3440235059738147e-05, "loss": 1.2096, "step": 13644 }, { "epoch": 4.064111394478676, "grad_norm": 0.23858126997947693, "learning_rate": 1.3439329353501497e-05, "loss": 1.2446, "step": 13645 }, { "epoch": 4.064409240678344, "grad_norm": 0.23307913541793823, "learning_rate": 1.343842361526573e-05, "loss": 1.2249, "step": 13646 }, { "epoch": 4.0647070868780135, "grad_norm": 0.2645309269428253, "learning_rate": 1.3437517845039273e-05, "loss": 1.2106, "step": 13647 }, { "epoch": 4.065004933077682, "grad_norm": 0.2464878261089325, "learning_rate": 1.3436612042830552e-05, "loss": 1.2319, "step": 13648 }, { "epoch": 4.065302779277351, "grad_norm": 0.2564888894557953, "learning_rate": 1.3435706208647998e-05, "loss": 1.2243, "step": 13649 }, { "epoch": 4.065600625477019, "grad_norm": 0.2656373977661133, "learning_rate": 1.3434800342500036e-05, "loss": 1.2244, "step": 13650 }, { "epoch": 4.065898471676688, "grad_norm": 0.27228906750679016, "learning_rate": 1.3433894444395093e-05, "loss": 1.2307, "step": 13651 }, { "epoch": 4.066196317876357, "grad_norm": 0.29931968450546265, "learning_rate": 1.3432988514341598e-05, "loss": 1.2191, "step": 13652 }, { "epoch": 4.066494164076025, "grad_norm": 0.27375850081443787, "learning_rate": 1.3432082552347983e-05, "loss": 1.2256, "step": 13653 }, { "epoch": 4.066792010275694, "grad_norm": 0.3032185733318329, "learning_rate": 1.3431176558422671e-05, "loss": 1.2123, "step": 13654 }, { "epoch": 4.067089856475363, "grad_norm": 0.25765061378479004, "learning_rate": 1.3430270532574097e-05, "loss": 1.2333, "step": 13655 }, { "epoch": 4.067387702675031, "grad_norm": 0.2828814387321472, "learning_rate": 1.3429364474810689e-05, "loss": 1.2352, "step": 13656 }, { "epoch": 4.0676855488746995, "grad_norm": 0.2466113418340683, "learning_rate": 1.3428458385140877e-05, "loss": 1.2229, "step": 13657 }, { "epoch": 4.067983395074369, "grad_norm": 0.298318088054657, "learning_rate": 1.3427552263573087e-05, "loss": 1.2168, "step": 13658 }, { "epoch": 4.068281241274037, "grad_norm": 0.2558857798576355, "learning_rate": 1.342664611011575e-05, "loss": 1.2198, "step": 13659 }, { "epoch": 4.068579087473705, "grad_norm": 0.28958582878112793, "learning_rate": 1.3425739924777307e-05, "loss": 1.2361, "step": 13660 }, { "epoch": 4.0688769336733746, "grad_norm": 0.27416858077049255, "learning_rate": 1.3424833707566176e-05, "loss": 1.2117, "step": 13661 }, { "epoch": 4.069174779873043, "grad_norm": 0.2728113532066345, "learning_rate": 1.3423927458490795e-05, "loss": 1.2275, "step": 13662 }, { "epoch": 4.069472626072712, "grad_norm": 0.2720509171485901, "learning_rate": 1.3423021177559596e-05, "loss": 1.217, "step": 13663 }, { "epoch": 4.06977047227238, "grad_norm": 0.27941185235977173, "learning_rate": 1.3422114864781008e-05, "loss": 1.2243, "step": 13664 }, { "epoch": 4.070068318472049, "grad_norm": 0.24686238169670105, "learning_rate": 1.3421208520163465e-05, "loss": 1.2179, "step": 13665 }, { "epoch": 4.070366164671718, "grad_norm": 0.2613919675350189, "learning_rate": 1.3420302143715397e-05, "loss": 1.2332, "step": 13666 }, { "epoch": 4.070664010871386, "grad_norm": 0.2512933313846588, "learning_rate": 1.3419395735445244e-05, "loss": 1.2263, "step": 13667 }, { "epoch": 4.070961857071055, "grad_norm": 0.2590857148170471, "learning_rate": 1.3418489295361429e-05, "loss": 1.225, "step": 13668 }, { "epoch": 4.071259703270724, "grad_norm": 0.2707737982273102, "learning_rate": 1.3417582823472395e-05, "loss": 1.2137, "step": 13669 }, { "epoch": 4.071557549470392, "grad_norm": 0.2573646605014801, "learning_rate": 1.3416676319786568e-05, "loss": 1.2313, "step": 13670 }, { "epoch": 4.0718553956700605, "grad_norm": 0.27899056673049927, "learning_rate": 1.3415769784312385e-05, "loss": 1.2485, "step": 13671 }, { "epoch": 4.07215324186973, "grad_norm": 0.2648579180240631, "learning_rate": 1.3414863217058281e-05, "loss": 1.2279, "step": 13672 }, { "epoch": 4.072451088069398, "grad_norm": 0.2523336112499237, "learning_rate": 1.3413956618032691e-05, "loss": 1.2337, "step": 13673 }, { "epoch": 4.072748934269066, "grad_norm": 0.24383379518985748, "learning_rate": 1.3413049987244048e-05, "loss": 1.2249, "step": 13674 }, { "epoch": 4.073046780468736, "grad_norm": 0.2558092474937439, "learning_rate": 1.3412143324700791e-05, "loss": 1.229, "step": 13675 }, { "epoch": 4.073344626668404, "grad_norm": 0.31439778208732605, "learning_rate": 1.3411236630411349e-05, "loss": 1.2237, "step": 13676 }, { "epoch": 4.073642472868073, "grad_norm": 0.29358455538749695, "learning_rate": 1.3410329904384161e-05, "loss": 1.2266, "step": 13677 }, { "epoch": 4.0739403190677415, "grad_norm": 0.2612122893333435, "learning_rate": 1.3409423146627665e-05, "loss": 1.2365, "step": 13678 }, { "epoch": 4.07423816526741, "grad_norm": 0.3495548367500305, "learning_rate": 1.3408516357150296e-05, "loss": 1.2245, "step": 13679 }, { "epoch": 4.074536011467079, "grad_norm": 0.2837928533554077, "learning_rate": 1.340760953596049e-05, "loss": 1.2136, "step": 13680 }, { "epoch": 4.074833857666747, "grad_norm": 0.2951454222202301, "learning_rate": 1.3406702683066688e-05, "loss": 1.2296, "step": 13681 }, { "epoch": 4.075131703866416, "grad_norm": 0.3578106164932251, "learning_rate": 1.340579579847732e-05, "loss": 1.2283, "step": 13682 }, { "epoch": 4.075429550066085, "grad_norm": 0.30266961455345154, "learning_rate": 1.3404888882200827e-05, "loss": 1.2353, "step": 13683 }, { "epoch": 4.075727396265753, "grad_norm": 0.29476091265678406, "learning_rate": 1.3403981934245649e-05, "loss": 1.2171, "step": 13684 }, { "epoch": 4.0760252424654215, "grad_norm": 0.30868008732795715, "learning_rate": 1.3403074954620219e-05, "loss": 1.2048, "step": 13685 }, { "epoch": 4.076323088665091, "grad_norm": 0.4029812216758728, "learning_rate": 1.3402167943332982e-05, "loss": 1.2165, "step": 13686 }, { "epoch": 4.076620934864759, "grad_norm": 0.3996765613555908, "learning_rate": 1.3401260900392373e-05, "loss": 1.2177, "step": 13687 }, { "epoch": 4.076918781064428, "grad_norm": 0.3624470829963684, "learning_rate": 1.340035382580683e-05, "loss": 1.2348, "step": 13688 }, { "epoch": 4.077216627264097, "grad_norm": 0.9039879441261292, "learning_rate": 1.3399446719584792e-05, "loss": 1.22, "step": 13689 }, { "epoch": 4.077514473463765, "grad_norm": 0.4560149908065796, "learning_rate": 1.3398539581734704e-05, "loss": 1.2212, "step": 13690 }, { "epoch": 4.077812319663434, "grad_norm": 0.35248881578445435, "learning_rate": 1.3397632412265002e-05, "loss": 1.2291, "step": 13691 }, { "epoch": 4.0781101658631025, "grad_norm": 0.3275471329689026, "learning_rate": 1.3396725211184128e-05, "loss": 1.2441, "step": 13692 }, { "epoch": 4.078408012062771, "grad_norm": 0.3053240180015564, "learning_rate": 1.3395817978500515e-05, "loss": 1.2276, "step": 13693 }, { "epoch": 4.07870585826244, "grad_norm": 0.6420515179634094, "learning_rate": 1.3394910714222615e-05, "loss": 1.2334, "step": 13694 }, { "epoch": 4.079003704462108, "grad_norm": 0.3115173280239105, "learning_rate": 1.339400341835886e-05, "loss": 1.2163, "step": 13695 }, { "epoch": 4.079301550661777, "grad_norm": 0.2744782865047455, "learning_rate": 1.3393096090917699e-05, "loss": 1.2232, "step": 13696 }, { "epoch": 4.079599396861446, "grad_norm": 0.2513817548751831, "learning_rate": 1.3392188731907567e-05, "loss": 1.2228, "step": 13697 }, { "epoch": 4.079897243061114, "grad_norm": 0.25459542870521545, "learning_rate": 1.3391281341336913e-05, "loss": 1.229, "step": 13698 }, { "epoch": 4.080195089260783, "grad_norm": 0.258196622133255, "learning_rate": 1.339037391921417e-05, "loss": 1.2279, "step": 13699 }, { "epoch": 4.080492935460452, "grad_norm": 0.24485677480697632, "learning_rate": 1.338946646554779e-05, "loss": 1.218, "step": 13700 }, { "epoch": 4.08079078166012, "grad_norm": 0.24852995574474335, "learning_rate": 1.3388558980346207e-05, "loss": 1.2279, "step": 13701 }, { "epoch": 4.081088627859789, "grad_norm": 0.25261256098747253, "learning_rate": 1.3387651463617873e-05, "loss": 1.2307, "step": 13702 }, { "epoch": 4.081386474059458, "grad_norm": 0.27149641513824463, "learning_rate": 1.3386743915371223e-05, "loss": 1.2225, "step": 13703 }, { "epoch": 4.081684320259126, "grad_norm": 0.2802477478981018, "learning_rate": 1.3385836335614706e-05, "loss": 1.2299, "step": 13704 }, { "epoch": 4.081982166458795, "grad_norm": 0.24667353928089142, "learning_rate": 1.3384928724356767e-05, "loss": 1.2318, "step": 13705 }, { "epoch": 4.0822800126584635, "grad_norm": 0.25782522559165955, "learning_rate": 1.3384021081605844e-05, "loss": 1.2312, "step": 13706 }, { "epoch": 4.082577858858132, "grad_norm": 0.26943254470825195, "learning_rate": 1.3383113407370387e-05, "loss": 1.2036, "step": 13707 }, { "epoch": 4.082875705057801, "grad_norm": 0.25699976086616516, "learning_rate": 1.3382205701658843e-05, "loss": 1.2087, "step": 13708 }, { "epoch": 4.083173551257469, "grad_norm": 0.25810468196868896, "learning_rate": 1.3381297964479654e-05, "loss": 1.2139, "step": 13709 }, { "epoch": 4.083471397457138, "grad_norm": 0.2752602994441986, "learning_rate": 1.3380390195841262e-05, "loss": 1.228, "step": 13710 }, { "epoch": 4.083769243656807, "grad_norm": 0.26506638526916504, "learning_rate": 1.3379482395752115e-05, "loss": 1.2203, "step": 13711 }, { "epoch": 4.084067089856475, "grad_norm": 0.28648409247398376, "learning_rate": 1.337857456422066e-05, "loss": 1.2204, "step": 13712 }, { "epoch": 4.084364936056144, "grad_norm": 0.25765904784202576, "learning_rate": 1.3377666701255347e-05, "loss": 1.2178, "step": 13713 }, { "epoch": 4.084662782255813, "grad_norm": 0.3073035478591919, "learning_rate": 1.3376758806864618e-05, "loss": 1.2279, "step": 13714 }, { "epoch": 4.084960628455481, "grad_norm": 0.23774142563343048, "learning_rate": 1.3375850881056922e-05, "loss": 1.2182, "step": 13715 }, { "epoch": 4.08525847465515, "grad_norm": 0.33706215023994446, "learning_rate": 1.3374942923840703e-05, "loss": 1.2248, "step": 13716 }, { "epoch": 4.085556320854819, "grad_norm": 0.2777893841266632, "learning_rate": 1.337403493522441e-05, "loss": 1.2372, "step": 13717 }, { "epoch": 4.085854167054487, "grad_norm": 0.25480255484580994, "learning_rate": 1.3373126915216494e-05, "loss": 1.2383, "step": 13718 }, { "epoch": 4.086152013254156, "grad_norm": 0.24534185230731964, "learning_rate": 1.3372218863825399e-05, "loss": 1.2254, "step": 13719 }, { "epoch": 4.0864498594538246, "grad_norm": 0.25387948751449585, "learning_rate": 1.3371310781059576e-05, "loss": 1.2358, "step": 13720 }, { "epoch": 4.086747705653493, "grad_norm": 0.25739654898643494, "learning_rate": 1.3370402666927473e-05, "loss": 1.2351, "step": 13721 }, { "epoch": 4.087045551853162, "grad_norm": 0.2902533710002899, "learning_rate": 1.3369494521437541e-05, "loss": 1.2252, "step": 13722 }, { "epoch": 4.08734339805283, "grad_norm": 0.2753237783908844, "learning_rate": 1.3368586344598225e-05, "loss": 1.2322, "step": 13723 }, { "epoch": 4.087641244252499, "grad_norm": 0.23811596632003784, "learning_rate": 1.3367678136417974e-05, "loss": 1.2352, "step": 13724 }, { "epoch": 4.087939090452168, "grad_norm": 0.26392996311187744, "learning_rate": 1.3366769896905245e-05, "loss": 1.2156, "step": 13725 }, { "epoch": 4.088236936651836, "grad_norm": 0.27758848667144775, "learning_rate": 1.336586162606848e-05, "loss": 1.2388, "step": 13726 }, { "epoch": 4.088534782851505, "grad_norm": 0.2551907002925873, "learning_rate": 1.3364953323916134e-05, "loss": 1.2271, "step": 13727 }, { "epoch": 4.088832629051174, "grad_norm": 0.26844868063926697, "learning_rate": 1.3364044990456659e-05, "loss": 1.2307, "step": 13728 }, { "epoch": 4.089130475250842, "grad_norm": 0.29414692521095276, "learning_rate": 1.3363136625698503e-05, "loss": 1.2266, "step": 13729 }, { "epoch": 4.089428321450511, "grad_norm": 0.42714792490005493, "learning_rate": 1.3362228229650116e-05, "loss": 1.2107, "step": 13730 }, { "epoch": 4.08972616765018, "grad_norm": 0.3364658057689667, "learning_rate": 1.3361319802319954e-05, "loss": 1.2176, "step": 13731 }, { "epoch": 4.090024013849848, "grad_norm": 0.315278023481369, "learning_rate": 1.3360411343716468e-05, "loss": 1.2365, "step": 13732 }, { "epoch": 4.090321860049517, "grad_norm": 0.29315799474716187, "learning_rate": 1.3359502853848107e-05, "loss": 1.2126, "step": 13733 }, { "epoch": 4.090619706249186, "grad_norm": 0.3561355471611023, "learning_rate": 1.3358594332723325e-05, "loss": 1.2337, "step": 13734 }, { "epoch": 4.090917552448854, "grad_norm": 0.4308493137359619, "learning_rate": 1.3357685780350575e-05, "loss": 1.2385, "step": 13735 }, { "epoch": 4.091215398648523, "grad_norm": 0.28329983353614807, "learning_rate": 1.3356777196738312e-05, "loss": 1.1968, "step": 13736 }, { "epoch": 4.0915132448481915, "grad_norm": 0.2651686370372772, "learning_rate": 1.3355868581894987e-05, "loss": 1.2327, "step": 13737 }, { "epoch": 4.09181109104786, "grad_norm": 0.3283056914806366, "learning_rate": 1.3354959935829052e-05, "loss": 1.223, "step": 13738 }, { "epoch": 4.092108937247529, "grad_norm": 0.25197744369506836, "learning_rate": 1.3354051258548968e-05, "loss": 1.2141, "step": 13739 }, { "epoch": 4.092406783447197, "grad_norm": 0.2821015417575836, "learning_rate": 1.335314255006318e-05, "loss": 1.2397, "step": 13740 }, { "epoch": 4.092704629646866, "grad_norm": 0.24738141894340515, "learning_rate": 1.3352233810380149e-05, "loss": 1.2313, "step": 13741 }, { "epoch": 4.093002475846535, "grad_norm": 0.2650309205055237, "learning_rate": 1.3351325039508328e-05, "loss": 1.218, "step": 13742 }, { "epoch": 4.093300322046203, "grad_norm": 0.3236358165740967, "learning_rate": 1.335041623745617e-05, "loss": 1.2216, "step": 13743 }, { "epoch": 4.093598168245872, "grad_norm": 0.30074071884155273, "learning_rate": 1.3349507404232131e-05, "loss": 1.2025, "step": 13744 }, { "epoch": 4.093896014445541, "grad_norm": 0.2564895749092102, "learning_rate": 1.3348598539844671e-05, "loss": 1.2137, "step": 13745 }, { "epoch": 4.094193860645209, "grad_norm": 0.24864192306995392, "learning_rate": 1.3347689644302244e-05, "loss": 1.2224, "step": 13746 }, { "epoch": 4.094491706844878, "grad_norm": 0.25647157430648804, "learning_rate": 1.3346780717613302e-05, "loss": 1.2349, "step": 13747 }, { "epoch": 4.094789553044547, "grad_norm": 0.28633326292037964, "learning_rate": 1.3345871759786303e-05, "loss": 1.2402, "step": 13748 }, { "epoch": 4.095087399244215, "grad_norm": 0.25307002663612366, "learning_rate": 1.3344962770829712e-05, "loss": 1.2242, "step": 13749 }, { "epoch": 4.095385245443884, "grad_norm": 0.28082963824272156, "learning_rate": 1.3344053750751976e-05, "loss": 1.2064, "step": 13750 }, { "epoch": 4.0956830916435525, "grad_norm": 0.24947533011436462, "learning_rate": 1.3343144699561555e-05, "loss": 1.2471, "step": 13751 }, { "epoch": 4.095980937843221, "grad_norm": 0.30670779943466187, "learning_rate": 1.3342235617266908e-05, "loss": 1.2178, "step": 13752 }, { "epoch": 4.09627878404289, "grad_norm": 0.27580228447914124, "learning_rate": 1.3341326503876495e-05, "loss": 1.2266, "step": 13753 }, { "epoch": 4.096576630242558, "grad_norm": 0.2716881334781647, "learning_rate": 1.334041735939877e-05, "loss": 1.212, "step": 13754 }, { "epoch": 4.096874476442228, "grad_norm": 0.2548467516899109, "learning_rate": 1.3339508183842194e-05, "loss": 1.2271, "step": 13755 }, { "epoch": 4.097172322641896, "grad_norm": 0.29218921065330505, "learning_rate": 1.3338598977215224e-05, "loss": 1.2217, "step": 13756 }, { "epoch": 4.097470168841564, "grad_norm": 0.2565087676048279, "learning_rate": 1.3337689739526325e-05, "loss": 1.2173, "step": 13757 }, { "epoch": 4.0977680150412334, "grad_norm": 0.39434826374053955, "learning_rate": 1.3336780470783947e-05, "loss": 1.2382, "step": 13758 }, { "epoch": 4.098065861240902, "grad_norm": 0.3490411341190338, "learning_rate": 1.3335871170996558e-05, "loss": 1.2271, "step": 13759 }, { "epoch": 4.09836370744057, "grad_norm": 0.4308874309062958, "learning_rate": 1.3334961840172612e-05, "loss": 1.2153, "step": 13760 }, { "epoch": 4.098661553640239, "grad_norm": 0.39875128865242004, "learning_rate": 1.3334052478320576e-05, "loss": 1.2196, "step": 13761 }, { "epoch": 4.098959399839908, "grad_norm": 0.3732142746448517, "learning_rate": 1.3333143085448903e-05, "loss": 1.2354, "step": 13762 }, { "epoch": 4.099257246039576, "grad_norm": 0.2714923024177551, "learning_rate": 1.333223366156606e-05, "loss": 1.219, "step": 13763 }, { "epoch": 4.099555092239245, "grad_norm": 0.6516803503036499, "learning_rate": 1.3331324206680508e-05, "loss": 1.2223, "step": 13764 }, { "epoch": 4.0998529384389135, "grad_norm": 0.4559103846549988, "learning_rate": 1.33304147208007e-05, "loss": 1.2185, "step": 13765 }, { "epoch": 4.100150784638582, "grad_norm": 0.339886873960495, "learning_rate": 1.3329505203935109e-05, "loss": 1.2091, "step": 13766 }, { "epoch": 4.100448630838251, "grad_norm": 0.3002610504627228, "learning_rate": 1.332859565609219e-05, "loss": 1.2272, "step": 13767 }, { "epoch": 4.100746477037919, "grad_norm": 0.4801383316516876, "learning_rate": 1.3327686077280406e-05, "loss": 1.2335, "step": 13768 }, { "epoch": 4.101044323237589, "grad_norm": 0.34596481919288635, "learning_rate": 1.3326776467508223e-05, "loss": 1.2301, "step": 13769 }, { "epoch": 4.101342169437257, "grad_norm": 0.34558749198913574, "learning_rate": 1.33258668267841e-05, "loss": 1.2378, "step": 13770 }, { "epoch": 4.101640015636925, "grad_norm": 0.33433011174201965, "learning_rate": 1.3324957155116503e-05, "loss": 1.2129, "step": 13771 }, { "epoch": 4.1019378618365945, "grad_norm": 0.3223732113838196, "learning_rate": 1.3324047452513894e-05, "loss": 1.2243, "step": 13772 }, { "epoch": 4.102235708036263, "grad_norm": 0.539328932762146, "learning_rate": 1.3323137718984743e-05, "loss": 1.2165, "step": 13773 }, { "epoch": 4.102533554235931, "grad_norm": 0.40519529581069946, "learning_rate": 1.3322227954537503e-05, "loss": 1.2218, "step": 13774 }, { "epoch": 4.1028314004356, "grad_norm": 0.4319937825202942, "learning_rate": 1.3321318159180643e-05, "loss": 1.2247, "step": 13775 }, { "epoch": 4.103129246635269, "grad_norm": 0.27791622281074524, "learning_rate": 1.332040833292263e-05, "loss": 1.2223, "step": 13776 }, { "epoch": 4.103427092834937, "grad_norm": 0.5481389760971069, "learning_rate": 1.3319498475771926e-05, "loss": 1.2423, "step": 13777 }, { "epoch": 4.103724939034606, "grad_norm": 0.2930184304714203, "learning_rate": 1.3318588587736997e-05, "loss": 1.2218, "step": 13778 }, { "epoch": 4.1040227852342746, "grad_norm": 0.3152657151222229, "learning_rate": 1.331767866882631e-05, "loss": 1.2216, "step": 13779 }, { "epoch": 4.104320631433943, "grad_norm": 0.35558372735977173, "learning_rate": 1.3316768719048332e-05, "loss": 1.2266, "step": 13780 }, { "epoch": 4.104618477633612, "grad_norm": 0.2518128752708435, "learning_rate": 1.3315858738411525e-05, "loss": 1.24, "step": 13781 }, { "epoch": 4.10491632383328, "grad_norm": 0.4876645803451538, "learning_rate": 1.3314948726924358e-05, "loss": 1.223, "step": 13782 }, { "epoch": 4.10521417003295, "grad_norm": 0.24637188017368317, "learning_rate": 1.3314038684595295e-05, "loss": 1.2364, "step": 13783 }, { "epoch": 4.105512016232618, "grad_norm": 0.3102695047855377, "learning_rate": 1.3313128611432804e-05, "loss": 1.2281, "step": 13784 }, { "epoch": 4.105809862432286, "grad_norm": 0.32695600390434265, "learning_rate": 1.3312218507445355e-05, "loss": 1.2261, "step": 13785 }, { "epoch": 4.1061077086319555, "grad_norm": 0.26559188961982727, "learning_rate": 1.331130837264141e-05, "loss": 1.2301, "step": 13786 }, { "epoch": 4.106405554831624, "grad_norm": 0.3279210925102234, "learning_rate": 1.3310398207029447e-05, "loss": 1.224, "step": 13787 }, { "epoch": 4.106703401031292, "grad_norm": 0.2661599814891815, "learning_rate": 1.3309488010617921e-05, "loss": 1.2074, "step": 13788 }, { "epoch": 4.107001247230961, "grad_norm": 0.25574296712875366, "learning_rate": 1.3308577783415308e-05, "loss": 1.2262, "step": 13789 }, { "epoch": 4.10729909343063, "grad_norm": 0.3278314173221588, "learning_rate": 1.330766752543008e-05, "loss": 1.2244, "step": 13790 }, { "epoch": 4.107596939630298, "grad_norm": 0.2441791296005249, "learning_rate": 1.3306757236670696e-05, "loss": 1.2199, "step": 13791 }, { "epoch": 4.107894785829967, "grad_norm": 0.3597651720046997, "learning_rate": 1.330584691714563e-05, "loss": 1.2252, "step": 13792 }, { "epoch": 4.108192632029636, "grad_norm": 0.25514233112335205, "learning_rate": 1.3304936566863358e-05, "loss": 1.2293, "step": 13793 }, { "epoch": 4.108490478229304, "grad_norm": 0.2969241142272949, "learning_rate": 1.3304026185832335e-05, "loss": 1.231, "step": 13794 }, { "epoch": 4.108788324428973, "grad_norm": 0.27977922558784485, "learning_rate": 1.3303115774061044e-05, "loss": 1.2138, "step": 13795 }, { "epoch": 4.1090861706286415, "grad_norm": 0.38240185379981995, "learning_rate": 1.3302205331557952e-05, "loss": 1.2334, "step": 13796 }, { "epoch": 4.109384016828311, "grad_norm": 0.25447720289230347, "learning_rate": 1.3301294858331531e-05, "loss": 1.241, "step": 13797 }, { "epoch": 4.109681863027979, "grad_norm": 0.3108261525630951, "learning_rate": 1.3300384354390245e-05, "loss": 1.2187, "step": 13798 }, { "epoch": 4.109979709227647, "grad_norm": 0.2546389102935791, "learning_rate": 1.3299473819742572e-05, "loss": 1.2207, "step": 13799 }, { "epoch": 4.1102775554273165, "grad_norm": 0.3126170039176941, "learning_rate": 1.3298563254396983e-05, "loss": 1.2232, "step": 13800 }, { "epoch": 4.110575401626985, "grad_norm": 0.2660098969936371, "learning_rate": 1.3297652658361945e-05, "loss": 1.2191, "step": 13801 }, { "epoch": 4.110873247826653, "grad_norm": 0.3079351782798767, "learning_rate": 1.3296742031645934e-05, "loss": 1.2434, "step": 13802 }, { "epoch": 4.111171094026322, "grad_norm": 0.35410577058792114, "learning_rate": 1.3295831374257424e-05, "loss": 1.2086, "step": 13803 }, { "epoch": 4.111468940225991, "grad_norm": 0.2747192978858948, "learning_rate": 1.3294920686204886e-05, "loss": 1.2276, "step": 13804 }, { "epoch": 4.111766786425659, "grad_norm": 0.3622715473175049, "learning_rate": 1.329400996749679e-05, "loss": 1.2238, "step": 13805 }, { "epoch": 4.112064632625328, "grad_norm": 0.26115700602531433, "learning_rate": 1.329309921814161e-05, "loss": 1.2171, "step": 13806 }, { "epoch": 4.112362478824997, "grad_norm": 0.3849085569381714, "learning_rate": 1.3292188438147826e-05, "loss": 1.2181, "step": 13807 }, { "epoch": 4.112660325024665, "grad_norm": 0.2930947244167328, "learning_rate": 1.3291277627523905e-05, "loss": 1.2191, "step": 13808 }, { "epoch": 4.112958171224334, "grad_norm": 0.4109828472137451, "learning_rate": 1.3290366786278321e-05, "loss": 1.2389, "step": 13809 }, { "epoch": 4.1132560174240025, "grad_norm": 0.41322556138038635, "learning_rate": 1.3289455914419552e-05, "loss": 1.2278, "step": 13810 }, { "epoch": 4.113553863623672, "grad_norm": 0.3003167510032654, "learning_rate": 1.3288545011956071e-05, "loss": 1.2425, "step": 13811 }, { "epoch": 4.11385170982334, "grad_norm": 0.25727447867393494, "learning_rate": 1.328763407889635e-05, "loss": 1.2172, "step": 13812 }, { "epoch": 4.114149556023008, "grad_norm": 0.4414829909801483, "learning_rate": 1.3286723115248867e-05, "loss": 1.2267, "step": 13813 }, { "epoch": 4.114447402222678, "grad_norm": 0.3486790955066681, "learning_rate": 1.3285812121022101e-05, "loss": 1.2093, "step": 13814 }, { "epoch": 4.114745248422346, "grad_norm": 0.36375582218170166, "learning_rate": 1.3284901096224525e-05, "loss": 1.2105, "step": 13815 }, { "epoch": 4.115043094622014, "grad_norm": 0.2826632559299469, "learning_rate": 1.3283990040864611e-05, "loss": 1.2295, "step": 13816 }, { "epoch": 4.1153409408216834, "grad_norm": 0.47464892268180847, "learning_rate": 1.3283078954950842e-05, "loss": 1.2225, "step": 13817 }, { "epoch": 4.115638787021352, "grad_norm": 0.34501388669013977, "learning_rate": 1.3282167838491687e-05, "loss": 1.2306, "step": 13818 }, { "epoch": 4.11593663322102, "grad_norm": 0.34228000044822693, "learning_rate": 1.328125669149563e-05, "loss": 1.2193, "step": 13819 }, { "epoch": 4.116234479420689, "grad_norm": 0.3054259121417999, "learning_rate": 1.3280345513971144e-05, "loss": 1.235, "step": 13820 }, { "epoch": 4.116532325620358, "grad_norm": 0.39247167110443115, "learning_rate": 1.3279434305926713e-05, "loss": 1.1979, "step": 13821 }, { "epoch": 4.116830171820027, "grad_norm": 0.27253589034080505, "learning_rate": 1.3278523067370805e-05, "loss": 1.2, "step": 13822 }, { "epoch": 4.117128018019695, "grad_norm": 0.3247166574001312, "learning_rate": 1.3277611798311902e-05, "loss": 1.225, "step": 13823 }, { "epoch": 4.1174258642193635, "grad_norm": 0.27334272861480713, "learning_rate": 1.3276700498758486e-05, "loss": 1.2293, "step": 13824 }, { "epoch": 4.117723710419033, "grad_norm": 0.2963360846042633, "learning_rate": 1.3275789168719033e-05, "loss": 1.2196, "step": 13825 }, { "epoch": 4.118021556618701, "grad_norm": 0.36786139011383057, "learning_rate": 1.327487780820202e-05, "loss": 1.2256, "step": 13826 }, { "epoch": 4.118319402818369, "grad_norm": 0.32970741391181946, "learning_rate": 1.3273966417215926e-05, "loss": 1.2125, "step": 13827 }, { "epoch": 4.118617249018039, "grad_norm": 0.33740895986557007, "learning_rate": 1.3273054995769238e-05, "loss": 1.2051, "step": 13828 }, { "epoch": 4.118915095217707, "grad_norm": 0.26221126317977905, "learning_rate": 1.3272143543870427e-05, "loss": 1.2054, "step": 13829 }, { "epoch": 4.119212941417375, "grad_norm": 0.37126147747039795, "learning_rate": 1.3271232061527974e-05, "loss": 1.2165, "step": 13830 }, { "epoch": 4.1195107876170445, "grad_norm": 0.24958691000938416, "learning_rate": 1.3270320548750366e-05, "loss": 1.2261, "step": 13831 }, { "epoch": 4.119808633816713, "grad_norm": 0.31862881779670715, "learning_rate": 1.3269409005546077e-05, "loss": 1.2204, "step": 13832 }, { "epoch": 4.120106480016381, "grad_norm": 0.25110116600990295, "learning_rate": 1.3268497431923591e-05, "loss": 1.2428, "step": 13833 }, { "epoch": 4.12040432621605, "grad_norm": 0.4147375822067261, "learning_rate": 1.3267585827891387e-05, "loss": 1.2206, "step": 13834 }, { "epoch": 4.120702172415719, "grad_norm": 0.3040507733821869, "learning_rate": 1.3266674193457944e-05, "loss": 1.2347, "step": 13835 }, { "epoch": 4.121000018615388, "grad_norm": 0.2682487964630127, "learning_rate": 1.3265762528631752e-05, "loss": 1.2152, "step": 13836 }, { "epoch": 4.121297864815056, "grad_norm": 0.26606062054634094, "learning_rate": 1.3264850833421288e-05, "loss": 1.2188, "step": 13837 }, { "epoch": 4.1215957110147246, "grad_norm": 0.2641187310218811, "learning_rate": 1.3263939107835036e-05, "loss": 1.2058, "step": 13838 }, { "epoch": 4.121893557214394, "grad_norm": 0.3513612449169159, "learning_rate": 1.3263027351881475e-05, "loss": 1.2338, "step": 13839 }, { "epoch": 4.122191403414062, "grad_norm": 0.31655916571617126, "learning_rate": 1.3262115565569088e-05, "loss": 1.2186, "step": 13840 }, { "epoch": 4.12248924961373, "grad_norm": 0.340635746717453, "learning_rate": 1.3261203748906363e-05, "loss": 1.2047, "step": 13841 }, { "epoch": 4.1227870958134, "grad_norm": 0.2886393964290619, "learning_rate": 1.3260291901901783e-05, "loss": 1.2451, "step": 13842 }, { "epoch": 4.123084942013068, "grad_norm": 0.3992542624473572, "learning_rate": 1.3259380024563826e-05, "loss": 1.2129, "step": 13843 }, { "epoch": 4.123382788212736, "grad_norm": 0.28295430541038513, "learning_rate": 1.325846811690098e-05, "loss": 1.2276, "step": 13844 }, { "epoch": 4.1236806344124055, "grad_norm": 0.33247432112693787, "learning_rate": 1.3257556178921732e-05, "loss": 1.2315, "step": 13845 }, { "epoch": 4.123978480612074, "grad_norm": 0.3000031113624573, "learning_rate": 1.325664421063456e-05, "loss": 1.2202, "step": 13846 }, { "epoch": 4.124276326811742, "grad_norm": 0.5144345164299011, "learning_rate": 1.325573221204795e-05, "loss": 1.2314, "step": 13847 }, { "epoch": 4.124574173011411, "grad_norm": 0.35543328523635864, "learning_rate": 1.3254820183170394e-05, "loss": 1.2247, "step": 13848 }, { "epoch": 4.12487201921108, "grad_norm": 0.29800620675086975, "learning_rate": 1.325390812401037e-05, "loss": 1.2249, "step": 13849 }, { "epoch": 4.125169865410749, "grad_norm": 0.2537350058555603, "learning_rate": 1.3252996034576368e-05, "loss": 1.214, "step": 13850 }, { "epoch": 4.125467711610417, "grad_norm": 0.37921464443206787, "learning_rate": 1.3252083914876873e-05, "loss": 1.2276, "step": 13851 }, { "epoch": 4.125765557810086, "grad_norm": 0.26001298427581787, "learning_rate": 1.3251171764920369e-05, "loss": 1.2152, "step": 13852 }, { "epoch": 4.126063404009755, "grad_norm": 0.3361720144748688, "learning_rate": 1.3250259584715341e-05, "loss": 1.2226, "step": 13853 }, { "epoch": 4.126361250209423, "grad_norm": 0.24085834622383118, "learning_rate": 1.3249347374270282e-05, "loss": 1.2331, "step": 13854 }, { "epoch": 4.1266590964090915, "grad_norm": 0.2936878502368927, "learning_rate": 1.324843513359368e-05, "loss": 1.2185, "step": 13855 }, { "epoch": 4.126956942608761, "grad_norm": 0.2679438889026642, "learning_rate": 1.3247522862694014e-05, "loss": 1.2318, "step": 13856 }, { "epoch": 4.127254788808429, "grad_norm": 0.26150068640708923, "learning_rate": 1.3246610561579776e-05, "loss": 1.2457, "step": 13857 }, { "epoch": 4.127552635008097, "grad_norm": 0.3115221858024597, "learning_rate": 1.3245698230259456e-05, "loss": 1.2204, "step": 13858 }, { "epoch": 4.1278504812077665, "grad_norm": 0.2776271104812622, "learning_rate": 1.3244785868741539e-05, "loss": 1.2215, "step": 13859 }, { "epoch": 4.128148327407435, "grad_norm": 0.37744003534317017, "learning_rate": 1.3243873477034514e-05, "loss": 1.2316, "step": 13860 }, { "epoch": 4.128446173607104, "grad_norm": 0.28370046615600586, "learning_rate": 1.3242961055146873e-05, "loss": 1.233, "step": 13861 }, { "epoch": 4.128744019806772, "grad_norm": 0.3110189139842987, "learning_rate": 1.32420486030871e-05, "loss": 1.2228, "step": 13862 }, { "epoch": 4.129041866006441, "grad_norm": 0.30852046608924866, "learning_rate": 1.324113612086369e-05, "loss": 1.2175, "step": 13863 }, { "epoch": 4.12933971220611, "grad_norm": 0.28926947712898254, "learning_rate": 1.3240223608485127e-05, "loss": 1.2121, "step": 13864 }, { "epoch": 4.129637558405778, "grad_norm": 0.3092958331108093, "learning_rate": 1.3239311065959903e-05, "loss": 1.2142, "step": 13865 }, { "epoch": 4.129935404605447, "grad_norm": 0.2516164779663086, "learning_rate": 1.3238398493296511e-05, "loss": 1.2047, "step": 13866 }, { "epoch": 4.130233250805116, "grad_norm": 0.30716443061828613, "learning_rate": 1.323748589050344e-05, "loss": 1.2194, "step": 13867 }, { "epoch": 4.130531097004784, "grad_norm": 0.2708042562007904, "learning_rate": 1.3236573257589178e-05, "loss": 1.22, "step": 13868 }, { "epoch": 4.1308289432044525, "grad_norm": 0.38280943036079407, "learning_rate": 1.3235660594562219e-05, "loss": 1.2159, "step": 13869 }, { "epoch": 4.131126789404122, "grad_norm": 0.36915719509124756, "learning_rate": 1.323474790143105e-05, "loss": 1.2203, "step": 13870 }, { "epoch": 4.13142463560379, "grad_norm": 0.25969138741493225, "learning_rate": 1.3233835178204166e-05, "loss": 1.221, "step": 13871 }, { "epoch": 4.131722481803458, "grad_norm": 0.36319831013679504, "learning_rate": 1.3232922424890063e-05, "loss": 1.2302, "step": 13872 }, { "epoch": 4.132020328003128, "grad_norm": 0.24696652591228485, "learning_rate": 1.3232009641497227e-05, "loss": 1.2209, "step": 13873 }, { "epoch": 4.132318174202796, "grad_norm": 0.395910382270813, "learning_rate": 1.3231096828034151e-05, "loss": 1.2239, "step": 13874 }, { "epoch": 4.132616020402464, "grad_norm": 0.24994641542434692, "learning_rate": 1.3230183984509333e-05, "loss": 1.2199, "step": 13875 }, { "epoch": 4.1329138666021334, "grad_norm": 0.8485873937606812, "learning_rate": 1.3229271110931254e-05, "loss": 1.2243, "step": 13876 }, { "epoch": 4.133211712801802, "grad_norm": 0.51860511302948, "learning_rate": 1.322835820730842e-05, "loss": 1.2158, "step": 13877 }, { "epoch": 4.133509559001471, "grad_norm": 0.43362873792648315, "learning_rate": 1.3227445273649323e-05, "loss": 1.2246, "step": 13878 }, { "epoch": 4.133807405201139, "grad_norm": 0.3828974664211273, "learning_rate": 1.3226532309962453e-05, "loss": 1.2237, "step": 13879 }, { "epoch": 4.134105251400808, "grad_norm": 0.6636733412742615, "learning_rate": 1.32256193162563e-05, "loss": 1.2219, "step": 13880 }, { "epoch": 4.134403097600477, "grad_norm": 0.3177591860294342, "learning_rate": 1.3224706292539366e-05, "loss": 1.2144, "step": 13881 }, { "epoch": 4.134700943800145, "grad_norm": 0.33369359374046326, "learning_rate": 1.3223793238820141e-05, "loss": 1.2169, "step": 13882 }, { "epoch": 4.1349987899998135, "grad_norm": 0.30640286207199097, "learning_rate": 1.3222880155107124e-05, "loss": 1.233, "step": 13883 }, { "epoch": 4.135296636199483, "grad_norm": 0.24687960743904114, "learning_rate": 1.3221967041408808e-05, "loss": 1.2352, "step": 13884 }, { "epoch": 4.135594482399151, "grad_norm": 0.32527896761894226, "learning_rate": 1.3221053897733686e-05, "loss": 1.2094, "step": 13885 }, { "epoch": 4.135892328598819, "grad_norm": 0.33888906240463257, "learning_rate": 1.322014072409026e-05, "loss": 1.2253, "step": 13886 }, { "epoch": 4.136190174798489, "grad_norm": 0.25949883460998535, "learning_rate": 1.3219227520487019e-05, "loss": 1.2213, "step": 13887 }, { "epoch": 4.136488020998157, "grad_norm": 0.29973429441452026, "learning_rate": 1.3218314286932465e-05, "loss": 1.2214, "step": 13888 }, { "epoch": 4.136785867197826, "grad_norm": 0.2880532443523407, "learning_rate": 1.321740102343509e-05, "loss": 1.2469, "step": 13889 }, { "epoch": 4.1370837133974945, "grad_norm": 0.26244160532951355, "learning_rate": 1.3216487730003396e-05, "loss": 1.2477, "step": 13890 }, { "epoch": 4.137381559597163, "grad_norm": 0.29476916790008545, "learning_rate": 1.3215574406645874e-05, "loss": 1.2189, "step": 13891 }, { "epoch": 4.137679405796832, "grad_norm": 0.27913185954093933, "learning_rate": 1.3214661053371027e-05, "loss": 1.2226, "step": 13892 }, { "epoch": 4.1379772519965, "grad_norm": 0.27448925375938416, "learning_rate": 1.321374767018735e-05, "loss": 1.2203, "step": 13893 }, { "epoch": 4.138275098196169, "grad_norm": 0.29136985540390015, "learning_rate": 1.3212834257103339e-05, "loss": 1.2114, "step": 13894 }, { "epoch": 4.138572944395838, "grad_norm": 0.2683270275592804, "learning_rate": 1.3211920814127499e-05, "loss": 1.236, "step": 13895 }, { "epoch": 4.138870790595506, "grad_norm": 0.3383072316646576, "learning_rate": 1.3211007341268324e-05, "loss": 1.2139, "step": 13896 }, { "epoch": 4.1391686367951745, "grad_norm": 0.2853613793849945, "learning_rate": 1.3210093838534313e-05, "loss": 1.2348, "step": 13897 }, { "epoch": 4.139466482994844, "grad_norm": 0.2859134376049042, "learning_rate": 1.3209180305933961e-05, "loss": 1.2372, "step": 13898 }, { "epoch": 4.139764329194512, "grad_norm": 0.3109267055988312, "learning_rate": 1.3208266743475777e-05, "loss": 1.2296, "step": 13899 }, { "epoch": 4.14006217539418, "grad_norm": 0.26448291540145874, "learning_rate": 1.3207353151168254e-05, "loss": 1.2289, "step": 13900 }, { "epoch": 4.14036002159385, "grad_norm": 0.3694174587726593, "learning_rate": 1.3206439529019892e-05, "loss": 1.2336, "step": 13901 }, { "epoch": 4.140657867793518, "grad_norm": 0.2503821551799774, "learning_rate": 1.3205525877039194e-05, "loss": 1.2256, "step": 13902 }, { "epoch": 4.140955713993187, "grad_norm": 0.2735968828201294, "learning_rate": 1.320461219523466e-05, "loss": 1.2217, "step": 13903 }, { "epoch": 4.1412535601928555, "grad_norm": 0.2927398383617401, "learning_rate": 1.320369848361479e-05, "loss": 1.2367, "step": 13904 }, { "epoch": 4.141551406392524, "grad_norm": 0.2513747215270996, "learning_rate": 1.3202784742188081e-05, "loss": 1.2328, "step": 13905 }, { "epoch": 4.141849252592193, "grad_norm": 0.29553186893463135, "learning_rate": 1.3201870970963042e-05, "loss": 1.2373, "step": 13906 }, { "epoch": 4.142147098791861, "grad_norm": 0.2649666368961334, "learning_rate": 1.3200957169948169e-05, "loss": 1.2308, "step": 13907 }, { "epoch": 4.14244494499153, "grad_norm": 0.29517480731010437, "learning_rate": 1.3200043339151967e-05, "loss": 1.2306, "step": 13908 }, { "epoch": 4.142742791191199, "grad_norm": 0.282366544008255, "learning_rate": 1.3199129478582937e-05, "loss": 1.226, "step": 13909 }, { "epoch": 4.143040637390867, "grad_norm": 0.28966495394706726, "learning_rate": 1.319821558824958e-05, "loss": 1.2118, "step": 13910 }, { "epoch": 4.143338483590536, "grad_norm": 0.35091632604599, "learning_rate": 1.3197301668160405e-05, "loss": 1.2323, "step": 13911 }, { "epoch": 4.143636329790205, "grad_norm": 0.3068065643310547, "learning_rate": 1.3196387718323905e-05, "loss": 1.2225, "step": 13912 }, { "epoch": 4.143934175989873, "grad_norm": 0.33353498578071594, "learning_rate": 1.319547373874859e-05, "loss": 1.2139, "step": 13913 }, { "epoch": 4.1442320221895415, "grad_norm": 0.28323015570640564, "learning_rate": 1.3194559729442962e-05, "loss": 1.2158, "step": 13914 }, { "epoch": 4.144529868389211, "grad_norm": 0.44431978464126587, "learning_rate": 1.3193645690415524e-05, "loss": 1.223, "step": 13915 }, { "epoch": 4.144827714588879, "grad_norm": 0.32332706451416016, "learning_rate": 1.3192731621674786e-05, "loss": 1.2252, "step": 13916 }, { "epoch": 4.145125560788548, "grad_norm": 0.403382807970047, "learning_rate": 1.319181752322924e-05, "loss": 1.2375, "step": 13917 }, { "epoch": 4.1454234069882165, "grad_norm": 0.2784106731414795, "learning_rate": 1.3190903395087403e-05, "loss": 1.2145, "step": 13918 }, { "epoch": 4.145721253187885, "grad_norm": 0.5025516748428345, "learning_rate": 1.3189989237257771e-05, "loss": 1.2113, "step": 13919 }, { "epoch": 4.146019099387554, "grad_norm": 0.3577045798301697, "learning_rate": 1.3189075049748858e-05, "loss": 1.2258, "step": 13920 }, { "epoch": 4.146316945587222, "grad_norm": 0.3309796452522278, "learning_rate": 1.3188160832569164e-05, "loss": 1.2167, "step": 13921 }, { "epoch": 4.146614791786891, "grad_norm": 0.270114004611969, "learning_rate": 1.3187246585727192e-05, "loss": 1.232, "step": 13922 }, { "epoch": 4.14691263798656, "grad_norm": 0.4047696888446808, "learning_rate": 1.3186332309231453e-05, "loss": 1.222, "step": 13923 }, { "epoch": 4.147210484186228, "grad_norm": 0.25433439016342163, "learning_rate": 1.3185418003090454e-05, "loss": 1.224, "step": 13924 }, { "epoch": 4.147508330385897, "grad_norm": 0.3170766532421112, "learning_rate": 1.3184503667312695e-05, "loss": 1.2381, "step": 13925 }, { "epoch": 4.147806176585566, "grad_norm": 0.2565918266773224, "learning_rate": 1.3183589301906695e-05, "loss": 1.219, "step": 13926 }, { "epoch": 4.148104022785234, "grad_norm": 0.2843881845474243, "learning_rate": 1.3182674906880948e-05, "loss": 1.219, "step": 13927 }, { "epoch": 4.148401868984903, "grad_norm": 0.329664945602417, "learning_rate": 1.3181760482243967e-05, "loss": 1.2375, "step": 13928 }, { "epoch": 4.148699715184572, "grad_norm": 0.2986266314983368, "learning_rate": 1.318084602800426e-05, "loss": 1.2276, "step": 13929 }, { "epoch": 4.14899756138424, "grad_norm": 0.3061686158180237, "learning_rate": 1.3179931544170332e-05, "loss": 1.2247, "step": 13930 }, { "epoch": 4.149295407583909, "grad_norm": 0.26146790385246277, "learning_rate": 1.3179017030750698e-05, "loss": 1.2186, "step": 13931 }, { "epoch": 4.149593253783578, "grad_norm": 0.4060761332511902, "learning_rate": 1.3178102487753859e-05, "loss": 1.2242, "step": 13932 }, { "epoch": 4.149891099983246, "grad_norm": 0.2841872274875641, "learning_rate": 1.3177187915188331e-05, "loss": 1.2341, "step": 13933 }, { "epoch": 4.150188946182915, "grad_norm": 0.3375382423400879, "learning_rate": 1.3176273313062617e-05, "loss": 1.2207, "step": 13934 }, { "epoch": 4.1504867923825834, "grad_norm": 0.2719001770019531, "learning_rate": 1.3175358681385227e-05, "loss": 1.2392, "step": 13935 }, { "epoch": 4.150784638582252, "grad_norm": 0.35442888736724854, "learning_rate": 1.3174444020164674e-05, "loss": 1.2207, "step": 13936 }, { "epoch": 4.151082484781921, "grad_norm": 0.3728295862674713, "learning_rate": 1.3173529329409466e-05, "loss": 1.2188, "step": 13937 }, { "epoch": 4.151380330981589, "grad_norm": 0.25157758593559265, "learning_rate": 1.3172614609128114e-05, "loss": 1.2388, "step": 13938 }, { "epoch": 4.151678177181258, "grad_norm": 0.2839163541793823, "learning_rate": 1.3171699859329125e-05, "loss": 1.2331, "step": 13939 }, { "epoch": 4.151976023380927, "grad_norm": 0.24345000088214874, "learning_rate": 1.3170785080021019e-05, "loss": 1.2087, "step": 13940 }, { "epoch": 4.152273869580595, "grad_norm": 0.2459232360124588, "learning_rate": 1.3169870271212293e-05, "loss": 1.2345, "step": 13941 }, { "epoch": 4.1525717157802635, "grad_norm": 0.29808467626571655, "learning_rate": 1.3168955432911467e-05, "loss": 1.2244, "step": 13942 }, { "epoch": 4.152869561979933, "grad_norm": 0.2867465913295746, "learning_rate": 1.3168040565127055e-05, "loss": 1.1921, "step": 13943 }, { "epoch": 4.153167408179601, "grad_norm": 0.25989869236946106, "learning_rate": 1.3167125667867562e-05, "loss": 1.2328, "step": 13944 }, { "epoch": 4.15346525437927, "grad_norm": 0.2782016694545746, "learning_rate": 1.3166210741141505e-05, "loss": 1.225, "step": 13945 }, { "epoch": 4.153763100578939, "grad_norm": 0.2776489555835724, "learning_rate": 1.3165295784957393e-05, "loss": 1.2262, "step": 13946 }, { "epoch": 4.154060946778607, "grad_norm": 0.2627328336238861, "learning_rate": 1.316438079932374e-05, "loss": 1.2092, "step": 13947 }, { "epoch": 4.154358792978276, "grad_norm": 0.2466094046831131, "learning_rate": 1.316346578424906e-05, "loss": 1.2425, "step": 13948 }, { "epoch": 4.1546566391779445, "grad_norm": 0.26793643832206726, "learning_rate": 1.3162550739741864e-05, "loss": 1.22, "step": 13949 }, { "epoch": 4.154954485377613, "grad_norm": 0.28842565417289734, "learning_rate": 1.316163566581067e-05, "loss": 1.2268, "step": 13950 }, { "epoch": 4.155252331577282, "grad_norm": 0.3220886290073395, "learning_rate": 1.3160720562463986e-05, "loss": 1.2127, "step": 13951 }, { "epoch": 4.15555017777695, "grad_norm": 0.36773040890693665, "learning_rate": 1.315980542971033e-05, "loss": 1.2324, "step": 13952 }, { "epoch": 4.155848023976619, "grad_norm": 0.36914122104644775, "learning_rate": 1.315889026755821e-05, "loss": 1.1924, "step": 13953 }, { "epoch": 4.156145870176288, "grad_norm": 0.24720020592212677, "learning_rate": 1.3157975076016154e-05, "loss": 1.2391, "step": 13954 }, { "epoch": 4.156443716375956, "grad_norm": 0.26182740926742554, "learning_rate": 1.3157059855092662e-05, "loss": 1.223, "step": 13955 }, { "epoch": 4.156741562575625, "grad_norm": 0.30646032094955444, "learning_rate": 1.3156144604796257e-05, "loss": 1.2277, "step": 13956 }, { "epoch": 4.157039408775294, "grad_norm": 0.27570775151252747, "learning_rate": 1.3155229325135455e-05, "loss": 1.2372, "step": 13957 }, { "epoch": 4.157337254974962, "grad_norm": 0.47832658886909485, "learning_rate": 1.3154314016118763e-05, "loss": 1.207, "step": 13958 }, { "epoch": 4.157635101174631, "grad_norm": 0.3838043510913849, "learning_rate": 1.3153398677754707e-05, "loss": 1.2133, "step": 13959 }, { "epoch": 4.1579329473743, "grad_norm": 0.3545500338077545, "learning_rate": 1.31524833100518e-05, "loss": 1.2345, "step": 13960 }, { "epoch": 4.158230793573968, "grad_norm": 0.2836247682571411, "learning_rate": 1.315156791301856e-05, "loss": 1.2229, "step": 13961 }, { "epoch": 4.158528639773637, "grad_norm": 0.26212188601493835, "learning_rate": 1.3150652486663498e-05, "loss": 1.233, "step": 13962 }, { "epoch": 4.1588264859733055, "grad_norm": 0.3060859441757202, "learning_rate": 1.3149737030995136e-05, "loss": 1.2049, "step": 13963 }, { "epoch": 4.159124332172974, "grad_norm": 0.2617625594139099, "learning_rate": 1.3148821546021991e-05, "loss": 1.2229, "step": 13964 }, { "epoch": 4.159422178372643, "grad_norm": 0.32073497772216797, "learning_rate": 1.3147906031752578e-05, "loss": 1.2253, "step": 13965 }, { "epoch": 4.159720024572311, "grad_norm": 0.44537511467933655, "learning_rate": 1.3146990488195415e-05, "loss": 1.2287, "step": 13966 }, { "epoch": 4.16001787077198, "grad_norm": 0.3504146933555603, "learning_rate": 1.3146074915359026e-05, "loss": 1.2299, "step": 13967 }, { "epoch": 4.160315716971649, "grad_norm": 0.26046764850616455, "learning_rate": 1.3145159313251922e-05, "loss": 1.217, "step": 13968 }, { "epoch": 4.160613563171317, "grad_norm": 0.3015144467353821, "learning_rate": 1.3144243681882625e-05, "loss": 1.217, "step": 13969 }, { "epoch": 4.1609114093709865, "grad_norm": 0.38506749272346497, "learning_rate": 1.3143328021259654e-05, "loss": 1.2119, "step": 13970 }, { "epoch": 4.161209255570655, "grad_norm": 0.3009447753429413, "learning_rate": 1.3142412331391528e-05, "loss": 1.2172, "step": 13971 }, { "epoch": 4.161507101770323, "grad_norm": 0.25323185324668884, "learning_rate": 1.3141496612286763e-05, "loss": 1.2244, "step": 13972 }, { "epoch": 4.161804947969992, "grad_norm": 0.34163016080856323, "learning_rate": 1.3140580863953885e-05, "loss": 1.2385, "step": 13973 }, { "epoch": 4.162102794169661, "grad_norm": 0.25774526596069336, "learning_rate": 1.3139665086401413e-05, "loss": 1.2332, "step": 13974 }, { "epoch": 4.162400640369329, "grad_norm": 0.40288153290748596, "learning_rate": 1.313874927963786e-05, "loss": 1.2274, "step": 13975 }, { "epoch": 4.162698486568998, "grad_norm": 0.30704954266548157, "learning_rate": 1.3137833443671753e-05, "loss": 1.2231, "step": 13976 }, { "epoch": 4.1629963327686665, "grad_norm": 0.30630987882614136, "learning_rate": 1.3136917578511612e-05, "loss": 1.2277, "step": 13977 }, { "epoch": 4.163294178968335, "grad_norm": 0.2474551796913147, "learning_rate": 1.313600168416596e-05, "loss": 1.2126, "step": 13978 }, { "epoch": 4.163592025168004, "grad_norm": 0.30916136503219604, "learning_rate": 1.3135085760643316e-05, "loss": 1.2065, "step": 13979 }, { "epoch": 4.163889871367672, "grad_norm": 0.2527836263179779, "learning_rate": 1.31341698079522e-05, "loss": 1.227, "step": 13980 }, { "epoch": 4.164187717567341, "grad_norm": 0.2940084636211395, "learning_rate": 1.3133253826101138e-05, "loss": 1.2185, "step": 13981 }, { "epoch": 4.16448556376701, "grad_norm": 0.2573065757751465, "learning_rate": 1.3132337815098646e-05, "loss": 1.2043, "step": 13982 }, { "epoch": 4.164783409966678, "grad_norm": 0.28639015555381775, "learning_rate": 1.313142177495325e-05, "loss": 1.2074, "step": 13983 }, { "epoch": 4.1650812561663475, "grad_norm": 0.3267950713634491, "learning_rate": 1.3130505705673478e-05, "loss": 1.2213, "step": 13984 }, { "epoch": 4.165379102366016, "grad_norm": 0.25070884823799133, "learning_rate": 1.3129589607267846e-05, "loss": 1.2164, "step": 13985 }, { "epoch": 4.165676948565684, "grad_norm": 0.3283660411834717, "learning_rate": 1.3128673479744878e-05, "loss": 1.2177, "step": 13986 }, { "epoch": 4.165974794765353, "grad_norm": 0.4489118754863739, "learning_rate": 1.3127757323113097e-05, "loss": 1.2288, "step": 13987 }, { "epoch": 4.166272640965022, "grad_norm": 0.4153156876564026, "learning_rate": 1.3126841137381033e-05, "loss": 1.2365, "step": 13988 }, { "epoch": 4.16657048716469, "grad_norm": 0.2731744647026062, "learning_rate": 1.3125924922557203e-05, "loss": 1.2207, "step": 13989 }, { "epoch": 4.166868333364359, "grad_norm": 0.37405744194984436, "learning_rate": 1.3125008678650133e-05, "loss": 1.2232, "step": 13990 }, { "epoch": 4.167166179564028, "grad_norm": 0.35321325063705444, "learning_rate": 1.3124092405668351e-05, "loss": 1.2007, "step": 13991 }, { "epoch": 4.167464025763696, "grad_norm": 0.2919888198375702, "learning_rate": 1.312317610362038e-05, "loss": 1.2098, "step": 13992 }, { "epoch": 4.167761871963365, "grad_norm": 0.489874929189682, "learning_rate": 1.3122259772514744e-05, "loss": 1.2241, "step": 13993 }, { "epoch": 4.1680597181630334, "grad_norm": 0.3767636716365814, "learning_rate": 1.3121343412359966e-05, "loss": 1.2304, "step": 13994 }, { "epoch": 4.168357564362703, "grad_norm": 0.24852719902992249, "learning_rate": 1.3120427023164579e-05, "loss": 1.2294, "step": 13995 }, { "epoch": 4.168655410562371, "grad_norm": 0.30303964018821716, "learning_rate": 1.3119510604937105e-05, "loss": 1.2189, "step": 13996 }, { "epoch": 4.168953256762039, "grad_norm": 0.25402772426605225, "learning_rate": 1.311859415768607e-05, "loss": 1.2216, "step": 13997 }, { "epoch": 4.1692511029617085, "grad_norm": 0.323548823595047, "learning_rate": 1.311767768142e-05, "loss": 1.2204, "step": 13998 }, { "epoch": 4.169548949161377, "grad_norm": 0.5445820689201355, "learning_rate": 1.3116761176147421e-05, "loss": 1.2239, "step": 13999 }, { "epoch": 4.169846795361045, "grad_norm": 0.4276760220527649, "learning_rate": 1.311584464187686e-05, "loss": 1.2277, "step": 14000 }, { "epoch": 4.169846795361045, "eval_loss": 1.3272372484207153, "eval_runtime": 20.3261, "eval_samples_per_second": 85.309, "eval_steps_per_second": 5.363, "step": 14000 }, { "epoch": 4.170144641560714, "grad_norm": 0.3908666968345642, "learning_rate": 1.3114928078616848e-05, "loss": 1.2366, "step": 14001 }, { "epoch": 4.170442487760383, "grad_norm": 0.811020016670227, "learning_rate": 1.3114011486375912e-05, "loss": 1.2227, "step": 14002 }, { "epoch": 4.170740333960051, "grad_norm": 0.5054917335510254, "learning_rate": 1.311309486516258e-05, "loss": 1.2273, "step": 14003 }, { "epoch": 4.17103818015972, "grad_norm": 0.5227484107017517, "learning_rate": 1.3112178214985374e-05, "loss": 1.215, "step": 14004 }, { "epoch": 4.171336026359389, "grad_norm": 0.581238329410553, "learning_rate": 1.3111261535852828e-05, "loss": 1.228, "step": 14005 }, { "epoch": 4.171633872559057, "grad_norm": 0.33035385608673096, "learning_rate": 1.3110344827773469e-05, "loss": 1.2347, "step": 14006 }, { "epoch": 4.171931718758726, "grad_norm": 0.4523831009864807, "learning_rate": 1.310942809075583e-05, "loss": 1.2375, "step": 14007 }, { "epoch": 4.1722295649583945, "grad_norm": 0.4137898087501526, "learning_rate": 1.3108511324808433e-05, "loss": 1.2254, "step": 14008 }, { "epoch": 4.172527411158063, "grad_norm": 0.34187790751457214, "learning_rate": 1.3107594529939814e-05, "loss": 1.2237, "step": 14009 }, { "epoch": 4.172825257357732, "grad_norm": 0.36907291412353516, "learning_rate": 1.3106677706158496e-05, "loss": 1.2112, "step": 14010 }, { "epoch": 4.1731231035574, "grad_norm": 0.2730240821838379, "learning_rate": 1.3105760853473015e-05, "loss": 1.2328, "step": 14011 }, { "epoch": 4.17342094975707, "grad_norm": 0.30101391673088074, "learning_rate": 1.3104843971891898e-05, "loss": 1.2301, "step": 14012 }, { "epoch": 4.173718795956738, "grad_norm": 0.3022208511829376, "learning_rate": 1.310392706142368e-05, "loss": 1.2206, "step": 14013 }, { "epoch": 4.174016642156406, "grad_norm": 0.2642434537410736, "learning_rate": 1.3103010122076888e-05, "loss": 1.213, "step": 14014 }, { "epoch": 4.174314488356075, "grad_norm": 0.42281395196914673, "learning_rate": 1.3102093153860052e-05, "loss": 1.2185, "step": 14015 }, { "epoch": 4.174612334555744, "grad_norm": 0.26686713099479675, "learning_rate": 1.3101176156781706e-05, "loss": 1.2045, "step": 14016 }, { "epoch": 4.174910180755412, "grad_norm": 0.3635387420654297, "learning_rate": 1.3100259130850378e-05, "loss": 1.2232, "step": 14017 }, { "epoch": 4.175208026955081, "grad_norm": 0.2564242482185364, "learning_rate": 1.3099342076074604e-05, "loss": 1.2125, "step": 14018 }, { "epoch": 4.17550587315475, "grad_norm": 0.4493139684200287, "learning_rate": 1.3098424992462918e-05, "loss": 1.2257, "step": 14019 }, { "epoch": 4.175803719354418, "grad_norm": 0.27888286113739014, "learning_rate": 1.3097507880023846e-05, "loss": 1.2272, "step": 14020 }, { "epoch": 4.176101565554087, "grad_norm": 0.28345227241516113, "learning_rate": 1.3096590738765925e-05, "loss": 1.2192, "step": 14021 }, { "epoch": 4.1763994117537555, "grad_norm": 0.30897077918052673, "learning_rate": 1.3095673568697687e-05, "loss": 1.2022, "step": 14022 }, { "epoch": 4.176697257953425, "grad_norm": 0.34538790583610535, "learning_rate": 1.3094756369827661e-05, "loss": 1.2117, "step": 14023 }, { "epoch": 4.176995104153093, "grad_norm": 0.29366397857666016, "learning_rate": 1.309383914216439e-05, "loss": 1.2301, "step": 14024 }, { "epoch": 4.177292950352761, "grad_norm": 0.34857288002967834, "learning_rate": 1.3092921885716404e-05, "loss": 1.201, "step": 14025 }, { "epoch": 4.177590796552431, "grad_norm": 0.2962677478790283, "learning_rate": 1.309200460049223e-05, "loss": 1.223, "step": 14026 }, { "epoch": 4.177888642752099, "grad_norm": 0.2779518663883209, "learning_rate": 1.309108728650041e-05, "loss": 1.2353, "step": 14027 }, { "epoch": 4.178186488951767, "grad_norm": 0.2660236358642578, "learning_rate": 1.3090169943749475e-05, "loss": 1.2174, "step": 14028 }, { "epoch": 4.1784843351514365, "grad_norm": 0.2925933003425598, "learning_rate": 1.3089252572247963e-05, "loss": 1.2096, "step": 14029 }, { "epoch": 4.178782181351105, "grad_norm": 0.29860958456993103, "learning_rate": 1.3088335172004408e-05, "loss": 1.2192, "step": 14030 }, { "epoch": 4.179080027550773, "grad_norm": 0.35349053144454956, "learning_rate": 1.3087417743027341e-05, "loss": 1.2231, "step": 14031 }, { "epoch": 4.179377873750442, "grad_norm": 0.28386175632476807, "learning_rate": 1.3086500285325307e-05, "loss": 1.2453, "step": 14032 }, { "epoch": 4.179675719950111, "grad_norm": 0.36909663677215576, "learning_rate": 1.3085582798906834e-05, "loss": 1.2282, "step": 14033 }, { "epoch": 4.179973566149779, "grad_norm": 0.2573685050010681, "learning_rate": 1.3084665283780458e-05, "loss": 1.2265, "step": 14034 }, { "epoch": 4.180271412349448, "grad_norm": 0.4504440724849701, "learning_rate": 1.308374773995472e-05, "loss": 1.2124, "step": 14035 }, { "epoch": 4.1805692585491165, "grad_norm": 0.32981064915657043, "learning_rate": 1.3082830167438154e-05, "loss": 1.2332, "step": 14036 }, { "epoch": 4.180867104748786, "grad_norm": 0.4069383442401886, "learning_rate": 1.3081912566239296e-05, "loss": 1.2135, "step": 14037 }, { "epoch": 4.181164950948454, "grad_norm": 0.35591837763786316, "learning_rate": 1.3080994936366689e-05, "loss": 1.2217, "step": 14038 }, { "epoch": 4.181462797148122, "grad_norm": 0.3362163007259369, "learning_rate": 1.3080077277828864e-05, "loss": 1.2325, "step": 14039 }, { "epoch": 4.181760643347792, "grad_norm": 0.3613108992576599, "learning_rate": 1.3079159590634363e-05, "loss": 1.2328, "step": 14040 }, { "epoch": 4.18205848954746, "grad_norm": 0.33352869749069214, "learning_rate": 1.3078241874791718e-05, "loss": 1.2261, "step": 14041 }, { "epoch": 4.182356335747128, "grad_norm": 0.5207704305648804, "learning_rate": 1.3077324130309475e-05, "loss": 1.2294, "step": 14042 }, { "epoch": 4.1826541819467975, "grad_norm": 0.25804591178894043, "learning_rate": 1.3076406357196168e-05, "loss": 1.2166, "step": 14043 }, { "epoch": 4.182952028146466, "grad_norm": 0.4659302532672882, "learning_rate": 1.3075488555460338e-05, "loss": 1.2197, "step": 14044 }, { "epoch": 4.183249874346134, "grad_norm": 0.3011718690395355, "learning_rate": 1.3074570725110523e-05, "loss": 1.2216, "step": 14045 }, { "epoch": 4.183547720545803, "grad_norm": 0.4368714988231659, "learning_rate": 1.307365286615526e-05, "loss": 1.2253, "step": 14046 }, { "epoch": 4.183845566745472, "grad_norm": 0.3222927153110504, "learning_rate": 1.3072734978603094e-05, "loss": 1.2158, "step": 14047 }, { "epoch": 4.18414341294514, "grad_norm": 0.3827953040599823, "learning_rate": 1.3071817062462562e-05, "loss": 1.2255, "step": 14048 }, { "epoch": 4.184441259144809, "grad_norm": 0.3645618259906769, "learning_rate": 1.3070899117742204e-05, "loss": 1.1938, "step": 14049 }, { "epoch": 4.184739105344478, "grad_norm": 0.2756417393684387, "learning_rate": 1.3069981144450561e-05, "loss": 1.2309, "step": 14050 }, { "epoch": 4.185036951544147, "grad_norm": 0.25370144844055176, "learning_rate": 1.3069063142596173e-05, "loss": 1.2215, "step": 14051 }, { "epoch": 4.185334797743815, "grad_norm": 0.2705208957195282, "learning_rate": 1.306814511218758e-05, "loss": 1.2258, "step": 14052 }, { "epoch": 4.1856326439434834, "grad_norm": 0.2705756425857544, "learning_rate": 1.3067227053233325e-05, "loss": 1.217, "step": 14053 }, { "epoch": 4.185930490143153, "grad_norm": 0.24070847034454346, "learning_rate": 1.306630896574195e-05, "loss": 1.2357, "step": 14054 }, { "epoch": 4.186228336342821, "grad_norm": 0.2530656158924103, "learning_rate": 1.3065390849721995e-05, "loss": 1.2222, "step": 14055 }, { "epoch": 4.186526182542489, "grad_norm": 0.3035649359226227, "learning_rate": 1.3064472705182008e-05, "loss": 1.2152, "step": 14056 }, { "epoch": 4.1868240287421585, "grad_norm": 0.2673301100730896, "learning_rate": 1.306355453213052e-05, "loss": 1.2228, "step": 14057 }, { "epoch": 4.187121874941827, "grad_norm": 0.27571043372154236, "learning_rate": 1.3062636330576081e-05, "loss": 1.1945, "step": 14058 }, { "epoch": 4.187419721141495, "grad_norm": 0.25011956691741943, "learning_rate": 1.3061718100527232e-05, "loss": 1.1996, "step": 14059 }, { "epoch": 4.187717567341164, "grad_norm": 0.2861412465572357, "learning_rate": 1.3060799841992521e-05, "loss": 1.2173, "step": 14060 }, { "epoch": 4.188015413540833, "grad_norm": 0.25167977809906006, "learning_rate": 1.3059881554980485e-05, "loss": 1.2421, "step": 14061 }, { "epoch": 4.188313259740502, "grad_norm": 0.2569390535354614, "learning_rate": 1.3058963239499669e-05, "loss": 1.2221, "step": 14062 }, { "epoch": 4.18861110594017, "grad_norm": 0.26285067200660706, "learning_rate": 1.3058044895558621e-05, "loss": 1.2129, "step": 14063 }, { "epoch": 4.188908952139839, "grad_norm": 0.26100948452949524, "learning_rate": 1.3057126523165877e-05, "loss": 1.2097, "step": 14064 }, { "epoch": 4.189206798339508, "grad_norm": 0.278222918510437, "learning_rate": 1.3056208122329988e-05, "loss": 1.215, "step": 14065 }, { "epoch": 4.189504644539176, "grad_norm": 0.2587638199329376, "learning_rate": 1.3055289693059501e-05, "loss": 1.2135, "step": 14066 }, { "epoch": 4.1898024907388445, "grad_norm": 0.2603953778743744, "learning_rate": 1.3054371235362952e-05, "loss": 1.2125, "step": 14067 }, { "epoch": 4.190100336938514, "grad_norm": 0.27730056643486023, "learning_rate": 1.3053452749248892e-05, "loss": 1.2363, "step": 14068 }, { "epoch": 4.190398183138182, "grad_norm": 0.2562923729419708, "learning_rate": 1.3052534234725866e-05, "loss": 1.2342, "step": 14069 }, { "epoch": 4.19069602933785, "grad_norm": 0.25303590297698975, "learning_rate": 1.3051615691802421e-05, "loss": 1.2091, "step": 14070 }, { "epoch": 4.19099387553752, "grad_norm": 0.2474832534790039, "learning_rate": 1.30506971204871e-05, "loss": 1.2249, "step": 14071 }, { "epoch": 4.191291721737188, "grad_norm": 0.2717093527317047, "learning_rate": 1.304977852078845e-05, "loss": 1.2165, "step": 14072 }, { "epoch": 4.191589567936856, "grad_norm": 0.2634011209011078, "learning_rate": 1.3048859892715022e-05, "loss": 1.231, "step": 14073 }, { "epoch": 4.191887414136525, "grad_norm": 0.38878726959228516, "learning_rate": 1.3047941236275356e-05, "loss": 1.2132, "step": 14074 }, { "epoch": 4.192185260336194, "grad_norm": 0.47264716029167175, "learning_rate": 1.3047022551478004e-05, "loss": 1.2169, "step": 14075 }, { "epoch": 4.192483106535862, "grad_norm": 0.2845783829689026, "learning_rate": 1.304610383833151e-05, "loss": 1.2115, "step": 14076 }, { "epoch": 4.192780952735531, "grad_norm": 0.33200255036354065, "learning_rate": 1.3045185096844422e-05, "loss": 1.238, "step": 14077 }, { "epoch": 4.1930787989352, "grad_norm": 0.2838309705257416, "learning_rate": 1.304426632702529e-05, "loss": 1.2281, "step": 14078 }, { "epoch": 4.193376645134869, "grad_norm": 0.30562594532966614, "learning_rate": 1.3043347528882662e-05, "loss": 1.2167, "step": 14079 }, { "epoch": 4.193674491334537, "grad_norm": 0.3289188742637634, "learning_rate": 1.3042428702425084e-05, "loss": 1.2449, "step": 14080 }, { "epoch": 4.1939723375342055, "grad_norm": 0.3240574598312378, "learning_rate": 1.3041509847661109e-05, "loss": 1.219, "step": 14081 }, { "epoch": 4.194270183733875, "grad_norm": 0.2809533178806305, "learning_rate": 1.3040590964599278e-05, "loss": 1.2265, "step": 14082 }, { "epoch": 4.194568029933543, "grad_norm": 0.24093203246593475, "learning_rate": 1.3039672053248148e-05, "loss": 1.229, "step": 14083 }, { "epoch": 4.194865876133211, "grad_norm": 0.26057231426239014, "learning_rate": 1.3038753113616267e-05, "loss": 1.2378, "step": 14084 }, { "epoch": 4.195163722332881, "grad_norm": 0.26788297295570374, "learning_rate": 1.3037834145712183e-05, "loss": 1.2179, "step": 14085 }, { "epoch": 4.195461568532549, "grad_norm": 0.24841520190238953, "learning_rate": 1.3036915149544444e-05, "loss": 1.2367, "step": 14086 }, { "epoch": 4.195759414732217, "grad_norm": 0.33649685978889465, "learning_rate": 1.3035996125121604e-05, "loss": 1.2086, "step": 14087 }, { "epoch": 4.1960572609318865, "grad_norm": 0.3129236400127411, "learning_rate": 1.3035077072452214e-05, "loss": 1.2353, "step": 14088 }, { "epoch": 4.196355107131555, "grad_norm": 0.2614390552043915, "learning_rate": 1.303415799154482e-05, "loss": 1.2247, "step": 14089 }, { "epoch": 4.196652953331224, "grad_norm": 0.46155256032943726, "learning_rate": 1.3033238882407977e-05, "loss": 1.2074, "step": 14090 }, { "epoch": 4.196950799530892, "grad_norm": 0.3769415318965912, "learning_rate": 1.3032319745050237e-05, "loss": 1.2206, "step": 14091 }, { "epoch": 4.197248645730561, "grad_norm": 0.3116456866264343, "learning_rate": 1.3031400579480147e-05, "loss": 1.2176, "step": 14092 }, { "epoch": 4.19754649193023, "grad_norm": 0.5263674855232239, "learning_rate": 1.3030481385706263e-05, "loss": 1.2134, "step": 14093 }, { "epoch": 4.197844338129898, "grad_norm": 0.34529250860214233, "learning_rate": 1.3029562163737133e-05, "loss": 1.2373, "step": 14094 }, { "epoch": 4.1981421843295665, "grad_norm": 0.3057539463043213, "learning_rate": 1.3028642913581313e-05, "loss": 1.2365, "step": 14095 }, { "epoch": 4.198440030529236, "grad_norm": 0.2712446451187134, "learning_rate": 1.3027723635247355e-05, "loss": 1.218, "step": 14096 }, { "epoch": 4.198737876728904, "grad_norm": 0.27539825439453125, "learning_rate": 1.3026804328743813e-05, "loss": 1.24, "step": 14097 }, { "epoch": 4.199035722928572, "grad_norm": 0.27623793482780457, "learning_rate": 1.3025884994079237e-05, "loss": 1.2323, "step": 14098 }, { "epoch": 4.199333569128242, "grad_norm": 0.2889955937862396, "learning_rate": 1.302496563126218e-05, "loss": 1.2234, "step": 14099 }, { "epoch": 4.19963141532791, "grad_norm": 0.2603602707386017, "learning_rate": 1.3024046240301201e-05, "loss": 1.238, "step": 14100 }, { "epoch": 4.199929261527578, "grad_norm": 0.24106217920780182, "learning_rate": 1.3023126821204848e-05, "loss": 1.2285, "step": 14101 }, { "epoch": 4.2002271077272475, "grad_norm": 0.24685721099376678, "learning_rate": 1.3022207373981676e-05, "loss": 1.2341, "step": 14102 }, { "epoch": 4.200524953926916, "grad_norm": 0.2619490325450897, "learning_rate": 1.3021287898640245e-05, "loss": 1.2277, "step": 14103 }, { "epoch": 4.200822800126585, "grad_norm": 0.2629368305206299, "learning_rate": 1.3020368395189107e-05, "loss": 1.2363, "step": 14104 }, { "epoch": 4.201120646326253, "grad_norm": 0.3153458833694458, "learning_rate": 1.301944886363681e-05, "loss": 1.226, "step": 14105 }, { "epoch": 4.201418492525922, "grad_norm": 0.5970566868782043, "learning_rate": 1.3018529303991915e-05, "loss": 1.2054, "step": 14106 }, { "epoch": 4.201716338725591, "grad_norm": 0.5265555381774902, "learning_rate": 1.3017609716262981e-05, "loss": 1.2322, "step": 14107 }, { "epoch": 4.202014184925259, "grad_norm": 0.3880898058414459, "learning_rate": 1.301669010045856e-05, "loss": 1.2163, "step": 14108 }, { "epoch": 4.202312031124928, "grad_norm": 0.8552187085151672, "learning_rate": 1.3015770456587206e-05, "loss": 1.2292, "step": 14109 }, { "epoch": 4.202609877324597, "grad_norm": 0.503619372844696, "learning_rate": 1.3014850784657478e-05, "loss": 1.2387, "step": 14110 }, { "epoch": 4.202907723524265, "grad_norm": 0.4142906963825226, "learning_rate": 1.3013931084677934e-05, "loss": 1.2342, "step": 14111 }, { "epoch": 4.203205569723933, "grad_norm": 0.41472697257995605, "learning_rate": 1.3013011356657126e-05, "loss": 1.2136, "step": 14112 }, { "epoch": 4.203503415923603, "grad_norm": 0.35401034355163574, "learning_rate": 1.3012091600603613e-05, "loss": 1.2189, "step": 14113 }, { "epoch": 4.203801262123271, "grad_norm": 0.30135732889175415, "learning_rate": 1.3011171816525955e-05, "loss": 1.2154, "step": 14114 }, { "epoch": 4.204099108322939, "grad_norm": 0.3800581991672516, "learning_rate": 1.3010252004432707e-05, "loss": 1.2442, "step": 14115 }, { "epoch": 4.2043969545226085, "grad_norm": 0.27238842844963074, "learning_rate": 1.3009332164332427e-05, "loss": 1.2241, "step": 14116 }, { "epoch": 4.204694800722277, "grad_norm": 0.44803619384765625, "learning_rate": 1.3008412296233673e-05, "loss": 1.2229, "step": 14117 }, { "epoch": 4.204992646921946, "grad_norm": 0.267049640417099, "learning_rate": 1.3007492400145005e-05, "loss": 1.2225, "step": 14118 }, { "epoch": 4.205290493121614, "grad_norm": 0.41336846351623535, "learning_rate": 1.3006572476074978e-05, "loss": 1.2422, "step": 14119 }, { "epoch": 4.205588339321283, "grad_norm": 0.2532871663570404, "learning_rate": 1.3005652524032156e-05, "loss": 1.2167, "step": 14120 }, { "epoch": 4.205886185520952, "grad_norm": 0.2909505367279053, "learning_rate": 1.3004732544025096e-05, "loss": 1.2452, "step": 14121 }, { "epoch": 4.20618403172062, "grad_norm": 0.3960720896720886, "learning_rate": 1.3003812536062355e-05, "loss": 1.2214, "step": 14122 }, { "epoch": 4.206481877920289, "grad_norm": 0.24674756824970245, "learning_rate": 1.3002892500152493e-05, "loss": 1.2286, "step": 14123 }, { "epoch": 4.206779724119958, "grad_norm": 0.37536731362342834, "learning_rate": 1.3001972436304073e-05, "loss": 1.2241, "step": 14124 }, { "epoch": 4.207077570319626, "grad_norm": 0.27816709876060486, "learning_rate": 1.3001052344525652e-05, "loss": 1.243, "step": 14125 }, { "epoch": 4.2073754165192945, "grad_norm": 0.2934020459651947, "learning_rate": 1.3000132224825794e-05, "loss": 1.2319, "step": 14126 }, { "epoch": 4.207673262718964, "grad_norm": 0.35045763850212097, "learning_rate": 1.2999212077213057e-05, "loss": 1.2158, "step": 14127 }, { "epoch": 4.207971108918632, "grad_norm": 0.31835460662841797, "learning_rate": 1.2998291901696006e-05, "loss": 1.2182, "step": 14128 }, { "epoch": 4.208268955118301, "grad_norm": 0.4084266424179077, "learning_rate": 1.2997371698283192e-05, "loss": 1.2191, "step": 14129 }, { "epoch": 4.20856680131797, "grad_norm": 0.2961149215698242, "learning_rate": 1.2996451466983185e-05, "loss": 1.2226, "step": 14130 }, { "epoch": 4.208864647517638, "grad_norm": 0.5896714329719543, "learning_rate": 1.2995531207804549e-05, "loss": 1.2208, "step": 14131 }, { "epoch": 4.209162493717307, "grad_norm": 0.33247706294059753, "learning_rate": 1.2994610920755839e-05, "loss": 1.2405, "step": 14132 }, { "epoch": 4.209460339916975, "grad_norm": 0.317874550819397, "learning_rate": 1.299369060584562e-05, "loss": 1.2235, "step": 14133 }, { "epoch": 4.209758186116644, "grad_norm": 0.30994951725006104, "learning_rate": 1.2992770263082455e-05, "loss": 1.2207, "step": 14134 }, { "epoch": 4.210056032316313, "grad_norm": 0.2707478106021881, "learning_rate": 1.2991849892474905e-05, "loss": 1.2281, "step": 14135 }, { "epoch": 4.210353878515981, "grad_norm": 0.4367784261703491, "learning_rate": 1.2990929494031537e-05, "loss": 1.2324, "step": 14136 }, { "epoch": 4.21065172471565, "grad_norm": 0.39060741662979126, "learning_rate": 1.2990009067760908e-05, "loss": 1.2371, "step": 14137 }, { "epoch": 4.210949570915319, "grad_norm": 0.2724064588546753, "learning_rate": 1.298908861367159e-05, "loss": 1.2203, "step": 14138 }, { "epoch": 4.211247417114987, "grad_norm": 0.3767302930355072, "learning_rate": 1.2988168131772141e-05, "loss": 1.2293, "step": 14139 }, { "epoch": 4.2115452633146555, "grad_norm": 0.32680049538612366, "learning_rate": 1.2987247622071124e-05, "loss": 1.2237, "step": 14140 }, { "epoch": 4.211843109514325, "grad_norm": 0.33390113711357117, "learning_rate": 1.2986327084577106e-05, "loss": 1.2128, "step": 14141 }, { "epoch": 4.212140955713993, "grad_norm": 0.4874330461025238, "learning_rate": 1.2985406519298652e-05, "loss": 1.2256, "step": 14142 }, { "epoch": 4.212438801913661, "grad_norm": 0.2610390782356262, "learning_rate": 1.2984485926244326e-05, "loss": 1.2243, "step": 14143 }, { "epoch": 4.212736648113331, "grad_norm": 0.28563764691352844, "learning_rate": 1.298356530542269e-05, "loss": 1.2334, "step": 14144 }, { "epoch": 4.213034494312999, "grad_norm": 0.32336580753326416, "learning_rate": 1.2982644656842318e-05, "loss": 1.223, "step": 14145 }, { "epoch": 4.213332340512668, "grad_norm": 0.270393431186676, "learning_rate": 1.2981723980511764e-05, "loss": 1.2202, "step": 14146 }, { "epoch": 4.2136301867123365, "grad_norm": 0.2720387578010559, "learning_rate": 1.2980803276439602e-05, "loss": 1.2232, "step": 14147 }, { "epoch": 4.213928032912005, "grad_norm": 0.284346342086792, "learning_rate": 1.2979882544634397e-05, "loss": 1.2377, "step": 14148 }, { "epoch": 4.214225879111674, "grad_norm": 0.2852690815925598, "learning_rate": 1.2978961785104714e-05, "loss": 1.2122, "step": 14149 }, { "epoch": 4.214523725311342, "grad_norm": 0.25768882036209106, "learning_rate": 1.2978040997859118e-05, "loss": 1.2211, "step": 14150 }, { "epoch": 4.214821571511011, "grad_norm": 0.2937699854373932, "learning_rate": 1.2977120182906178e-05, "loss": 1.2263, "step": 14151 }, { "epoch": 4.21511941771068, "grad_norm": 0.4234558045864105, "learning_rate": 1.297619934025446e-05, "loss": 1.2104, "step": 14152 }, { "epoch": 4.215417263910348, "grad_norm": 0.46062833070755005, "learning_rate": 1.2975278469912536e-05, "loss": 1.2197, "step": 14153 }, { "epoch": 4.2157151101100165, "grad_norm": 0.3027644455432892, "learning_rate": 1.2974357571888966e-05, "loss": 1.2188, "step": 14154 }, { "epoch": 4.216012956309686, "grad_norm": 0.32012224197387695, "learning_rate": 1.2973436646192328e-05, "loss": 1.2381, "step": 14155 }, { "epoch": 4.216310802509354, "grad_norm": 0.3370289206504822, "learning_rate": 1.2972515692831178e-05, "loss": 1.2266, "step": 14156 }, { "epoch": 4.216608648709023, "grad_norm": 0.3698771595954895, "learning_rate": 1.2971594711814093e-05, "loss": 1.2333, "step": 14157 }, { "epoch": 4.216906494908692, "grad_norm": 0.3698229193687439, "learning_rate": 1.297067370314964e-05, "loss": 1.2148, "step": 14158 }, { "epoch": 4.21720434110836, "grad_norm": 0.4046650230884552, "learning_rate": 1.2969752666846384e-05, "loss": 1.2057, "step": 14159 }, { "epoch": 4.217502187308029, "grad_norm": 0.3872756063938141, "learning_rate": 1.2968831602912902e-05, "loss": 1.2098, "step": 14160 }, { "epoch": 4.2178000335076975, "grad_norm": 0.3703349232673645, "learning_rate": 1.2967910511357755e-05, "loss": 1.2177, "step": 14161 }, { "epoch": 4.218097879707366, "grad_norm": 0.28257542848587036, "learning_rate": 1.296698939218952e-05, "loss": 1.2281, "step": 14162 }, { "epoch": 4.218395725907035, "grad_norm": 0.3109702467918396, "learning_rate": 1.2966068245416762e-05, "loss": 1.2281, "step": 14163 }, { "epoch": 4.218693572106703, "grad_norm": 0.28986650705337524, "learning_rate": 1.296514707104805e-05, "loss": 1.2254, "step": 14164 }, { "epoch": 4.218991418306372, "grad_norm": 0.2609320878982544, "learning_rate": 1.2964225869091962e-05, "loss": 1.226, "step": 14165 }, { "epoch": 4.219289264506041, "grad_norm": 0.32675209641456604, "learning_rate": 1.2963304639557061e-05, "loss": 1.2322, "step": 14166 }, { "epoch": 4.219587110705709, "grad_norm": 0.2727464437484741, "learning_rate": 1.2962383382451922e-05, "loss": 1.2296, "step": 14167 }, { "epoch": 4.219884956905378, "grad_norm": 0.29681476950645447, "learning_rate": 1.2961462097785116e-05, "loss": 1.2254, "step": 14168 }, { "epoch": 4.220182803105047, "grad_norm": 0.26645466685295105, "learning_rate": 1.2960540785565216e-05, "loss": 1.2223, "step": 14169 }, { "epoch": 4.220480649304715, "grad_norm": 0.28149473667144775, "learning_rate": 1.2959619445800786e-05, "loss": 1.245, "step": 14170 }, { "epoch": 4.220778495504384, "grad_norm": 0.2772013247013092, "learning_rate": 1.295869807850041e-05, "loss": 1.2414, "step": 14171 }, { "epoch": 4.221076341704053, "grad_norm": 0.32597294449806213, "learning_rate": 1.2957776683672651e-05, "loss": 1.2328, "step": 14172 }, { "epoch": 4.221374187903721, "grad_norm": 0.2925127148628235, "learning_rate": 1.2956855261326085e-05, "loss": 1.2263, "step": 14173 }, { "epoch": 4.22167203410339, "grad_norm": 0.4555399715900421, "learning_rate": 1.2955933811469284e-05, "loss": 1.2173, "step": 14174 }, { "epoch": 4.2219698803030585, "grad_norm": 0.42685988545417786, "learning_rate": 1.295501233411082e-05, "loss": 1.2259, "step": 14175 }, { "epoch": 4.222267726502727, "grad_norm": 0.29122740030288696, "learning_rate": 1.295409082925927e-05, "loss": 1.24, "step": 14176 }, { "epoch": 4.222565572702396, "grad_norm": 0.35038653016090393, "learning_rate": 1.2953169296923202e-05, "loss": 1.2337, "step": 14177 }, { "epoch": 4.222863418902064, "grad_norm": 0.2838687300682068, "learning_rate": 1.2952247737111196e-05, "loss": 1.2201, "step": 14178 }, { "epoch": 4.223161265101733, "grad_norm": 0.3801766037940979, "learning_rate": 1.2951326149831826e-05, "loss": 1.2299, "step": 14179 }, { "epoch": 4.223459111301402, "grad_norm": 0.2637985050678253, "learning_rate": 1.2950404535093659e-05, "loss": 1.2275, "step": 14180 }, { "epoch": 4.22375695750107, "grad_norm": 0.6842769980430603, "learning_rate": 1.2949482892905276e-05, "loss": 1.2197, "step": 14181 }, { "epoch": 4.224054803700739, "grad_norm": 0.5419729948043823, "learning_rate": 1.294856122327525e-05, "loss": 1.2285, "step": 14182 }, { "epoch": 4.224352649900408, "grad_norm": 0.4426734149456024, "learning_rate": 1.2947639526212157e-05, "loss": 1.2243, "step": 14183 }, { "epoch": 4.224650496100076, "grad_norm": 0.5549633502960205, "learning_rate": 1.2946717801724571e-05, "loss": 1.214, "step": 14184 }, { "epoch": 4.224948342299745, "grad_norm": 0.29709017276763916, "learning_rate": 1.2945796049821066e-05, "loss": 1.2121, "step": 14185 }, { "epoch": 4.225246188499414, "grad_norm": 0.4249708652496338, "learning_rate": 1.2944874270510223e-05, "loss": 1.2439, "step": 14186 }, { "epoch": 4.225544034699082, "grad_norm": 0.29149335622787476, "learning_rate": 1.2943952463800616e-05, "loss": 1.2213, "step": 14187 }, { "epoch": 4.225841880898751, "grad_norm": 0.3362438976764679, "learning_rate": 1.2943030629700815e-05, "loss": 1.2235, "step": 14188 }, { "epoch": 4.22613972709842, "grad_norm": 0.26623889803886414, "learning_rate": 1.2942108768219406e-05, "loss": 1.2314, "step": 14189 }, { "epoch": 4.226437573298088, "grad_norm": 0.31887322664260864, "learning_rate": 1.2941186879364961e-05, "loss": 1.215, "step": 14190 }, { "epoch": 4.226735419497757, "grad_norm": 0.2840903103351593, "learning_rate": 1.2940264963146059e-05, "loss": 1.208, "step": 14191 }, { "epoch": 4.227033265697425, "grad_norm": 0.33410122990608215, "learning_rate": 1.2939343019571275e-05, "loss": 1.2239, "step": 14192 }, { "epoch": 4.227331111897094, "grad_norm": 0.28891879320144653, "learning_rate": 1.293842104864919e-05, "loss": 1.2237, "step": 14193 }, { "epoch": 4.227628958096763, "grad_norm": 0.3174760341644287, "learning_rate": 1.2937499050388377e-05, "loss": 1.2228, "step": 14194 }, { "epoch": 4.227926804296431, "grad_norm": 0.2664237916469574, "learning_rate": 1.2936577024797421e-05, "loss": 1.2337, "step": 14195 }, { "epoch": 4.2282246504961005, "grad_norm": 0.3116903007030487, "learning_rate": 1.2935654971884897e-05, "loss": 1.2027, "step": 14196 }, { "epoch": 4.228522496695769, "grad_norm": 0.36390095949172974, "learning_rate": 1.2934732891659382e-05, "loss": 1.2338, "step": 14197 }, { "epoch": 4.228820342895437, "grad_norm": 0.2807815968990326, "learning_rate": 1.2933810784129457e-05, "loss": 1.2329, "step": 14198 }, { "epoch": 4.229118189095106, "grad_norm": 0.36470362544059753, "learning_rate": 1.2932888649303699e-05, "loss": 1.2241, "step": 14199 }, { "epoch": 4.229416035294775, "grad_norm": 0.25736430287361145, "learning_rate": 1.2931966487190691e-05, "loss": 1.2186, "step": 14200 }, { "epoch": 4.229713881494443, "grad_norm": 0.39754676818847656, "learning_rate": 1.2931044297799007e-05, "loss": 1.2139, "step": 14201 }, { "epoch": 4.230011727694112, "grad_norm": 0.3175043761730194, "learning_rate": 1.2930122081137234e-05, "loss": 1.2213, "step": 14202 }, { "epoch": 4.230309573893781, "grad_norm": 0.43965598940849304, "learning_rate": 1.2929199837213949e-05, "loss": 1.2295, "step": 14203 }, { "epoch": 4.230607420093449, "grad_norm": 0.29722118377685547, "learning_rate": 1.292827756603773e-05, "loss": 1.2271, "step": 14204 }, { "epoch": 4.230905266293118, "grad_norm": 0.28060266375541687, "learning_rate": 1.292735526761716e-05, "loss": 1.2311, "step": 14205 }, { "epoch": 4.2312031124927865, "grad_norm": 0.2804664373397827, "learning_rate": 1.2926432941960822e-05, "loss": 1.2219, "step": 14206 }, { "epoch": 4.231500958692455, "grad_norm": 0.27232399582862854, "learning_rate": 1.2925510589077292e-05, "loss": 1.2217, "step": 14207 }, { "epoch": 4.231798804892124, "grad_norm": 0.2631955146789551, "learning_rate": 1.2924588208975158e-05, "loss": 1.2178, "step": 14208 }, { "epoch": 4.232096651091792, "grad_norm": 0.29186931252479553, "learning_rate": 1.2923665801662995e-05, "loss": 1.2332, "step": 14209 }, { "epoch": 4.232394497291461, "grad_norm": 0.3281896412372589, "learning_rate": 1.292274336714939e-05, "loss": 1.2254, "step": 14210 }, { "epoch": 4.23269234349113, "grad_norm": 0.25704681873321533, "learning_rate": 1.2921820905442923e-05, "loss": 1.2138, "step": 14211 }, { "epoch": 4.232990189690798, "grad_norm": 0.2990695536136627, "learning_rate": 1.2920898416552177e-05, "loss": 1.2292, "step": 14212 }, { "epoch": 4.233288035890467, "grad_norm": 0.2677648067474365, "learning_rate": 1.2919975900485735e-05, "loss": 1.2259, "step": 14213 }, { "epoch": 4.233585882090136, "grad_norm": 0.25805628299713135, "learning_rate": 1.2919053357252181e-05, "loss": 1.223, "step": 14214 }, { "epoch": 4.233883728289804, "grad_norm": 0.25021156668663025, "learning_rate": 1.2918130786860094e-05, "loss": 1.2151, "step": 14215 }, { "epoch": 4.234181574489473, "grad_norm": 0.34445720911026, "learning_rate": 1.2917208189318063e-05, "loss": 1.2344, "step": 14216 }, { "epoch": 4.234479420689142, "grad_norm": 0.26035237312316895, "learning_rate": 1.2916285564634667e-05, "loss": 1.2167, "step": 14217 }, { "epoch": 4.23477726688881, "grad_norm": 0.31001225113868713, "learning_rate": 1.2915362912818492e-05, "loss": 1.2054, "step": 14218 }, { "epoch": 4.235075113088479, "grad_norm": 0.26624149084091187, "learning_rate": 1.2914440233878123e-05, "loss": 1.2323, "step": 14219 }, { "epoch": 4.2353729592881475, "grad_norm": 0.28126007318496704, "learning_rate": 1.2913517527822146e-05, "loss": 1.2271, "step": 14220 }, { "epoch": 4.235670805487816, "grad_norm": 0.28897514939308167, "learning_rate": 1.2912594794659141e-05, "loss": 1.2294, "step": 14221 }, { "epoch": 4.235968651687485, "grad_norm": 0.27257728576660156, "learning_rate": 1.2911672034397695e-05, "loss": 1.2132, "step": 14222 }, { "epoch": 4.236266497887153, "grad_norm": 0.42830389738082886, "learning_rate": 1.2910749247046397e-05, "loss": 1.2183, "step": 14223 }, { "epoch": 4.236564344086823, "grad_norm": 0.25782597064971924, "learning_rate": 1.2909826432613825e-05, "loss": 1.2186, "step": 14224 }, { "epoch": 4.236862190286491, "grad_norm": 0.2812288701534271, "learning_rate": 1.2908903591108572e-05, "loss": 1.2252, "step": 14225 }, { "epoch": 4.237160036486159, "grad_norm": 0.30920568108558655, "learning_rate": 1.2907980722539219e-05, "loss": 1.2352, "step": 14226 }, { "epoch": 4.2374578826858285, "grad_norm": 0.3106936812400818, "learning_rate": 1.2907057826914357e-05, "loss": 1.2217, "step": 14227 }, { "epoch": 4.237755728885497, "grad_norm": 0.3300728499889374, "learning_rate": 1.290613490424257e-05, "loss": 1.2418, "step": 14228 }, { "epoch": 4.238053575085165, "grad_norm": 0.39398500323295593, "learning_rate": 1.2905211954532441e-05, "loss": 1.2237, "step": 14229 }, { "epoch": 4.238351421284834, "grad_norm": 0.3569934666156769, "learning_rate": 1.2904288977792566e-05, "loss": 1.2324, "step": 14230 }, { "epoch": 4.238649267484503, "grad_norm": 0.6860139966011047, "learning_rate": 1.2903365974031523e-05, "loss": 1.2127, "step": 14231 }, { "epoch": 4.238947113684171, "grad_norm": 0.4754354655742645, "learning_rate": 1.2902442943257905e-05, "loss": 1.232, "step": 14232 }, { "epoch": 4.23924495988384, "grad_norm": 0.5190975069999695, "learning_rate": 1.2901519885480297e-05, "loss": 1.2271, "step": 14233 }, { "epoch": 4.2395428060835085, "grad_norm": 0.47939419746398926, "learning_rate": 1.2900596800707288e-05, "loss": 1.2143, "step": 14234 }, { "epoch": 4.239840652283177, "grad_norm": 0.4061165452003479, "learning_rate": 1.2899673688947467e-05, "loss": 1.226, "step": 14235 }, { "epoch": 4.240138498482846, "grad_norm": 0.3673345446586609, "learning_rate": 1.2898750550209422e-05, "loss": 1.2187, "step": 14236 }, { "epoch": 4.240436344682514, "grad_norm": 0.4002215564250946, "learning_rate": 1.2897827384501744e-05, "loss": 1.2104, "step": 14237 }, { "epoch": 4.240734190882184, "grad_norm": 0.266471266746521, "learning_rate": 1.2896904191833018e-05, "loss": 1.2061, "step": 14238 }, { "epoch": 4.241032037081852, "grad_norm": 0.3146986663341522, "learning_rate": 1.2895980972211835e-05, "loss": 1.2267, "step": 14239 }, { "epoch": 4.24132988328152, "grad_norm": 0.25907179713249207, "learning_rate": 1.2895057725646785e-05, "loss": 1.2282, "step": 14240 }, { "epoch": 4.2416277294811895, "grad_norm": 0.2718636989593506, "learning_rate": 1.2894134452146457e-05, "loss": 1.2281, "step": 14241 }, { "epoch": 4.241925575680858, "grad_norm": 0.27919113636016846, "learning_rate": 1.2893211151719441e-05, "loss": 1.2328, "step": 14242 }, { "epoch": 4.242223421880526, "grad_norm": 0.2516349256038666, "learning_rate": 1.2892287824374328e-05, "loss": 1.2097, "step": 14243 }, { "epoch": 4.242521268080195, "grad_norm": 0.3680274784564972, "learning_rate": 1.2891364470119711e-05, "loss": 1.2298, "step": 14244 }, { "epoch": 4.242819114279864, "grad_norm": 0.26119786500930786, "learning_rate": 1.2890441088964174e-05, "loss": 1.2286, "step": 14245 }, { "epoch": 4.243116960479532, "grad_norm": 0.307192862033844, "learning_rate": 1.2889517680916314e-05, "loss": 1.2067, "step": 14246 }, { "epoch": 4.243414806679201, "grad_norm": 0.3036113977432251, "learning_rate": 1.288859424598472e-05, "loss": 1.2199, "step": 14247 }, { "epoch": 4.24371265287887, "grad_norm": 0.34055301547050476, "learning_rate": 1.2887670784177983e-05, "loss": 1.2278, "step": 14248 }, { "epoch": 4.244010499078538, "grad_norm": 0.27632489800453186, "learning_rate": 1.2886747295504698e-05, "loss": 1.2235, "step": 14249 }, { "epoch": 4.244308345278207, "grad_norm": 0.27771714329719543, "learning_rate": 1.2885823779973451e-05, "loss": 1.2332, "step": 14250 }, { "epoch": 4.244606191477875, "grad_norm": 0.2673502266407013, "learning_rate": 1.2884900237592843e-05, "loss": 1.2194, "step": 14251 }, { "epoch": 4.244904037677545, "grad_norm": 0.25433048605918884, "learning_rate": 1.2883976668371456e-05, "loss": 1.2304, "step": 14252 }, { "epoch": 4.245201883877213, "grad_norm": 0.2676333487033844, "learning_rate": 1.2883053072317891e-05, "loss": 1.2256, "step": 14253 }, { "epoch": 4.245499730076881, "grad_norm": 0.2577076554298401, "learning_rate": 1.288212944944074e-05, "loss": 1.2295, "step": 14254 }, { "epoch": 4.2457975762765505, "grad_norm": 0.31689655780792236, "learning_rate": 1.2881205799748593e-05, "loss": 1.2179, "step": 14255 }, { "epoch": 4.246095422476219, "grad_norm": 0.280807226896286, "learning_rate": 1.2880282123250045e-05, "loss": 1.2518, "step": 14256 }, { "epoch": 4.246393268675887, "grad_norm": 0.30824899673461914, "learning_rate": 1.2879358419953691e-05, "loss": 1.2263, "step": 14257 }, { "epoch": 4.246691114875556, "grad_norm": 0.2893029451370239, "learning_rate": 1.287843468986812e-05, "loss": 1.2277, "step": 14258 }, { "epoch": 4.246988961075225, "grad_norm": 0.3122074007987976, "learning_rate": 1.2877510933001933e-05, "loss": 1.2029, "step": 14259 }, { "epoch": 4.247286807274893, "grad_norm": 0.36463871598243713, "learning_rate": 1.2876587149363722e-05, "loss": 1.2354, "step": 14260 }, { "epoch": 4.247584653474562, "grad_norm": 0.29337945580482483, "learning_rate": 1.287566333896208e-05, "loss": 1.2265, "step": 14261 }, { "epoch": 4.247882499674231, "grad_norm": 0.270013689994812, "learning_rate": 1.2874739501805607e-05, "loss": 1.2264, "step": 14262 }, { "epoch": 4.2481803458739, "grad_norm": 0.28883761167526245, "learning_rate": 1.287381563790289e-05, "loss": 1.2119, "step": 14263 }, { "epoch": 4.248478192073568, "grad_norm": 0.5028244853019714, "learning_rate": 1.2872891747262534e-05, "loss": 1.2346, "step": 14264 }, { "epoch": 4.2487760382732365, "grad_norm": 0.267386794090271, "learning_rate": 1.2871967829893128e-05, "loss": 1.2188, "step": 14265 }, { "epoch": 4.249073884472906, "grad_norm": 0.4257965683937073, "learning_rate": 1.287104388580327e-05, "loss": 1.2298, "step": 14266 }, { "epoch": 4.249371730672574, "grad_norm": 0.3177397549152374, "learning_rate": 1.2870119915001555e-05, "loss": 1.2432, "step": 14267 }, { "epoch": 4.249669576872242, "grad_norm": 0.3956317603588104, "learning_rate": 1.2869195917496585e-05, "loss": 1.2344, "step": 14268 }, { "epoch": 4.2499674230719116, "grad_norm": 0.35344603657722473, "learning_rate": 1.2868271893296949e-05, "loss": 1.206, "step": 14269 }, { "epoch": 4.25026526927158, "grad_norm": 0.3135831356048584, "learning_rate": 1.2867347842411246e-05, "loss": 1.2191, "step": 14270 }, { "epoch": 4.250563115471248, "grad_norm": 0.3514217734336853, "learning_rate": 1.2866423764848079e-05, "loss": 1.2314, "step": 14271 }, { "epoch": 4.250860961670917, "grad_norm": 0.2604618966579437, "learning_rate": 1.2865499660616039e-05, "loss": 1.2224, "step": 14272 }, { "epoch": 4.251158807870586, "grad_norm": 0.31919756531715393, "learning_rate": 1.2864575529723726e-05, "loss": 1.2254, "step": 14273 }, { "epoch": 4.251456654070254, "grad_norm": 0.24949854612350464, "learning_rate": 1.2863651372179742e-05, "loss": 1.2261, "step": 14274 }, { "epoch": 4.251754500269923, "grad_norm": 0.254134863615036, "learning_rate": 1.2862727187992676e-05, "loss": 1.2409, "step": 14275 }, { "epoch": 4.252052346469592, "grad_norm": 0.2743396461009979, "learning_rate": 1.2861802977171133e-05, "loss": 1.2387, "step": 14276 }, { "epoch": 4.25235019266926, "grad_norm": 0.44552743434906006, "learning_rate": 1.2860878739723713e-05, "loss": 1.2083, "step": 14277 }, { "epoch": 4.252648038868929, "grad_norm": 0.3872476816177368, "learning_rate": 1.2859954475659013e-05, "loss": 1.2043, "step": 14278 }, { "epoch": 4.2529458850685975, "grad_norm": 0.3023822009563446, "learning_rate": 1.285903018498563e-05, "loss": 1.2316, "step": 14279 }, { "epoch": 4.253243731268267, "grad_norm": 0.453902930021286, "learning_rate": 1.2858105867712167e-05, "loss": 1.2175, "step": 14280 }, { "epoch": 4.253541577467935, "grad_norm": 0.25171342492103577, "learning_rate": 1.2857181523847222e-05, "loss": 1.2347, "step": 14281 }, { "epoch": 4.253839423667603, "grad_norm": 0.47940343618392944, "learning_rate": 1.2856257153399395e-05, "loss": 1.2295, "step": 14282 }, { "epoch": 4.254137269867273, "grad_norm": 0.2452671080827713, "learning_rate": 1.2855332756377284e-05, "loss": 1.2132, "step": 14283 }, { "epoch": 4.254435116066941, "grad_norm": 0.5154544115066528, "learning_rate": 1.2854408332789494e-05, "loss": 1.2238, "step": 14284 }, { "epoch": 4.254732962266609, "grad_norm": 0.31857696175575256, "learning_rate": 1.2853483882644625e-05, "loss": 1.2058, "step": 14285 }, { "epoch": 4.2550308084662785, "grad_norm": 0.3479495942592621, "learning_rate": 1.2852559405951274e-05, "loss": 1.2229, "step": 14286 }, { "epoch": 4.255328654665947, "grad_norm": 0.26344847679138184, "learning_rate": 1.2851634902718047e-05, "loss": 1.2437, "step": 14287 }, { "epoch": 4.255626500865615, "grad_norm": 0.3922542929649353, "learning_rate": 1.2850710372953541e-05, "loss": 1.2248, "step": 14288 }, { "epoch": 4.255924347065284, "grad_norm": 0.27182137966156006, "learning_rate": 1.284978581666636e-05, "loss": 1.2076, "step": 14289 }, { "epoch": 4.256222193264953, "grad_norm": 0.3713935613632202, "learning_rate": 1.2848861233865107e-05, "loss": 1.2165, "step": 14290 }, { "epoch": 4.256520039464622, "grad_norm": 0.306208074092865, "learning_rate": 1.2847936624558383e-05, "loss": 1.234, "step": 14291 }, { "epoch": 4.25681788566429, "grad_norm": 0.3006187379360199, "learning_rate": 1.2847011988754794e-05, "loss": 1.2088, "step": 14292 }, { "epoch": 4.2571157318639585, "grad_norm": 0.2920163869857788, "learning_rate": 1.2846087326462932e-05, "loss": 1.2279, "step": 14293 }, { "epoch": 4.257413578063628, "grad_norm": 0.3001825213432312, "learning_rate": 1.2845162637691414e-05, "loss": 1.2356, "step": 14294 }, { "epoch": 4.257711424263296, "grad_norm": 0.290507972240448, "learning_rate": 1.2844237922448835e-05, "loss": 1.2277, "step": 14295 }, { "epoch": 4.258009270462964, "grad_norm": 0.3379744589328766, "learning_rate": 1.2843313180743797e-05, "loss": 1.2146, "step": 14296 }, { "epoch": 4.258307116662634, "grad_norm": 0.26328563690185547, "learning_rate": 1.2842388412584907e-05, "loss": 1.2225, "step": 14297 }, { "epoch": 4.258604962862302, "grad_norm": 0.32128599286079407, "learning_rate": 1.2841463617980772e-05, "loss": 1.2375, "step": 14298 }, { "epoch": 4.25890280906197, "grad_norm": 0.2736668884754181, "learning_rate": 1.2840538796939987e-05, "loss": 1.216, "step": 14299 }, { "epoch": 4.2592006552616395, "grad_norm": 0.26555588841438293, "learning_rate": 1.2839613949471168e-05, "loss": 1.2213, "step": 14300 }, { "epoch": 4.259498501461308, "grad_norm": 0.2700282037258148, "learning_rate": 1.2838689075582911e-05, "loss": 1.2363, "step": 14301 }, { "epoch": 4.259796347660977, "grad_norm": 0.2413858026266098, "learning_rate": 1.2837764175283827e-05, "loss": 1.2367, "step": 14302 }, { "epoch": 4.260094193860645, "grad_norm": 0.2933841347694397, "learning_rate": 1.2836839248582514e-05, "loss": 1.2148, "step": 14303 }, { "epoch": 4.260392040060314, "grad_norm": 0.24752715229988098, "learning_rate": 1.2835914295487581e-05, "loss": 1.2238, "step": 14304 }, { "epoch": 4.260689886259983, "grad_norm": 0.24317148327827454, "learning_rate": 1.2834989316007635e-05, "loss": 1.2135, "step": 14305 }, { "epoch": 4.260987732459651, "grad_norm": 0.2943892180919647, "learning_rate": 1.2834064310151283e-05, "loss": 1.228, "step": 14306 }, { "epoch": 4.26128557865932, "grad_norm": 0.2483043521642685, "learning_rate": 1.2833139277927125e-05, "loss": 1.2301, "step": 14307 }, { "epoch": 4.261583424858989, "grad_norm": 0.28642261028289795, "learning_rate": 1.2832214219343773e-05, "loss": 1.2229, "step": 14308 }, { "epoch": 4.261881271058657, "grad_norm": 0.2563004195690155, "learning_rate": 1.2831289134409834e-05, "loss": 1.2426, "step": 14309 }, { "epoch": 4.262179117258325, "grad_norm": 0.30432990193367004, "learning_rate": 1.2830364023133913e-05, "loss": 1.2275, "step": 14310 }, { "epoch": 4.262476963457995, "grad_norm": 0.29737430810928345, "learning_rate": 1.2829438885524611e-05, "loss": 1.2241, "step": 14311 }, { "epoch": 4.262774809657663, "grad_norm": 0.2994697690010071, "learning_rate": 1.2828513721590547e-05, "loss": 1.2146, "step": 14312 }, { "epoch": 4.263072655857331, "grad_norm": 0.30888044834136963, "learning_rate": 1.2827588531340323e-05, "loss": 1.2396, "step": 14313 }, { "epoch": 4.2633705020570005, "grad_norm": 0.42900556325912476, "learning_rate": 1.2826663314782547e-05, "loss": 1.2519, "step": 14314 }, { "epoch": 4.263668348256669, "grad_norm": 0.29265686869621277, "learning_rate": 1.2825738071925827e-05, "loss": 1.2235, "step": 14315 }, { "epoch": 4.263966194456337, "grad_norm": 0.31383705139160156, "learning_rate": 1.282481280277877e-05, "loss": 1.2239, "step": 14316 }, { "epoch": 4.264264040656006, "grad_norm": 0.298958957195282, "learning_rate": 1.2823887507349983e-05, "loss": 1.2237, "step": 14317 }, { "epoch": 4.264561886855675, "grad_norm": 0.33606821298599243, "learning_rate": 1.282296218564808e-05, "loss": 1.2116, "step": 14318 }, { "epoch": 4.264859733055344, "grad_norm": 0.3485826551914215, "learning_rate": 1.282203683768167e-05, "loss": 1.2226, "step": 14319 }, { "epoch": 4.265157579255012, "grad_norm": 0.3396190106868744, "learning_rate": 1.282111146345936e-05, "loss": 1.2279, "step": 14320 }, { "epoch": 4.265455425454681, "grad_norm": 0.24842680990695953, "learning_rate": 1.2820186062989759e-05, "loss": 1.2215, "step": 14321 }, { "epoch": 4.26575327165435, "grad_norm": 0.2768348157405853, "learning_rate": 1.2819260636281477e-05, "loss": 1.2108, "step": 14322 }, { "epoch": 4.266051117854018, "grad_norm": 0.30374816060066223, "learning_rate": 1.2818335183343125e-05, "loss": 1.2133, "step": 14323 }, { "epoch": 4.2663489640536865, "grad_norm": 0.6033021807670593, "learning_rate": 1.2817409704183312e-05, "loss": 1.2129, "step": 14324 }, { "epoch": 4.266646810253356, "grad_norm": 0.5600675344467163, "learning_rate": 1.281648419881065e-05, "loss": 1.2161, "step": 14325 }, { "epoch": 4.266944656453024, "grad_norm": 0.27175745368003845, "learning_rate": 1.2815558667233751e-05, "loss": 1.2339, "step": 14326 }, { "epoch": 4.267242502652692, "grad_norm": 0.5855613350868225, "learning_rate": 1.2814633109461224e-05, "loss": 1.2074, "step": 14327 }, { "epoch": 4.2675403488523616, "grad_norm": 0.29754626750946045, "learning_rate": 1.281370752550168e-05, "loss": 1.2185, "step": 14328 }, { "epoch": 4.26783819505203, "grad_norm": 0.520555853843689, "learning_rate": 1.281278191536373e-05, "loss": 1.219, "step": 14329 }, { "epoch": 4.268136041251699, "grad_norm": 0.35054296255111694, "learning_rate": 1.2811856279055985e-05, "loss": 1.2157, "step": 14330 }, { "epoch": 4.268433887451367, "grad_norm": 0.36541903018951416, "learning_rate": 1.2810930616587063e-05, "loss": 1.2146, "step": 14331 }, { "epoch": 4.268731733651036, "grad_norm": 0.37231627106666565, "learning_rate": 1.2810004927965568e-05, "loss": 1.2109, "step": 14332 }, { "epoch": 4.269029579850705, "grad_norm": 0.30162325501441956, "learning_rate": 1.2809079213200124e-05, "loss": 1.2299, "step": 14333 }, { "epoch": 4.269327426050373, "grad_norm": 0.2992917001247406, "learning_rate": 1.2808153472299329e-05, "loss": 1.2135, "step": 14334 }, { "epoch": 4.269625272250042, "grad_norm": 0.34873780608177185, "learning_rate": 1.2807227705271806e-05, "loss": 1.2145, "step": 14335 }, { "epoch": 4.269923118449711, "grad_norm": 0.2616593837738037, "learning_rate": 1.2806301912126166e-05, "loss": 1.2382, "step": 14336 }, { "epoch": 4.270220964649379, "grad_norm": 0.536194920539856, "learning_rate": 1.2805376092871022e-05, "loss": 1.2171, "step": 14337 }, { "epoch": 4.2705188108490475, "grad_norm": 0.3619884252548218, "learning_rate": 1.2804450247514988e-05, "loss": 1.216, "step": 14338 }, { "epoch": 4.270816657048717, "grad_norm": 0.42572298645973206, "learning_rate": 1.280352437606668e-05, "loss": 1.2324, "step": 14339 }, { "epoch": 4.271114503248385, "grad_norm": 0.2787296175956726, "learning_rate": 1.2802598478534705e-05, "loss": 1.2221, "step": 14340 }, { "epoch": 4.271412349448053, "grad_norm": 0.4477073550224304, "learning_rate": 1.2801672554927684e-05, "loss": 1.2267, "step": 14341 }, { "epoch": 4.271710195647723, "grad_norm": 0.24937401711940765, "learning_rate": 1.280074660525423e-05, "loss": 1.2149, "step": 14342 }, { "epoch": 4.272008041847391, "grad_norm": 0.40457332134246826, "learning_rate": 1.279982062952296e-05, "loss": 1.2414, "step": 14343 }, { "epoch": 4.272305888047059, "grad_norm": 0.37612777948379517, "learning_rate": 1.2798894627742486e-05, "loss": 1.2288, "step": 14344 }, { "epoch": 4.2726037342467285, "grad_norm": 0.4070606231689453, "learning_rate": 1.2797968599921426e-05, "loss": 1.2226, "step": 14345 }, { "epoch": 4.272901580446397, "grad_norm": 0.380602091550827, "learning_rate": 1.2797042546068392e-05, "loss": 1.2367, "step": 14346 }, { "epoch": 4.273199426646066, "grad_norm": 0.3362421691417694, "learning_rate": 1.2796116466192004e-05, "loss": 1.2326, "step": 14347 }, { "epoch": 4.273497272845734, "grad_norm": 0.37402573227882385, "learning_rate": 1.2795190360300875e-05, "loss": 1.2382, "step": 14348 }, { "epoch": 4.273795119045403, "grad_norm": 0.2479506880044937, "learning_rate": 1.2794264228403624e-05, "loss": 1.22, "step": 14349 }, { "epoch": 4.274092965245072, "grad_norm": 0.7761954069137573, "learning_rate": 1.2793338070508865e-05, "loss": 1.2152, "step": 14350 }, { "epoch": 4.27439081144474, "grad_norm": 0.4211745262145996, "learning_rate": 1.2792411886625215e-05, "loss": 1.2271, "step": 14351 }, { "epoch": 4.2746886576444085, "grad_norm": 0.45816418528556824, "learning_rate": 1.2791485676761295e-05, "loss": 1.2294, "step": 14352 }, { "epoch": 4.274986503844078, "grad_norm": 0.4153323173522949, "learning_rate": 1.2790559440925718e-05, "loss": 1.2453, "step": 14353 }, { "epoch": 4.275284350043746, "grad_norm": 0.40850386023521423, "learning_rate": 1.2789633179127103e-05, "loss": 1.2236, "step": 14354 }, { "epoch": 4.275582196243414, "grad_norm": 0.6133487224578857, "learning_rate": 1.2788706891374068e-05, "loss": 1.2135, "step": 14355 }, { "epoch": 4.275880042443084, "grad_norm": 0.30651339888572693, "learning_rate": 1.2787780577675235e-05, "loss": 1.2315, "step": 14356 }, { "epoch": 4.276177888642752, "grad_norm": 0.2998017370700836, "learning_rate": 1.2786854238039215e-05, "loss": 1.233, "step": 14357 }, { "epoch": 4.276475734842421, "grad_norm": 0.3250484764575958, "learning_rate": 1.2785927872474627e-05, "loss": 1.2161, "step": 14358 }, { "epoch": 4.2767735810420895, "grad_norm": 0.28351834416389465, "learning_rate": 1.2785001480990095e-05, "loss": 1.2064, "step": 14359 }, { "epoch": 4.277071427241758, "grad_norm": 0.25644004344940186, "learning_rate": 1.278407506359424e-05, "loss": 1.2167, "step": 14360 }, { "epoch": 4.277369273441427, "grad_norm": 0.30562421679496765, "learning_rate": 1.2783148620295674e-05, "loss": 1.2369, "step": 14361 }, { "epoch": 4.277667119641095, "grad_norm": 0.3682887852191925, "learning_rate": 1.2782222151103017e-05, "loss": 1.2195, "step": 14362 }, { "epoch": 4.277964965840764, "grad_norm": 0.25823503732681274, "learning_rate": 1.2781295656024895e-05, "loss": 1.2285, "step": 14363 }, { "epoch": 4.278262812040433, "grad_norm": 0.30589163303375244, "learning_rate": 1.2780369135069921e-05, "loss": 1.224, "step": 14364 }, { "epoch": 4.278560658240101, "grad_norm": 0.28790926933288574, "learning_rate": 1.2779442588246723e-05, "loss": 1.2241, "step": 14365 }, { "epoch": 4.27885850443977, "grad_norm": 0.24648842215538025, "learning_rate": 1.2778516015563914e-05, "loss": 1.2176, "step": 14366 }, { "epoch": 4.279156350639439, "grad_norm": 0.3285800516605377, "learning_rate": 1.277758941703012e-05, "loss": 1.2218, "step": 14367 }, { "epoch": 4.279454196839107, "grad_norm": 0.2547835409641266, "learning_rate": 1.2776662792653957e-05, "loss": 1.2142, "step": 14368 }, { "epoch": 4.279752043038776, "grad_norm": 0.2792496085166931, "learning_rate": 1.277573614244405e-05, "loss": 1.2032, "step": 14369 }, { "epoch": 4.280049889238445, "grad_norm": 0.25271403789520264, "learning_rate": 1.2774809466409019e-05, "loss": 1.222, "step": 14370 }, { "epoch": 4.280347735438113, "grad_norm": 0.2696061134338379, "learning_rate": 1.2773882764557489e-05, "loss": 1.24, "step": 14371 }, { "epoch": 4.280645581637782, "grad_norm": 0.24025243520736694, "learning_rate": 1.2772956036898076e-05, "loss": 1.2281, "step": 14372 }, { "epoch": 4.2809434278374505, "grad_norm": 0.26664793491363525, "learning_rate": 1.277202928343941e-05, "loss": 1.2175, "step": 14373 }, { "epoch": 4.281241274037119, "grad_norm": 0.2511109411716461, "learning_rate": 1.2771102504190103e-05, "loss": 1.2258, "step": 14374 }, { "epoch": 4.281539120236788, "grad_norm": 0.2659510374069214, "learning_rate": 1.2770175699158784e-05, "loss": 1.2186, "step": 14375 }, { "epoch": 4.281836966436456, "grad_norm": 0.2575090229511261, "learning_rate": 1.2769248868354074e-05, "loss": 1.2288, "step": 14376 }, { "epoch": 4.282134812636125, "grad_norm": 0.2587834298610687, "learning_rate": 1.2768322011784601e-05, "loss": 1.2122, "step": 14377 }, { "epoch": 4.282432658835794, "grad_norm": 0.2907363176345825, "learning_rate": 1.2767395129458983e-05, "loss": 1.2136, "step": 14378 }, { "epoch": 4.282730505035462, "grad_norm": 0.2576514184474945, "learning_rate": 1.2766468221385843e-05, "loss": 1.2219, "step": 14379 }, { "epoch": 4.283028351235131, "grad_norm": 0.3286265730857849, "learning_rate": 1.2765541287573814e-05, "loss": 1.228, "step": 14380 }, { "epoch": 4.2833261974348, "grad_norm": 0.3233201801776886, "learning_rate": 1.2764614328031506e-05, "loss": 1.2254, "step": 14381 }, { "epoch": 4.283624043634468, "grad_norm": 0.3771904408931732, "learning_rate": 1.2763687342767553e-05, "loss": 1.2163, "step": 14382 }, { "epoch": 4.2839218898341365, "grad_norm": 0.26839783787727356, "learning_rate": 1.2762760331790576e-05, "loss": 1.2104, "step": 14383 }, { "epoch": 4.284219736033806, "grad_norm": 0.3020113408565521, "learning_rate": 1.2761833295109205e-05, "loss": 1.239, "step": 14384 }, { "epoch": 4.284517582233474, "grad_norm": 0.2633955478668213, "learning_rate": 1.2760906232732055e-05, "loss": 1.228, "step": 14385 }, { "epoch": 4.284815428433143, "grad_norm": 0.28546205163002014, "learning_rate": 1.275997914466776e-05, "loss": 1.2173, "step": 14386 }, { "epoch": 4.2851132746328116, "grad_norm": 0.3386612832546234, "learning_rate": 1.2759052030924944e-05, "loss": 1.2262, "step": 14387 }, { "epoch": 4.28541112083248, "grad_norm": 0.3652389645576477, "learning_rate": 1.275812489151223e-05, "loss": 1.2142, "step": 14388 }, { "epoch": 4.285708967032149, "grad_norm": 0.3611365854740143, "learning_rate": 1.2757197726438245e-05, "loss": 1.2137, "step": 14389 }, { "epoch": 4.286006813231817, "grad_norm": 0.28039783239364624, "learning_rate": 1.2756270535711615e-05, "loss": 1.2216, "step": 14390 }, { "epoch": 4.286304659431486, "grad_norm": 0.3334992825984955, "learning_rate": 1.2755343319340971e-05, "loss": 1.233, "step": 14391 }, { "epoch": 4.286602505631155, "grad_norm": 0.3214553892612457, "learning_rate": 1.2754416077334933e-05, "loss": 1.2095, "step": 14392 }, { "epoch": 4.286900351830823, "grad_norm": 0.3099585771560669, "learning_rate": 1.275348880970213e-05, "loss": 1.2207, "step": 14393 }, { "epoch": 4.287198198030492, "grad_norm": 0.43240270018577576, "learning_rate": 1.2752561516451191e-05, "loss": 1.2227, "step": 14394 }, { "epoch": 4.287496044230161, "grad_norm": 0.5841981172561646, "learning_rate": 1.2751634197590743e-05, "loss": 1.217, "step": 14395 }, { "epoch": 4.287793890429829, "grad_norm": 0.2586600184440613, "learning_rate": 1.2750706853129414e-05, "loss": 1.2301, "step": 14396 }, { "epoch": 4.288091736629498, "grad_norm": 0.4778662621974945, "learning_rate": 1.2749779483075831e-05, "loss": 1.218, "step": 14397 }, { "epoch": 4.288389582829167, "grad_norm": 0.35863474011421204, "learning_rate": 1.274885208743862e-05, "loss": 1.2102, "step": 14398 }, { "epoch": 4.288687429028835, "grad_norm": 0.566791832447052, "learning_rate": 1.274792466622641e-05, "loss": 1.2299, "step": 14399 }, { "epoch": 4.288985275228504, "grad_norm": 0.36866599321365356, "learning_rate": 1.2746997219447833e-05, "loss": 1.2211, "step": 14400 }, { "epoch": 4.289283121428173, "grad_norm": 0.9903460741043091, "learning_rate": 1.2746069747111518e-05, "loss": 1.235, "step": 14401 }, { "epoch": 4.289580967627841, "grad_norm": 0.47457778453826904, "learning_rate": 1.274514224922609e-05, "loss": 1.2186, "step": 14402 }, { "epoch": 4.28987881382751, "grad_norm": 0.5852064490318298, "learning_rate": 1.2744214725800182e-05, "loss": 1.2257, "step": 14403 }, { "epoch": 4.2901766600271785, "grad_norm": 0.26925256848335266, "learning_rate": 1.274328717684242e-05, "loss": 1.2183, "step": 14404 }, { "epoch": 4.290474506226847, "grad_norm": 0.554728090763092, "learning_rate": 1.2742359602361436e-05, "loss": 1.2098, "step": 14405 }, { "epoch": 4.290772352426516, "grad_norm": 0.35576191544532776, "learning_rate": 1.2741432002365862e-05, "loss": 1.2225, "step": 14406 }, { "epoch": 4.291070198626184, "grad_norm": 0.2810397744178772, "learning_rate": 1.2740504376864322e-05, "loss": 1.2048, "step": 14407 }, { "epoch": 4.291368044825853, "grad_norm": 0.3317042589187622, "learning_rate": 1.2739576725865457e-05, "loss": 1.1954, "step": 14408 }, { "epoch": 4.291665891025522, "grad_norm": 0.2716878354549408, "learning_rate": 1.2738649049377887e-05, "loss": 1.2193, "step": 14409 }, { "epoch": 4.29196373722519, "grad_norm": 0.24953965842723846, "learning_rate": 1.2737721347410247e-05, "loss": 1.2269, "step": 14410 }, { "epoch": 4.2922615834248585, "grad_norm": 0.26282334327697754, "learning_rate": 1.273679361997117e-05, "loss": 1.2088, "step": 14411 }, { "epoch": 4.292559429624528, "grad_norm": 0.2759837806224823, "learning_rate": 1.2735865867069285e-05, "loss": 1.222, "step": 14412 }, { "epoch": 4.292857275824196, "grad_norm": 0.25941190123558044, "learning_rate": 1.2734938088713226e-05, "loss": 1.2109, "step": 14413 }, { "epoch": 4.293155122023865, "grad_norm": 0.24825970828533173, "learning_rate": 1.2734010284911627e-05, "loss": 1.2357, "step": 14414 }, { "epoch": 4.293452968223534, "grad_norm": 0.2763080298900604, "learning_rate": 1.2733082455673111e-05, "loss": 1.2293, "step": 14415 }, { "epoch": 4.293750814423202, "grad_norm": 0.2807149291038513, "learning_rate": 1.273215460100632e-05, "loss": 1.2109, "step": 14416 }, { "epoch": 4.294048660622871, "grad_norm": 0.25246360898017883, "learning_rate": 1.2731226720919877e-05, "loss": 1.2116, "step": 14417 }, { "epoch": 4.2943465068225395, "grad_norm": 0.2973587214946747, "learning_rate": 1.2730298815422428e-05, "loss": 1.2133, "step": 14418 }, { "epoch": 4.294644353022208, "grad_norm": 0.27922913432121277, "learning_rate": 1.2729370884522597e-05, "loss": 1.2135, "step": 14419 }, { "epoch": 4.294942199221877, "grad_norm": 0.29587066173553467, "learning_rate": 1.2728442928229019e-05, "loss": 1.2293, "step": 14420 }, { "epoch": 4.295240045421545, "grad_norm": 0.28535765409469604, "learning_rate": 1.272751494655033e-05, "loss": 1.2109, "step": 14421 }, { "epoch": 4.295537891621214, "grad_norm": 0.25837650895118713, "learning_rate": 1.2726586939495156e-05, "loss": 1.2152, "step": 14422 }, { "epoch": 4.295835737820883, "grad_norm": 0.3583243191242218, "learning_rate": 1.2725658907072141e-05, "loss": 1.2325, "step": 14423 }, { "epoch": 4.296133584020551, "grad_norm": 0.28593847155570984, "learning_rate": 1.2724730849289913e-05, "loss": 1.2143, "step": 14424 }, { "epoch": 4.2964314302202204, "grad_norm": 0.29528507590293884, "learning_rate": 1.2723802766157114e-05, "loss": 1.2114, "step": 14425 }, { "epoch": 4.296729276419889, "grad_norm": 0.3226224184036255, "learning_rate": 1.272287465768237e-05, "loss": 1.2178, "step": 14426 }, { "epoch": 4.297027122619557, "grad_norm": 0.27137264609336853, "learning_rate": 1.2721946523874318e-05, "loss": 1.2291, "step": 14427 }, { "epoch": 4.297324968819226, "grad_norm": 0.6712373495101929, "learning_rate": 1.2721018364741596e-05, "loss": 1.2111, "step": 14428 }, { "epoch": 4.297622815018895, "grad_norm": 0.3394869863986969, "learning_rate": 1.2720090180292836e-05, "loss": 1.2155, "step": 14429 }, { "epoch": 4.297920661218563, "grad_norm": 0.45824187994003296, "learning_rate": 1.2719161970536677e-05, "loss": 1.2143, "step": 14430 }, { "epoch": 4.298218507418232, "grad_norm": 0.3214802145957947, "learning_rate": 1.2718233735481759e-05, "loss": 1.2471, "step": 14431 }, { "epoch": 4.2985163536179005, "grad_norm": 0.5555426478385925, "learning_rate": 1.2717305475136708e-05, "loss": 1.2186, "step": 14432 }, { "epoch": 4.298814199817569, "grad_norm": 0.34570813179016113, "learning_rate": 1.2716377189510167e-05, "loss": 1.2216, "step": 14433 }, { "epoch": 4.299112046017238, "grad_norm": 0.3341878056526184, "learning_rate": 1.2715448878610769e-05, "loss": 1.226, "step": 14434 }, { "epoch": 4.299409892216906, "grad_norm": 0.4007425904273987, "learning_rate": 1.2714520542447155e-05, "loss": 1.2323, "step": 14435 }, { "epoch": 4.299707738416576, "grad_norm": 0.27162936329841614, "learning_rate": 1.271359218102796e-05, "loss": 1.2418, "step": 14436 }, { "epoch": 4.300005584616244, "grad_norm": 0.4114162027835846, "learning_rate": 1.2712663794361821e-05, "loss": 1.2138, "step": 14437 }, { "epoch": 4.300303430815912, "grad_norm": 0.26284918189048767, "learning_rate": 1.271173538245738e-05, "loss": 1.2077, "step": 14438 }, { "epoch": 4.3006012770155815, "grad_norm": 0.2583646774291992, "learning_rate": 1.2710806945323269e-05, "loss": 1.2364, "step": 14439 }, { "epoch": 4.30089912321525, "grad_norm": 0.3204265236854553, "learning_rate": 1.2709878482968124e-05, "loss": 1.2261, "step": 14440 }, { "epoch": 4.301196969414918, "grad_norm": 0.26713070273399353, "learning_rate": 1.2708949995400589e-05, "loss": 1.2366, "step": 14441 }, { "epoch": 4.301494815614587, "grad_norm": 0.26088231801986694, "learning_rate": 1.2708021482629302e-05, "loss": 1.2177, "step": 14442 }, { "epoch": 4.301792661814256, "grad_norm": 0.4549257457256317, "learning_rate": 1.2707092944662902e-05, "loss": 1.2052, "step": 14443 }, { "epoch": 4.302090508013924, "grad_norm": 0.33362963795661926, "learning_rate": 1.2706164381510026e-05, "loss": 1.2259, "step": 14444 }, { "epoch": 4.302388354213593, "grad_norm": 0.34627485275268555, "learning_rate": 1.2705235793179313e-05, "loss": 1.2352, "step": 14445 }, { "epoch": 4.3026862004132616, "grad_norm": 0.27429014444351196, "learning_rate": 1.2704307179679402e-05, "loss": 1.222, "step": 14446 }, { "epoch": 4.30298404661293, "grad_norm": 0.34523606300354004, "learning_rate": 1.2703378541018936e-05, "loss": 1.2322, "step": 14447 }, { "epoch": 4.303281892812599, "grad_norm": 0.2928785979747772, "learning_rate": 1.2702449877206552e-05, "loss": 1.2133, "step": 14448 }, { "epoch": 4.303579739012267, "grad_norm": 0.3015083968639374, "learning_rate": 1.2701521188250897e-05, "loss": 1.2218, "step": 14449 }, { "epoch": 4.303877585211936, "grad_norm": 0.34831950068473816, "learning_rate": 1.27005924741606e-05, "loss": 1.2263, "step": 14450 }, { "epoch": 4.304175431411605, "grad_norm": 0.2671651244163513, "learning_rate": 1.2699663734944307e-05, "loss": 1.2203, "step": 14451 }, { "epoch": 4.304473277611273, "grad_norm": 0.5656589269638062, "learning_rate": 1.269873497061066e-05, "loss": 1.2161, "step": 14452 }, { "epoch": 4.3047711238109425, "grad_norm": 0.2613632380962372, "learning_rate": 1.26978061811683e-05, "loss": 1.2227, "step": 14453 }, { "epoch": 4.305068970010611, "grad_norm": 0.4541975259780884, "learning_rate": 1.2696877366625869e-05, "loss": 1.2298, "step": 14454 }, { "epoch": 4.305366816210279, "grad_norm": 0.2899562418460846, "learning_rate": 1.2695948526992005e-05, "loss": 1.2363, "step": 14455 }, { "epoch": 4.305664662409948, "grad_norm": 0.3821897804737091, "learning_rate": 1.2695019662275354e-05, "loss": 1.206, "step": 14456 }, { "epoch": 4.305962508609617, "grad_norm": 0.2865409255027771, "learning_rate": 1.2694090772484553e-05, "loss": 1.2306, "step": 14457 }, { "epoch": 4.306260354809285, "grad_norm": 0.29775720834732056, "learning_rate": 1.2693161857628246e-05, "loss": 1.2178, "step": 14458 }, { "epoch": 4.306558201008954, "grad_norm": 0.34338074922561646, "learning_rate": 1.2692232917715081e-05, "loss": 1.2248, "step": 14459 }, { "epoch": 4.306856047208623, "grad_norm": 0.27675661444664, "learning_rate": 1.2691303952753697e-05, "loss": 1.234, "step": 14460 }, { "epoch": 4.307153893408291, "grad_norm": 0.31029143929481506, "learning_rate": 1.2690374962752734e-05, "loss": 1.2366, "step": 14461 }, { "epoch": 4.30745173960796, "grad_norm": 0.29586637020111084, "learning_rate": 1.2689445947720842e-05, "loss": 1.22, "step": 14462 }, { "epoch": 4.3077495858076285, "grad_norm": 0.2669788599014282, "learning_rate": 1.2688516907666657e-05, "loss": 1.21, "step": 14463 }, { "epoch": 4.308047432007298, "grad_norm": 0.3429713547229767, "learning_rate": 1.2687587842598826e-05, "loss": 1.2196, "step": 14464 }, { "epoch": 4.308345278206966, "grad_norm": 0.26067858934402466, "learning_rate": 1.2686658752525992e-05, "loss": 1.2266, "step": 14465 }, { "epoch": 4.308643124406634, "grad_norm": 0.4082125127315521, "learning_rate": 1.2685729637456803e-05, "loss": 1.2159, "step": 14466 }, { "epoch": 4.3089409706063035, "grad_norm": 0.2630990445613861, "learning_rate": 1.26848004973999e-05, "loss": 1.2253, "step": 14467 }, { "epoch": 4.309238816805972, "grad_norm": 0.3698793351650238, "learning_rate": 1.2683871332363924e-05, "loss": 1.2233, "step": 14468 }, { "epoch": 4.30953666300564, "grad_norm": 0.2409800887107849, "learning_rate": 1.2682942142357526e-05, "loss": 1.217, "step": 14469 }, { "epoch": 4.309834509205309, "grad_norm": 0.29655179381370544, "learning_rate": 1.268201292738935e-05, "loss": 1.2208, "step": 14470 }, { "epoch": 4.310132355404978, "grad_norm": 0.2768140733242035, "learning_rate": 1.2681083687468038e-05, "loss": 1.2265, "step": 14471 }, { "epoch": 4.310430201604646, "grad_norm": 0.3115537464618683, "learning_rate": 1.2680154422602242e-05, "loss": 1.2583, "step": 14472 }, { "epoch": 4.310728047804315, "grad_norm": 0.2554987370967865, "learning_rate": 1.2679225132800601e-05, "loss": 1.2266, "step": 14473 }, { "epoch": 4.311025894003984, "grad_norm": 0.31973329186439514, "learning_rate": 1.2678295818071764e-05, "loss": 1.2199, "step": 14474 }, { "epoch": 4.311323740203652, "grad_norm": 0.27614399790763855, "learning_rate": 1.2677366478424375e-05, "loss": 1.2281, "step": 14475 }, { "epoch": 4.311621586403321, "grad_norm": 0.3766910135746002, "learning_rate": 1.2676437113867084e-05, "loss": 1.2227, "step": 14476 }, { "epoch": 4.3119194326029895, "grad_norm": 0.24778923392295837, "learning_rate": 1.2675507724408536e-05, "loss": 1.2343, "step": 14477 }, { "epoch": 4.312217278802658, "grad_norm": 0.4000803232192993, "learning_rate": 1.2674578310057375e-05, "loss": 1.2177, "step": 14478 }, { "epoch": 4.312515125002327, "grad_norm": 0.2522578835487366, "learning_rate": 1.2673648870822257e-05, "loss": 1.2127, "step": 14479 }, { "epoch": 4.312812971201995, "grad_norm": 0.3849125802516937, "learning_rate": 1.267271940671182e-05, "loss": 1.2254, "step": 14480 }, { "epoch": 4.313110817401665, "grad_norm": 0.2478676736354828, "learning_rate": 1.2671789917734714e-05, "loss": 1.2318, "step": 14481 }, { "epoch": 4.313408663601333, "grad_norm": 0.3485409617424011, "learning_rate": 1.2670860403899585e-05, "loss": 1.2119, "step": 14482 }, { "epoch": 4.313706509801001, "grad_norm": 0.2599025368690491, "learning_rate": 1.266993086521509e-05, "loss": 1.2133, "step": 14483 }, { "epoch": 4.3140043560006704, "grad_norm": 0.3301621973514557, "learning_rate": 1.2669001301689869e-05, "loss": 1.2207, "step": 14484 }, { "epoch": 4.314302202200339, "grad_norm": 0.2623264491558075, "learning_rate": 1.2668071713332573e-05, "loss": 1.2236, "step": 14485 }, { "epoch": 4.314600048400007, "grad_norm": 0.2623007893562317, "learning_rate": 1.2667142100151852e-05, "loss": 1.2153, "step": 14486 }, { "epoch": 4.314897894599676, "grad_norm": 0.25490888953208923, "learning_rate": 1.2666212462156352e-05, "loss": 1.2198, "step": 14487 }, { "epoch": 4.315195740799345, "grad_norm": 0.27106112241744995, "learning_rate": 1.2665282799354723e-05, "loss": 1.2259, "step": 14488 }, { "epoch": 4.315493586999013, "grad_norm": 0.2589944303035736, "learning_rate": 1.2664353111755618e-05, "loss": 1.2164, "step": 14489 }, { "epoch": 4.315791433198682, "grad_norm": 0.3240913450717926, "learning_rate": 1.2663423399367682e-05, "loss": 1.2363, "step": 14490 }, { "epoch": 4.3160892793983505, "grad_norm": 0.4749835729598999, "learning_rate": 1.2662493662199569e-05, "loss": 1.2277, "step": 14491 }, { "epoch": 4.31638712559802, "grad_norm": 0.3653053045272827, "learning_rate": 1.2661563900259925e-05, "loss": 1.2445, "step": 14492 }, { "epoch": 4.316684971797688, "grad_norm": 0.294147789478302, "learning_rate": 1.2660634113557401e-05, "loss": 1.2202, "step": 14493 }, { "epoch": 4.316982817997356, "grad_norm": 0.32274118065834045, "learning_rate": 1.2659704302100653e-05, "loss": 1.2303, "step": 14494 }, { "epoch": 4.317280664197026, "grad_norm": 0.3582480549812317, "learning_rate": 1.2658774465898327e-05, "loss": 1.2286, "step": 14495 }, { "epoch": 4.317578510396694, "grad_norm": 0.4758828580379486, "learning_rate": 1.2657844604959077e-05, "loss": 1.2373, "step": 14496 }, { "epoch": 4.317876356596362, "grad_norm": 0.2943171560764313, "learning_rate": 1.2656914719291548e-05, "loss": 1.1998, "step": 14497 }, { "epoch": 4.3181742027960315, "grad_norm": 0.4365615248680115, "learning_rate": 1.26559848089044e-05, "loss": 1.2392, "step": 14498 }, { "epoch": 4.3184720489957, "grad_norm": 0.34083935618400574, "learning_rate": 1.2655054873806276e-05, "loss": 1.2236, "step": 14499 }, { "epoch": 4.318769895195368, "grad_norm": 0.3708080053329468, "learning_rate": 1.2654124914005839e-05, "loss": 1.2295, "step": 14500 }, { "epoch": 4.318769895195368, "eval_loss": 1.3240220546722412, "eval_runtime": 20.9016, "eval_samples_per_second": 82.96, "eval_steps_per_second": 5.215, "step": 14500 }, { "epoch": 4.319067741395037, "grad_norm": 0.35728931427001953, "learning_rate": 1.2653194929511732e-05, "loss": 1.2367, "step": 14501 }, { "epoch": 4.319365587594706, "grad_norm": 0.3800946772098541, "learning_rate": 1.2652264920332611e-05, "loss": 1.2061, "step": 14502 }, { "epoch": 4.319663433794375, "grad_norm": 0.3083878457546234, "learning_rate": 1.2651334886477129e-05, "loss": 1.227, "step": 14503 }, { "epoch": 4.319961279994043, "grad_norm": 0.36207953095436096, "learning_rate": 1.2650404827953938e-05, "loss": 1.23, "step": 14504 }, { "epoch": 4.3202591261937116, "grad_norm": 0.3269515931606293, "learning_rate": 1.2649474744771686e-05, "loss": 1.2238, "step": 14505 }, { "epoch": 4.320556972393381, "grad_norm": 0.3152298331260681, "learning_rate": 1.2648544636939035e-05, "loss": 1.2121, "step": 14506 }, { "epoch": 4.320854818593049, "grad_norm": 0.264396607875824, "learning_rate": 1.2647614504464638e-05, "loss": 1.2223, "step": 14507 }, { "epoch": 4.321152664792717, "grad_norm": 0.2580949366092682, "learning_rate": 1.2646684347357146e-05, "loss": 1.2268, "step": 14508 }, { "epoch": 4.321450510992387, "grad_norm": 0.29828542470932007, "learning_rate": 1.264575416562521e-05, "loss": 1.2439, "step": 14509 }, { "epoch": 4.321748357192055, "grad_norm": 0.29618167877197266, "learning_rate": 1.2644823959277489e-05, "loss": 1.2107, "step": 14510 }, { "epoch": 4.322046203391723, "grad_norm": 0.28344979882240295, "learning_rate": 1.2643893728322635e-05, "loss": 1.2285, "step": 14511 }, { "epoch": 4.3223440495913925, "grad_norm": 0.3486150801181793, "learning_rate": 1.2642963472769305e-05, "loss": 1.229, "step": 14512 }, { "epoch": 4.322641895791061, "grad_norm": 0.28179824352264404, "learning_rate": 1.2642033192626153e-05, "loss": 1.2207, "step": 14513 }, { "epoch": 4.322939741990729, "grad_norm": 0.36442992091178894, "learning_rate": 1.2641102887901835e-05, "loss": 1.2305, "step": 14514 }, { "epoch": 4.323237588190398, "grad_norm": 0.2730322480201721, "learning_rate": 1.2640172558605005e-05, "loss": 1.237, "step": 14515 }, { "epoch": 4.323535434390067, "grad_norm": 0.31957578659057617, "learning_rate": 1.2639242204744315e-05, "loss": 1.2193, "step": 14516 }, { "epoch": 4.323833280589735, "grad_norm": 0.2447746843099594, "learning_rate": 1.2638311826328429e-05, "loss": 1.2309, "step": 14517 }, { "epoch": 4.324131126789404, "grad_norm": 0.3349570333957672, "learning_rate": 1.2637381423365998e-05, "loss": 1.2134, "step": 14518 }, { "epoch": 4.324428972989073, "grad_norm": 0.26600414514541626, "learning_rate": 1.2636450995865679e-05, "loss": 1.2064, "step": 14519 }, { "epoch": 4.324726819188742, "grad_norm": 0.30910712480545044, "learning_rate": 1.2635520543836133e-05, "loss": 1.2337, "step": 14520 }, { "epoch": 4.32502466538841, "grad_norm": 0.31716904044151306, "learning_rate": 1.2634590067286007e-05, "loss": 1.2266, "step": 14521 }, { "epoch": 4.3253225115880785, "grad_norm": 0.24567840993404388, "learning_rate": 1.2633659566223968e-05, "loss": 1.2174, "step": 14522 }, { "epoch": 4.325620357787748, "grad_norm": 0.3010712265968323, "learning_rate": 1.2632729040658665e-05, "loss": 1.1963, "step": 14523 }, { "epoch": 4.325918203987416, "grad_norm": 0.29390281438827515, "learning_rate": 1.2631798490598765e-05, "loss": 1.2116, "step": 14524 }, { "epoch": 4.326216050187084, "grad_norm": 0.2399107664823532, "learning_rate": 1.2630867916052918e-05, "loss": 1.2192, "step": 14525 }, { "epoch": 4.3265138963867535, "grad_norm": 0.28184759616851807, "learning_rate": 1.2629937317029786e-05, "loss": 1.2272, "step": 14526 }, { "epoch": 4.326811742586422, "grad_norm": 0.253844290971756, "learning_rate": 1.2629006693538024e-05, "loss": 1.2086, "step": 14527 }, { "epoch": 4.32710958878609, "grad_norm": 0.30625760555267334, "learning_rate": 1.2628076045586291e-05, "loss": 1.2377, "step": 14528 }, { "epoch": 4.327407434985759, "grad_norm": 0.29708221554756165, "learning_rate": 1.2627145373183248e-05, "loss": 1.2166, "step": 14529 }, { "epoch": 4.327705281185428, "grad_norm": 0.3452737331390381, "learning_rate": 1.2626214676337554e-05, "loss": 1.2224, "step": 14530 }, { "epoch": 4.328003127385097, "grad_norm": 0.3978636562824249, "learning_rate": 1.2625283955057865e-05, "loss": 1.2122, "step": 14531 }, { "epoch": 4.328300973584765, "grad_norm": 0.27863913774490356, "learning_rate": 1.2624353209352842e-05, "loss": 1.2273, "step": 14532 }, { "epoch": 4.328598819784434, "grad_norm": 0.391391396522522, "learning_rate": 1.2623422439231142e-05, "loss": 1.2359, "step": 14533 }, { "epoch": 4.328896665984103, "grad_norm": 0.31735673546791077, "learning_rate": 1.2622491644701432e-05, "loss": 1.2185, "step": 14534 }, { "epoch": 4.329194512183771, "grad_norm": 0.2565832734107971, "learning_rate": 1.2621560825772363e-05, "loss": 1.2176, "step": 14535 }, { "epoch": 4.3294923583834395, "grad_norm": 0.26544708013534546, "learning_rate": 1.26206299824526e-05, "loss": 1.2096, "step": 14536 }, { "epoch": 4.329790204583109, "grad_norm": 0.27407172322273254, "learning_rate": 1.2619699114750807e-05, "loss": 1.2315, "step": 14537 }, { "epoch": 4.330088050782777, "grad_norm": 0.2865266799926758, "learning_rate": 1.2618768222675639e-05, "loss": 1.2255, "step": 14538 }, { "epoch": 4.330385896982445, "grad_norm": 0.28279930353164673, "learning_rate": 1.2617837306235756e-05, "loss": 1.2155, "step": 14539 }, { "epoch": 4.330683743182115, "grad_norm": 0.32242903113365173, "learning_rate": 1.2616906365439823e-05, "loss": 1.2194, "step": 14540 }, { "epoch": 4.330981589381783, "grad_norm": 0.30280429124832153, "learning_rate": 1.2615975400296501e-05, "loss": 1.2324, "step": 14541 }, { "epoch": 4.331279435581451, "grad_norm": 0.24647094309329987, "learning_rate": 1.261504441081445e-05, "loss": 1.2266, "step": 14542 }, { "epoch": 4.3315772817811204, "grad_norm": 0.2495497465133667, "learning_rate": 1.2614113397002332e-05, "loss": 1.2317, "step": 14543 }, { "epoch": 4.331875127980789, "grad_norm": 0.24952080845832825, "learning_rate": 1.2613182358868812e-05, "loss": 1.2297, "step": 14544 }, { "epoch": 4.332172974180457, "grad_norm": 0.256031334400177, "learning_rate": 1.2612251296422546e-05, "loss": 1.229, "step": 14545 }, { "epoch": 4.332470820380126, "grad_norm": 0.2596443295478821, "learning_rate": 1.26113202096722e-05, "loss": 1.2321, "step": 14546 }, { "epoch": 4.332768666579795, "grad_norm": 0.2813296318054199, "learning_rate": 1.2610389098626441e-05, "loss": 1.2191, "step": 14547 }, { "epoch": 4.333066512779464, "grad_norm": 0.3075757622718811, "learning_rate": 1.2609457963293927e-05, "loss": 1.2109, "step": 14548 }, { "epoch": 4.333364358979132, "grad_norm": 0.27286309003829956, "learning_rate": 1.2608526803683318e-05, "loss": 1.2178, "step": 14549 }, { "epoch": 4.3336622051788005, "grad_norm": 0.2939545810222626, "learning_rate": 1.2607595619803285e-05, "loss": 1.2312, "step": 14550 }, { "epoch": 4.33396005137847, "grad_norm": 0.30843645334243774, "learning_rate": 1.2606664411662489e-05, "loss": 1.2324, "step": 14551 }, { "epoch": 4.334257897578138, "grad_norm": 0.3237903416156769, "learning_rate": 1.260573317926959e-05, "loss": 1.2329, "step": 14552 }, { "epoch": 4.334555743777806, "grad_norm": 0.5066404938697815, "learning_rate": 1.2604801922633256e-05, "loss": 1.2204, "step": 14553 }, { "epoch": 4.334853589977476, "grad_norm": 0.37350624799728394, "learning_rate": 1.2603870641762155e-05, "loss": 1.2224, "step": 14554 }, { "epoch": 4.335151436177144, "grad_norm": 0.3123527467250824, "learning_rate": 1.2602939336664943e-05, "loss": 1.2187, "step": 14555 }, { "epoch": 4.335449282376812, "grad_norm": 0.27443909645080566, "learning_rate": 1.2602008007350289e-05, "loss": 1.211, "step": 14556 }, { "epoch": 4.3357471285764815, "grad_norm": 0.32020115852355957, "learning_rate": 1.2601076653826856e-05, "loss": 1.2229, "step": 14557 }, { "epoch": 4.33604497477615, "grad_norm": 0.4458685517311096, "learning_rate": 1.2600145276103311e-05, "loss": 1.2174, "step": 14558 }, { "epoch": 4.336342820975819, "grad_norm": 0.39707452058792114, "learning_rate": 1.259921387418832e-05, "loss": 1.2136, "step": 14559 }, { "epoch": 4.336640667175487, "grad_norm": 0.27815672755241394, "learning_rate": 1.2598282448090548e-05, "loss": 1.2096, "step": 14560 }, { "epoch": 4.336938513375156, "grad_norm": 0.372768759727478, "learning_rate": 1.2597350997818663e-05, "loss": 1.2405, "step": 14561 }, { "epoch": 4.337236359574825, "grad_norm": 0.31767967343330383, "learning_rate": 1.2596419523381327e-05, "loss": 1.2335, "step": 14562 }, { "epoch": 4.337534205774493, "grad_norm": 0.274199903011322, "learning_rate": 1.2595488024787208e-05, "loss": 1.2161, "step": 14563 }, { "epoch": 4.3378320519741616, "grad_norm": 0.3076978623867035, "learning_rate": 1.259455650204497e-05, "loss": 1.2185, "step": 14564 }, { "epoch": 4.338129898173831, "grad_norm": 0.34883975982666016, "learning_rate": 1.2593624955163288e-05, "loss": 1.2095, "step": 14565 }, { "epoch": 4.338427744373499, "grad_norm": 0.6107051968574524, "learning_rate": 1.259269338415082e-05, "loss": 1.2316, "step": 14566 }, { "epoch": 4.338725590573167, "grad_norm": 0.35214030742645264, "learning_rate": 1.2591761789016239e-05, "loss": 1.2213, "step": 14567 }, { "epoch": 4.339023436772837, "grad_norm": 0.3620079755783081, "learning_rate": 1.2590830169768208e-05, "loss": 1.2292, "step": 14568 }, { "epoch": 4.339321282972505, "grad_norm": 0.33429720997810364, "learning_rate": 1.2589898526415399e-05, "loss": 1.2295, "step": 14569 }, { "epoch": 4.339619129172174, "grad_norm": 0.3748376667499542, "learning_rate": 1.2588966858966476e-05, "loss": 1.2166, "step": 14570 }, { "epoch": 4.3399169753718425, "grad_norm": 0.5470907688140869, "learning_rate": 1.258803516743011e-05, "loss": 1.2227, "step": 14571 }, { "epoch": 4.340214821571511, "grad_norm": 0.2732674479484558, "learning_rate": 1.2587103451814969e-05, "loss": 1.2258, "step": 14572 }, { "epoch": 4.34051266777118, "grad_norm": 0.5472095012664795, "learning_rate": 1.258617171212972e-05, "loss": 1.2192, "step": 14573 }, { "epoch": 4.340810513970848, "grad_norm": 0.3168555796146393, "learning_rate": 1.2585239948383032e-05, "loss": 1.2207, "step": 14574 }, { "epoch": 4.341108360170517, "grad_norm": 0.37085288763046265, "learning_rate": 1.2584308160583573e-05, "loss": 1.2261, "step": 14575 }, { "epoch": 4.341406206370186, "grad_norm": 0.25340375304222107, "learning_rate": 1.2583376348740017e-05, "loss": 1.2153, "step": 14576 }, { "epoch": 4.341704052569854, "grad_norm": 0.3801701068878174, "learning_rate": 1.258244451286103e-05, "loss": 1.2231, "step": 14577 }, { "epoch": 4.342001898769523, "grad_norm": 0.2782164514064789, "learning_rate": 1.2581512652955284e-05, "loss": 1.2226, "step": 14578 }, { "epoch": 4.342299744969192, "grad_norm": 0.37056881189346313, "learning_rate": 1.2580580769031443e-05, "loss": 1.222, "step": 14579 }, { "epoch": 4.34259759116886, "grad_norm": 0.2865006625652313, "learning_rate": 1.2579648861098184e-05, "loss": 1.2277, "step": 14580 }, { "epoch": 4.3428954373685285, "grad_norm": 0.3121490776538849, "learning_rate": 1.2578716929164171e-05, "loss": 1.2316, "step": 14581 }, { "epoch": 4.343193283568198, "grad_norm": 0.28218358755111694, "learning_rate": 1.2577784973238081e-05, "loss": 1.2252, "step": 14582 }, { "epoch": 4.343491129767866, "grad_norm": 0.24253541231155396, "learning_rate": 1.2576852993328582e-05, "loss": 1.2256, "step": 14583 }, { "epoch": 4.343788975967534, "grad_norm": 0.25248849391937256, "learning_rate": 1.2575920989444343e-05, "loss": 1.2127, "step": 14584 }, { "epoch": 4.3440868221672035, "grad_norm": 0.2888668477535248, "learning_rate": 1.2574988961594041e-05, "loss": 1.2166, "step": 14585 }, { "epoch": 4.344384668366872, "grad_norm": 0.25354546308517456, "learning_rate": 1.257405690978634e-05, "loss": 1.2393, "step": 14586 }, { "epoch": 4.344682514566541, "grad_norm": 0.3220248818397522, "learning_rate": 1.2573124834029915e-05, "loss": 1.2179, "step": 14587 }, { "epoch": 4.344980360766209, "grad_norm": 0.3816487193107605, "learning_rate": 1.2572192734333441e-05, "loss": 1.214, "step": 14588 }, { "epoch": 4.345278206965878, "grad_norm": 0.31159496307373047, "learning_rate": 1.2571260610705585e-05, "loss": 1.2192, "step": 14589 }, { "epoch": 4.345576053165547, "grad_norm": 0.3061455190181732, "learning_rate": 1.2570328463155024e-05, "loss": 1.2293, "step": 14590 }, { "epoch": 4.345873899365215, "grad_norm": 0.30936703085899353, "learning_rate": 1.2569396291690428e-05, "loss": 1.2183, "step": 14591 }, { "epoch": 4.346171745564884, "grad_norm": 0.25135019421577454, "learning_rate": 1.2568464096320467e-05, "loss": 1.2224, "step": 14592 }, { "epoch": 4.346469591764553, "grad_norm": 0.2868601679801941, "learning_rate": 1.256753187705382e-05, "loss": 1.2156, "step": 14593 }, { "epoch": 4.346767437964221, "grad_norm": 0.2719976305961609, "learning_rate": 1.2566599633899158e-05, "loss": 1.2332, "step": 14594 }, { "epoch": 4.3470652841638895, "grad_norm": 0.2508276700973511, "learning_rate": 1.2565667366865155e-05, "loss": 1.2245, "step": 14595 }, { "epoch": 4.347363130363559, "grad_norm": 0.3190496861934662, "learning_rate": 1.256473507596048e-05, "loss": 1.2302, "step": 14596 }, { "epoch": 4.347660976563227, "grad_norm": 0.25456708669662476, "learning_rate": 1.2563802761193812e-05, "loss": 1.2182, "step": 14597 }, { "epoch": 4.347958822762896, "grad_norm": 0.28232961893081665, "learning_rate": 1.2562870422573825e-05, "loss": 1.2129, "step": 14598 }, { "epoch": 4.348256668962565, "grad_norm": 0.2526226341724396, "learning_rate": 1.2561938060109191e-05, "loss": 1.22, "step": 14599 }, { "epoch": 4.348554515162233, "grad_norm": 0.24340102076530457, "learning_rate": 1.2561005673808586e-05, "loss": 1.2165, "step": 14600 }, { "epoch": 4.348852361361902, "grad_norm": 0.27539387345314026, "learning_rate": 1.2560073263680685e-05, "loss": 1.2056, "step": 14601 }, { "epoch": 4.3491502075615704, "grad_norm": 0.25885745882987976, "learning_rate": 1.2559140829734164e-05, "loss": 1.2287, "step": 14602 }, { "epoch": 4.349448053761239, "grad_norm": 0.25479578971862793, "learning_rate": 1.2558208371977693e-05, "loss": 1.2182, "step": 14603 }, { "epoch": 4.349745899960908, "grad_norm": 0.2609451413154602, "learning_rate": 1.2557275890419956e-05, "loss": 1.2159, "step": 14604 }, { "epoch": 4.350043746160576, "grad_norm": 0.3177453875541687, "learning_rate": 1.255634338506962e-05, "loss": 1.2213, "step": 14605 }, { "epoch": 4.350341592360245, "grad_norm": 0.3020949363708496, "learning_rate": 1.2555410855935366e-05, "loss": 1.214, "step": 14606 }, { "epoch": 4.350639438559914, "grad_norm": 0.25208285450935364, "learning_rate": 1.255447830302587e-05, "loss": 1.238, "step": 14607 }, { "epoch": 4.350937284759582, "grad_norm": 0.25857990980148315, "learning_rate": 1.2553545726349805e-05, "loss": 1.2154, "step": 14608 }, { "epoch": 4.3512351309592505, "grad_norm": 0.296636700630188, "learning_rate": 1.2552613125915853e-05, "loss": 1.2176, "step": 14609 }, { "epoch": 4.35153297715892, "grad_norm": 0.3673078417778015, "learning_rate": 1.2551680501732686e-05, "loss": 1.2135, "step": 14610 }, { "epoch": 4.351830823358588, "grad_norm": 0.25487521290779114, "learning_rate": 1.2550747853808982e-05, "loss": 1.2153, "step": 14611 }, { "epoch": 4.352128669558256, "grad_norm": 0.5622789263725281, "learning_rate": 1.2549815182153421e-05, "loss": 1.2221, "step": 14612 }, { "epoch": 4.352426515757926, "grad_norm": 0.8438968062400818, "learning_rate": 1.2548882486774678e-05, "loss": 1.2127, "step": 14613 }, { "epoch": 4.352724361957594, "grad_norm": 0.6250828504562378, "learning_rate": 1.2547949767681433e-05, "loss": 1.2239, "step": 14614 }, { "epoch": 4.353022208157263, "grad_norm": 0.35660964250564575, "learning_rate": 1.254701702488236e-05, "loss": 1.2245, "step": 14615 }, { "epoch": 4.3533200543569315, "grad_norm": 0.47316092252731323, "learning_rate": 1.254608425838614e-05, "loss": 1.2436, "step": 14616 }, { "epoch": 4.3536179005566, "grad_norm": 0.3825356066226959, "learning_rate": 1.2545151468201448e-05, "loss": 1.2214, "step": 14617 }, { "epoch": 4.353915746756269, "grad_norm": 0.49703580141067505, "learning_rate": 1.2544218654336969e-05, "loss": 1.198, "step": 14618 }, { "epoch": 4.354213592955937, "grad_norm": 0.3626794219017029, "learning_rate": 1.2543285816801379e-05, "loss": 1.2271, "step": 14619 }, { "epoch": 4.354511439155606, "grad_norm": 0.40533241629600525, "learning_rate": 1.2542352955603353e-05, "loss": 1.2355, "step": 14620 }, { "epoch": 4.354809285355275, "grad_norm": 0.2636505365371704, "learning_rate": 1.2541420070751572e-05, "loss": 1.2303, "step": 14621 }, { "epoch": 4.355107131554943, "grad_norm": 0.3172827959060669, "learning_rate": 1.254048716225472e-05, "loss": 1.2101, "step": 14622 }, { "epoch": 4.3554049777546116, "grad_norm": 0.2429002970457077, "learning_rate": 1.253955423012147e-05, "loss": 1.2391, "step": 14623 }, { "epoch": 4.355702823954281, "grad_norm": 0.2845412492752075, "learning_rate": 1.2538621274360507e-05, "loss": 1.2211, "step": 14624 }, { "epoch": 4.356000670153949, "grad_norm": 0.26535719633102417, "learning_rate": 1.2537688294980507e-05, "loss": 1.2361, "step": 14625 }, { "epoch": 4.356298516353618, "grad_norm": 0.2541870176792145, "learning_rate": 1.2536755291990158e-05, "loss": 1.2195, "step": 14626 }, { "epoch": 4.356596362553287, "grad_norm": 0.31307801604270935, "learning_rate": 1.2535822265398134e-05, "loss": 1.2348, "step": 14627 }, { "epoch": 4.356894208752955, "grad_norm": 0.3025127947330475, "learning_rate": 1.2534889215213112e-05, "loss": 1.219, "step": 14628 }, { "epoch": 4.357192054952624, "grad_norm": 0.2949710190296173, "learning_rate": 1.2533956141443784e-05, "loss": 1.2225, "step": 14629 }, { "epoch": 4.3574899011522925, "grad_norm": 0.3623819053173065, "learning_rate": 1.2533023044098822e-05, "loss": 1.238, "step": 14630 }, { "epoch": 4.357787747351961, "grad_norm": 0.27650731801986694, "learning_rate": 1.253208992318691e-05, "loss": 1.2272, "step": 14631 }, { "epoch": 4.35808559355163, "grad_norm": 0.39150604605674744, "learning_rate": 1.253115677871673e-05, "loss": 1.2322, "step": 14632 }, { "epoch": 4.358383439751298, "grad_norm": 0.26151758432388306, "learning_rate": 1.253022361069697e-05, "loss": 1.2142, "step": 14633 }, { "epoch": 4.358681285950967, "grad_norm": 0.4499976336956024, "learning_rate": 1.2529290419136297e-05, "loss": 1.2102, "step": 14634 }, { "epoch": 4.358979132150636, "grad_norm": 0.2854762077331543, "learning_rate": 1.2528357204043409e-05, "loss": 1.2182, "step": 14635 }, { "epoch": 4.359276978350304, "grad_norm": 0.6315831542015076, "learning_rate": 1.2527423965426982e-05, "loss": 1.2133, "step": 14636 }, { "epoch": 4.3595748245499735, "grad_norm": 0.44361525774002075, "learning_rate": 1.2526490703295697e-05, "loss": 1.2365, "step": 14637 }, { "epoch": 4.359872670749642, "grad_norm": 0.48205363750457764, "learning_rate": 1.2525557417658238e-05, "loss": 1.2322, "step": 14638 }, { "epoch": 4.36017051694931, "grad_norm": 0.49180200695991516, "learning_rate": 1.252462410852329e-05, "loss": 1.1915, "step": 14639 }, { "epoch": 4.360468363148979, "grad_norm": 0.4336889088153839, "learning_rate": 1.2523690775899535e-05, "loss": 1.2137, "step": 14640 }, { "epoch": 4.360766209348648, "grad_norm": 0.47530874609947205, "learning_rate": 1.2522757419795657e-05, "loss": 1.1999, "step": 14641 }, { "epoch": 4.361064055548316, "grad_norm": 0.37315914034843445, "learning_rate": 1.2521824040220341e-05, "loss": 1.2209, "step": 14642 }, { "epoch": 4.361361901747985, "grad_norm": 0.4032912254333496, "learning_rate": 1.2520890637182271e-05, "loss": 1.2333, "step": 14643 }, { "epoch": 4.3616597479476535, "grad_norm": 0.45708635449409485, "learning_rate": 1.2519957210690126e-05, "loss": 1.2334, "step": 14644 }, { "epoch": 4.361957594147322, "grad_norm": 0.33763477206230164, "learning_rate": 1.2519023760752597e-05, "loss": 1.2052, "step": 14645 }, { "epoch": 4.362255440346991, "grad_norm": 0.4702033996582031, "learning_rate": 1.2518090287378366e-05, "loss": 1.239, "step": 14646 }, { "epoch": 4.362553286546659, "grad_norm": 0.34722065925598145, "learning_rate": 1.2517156790576117e-05, "loss": 1.2212, "step": 14647 }, { "epoch": 4.362851132746328, "grad_norm": 0.6574146747589111, "learning_rate": 1.2516223270354538e-05, "loss": 1.2235, "step": 14648 }, { "epoch": 4.363148978945997, "grad_norm": 0.28901952505111694, "learning_rate": 1.2515289726722312e-05, "loss": 1.2374, "step": 14649 }, { "epoch": 4.363446825145665, "grad_norm": 0.5301523208618164, "learning_rate": 1.2514356159688128e-05, "loss": 1.2225, "step": 14650 }, { "epoch": 4.363744671345334, "grad_norm": 0.2746599018573761, "learning_rate": 1.2513422569260665e-05, "loss": 1.2323, "step": 14651 }, { "epoch": 4.364042517545003, "grad_norm": 0.4509832262992859, "learning_rate": 1.2512488955448614e-05, "loss": 1.2179, "step": 14652 }, { "epoch": 4.364340363744671, "grad_norm": 0.3198435604572296, "learning_rate": 1.2511555318260662e-05, "loss": 1.2294, "step": 14653 }, { "epoch": 4.36463820994434, "grad_norm": 0.36141619086265564, "learning_rate": 1.2510621657705496e-05, "loss": 1.2276, "step": 14654 }, { "epoch": 4.364936056144009, "grad_norm": 0.29216331243515015, "learning_rate": 1.2509687973791795e-05, "loss": 1.2161, "step": 14655 }, { "epoch": 4.365233902343677, "grad_norm": 0.2578272521495819, "learning_rate": 1.2508754266528253e-05, "loss": 1.2204, "step": 14656 }, { "epoch": 4.365531748543346, "grad_norm": 0.27996453642845154, "learning_rate": 1.2507820535923559e-05, "loss": 1.2278, "step": 14657 }, { "epoch": 4.365829594743015, "grad_norm": 0.2756759226322174, "learning_rate": 1.2506886781986394e-05, "loss": 1.2079, "step": 14658 }, { "epoch": 4.366127440942683, "grad_norm": 0.2989519536495209, "learning_rate": 1.2505953004725448e-05, "loss": 1.2228, "step": 14659 }, { "epoch": 4.366425287142352, "grad_norm": 0.280781090259552, "learning_rate": 1.2505019204149411e-05, "loss": 1.2263, "step": 14660 }, { "epoch": 4.3667231333420204, "grad_norm": 0.2653757631778717, "learning_rate": 1.2504085380266968e-05, "loss": 1.2195, "step": 14661 }, { "epoch": 4.367020979541689, "grad_norm": 0.2949097752571106, "learning_rate": 1.2503151533086808e-05, "loss": 1.2244, "step": 14662 }, { "epoch": 4.367318825741358, "grad_norm": 0.30720022320747375, "learning_rate": 1.250221766261762e-05, "loss": 1.222, "step": 14663 }, { "epoch": 4.367616671941026, "grad_norm": 0.33239036798477173, "learning_rate": 1.2501283768868092e-05, "loss": 1.209, "step": 14664 }, { "epoch": 4.3679145181406955, "grad_norm": 0.4349282383918762, "learning_rate": 1.2500349851846914e-05, "loss": 1.234, "step": 14665 }, { "epoch": 4.368212364340364, "grad_norm": 0.26425737142562866, "learning_rate": 1.249941591156277e-05, "loss": 1.2171, "step": 14666 }, { "epoch": 4.368510210540032, "grad_norm": 0.3702079951763153, "learning_rate": 1.249848194802436e-05, "loss": 1.2326, "step": 14667 }, { "epoch": 4.368808056739701, "grad_norm": 0.25778859853744507, "learning_rate": 1.2497547961240363e-05, "loss": 1.2194, "step": 14668 }, { "epoch": 4.36910590293937, "grad_norm": 0.28464213013648987, "learning_rate": 1.2496613951219474e-05, "loss": 1.2273, "step": 14669 }, { "epoch": 4.369403749139038, "grad_norm": 0.27053070068359375, "learning_rate": 1.2495679917970382e-05, "loss": 1.2256, "step": 14670 }, { "epoch": 4.369701595338707, "grad_norm": 0.27638304233551025, "learning_rate": 1.2494745861501777e-05, "loss": 1.2225, "step": 14671 }, { "epoch": 4.369999441538376, "grad_norm": 0.3397147059440613, "learning_rate": 1.2493811781822344e-05, "loss": 1.2281, "step": 14672 }, { "epoch": 4.370297287738044, "grad_norm": 0.2771948277950287, "learning_rate": 1.2492877678940784e-05, "loss": 1.2182, "step": 14673 }, { "epoch": 4.370595133937713, "grad_norm": 0.33297523856163025, "learning_rate": 1.2491943552865781e-05, "loss": 1.2129, "step": 14674 }, { "epoch": 4.3708929801373815, "grad_norm": 0.4865691065788269, "learning_rate": 1.2491009403606026e-05, "loss": 1.2157, "step": 14675 }, { "epoch": 4.37119082633705, "grad_norm": 0.32315999269485474, "learning_rate": 1.2490075231170213e-05, "loss": 1.2364, "step": 14676 }, { "epoch": 4.371488672536719, "grad_norm": 0.551485002040863, "learning_rate": 1.2489141035567035e-05, "loss": 1.2303, "step": 14677 }, { "epoch": 4.371786518736387, "grad_norm": 0.30194327235221863, "learning_rate": 1.2488206816805177e-05, "loss": 1.2164, "step": 14678 }, { "epoch": 4.372084364936056, "grad_norm": 0.5695297122001648, "learning_rate": 1.2487272574893335e-05, "loss": 1.2349, "step": 14679 }, { "epoch": 4.372382211135725, "grad_norm": 0.3976595997810364, "learning_rate": 1.24863383098402e-05, "loss": 1.2175, "step": 14680 }, { "epoch": 4.372680057335393, "grad_norm": 0.43812090158462524, "learning_rate": 1.2485404021654465e-05, "loss": 1.2435, "step": 14681 }, { "epoch": 4.372977903535062, "grad_norm": 0.3519086241722107, "learning_rate": 1.2484469710344824e-05, "loss": 1.2352, "step": 14682 }, { "epoch": 4.373275749734731, "grad_norm": 0.3912816345691681, "learning_rate": 1.2483535375919967e-05, "loss": 1.2341, "step": 14683 }, { "epoch": 4.373573595934399, "grad_norm": 0.30880996584892273, "learning_rate": 1.248260101838859e-05, "loss": 1.2118, "step": 14684 }, { "epoch": 4.373871442134068, "grad_norm": 0.29675379395484924, "learning_rate": 1.2481666637759381e-05, "loss": 1.2349, "step": 14685 }, { "epoch": 4.374169288333737, "grad_norm": 0.24437694251537323, "learning_rate": 1.248073223404104e-05, "loss": 1.2284, "step": 14686 }, { "epoch": 4.374467134533405, "grad_norm": 0.31316179037094116, "learning_rate": 1.2479797807242255e-05, "loss": 1.2168, "step": 14687 }, { "epoch": 4.374764980733074, "grad_norm": 0.2598910331726074, "learning_rate": 1.2478863357371722e-05, "loss": 1.2359, "step": 14688 }, { "epoch": 4.3750628269327425, "grad_norm": 0.298622727394104, "learning_rate": 1.2477928884438135e-05, "loss": 1.2252, "step": 14689 }, { "epoch": 4.375360673132411, "grad_norm": 0.25254201889038086, "learning_rate": 1.247699438845019e-05, "loss": 1.219, "step": 14690 }, { "epoch": 4.37565851933208, "grad_norm": 0.26050952076911926, "learning_rate": 1.2476059869416579e-05, "loss": 1.2312, "step": 14691 }, { "epoch": 4.375956365531748, "grad_norm": 0.2809820771217346, "learning_rate": 1.2475125327345998e-05, "loss": 1.2136, "step": 14692 }, { "epoch": 4.376254211731418, "grad_norm": 0.28439196944236755, "learning_rate": 1.2474190762247136e-05, "loss": 1.2141, "step": 14693 }, { "epoch": 4.376552057931086, "grad_norm": 0.3428935706615448, "learning_rate": 1.2473256174128701e-05, "loss": 1.2341, "step": 14694 }, { "epoch": 4.376849904130754, "grad_norm": 0.32866135239601135, "learning_rate": 1.2472321562999376e-05, "loss": 1.2228, "step": 14695 }, { "epoch": 4.3771477503304235, "grad_norm": 0.27710849046707153, "learning_rate": 1.2471386928867862e-05, "loss": 1.2275, "step": 14696 }, { "epoch": 4.377445596530092, "grad_norm": 0.29330718517303467, "learning_rate": 1.2470452271742853e-05, "loss": 1.2241, "step": 14697 }, { "epoch": 4.37774344272976, "grad_norm": 0.25902777910232544, "learning_rate": 1.2469517591633047e-05, "loss": 1.2269, "step": 14698 }, { "epoch": 4.378041288929429, "grad_norm": 0.30511945486068726, "learning_rate": 1.2468582888547139e-05, "loss": 1.2243, "step": 14699 }, { "epoch": 4.378339135129098, "grad_norm": 0.27240312099456787, "learning_rate": 1.2467648162493823e-05, "loss": 1.22, "step": 14700 }, { "epoch": 4.378636981328766, "grad_norm": 0.30048826336860657, "learning_rate": 1.24667134134818e-05, "loss": 1.2165, "step": 14701 }, { "epoch": 4.378934827528435, "grad_norm": 0.29394248127937317, "learning_rate": 1.2465778641519766e-05, "loss": 1.2471, "step": 14702 }, { "epoch": 4.3792326737281035, "grad_norm": 0.29744812846183777, "learning_rate": 1.2464843846616414e-05, "loss": 1.2063, "step": 14703 }, { "epoch": 4.379530519927773, "grad_norm": 0.4182775020599365, "learning_rate": 1.2463909028780446e-05, "loss": 1.2098, "step": 14704 }, { "epoch": 4.379828366127441, "grad_norm": 0.41468486189842224, "learning_rate": 1.2462974188020557e-05, "loss": 1.2181, "step": 14705 }, { "epoch": 4.380126212327109, "grad_norm": 0.3401853144168854, "learning_rate": 1.2462039324345443e-05, "loss": 1.2221, "step": 14706 }, { "epoch": 4.380424058526779, "grad_norm": 0.29607275128364563, "learning_rate": 1.2461104437763804e-05, "loss": 1.2118, "step": 14707 }, { "epoch": 4.380721904726447, "grad_norm": 0.37843289971351624, "learning_rate": 1.246016952828434e-05, "loss": 1.2261, "step": 14708 }, { "epoch": 4.381019750926115, "grad_norm": 0.26002398133277893, "learning_rate": 1.2459234595915746e-05, "loss": 1.2039, "step": 14709 }, { "epoch": 4.3813175971257845, "grad_norm": 0.3562231957912445, "learning_rate": 1.2458299640666722e-05, "loss": 1.2253, "step": 14710 }, { "epoch": 4.381615443325453, "grad_norm": 0.24358078837394714, "learning_rate": 1.2457364662545965e-05, "loss": 1.2195, "step": 14711 }, { "epoch": 4.381913289525121, "grad_norm": 0.4411725401878357, "learning_rate": 1.2456429661562177e-05, "loss": 1.2374, "step": 14712 }, { "epoch": 4.38221113572479, "grad_norm": 0.41273796558380127, "learning_rate": 1.2455494637724055e-05, "loss": 1.243, "step": 14713 }, { "epoch": 4.382508981924459, "grad_norm": 0.29247844219207764, "learning_rate": 1.2454559591040299e-05, "loss": 1.2326, "step": 14714 }, { "epoch": 4.382806828124127, "grad_norm": 0.25195637345314026, "learning_rate": 1.245362452151961e-05, "loss": 1.208, "step": 14715 }, { "epoch": 4.383104674323796, "grad_norm": 0.30427172780036926, "learning_rate": 1.2452689429170681e-05, "loss": 1.2134, "step": 14716 }, { "epoch": 4.383402520523465, "grad_norm": 0.2622573673725128, "learning_rate": 1.2451754314002223e-05, "loss": 1.2037, "step": 14717 }, { "epoch": 4.383700366723133, "grad_norm": 0.2846639156341553, "learning_rate": 1.2450819176022928e-05, "loss": 1.235, "step": 14718 }, { "epoch": 4.383998212922802, "grad_norm": 0.2838877737522125, "learning_rate": 1.2449884015241498e-05, "loss": 1.2351, "step": 14719 }, { "epoch": 4.3842960591224704, "grad_norm": 0.2814798355102539, "learning_rate": 1.2448948831666636e-05, "loss": 1.2326, "step": 14720 }, { "epoch": 4.38459390532214, "grad_norm": 0.2451288402080536, "learning_rate": 1.244801362530704e-05, "loss": 1.2131, "step": 14721 }, { "epoch": 4.384891751521808, "grad_norm": 0.4032379686832428, "learning_rate": 1.2447078396171412e-05, "loss": 1.2264, "step": 14722 }, { "epoch": 4.385189597721476, "grad_norm": 0.40528061985969543, "learning_rate": 1.2446143144268454e-05, "loss": 1.2213, "step": 14723 }, { "epoch": 4.3854874439211455, "grad_norm": 0.37685254216194153, "learning_rate": 1.2445207869606868e-05, "loss": 1.2422, "step": 14724 }, { "epoch": 4.385785290120814, "grad_norm": 0.2608760893344879, "learning_rate": 1.2444272572195354e-05, "loss": 1.2256, "step": 14725 }, { "epoch": 4.386083136320482, "grad_norm": 0.42907699942588806, "learning_rate": 1.2443337252042615e-05, "loss": 1.2364, "step": 14726 }, { "epoch": 4.386380982520151, "grad_norm": 0.3708975315093994, "learning_rate": 1.2442401909157353e-05, "loss": 1.2305, "step": 14727 }, { "epoch": 4.38667882871982, "grad_norm": 0.4012928307056427, "learning_rate": 1.244146654354827e-05, "loss": 1.2076, "step": 14728 }, { "epoch": 4.386976674919488, "grad_norm": 0.6299765706062317, "learning_rate": 1.2440531155224067e-05, "loss": 1.2276, "step": 14729 }, { "epoch": 4.387274521119157, "grad_norm": 0.29901444911956787, "learning_rate": 1.2439595744193449e-05, "loss": 1.2369, "step": 14730 }, { "epoch": 4.387572367318826, "grad_norm": 0.4860142171382904, "learning_rate": 1.2438660310465118e-05, "loss": 1.2252, "step": 14731 }, { "epoch": 4.387870213518495, "grad_norm": 0.5218364000320435, "learning_rate": 1.2437724854047781e-05, "loss": 1.2136, "step": 14732 }, { "epoch": 4.388168059718163, "grad_norm": 0.2555951476097107, "learning_rate": 1.2436789374950133e-05, "loss": 1.2207, "step": 14733 }, { "epoch": 4.3884659059178315, "grad_norm": 0.369017094373703, "learning_rate": 1.2435853873180884e-05, "loss": 1.2143, "step": 14734 }, { "epoch": 4.388763752117501, "grad_norm": 0.2541263997554779, "learning_rate": 1.243491834874874e-05, "loss": 1.2129, "step": 14735 }, { "epoch": 4.389061598317169, "grad_norm": 0.31892648339271545, "learning_rate": 1.2433982801662397e-05, "loss": 1.223, "step": 14736 }, { "epoch": 4.389359444516837, "grad_norm": 0.2780649960041046, "learning_rate": 1.2433047231930562e-05, "loss": 1.225, "step": 14737 }, { "epoch": 4.389657290716507, "grad_norm": 0.3346238136291504, "learning_rate": 1.2432111639561943e-05, "loss": 1.2218, "step": 14738 }, { "epoch": 4.389955136916175, "grad_norm": 0.2518664598464966, "learning_rate": 1.2431176024565241e-05, "loss": 1.2035, "step": 14739 }, { "epoch": 4.390252983115843, "grad_norm": 0.3096642792224884, "learning_rate": 1.2430240386949165e-05, "loss": 1.238, "step": 14740 }, { "epoch": 4.390550829315512, "grad_norm": 0.3540990948677063, "learning_rate": 1.2429304726722416e-05, "loss": 1.2294, "step": 14741 }, { "epoch": 4.390848675515181, "grad_norm": 0.27909091114997864, "learning_rate": 1.2428369043893703e-05, "loss": 1.2262, "step": 14742 }, { "epoch": 4.391146521714849, "grad_norm": 0.5325915813446045, "learning_rate": 1.2427433338471726e-05, "loss": 1.2236, "step": 14743 }, { "epoch": 4.391444367914518, "grad_norm": 0.3627501130104065, "learning_rate": 1.2426497610465196e-05, "loss": 1.2344, "step": 14744 }, { "epoch": 4.391742214114187, "grad_norm": 0.3426806926727295, "learning_rate": 1.2425561859882815e-05, "loss": 1.2282, "step": 14745 }, { "epoch": 4.392040060313855, "grad_norm": 0.32188719511032104, "learning_rate": 1.2424626086733291e-05, "loss": 1.212, "step": 14746 }, { "epoch": 4.392337906513524, "grad_norm": 0.27562034130096436, "learning_rate": 1.2423690291025332e-05, "loss": 1.2317, "step": 14747 }, { "epoch": 4.3926357527131925, "grad_norm": 0.3530782461166382, "learning_rate": 1.2422754472767641e-05, "loss": 1.2185, "step": 14748 }, { "epoch": 4.392933598912862, "grad_norm": 0.2722001373767853, "learning_rate": 1.242181863196893e-05, "loss": 1.2306, "step": 14749 }, { "epoch": 4.39323144511253, "grad_norm": 0.46160808205604553, "learning_rate": 1.2420882768637899e-05, "loss": 1.2161, "step": 14750 }, { "epoch": 4.393529291312198, "grad_norm": 0.32977479696273804, "learning_rate": 1.2419946882783259e-05, "loss": 1.2435, "step": 14751 }, { "epoch": 4.393827137511868, "grad_norm": 0.30083370208740234, "learning_rate": 1.2419010974413717e-05, "loss": 1.2118, "step": 14752 }, { "epoch": 4.394124983711536, "grad_norm": 0.37337085604667664, "learning_rate": 1.241807504353798e-05, "loss": 1.2238, "step": 14753 }, { "epoch": 4.394422829911204, "grad_norm": 0.27815932035446167, "learning_rate": 1.2417139090164758e-05, "loss": 1.2384, "step": 14754 }, { "epoch": 4.3947206761108735, "grad_norm": 0.30677685141563416, "learning_rate": 1.2416203114302756e-05, "loss": 1.2222, "step": 14755 }, { "epoch": 4.395018522310542, "grad_norm": 0.3042740225791931, "learning_rate": 1.2415267115960685e-05, "loss": 1.2222, "step": 14756 }, { "epoch": 4.39531636851021, "grad_norm": 0.3490796387195587, "learning_rate": 1.2414331095147249e-05, "loss": 1.2113, "step": 14757 }, { "epoch": 4.395614214709879, "grad_norm": 0.3328080177307129, "learning_rate": 1.2413395051871163e-05, "loss": 1.227, "step": 14758 }, { "epoch": 4.395912060909548, "grad_norm": 0.2872603237628937, "learning_rate": 1.2412458986141131e-05, "loss": 1.2232, "step": 14759 }, { "epoch": 4.396209907109217, "grad_norm": 0.27765852212905884, "learning_rate": 1.2411522897965865e-05, "loss": 1.2288, "step": 14760 }, { "epoch": 4.396507753308885, "grad_norm": 0.34384989738464355, "learning_rate": 1.2410586787354068e-05, "loss": 1.2071, "step": 14761 }, { "epoch": 4.3968055995085535, "grad_norm": 0.326244056224823, "learning_rate": 1.2409650654314462e-05, "loss": 1.2303, "step": 14762 }, { "epoch": 4.397103445708223, "grad_norm": 0.3647850751876831, "learning_rate": 1.2408714498855741e-05, "loss": 1.2216, "step": 14763 }, { "epoch": 4.397401291907891, "grad_norm": 0.3525470495223999, "learning_rate": 1.2407778320986625e-05, "loss": 1.2219, "step": 14764 }, { "epoch": 4.397699138107559, "grad_norm": 0.24576127529144287, "learning_rate": 1.2406842120715825e-05, "loss": 1.2218, "step": 14765 }, { "epoch": 4.397996984307229, "grad_norm": 0.4573582410812378, "learning_rate": 1.2405905898052047e-05, "loss": 1.2293, "step": 14766 }, { "epoch": 4.398294830506897, "grad_norm": 0.4853639006614685, "learning_rate": 1.2404969653004002e-05, "loss": 1.2209, "step": 14767 }, { "epoch": 4.398592676706565, "grad_norm": 0.2845476269721985, "learning_rate": 1.2404033385580401e-05, "loss": 1.2371, "step": 14768 }, { "epoch": 4.3988905229062345, "grad_norm": 0.8294855952262878, "learning_rate": 1.2403097095789955e-05, "loss": 1.2102, "step": 14769 }, { "epoch": 4.399188369105903, "grad_norm": 0.4614257216453552, "learning_rate": 1.2402160783641374e-05, "loss": 1.2323, "step": 14770 }, { "epoch": 4.399486215305572, "grad_norm": 0.7635871767997742, "learning_rate": 1.2401224449143374e-05, "loss": 1.2091, "step": 14771 }, { "epoch": 4.39978406150524, "grad_norm": 0.576861560344696, "learning_rate": 1.2400288092304663e-05, "loss": 1.233, "step": 14772 }, { "epoch": 4.400081907704909, "grad_norm": 0.48361676931381226, "learning_rate": 1.2399351713133953e-05, "loss": 1.2192, "step": 14773 }, { "epoch": 4.400379753904578, "grad_norm": 0.26259520649909973, "learning_rate": 1.2398415311639954e-05, "loss": 1.2013, "step": 14774 }, { "epoch": 4.400677600104246, "grad_norm": 0.5118993520736694, "learning_rate": 1.239747888783138e-05, "loss": 1.2351, "step": 14775 }, { "epoch": 4.400975446303915, "grad_norm": 0.40735262632369995, "learning_rate": 1.2396542441716946e-05, "loss": 1.214, "step": 14776 }, { "epoch": 4.401273292503584, "grad_norm": 0.25754430890083313, "learning_rate": 1.2395605973305362e-05, "loss": 1.2058, "step": 14777 }, { "epoch": 4.401571138703252, "grad_norm": 0.4109981656074524, "learning_rate": 1.239466948260534e-05, "loss": 1.2302, "step": 14778 }, { "epoch": 4.4018689849029204, "grad_norm": 0.3392815589904785, "learning_rate": 1.2393732969625597e-05, "loss": 1.2222, "step": 14779 }, { "epoch": 4.40216683110259, "grad_norm": 0.2568325102329254, "learning_rate": 1.2392796434374836e-05, "loss": 1.2215, "step": 14780 }, { "epoch": 4.402464677302258, "grad_norm": 0.6553801894187927, "learning_rate": 1.239185987686178e-05, "loss": 1.2087, "step": 14781 }, { "epoch": 4.402762523501926, "grad_norm": 0.3319759964942932, "learning_rate": 1.2390923297095142e-05, "loss": 1.2298, "step": 14782 }, { "epoch": 4.4030603697015955, "grad_norm": 0.39988401532173157, "learning_rate": 1.2389986695083636e-05, "loss": 1.2386, "step": 14783 }, { "epoch": 4.403358215901264, "grad_norm": 0.3550228774547577, "learning_rate": 1.2389050070835972e-05, "loss": 1.2066, "step": 14784 }, { "epoch": 4.403656062100932, "grad_norm": 0.2917831242084503, "learning_rate": 1.2388113424360865e-05, "loss": 1.2204, "step": 14785 }, { "epoch": 4.403953908300601, "grad_norm": 0.6269718408584595, "learning_rate": 1.2387176755667032e-05, "loss": 1.2186, "step": 14786 }, { "epoch": 4.40425175450027, "grad_norm": 0.31222423911094666, "learning_rate": 1.2386240064763187e-05, "loss": 1.2076, "step": 14787 }, { "epoch": 4.404549600699939, "grad_norm": 0.4469805061817169, "learning_rate": 1.2385303351658042e-05, "loss": 1.2248, "step": 14788 }, { "epoch": 4.404847446899607, "grad_norm": 0.3180270195007324, "learning_rate": 1.2384366616360317e-05, "loss": 1.2218, "step": 14789 }, { "epoch": 4.405145293099276, "grad_norm": 0.3021213412284851, "learning_rate": 1.2383429858878723e-05, "loss": 1.2304, "step": 14790 }, { "epoch": 4.405443139298945, "grad_norm": 0.36727070808410645, "learning_rate": 1.238249307922198e-05, "loss": 1.2406, "step": 14791 }, { "epoch": 4.405740985498613, "grad_norm": 0.33401358127593994, "learning_rate": 1.2381556277398796e-05, "loss": 1.2157, "step": 14792 }, { "epoch": 4.4060388316982815, "grad_norm": 0.24210068583488464, "learning_rate": 1.2380619453417895e-05, "loss": 1.2189, "step": 14793 }, { "epoch": 4.406336677897951, "grad_norm": 0.35449934005737305, "learning_rate": 1.2379682607287988e-05, "loss": 1.2329, "step": 14794 }, { "epoch": 4.406634524097619, "grad_norm": 0.3415757715702057, "learning_rate": 1.2378745739017795e-05, "loss": 1.2203, "step": 14795 }, { "epoch": 4.406932370297287, "grad_norm": 0.305342435836792, "learning_rate": 1.2377808848616029e-05, "loss": 1.2099, "step": 14796 }, { "epoch": 4.407230216496957, "grad_norm": 0.5291569828987122, "learning_rate": 1.237687193609141e-05, "loss": 1.2111, "step": 14797 }, { "epoch": 4.407528062696625, "grad_norm": 0.3190738260746002, "learning_rate": 1.2375935001452652e-05, "loss": 1.2317, "step": 14798 }, { "epoch": 4.407825908896294, "grad_norm": 0.42422544956207275, "learning_rate": 1.2374998044708471e-05, "loss": 1.2084, "step": 14799 }, { "epoch": 4.408123755095962, "grad_norm": 0.3175092041492462, "learning_rate": 1.2374061065867592e-05, "loss": 1.2224, "step": 14800 }, { "epoch": 4.408421601295631, "grad_norm": 0.3442769944667816, "learning_rate": 1.2373124064938722e-05, "loss": 1.2054, "step": 14801 }, { "epoch": 4.4087194474953, "grad_norm": 0.44270092248916626, "learning_rate": 1.2372187041930588e-05, "loss": 1.2225, "step": 14802 }, { "epoch": 4.409017293694968, "grad_norm": 0.25048506259918213, "learning_rate": 1.2371249996851903e-05, "loss": 1.2283, "step": 14803 }, { "epoch": 4.409315139894637, "grad_norm": 0.36463814973831177, "learning_rate": 1.2370312929711383e-05, "loss": 1.2071, "step": 14804 }, { "epoch": 4.409612986094306, "grad_norm": 0.34449633955955505, "learning_rate": 1.2369375840517752e-05, "loss": 1.2133, "step": 14805 }, { "epoch": 4.409910832293974, "grad_norm": 0.26221033930778503, "learning_rate": 1.2368438729279725e-05, "loss": 1.209, "step": 14806 }, { "epoch": 4.4102086784936425, "grad_norm": 0.4529891610145569, "learning_rate": 1.2367501596006023e-05, "loss": 1.2131, "step": 14807 }, { "epoch": 4.410506524693312, "grad_norm": 0.26924896240234375, "learning_rate": 1.2366564440705363e-05, "loss": 1.2267, "step": 14808 }, { "epoch": 4.41080437089298, "grad_norm": 0.35140368342399597, "learning_rate": 1.2365627263386468e-05, "loss": 1.2187, "step": 14809 }, { "epoch": 4.411102217092648, "grad_norm": 0.33502423763275146, "learning_rate": 1.2364690064058052e-05, "loss": 1.2154, "step": 14810 }, { "epoch": 4.411400063292318, "grad_norm": 0.3482076823711395, "learning_rate": 1.2363752842728836e-05, "loss": 1.229, "step": 14811 }, { "epoch": 4.411697909491986, "grad_norm": 0.40956559777259827, "learning_rate": 1.236281559940754e-05, "loss": 1.222, "step": 14812 }, { "epoch": 4.411995755691654, "grad_norm": 0.32557663321495056, "learning_rate": 1.2361878334102885e-05, "loss": 1.2239, "step": 14813 }, { "epoch": 4.4122936018913235, "grad_norm": 0.42252591252326965, "learning_rate": 1.2360941046823596e-05, "loss": 1.2478, "step": 14814 }, { "epoch": 4.412591448090992, "grad_norm": 0.2798241674900055, "learning_rate": 1.2360003737578383e-05, "loss": 1.2122, "step": 14815 }, { "epoch": 4.412889294290661, "grad_norm": 0.35809919238090515, "learning_rate": 1.2359066406375973e-05, "loss": 1.2075, "step": 14816 }, { "epoch": 4.413187140490329, "grad_norm": 0.24428319931030273, "learning_rate": 1.2358129053225088e-05, "loss": 1.2215, "step": 14817 }, { "epoch": 4.413484986689998, "grad_norm": 0.38702455163002014, "learning_rate": 1.2357191678134443e-05, "loss": 1.2066, "step": 14818 }, { "epoch": 4.413782832889667, "grad_norm": 0.265264630317688, "learning_rate": 1.2356254281112766e-05, "loss": 1.2069, "step": 14819 }, { "epoch": 4.414080679089335, "grad_norm": 0.3403869569301605, "learning_rate": 1.2355316862168776e-05, "loss": 1.217, "step": 14820 }, { "epoch": 4.4143785252890035, "grad_norm": 0.27034205198287964, "learning_rate": 1.2354379421311192e-05, "loss": 1.224, "step": 14821 }, { "epoch": 4.414676371488673, "grad_norm": 0.34566351771354675, "learning_rate": 1.2353441958548736e-05, "loss": 1.2247, "step": 14822 }, { "epoch": 4.414974217688341, "grad_norm": 0.2551047205924988, "learning_rate": 1.2352504473890135e-05, "loss": 1.2262, "step": 14823 }, { "epoch": 4.415272063888009, "grad_norm": 0.3116854727268219, "learning_rate": 1.2351566967344109e-05, "loss": 1.2063, "step": 14824 }, { "epoch": 4.415569910087679, "grad_norm": 0.29129716753959656, "learning_rate": 1.2350629438919379e-05, "loss": 1.2205, "step": 14825 }, { "epoch": 4.415867756287347, "grad_norm": 0.28079909086227417, "learning_rate": 1.2349691888624667e-05, "loss": 1.2141, "step": 14826 }, { "epoch": 4.416165602487016, "grad_norm": 0.2966746389865875, "learning_rate": 1.2348754316468699e-05, "loss": 1.2275, "step": 14827 }, { "epoch": 4.4164634486866845, "grad_norm": 0.24404703080654144, "learning_rate": 1.2347816722460196e-05, "loss": 1.2214, "step": 14828 }, { "epoch": 4.416761294886353, "grad_norm": 0.286882609128952, "learning_rate": 1.2346879106607878e-05, "loss": 1.2331, "step": 14829 }, { "epoch": 4.417059141086022, "grad_norm": 0.26083606481552124, "learning_rate": 1.2345941468920476e-05, "loss": 1.2353, "step": 14830 }, { "epoch": 4.41735698728569, "grad_norm": 0.27248531579971313, "learning_rate": 1.234500380940671e-05, "loss": 1.2112, "step": 14831 }, { "epoch": 4.417654833485359, "grad_norm": 0.253886878490448, "learning_rate": 1.2344066128075303e-05, "loss": 1.2263, "step": 14832 }, { "epoch": 4.417952679685028, "grad_norm": 0.27620503306388855, "learning_rate": 1.2343128424934978e-05, "loss": 1.2071, "step": 14833 }, { "epoch": 4.418250525884696, "grad_norm": 0.2539706528186798, "learning_rate": 1.2342190699994461e-05, "loss": 1.2246, "step": 14834 }, { "epoch": 4.418548372084365, "grad_norm": 0.32821041345596313, "learning_rate": 1.2341252953262477e-05, "loss": 1.2081, "step": 14835 }, { "epoch": 4.418846218284034, "grad_norm": 0.2732594311237335, "learning_rate": 1.2340315184747749e-05, "loss": 1.2381, "step": 14836 }, { "epoch": 4.419144064483702, "grad_norm": 0.35821712017059326, "learning_rate": 1.2339377394459006e-05, "loss": 1.2319, "step": 14837 }, { "epoch": 4.419441910683371, "grad_norm": 0.2847093641757965, "learning_rate": 1.2338439582404969e-05, "loss": 1.2155, "step": 14838 }, { "epoch": 4.41973975688304, "grad_norm": 0.41309723258018494, "learning_rate": 1.2337501748594362e-05, "loss": 1.2232, "step": 14839 }, { "epoch": 4.420037603082708, "grad_norm": 0.29065367579460144, "learning_rate": 1.2336563893035913e-05, "loss": 1.2236, "step": 14840 }, { "epoch": 4.420335449282377, "grad_norm": 0.3107321262359619, "learning_rate": 1.2335626015738352e-05, "loss": 1.2126, "step": 14841 }, { "epoch": 4.4206332954820455, "grad_norm": 0.2843754291534424, "learning_rate": 1.2334688116710396e-05, "loss": 1.2189, "step": 14842 }, { "epoch": 4.420931141681714, "grad_norm": 0.26646536588668823, "learning_rate": 1.2333750195960776e-05, "loss": 1.2059, "step": 14843 }, { "epoch": 4.421228987881383, "grad_norm": 0.2937961220741272, "learning_rate": 1.233281225349822e-05, "loss": 1.2224, "step": 14844 }, { "epoch": 4.421526834081051, "grad_norm": 0.3047434687614441, "learning_rate": 1.2331874289331449e-05, "loss": 1.2273, "step": 14845 }, { "epoch": 4.42182468028072, "grad_norm": 0.27585846185684204, "learning_rate": 1.2330936303469194e-05, "loss": 1.2048, "step": 14846 }, { "epoch": 4.422122526480389, "grad_norm": 0.2681923806667328, "learning_rate": 1.2329998295920183e-05, "loss": 1.2067, "step": 14847 }, { "epoch": 4.422420372680057, "grad_norm": 0.27037373185157776, "learning_rate": 1.2329060266693142e-05, "loss": 1.2038, "step": 14848 }, { "epoch": 4.422718218879726, "grad_norm": 0.28603503108024597, "learning_rate": 1.2328122215796797e-05, "loss": 1.2197, "step": 14849 }, { "epoch": 4.423016065079395, "grad_norm": 0.27683189511299133, "learning_rate": 1.2327184143239872e-05, "loss": 1.2242, "step": 14850 }, { "epoch": 4.423313911279063, "grad_norm": 0.27266234159469604, "learning_rate": 1.2326246049031102e-05, "loss": 1.2123, "step": 14851 }, { "epoch": 4.4236117574787315, "grad_norm": 0.2785188853740692, "learning_rate": 1.232530793317921e-05, "loss": 1.2184, "step": 14852 }, { "epoch": 4.423909603678401, "grad_norm": 0.2624119818210602, "learning_rate": 1.2324369795692925e-05, "loss": 1.2185, "step": 14853 }, { "epoch": 4.424207449878069, "grad_norm": 0.33412596583366394, "learning_rate": 1.2323431636580979e-05, "loss": 1.2289, "step": 14854 }, { "epoch": 4.424505296077738, "grad_norm": 0.2909590005874634, "learning_rate": 1.2322493455852096e-05, "loss": 1.2244, "step": 14855 }, { "epoch": 4.424803142277407, "grad_norm": 0.33464330434799194, "learning_rate": 1.2321555253515005e-05, "loss": 1.2262, "step": 14856 }, { "epoch": 4.425100988477075, "grad_norm": 0.38978371024131775, "learning_rate": 1.2320617029578438e-05, "loss": 1.2211, "step": 14857 }, { "epoch": 4.425398834676744, "grad_norm": 0.3329388201236725, "learning_rate": 1.2319678784051121e-05, "loss": 1.2378, "step": 14858 }, { "epoch": 4.425696680876412, "grad_norm": 0.3889275789260864, "learning_rate": 1.2318740516941786e-05, "loss": 1.2158, "step": 14859 }, { "epoch": 4.425994527076081, "grad_norm": 0.34946542978286743, "learning_rate": 1.231780222825916e-05, "loss": 1.222, "step": 14860 }, { "epoch": 4.42629237327575, "grad_norm": 0.315692275762558, "learning_rate": 1.2316863918011975e-05, "loss": 1.2102, "step": 14861 }, { "epoch": 4.426590219475418, "grad_norm": 0.2709731161594391, "learning_rate": 1.2315925586208958e-05, "loss": 1.2273, "step": 14862 }, { "epoch": 4.426888065675087, "grad_norm": 0.3175421953201294, "learning_rate": 1.231498723285884e-05, "loss": 1.2106, "step": 14863 }, { "epoch": 4.427185911874756, "grad_norm": 0.2550317049026489, "learning_rate": 1.2314048857970354e-05, "loss": 1.2278, "step": 14864 }, { "epoch": 4.427483758074424, "grad_norm": 0.365434467792511, "learning_rate": 1.231311046155223e-05, "loss": 1.2143, "step": 14865 }, { "epoch": 4.427781604274093, "grad_norm": 0.3138810396194458, "learning_rate": 1.2312172043613197e-05, "loss": 1.2314, "step": 14866 }, { "epoch": 4.428079450473762, "grad_norm": 0.32338428497314453, "learning_rate": 1.2311233604161984e-05, "loss": 1.2055, "step": 14867 }, { "epoch": 4.42837729667343, "grad_norm": 0.3391285240650177, "learning_rate": 1.2310295143207328e-05, "loss": 1.2074, "step": 14868 }, { "epoch": 4.428675142873099, "grad_norm": 0.3374027609825134, "learning_rate": 1.2309356660757953e-05, "loss": 1.2198, "step": 14869 }, { "epoch": 4.428972989072768, "grad_norm": 0.29991164803504944, "learning_rate": 1.2308418156822599e-05, "loss": 1.2398, "step": 14870 }, { "epoch": 4.429270835272436, "grad_norm": 0.36310943961143494, "learning_rate": 1.2307479631409992e-05, "loss": 1.2222, "step": 14871 }, { "epoch": 4.429568681472105, "grad_norm": 0.35218074917793274, "learning_rate": 1.2306541084528864e-05, "loss": 1.2158, "step": 14872 }, { "epoch": 4.4298665276717735, "grad_norm": 0.2511431872844696, "learning_rate": 1.230560251618795e-05, "loss": 1.2144, "step": 14873 }, { "epoch": 4.430164373871442, "grad_norm": 0.34200379252433777, "learning_rate": 1.2304663926395977e-05, "loss": 1.249, "step": 14874 }, { "epoch": 4.430462220071111, "grad_norm": 0.33715832233428955, "learning_rate": 1.2303725315161682e-05, "loss": 1.2203, "step": 14875 }, { "epoch": 4.430760066270779, "grad_norm": 0.3251497447490692, "learning_rate": 1.2302786682493797e-05, "loss": 1.2208, "step": 14876 }, { "epoch": 4.431057912470448, "grad_norm": 0.3092595636844635, "learning_rate": 1.2301848028401055e-05, "loss": 1.2414, "step": 14877 }, { "epoch": 4.431355758670117, "grad_norm": 0.26255378127098083, "learning_rate": 1.2300909352892192e-05, "loss": 1.2122, "step": 14878 }, { "epoch": 4.431653604869785, "grad_norm": 0.3438512980937958, "learning_rate": 1.2299970655975935e-05, "loss": 1.2061, "step": 14879 }, { "epoch": 4.4319514510694535, "grad_norm": 0.2744996249675751, "learning_rate": 1.229903193766102e-05, "loss": 1.2267, "step": 14880 }, { "epoch": 4.432249297269123, "grad_norm": 0.3187209665775299, "learning_rate": 1.229809319795618e-05, "loss": 1.2147, "step": 14881 }, { "epoch": 4.432547143468791, "grad_norm": 0.6051568388938904, "learning_rate": 1.2297154436870155e-05, "loss": 1.2268, "step": 14882 }, { "epoch": 4.43284498966846, "grad_norm": 0.6375366449356079, "learning_rate": 1.2296215654411674e-05, "loss": 1.2223, "step": 14883 }, { "epoch": 4.433142835868129, "grad_norm": 0.29692256450653076, "learning_rate": 1.2295276850589471e-05, "loss": 1.2285, "step": 14884 }, { "epoch": 4.433440682067797, "grad_norm": 0.37519916892051697, "learning_rate": 1.229433802541228e-05, "loss": 1.2154, "step": 14885 }, { "epoch": 4.433738528267466, "grad_norm": 0.2912464141845703, "learning_rate": 1.2293399178888836e-05, "loss": 1.2194, "step": 14886 }, { "epoch": 4.4340363744671345, "grad_norm": 0.4365774393081665, "learning_rate": 1.2292460311027878e-05, "loss": 1.2401, "step": 14887 }, { "epoch": 4.434334220666803, "grad_norm": 0.29991504549980164, "learning_rate": 1.2291521421838135e-05, "loss": 1.2164, "step": 14888 }, { "epoch": 4.434632066866472, "grad_norm": 0.37065500020980835, "learning_rate": 1.229058251132835e-05, "loss": 1.2142, "step": 14889 }, { "epoch": 4.43492991306614, "grad_norm": 0.2551129162311554, "learning_rate": 1.2289643579507251e-05, "loss": 1.2062, "step": 14890 }, { "epoch": 4.435227759265809, "grad_norm": 0.41739919781684875, "learning_rate": 1.2288704626383576e-05, "loss": 1.2331, "step": 14891 }, { "epoch": 4.435525605465478, "grad_norm": 0.2787053883075714, "learning_rate": 1.2287765651966064e-05, "loss": 1.229, "step": 14892 }, { "epoch": 4.435823451665146, "grad_norm": 0.38573333621025085, "learning_rate": 1.2286826656263445e-05, "loss": 1.2234, "step": 14893 }, { "epoch": 4.4361212978648155, "grad_norm": 0.3470078110694885, "learning_rate": 1.2285887639284461e-05, "loss": 1.2266, "step": 14894 }, { "epoch": 4.436419144064484, "grad_norm": 0.2691015601158142, "learning_rate": 1.2284948601037847e-05, "loss": 1.2335, "step": 14895 }, { "epoch": 4.436716990264152, "grad_norm": 0.2952558994293213, "learning_rate": 1.228400954153234e-05, "loss": 1.2217, "step": 14896 }, { "epoch": 4.437014836463821, "grad_norm": 0.3009900748729706, "learning_rate": 1.2283070460776674e-05, "loss": 1.2047, "step": 14897 }, { "epoch": 4.43731268266349, "grad_norm": 0.3212072551250458, "learning_rate": 1.228213135877959e-05, "loss": 1.2204, "step": 14898 }, { "epoch": 4.437610528863158, "grad_norm": 0.2819103002548218, "learning_rate": 1.2281192235549822e-05, "loss": 1.2143, "step": 14899 }, { "epoch": 4.437908375062827, "grad_norm": 0.3015141189098358, "learning_rate": 1.2280253091096108e-05, "loss": 1.2301, "step": 14900 }, { "epoch": 4.4382062212624955, "grad_norm": 0.33626100420951843, "learning_rate": 1.2279313925427188e-05, "loss": 1.2429, "step": 14901 }, { "epoch": 4.438504067462164, "grad_norm": 0.3932854235172272, "learning_rate": 1.22783747385518e-05, "loss": 1.236, "step": 14902 }, { "epoch": 4.438801913661833, "grad_norm": 0.30726158618927, "learning_rate": 1.2277435530478679e-05, "loss": 1.1996, "step": 14903 }, { "epoch": 4.439099759861501, "grad_norm": 0.319837749004364, "learning_rate": 1.2276496301216564e-05, "loss": 1.2312, "step": 14904 }, { "epoch": 4.439397606061171, "grad_norm": 0.28553327918052673, "learning_rate": 1.2275557050774191e-05, "loss": 1.2307, "step": 14905 }, { "epoch": 4.439695452260839, "grad_norm": 0.3147313892841339, "learning_rate": 1.227461777916031e-05, "loss": 1.2167, "step": 14906 }, { "epoch": 4.439993298460507, "grad_norm": 0.316043883562088, "learning_rate": 1.2273678486383647e-05, "loss": 1.2276, "step": 14907 }, { "epoch": 4.4402911446601765, "grad_norm": 0.2953035831451416, "learning_rate": 1.2272739172452947e-05, "loss": 1.1966, "step": 14908 }, { "epoch": 4.440588990859845, "grad_norm": 0.43210822343826294, "learning_rate": 1.227179983737695e-05, "loss": 1.2325, "step": 14909 }, { "epoch": 4.440886837059513, "grad_norm": 0.4180282950401306, "learning_rate": 1.2270860481164391e-05, "loss": 1.2111, "step": 14910 }, { "epoch": 4.441184683259182, "grad_norm": 0.3034069836139679, "learning_rate": 1.2269921103824015e-05, "loss": 1.2263, "step": 14911 }, { "epoch": 4.441482529458851, "grad_norm": 0.6624096632003784, "learning_rate": 1.2268981705364556e-05, "loss": 1.2138, "step": 14912 }, { "epoch": 4.441780375658519, "grad_norm": 0.4512910842895508, "learning_rate": 1.2268042285794761e-05, "loss": 1.2096, "step": 14913 }, { "epoch": 4.442078221858188, "grad_norm": 0.4340059757232666, "learning_rate": 1.2267102845123364e-05, "loss": 1.228, "step": 14914 }, { "epoch": 4.442376068057857, "grad_norm": 0.550622284412384, "learning_rate": 1.226616338335911e-05, "loss": 1.2339, "step": 14915 }, { "epoch": 4.442673914257525, "grad_norm": 0.3245103657245636, "learning_rate": 1.2265223900510734e-05, "loss": 1.214, "step": 14916 }, { "epoch": 4.442971760457194, "grad_norm": 0.5484495162963867, "learning_rate": 1.2264284396586982e-05, "loss": 1.2391, "step": 14917 }, { "epoch": 4.443269606656862, "grad_norm": 0.2908317744731903, "learning_rate": 1.2263344871596595e-05, "loss": 1.2207, "step": 14918 }, { "epoch": 4.443567452856531, "grad_norm": 0.3075171709060669, "learning_rate": 1.2262405325548313e-05, "loss": 1.2199, "step": 14919 }, { "epoch": 4.4438652990562, "grad_norm": 0.4471679925918579, "learning_rate": 1.2261465758450877e-05, "loss": 1.226, "step": 14920 }, { "epoch": 4.444163145255868, "grad_norm": 0.3009333908557892, "learning_rate": 1.2260526170313027e-05, "loss": 1.2263, "step": 14921 }, { "epoch": 4.4444609914555375, "grad_norm": 0.4344242215156555, "learning_rate": 1.2259586561143504e-05, "loss": 1.22, "step": 14922 }, { "epoch": 4.444758837655206, "grad_norm": 0.28826597332954407, "learning_rate": 1.2258646930951058e-05, "loss": 1.2158, "step": 14923 }, { "epoch": 4.445056683854874, "grad_norm": 0.4402480721473694, "learning_rate": 1.2257707279744424e-05, "loss": 1.2161, "step": 14924 }, { "epoch": 4.445354530054543, "grad_norm": 0.35612747073173523, "learning_rate": 1.2256767607532344e-05, "loss": 1.2085, "step": 14925 }, { "epoch": 4.445652376254212, "grad_norm": 0.36830493807792664, "learning_rate": 1.2255827914323568e-05, "loss": 1.2092, "step": 14926 }, { "epoch": 4.44595022245388, "grad_norm": 0.35415905714035034, "learning_rate": 1.2254888200126829e-05, "loss": 1.2053, "step": 14927 }, { "epoch": 4.446248068653549, "grad_norm": 0.35066384077072144, "learning_rate": 1.2253948464950875e-05, "loss": 1.2161, "step": 14928 }, { "epoch": 4.446545914853218, "grad_norm": 0.33780568838119507, "learning_rate": 1.2253008708804451e-05, "loss": 1.2341, "step": 14929 }, { "epoch": 4.446843761052886, "grad_norm": 0.3455325961112976, "learning_rate": 1.22520689316963e-05, "loss": 1.2197, "step": 14930 }, { "epoch": 4.447141607252555, "grad_norm": 0.2523250877857208, "learning_rate": 1.2251129133635158e-05, "loss": 1.2222, "step": 14931 }, { "epoch": 4.4474394534522235, "grad_norm": 0.34670156240463257, "learning_rate": 1.2250189314629778e-05, "loss": 1.2193, "step": 14932 }, { "epoch": 4.447737299651893, "grad_norm": 0.2636018693447113, "learning_rate": 1.22492494746889e-05, "loss": 1.2022, "step": 14933 }, { "epoch": 4.448035145851561, "grad_norm": 0.3007007837295532, "learning_rate": 1.2248309613821267e-05, "loss": 1.2254, "step": 14934 }, { "epoch": 4.448332992051229, "grad_norm": 0.2943074703216553, "learning_rate": 1.2247369732035628e-05, "loss": 1.2271, "step": 14935 }, { "epoch": 4.448630838250899, "grad_norm": 0.26788967847824097, "learning_rate": 1.2246429829340724e-05, "loss": 1.2442, "step": 14936 }, { "epoch": 4.448928684450567, "grad_norm": 0.3123735785484314, "learning_rate": 1.2245489905745298e-05, "loss": 1.2185, "step": 14937 }, { "epoch": 4.449226530650235, "grad_norm": 0.25530245900154114, "learning_rate": 1.2244549961258098e-05, "loss": 1.2148, "step": 14938 }, { "epoch": 4.449524376849904, "grad_norm": 0.2747754156589508, "learning_rate": 1.224360999588787e-05, "loss": 1.2215, "step": 14939 }, { "epoch": 4.449822223049573, "grad_norm": 0.2637244462966919, "learning_rate": 1.2242670009643357e-05, "loss": 1.225, "step": 14940 }, { "epoch": 4.450120069249241, "grad_norm": 0.3310074806213379, "learning_rate": 1.2241730002533303e-05, "loss": 1.2368, "step": 14941 }, { "epoch": 4.45041791544891, "grad_norm": 0.30113866925239563, "learning_rate": 1.2240789974566458e-05, "loss": 1.21, "step": 14942 }, { "epoch": 4.450715761648579, "grad_norm": 0.40723204612731934, "learning_rate": 1.2239849925751568e-05, "loss": 1.2365, "step": 14943 }, { "epoch": 4.451013607848247, "grad_norm": 0.48997971415519714, "learning_rate": 1.2238909856097374e-05, "loss": 1.2199, "step": 14944 }, { "epoch": 4.451311454047916, "grad_norm": 0.27049025893211365, "learning_rate": 1.2237969765612622e-05, "loss": 1.2268, "step": 14945 }, { "epoch": 4.4516093002475845, "grad_norm": 0.3590318262577057, "learning_rate": 1.2237029654306068e-05, "loss": 1.2199, "step": 14946 }, { "epoch": 4.451907146447254, "grad_norm": 0.27747493982315063, "learning_rate": 1.2236089522186452e-05, "loss": 1.2234, "step": 14947 }, { "epoch": 4.452204992646922, "grad_norm": 0.295388400554657, "learning_rate": 1.2235149369262517e-05, "loss": 1.2089, "step": 14948 }, { "epoch": 4.45250283884659, "grad_norm": 0.26772403717041016, "learning_rate": 1.2234209195543016e-05, "loss": 1.2114, "step": 14949 }, { "epoch": 4.45280068504626, "grad_norm": 0.31864991784095764, "learning_rate": 1.2233269001036698e-05, "loss": 1.2265, "step": 14950 }, { "epoch": 4.453098531245928, "grad_norm": 0.301789253950119, "learning_rate": 1.2232328785752304e-05, "loss": 1.2396, "step": 14951 }, { "epoch": 4.453396377445596, "grad_norm": 0.4028361141681671, "learning_rate": 1.2231388549698584e-05, "loss": 1.2165, "step": 14952 }, { "epoch": 4.4536942236452655, "grad_norm": 0.38277995586395264, "learning_rate": 1.223044829288429e-05, "loss": 1.2108, "step": 14953 }, { "epoch": 4.453992069844934, "grad_norm": 0.304472953081131, "learning_rate": 1.2229508015318163e-05, "loss": 1.2275, "step": 14954 }, { "epoch": 4.454289916044602, "grad_norm": 0.3237842321395874, "learning_rate": 1.2228567717008956e-05, "loss": 1.2159, "step": 14955 }, { "epoch": 4.454587762244271, "grad_norm": 0.36103376746177673, "learning_rate": 1.2227627397965417e-05, "loss": 1.2131, "step": 14956 }, { "epoch": 4.45488560844394, "grad_norm": 0.6664847135543823, "learning_rate": 1.2226687058196292e-05, "loss": 1.2336, "step": 14957 }, { "epoch": 4.455183454643608, "grad_norm": 0.2946167290210724, "learning_rate": 1.2225746697710334e-05, "loss": 1.2162, "step": 14958 }, { "epoch": 4.455481300843277, "grad_norm": 0.5487415194511414, "learning_rate": 1.2224806316516288e-05, "loss": 1.2071, "step": 14959 }, { "epoch": 4.4557791470429455, "grad_norm": 0.3942016065120697, "learning_rate": 1.2223865914622908e-05, "loss": 1.2124, "step": 14960 }, { "epoch": 4.456076993242615, "grad_norm": 0.33333155512809753, "learning_rate": 1.2222925492038938e-05, "loss": 1.2308, "step": 14961 }, { "epoch": 4.456374839442283, "grad_norm": 0.31741824746131897, "learning_rate": 1.2221985048773131e-05, "loss": 1.2147, "step": 14962 }, { "epoch": 4.456672685641951, "grad_norm": 0.34265267848968506, "learning_rate": 1.2221044584834231e-05, "loss": 1.2316, "step": 14963 }, { "epoch": 4.456970531841621, "grad_norm": 0.3344602584838867, "learning_rate": 1.2220104100230999e-05, "loss": 1.2107, "step": 14964 }, { "epoch": 4.457268378041289, "grad_norm": 0.40091463923454285, "learning_rate": 1.2219163594972177e-05, "loss": 1.2291, "step": 14965 }, { "epoch": 4.457566224240957, "grad_norm": 0.5136224031448364, "learning_rate": 1.2218223069066515e-05, "loss": 1.22, "step": 14966 }, { "epoch": 4.4578640704406265, "grad_norm": 0.3041544556617737, "learning_rate": 1.2217282522522768e-05, "loss": 1.2207, "step": 14967 }, { "epoch": 4.458161916640295, "grad_norm": 0.5141249299049377, "learning_rate": 1.2216341955349685e-05, "loss": 1.225, "step": 14968 }, { "epoch": 4.458459762839963, "grad_norm": 0.28203797340393066, "learning_rate": 1.221540136755601e-05, "loss": 1.2196, "step": 14969 }, { "epoch": 4.458757609039632, "grad_norm": 0.4156658351421356, "learning_rate": 1.2214460759150506e-05, "loss": 1.2312, "step": 14970 }, { "epoch": 4.459055455239301, "grad_norm": 0.24402017891407013, "learning_rate": 1.221352013014192e-05, "loss": 1.2392, "step": 14971 }, { "epoch": 4.45935330143897, "grad_norm": 0.3900117874145508, "learning_rate": 1.2212579480538999e-05, "loss": 1.2063, "step": 14972 }, { "epoch": 4.459651147638638, "grad_norm": 0.3157013952732086, "learning_rate": 1.2211638810350499e-05, "loss": 1.2264, "step": 14973 }, { "epoch": 4.459948993838307, "grad_norm": 0.34973737597465515, "learning_rate": 1.2210698119585171e-05, "loss": 1.2093, "step": 14974 }, { "epoch": 4.460246840037976, "grad_norm": 0.3000805974006653, "learning_rate": 1.2209757408251768e-05, "loss": 1.2282, "step": 14975 }, { "epoch": 4.460544686237644, "grad_norm": 0.25990670919418335, "learning_rate": 1.2208816676359038e-05, "loss": 1.2203, "step": 14976 }, { "epoch": 4.460842532437312, "grad_norm": 0.3210603594779968, "learning_rate": 1.2207875923915741e-05, "loss": 1.2111, "step": 14977 }, { "epoch": 4.461140378636982, "grad_norm": 0.25940433144569397, "learning_rate": 1.2206935150930623e-05, "loss": 1.2071, "step": 14978 }, { "epoch": 4.46143822483665, "grad_norm": 0.27722057700157166, "learning_rate": 1.220599435741244e-05, "loss": 1.2124, "step": 14979 }, { "epoch": 4.461736071036318, "grad_norm": 0.2738329768180847, "learning_rate": 1.2205053543369942e-05, "loss": 1.2147, "step": 14980 }, { "epoch": 4.4620339172359875, "grad_norm": 0.4540187120437622, "learning_rate": 1.2204112708811888e-05, "loss": 1.2208, "step": 14981 }, { "epoch": 4.462331763435656, "grad_norm": 0.4237889349460602, "learning_rate": 1.2203171853747024e-05, "loss": 1.2141, "step": 14982 }, { "epoch": 4.462629609635324, "grad_norm": 0.3281431198120117, "learning_rate": 1.220223097818411e-05, "loss": 1.2338, "step": 14983 }, { "epoch": 4.462927455834993, "grad_norm": 0.3441547751426697, "learning_rate": 1.2201290082131898e-05, "loss": 1.218, "step": 14984 }, { "epoch": 4.463225302034662, "grad_norm": 0.36121034622192383, "learning_rate": 1.2200349165599139e-05, "loss": 1.215, "step": 14985 }, { "epoch": 4.46352314823433, "grad_norm": 0.4869498908519745, "learning_rate": 1.2199408228594591e-05, "loss": 1.2131, "step": 14986 }, { "epoch": 4.463820994433999, "grad_norm": 0.2633175551891327, "learning_rate": 1.2198467271127003e-05, "loss": 1.2085, "step": 14987 }, { "epoch": 4.464118840633668, "grad_norm": 0.34987908601760864, "learning_rate": 1.2197526293205138e-05, "loss": 1.2242, "step": 14988 }, { "epoch": 4.464416686833337, "grad_norm": 0.23947428166866302, "learning_rate": 1.2196585294837744e-05, "loss": 1.2235, "step": 14989 }, { "epoch": 4.464714533033005, "grad_norm": 0.4048132300376892, "learning_rate": 1.2195644276033578e-05, "loss": 1.223, "step": 14990 }, { "epoch": 4.4650123792326735, "grad_norm": 0.3032701015472412, "learning_rate": 1.2194703236801398e-05, "loss": 1.2376, "step": 14991 }, { "epoch": 4.465310225432343, "grad_norm": 0.30714356899261475, "learning_rate": 1.219376217714995e-05, "loss": 1.2159, "step": 14992 }, { "epoch": 4.465608071632011, "grad_norm": 0.27791735529899597, "learning_rate": 1.2192821097088e-05, "loss": 1.2182, "step": 14993 }, { "epoch": 4.465905917831679, "grad_norm": 0.3147997260093689, "learning_rate": 1.21918799966243e-05, "loss": 1.2437, "step": 14994 }, { "epoch": 4.4662037640313486, "grad_norm": 0.31485629081726074, "learning_rate": 1.2190938875767606e-05, "loss": 1.2258, "step": 14995 }, { "epoch": 4.466501610231017, "grad_norm": 0.3440761864185333, "learning_rate": 1.2189997734526672e-05, "loss": 1.2163, "step": 14996 }, { "epoch": 4.466799456430685, "grad_norm": 0.3485150635242462, "learning_rate": 1.2189056572910256e-05, "loss": 1.2129, "step": 14997 }, { "epoch": 4.467097302630354, "grad_norm": 0.2978845536708832, "learning_rate": 1.2188115390927115e-05, "loss": 1.2229, "step": 14998 }, { "epoch": 4.467395148830023, "grad_norm": 0.2978161871433258, "learning_rate": 1.2187174188586004e-05, "loss": 1.2187, "step": 14999 }, { "epoch": 4.467692995029692, "grad_norm": 0.29847341775894165, "learning_rate": 1.2186232965895681e-05, "loss": 1.2375, "step": 15000 }, { "epoch": 4.467692995029692, "eval_loss": 1.328788161277771, "eval_runtime": 21.4624, "eval_samples_per_second": 80.792, "eval_steps_per_second": 5.079, "step": 15000 }, { "epoch": 4.46799084122936, "grad_norm": 0.2802181541919708, "learning_rate": 1.2185291722864907e-05, "loss": 1.2094, "step": 15001 }, { "epoch": 4.468288687429029, "grad_norm": 0.5638412237167358, "learning_rate": 1.218435045950243e-05, "loss": 1.2276, "step": 15002 }, { "epoch": 4.468586533628698, "grad_norm": 0.5578691959381104, "learning_rate": 1.2183409175817013e-05, "loss": 1.2234, "step": 15003 }, { "epoch": 4.468884379828366, "grad_norm": 0.3073020875453949, "learning_rate": 1.2182467871817415e-05, "loss": 1.2274, "step": 15004 }, { "epoch": 4.4691822260280345, "grad_norm": 0.3942149877548218, "learning_rate": 1.218152654751239e-05, "loss": 1.2297, "step": 15005 }, { "epoch": 4.469480072227704, "grad_norm": 0.29925331473350525, "learning_rate": 1.21805852029107e-05, "loss": 1.2161, "step": 15006 }, { "epoch": 4.469777918427372, "grad_norm": 0.3018183410167694, "learning_rate": 1.2179643838021098e-05, "loss": 1.2259, "step": 15007 }, { "epoch": 4.47007576462704, "grad_norm": 0.3541475534439087, "learning_rate": 1.217870245285235e-05, "loss": 1.2223, "step": 15008 }, { "epoch": 4.47037361082671, "grad_norm": 0.3656480610370636, "learning_rate": 1.2177761047413205e-05, "loss": 1.2288, "step": 15009 }, { "epoch": 4.470671457026378, "grad_norm": 0.31403958797454834, "learning_rate": 1.2176819621712428e-05, "loss": 1.229, "step": 15010 }, { "epoch": 4.470969303226046, "grad_norm": 0.33523741364479065, "learning_rate": 1.2175878175758777e-05, "loss": 1.2113, "step": 15011 }, { "epoch": 4.4712671494257155, "grad_norm": 0.31044355034828186, "learning_rate": 1.2174936709561012e-05, "loss": 1.2339, "step": 15012 }, { "epoch": 4.471564995625384, "grad_norm": 0.30819764733314514, "learning_rate": 1.2173995223127891e-05, "loss": 1.2239, "step": 15013 }, { "epoch": 4.471862841825053, "grad_norm": 0.2760671377182007, "learning_rate": 1.217305371646817e-05, "loss": 1.2172, "step": 15014 }, { "epoch": 4.472160688024721, "grad_norm": 0.2517237663269043, "learning_rate": 1.2172112189590613e-05, "loss": 1.202, "step": 15015 }, { "epoch": 4.47245853422439, "grad_norm": 0.3907548189163208, "learning_rate": 1.217117064250398e-05, "loss": 1.2289, "step": 15016 }, { "epoch": 4.472756380424059, "grad_norm": 0.35696491599082947, "learning_rate": 1.217022907521703e-05, "loss": 1.2165, "step": 15017 }, { "epoch": 4.473054226623727, "grad_norm": 0.26156413555145264, "learning_rate": 1.2169287487738524e-05, "loss": 1.2175, "step": 15018 }, { "epoch": 4.4733520728233955, "grad_norm": 0.25209230184555054, "learning_rate": 1.2168345880077222e-05, "loss": 1.2315, "step": 15019 }, { "epoch": 4.473649919023065, "grad_norm": 0.2818114161491394, "learning_rate": 1.216740425224188e-05, "loss": 1.2133, "step": 15020 }, { "epoch": 4.473947765222733, "grad_norm": 0.26874756813049316, "learning_rate": 1.2166462604241267e-05, "loss": 1.2302, "step": 15021 }, { "epoch": 4.474245611422401, "grad_norm": 0.2840188443660736, "learning_rate": 1.2165520936084139e-05, "loss": 1.2016, "step": 15022 }, { "epoch": 4.474543457622071, "grad_norm": 0.2546764612197876, "learning_rate": 1.2164579247779258e-05, "loss": 1.2161, "step": 15023 }, { "epoch": 4.474841303821739, "grad_norm": 0.39144718647003174, "learning_rate": 1.2163637539335384e-05, "loss": 1.2264, "step": 15024 }, { "epoch": 4.475139150021407, "grad_norm": 0.3180117607116699, "learning_rate": 1.2162695810761283e-05, "loss": 1.2164, "step": 15025 }, { "epoch": 4.4754369962210765, "grad_norm": 0.30262383818626404, "learning_rate": 1.2161754062065714e-05, "loss": 1.2206, "step": 15026 }, { "epoch": 4.475734842420745, "grad_norm": 0.25390443205833435, "learning_rate": 1.2160812293257436e-05, "loss": 1.2224, "step": 15027 }, { "epoch": 4.476032688620414, "grad_norm": 0.2694064676761627, "learning_rate": 1.2159870504345212e-05, "loss": 1.2105, "step": 15028 }, { "epoch": 4.476330534820082, "grad_norm": 0.2786681056022644, "learning_rate": 1.2158928695337811e-05, "loss": 1.2228, "step": 15029 }, { "epoch": 4.476628381019751, "grad_norm": 0.24426937103271484, "learning_rate": 1.2157986866243988e-05, "loss": 1.2198, "step": 15030 }, { "epoch": 4.47692622721942, "grad_norm": 0.2864536941051483, "learning_rate": 1.2157045017072509e-05, "loss": 1.2201, "step": 15031 }, { "epoch": 4.477224073419088, "grad_norm": 0.266191303730011, "learning_rate": 1.2156103147832137e-05, "loss": 1.2339, "step": 15032 }, { "epoch": 4.477521919618757, "grad_norm": 0.3499411344528198, "learning_rate": 1.2155161258531632e-05, "loss": 1.2225, "step": 15033 }, { "epoch": 4.477819765818426, "grad_norm": 0.24897105991840363, "learning_rate": 1.215421934917976e-05, "loss": 1.2188, "step": 15034 }, { "epoch": 4.478117612018094, "grad_norm": 0.33457404375076294, "learning_rate": 1.2153277419785285e-05, "loss": 1.2324, "step": 15035 }, { "epoch": 4.478415458217762, "grad_norm": 0.2837419807910919, "learning_rate": 1.2152335470356968e-05, "loss": 1.2267, "step": 15036 }, { "epoch": 4.478713304417432, "grad_norm": 0.29611486196517944, "learning_rate": 1.2151393500903575e-05, "loss": 1.2239, "step": 15037 }, { "epoch": 4.4790111506171, "grad_norm": 0.3383772671222687, "learning_rate": 1.2150451511433868e-05, "loss": 1.222, "step": 15038 }, { "epoch": 4.479308996816769, "grad_norm": 0.256071001291275, "learning_rate": 1.2149509501956613e-05, "loss": 1.1995, "step": 15039 }, { "epoch": 4.4796068430164375, "grad_norm": 0.3517363369464874, "learning_rate": 1.2148567472480574e-05, "loss": 1.2244, "step": 15040 }, { "epoch": 4.479904689216106, "grad_norm": 0.30239713191986084, "learning_rate": 1.2147625423014516e-05, "loss": 1.2116, "step": 15041 }, { "epoch": 4.480202535415775, "grad_norm": 0.34286734461784363, "learning_rate": 1.2146683353567204e-05, "loss": 1.2291, "step": 15042 }, { "epoch": 4.480500381615443, "grad_norm": 0.3684154152870178, "learning_rate": 1.2145741264147397e-05, "loss": 1.2289, "step": 15043 }, { "epoch": 4.480798227815112, "grad_norm": 0.34026914834976196, "learning_rate": 1.2144799154763868e-05, "loss": 1.2172, "step": 15044 }, { "epoch": 4.481096074014781, "grad_norm": 0.5373833775520325, "learning_rate": 1.214385702542538e-05, "loss": 1.2015, "step": 15045 }, { "epoch": 4.481393920214449, "grad_norm": 0.2383013218641281, "learning_rate": 1.2142914876140694e-05, "loss": 1.2112, "step": 15046 }, { "epoch": 4.481691766414118, "grad_norm": 0.7224148511886597, "learning_rate": 1.2141972706918583e-05, "loss": 1.2205, "step": 15047 }, { "epoch": 4.481989612613787, "grad_norm": 0.5347703099250793, "learning_rate": 1.2141030517767807e-05, "loss": 1.231, "step": 15048 }, { "epoch": 4.482287458813455, "grad_norm": 0.4365391135215759, "learning_rate": 1.2140088308697137e-05, "loss": 1.2163, "step": 15049 }, { "epoch": 4.4825853050131235, "grad_norm": 0.4629792869091034, "learning_rate": 1.2139146079715334e-05, "loss": 1.2422, "step": 15050 }, { "epoch": 4.482883151212793, "grad_norm": 0.426817923784256, "learning_rate": 1.2138203830831165e-05, "loss": 1.2169, "step": 15051 }, { "epoch": 4.483180997412461, "grad_norm": 0.32950761914253235, "learning_rate": 1.2137261562053401e-05, "loss": 1.2174, "step": 15052 }, { "epoch": 4.483478843612129, "grad_norm": 0.506644070148468, "learning_rate": 1.2136319273390807e-05, "loss": 1.2284, "step": 15053 }, { "epoch": 4.4837766898117986, "grad_norm": 0.35195106267929077, "learning_rate": 1.2135376964852145e-05, "loss": 1.2356, "step": 15054 }, { "epoch": 4.484074536011467, "grad_norm": 0.3815198242664337, "learning_rate": 1.213443463644619e-05, "loss": 1.2255, "step": 15055 }, { "epoch": 4.484372382211136, "grad_norm": 0.26191478967666626, "learning_rate": 1.2133492288181703e-05, "loss": 1.2308, "step": 15056 }, { "epoch": 4.484670228410804, "grad_norm": 0.2946739196777344, "learning_rate": 1.2132549920067455e-05, "loss": 1.2274, "step": 15057 }, { "epoch": 4.484968074610473, "grad_norm": 0.30766019225120544, "learning_rate": 1.2131607532112213e-05, "loss": 1.2215, "step": 15058 }, { "epoch": 4.485265920810142, "grad_norm": 0.24756793677806854, "learning_rate": 1.2130665124324745e-05, "loss": 1.2109, "step": 15059 }, { "epoch": 4.48556376700981, "grad_norm": 0.2954517602920532, "learning_rate": 1.2129722696713817e-05, "loss": 1.2137, "step": 15060 }, { "epoch": 4.485861613209479, "grad_norm": 0.30444443225860596, "learning_rate": 1.2128780249288198e-05, "loss": 1.2045, "step": 15061 }, { "epoch": 4.486159459409148, "grad_norm": 0.3060961961746216, "learning_rate": 1.2127837782056659e-05, "loss": 1.2191, "step": 15062 }, { "epoch": 4.486457305608816, "grad_norm": 0.2736704647541046, "learning_rate": 1.2126895295027968e-05, "loss": 1.2448, "step": 15063 }, { "epoch": 4.4867551518084845, "grad_norm": 0.243803009390831, "learning_rate": 1.212595278821089e-05, "loss": 1.2097, "step": 15064 }, { "epoch": 4.487052998008154, "grad_norm": 0.2489195019006729, "learning_rate": 1.2125010261614197e-05, "loss": 1.2203, "step": 15065 }, { "epoch": 4.487350844207822, "grad_norm": 0.3543930649757385, "learning_rate": 1.2124067715246661e-05, "loss": 1.2078, "step": 15066 }, { "epoch": 4.487648690407491, "grad_norm": 0.244902104139328, "learning_rate": 1.2123125149117043e-05, "loss": 1.2378, "step": 15067 }, { "epoch": 4.48794653660716, "grad_norm": 0.42561474442481995, "learning_rate": 1.2122182563234121e-05, "loss": 1.2123, "step": 15068 }, { "epoch": 4.488244382806828, "grad_norm": 0.2636564075946808, "learning_rate": 1.2121239957606661e-05, "loss": 1.2138, "step": 15069 }, { "epoch": 4.488542229006497, "grad_norm": 0.3436940312385559, "learning_rate": 1.2120297332243432e-05, "loss": 1.23, "step": 15070 }, { "epoch": 4.4888400752061655, "grad_norm": 0.2705753743648529, "learning_rate": 1.2119354687153204e-05, "loss": 1.2155, "step": 15071 }, { "epoch": 4.489137921405834, "grad_norm": 0.34441936016082764, "learning_rate": 1.211841202234475e-05, "loss": 1.2256, "step": 15072 }, { "epoch": 4.489435767605503, "grad_norm": 0.24522235989570618, "learning_rate": 1.2117469337826842e-05, "loss": 1.2273, "step": 15073 }, { "epoch": 4.489733613805171, "grad_norm": 0.4094565510749817, "learning_rate": 1.211652663360824e-05, "loss": 1.2345, "step": 15074 }, { "epoch": 4.49003146000484, "grad_norm": 0.24970418214797974, "learning_rate": 1.2115583909697729e-05, "loss": 1.2257, "step": 15075 }, { "epoch": 4.490329306204509, "grad_norm": 0.37969470024108887, "learning_rate": 1.2114641166104074e-05, "loss": 1.2171, "step": 15076 }, { "epoch": 4.490627152404177, "grad_norm": 0.25172027945518494, "learning_rate": 1.211369840283604e-05, "loss": 1.2246, "step": 15077 }, { "epoch": 4.4909249986038455, "grad_norm": 0.5294029712677002, "learning_rate": 1.2112755619902408e-05, "loss": 1.2127, "step": 15078 }, { "epoch": 4.491222844803515, "grad_norm": 0.28505566716194153, "learning_rate": 1.2111812817311943e-05, "loss": 1.239, "step": 15079 }, { "epoch": 4.491520691003183, "grad_norm": 0.41262924671173096, "learning_rate": 1.2110869995073422e-05, "loss": 1.2166, "step": 15080 }, { "epoch": 4.491818537202852, "grad_norm": 0.24879197776317596, "learning_rate": 1.2109927153195612e-05, "loss": 1.2323, "step": 15081 }, { "epoch": 4.492116383402521, "grad_norm": 0.5299745202064514, "learning_rate": 1.2108984291687286e-05, "loss": 1.2162, "step": 15082 }, { "epoch": 4.492414229602189, "grad_norm": 0.39849525690078735, "learning_rate": 1.2108041410557222e-05, "loss": 1.2248, "step": 15083 }, { "epoch": 4.492712075801858, "grad_norm": 0.43437138199806213, "learning_rate": 1.2107098509814185e-05, "loss": 1.2331, "step": 15084 }, { "epoch": 4.4930099220015265, "grad_norm": 0.47554242610931396, "learning_rate": 1.210615558946695e-05, "loss": 1.2294, "step": 15085 }, { "epoch": 4.493307768201195, "grad_norm": 0.28594446182250977, "learning_rate": 1.2105212649524292e-05, "loss": 1.2188, "step": 15086 }, { "epoch": 4.493605614400864, "grad_norm": 0.33014607429504395, "learning_rate": 1.210426968999498e-05, "loss": 1.2345, "step": 15087 }, { "epoch": 4.493903460600532, "grad_norm": 0.3126441538333893, "learning_rate": 1.2103326710887793e-05, "loss": 1.2024, "step": 15088 }, { "epoch": 4.494201306800201, "grad_norm": 0.2798210382461548, "learning_rate": 1.2102383712211499e-05, "loss": 1.2144, "step": 15089 }, { "epoch": 4.49449915299987, "grad_norm": 0.3776017129421234, "learning_rate": 1.2101440693974875e-05, "loss": 1.2151, "step": 15090 }, { "epoch": 4.494796999199538, "grad_norm": 0.2667417824268341, "learning_rate": 1.2100497656186692e-05, "loss": 1.2207, "step": 15091 }, { "epoch": 4.495094845399207, "grad_norm": 0.3946310877799988, "learning_rate": 1.209955459885572e-05, "loss": 1.2264, "step": 15092 }, { "epoch": 4.495392691598876, "grad_norm": 0.25927817821502686, "learning_rate": 1.2098611521990747e-05, "loss": 1.2327, "step": 15093 }, { "epoch": 4.495690537798544, "grad_norm": 0.4457034766674042, "learning_rate": 1.2097668425600532e-05, "loss": 1.24, "step": 15094 }, { "epoch": 4.495988383998213, "grad_norm": 0.28256189823150635, "learning_rate": 1.209672530969386e-05, "loss": 1.2195, "step": 15095 }, { "epoch": 4.496286230197882, "grad_norm": 0.4875396490097046, "learning_rate": 1.20957821742795e-05, "loss": 1.2132, "step": 15096 }, { "epoch": 4.49658407639755, "grad_norm": 0.2714369297027588, "learning_rate": 1.2094839019366229e-05, "loss": 1.2314, "step": 15097 }, { "epoch": 4.496881922597219, "grad_norm": 0.4154144525527954, "learning_rate": 1.209389584496282e-05, "loss": 1.2328, "step": 15098 }, { "epoch": 4.4971797687968875, "grad_norm": 0.2884173095226288, "learning_rate": 1.209295265107805e-05, "loss": 1.2117, "step": 15099 }, { "epoch": 4.497477614996556, "grad_norm": 0.396198570728302, "learning_rate": 1.2092009437720696e-05, "loss": 1.2197, "step": 15100 }, { "epoch": 4.497775461196225, "grad_norm": 0.38598132133483887, "learning_rate": 1.209106620489953e-05, "loss": 1.2479, "step": 15101 }, { "epoch": 4.498073307395893, "grad_norm": 0.40717682242393494, "learning_rate": 1.2090122952623329e-05, "loss": 1.235, "step": 15102 }, { "epoch": 4.498371153595562, "grad_norm": 0.3123427629470825, "learning_rate": 1.2089179680900869e-05, "loss": 1.2152, "step": 15103 }, { "epoch": 4.498668999795231, "grad_norm": 0.30639252066612244, "learning_rate": 1.2088236389740925e-05, "loss": 1.2282, "step": 15104 }, { "epoch": 4.498966845994899, "grad_norm": 0.3442763090133667, "learning_rate": 1.2087293079152277e-05, "loss": 1.2128, "step": 15105 }, { "epoch": 4.4992646921945685, "grad_norm": 0.32975438237190247, "learning_rate": 1.2086349749143698e-05, "loss": 1.2176, "step": 15106 }, { "epoch": 4.499562538394237, "grad_norm": 0.3876637816429138, "learning_rate": 1.2085406399723966e-05, "loss": 1.2276, "step": 15107 }, { "epoch": 4.499860384593905, "grad_norm": 0.2568848729133606, "learning_rate": 1.2084463030901859e-05, "loss": 1.2295, "step": 15108 }, { "epoch": 4.500158230793574, "grad_norm": 0.40290164947509766, "learning_rate": 1.208351964268615e-05, "loss": 1.2211, "step": 15109 }, { "epoch": 4.500456076993243, "grad_norm": 0.31851232051849365, "learning_rate": 1.2082576235085619e-05, "loss": 1.2003, "step": 15110 }, { "epoch": 4.500753923192911, "grad_norm": 0.40014412999153137, "learning_rate": 1.2081632808109043e-05, "loss": 1.2168, "step": 15111 }, { "epoch": 4.50105176939258, "grad_norm": 0.39033442735671997, "learning_rate": 1.20806893617652e-05, "loss": 1.2284, "step": 15112 }, { "epoch": 4.5013496155922486, "grad_norm": 0.34930282831192017, "learning_rate": 1.2079745896062864e-05, "loss": 1.2205, "step": 15113 }, { "epoch": 4.501647461791917, "grad_norm": 0.39152130484580994, "learning_rate": 1.207880241101082e-05, "loss": 1.2218, "step": 15114 }, { "epoch": 4.501945307991586, "grad_norm": 0.29718637466430664, "learning_rate": 1.2077858906617843e-05, "loss": 1.2193, "step": 15115 }, { "epoch": 4.502243154191254, "grad_norm": 0.34129148721694946, "learning_rate": 1.2076915382892705e-05, "loss": 1.2078, "step": 15116 }, { "epoch": 4.502541000390923, "grad_norm": 0.3030122220516205, "learning_rate": 1.2075971839844195e-05, "loss": 1.2281, "step": 15117 }, { "epoch": 4.502838846590592, "grad_norm": 0.385215699672699, "learning_rate": 1.2075028277481084e-05, "loss": 1.2282, "step": 15118 }, { "epoch": 4.50313669279026, "grad_norm": 0.274476557970047, "learning_rate": 1.2074084695812153e-05, "loss": 1.222, "step": 15119 }, { "epoch": 4.503434538989929, "grad_norm": 0.38454511761665344, "learning_rate": 1.207314109484618e-05, "loss": 1.2353, "step": 15120 }, { "epoch": 4.503732385189598, "grad_norm": 0.2698459327220917, "learning_rate": 1.2072197474591948e-05, "loss": 1.2289, "step": 15121 }, { "epoch": 4.504030231389266, "grad_norm": 0.27188873291015625, "learning_rate": 1.2071253835058235e-05, "loss": 1.2335, "step": 15122 }, { "epoch": 4.504328077588935, "grad_norm": 0.28518879413604736, "learning_rate": 1.2070310176253817e-05, "loss": 1.2084, "step": 15123 }, { "epoch": 4.504625923788604, "grad_norm": 0.30094584822654724, "learning_rate": 1.2069366498187478e-05, "loss": 1.2215, "step": 15124 }, { "epoch": 4.504923769988272, "grad_norm": 0.3028467893600464, "learning_rate": 1.2068422800867995e-05, "loss": 1.2167, "step": 15125 }, { "epoch": 4.505221616187941, "grad_norm": 0.43491336703300476, "learning_rate": 1.2067479084304146e-05, "loss": 1.2246, "step": 15126 }, { "epoch": 4.50551946238761, "grad_norm": 0.24532203376293182, "learning_rate": 1.2066535348504716e-05, "loss": 1.2061, "step": 15127 }, { "epoch": 4.505817308587278, "grad_norm": 0.4910340905189514, "learning_rate": 1.2065591593478483e-05, "loss": 1.233, "step": 15128 }, { "epoch": 4.506115154786947, "grad_norm": 0.25325271487236023, "learning_rate": 1.2064647819234227e-05, "loss": 1.226, "step": 15129 }, { "epoch": 4.5064130009866155, "grad_norm": 0.4711860418319702, "learning_rate": 1.2063704025780732e-05, "loss": 1.2235, "step": 15130 }, { "epoch": 4.506710847186284, "grad_norm": 0.2528174817562103, "learning_rate": 1.2062760213126776e-05, "loss": 1.2285, "step": 15131 }, { "epoch": 4.507008693385953, "grad_norm": 0.5877256393432617, "learning_rate": 1.206181638128114e-05, "loss": 1.215, "step": 15132 }, { "epoch": 4.507306539585621, "grad_norm": 0.43910297751426697, "learning_rate": 1.2060872530252605e-05, "loss": 1.2362, "step": 15133 }, { "epoch": 4.5076043857852905, "grad_norm": 0.2653259336948395, "learning_rate": 1.2059928660049958e-05, "loss": 1.2033, "step": 15134 }, { "epoch": 4.507902231984959, "grad_norm": 0.3588211238384247, "learning_rate": 1.2058984770681974e-05, "loss": 1.2182, "step": 15135 }, { "epoch": 4.508200078184627, "grad_norm": 0.26585859060287476, "learning_rate": 1.2058040862157437e-05, "loss": 1.2212, "step": 15136 }, { "epoch": 4.508497924384296, "grad_norm": 0.33971595764160156, "learning_rate": 1.2057096934485126e-05, "loss": 1.2231, "step": 15137 }, { "epoch": 4.508795770583965, "grad_norm": 0.2848120331764221, "learning_rate": 1.2056152987673832e-05, "loss": 1.2117, "step": 15138 }, { "epoch": 4.509093616783633, "grad_norm": 0.34800732135772705, "learning_rate": 1.2055209021732328e-05, "loss": 1.2268, "step": 15139 }, { "epoch": 4.509391462983302, "grad_norm": 0.3181397020816803, "learning_rate": 1.2054265036669401e-05, "loss": 1.21, "step": 15140 }, { "epoch": 4.509689309182971, "grad_norm": 0.45380064845085144, "learning_rate": 1.2053321032493835e-05, "loss": 1.2027, "step": 15141 }, { "epoch": 4.509987155382639, "grad_norm": 0.2743885815143585, "learning_rate": 1.205237700921441e-05, "loss": 1.2308, "step": 15142 }, { "epoch": 4.510285001582308, "grad_norm": 0.3314945697784424, "learning_rate": 1.2051432966839908e-05, "loss": 1.2297, "step": 15143 }, { "epoch": 4.5105828477819765, "grad_norm": 0.38072890043258667, "learning_rate": 1.2050488905379116e-05, "loss": 1.2173, "step": 15144 }, { "epoch": 4.510880693981646, "grad_norm": 0.28301769495010376, "learning_rate": 1.2049544824840815e-05, "loss": 1.2183, "step": 15145 }, { "epoch": 4.511178540181314, "grad_norm": 0.3803479075431824, "learning_rate": 1.2048600725233787e-05, "loss": 1.2137, "step": 15146 }, { "epoch": 4.511476386380982, "grad_norm": 0.27927228808403015, "learning_rate": 1.2047656606566822e-05, "loss": 1.2307, "step": 15147 }, { "epoch": 4.511774232580651, "grad_norm": 0.26439303159713745, "learning_rate": 1.20467124688487e-05, "loss": 1.2235, "step": 15148 }, { "epoch": 4.51207207878032, "grad_norm": 0.3017038404941559, "learning_rate": 1.2045768312088203e-05, "loss": 1.2292, "step": 15149 }, { "epoch": 4.512369924979988, "grad_norm": 0.25685378909111023, "learning_rate": 1.2044824136294118e-05, "loss": 1.2261, "step": 15150 }, { "epoch": 4.5126677711796574, "grad_norm": 0.3176990747451782, "learning_rate": 1.204387994147523e-05, "loss": 1.2246, "step": 15151 }, { "epoch": 4.512965617379326, "grad_norm": 0.2701776623725891, "learning_rate": 1.2042935727640321e-05, "loss": 1.2117, "step": 15152 }, { "epoch": 4.513263463578994, "grad_norm": 0.41709625720977783, "learning_rate": 1.204199149479818e-05, "loss": 1.22, "step": 15153 }, { "epoch": 4.513561309778663, "grad_norm": 0.34297245740890503, "learning_rate": 1.2041047242957586e-05, "loss": 1.224, "step": 15154 }, { "epoch": 4.513859155978332, "grad_norm": 0.2829325795173645, "learning_rate": 1.2040102972127332e-05, "loss": 1.2324, "step": 15155 }, { "epoch": 4.514157002178, "grad_norm": 0.46197330951690674, "learning_rate": 1.2039158682316198e-05, "loss": 1.2214, "step": 15156 }, { "epoch": 4.514454848377669, "grad_norm": 0.27032995223999023, "learning_rate": 1.2038214373532969e-05, "loss": 1.2286, "step": 15157 }, { "epoch": 4.5147526945773375, "grad_norm": 0.4450484812259674, "learning_rate": 1.2037270045786434e-05, "loss": 1.2122, "step": 15158 }, { "epoch": 4.515050540777006, "grad_norm": 0.24114517867565155, "learning_rate": 1.2036325699085375e-05, "loss": 1.2221, "step": 15159 }, { "epoch": 4.515348386976675, "grad_norm": 0.34915485978126526, "learning_rate": 1.2035381333438583e-05, "loss": 1.2177, "step": 15160 }, { "epoch": 4.515646233176343, "grad_norm": 0.3585357367992401, "learning_rate": 1.2034436948854842e-05, "loss": 1.218, "step": 15161 }, { "epoch": 4.515944079376013, "grad_norm": 0.28885915875434875, "learning_rate": 1.2033492545342934e-05, "loss": 1.2206, "step": 15162 }, { "epoch": 4.516241925575681, "grad_norm": 0.30171290040016174, "learning_rate": 1.2032548122911655e-05, "loss": 1.2069, "step": 15163 }, { "epoch": 4.516539771775349, "grad_norm": 0.3419775366783142, "learning_rate": 1.2031603681569784e-05, "loss": 1.2192, "step": 15164 }, { "epoch": 4.5168376179750185, "grad_norm": 0.4366108775138855, "learning_rate": 1.2030659221326114e-05, "loss": 1.2149, "step": 15165 }, { "epoch": 4.517135464174687, "grad_norm": 0.2641226351261139, "learning_rate": 1.2029714742189424e-05, "loss": 1.2189, "step": 15166 }, { "epoch": 4.517433310374355, "grad_norm": 0.3460739254951477, "learning_rate": 1.2028770244168508e-05, "loss": 1.2363, "step": 15167 }, { "epoch": 4.517731156574024, "grad_norm": 0.34574875235557556, "learning_rate": 1.2027825727272153e-05, "loss": 1.2231, "step": 15168 }, { "epoch": 4.518029002773693, "grad_norm": 0.26001298427581787, "learning_rate": 1.2026881191509143e-05, "loss": 1.2278, "step": 15169 }, { "epoch": 4.518326848973361, "grad_norm": 0.3446212112903595, "learning_rate": 1.2025936636888269e-05, "loss": 1.2251, "step": 15170 }, { "epoch": 4.51862469517303, "grad_norm": 0.3986576497554779, "learning_rate": 1.2024992063418316e-05, "loss": 1.2315, "step": 15171 }, { "epoch": 4.5189225413726986, "grad_norm": 0.26384684443473816, "learning_rate": 1.2024047471108081e-05, "loss": 1.2131, "step": 15172 }, { "epoch": 4.519220387572368, "grad_norm": 0.4883301854133606, "learning_rate": 1.202310285996634e-05, "loss": 1.2231, "step": 15173 }, { "epoch": 4.519518233772036, "grad_norm": 0.6936948299407959, "learning_rate": 1.2022158230001888e-05, "loss": 1.2297, "step": 15174 }, { "epoch": 4.519816079971704, "grad_norm": 0.44598180055618286, "learning_rate": 1.2021213581223512e-05, "loss": 1.23, "step": 15175 }, { "epoch": 4.520113926171374, "grad_norm": 0.29919061064720154, "learning_rate": 1.2020268913640003e-05, "loss": 1.2297, "step": 15176 }, { "epoch": 4.520411772371042, "grad_norm": 0.2921298146247864, "learning_rate": 1.201932422726015e-05, "loss": 1.2063, "step": 15177 }, { "epoch": 4.52070961857071, "grad_norm": 0.32926321029663086, "learning_rate": 1.2018379522092737e-05, "loss": 1.2336, "step": 15178 }, { "epoch": 4.5210074647703795, "grad_norm": 0.3749915063381195, "learning_rate": 1.2017434798146565e-05, "loss": 1.2278, "step": 15179 }, { "epoch": 4.521305310970048, "grad_norm": 0.23137181997299194, "learning_rate": 1.201649005543041e-05, "loss": 1.2424, "step": 15180 }, { "epoch": 4.521603157169716, "grad_norm": 0.2518247067928314, "learning_rate": 1.2015545293953067e-05, "loss": 1.2187, "step": 15181 }, { "epoch": 4.521901003369385, "grad_norm": 0.2674430012702942, "learning_rate": 1.2014600513723333e-05, "loss": 1.217, "step": 15182 }, { "epoch": 4.522198849569054, "grad_norm": 0.2746109664440155, "learning_rate": 1.201365571474999e-05, "loss": 1.2169, "step": 15183 }, { "epoch": 4.522496695768722, "grad_norm": 0.2592957019805908, "learning_rate": 1.2012710897041828e-05, "loss": 1.2334, "step": 15184 }, { "epoch": 4.522794541968391, "grad_norm": 0.34598320722579956, "learning_rate": 1.2011766060607641e-05, "loss": 1.2273, "step": 15185 }, { "epoch": 4.52309238816806, "grad_norm": 0.3186018466949463, "learning_rate": 1.2010821205456218e-05, "loss": 1.2471, "step": 15186 }, { "epoch": 4.523390234367728, "grad_norm": 0.2697501480579376, "learning_rate": 1.200987633159635e-05, "loss": 1.227, "step": 15187 }, { "epoch": 4.523688080567397, "grad_norm": 0.29812702536582947, "learning_rate": 1.200893143903683e-05, "loss": 1.2237, "step": 15188 }, { "epoch": 4.5239859267670655, "grad_norm": 0.2813689410686493, "learning_rate": 1.2007986527786449e-05, "loss": 1.2272, "step": 15189 }, { "epoch": 4.524283772966735, "grad_norm": 0.3204902410507202, "learning_rate": 1.2007041597853995e-05, "loss": 1.2092, "step": 15190 }, { "epoch": 4.524581619166403, "grad_norm": 0.3014407455921173, "learning_rate": 1.200609664924826e-05, "loss": 1.2209, "step": 15191 }, { "epoch": 4.524879465366071, "grad_norm": 0.27222299575805664, "learning_rate": 1.2005151681978038e-05, "loss": 1.2063, "step": 15192 }, { "epoch": 4.5251773115657405, "grad_norm": 0.2536955177783966, "learning_rate": 1.2004206696052119e-05, "loss": 1.2309, "step": 15193 }, { "epoch": 4.525475157765409, "grad_norm": 0.27589666843414307, "learning_rate": 1.2003261691479298e-05, "loss": 1.2402, "step": 15194 }, { "epoch": 4.525773003965077, "grad_norm": 0.38982024788856506, "learning_rate": 1.2002316668268364e-05, "loss": 1.208, "step": 15195 }, { "epoch": 4.526070850164746, "grad_norm": 0.40475478768348694, "learning_rate": 1.2001371626428111e-05, "loss": 1.2263, "step": 15196 }, { "epoch": 4.526368696364415, "grad_norm": 0.24196231365203857, "learning_rate": 1.200042656596733e-05, "loss": 1.2229, "step": 15197 }, { "epoch": 4.526666542564083, "grad_norm": 0.47569191455841064, "learning_rate": 1.1999481486894813e-05, "loss": 1.2344, "step": 15198 }, { "epoch": 4.526964388763752, "grad_norm": 0.5380421280860901, "learning_rate": 1.199853638921936e-05, "loss": 1.2235, "step": 15199 }, { "epoch": 4.527262234963421, "grad_norm": 0.3004697561264038, "learning_rate": 1.1997591272949754e-05, "loss": 1.215, "step": 15200 }, { "epoch": 4.52756008116309, "grad_norm": 0.6487541794776917, "learning_rate": 1.1996646138094794e-05, "loss": 1.2185, "step": 15201 }, { "epoch": 4.527857927362758, "grad_norm": 0.2904670536518097, "learning_rate": 1.1995700984663275e-05, "loss": 1.2232, "step": 15202 }, { "epoch": 4.5281557735624265, "grad_norm": 0.6748509407043457, "learning_rate": 1.1994755812663985e-05, "loss": 1.2239, "step": 15203 }, { "epoch": 4.528453619762096, "grad_norm": 0.5291069746017456, "learning_rate": 1.199381062210572e-05, "loss": 1.2232, "step": 15204 }, { "epoch": 4.528751465961764, "grad_norm": 0.47623565793037415, "learning_rate": 1.1992865412997279e-05, "loss": 1.2215, "step": 15205 }, { "epoch": 4.529049312161432, "grad_norm": 0.38587868213653564, "learning_rate": 1.1991920185347452e-05, "loss": 1.2156, "step": 15206 }, { "epoch": 4.529347158361102, "grad_norm": 0.3916699290275574, "learning_rate": 1.199097493916503e-05, "loss": 1.2157, "step": 15207 }, { "epoch": 4.52964500456077, "grad_norm": 0.2876312732696533, "learning_rate": 1.1990029674458812e-05, "loss": 1.2226, "step": 15208 }, { "epoch": 4.529942850760438, "grad_norm": 0.37393510341644287, "learning_rate": 1.1989084391237591e-05, "loss": 1.2319, "step": 15209 }, { "epoch": 4.5302406969601074, "grad_norm": 0.301517516374588, "learning_rate": 1.1988139089510162e-05, "loss": 1.2364, "step": 15210 }, { "epoch": 4.530538543159776, "grad_norm": 0.28969547152519226, "learning_rate": 1.198719376928532e-05, "loss": 1.22, "step": 15211 }, { "epoch": 4.530836389359445, "grad_norm": 0.2703687250614166, "learning_rate": 1.1986248430571858e-05, "loss": 1.1986, "step": 15212 }, { "epoch": 4.531134235559113, "grad_norm": 0.27715325355529785, "learning_rate": 1.1985303073378578e-05, "loss": 1.1986, "step": 15213 }, { "epoch": 4.531432081758782, "grad_norm": 0.2817557752132416, "learning_rate": 1.198435769771427e-05, "loss": 1.2354, "step": 15214 }, { "epoch": 4.53172992795845, "grad_norm": 0.3252423107624054, "learning_rate": 1.1983412303587729e-05, "loss": 1.2324, "step": 15215 }, { "epoch": 4.532027774158119, "grad_norm": 0.26138994097709656, "learning_rate": 1.1982466891007753e-05, "loss": 1.2306, "step": 15216 }, { "epoch": 4.5323256203577875, "grad_norm": 0.3477913737297058, "learning_rate": 1.1981521459983137e-05, "loss": 1.1917, "step": 15217 }, { "epoch": 4.532623466557457, "grad_norm": 0.2689266800880432, "learning_rate": 1.1980576010522678e-05, "loss": 1.2104, "step": 15218 }, { "epoch": 4.532921312757125, "grad_norm": 0.44335079193115234, "learning_rate": 1.1979630542635173e-05, "loss": 1.2249, "step": 15219 }, { "epoch": 4.533219158956793, "grad_norm": 0.2719641625881195, "learning_rate": 1.1978685056329417e-05, "loss": 1.2148, "step": 15220 }, { "epoch": 4.533517005156463, "grad_norm": 0.3772193193435669, "learning_rate": 1.1977739551614205e-05, "loss": 1.2141, "step": 15221 }, { "epoch": 4.533814851356131, "grad_norm": 0.3737548291683197, "learning_rate": 1.1976794028498338e-05, "loss": 1.2214, "step": 15222 }, { "epoch": 4.534112697555799, "grad_norm": 0.31302064657211304, "learning_rate": 1.1975848486990613e-05, "loss": 1.213, "step": 15223 }, { "epoch": 4.5344105437554685, "grad_norm": 0.2857608497142792, "learning_rate": 1.1974902927099826e-05, "loss": 1.2186, "step": 15224 }, { "epoch": 4.534708389955137, "grad_norm": 0.2496808022260666, "learning_rate": 1.1973957348834771e-05, "loss": 1.2274, "step": 15225 }, { "epoch": 4.535006236154805, "grad_norm": 0.3113750219345093, "learning_rate": 1.1973011752204248e-05, "loss": 1.2189, "step": 15226 }, { "epoch": 4.535304082354474, "grad_norm": 0.2819344103336334, "learning_rate": 1.1972066137217056e-05, "loss": 1.2182, "step": 15227 }, { "epoch": 4.535601928554143, "grad_norm": 0.34998559951782227, "learning_rate": 1.197112050388199e-05, "loss": 1.2169, "step": 15228 }, { "epoch": 4.535899774753812, "grad_norm": 0.3272688090801239, "learning_rate": 1.1970174852207853e-05, "loss": 1.2148, "step": 15229 }, { "epoch": 4.53619762095348, "grad_norm": 0.28262338042259216, "learning_rate": 1.196922918220344e-05, "loss": 1.2253, "step": 15230 }, { "epoch": 4.5364954671531486, "grad_norm": 0.3195551335811615, "learning_rate": 1.1968283493877548e-05, "loss": 1.2287, "step": 15231 }, { "epoch": 4.536793313352818, "grad_norm": 0.34402960538864136, "learning_rate": 1.1967337787238977e-05, "loss": 1.2108, "step": 15232 }, { "epoch": 4.537091159552486, "grad_norm": 0.28968119621276855, "learning_rate": 1.1966392062296528e-05, "loss": 1.2332, "step": 15233 }, { "epoch": 4.537389005752154, "grad_norm": 0.3309728503227234, "learning_rate": 1.1965446319058995e-05, "loss": 1.2296, "step": 15234 }, { "epoch": 4.537686851951824, "grad_norm": 0.5360700488090515, "learning_rate": 1.196450055753518e-05, "loss": 1.2272, "step": 15235 }, { "epoch": 4.537984698151492, "grad_norm": 0.2983231246471405, "learning_rate": 1.1963554777733886e-05, "loss": 1.2045, "step": 15236 }, { "epoch": 4.53828254435116, "grad_norm": 0.645355224609375, "learning_rate": 1.1962608979663907e-05, "loss": 1.2208, "step": 15237 }, { "epoch": 4.5385803905508295, "grad_norm": 0.7613120675086975, "learning_rate": 1.1961663163334042e-05, "loss": 1.2237, "step": 15238 }, { "epoch": 4.538878236750498, "grad_norm": 0.2756083607673645, "learning_rate": 1.1960717328753093e-05, "loss": 1.2194, "step": 15239 }, { "epoch": 4.539176082950167, "grad_norm": 0.5473653674125671, "learning_rate": 1.1959771475929864e-05, "loss": 1.2209, "step": 15240 }, { "epoch": 4.539473929149835, "grad_norm": 0.34889474511146545, "learning_rate": 1.1958825604873148e-05, "loss": 1.2211, "step": 15241 }, { "epoch": 4.539771775349504, "grad_norm": 0.5132902264595032, "learning_rate": 1.1957879715591749e-05, "loss": 1.2254, "step": 15242 }, { "epoch": 4.540069621549173, "grad_norm": 0.27200672030448914, "learning_rate": 1.195693380809447e-05, "loss": 1.2166, "step": 15243 }, { "epoch": 4.540367467748841, "grad_norm": 0.3609423339366913, "learning_rate": 1.1955987882390101e-05, "loss": 1.2012, "step": 15244 }, { "epoch": 4.54066531394851, "grad_norm": 0.3313136100769043, "learning_rate": 1.1955041938487452e-05, "loss": 1.2305, "step": 15245 }, { "epoch": 4.540963160148179, "grad_norm": 0.28946390748023987, "learning_rate": 1.1954095976395323e-05, "loss": 1.215, "step": 15246 }, { "epoch": 4.541261006347847, "grad_norm": 0.4484001696109772, "learning_rate": 1.1953149996122518e-05, "loss": 1.2168, "step": 15247 }, { "epoch": 4.5415588525475155, "grad_norm": 0.3039822280406952, "learning_rate": 1.1952203997677832e-05, "loss": 1.2304, "step": 15248 }, { "epoch": 4.541856698747185, "grad_norm": 0.3786887228488922, "learning_rate": 1.1951257981070068e-05, "loss": 1.2292, "step": 15249 }, { "epoch": 4.542154544946853, "grad_norm": 0.2886325716972351, "learning_rate": 1.1950311946308029e-05, "loss": 1.2207, "step": 15250 }, { "epoch": 4.542452391146521, "grad_norm": 0.3433248996734619, "learning_rate": 1.1949365893400515e-05, "loss": 1.2053, "step": 15251 }, { "epoch": 4.5427502373461905, "grad_norm": 0.2520977854728699, "learning_rate": 1.1948419822356332e-05, "loss": 1.2064, "step": 15252 }, { "epoch": 4.543048083545859, "grad_norm": 0.30309560894966125, "learning_rate": 1.1947473733184275e-05, "loss": 1.2362, "step": 15253 }, { "epoch": 4.543345929745527, "grad_norm": 0.3005962371826172, "learning_rate": 1.1946527625893159e-05, "loss": 1.2167, "step": 15254 }, { "epoch": 4.543643775945196, "grad_norm": 0.329084575176239, "learning_rate": 1.1945581500491771e-05, "loss": 1.2235, "step": 15255 }, { "epoch": 4.543941622144865, "grad_norm": 0.2776544392108917, "learning_rate": 1.1944635356988924e-05, "loss": 1.2192, "step": 15256 }, { "epoch": 4.544239468344534, "grad_norm": 0.2981942594051361, "learning_rate": 1.1943689195393415e-05, "loss": 1.2048, "step": 15257 }, { "epoch": 4.544537314544202, "grad_norm": 0.33692529797554016, "learning_rate": 1.1942743015714051e-05, "loss": 1.235, "step": 15258 }, { "epoch": 4.544835160743871, "grad_norm": 0.2756461203098297, "learning_rate": 1.1941796817959634e-05, "loss": 1.215, "step": 15259 }, { "epoch": 4.54513300694354, "grad_norm": 0.42082613706588745, "learning_rate": 1.1940850602138967e-05, "loss": 1.213, "step": 15260 }, { "epoch": 4.545430853143208, "grad_norm": 0.48097655177116394, "learning_rate": 1.1939904368260855e-05, "loss": 1.2113, "step": 15261 }, { "epoch": 4.5457286993428765, "grad_norm": 0.3191584348678589, "learning_rate": 1.1938958116334099e-05, "loss": 1.2366, "step": 15262 }, { "epoch": 4.546026545542546, "grad_norm": 0.620833694934845, "learning_rate": 1.1938011846367503e-05, "loss": 1.2237, "step": 15263 }, { "epoch": 4.546324391742214, "grad_norm": 0.3137511909008026, "learning_rate": 1.1937065558369876e-05, "loss": 1.2284, "step": 15264 }, { "epoch": 4.546622237941882, "grad_norm": 0.9560343027114868, "learning_rate": 1.1936119252350013e-05, "loss": 1.218, "step": 15265 }, { "epoch": 4.546920084141552, "grad_norm": 0.6752145886421204, "learning_rate": 1.1935172928316727e-05, "loss": 1.2355, "step": 15266 }, { "epoch": 4.54721793034122, "grad_norm": 0.6256542205810547, "learning_rate": 1.193422658627882e-05, "loss": 1.2112, "step": 15267 }, { "epoch": 4.547515776540889, "grad_norm": 0.38072991371154785, "learning_rate": 1.193328022624509e-05, "loss": 1.2138, "step": 15268 }, { "epoch": 4.5478136227405574, "grad_norm": 0.6567838191986084, "learning_rate": 1.1932333848224351e-05, "loss": 1.2129, "step": 15269 }, { "epoch": 4.548111468940226, "grad_norm": 0.266481876373291, "learning_rate": 1.1931387452225405e-05, "loss": 1.2063, "step": 15270 }, { "epoch": 4.548409315139895, "grad_norm": 0.38249945640563965, "learning_rate": 1.1930441038257057e-05, "loss": 1.2198, "step": 15271 }, { "epoch": 4.548707161339563, "grad_norm": 0.3684477210044861, "learning_rate": 1.1929494606328112e-05, "loss": 1.2238, "step": 15272 }, { "epoch": 4.549005007539232, "grad_norm": 0.2897947430610657, "learning_rate": 1.1928548156447372e-05, "loss": 1.2036, "step": 15273 }, { "epoch": 4.549302853738901, "grad_norm": 0.3501741290092468, "learning_rate": 1.192760168862365e-05, "loss": 1.2306, "step": 15274 }, { "epoch": 4.549600699938569, "grad_norm": 0.3471895456314087, "learning_rate": 1.1926655202865747e-05, "loss": 1.2336, "step": 15275 }, { "epoch": 4.5498985461382375, "grad_norm": 0.3095743656158447, "learning_rate": 1.1925708699182467e-05, "loss": 1.2175, "step": 15276 }, { "epoch": 4.550196392337907, "grad_norm": 0.28557077050209045, "learning_rate": 1.1924762177582623e-05, "loss": 1.2158, "step": 15277 }, { "epoch": 4.550494238537575, "grad_norm": 0.2939230501651764, "learning_rate": 1.1923815638075019e-05, "loss": 1.2278, "step": 15278 }, { "epoch": 4.550792084737244, "grad_norm": 0.322860985994339, "learning_rate": 1.1922869080668456e-05, "loss": 1.2269, "step": 15279 }, { "epoch": 4.551089930936913, "grad_norm": 0.2795043885707855, "learning_rate": 1.1921922505371744e-05, "loss": 1.218, "step": 15280 }, { "epoch": 4.551387777136581, "grad_norm": 0.2841345965862274, "learning_rate": 1.1920975912193692e-05, "loss": 1.2361, "step": 15281 }, { "epoch": 4.551685623336249, "grad_norm": 0.27874767780303955, "learning_rate": 1.1920029301143105e-05, "loss": 1.216, "step": 15282 }, { "epoch": 4.5519834695359185, "grad_norm": 0.285792738199234, "learning_rate": 1.191908267222879e-05, "loss": 1.2222, "step": 15283 }, { "epoch": 4.552281315735587, "grad_norm": 0.26173120737075806, "learning_rate": 1.191813602545956e-05, "loss": 1.2155, "step": 15284 }, { "epoch": 4.552579161935256, "grad_norm": 0.2594122290611267, "learning_rate": 1.1917189360844211e-05, "loss": 1.233, "step": 15285 }, { "epoch": 4.552877008134924, "grad_norm": 0.3487972319126129, "learning_rate": 1.1916242678391557e-05, "loss": 1.2162, "step": 15286 }, { "epoch": 4.553174854334593, "grad_norm": 0.30598223209381104, "learning_rate": 1.1915295978110409e-05, "loss": 1.2151, "step": 15287 }, { "epoch": 4.553472700534262, "grad_norm": 0.3238547444343567, "learning_rate": 1.191434926000957e-05, "loss": 1.2077, "step": 15288 }, { "epoch": 4.55377054673393, "grad_norm": 0.282140851020813, "learning_rate": 1.191340252409785e-05, "loss": 1.2222, "step": 15289 }, { "epoch": 4.5540683929335986, "grad_norm": 0.37073850631713867, "learning_rate": 1.191245577038406e-05, "loss": 1.2125, "step": 15290 }, { "epoch": 4.554366239133268, "grad_norm": 0.30525949597358704, "learning_rate": 1.1911508998877001e-05, "loss": 1.2221, "step": 15291 }, { "epoch": 4.554664085332936, "grad_norm": 0.34771493077278137, "learning_rate": 1.191056220958549e-05, "loss": 1.2098, "step": 15292 }, { "epoch": 4.554961931532604, "grad_norm": 0.33122164011001587, "learning_rate": 1.190961540251833e-05, "loss": 1.2249, "step": 15293 }, { "epoch": 4.555259777732274, "grad_norm": 0.2992601692676544, "learning_rate": 1.1908668577684335e-05, "loss": 1.2267, "step": 15294 }, { "epoch": 4.555557623931942, "grad_norm": 0.28742265701293945, "learning_rate": 1.190772173509231e-05, "loss": 1.216, "step": 15295 }, { "epoch": 4.555855470131611, "grad_norm": 0.29658302664756775, "learning_rate": 1.1906774874751065e-05, "loss": 1.2213, "step": 15296 }, { "epoch": 4.5561533163312795, "grad_norm": 0.27233144640922546, "learning_rate": 1.1905827996669413e-05, "loss": 1.2107, "step": 15297 }, { "epoch": 4.556451162530948, "grad_norm": 0.3032150864601135, "learning_rate": 1.1904881100856159e-05, "loss": 1.2444, "step": 15298 }, { "epoch": 4.556749008730617, "grad_norm": 0.304830402135849, "learning_rate": 1.1903934187320112e-05, "loss": 1.2176, "step": 15299 }, { "epoch": 4.557046854930285, "grad_norm": 0.3461590111255646, "learning_rate": 1.1902987256070087e-05, "loss": 1.2157, "step": 15300 }, { "epoch": 4.557344701129954, "grad_norm": 0.2693634033203125, "learning_rate": 1.1902040307114892e-05, "loss": 1.2154, "step": 15301 }, { "epoch": 4.557642547329623, "grad_norm": 0.4794714152812958, "learning_rate": 1.1901093340463339e-05, "loss": 1.2279, "step": 15302 }, { "epoch": 4.557940393529291, "grad_norm": 0.3167456388473511, "learning_rate": 1.1900146356124234e-05, "loss": 1.2193, "step": 15303 }, { "epoch": 4.55823823972896, "grad_norm": 0.3777005672454834, "learning_rate": 1.1899199354106386e-05, "loss": 1.2167, "step": 15304 }, { "epoch": 4.558536085928629, "grad_norm": 0.4013977646827698, "learning_rate": 1.1898252334418616e-05, "loss": 1.207, "step": 15305 }, { "epoch": 4.558833932128297, "grad_norm": 0.2669907212257385, "learning_rate": 1.1897305297069727e-05, "loss": 1.2219, "step": 15306 }, { "epoch": 4.559131778327966, "grad_norm": 0.33641159534454346, "learning_rate": 1.189635824206853e-05, "loss": 1.2193, "step": 15307 }, { "epoch": 4.559429624527635, "grad_norm": 0.2593099772930145, "learning_rate": 1.1895411169423845e-05, "loss": 1.2167, "step": 15308 }, { "epoch": 4.559727470727303, "grad_norm": 0.2861332595348358, "learning_rate": 1.1894464079144467e-05, "loss": 1.2154, "step": 15309 }, { "epoch": 4.560025316926972, "grad_norm": 0.25847122073173523, "learning_rate": 1.1893516971239224e-05, "loss": 1.2236, "step": 15310 }, { "epoch": 4.5603231631266405, "grad_norm": 0.3084351122379303, "learning_rate": 1.189256984571692e-05, "loss": 1.2022, "step": 15311 }, { "epoch": 4.560621009326309, "grad_norm": 0.2561310827732086, "learning_rate": 1.1891622702586369e-05, "loss": 1.2329, "step": 15312 }, { "epoch": 4.560918855525978, "grad_norm": 0.3563559353351593, "learning_rate": 1.189067554185638e-05, "loss": 1.2152, "step": 15313 }, { "epoch": 4.561216701725646, "grad_norm": 0.3996835947036743, "learning_rate": 1.1889728363535769e-05, "loss": 1.2264, "step": 15314 }, { "epoch": 4.561514547925315, "grad_norm": 0.2627658247947693, "learning_rate": 1.1888781167633345e-05, "loss": 1.2226, "step": 15315 }, { "epoch": 4.561812394124984, "grad_norm": 0.27959635853767395, "learning_rate": 1.1887833954157924e-05, "loss": 1.2228, "step": 15316 }, { "epoch": 4.562110240324652, "grad_norm": 0.263959139585495, "learning_rate": 1.1886886723118317e-05, "loss": 1.2208, "step": 15317 }, { "epoch": 4.562408086524321, "grad_norm": 0.3678414523601532, "learning_rate": 1.1885939474523337e-05, "loss": 1.2181, "step": 15318 }, { "epoch": 4.56270593272399, "grad_norm": 0.33446651697158813, "learning_rate": 1.1884992208381798e-05, "loss": 1.2123, "step": 15319 }, { "epoch": 4.563003778923658, "grad_norm": 0.41127288341522217, "learning_rate": 1.1884044924702511e-05, "loss": 1.2209, "step": 15320 }, { "epoch": 4.5633016251233265, "grad_norm": 0.5332489609718323, "learning_rate": 1.1883097623494293e-05, "loss": 1.2251, "step": 15321 }, { "epoch": 4.563599471322996, "grad_norm": 0.2750576138496399, "learning_rate": 1.1882150304765954e-05, "loss": 1.2172, "step": 15322 }, { "epoch": 4.563897317522664, "grad_norm": 0.3738904297351837, "learning_rate": 1.1881202968526311e-05, "loss": 1.2118, "step": 15323 }, { "epoch": 4.564195163722333, "grad_norm": 0.2758297324180603, "learning_rate": 1.1880255614784173e-05, "loss": 1.2184, "step": 15324 }, { "epoch": 4.564493009922002, "grad_norm": 0.4255402386188507, "learning_rate": 1.1879308243548363e-05, "loss": 1.2258, "step": 15325 }, { "epoch": 4.56479085612167, "grad_norm": 0.28178808093070984, "learning_rate": 1.1878360854827683e-05, "loss": 1.2376, "step": 15326 }, { "epoch": 4.565088702321339, "grad_norm": 0.29741108417510986, "learning_rate": 1.1877413448630954e-05, "loss": 1.2073, "step": 15327 }, { "epoch": 4.5653865485210074, "grad_norm": 0.3053363561630249, "learning_rate": 1.1876466024966993e-05, "loss": 1.204, "step": 15328 }, { "epoch": 4.565684394720676, "grad_norm": 0.2684979736804962, "learning_rate": 1.1875518583844614e-05, "loss": 1.2124, "step": 15329 }, { "epoch": 4.565982240920345, "grad_norm": 0.25791966915130615, "learning_rate": 1.187457112527263e-05, "loss": 1.2185, "step": 15330 }, { "epoch": 4.566280087120013, "grad_norm": 0.298246830701828, "learning_rate": 1.1873623649259853e-05, "loss": 1.2241, "step": 15331 }, { "epoch": 4.566577933319682, "grad_norm": 0.26753464341163635, "learning_rate": 1.1872676155815102e-05, "loss": 1.2097, "step": 15332 }, { "epoch": 4.566875779519351, "grad_norm": 0.32827845215797424, "learning_rate": 1.1871728644947192e-05, "loss": 1.2149, "step": 15333 }, { "epoch": 4.567173625719019, "grad_norm": 0.5097824931144714, "learning_rate": 1.1870781116664937e-05, "loss": 1.196, "step": 15334 }, { "epoch": 4.567471471918688, "grad_norm": 0.2686941623687744, "learning_rate": 1.1869833570977156e-05, "loss": 1.1954, "step": 15335 }, { "epoch": 4.567769318118357, "grad_norm": 0.4071314334869385, "learning_rate": 1.1868886007892664e-05, "loss": 1.2166, "step": 15336 }, { "epoch": 4.568067164318025, "grad_norm": 0.26355648040771484, "learning_rate": 1.1867938427420273e-05, "loss": 1.2242, "step": 15337 }, { "epoch": 4.568365010517694, "grad_norm": 0.355571448802948, "learning_rate": 1.1866990829568803e-05, "loss": 1.2085, "step": 15338 }, { "epoch": 4.568662856717363, "grad_norm": 0.2709072530269623, "learning_rate": 1.1866043214347066e-05, "loss": 1.2248, "step": 15339 }, { "epoch": 4.568960702917031, "grad_norm": 0.44759824872016907, "learning_rate": 1.1865095581763886e-05, "loss": 1.2345, "step": 15340 }, { "epoch": 4.5692585491167, "grad_norm": 0.29392242431640625, "learning_rate": 1.1864147931828075e-05, "loss": 1.2249, "step": 15341 }, { "epoch": 4.5695563953163685, "grad_norm": 0.3723401725292206, "learning_rate": 1.1863200264548449e-05, "loss": 1.2275, "step": 15342 }, { "epoch": 4.569854241516037, "grad_norm": 0.2689083218574524, "learning_rate": 1.1862252579933827e-05, "loss": 1.2123, "step": 15343 }, { "epoch": 4.570152087715706, "grad_norm": 0.3089952766895294, "learning_rate": 1.1861304877993025e-05, "loss": 1.2261, "step": 15344 }, { "epoch": 4.570449933915374, "grad_norm": 0.27428343892097473, "learning_rate": 1.1860357158734858e-05, "loss": 1.2239, "step": 15345 }, { "epoch": 4.570747780115044, "grad_norm": 0.3443639576435089, "learning_rate": 1.1859409422168151e-05, "loss": 1.2121, "step": 15346 }, { "epoch": 4.571045626314712, "grad_norm": 0.2563531696796417, "learning_rate": 1.1858461668301713e-05, "loss": 1.2194, "step": 15347 }, { "epoch": 4.57134347251438, "grad_norm": 0.48409304022789, "learning_rate": 1.1857513897144369e-05, "loss": 1.229, "step": 15348 }, { "epoch": 4.5716413187140486, "grad_norm": 0.31136196851730347, "learning_rate": 1.1856566108704932e-05, "loss": 1.2239, "step": 15349 }, { "epoch": 4.571939164913718, "grad_norm": 0.35811367630958557, "learning_rate": 1.1855618302992218e-05, "loss": 1.2027, "step": 15350 }, { "epoch": 4.572237011113386, "grad_norm": 0.2767625153064728, "learning_rate": 1.1854670480015053e-05, "loss": 1.2293, "step": 15351 }, { "epoch": 4.572534857313055, "grad_norm": 0.3490542471408844, "learning_rate": 1.185372263978225e-05, "loss": 1.2288, "step": 15352 }, { "epoch": 4.572832703512724, "grad_norm": 0.28721633553504944, "learning_rate": 1.1852774782302632e-05, "loss": 1.2203, "step": 15353 }, { "epoch": 4.573130549712392, "grad_norm": 0.308515727519989, "learning_rate": 1.1851826907585012e-05, "loss": 1.2232, "step": 15354 }, { "epoch": 4.573428395912061, "grad_norm": 0.304751992225647, "learning_rate": 1.185087901563821e-05, "loss": 1.2392, "step": 15355 }, { "epoch": 4.5737262421117295, "grad_norm": 0.2567261755466461, "learning_rate": 1.1849931106471048e-05, "loss": 1.2009, "step": 15356 }, { "epoch": 4.574024088311398, "grad_norm": 0.2771204113960266, "learning_rate": 1.1848983180092343e-05, "loss": 1.2224, "step": 15357 }, { "epoch": 4.574321934511067, "grad_norm": 0.2839631736278534, "learning_rate": 1.1848035236510918e-05, "loss": 1.2296, "step": 15358 }, { "epoch": 4.574619780710735, "grad_norm": 0.264902800321579, "learning_rate": 1.1847087275735589e-05, "loss": 1.2084, "step": 15359 }, { "epoch": 4.574917626910404, "grad_norm": 0.28315335512161255, "learning_rate": 1.1846139297775176e-05, "loss": 1.2227, "step": 15360 }, { "epoch": 4.575215473110073, "grad_norm": 0.3367712199687958, "learning_rate": 1.18451913026385e-05, "loss": 1.2345, "step": 15361 }, { "epoch": 4.575513319309741, "grad_norm": 0.24982547760009766, "learning_rate": 1.184424329033438e-05, "loss": 1.2131, "step": 15362 }, { "epoch": 4.5758111655094105, "grad_norm": 0.42404788732528687, "learning_rate": 1.1843295260871637e-05, "loss": 1.2382, "step": 15363 }, { "epoch": 4.576109011709079, "grad_norm": 0.3827584385871887, "learning_rate": 1.1842347214259092e-05, "loss": 1.2157, "step": 15364 }, { "epoch": 4.576406857908747, "grad_norm": 0.388635516166687, "learning_rate": 1.1841399150505561e-05, "loss": 1.2306, "step": 15365 }, { "epoch": 4.576704704108416, "grad_norm": 0.7678462266921997, "learning_rate": 1.1840451069619871e-05, "loss": 1.2196, "step": 15366 }, { "epoch": 4.577002550308085, "grad_norm": 0.3733859062194824, "learning_rate": 1.1839502971610841e-05, "loss": 1.2138, "step": 15367 }, { "epoch": 4.577300396507753, "grad_norm": 0.5718566179275513, "learning_rate": 1.1838554856487288e-05, "loss": 1.2281, "step": 15368 }, { "epoch": 4.577598242707422, "grad_norm": 0.4892123341560364, "learning_rate": 1.1837606724258037e-05, "loss": 1.2249, "step": 15369 }, { "epoch": 4.5778960889070905, "grad_norm": 0.4518187940120697, "learning_rate": 1.1836658574931912e-05, "loss": 1.2234, "step": 15370 }, { "epoch": 4.578193935106759, "grad_norm": 0.4065011143684387, "learning_rate": 1.1835710408517729e-05, "loss": 1.2182, "step": 15371 }, { "epoch": 4.578491781306428, "grad_norm": 0.4031160771846771, "learning_rate": 1.183476222502431e-05, "loss": 1.2178, "step": 15372 }, { "epoch": 4.578789627506096, "grad_norm": 0.3682171106338501, "learning_rate": 1.1833814024460479e-05, "loss": 1.2186, "step": 15373 }, { "epoch": 4.579087473705766, "grad_norm": 0.38047561049461365, "learning_rate": 1.1832865806835056e-05, "loss": 1.2237, "step": 15374 }, { "epoch": 4.579385319905434, "grad_norm": 0.36890560388565063, "learning_rate": 1.1831917572156869e-05, "loss": 1.226, "step": 15375 }, { "epoch": 4.579683166105102, "grad_norm": 0.29262301325798035, "learning_rate": 1.1830969320434732e-05, "loss": 1.225, "step": 15376 }, { "epoch": 4.5799810123047715, "grad_norm": 0.3057573437690735, "learning_rate": 1.1830021051677474e-05, "loss": 1.2198, "step": 15377 }, { "epoch": 4.58027885850444, "grad_norm": 0.31514081358909607, "learning_rate": 1.1829072765893911e-05, "loss": 1.2226, "step": 15378 }, { "epoch": 4.580576704704108, "grad_norm": 0.3018852770328522, "learning_rate": 1.1828124463092872e-05, "loss": 1.2041, "step": 15379 }, { "epoch": 4.580874550903777, "grad_norm": 0.365410715341568, "learning_rate": 1.1827176143283177e-05, "loss": 1.2233, "step": 15380 }, { "epoch": 4.581172397103446, "grad_norm": 0.28774645924568176, "learning_rate": 1.182622780647365e-05, "loss": 1.2243, "step": 15381 }, { "epoch": 4.581470243303114, "grad_norm": 0.36222878098487854, "learning_rate": 1.182527945267311e-05, "loss": 1.214, "step": 15382 }, { "epoch": 4.581768089502783, "grad_norm": 0.268823504447937, "learning_rate": 1.182433108189039e-05, "loss": 1.2044, "step": 15383 }, { "epoch": 4.582065935702452, "grad_norm": 0.3325064182281494, "learning_rate": 1.1823382694134305e-05, "loss": 1.2306, "step": 15384 }, { "epoch": 4.582363781902121, "grad_norm": 0.3218379616737366, "learning_rate": 1.182243428941368e-05, "loss": 1.2235, "step": 15385 }, { "epoch": 4.582661628101789, "grad_norm": 0.25131484866142273, "learning_rate": 1.1821485867737337e-05, "loss": 1.2322, "step": 15386 }, { "epoch": 4.5829594743014574, "grad_norm": 0.3831733763217926, "learning_rate": 1.1820537429114109e-05, "loss": 1.2131, "step": 15387 }, { "epoch": 4.583257320501126, "grad_norm": 0.318324476480484, "learning_rate": 1.1819588973552812e-05, "loss": 1.2138, "step": 15388 }, { "epoch": 4.583555166700795, "grad_norm": 0.3682376444339752, "learning_rate": 1.1818640501062272e-05, "loss": 1.2151, "step": 15389 }, { "epoch": 4.583853012900463, "grad_norm": 0.4829396605491638, "learning_rate": 1.1817692011651318e-05, "loss": 1.219, "step": 15390 }, { "epoch": 4.5841508591001325, "grad_norm": 0.6712361574172974, "learning_rate": 1.1816743505328764e-05, "loss": 1.239, "step": 15391 }, { "epoch": 4.584448705299801, "grad_norm": 0.388353168964386, "learning_rate": 1.1815794982103443e-05, "loss": 1.215, "step": 15392 }, { "epoch": 4.584746551499469, "grad_norm": 0.354200541973114, "learning_rate": 1.1814846441984182e-05, "loss": 1.2278, "step": 15393 }, { "epoch": 4.585044397699138, "grad_norm": 0.2890750467777252, "learning_rate": 1.1813897884979802e-05, "loss": 1.2172, "step": 15394 }, { "epoch": 4.585342243898807, "grad_norm": 0.3579188883304596, "learning_rate": 1.1812949311099127e-05, "loss": 1.206, "step": 15395 }, { "epoch": 4.585640090098475, "grad_norm": 0.41335612535476685, "learning_rate": 1.1812000720350984e-05, "loss": 1.2205, "step": 15396 }, { "epoch": 4.585937936298144, "grad_norm": 0.3208654820919037, "learning_rate": 1.1811052112744198e-05, "loss": 1.2112, "step": 15397 }, { "epoch": 4.586235782497813, "grad_norm": 0.5678019523620605, "learning_rate": 1.1810103488287598e-05, "loss": 1.209, "step": 15398 }, { "epoch": 4.586533628697481, "grad_norm": 0.26002275943756104, "learning_rate": 1.1809154846990007e-05, "loss": 1.2132, "step": 15399 }, { "epoch": 4.58683147489715, "grad_norm": 0.48814716935157776, "learning_rate": 1.1808206188860254e-05, "loss": 1.2104, "step": 15400 }, { "epoch": 4.5871293210968185, "grad_norm": 0.2996443212032318, "learning_rate": 1.1807257513907158e-05, "loss": 1.2302, "step": 15401 }, { "epoch": 4.587427167296488, "grad_norm": 0.508868396282196, "learning_rate": 1.1806308822139551e-05, "loss": 1.219, "step": 15402 }, { "epoch": 4.587725013496156, "grad_norm": 0.383939266204834, "learning_rate": 1.180536011356626e-05, "loss": 1.2166, "step": 15403 }, { "epoch": 4.588022859695824, "grad_norm": 0.41564667224884033, "learning_rate": 1.1804411388196108e-05, "loss": 1.227, "step": 15404 }, { "epoch": 4.588320705895494, "grad_norm": 0.2985652685165405, "learning_rate": 1.1803462646037927e-05, "loss": 1.2273, "step": 15405 }, { "epoch": 4.588618552095162, "grad_norm": 0.5704119205474854, "learning_rate": 1.1802513887100538e-05, "loss": 1.2222, "step": 15406 }, { "epoch": 4.58891639829483, "grad_norm": 0.4303019642829895, "learning_rate": 1.1801565111392774e-05, "loss": 1.239, "step": 15407 }, { "epoch": 4.589214244494499, "grad_norm": 0.38110148906707764, "learning_rate": 1.1800616318923458e-05, "loss": 1.2265, "step": 15408 }, { "epoch": 4.589512090694168, "grad_norm": 0.3087179362773895, "learning_rate": 1.1799667509701415e-05, "loss": 1.2204, "step": 15409 }, { "epoch": 4.589809936893836, "grad_norm": 0.45208391547203064, "learning_rate": 1.1798718683735482e-05, "loss": 1.2287, "step": 15410 }, { "epoch": 4.590107783093505, "grad_norm": 0.26270031929016113, "learning_rate": 1.1797769841034481e-05, "loss": 1.2302, "step": 15411 }, { "epoch": 4.590405629293174, "grad_norm": 0.5112342834472656, "learning_rate": 1.179682098160724e-05, "loss": 1.2397, "step": 15412 }, { "epoch": 4.590703475492843, "grad_norm": 0.2571510076522827, "learning_rate": 1.1795872105462585e-05, "loss": 1.2139, "step": 15413 }, { "epoch": 4.591001321692511, "grad_norm": 0.4395637810230255, "learning_rate": 1.179492321260935e-05, "loss": 1.2152, "step": 15414 }, { "epoch": 4.5912991678921795, "grad_norm": 0.2952497601509094, "learning_rate": 1.1793974303056355e-05, "loss": 1.209, "step": 15415 }, { "epoch": 4.591597014091848, "grad_norm": 0.28428253531455994, "learning_rate": 1.1793025376812434e-05, "loss": 1.2315, "step": 15416 }, { "epoch": 4.591894860291517, "grad_norm": 0.282982736825943, "learning_rate": 1.1792076433886421e-05, "loss": 1.2078, "step": 15417 }, { "epoch": 4.592192706491185, "grad_norm": 0.2425025850534439, "learning_rate": 1.1791127474287136e-05, "loss": 1.2169, "step": 15418 }, { "epoch": 4.592490552690855, "grad_norm": 0.27073776721954346, "learning_rate": 1.179017849802341e-05, "loss": 1.2076, "step": 15419 }, { "epoch": 4.592788398890523, "grad_norm": 0.28246089816093445, "learning_rate": 1.1789229505104074e-05, "loss": 1.2157, "step": 15420 }, { "epoch": 4.593086245090191, "grad_norm": 0.25326624512672424, "learning_rate": 1.1788280495537954e-05, "loss": 1.2222, "step": 15421 }, { "epoch": 4.5933840912898605, "grad_norm": 0.2939671576023102, "learning_rate": 1.1787331469333885e-05, "loss": 1.2278, "step": 15422 }, { "epoch": 4.593681937489529, "grad_norm": 0.4320828914642334, "learning_rate": 1.1786382426500691e-05, "loss": 1.223, "step": 15423 }, { "epoch": 4.593979783689197, "grad_norm": 0.4275114834308624, "learning_rate": 1.1785433367047207e-05, "loss": 1.2146, "step": 15424 }, { "epoch": 4.594277629888866, "grad_norm": 0.32131412625312805, "learning_rate": 1.1784484290982262e-05, "loss": 1.2204, "step": 15425 }, { "epoch": 4.594575476088535, "grad_norm": 0.5534677505493164, "learning_rate": 1.178353519831468e-05, "loss": 1.2217, "step": 15426 }, { "epoch": 4.594873322288203, "grad_norm": 0.27633920311927795, "learning_rate": 1.1782586089053293e-05, "loss": 1.2145, "step": 15427 }, { "epoch": 4.595171168487872, "grad_norm": 0.5427194833755493, "learning_rate": 1.1781636963206942e-05, "loss": 1.2186, "step": 15428 }, { "epoch": 4.5954690146875405, "grad_norm": 0.2924329340457916, "learning_rate": 1.1780687820784447e-05, "loss": 1.2183, "step": 15429 }, { "epoch": 4.59576686088721, "grad_norm": 0.7174497842788696, "learning_rate": 1.177973866179464e-05, "loss": 1.2177, "step": 15430 }, { "epoch": 4.596064707086878, "grad_norm": 0.25184351205825806, "learning_rate": 1.1778789486246356e-05, "loss": 1.2162, "step": 15431 }, { "epoch": 4.596362553286546, "grad_norm": 0.5384388566017151, "learning_rate": 1.1777840294148419e-05, "loss": 1.2356, "step": 15432 }, { "epoch": 4.596660399486216, "grad_norm": 0.26802632212638855, "learning_rate": 1.1776891085509664e-05, "loss": 1.2403, "step": 15433 }, { "epoch": 4.596958245685884, "grad_norm": 0.527765154838562, "learning_rate": 1.1775941860338924e-05, "loss": 1.1969, "step": 15434 }, { "epoch": 4.597256091885552, "grad_norm": 0.2646426558494568, "learning_rate": 1.1774992618645034e-05, "loss": 1.2053, "step": 15435 }, { "epoch": 4.5975539380852215, "grad_norm": 0.4056178033351898, "learning_rate": 1.1774043360436816e-05, "loss": 1.2151, "step": 15436 }, { "epoch": 4.59785178428489, "grad_norm": 0.3224065899848938, "learning_rate": 1.1773094085723107e-05, "loss": 1.2314, "step": 15437 }, { "epoch": 4.598149630484558, "grad_norm": 0.26547330617904663, "learning_rate": 1.177214479451274e-05, "loss": 1.2156, "step": 15438 }, { "epoch": 4.598447476684227, "grad_norm": 0.31536105275154114, "learning_rate": 1.1771195486814544e-05, "loss": 1.2187, "step": 15439 }, { "epoch": 4.598745322883896, "grad_norm": 0.2992311716079712, "learning_rate": 1.1770246162637353e-05, "loss": 1.2318, "step": 15440 }, { "epoch": 4.599043169083565, "grad_norm": 0.3289506435394287, "learning_rate": 1.1769296821990001e-05, "loss": 1.2394, "step": 15441 }, { "epoch": 4.599341015283233, "grad_norm": 0.31637370586395264, "learning_rate": 1.1768347464881318e-05, "loss": 1.2197, "step": 15442 }, { "epoch": 4.599638861482902, "grad_norm": 0.3021043539047241, "learning_rate": 1.1767398091320136e-05, "loss": 1.2324, "step": 15443 }, { "epoch": 4.599936707682571, "grad_norm": 0.31690776348114014, "learning_rate": 1.1766448701315292e-05, "loss": 1.2231, "step": 15444 }, { "epoch": 4.600234553882239, "grad_norm": 0.3074594736099243, "learning_rate": 1.1765499294875613e-05, "loss": 1.2424, "step": 15445 }, { "epoch": 4.6005324000819074, "grad_norm": 0.28958991169929504, "learning_rate": 1.1764549872009938e-05, "loss": 1.2234, "step": 15446 }, { "epoch": 4.600830246281577, "grad_norm": 0.27834829688072205, "learning_rate": 1.1763600432727098e-05, "loss": 1.225, "step": 15447 }, { "epoch": 4.601128092481245, "grad_norm": 0.3386121392250061, "learning_rate": 1.1762650977035928e-05, "loss": 1.2376, "step": 15448 }, { "epoch": 4.601425938680913, "grad_norm": 0.3685445785522461, "learning_rate": 1.1761701504945258e-05, "loss": 1.2089, "step": 15449 }, { "epoch": 4.6017237848805825, "grad_norm": 0.29060253500938416, "learning_rate": 1.1760752016463923e-05, "loss": 1.2152, "step": 15450 }, { "epoch": 4.602021631080251, "grad_norm": 0.4046565592288971, "learning_rate": 1.1759802511600756e-05, "loss": 1.2211, "step": 15451 }, { "epoch": 4.60231947727992, "grad_norm": 0.25819021463394165, "learning_rate": 1.1758852990364597e-05, "loss": 1.2136, "step": 15452 }, { "epoch": 4.602617323479588, "grad_norm": 0.3166137635707855, "learning_rate": 1.1757903452764274e-05, "loss": 1.2142, "step": 15453 }, { "epoch": 4.602915169679257, "grad_norm": 0.2821340560913086, "learning_rate": 1.1756953898808625e-05, "loss": 1.2203, "step": 15454 }, { "epoch": 4.603213015878925, "grad_norm": 0.32758796215057373, "learning_rate": 1.1756004328506484e-05, "loss": 1.2209, "step": 15455 }, { "epoch": 4.603510862078594, "grad_norm": 0.41818171739578247, "learning_rate": 1.1755054741866678e-05, "loss": 1.2165, "step": 15456 }, { "epoch": 4.603808708278263, "grad_norm": 0.5267404317855835, "learning_rate": 1.1754105138898054e-05, "loss": 1.2176, "step": 15457 }, { "epoch": 4.604106554477932, "grad_norm": 0.2890204191207886, "learning_rate": 1.1753155519609442e-05, "loss": 1.2276, "step": 15458 }, { "epoch": 4.6044044006776, "grad_norm": 0.4943948984146118, "learning_rate": 1.1752205884009675e-05, "loss": 1.2368, "step": 15459 }, { "epoch": 4.6047022468772685, "grad_norm": 0.29402250051498413, "learning_rate": 1.1751256232107589e-05, "loss": 1.215, "step": 15460 }, { "epoch": 4.605000093076938, "grad_norm": 0.5472093820571899, "learning_rate": 1.1750306563912022e-05, "loss": 1.2082, "step": 15461 }, { "epoch": 4.605297939276606, "grad_norm": 0.3013838529586792, "learning_rate": 1.1749356879431806e-05, "loss": 1.2057, "step": 15462 }, { "epoch": 4.605595785476274, "grad_norm": 0.45778924226760864, "learning_rate": 1.1748407178675781e-05, "loss": 1.2119, "step": 15463 }, { "epoch": 4.605893631675944, "grad_norm": 0.4487936496734619, "learning_rate": 1.1747457461652778e-05, "loss": 1.2164, "step": 15464 }, { "epoch": 4.606191477875612, "grad_norm": 0.4020189642906189, "learning_rate": 1.1746507728371638e-05, "loss": 1.2293, "step": 15465 }, { "epoch": 4.60648932407528, "grad_norm": 0.683911919593811, "learning_rate": 1.1745557978841195e-05, "loss": 1.225, "step": 15466 }, { "epoch": 4.606787170274949, "grad_norm": 0.25149503350257874, "learning_rate": 1.1744608213070285e-05, "loss": 1.2149, "step": 15467 }, { "epoch": 4.607085016474618, "grad_norm": 0.35296499729156494, "learning_rate": 1.1743658431067741e-05, "loss": 1.227, "step": 15468 }, { "epoch": 4.607382862674287, "grad_norm": 0.3388794958591461, "learning_rate": 1.174270863284241e-05, "loss": 1.2063, "step": 15469 }, { "epoch": 4.607680708873955, "grad_norm": 0.46058595180511475, "learning_rate": 1.1741758818403117e-05, "loss": 1.2285, "step": 15470 }, { "epoch": 4.607978555073624, "grad_norm": 0.2816953957080841, "learning_rate": 1.1740808987758708e-05, "loss": 1.2243, "step": 15471 }, { "epoch": 4.608276401273293, "grad_norm": 0.349196195602417, "learning_rate": 1.1739859140918016e-05, "loss": 1.2253, "step": 15472 }, { "epoch": 4.608574247472961, "grad_norm": 0.27341097593307495, "learning_rate": 1.1738909277889877e-05, "loss": 1.2106, "step": 15473 }, { "epoch": 4.6088720936726295, "grad_norm": 0.3639007806777954, "learning_rate": 1.1737959398683127e-05, "loss": 1.2246, "step": 15474 }, { "epoch": 4.609169939872299, "grad_norm": 0.26420873403549194, "learning_rate": 1.173700950330661e-05, "loss": 1.2064, "step": 15475 }, { "epoch": 4.609467786071967, "grad_norm": 0.3559967875480652, "learning_rate": 1.173605959176916e-05, "loss": 1.2236, "step": 15476 }, { "epoch": 4.609765632271635, "grad_norm": 0.39635011553764343, "learning_rate": 1.1735109664079616e-05, "loss": 1.2199, "step": 15477 }, { "epoch": 4.610063478471305, "grad_norm": 0.3912164270877838, "learning_rate": 1.1734159720246814e-05, "loss": 1.2112, "step": 15478 }, { "epoch": 4.610361324670973, "grad_norm": 0.28523966670036316, "learning_rate": 1.1733209760279593e-05, "loss": 1.2078, "step": 15479 }, { "epoch": 4.610659170870642, "grad_norm": 0.2978664040565491, "learning_rate": 1.1732259784186793e-05, "loss": 1.2102, "step": 15480 }, { "epoch": 4.6109570170703105, "grad_norm": 0.2843220829963684, "learning_rate": 1.1731309791977248e-05, "loss": 1.215, "step": 15481 }, { "epoch": 4.611254863269979, "grad_norm": 0.32204434275627136, "learning_rate": 1.1730359783659804e-05, "loss": 1.2173, "step": 15482 }, { "epoch": 4.611552709469647, "grad_norm": 0.3042689561843872, "learning_rate": 1.1729409759243293e-05, "loss": 1.2232, "step": 15483 }, { "epoch": 4.611850555669316, "grad_norm": 0.3689742088317871, "learning_rate": 1.1728459718736557e-05, "loss": 1.2217, "step": 15484 }, { "epoch": 4.612148401868985, "grad_norm": 0.545839250087738, "learning_rate": 1.1727509662148435e-05, "loss": 1.234, "step": 15485 }, { "epoch": 4.612446248068654, "grad_norm": 0.2659914791584015, "learning_rate": 1.1726559589487763e-05, "loss": 1.2318, "step": 15486 }, { "epoch": 4.612744094268322, "grad_norm": 0.40329739451408386, "learning_rate": 1.1725609500763386e-05, "loss": 1.2322, "step": 15487 }, { "epoch": 4.6130419404679905, "grad_norm": 0.27586081624031067, "learning_rate": 1.172465939598414e-05, "loss": 1.2176, "step": 15488 }, { "epoch": 4.61333978666766, "grad_norm": 0.5300125479698181, "learning_rate": 1.1723709275158865e-05, "loss": 1.2125, "step": 15489 }, { "epoch": 4.613637632867328, "grad_norm": 0.4316484034061432, "learning_rate": 1.17227591382964e-05, "loss": 1.2147, "step": 15490 }, { "epoch": 4.613935479066996, "grad_norm": 0.4001738131046295, "learning_rate": 1.1721808985405587e-05, "loss": 1.21, "step": 15491 }, { "epoch": 4.614233325266666, "grad_norm": 0.6518998146057129, "learning_rate": 1.1720858816495265e-05, "loss": 1.1933, "step": 15492 }, { "epoch": 4.614531171466334, "grad_norm": 0.3006100356578827, "learning_rate": 1.1719908631574276e-05, "loss": 1.2403, "step": 15493 }, { "epoch": 4.614829017666002, "grad_norm": 0.374083936214447, "learning_rate": 1.1718958430651455e-05, "loss": 1.2228, "step": 15494 }, { "epoch": 4.6151268638656715, "grad_norm": 0.2627086639404297, "learning_rate": 1.1718008213735648e-05, "loss": 1.2205, "step": 15495 }, { "epoch": 4.61542471006534, "grad_norm": 0.2966965436935425, "learning_rate": 1.1717057980835698e-05, "loss": 1.2198, "step": 15496 }, { "epoch": 4.615722556265009, "grad_norm": 0.30310168862342834, "learning_rate": 1.1716107731960435e-05, "loss": 1.2076, "step": 15497 }, { "epoch": 4.616020402464677, "grad_norm": 0.6953670382499695, "learning_rate": 1.171515746711871e-05, "loss": 1.2296, "step": 15498 }, { "epoch": 4.616318248664346, "grad_norm": 0.5464868545532227, "learning_rate": 1.1714207186319364e-05, "loss": 1.1972, "step": 15499 }, { "epoch": 4.616616094864015, "grad_norm": 0.6416321992874146, "learning_rate": 1.1713256889571231e-05, "loss": 1.2038, "step": 15500 }, { "epoch": 4.616616094864015, "eval_loss": 1.3223536014556885, "eval_runtime": 107.9226, "eval_samples_per_second": 16.067, "eval_steps_per_second": 1.01, "step": 15500 }, { "epoch": 4.616913941063683, "grad_norm": 0.5187292098999023, "learning_rate": 1.1712306576883159e-05, "loss": 1.237, "step": 15501 }, { "epoch": 4.617211787263352, "grad_norm": 0.5971845388412476, "learning_rate": 1.1711356248263987e-05, "loss": 1.2101, "step": 15502 }, { "epoch": 4.617509633463021, "grad_norm": 0.3387489914894104, "learning_rate": 1.1710405903722558e-05, "loss": 1.2173, "step": 15503 }, { "epoch": 4.617807479662689, "grad_norm": 0.3731728792190552, "learning_rate": 1.1709455543267709e-05, "loss": 1.231, "step": 15504 }, { "epoch": 4.6181053258623574, "grad_norm": 0.28528162837028503, "learning_rate": 1.1708505166908291e-05, "loss": 1.2146, "step": 15505 }, { "epoch": 4.618403172062027, "grad_norm": 0.31289324164390564, "learning_rate": 1.170755477465314e-05, "loss": 1.2259, "step": 15506 }, { "epoch": 4.618701018261695, "grad_norm": 0.25294843316078186, "learning_rate": 1.1706604366511099e-05, "loss": 1.2331, "step": 15507 }, { "epoch": 4.618998864461364, "grad_norm": 0.29154565930366516, "learning_rate": 1.1705653942491011e-05, "loss": 1.2306, "step": 15508 }, { "epoch": 4.6192967106610325, "grad_norm": 0.2750505208969116, "learning_rate": 1.170470350260172e-05, "loss": 1.2082, "step": 15509 }, { "epoch": 4.619594556860701, "grad_norm": 0.2772367596626282, "learning_rate": 1.1703753046852065e-05, "loss": 1.2395, "step": 15510 }, { "epoch": 4.61989240306037, "grad_norm": 0.2575388550758362, "learning_rate": 1.1702802575250893e-05, "loss": 1.1939, "step": 15511 }, { "epoch": 4.620190249260038, "grad_norm": 0.2751673460006714, "learning_rate": 1.1701852087807045e-05, "loss": 1.2306, "step": 15512 }, { "epoch": 4.620488095459707, "grad_norm": 0.3043292462825775, "learning_rate": 1.1700901584529368e-05, "loss": 1.2152, "step": 15513 }, { "epoch": 4.620785941659376, "grad_norm": 0.24301272630691528, "learning_rate": 1.1699951065426698e-05, "loss": 1.2026, "step": 15514 }, { "epoch": 4.621083787859044, "grad_norm": 0.3393288254737854, "learning_rate": 1.169900053050788e-05, "loss": 1.2104, "step": 15515 }, { "epoch": 4.621381634058713, "grad_norm": 0.2605547308921814, "learning_rate": 1.1698049979781766e-05, "loss": 1.2306, "step": 15516 }, { "epoch": 4.621679480258382, "grad_norm": 0.2591628432273865, "learning_rate": 1.1697099413257193e-05, "loss": 1.2205, "step": 15517 }, { "epoch": 4.62197732645805, "grad_norm": 0.3370744287967682, "learning_rate": 1.1696148830943005e-05, "loss": 1.2036, "step": 15518 }, { "epoch": 4.622275172657719, "grad_norm": 0.26255717873573303, "learning_rate": 1.1695198232848047e-05, "loss": 1.2229, "step": 15519 }, { "epoch": 4.622573018857388, "grad_norm": 0.2808741331100464, "learning_rate": 1.1694247618981165e-05, "loss": 1.2335, "step": 15520 }, { "epoch": 4.622870865057056, "grad_norm": 0.24790742993354797, "learning_rate": 1.1693296989351199e-05, "loss": 1.2138, "step": 15521 }, { "epoch": 4.623168711256724, "grad_norm": 0.24825873970985413, "learning_rate": 1.1692346343966999e-05, "loss": 1.2269, "step": 15522 }, { "epoch": 4.623466557456394, "grad_norm": 0.24359354376792908, "learning_rate": 1.1691395682837407e-05, "loss": 1.2123, "step": 15523 }, { "epoch": 4.623764403656062, "grad_norm": 0.3074105679988861, "learning_rate": 1.1690445005971268e-05, "loss": 1.2262, "step": 15524 }, { "epoch": 4.624062249855731, "grad_norm": 0.3166869878768921, "learning_rate": 1.1689494313377426e-05, "loss": 1.2116, "step": 15525 }, { "epoch": 4.624360096055399, "grad_norm": 0.3546818792819977, "learning_rate": 1.1688543605064726e-05, "loss": 1.2141, "step": 15526 }, { "epoch": 4.624657942255068, "grad_norm": 0.3457008898258209, "learning_rate": 1.1687592881042016e-05, "loss": 1.2204, "step": 15527 }, { "epoch": 4.624955788454737, "grad_norm": 0.3059236407279968, "learning_rate": 1.1686642141318139e-05, "loss": 1.2192, "step": 15528 }, { "epoch": 4.625253634654405, "grad_norm": 0.3162040710449219, "learning_rate": 1.168569138590194e-05, "loss": 1.2169, "step": 15529 }, { "epoch": 4.625551480854074, "grad_norm": 0.40984293818473816, "learning_rate": 1.1684740614802271e-05, "loss": 1.2261, "step": 15530 }, { "epoch": 4.625849327053743, "grad_norm": 0.3095340430736542, "learning_rate": 1.1683789828027967e-05, "loss": 1.2285, "step": 15531 }, { "epoch": 4.626147173253411, "grad_norm": 0.48104336857795715, "learning_rate": 1.1682839025587882e-05, "loss": 1.2322, "step": 15532 }, { "epoch": 4.6264450194530795, "grad_norm": 0.4585086405277252, "learning_rate": 1.168188820749086e-05, "loss": 1.2205, "step": 15533 }, { "epoch": 4.626742865652749, "grad_norm": 0.2756181061267853, "learning_rate": 1.168093737374575e-05, "loss": 1.2185, "step": 15534 }, { "epoch": 4.627040711852417, "grad_norm": 0.32118865847587585, "learning_rate": 1.167998652436139e-05, "loss": 1.2071, "step": 15535 }, { "epoch": 4.627338558052086, "grad_norm": 0.37187910079956055, "learning_rate": 1.1679035659346637e-05, "loss": 1.2074, "step": 15536 }, { "epoch": 4.627636404251755, "grad_norm": 0.4208984971046448, "learning_rate": 1.1678084778710335e-05, "loss": 1.2189, "step": 15537 }, { "epoch": 4.627934250451423, "grad_norm": 0.2950340509414673, "learning_rate": 1.1677133882461322e-05, "loss": 1.2394, "step": 15538 }, { "epoch": 4.628232096651092, "grad_norm": 0.3526626229286194, "learning_rate": 1.1676182970608457e-05, "loss": 1.2013, "step": 15539 }, { "epoch": 4.6285299428507605, "grad_norm": 0.31066569685935974, "learning_rate": 1.1675232043160583e-05, "loss": 1.2214, "step": 15540 }, { "epoch": 4.628827789050429, "grad_norm": 0.31988176703453064, "learning_rate": 1.1674281100126544e-05, "loss": 1.2112, "step": 15541 }, { "epoch": 4.629125635250098, "grad_norm": 0.3039456903934479, "learning_rate": 1.167333014151519e-05, "loss": 1.2352, "step": 15542 }, { "epoch": 4.629423481449766, "grad_norm": 0.3332209885120392, "learning_rate": 1.1672379167335368e-05, "loss": 1.2292, "step": 15543 }, { "epoch": 4.629721327649435, "grad_norm": 0.3084508180618286, "learning_rate": 1.1671428177595927e-05, "loss": 1.2331, "step": 15544 }, { "epoch": 4.630019173849104, "grad_norm": 0.4446774423122406, "learning_rate": 1.1670477172305714e-05, "loss": 1.2072, "step": 15545 }, { "epoch": 4.630317020048772, "grad_norm": 0.2686777412891388, "learning_rate": 1.1669526151473577e-05, "loss": 1.2094, "step": 15546 }, { "epoch": 4.630614866248441, "grad_norm": 0.3131929039955139, "learning_rate": 1.166857511510837e-05, "loss": 1.2167, "step": 15547 }, { "epoch": 4.63091271244811, "grad_norm": 0.3872765302658081, "learning_rate": 1.166762406321893e-05, "loss": 1.2354, "step": 15548 }, { "epoch": 4.631210558647778, "grad_norm": 0.3278713524341583, "learning_rate": 1.1666672995814113e-05, "loss": 1.2301, "step": 15549 }, { "epoch": 4.631508404847446, "grad_norm": 0.36869022250175476, "learning_rate": 1.1665721912902766e-05, "loss": 1.2016, "step": 15550 }, { "epoch": 4.631806251047116, "grad_norm": 0.4042108356952667, "learning_rate": 1.1664770814493736e-05, "loss": 1.2217, "step": 15551 }, { "epoch": 4.632104097246784, "grad_norm": 0.3759974241256714, "learning_rate": 1.1663819700595872e-05, "loss": 1.2249, "step": 15552 }, { "epoch": 4.632401943446453, "grad_norm": 0.2642388343811035, "learning_rate": 1.1662868571218029e-05, "loss": 1.2101, "step": 15553 }, { "epoch": 4.6326997896461215, "grad_norm": 0.30808690190315247, "learning_rate": 1.1661917426369051e-05, "loss": 1.2191, "step": 15554 }, { "epoch": 4.63299763584579, "grad_norm": 0.28365305066108704, "learning_rate": 1.166096626605779e-05, "loss": 1.2182, "step": 15555 }, { "epoch": 4.633295482045459, "grad_norm": 0.26377856731414795, "learning_rate": 1.1660015090293087e-05, "loss": 1.2223, "step": 15556 }, { "epoch": 4.633593328245127, "grad_norm": 0.29196006059646606, "learning_rate": 1.1659063899083805e-05, "loss": 1.2169, "step": 15557 }, { "epoch": 4.633891174444796, "grad_norm": 0.2571171224117279, "learning_rate": 1.1658112692438783e-05, "loss": 1.2134, "step": 15558 }, { "epoch": 4.634189020644465, "grad_norm": 0.2785705029964447, "learning_rate": 1.1657161470366876e-05, "loss": 1.2092, "step": 15559 }, { "epoch": 4.634486866844133, "grad_norm": 0.4112585186958313, "learning_rate": 1.1656210232876933e-05, "loss": 1.2163, "step": 15560 }, { "epoch": 4.634784713043802, "grad_norm": 0.31251394748687744, "learning_rate": 1.1655258979977804e-05, "loss": 1.2175, "step": 15561 }, { "epoch": 4.635082559243471, "grad_norm": 0.3197955787181854, "learning_rate": 1.1654307711678338e-05, "loss": 1.2283, "step": 15562 }, { "epoch": 4.635380405443139, "grad_norm": 0.38307496905326843, "learning_rate": 1.165335642798739e-05, "loss": 1.2165, "step": 15563 }, { "epoch": 4.635678251642808, "grad_norm": 0.36435335874557495, "learning_rate": 1.165240512891381e-05, "loss": 1.2254, "step": 15564 }, { "epoch": 4.635976097842477, "grad_norm": 0.3051356077194214, "learning_rate": 1.1651453814466443e-05, "loss": 1.2249, "step": 15565 }, { "epoch": 4.636273944042145, "grad_norm": 0.36144301295280457, "learning_rate": 1.1650502484654143e-05, "loss": 1.2159, "step": 15566 }, { "epoch": 4.636571790241814, "grad_norm": 0.3717021048069, "learning_rate": 1.1649551139485759e-05, "loss": 1.2144, "step": 15567 }, { "epoch": 4.6368696364414825, "grad_norm": 0.3736661672592163, "learning_rate": 1.164859977897015e-05, "loss": 1.2143, "step": 15568 }, { "epoch": 4.637167482641151, "grad_norm": 0.35798725485801697, "learning_rate": 1.1647648403116159e-05, "loss": 1.2218, "step": 15569 }, { "epoch": 4.63746532884082, "grad_norm": 0.28397032618522644, "learning_rate": 1.164669701193264e-05, "loss": 1.2176, "step": 15570 }, { "epoch": 4.637763175040488, "grad_norm": 0.33508992195129395, "learning_rate": 1.1645745605428447e-05, "loss": 1.2243, "step": 15571 }, { "epoch": 4.638061021240157, "grad_norm": 0.3435896039009094, "learning_rate": 1.1644794183612429e-05, "loss": 1.2253, "step": 15572 }, { "epoch": 4.638358867439826, "grad_norm": 0.24336761236190796, "learning_rate": 1.1643842746493438e-05, "loss": 1.2001, "step": 15573 }, { "epoch": 4.638656713639494, "grad_norm": 0.30765971541404724, "learning_rate": 1.1642891294080329e-05, "loss": 1.2055, "step": 15574 }, { "epoch": 4.6389545598391635, "grad_norm": 0.31240686774253845, "learning_rate": 1.164193982638195e-05, "loss": 1.2058, "step": 15575 }, { "epoch": 4.639252406038832, "grad_norm": 0.2881528437137604, "learning_rate": 1.1640988343407155e-05, "loss": 1.2254, "step": 15576 }, { "epoch": 4.6395502522385, "grad_norm": 0.5720462203025818, "learning_rate": 1.1640036845164798e-05, "loss": 1.2156, "step": 15577 }, { "epoch": 4.639848098438169, "grad_norm": 0.50917649269104, "learning_rate": 1.1639085331663734e-05, "loss": 1.2173, "step": 15578 }, { "epoch": 4.640145944637838, "grad_norm": 0.2722722589969635, "learning_rate": 1.1638133802912805e-05, "loss": 1.2153, "step": 15579 }, { "epoch": 4.640443790837506, "grad_norm": 0.3898542523384094, "learning_rate": 1.1637182258920874e-05, "loss": 1.2224, "step": 15580 }, { "epoch": 4.640741637037175, "grad_norm": 0.4063347280025482, "learning_rate": 1.1636230699696795e-05, "loss": 1.2026, "step": 15581 }, { "epoch": 4.641039483236844, "grad_norm": 0.2960827350616455, "learning_rate": 1.1635279125249415e-05, "loss": 1.2266, "step": 15582 }, { "epoch": 4.641337329436512, "grad_norm": 0.43661361932754517, "learning_rate": 1.1634327535587588e-05, "loss": 1.2301, "step": 15583 }, { "epoch": 4.641635175636181, "grad_norm": 0.2536190152168274, "learning_rate": 1.163337593072017e-05, "loss": 1.2171, "step": 15584 }, { "epoch": 4.641933021835849, "grad_norm": 0.31220099329948425, "learning_rate": 1.1632424310656014e-05, "loss": 1.2195, "step": 15585 }, { "epoch": 4.642230868035519, "grad_norm": 0.2687770128250122, "learning_rate": 1.1631472675403974e-05, "loss": 1.2258, "step": 15586 }, { "epoch": 4.642528714235187, "grad_norm": 0.3222101330757141, "learning_rate": 1.1630521024972905e-05, "loss": 1.2145, "step": 15587 }, { "epoch": 4.642826560434855, "grad_norm": 0.2599417567253113, "learning_rate": 1.162956935937166e-05, "loss": 1.224, "step": 15588 }, { "epoch": 4.643124406634524, "grad_norm": 0.2568964958190918, "learning_rate": 1.1628617678609091e-05, "loss": 1.2298, "step": 15589 }, { "epoch": 4.643422252834193, "grad_norm": 0.3904465138912201, "learning_rate": 1.1627665982694054e-05, "loss": 1.2271, "step": 15590 }, { "epoch": 4.643720099033861, "grad_norm": 0.6142756938934326, "learning_rate": 1.1626714271635402e-05, "loss": 1.2276, "step": 15591 }, { "epoch": 4.64401794523353, "grad_norm": 0.2873811721801758, "learning_rate": 1.1625762545441995e-05, "loss": 1.2198, "step": 15592 }, { "epoch": 4.644315791433199, "grad_norm": 0.6036465167999268, "learning_rate": 1.1624810804122682e-05, "loss": 1.207, "step": 15593 }, { "epoch": 4.644613637632867, "grad_norm": 0.4465596079826355, "learning_rate": 1.1623859047686317e-05, "loss": 1.2128, "step": 15594 }, { "epoch": 4.644911483832536, "grad_norm": 0.42341628670692444, "learning_rate": 1.1622907276141764e-05, "loss": 1.211, "step": 15595 }, { "epoch": 4.645209330032205, "grad_norm": 0.581132709980011, "learning_rate": 1.1621955489497867e-05, "loss": 1.2178, "step": 15596 }, { "epoch": 4.645507176231873, "grad_norm": 0.4056399166584015, "learning_rate": 1.1621003687763486e-05, "loss": 1.224, "step": 15597 }, { "epoch": 4.645805022431542, "grad_norm": 0.4697325527667999, "learning_rate": 1.162005187094748e-05, "loss": 1.2134, "step": 15598 }, { "epoch": 4.6461028686312105, "grad_norm": 0.28784817457199097, "learning_rate": 1.1619100039058699e-05, "loss": 1.2096, "step": 15599 }, { "epoch": 4.646400714830879, "grad_norm": 0.45402616262435913, "learning_rate": 1.1618148192106003e-05, "loss": 1.2219, "step": 15600 }, { "epoch": 4.646698561030548, "grad_norm": 0.3455040752887726, "learning_rate": 1.1617196330098243e-05, "loss": 1.2126, "step": 15601 }, { "epoch": 4.646996407230216, "grad_norm": 0.4865645170211792, "learning_rate": 1.1616244453044284e-05, "loss": 1.2138, "step": 15602 }, { "epoch": 4.647294253429886, "grad_norm": 0.3852697014808655, "learning_rate": 1.1615292560952968e-05, "loss": 1.1991, "step": 15603 }, { "epoch": 4.647592099629554, "grad_norm": 0.2834094166755676, "learning_rate": 1.1614340653833164e-05, "loss": 1.2119, "step": 15604 }, { "epoch": 4.647889945829222, "grad_norm": 0.4348202645778656, "learning_rate": 1.1613388731693726e-05, "loss": 1.1972, "step": 15605 }, { "epoch": 4.648187792028891, "grad_norm": 0.26572486758232117, "learning_rate": 1.1612436794543507e-05, "loss": 1.2151, "step": 15606 }, { "epoch": 4.64848563822856, "grad_norm": 0.42063283920288086, "learning_rate": 1.1611484842391363e-05, "loss": 1.2419, "step": 15607 }, { "epoch": 4.648783484428228, "grad_norm": 0.2530670464038849, "learning_rate": 1.1610532875246154e-05, "loss": 1.2285, "step": 15608 }, { "epoch": 4.649081330627897, "grad_norm": 0.3255590498447418, "learning_rate": 1.1609580893116737e-05, "loss": 1.2145, "step": 15609 }, { "epoch": 4.649379176827566, "grad_norm": 0.3504859209060669, "learning_rate": 1.1608628896011966e-05, "loss": 1.2429, "step": 15610 }, { "epoch": 4.649677023027234, "grad_norm": 0.3600333333015442, "learning_rate": 1.1607676883940702e-05, "loss": 1.2089, "step": 15611 }, { "epoch": 4.649974869226903, "grad_norm": 0.5154030919075012, "learning_rate": 1.1606724856911802e-05, "loss": 1.2253, "step": 15612 }, { "epoch": 4.6502727154265715, "grad_norm": 0.4052983224391937, "learning_rate": 1.160577281493412e-05, "loss": 1.214, "step": 15613 }, { "epoch": 4.650570561626241, "grad_norm": 0.46548226475715637, "learning_rate": 1.1604820758016518e-05, "loss": 1.2133, "step": 15614 }, { "epoch": 4.650868407825909, "grad_norm": 0.327823281288147, "learning_rate": 1.160386868616785e-05, "loss": 1.2108, "step": 15615 }, { "epoch": 4.651166254025577, "grad_norm": 0.6578736305236816, "learning_rate": 1.1602916599396974e-05, "loss": 1.2076, "step": 15616 }, { "epoch": 4.651464100225246, "grad_norm": 0.49334853887557983, "learning_rate": 1.1601964497712751e-05, "loss": 1.2234, "step": 15617 }, { "epoch": 4.651761946424915, "grad_norm": 0.48380619287490845, "learning_rate": 1.1601012381124042e-05, "loss": 1.2258, "step": 15618 }, { "epoch": 4.652059792624583, "grad_norm": 0.326084166765213, "learning_rate": 1.16000602496397e-05, "loss": 1.2231, "step": 15619 }, { "epoch": 4.6523576388242525, "grad_norm": 0.5676078796386719, "learning_rate": 1.1599108103268584e-05, "loss": 1.2054, "step": 15620 }, { "epoch": 4.652655485023921, "grad_norm": 0.4441809058189392, "learning_rate": 1.1598155942019551e-05, "loss": 1.2146, "step": 15621 }, { "epoch": 4.652953331223589, "grad_norm": 0.33972957730293274, "learning_rate": 1.1597203765901468e-05, "loss": 1.2131, "step": 15622 }, { "epoch": 4.653251177423258, "grad_norm": 0.3543725907802582, "learning_rate": 1.1596251574923185e-05, "loss": 1.2248, "step": 15623 }, { "epoch": 4.653549023622927, "grad_norm": 0.25015169382095337, "learning_rate": 1.1595299369093564e-05, "loss": 1.2306, "step": 15624 }, { "epoch": 4.653846869822595, "grad_norm": 0.3255843222141266, "learning_rate": 1.1594347148421467e-05, "loss": 1.2265, "step": 15625 }, { "epoch": 4.654144716022264, "grad_norm": 0.32699039578437805, "learning_rate": 1.159339491291575e-05, "loss": 1.2045, "step": 15626 }, { "epoch": 4.6544425622219325, "grad_norm": 0.24702998995780945, "learning_rate": 1.1592442662585274e-05, "loss": 1.2147, "step": 15627 }, { "epoch": 4.654740408421601, "grad_norm": 0.28171196579933167, "learning_rate": 1.1591490397438897e-05, "loss": 1.2268, "step": 15628 }, { "epoch": 4.65503825462127, "grad_norm": 0.3784802258014679, "learning_rate": 1.1590538117485483e-05, "loss": 1.2253, "step": 15629 }, { "epoch": 4.655336100820938, "grad_norm": 0.26567766070365906, "learning_rate": 1.1589585822733887e-05, "loss": 1.2374, "step": 15630 }, { "epoch": 4.655633947020608, "grad_norm": 0.4190118610858917, "learning_rate": 1.1588633513192971e-05, "loss": 1.2281, "step": 15631 }, { "epoch": 4.655931793220276, "grad_norm": 0.4046405553817749, "learning_rate": 1.1587681188871595e-05, "loss": 1.2219, "step": 15632 }, { "epoch": 4.656229639419944, "grad_norm": 0.26123669743537903, "learning_rate": 1.1586728849778618e-05, "loss": 1.2152, "step": 15633 }, { "epoch": 4.6565274856196135, "grad_norm": 0.3636496663093567, "learning_rate": 1.1585776495922903e-05, "loss": 1.2154, "step": 15634 }, { "epoch": 4.656825331819282, "grad_norm": 0.31194934248924255, "learning_rate": 1.158482412731331e-05, "loss": 1.2098, "step": 15635 }, { "epoch": 4.65712317801895, "grad_norm": 0.5256511569023132, "learning_rate": 1.15838717439587e-05, "loss": 1.219, "step": 15636 }, { "epoch": 4.657421024218619, "grad_norm": 0.4009208381175995, "learning_rate": 1.158291934586793e-05, "loss": 1.2093, "step": 15637 }, { "epoch": 4.657718870418288, "grad_norm": 0.3432953953742981, "learning_rate": 1.1581966933049867e-05, "loss": 1.2026, "step": 15638 }, { "epoch": 4.658016716617956, "grad_norm": 0.4191701114177704, "learning_rate": 1.1581014505513369e-05, "loss": 1.2281, "step": 15639 }, { "epoch": 4.658314562817625, "grad_norm": 0.32237496972084045, "learning_rate": 1.1580062063267295e-05, "loss": 1.2165, "step": 15640 }, { "epoch": 4.658612409017294, "grad_norm": 0.30885910987854004, "learning_rate": 1.157910960632051e-05, "loss": 1.2253, "step": 15641 }, { "epoch": 4.658910255216963, "grad_norm": 0.41791924834251404, "learning_rate": 1.1578157134681875e-05, "loss": 1.2386, "step": 15642 }, { "epoch": 4.659208101416631, "grad_norm": 0.41621527075767517, "learning_rate": 1.1577204648360252e-05, "loss": 1.232, "step": 15643 }, { "epoch": 4.659505947616299, "grad_norm": 0.25124916434288025, "learning_rate": 1.1576252147364497e-05, "loss": 1.2167, "step": 15644 }, { "epoch": 4.659803793815969, "grad_norm": 0.5308575630187988, "learning_rate": 1.1575299631703482e-05, "loss": 1.2142, "step": 15645 }, { "epoch": 4.660101640015637, "grad_norm": 0.3115309476852417, "learning_rate": 1.1574347101386063e-05, "loss": 1.2208, "step": 15646 }, { "epoch": 4.660399486215305, "grad_norm": 0.4527014195919037, "learning_rate": 1.1573394556421102e-05, "loss": 1.2275, "step": 15647 }, { "epoch": 4.6606973324149745, "grad_norm": 0.5402430891990662, "learning_rate": 1.1572441996817465e-05, "loss": 1.2105, "step": 15648 }, { "epoch": 4.660995178614643, "grad_norm": 0.26349201798439026, "learning_rate": 1.157148942258401e-05, "loss": 1.2164, "step": 15649 }, { "epoch": 4.661293024814311, "grad_norm": 0.39200934767723083, "learning_rate": 1.1570536833729599e-05, "loss": 1.2355, "step": 15650 }, { "epoch": 4.66159087101398, "grad_norm": 0.39736855030059814, "learning_rate": 1.1569584230263103e-05, "loss": 1.2239, "step": 15651 }, { "epoch": 4.661888717213649, "grad_norm": 0.24578113853931427, "learning_rate": 1.1568631612193375e-05, "loss": 1.2292, "step": 15652 }, { "epoch": 4.662186563413318, "grad_norm": 0.35816240310668945, "learning_rate": 1.1567678979529286e-05, "loss": 1.2441, "step": 15653 }, { "epoch": 4.662484409612986, "grad_norm": 0.24438230693340302, "learning_rate": 1.1566726332279694e-05, "loss": 1.2246, "step": 15654 }, { "epoch": 4.662782255812655, "grad_norm": 0.31994426250457764, "learning_rate": 1.1565773670453464e-05, "loss": 1.2219, "step": 15655 }, { "epoch": 4.663080102012323, "grad_norm": 0.285806268453598, "learning_rate": 1.1564820994059459e-05, "loss": 1.2228, "step": 15656 }, { "epoch": 4.663377948211992, "grad_norm": 0.3526848554611206, "learning_rate": 1.1563868303106543e-05, "loss": 1.2205, "step": 15657 }, { "epoch": 4.6636757944116605, "grad_norm": 0.2998829483985901, "learning_rate": 1.156291559760358e-05, "loss": 1.226, "step": 15658 }, { "epoch": 4.66397364061133, "grad_norm": 0.629315972328186, "learning_rate": 1.1561962877559432e-05, "loss": 1.2078, "step": 15659 }, { "epoch": 4.664271486810998, "grad_norm": 0.3076554536819458, "learning_rate": 1.1561010142982967e-05, "loss": 1.2076, "step": 15660 }, { "epoch": 4.664569333010666, "grad_norm": 0.4892565608024597, "learning_rate": 1.1560057393883047e-05, "loss": 1.2164, "step": 15661 }, { "epoch": 4.664867179210336, "grad_norm": 0.3082030117511749, "learning_rate": 1.1559104630268532e-05, "loss": 1.222, "step": 15662 }, { "epoch": 4.665165025410004, "grad_norm": 0.41707074642181396, "learning_rate": 1.1558151852148294e-05, "loss": 1.2243, "step": 15663 }, { "epoch": 4.665462871609672, "grad_norm": 0.2680579125881195, "learning_rate": 1.1557199059531192e-05, "loss": 1.2329, "step": 15664 }, { "epoch": 4.665760717809341, "grad_norm": 0.4275538921356201, "learning_rate": 1.1556246252426093e-05, "loss": 1.2209, "step": 15665 }, { "epoch": 4.66605856400901, "grad_norm": 0.254695326089859, "learning_rate": 1.155529343084186e-05, "loss": 1.2214, "step": 15666 }, { "epoch": 4.666356410208678, "grad_norm": 0.6112091541290283, "learning_rate": 1.1554340594787364e-05, "loss": 1.2276, "step": 15667 }, { "epoch": 4.666654256408347, "grad_norm": 0.26346415281295776, "learning_rate": 1.155338774427146e-05, "loss": 1.2141, "step": 15668 }, { "epoch": 4.666952102608016, "grad_norm": 1.103255033493042, "learning_rate": 1.155243487930302e-05, "loss": 1.2162, "step": 15669 }, { "epoch": 4.667249948807685, "grad_norm": 0.44788405299186707, "learning_rate": 1.1551481999890913e-05, "loss": 1.2262, "step": 15670 }, { "epoch": 4.667547795007353, "grad_norm": 0.7941567897796631, "learning_rate": 1.1550529106043995e-05, "loss": 1.2119, "step": 15671 }, { "epoch": 4.6678456412070215, "grad_norm": 0.3029055595397949, "learning_rate": 1.1549576197771136e-05, "loss": 1.2307, "step": 15672 }, { "epoch": 4.668143487406691, "grad_norm": 1.1536701917648315, "learning_rate": 1.1548623275081201e-05, "loss": 1.2099, "step": 15673 }, { "epoch": 4.668441333606359, "grad_norm": 0.34991881251335144, "learning_rate": 1.1547670337983057e-05, "loss": 1.2208, "step": 15674 }, { "epoch": 4.668739179806027, "grad_norm": 0.7014442086219788, "learning_rate": 1.1546717386485571e-05, "loss": 1.2341, "step": 15675 }, { "epoch": 4.669037026005697, "grad_norm": 0.25845497846603394, "learning_rate": 1.1545764420597607e-05, "loss": 1.217, "step": 15676 }, { "epoch": 4.669334872205365, "grad_norm": 0.35486188530921936, "learning_rate": 1.1544811440328035e-05, "loss": 1.2291, "step": 15677 }, { "epoch": 4.669632718405033, "grad_norm": 0.543127179145813, "learning_rate": 1.1543858445685714e-05, "loss": 1.21, "step": 15678 }, { "epoch": 4.6699305646047025, "grad_norm": 0.25546911358833313, "learning_rate": 1.1542905436679516e-05, "loss": 1.2173, "step": 15679 }, { "epoch": 4.670228410804371, "grad_norm": 0.47984954714775085, "learning_rate": 1.154195241331831e-05, "loss": 1.2244, "step": 15680 }, { "epoch": 4.67052625700404, "grad_norm": 0.3667922616004944, "learning_rate": 1.1540999375610956e-05, "loss": 1.222, "step": 15681 }, { "epoch": 4.670824103203708, "grad_norm": 0.2718745172023773, "learning_rate": 1.1540046323566325e-05, "loss": 1.2127, "step": 15682 }, { "epoch": 4.671121949403377, "grad_norm": 0.38106366991996765, "learning_rate": 1.1539093257193284e-05, "loss": 1.2356, "step": 15683 }, { "epoch": 4.671419795603045, "grad_norm": 0.3056412935256958, "learning_rate": 1.15381401765007e-05, "loss": 1.2291, "step": 15684 }, { "epoch": 4.671717641802714, "grad_norm": 0.276956707239151, "learning_rate": 1.1537187081497439e-05, "loss": 1.2094, "step": 15685 }, { "epoch": 4.6720154880023825, "grad_norm": 0.28864794969558716, "learning_rate": 1.1536233972192372e-05, "loss": 1.219, "step": 15686 }, { "epoch": 4.672313334202052, "grad_norm": 0.34191474318504333, "learning_rate": 1.1535280848594364e-05, "loss": 1.2166, "step": 15687 }, { "epoch": 4.67261118040172, "grad_norm": 0.2732122242450714, "learning_rate": 1.1534327710712282e-05, "loss": 1.218, "step": 15688 }, { "epoch": 4.672909026601388, "grad_norm": 0.2825454771518707, "learning_rate": 1.1533374558554995e-05, "loss": 1.2258, "step": 15689 }, { "epoch": 4.673206872801058, "grad_norm": 0.32758834958076477, "learning_rate": 1.1532421392131373e-05, "loss": 1.208, "step": 15690 }, { "epoch": 4.673504719000726, "grad_norm": 0.2849428057670593, "learning_rate": 1.1531468211450278e-05, "loss": 1.23, "step": 15691 }, { "epoch": 4.673802565200394, "grad_norm": 0.30653116106987, "learning_rate": 1.1530515016520585e-05, "loss": 1.2249, "step": 15692 }, { "epoch": 4.6741004114000635, "grad_norm": 0.24566933512687683, "learning_rate": 1.1529561807351157e-05, "loss": 1.2158, "step": 15693 }, { "epoch": 4.674398257599732, "grad_norm": 0.28762638568878174, "learning_rate": 1.152860858395087e-05, "loss": 1.2131, "step": 15694 }, { "epoch": 4.6746961037994, "grad_norm": 0.25978901982307434, "learning_rate": 1.1527655346328585e-05, "loss": 1.2144, "step": 15695 }, { "epoch": 4.674993949999069, "grad_norm": 0.30036869645118713, "learning_rate": 1.1526702094493174e-05, "loss": 1.2294, "step": 15696 }, { "epoch": 4.675291796198738, "grad_norm": 0.3539499044418335, "learning_rate": 1.1525748828453507e-05, "loss": 1.2286, "step": 15697 }, { "epoch": 4.675589642398407, "grad_norm": 0.26067811250686646, "learning_rate": 1.1524795548218451e-05, "loss": 1.2278, "step": 15698 }, { "epoch": 4.675887488598075, "grad_norm": 0.3095061182975769, "learning_rate": 1.1523842253796875e-05, "loss": 1.2093, "step": 15699 }, { "epoch": 4.676185334797744, "grad_norm": 0.3263936936855316, "learning_rate": 1.152288894519765e-05, "loss": 1.2134, "step": 15700 }, { "epoch": 4.676483180997413, "grad_norm": 0.2679992914199829, "learning_rate": 1.1521935622429647e-05, "loss": 1.2094, "step": 15701 }, { "epoch": 4.676781027197081, "grad_norm": 0.39775457978248596, "learning_rate": 1.1520982285501731e-05, "loss": 1.195, "step": 15702 }, { "epoch": 4.677078873396749, "grad_norm": 0.283600389957428, "learning_rate": 1.1520028934422773e-05, "loss": 1.2343, "step": 15703 }, { "epoch": 4.677376719596419, "grad_norm": 0.4227513074874878, "learning_rate": 1.1519075569201647e-05, "loss": 1.2288, "step": 15704 }, { "epoch": 4.677674565796087, "grad_norm": 0.3514833450317383, "learning_rate": 1.1518122189847218e-05, "loss": 1.2192, "step": 15705 }, { "epoch": 4.677972411995755, "grad_norm": 0.40376874804496765, "learning_rate": 1.1517168796368359e-05, "loss": 1.2108, "step": 15706 }, { "epoch": 4.6782702581954245, "grad_norm": 0.5061954259872437, "learning_rate": 1.1516215388773941e-05, "loss": 1.2175, "step": 15707 }, { "epoch": 4.678568104395093, "grad_norm": 0.24786929786205292, "learning_rate": 1.151526196707283e-05, "loss": 1.2167, "step": 15708 }, { "epoch": 4.678865950594762, "grad_norm": 0.4331750273704529, "learning_rate": 1.15143085312739e-05, "loss": 1.2259, "step": 15709 }, { "epoch": 4.67916379679443, "grad_norm": 0.2860633432865143, "learning_rate": 1.1513355081386021e-05, "loss": 1.2337, "step": 15710 }, { "epoch": 4.679461642994099, "grad_norm": 0.44512543082237244, "learning_rate": 1.1512401617418068e-05, "loss": 1.1948, "step": 15711 }, { "epoch": 4.679759489193768, "grad_norm": 0.2672065794467926, "learning_rate": 1.1511448139378903e-05, "loss": 1.2183, "step": 15712 }, { "epoch": 4.680057335393436, "grad_norm": 0.4276815950870514, "learning_rate": 1.1510494647277402e-05, "loss": 1.2133, "step": 15713 }, { "epoch": 4.680355181593105, "grad_norm": 0.29092276096343994, "learning_rate": 1.1509541141122437e-05, "loss": 1.2037, "step": 15714 }, { "epoch": 4.680653027792774, "grad_norm": 0.4415130615234375, "learning_rate": 1.1508587620922877e-05, "loss": 1.2332, "step": 15715 }, { "epoch": 4.680950873992442, "grad_norm": 0.2688218355178833, "learning_rate": 1.1507634086687596e-05, "loss": 1.2288, "step": 15716 }, { "epoch": 4.6812487201921105, "grad_norm": 0.41821667551994324, "learning_rate": 1.1506680538425463e-05, "loss": 1.2013, "step": 15717 }, { "epoch": 4.68154656639178, "grad_norm": 0.2578604519367218, "learning_rate": 1.1505726976145354e-05, "loss": 1.2101, "step": 15718 }, { "epoch": 4.681844412591448, "grad_norm": 0.4427260756492615, "learning_rate": 1.1504773399856134e-05, "loss": 1.2122, "step": 15719 }, { "epoch": 4.682142258791117, "grad_norm": 0.2817981243133545, "learning_rate": 1.1503819809566683e-05, "loss": 1.2086, "step": 15720 }, { "epoch": 4.682440104990786, "grad_norm": 0.43863993883132935, "learning_rate": 1.1502866205285865e-05, "loss": 1.2096, "step": 15721 }, { "epoch": 4.682737951190454, "grad_norm": 0.42356306314468384, "learning_rate": 1.1501912587022557e-05, "loss": 1.2054, "step": 15722 }, { "epoch": 4.683035797390122, "grad_norm": 0.2597607374191284, "learning_rate": 1.150095895478563e-05, "loss": 1.2026, "step": 15723 }, { "epoch": 4.683333643589791, "grad_norm": 0.34694647789001465, "learning_rate": 1.1500005308583957e-05, "loss": 1.2197, "step": 15724 }, { "epoch": 4.68363148978946, "grad_norm": 0.2810973525047302, "learning_rate": 1.1499051648426414e-05, "loss": 1.2243, "step": 15725 }, { "epoch": 4.683929335989129, "grad_norm": 0.2500973045825958, "learning_rate": 1.1498097974321865e-05, "loss": 1.2234, "step": 15726 }, { "epoch": 4.684227182188797, "grad_norm": 0.27204346656799316, "learning_rate": 1.149714428627919e-05, "loss": 1.2381, "step": 15727 }, { "epoch": 4.684525028388466, "grad_norm": 0.2890789210796356, "learning_rate": 1.1496190584307264e-05, "loss": 1.2443, "step": 15728 }, { "epoch": 4.684822874588135, "grad_norm": 0.24557743966579437, "learning_rate": 1.1495236868414952e-05, "loss": 1.2253, "step": 15729 }, { "epoch": 4.685120720787803, "grad_norm": 0.32808101177215576, "learning_rate": 1.149428313861113e-05, "loss": 1.2181, "step": 15730 }, { "epoch": 4.6854185669874715, "grad_norm": 0.24894008040428162, "learning_rate": 1.149332939490468e-05, "loss": 1.2236, "step": 15731 }, { "epoch": 4.685716413187141, "grad_norm": 0.32874298095703125, "learning_rate": 1.149237563730446e-05, "loss": 1.2165, "step": 15732 }, { "epoch": 4.686014259386809, "grad_norm": 0.25478601455688477, "learning_rate": 1.1491421865819356e-05, "loss": 1.2106, "step": 15733 }, { "epoch": 4.686312105586477, "grad_norm": 0.3377542793750763, "learning_rate": 1.1490468080458237e-05, "loss": 1.2164, "step": 15734 }, { "epoch": 4.686609951786147, "grad_norm": 0.32631832361221313, "learning_rate": 1.148951428122998e-05, "loss": 1.2128, "step": 15735 }, { "epoch": 4.686907797985815, "grad_norm": 0.27623340487480164, "learning_rate": 1.1488560468143454e-05, "loss": 1.2091, "step": 15736 }, { "epoch": 4.687205644185484, "grad_norm": 0.2862810790538788, "learning_rate": 1.1487606641207539e-05, "loss": 1.2217, "step": 15737 }, { "epoch": 4.6875034903851525, "grad_norm": 0.2557696998119354, "learning_rate": 1.1486652800431104e-05, "loss": 1.2383, "step": 15738 }, { "epoch": 4.687801336584821, "grad_norm": 0.28746888041496277, "learning_rate": 1.1485698945823025e-05, "loss": 1.2188, "step": 15739 }, { "epoch": 4.68809918278449, "grad_norm": 0.2647130489349365, "learning_rate": 1.1484745077392179e-05, "loss": 1.2295, "step": 15740 }, { "epoch": 4.688397028984158, "grad_norm": 0.33756572008132935, "learning_rate": 1.1483791195147438e-05, "loss": 1.219, "step": 15741 }, { "epoch": 4.688694875183827, "grad_norm": 0.404854953289032, "learning_rate": 1.148283729909768e-05, "loss": 1.2314, "step": 15742 }, { "epoch": 4.688992721383496, "grad_norm": 0.277214378118515, "learning_rate": 1.1481883389251776e-05, "loss": 1.2478, "step": 15743 }, { "epoch": 4.689290567583164, "grad_norm": 0.37850046157836914, "learning_rate": 1.1480929465618598e-05, "loss": 1.2108, "step": 15744 }, { "epoch": 4.6895884137828325, "grad_norm": 0.2664380669593811, "learning_rate": 1.1479975528207032e-05, "loss": 1.2316, "step": 15745 }, { "epoch": 4.689886259982502, "grad_norm": 0.4216008186340332, "learning_rate": 1.1479021577025946e-05, "loss": 1.2283, "step": 15746 }, { "epoch": 4.69018410618217, "grad_norm": 0.3628690838813782, "learning_rate": 1.1478067612084216e-05, "loss": 1.2043, "step": 15747 }, { "epoch": 4.690481952381839, "grad_norm": 0.27909860014915466, "learning_rate": 1.1477113633390719e-05, "loss": 1.2121, "step": 15748 }, { "epoch": 4.690779798581508, "grad_norm": 0.39102903008461, "learning_rate": 1.147615964095433e-05, "loss": 1.2097, "step": 15749 }, { "epoch": 4.691077644781176, "grad_norm": 0.25147396326065063, "learning_rate": 1.1475205634783921e-05, "loss": 1.2292, "step": 15750 }, { "epoch": 4.691375490980844, "grad_norm": 0.4025093913078308, "learning_rate": 1.1474251614888376e-05, "loss": 1.2214, "step": 15751 }, { "epoch": 4.6916733371805135, "grad_norm": 0.276917964220047, "learning_rate": 1.1473297581276568e-05, "loss": 1.2299, "step": 15752 }, { "epoch": 4.691971183380182, "grad_norm": 0.2868637442588806, "learning_rate": 1.1472343533957369e-05, "loss": 1.2316, "step": 15753 }, { "epoch": 4.692269029579851, "grad_norm": 0.3295955955982208, "learning_rate": 1.147138947293966e-05, "loss": 1.2281, "step": 15754 }, { "epoch": 4.692566875779519, "grad_norm": 0.2683313488960266, "learning_rate": 1.1470435398232313e-05, "loss": 1.2393, "step": 15755 }, { "epoch": 4.692864721979188, "grad_norm": 0.33063018321990967, "learning_rate": 1.146948130984421e-05, "loss": 1.2136, "step": 15756 }, { "epoch": 4.693162568178857, "grad_norm": 0.36789897084236145, "learning_rate": 1.1468527207784225e-05, "loss": 1.2171, "step": 15757 }, { "epoch": 4.693460414378525, "grad_norm": 0.26819494366645813, "learning_rate": 1.1467573092061236e-05, "loss": 1.2147, "step": 15758 }, { "epoch": 4.693758260578194, "grad_norm": 0.28111323714256287, "learning_rate": 1.146661896268412e-05, "loss": 1.2262, "step": 15759 }, { "epoch": 4.694056106777863, "grad_norm": 0.2832450568675995, "learning_rate": 1.1465664819661751e-05, "loss": 1.2254, "step": 15760 }, { "epoch": 4.694353952977531, "grad_norm": 0.2826347351074219, "learning_rate": 1.1464710663003008e-05, "loss": 1.2127, "step": 15761 }, { "epoch": 4.694651799177199, "grad_norm": 0.2850990891456604, "learning_rate": 1.1463756492716768e-05, "loss": 1.2125, "step": 15762 }, { "epoch": 4.694949645376869, "grad_norm": 0.2550585865974426, "learning_rate": 1.146280230881191e-05, "loss": 1.2314, "step": 15763 }, { "epoch": 4.695247491576537, "grad_norm": 0.267622172832489, "learning_rate": 1.1461848111297311e-05, "loss": 1.2232, "step": 15764 }, { "epoch": 4.695545337776206, "grad_norm": 0.27849382162094116, "learning_rate": 1.1460893900181854e-05, "loss": 1.2188, "step": 15765 }, { "epoch": 4.6958431839758745, "grad_norm": 0.31942984461784363, "learning_rate": 1.1459939675474405e-05, "loss": 1.2188, "step": 15766 }, { "epoch": 4.696141030175543, "grad_norm": 0.3307408094406128, "learning_rate": 1.1458985437183849e-05, "loss": 1.2147, "step": 15767 }, { "epoch": 4.696438876375212, "grad_norm": 0.3231342136859894, "learning_rate": 1.1458031185319063e-05, "loss": 1.2172, "step": 15768 }, { "epoch": 4.69673672257488, "grad_norm": 0.3401965796947479, "learning_rate": 1.1457076919888928e-05, "loss": 1.2267, "step": 15769 }, { "epoch": 4.697034568774549, "grad_norm": 0.27146321535110474, "learning_rate": 1.1456122640902318e-05, "loss": 1.2127, "step": 15770 }, { "epoch": 4.697332414974218, "grad_norm": 0.3082990050315857, "learning_rate": 1.1455168348368118e-05, "loss": 1.2389, "step": 15771 }, { "epoch": 4.697630261173886, "grad_norm": 0.3296731412410736, "learning_rate": 1.1454214042295199e-05, "loss": 1.2269, "step": 15772 }, { "epoch": 4.697928107373555, "grad_norm": 0.278409481048584, "learning_rate": 1.1453259722692443e-05, "loss": 1.2224, "step": 15773 }, { "epoch": 4.698225953573224, "grad_norm": 0.24612689018249512, "learning_rate": 1.145230538956873e-05, "loss": 1.225, "step": 15774 }, { "epoch": 4.698523799772892, "grad_norm": 0.4055539667606354, "learning_rate": 1.1451351042932937e-05, "loss": 1.2179, "step": 15775 }, { "epoch": 4.698821645972561, "grad_norm": 0.6009336113929749, "learning_rate": 1.1450396682793945e-05, "loss": 1.2277, "step": 15776 }, { "epoch": 4.69911949217223, "grad_norm": 0.3844189941883087, "learning_rate": 1.1449442309160631e-05, "loss": 1.2087, "step": 15777 }, { "epoch": 4.699417338371898, "grad_norm": 0.5920946002006531, "learning_rate": 1.1448487922041877e-05, "loss": 1.2111, "step": 15778 }, { "epoch": 4.699715184571567, "grad_norm": 0.47467926144599915, "learning_rate": 1.1447533521446561e-05, "loss": 1.2314, "step": 15779 }, { "epoch": 4.700013030771236, "grad_norm": 0.36605679988861084, "learning_rate": 1.1446579107383565e-05, "loss": 1.2204, "step": 15780 }, { "epoch": 4.700310876970904, "grad_norm": 0.42435649037361145, "learning_rate": 1.1445624679861762e-05, "loss": 1.2109, "step": 15781 }, { "epoch": 4.700608723170573, "grad_norm": 0.30126717686653137, "learning_rate": 1.1444670238890039e-05, "loss": 1.227, "step": 15782 }, { "epoch": 4.700906569370241, "grad_norm": 0.4542222023010254, "learning_rate": 1.1443715784477277e-05, "loss": 1.2303, "step": 15783 }, { "epoch": 4.70120441556991, "grad_norm": 0.2698976397514343, "learning_rate": 1.1442761316632347e-05, "loss": 1.2262, "step": 15784 }, { "epoch": 4.701502261769579, "grad_norm": 0.388721227645874, "learning_rate": 1.1441806835364136e-05, "loss": 1.2394, "step": 15785 }, { "epoch": 4.701800107969247, "grad_norm": 0.28787553310394287, "learning_rate": 1.1440852340681525e-05, "loss": 1.2178, "step": 15786 }, { "epoch": 4.7020979541689165, "grad_norm": 0.35869595408439636, "learning_rate": 1.1439897832593392e-05, "loss": 1.2229, "step": 15787 }, { "epoch": 4.702395800368585, "grad_norm": 0.35007795691490173, "learning_rate": 1.143894331110862e-05, "loss": 1.2134, "step": 15788 }, { "epoch": 4.702693646568253, "grad_norm": 0.3028625547885895, "learning_rate": 1.1437988776236087e-05, "loss": 1.2358, "step": 15789 }, { "epoch": 4.7029914927679215, "grad_norm": 0.3037794530391693, "learning_rate": 1.1437034227984673e-05, "loss": 1.2311, "step": 15790 }, { "epoch": 4.703289338967591, "grad_norm": 0.32174941897392273, "learning_rate": 1.1436079666363262e-05, "loss": 1.2003, "step": 15791 }, { "epoch": 4.703587185167259, "grad_norm": 0.3050640821456909, "learning_rate": 1.1435125091380735e-05, "loss": 1.2256, "step": 15792 }, { "epoch": 4.703885031366928, "grad_norm": 0.25783175230026245, "learning_rate": 1.1434170503045974e-05, "loss": 1.2155, "step": 15793 }, { "epoch": 4.704182877566597, "grad_norm": 0.35210901498794556, "learning_rate": 1.1433215901367856e-05, "loss": 1.2261, "step": 15794 }, { "epoch": 4.704480723766265, "grad_norm": 0.2773214876651764, "learning_rate": 1.1432261286355267e-05, "loss": 1.2206, "step": 15795 }, { "epoch": 4.704778569965934, "grad_norm": 0.2532184422016144, "learning_rate": 1.1431306658017088e-05, "loss": 1.2132, "step": 15796 }, { "epoch": 4.7050764161656025, "grad_norm": 0.2917252779006958, "learning_rate": 1.1430352016362197e-05, "loss": 1.2128, "step": 15797 }, { "epoch": 4.705374262365271, "grad_norm": 0.38615626096725464, "learning_rate": 1.142939736139948e-05, "loss": 1.2231, "step": 15798 }, { "epoch": 4.70567210856494, "grad_norm": 0.27403759956359863, "learning_rate": 1.1428442693137815e-05, "loss": 1.2149, "step": 15799 }, { "epoch": 4.705969954764608, "grad_norm": 0.4835969805717468, "learning_rate": 1.1427488011586092e-05, "loss": 1.2234, "step": 15800 }, { "epoch": 4.706267800964277, "grad_norm": 0.3025042712688446, "learning_rate": 1.1426533316753185e-05, "loss": 1.2154, "step": 15801 }, { "epoch": 4.706565647163946, "grad_norm": 0.4113025367259979, "learning_rate": 1.1425578608647978e-05, "loss": 1.218, "step": 15802 }, { "epoch": 4.706863493363614, "grad_norm": 0.32159093022346497, "learning_rate": 1.1424623887279355e-05, "loss": 1.2219, "step": 15803 }, { "epoch": 4.707161339563283, "grad_norm": 0.5616856217384338, "learning_rate": 1.14236691526562e-05, "loss": 1.2046, "step": 15804 }, { "epoch": 4.707459185762952, "grad_norm": 0.5068147778511047, "learning_rate": 1.1422714404787394e-05, "loss": 1.2154, "step": 15805 }, { "epoch": 4.70775703196262, "grad_norm": 0.32729411125183105, "learning_rate": 1.1421759643681822e-05, "loss": 1.2385, "step": 15806 }, { "epoch": 4.708054878162289, "grad_norm": 0.30482223629951477, "learning_rate": 1.142080486934836e-05, "loss": 1.2254, "step": 15807 }, { "epoch": 4.708352724361958, "grad_norm": 0.36513110995292664, "learning_rate": 1.14198500817959e-05, "loss": 1.2025, "step": 15808 }, { "epoch": 4.708650570561626, "grad_norm": 0.27403631806373596, "learning_rate": 1.1418895281033317e-05, "loss": 1.2352, "step": 15809 }, { "epoch": 4.708948416761295, "grad_norm": 0.4531899690628052, "learning_rate": 1.1417940467069504e-05, "loss": 1.2156, "step": 15810 }, { "epoch": 4.7092462629609635, "grad_norm": 0.3138730823993683, "learning_rate": 1.1416985639913339e-05, "loss": 1.209, "step": 15811 }, { "epoch": 4.709544109160632, "grad_norm": 0.5142284631729126, "learning_rate": 1.14160307995737e-05, "loss": 1.2099, "step": 15812 }, { "epoch": 4.709841955360301, "grad_norm": 0.283234566450119, "learning_rate": 1.1415075946059485e-05, "loss": 1.2294, "step": 15813 }, { "epoch": 4.710139801559969, "grad_norm": 0.584631621837616, "learning_rate": 1.1414121079379561e-05, "loss": 1.2097, "step": 15814 }, { "epoch": 4.710437647759639, "grad_norm": 0.3865150213241577, "learning_rate": 1.1413166199542824e-05, "loss": 1.2085, "step": 15815 }, { "epoch": 4.710735493959307, "grad_norm": 0.42615455389022827, "learning_rate": 1.1412211306558155e-05, "loss": 1.2273, "step": 15816 }, { "epoch": 4.711033340158975, "grad_norm": 0.2718888223171234, "learning_rate": 1.1411256400434438e-05, "loss": 1.2106, "step": 15817 }, { "epoch": 4.711331186358644, "grad_norm": 0.3436574637889862, "learning_rate": 1.1410301481180555e-05, "loss": 1.2204, "step": 15818 }, { "epoch": 4.711629032558313, "grad_norm": 0.3228500187397003, "learning_rate": 1.1409346548805393e-05, "loss": 1.2071, "step": 15819 }, { "epoch": 4.711926878757981, "grad_norm": 0.25016388297080994, "learning_rate": 1.1408391603317837e-05, "loss": 1.222, "step": 15820 }, { "epoch": 4.71222472495765, "grad_norm": 0.3156696856021881, "learning_rate": 1.1407436644726768e-05, "loss": 1.2224, "step": 15821 }, { "epoch": 4.712522571157319, "grad_norm": 0.3078279197216034, "learning_rate": 1.1406481673041077e-05, "loss": 1.2098, "step": 15822 }, { "epoch": 4.712820417356987, "grad_norm": 0.3350658416748047, "learning_rate": 1.140552668826964e-05, "loss": 1.2199, "step": 15823 }, { "epoch": 4.713118263556656, "grad_norm": 0.4933941662311554, "learning_rate": 1.1404571690421356e-05, "loss": 1.2483, "step": 15824 }, { "epoch": 4.7134161097563245, "grad_norm": 0.2801974415779114, "learning_rate": 1.1403616679505096e-05, "loss": 1.2146, "step": 15825 }, { "epoch": 4.713713955955993, "grad_norm": 0.42451563477516174, "learning_rate": 1.1402661655529752e-05, "loss": 1.2001, "step": 15826 }, { "epoch": 4.714011802155662, "grad_norm": 0.2859726548194885, "learning_rate": 1.1401706618504206e-05, "loss": 1.2368, "step": 15827 }, { "epoch": 4.71430964835533, "grad_norm": 0.26921480894088745, "learning_rate": 1.140075156843735e-05, "loss": 1.2174, "step": 15828 }, { "epoch": 4.714607494554999, "grad_norm": 0.2621622383594513, "learning_rate": 1.1399796505338064e-05, "loss": 1.239, "step": 15829 }, { "epoch": 4.714905340754668, "grad_norm": 0.2795064449310303, "learning_rate": 1.1398841429215237e-05, "loss": 1.2322, "step": 15830 }, { "epoch": 4.715203186954336, "grad_norm": 0.27517372369766235, "learning_rate": 1.1397886340077753e-05, "loss": 1.2293, "step": 15831 }, { "epoch": 4.7155010331540055, "grad_norm": 0.29767560958862305, "learning_rate": 1.1396931237934495e-05, "loss": 1.2222, "step": 15832 }, { "epoch": 4.715798879353674, "grad_norm": 0.42175325751304626, "learning_rate": 1.1395976122794355e-05, "loss": 1.2152, "step": 15833 }, { "epoch": 4.716096725553342, "grad_norm": 0.3241307735443115, "learning_rate": 1.1395020994666218e-05, "loss": 1.2149, "step": 15834 }, { "epoch": 4.716394571753011, "grad_norm": 0.3327314853668213, "learning_rate": 1.139406585355897e-05, "loss": 1.2229, "step": 15835 }, { "epoch": 4.71669241795268, "grad_norm": 0.5165314674377441, "learning_rate": 1.1393110699481494e-05, "loss": 1.2099, "step": 15836 }, { "epoch": 4.716990264152348, "grad_norm": 0.33652955293655396, "learning_rate": 1.1392155532442679e-05, "loss": 1.225, "step": 15837 }, { "epoch": 4.717288110352017, "grad_norm": 0.4319644570350647, "learning_rate": 1.1391200352451413e-05, "loss": 1.2029, "step": 15838 }, { "epoch": 4.7175859565516856, "grad_norm": 0.8324863314628601, "learning_rate": 1.1390245159516582e-05, "loss": 1.2222, "step": 15839 }, { "epoch": 4.717883802751354, "grad_norm": 0.5616923570632935, "learning_rate": 1.1389289953647073e-05, "loss": 1.216, "step": 15840 }, { "epoch": 4.718181648951023, "grad_norm": 0.3713075816631317, "learning_rate": 1.1388334734851775e-05, "loss": 1.2309, "step": 15841 }, { "epoch": 4.718479495150691, "grad_norm": 0.34850919246673584, "learning_rate": 1.1387379503139573e-05, "loss": 1.2133, "step": 15842 }, { "epoch": 4.718777341350361, "grad_norm": 0.5037635564804077, "learning_rate": 1.1386424258519354e-05, "loss": 1.2172, "step": 15843 }, { "epoch": 4.719075187550029, "grad_norm": 0.31124255061149597, "learning_rate": 1.1385469001000005e-05, "loss": 1.2141, "step": 15844 }, { "epoch": 4.719373033749697, "grad_norm": 0.6399654150009155, "learning_rate": 1.1384513730590416e-05, "loss": 1.2255, "step": 15845 }, { "epoch": 4.7196708799493665, "grad_norm": 0.34755223989486694, "learning_rate": 1.1383558447299474e-05, "loss": 1.2336, "step": 15846 }, { "epoch": 4.719968726149035, "grad_norm": 0.4550135135650635, "learning_rate": 1.1382603151136067e-05, "loss": 1.2276, "step": 15847 }, { "epoch": 4.720266572348703, "grad_norm": 0.3026994466781616, "learning_rate": 1.1381647842109082e-05, "loss": 1.2257, "step": 15848 }, { "epoch": 4.720564418548372, "grad_norm": 0.3064768314361572, "learning_rate": 1.1380692520227408e-05, "loss": 1.2092, "step": 15849 }, { "epoch": 4.720862264748041, "grad_norm": 0.6107318997383118, "learning_rate": 1.1379737185499927e-05, "loss": 1.2017, "step": 15850 }, { "epoch": 4.721160110947709, "grad_norm": 0.31734713912010193, "learning_rate": 1.1378781837935542e-05, "loss": 1.2048, "step": 15851 }, { "epoch": 4.721457957147378, "grad_norm": 0.6801714897155762, "learning_rate": 1.1377826477543128e-05, "loss": 1.2282, "step": 15852 }, { "epoch": 4.721755803347047, "grad_norm": 0.3529517948627472, "learning_rate": 1.1376871104331577e-05, "loss": 1.2178, "step": 15853 }, { "epoch": 4.722053649546716, "grad_norm": 0.425773561000824, "learning_rate": 1.1375915718309782e-05, "loss": 1.2188, "step": 15854 }, { "epoch": 4.722351495746384, "grad_norm": 0.26280325651168823, "learning_rate": 1.1374960319486626e-05, "loss": 1.2048, "step": 15855 }, { "epoch": 4.7226493419460525, "grad_norm": 0.39089539647102356, "learning_rate": 1.1374004907871e-05, "loss": 1.2157, "step": 15856 }, { "epoch": 4.722947188145721, "grad_norm": 0.28381043672561646, "learning_rate": 1.1373049483471793e-05, "loss": 1.2278, "step": 15857 }, { "epoch": 4.72324503434539, "grad_norm": 0.48841750621795654, "learning_rate": 1.1372094046297897e-05, "loss": 1.2168, "step": 15858 }, { "epoch": 4.723542880545058, "grad_norm": 0.29121631383895874, "learning_rate": 1.1371138596358198e-05, "loss": 1.2291, "step": 15859 }, { "epoch": 4.7238407267447275, "grad_norm": 0.348071426153183, "learning_rate": 1.1370183133661587e-05, "loss": 1.222, "step": 15860 }, { "epoch": 4.724138572944396, "grad_norm": 0.29713866114616394, "learning_rate": 1.1369227658216952e-05, "loss": 1.2278, "step": 15861 }, { "epoch": 4.724436419144064, "grad_norm": 0.3051190674304962, "learning_rate": 1.1368272170033183e-05, "loss": 1.2236, "step": 15862 }, { "epoch": 4.724734265343733, "grad_norm": 0.33109524846076965, "learning_rate": 1.136731666911917e-05, "loss": 1.2142, "step": 15863 }, { "epoch": 4.725032111543402, "grad_norm": 0.27014556527137756, "learning_rate": 1.1366361155483806e-05, "loss": 1.2037, "step": 15864 }, { "epoch": 4.72532995774307, "grad_norm": 0.41980963945388794, "learning_rate": 1.1365405629135975e-05, "loss": 1.2186, "step": 15865 }, { "epoch": 4.725627803942739, "grad_norm": 0.25167006254196167, "learning_rate": 1.136445009008457e-05, "loss": 1.2284, "step": 15866 }, { "epoch": 4.725925650142408, "grad_norm": 0.37986552715301514, "learning_rate": 1.1363494538338482e-05, "loss": 1.2269, "step": 15867 }, { "epoch": 4.726223496342076, "grad_norm": 0.3030323386192322, "learning_rate": 1.1362538973906601e-05, "loss": 1.2292, "step": 15868 }, { "epoch": 4.726521342541745, "grad_norm": 0.33262065052986145, "learning_rate": 1.1361583396797817e-05, "loss": 1.2191, "step": 15869 }, { "epoch": 4.7268191887414135, "grad_norm": 0.4100240170955658, "learning_rate": 1.1360627807021022e-05, "loss": 1.2095, "step": 15870 }, { "epoch": 4.727117034941083, "grad_norm": 0.2690527141094208, "learning_rate": 1.1359672204585105e-05, "loss": 1.2212, "step": 15871 }, { "epoch": 4.727414881140751, "grad_norm": 0.3116254210472107, "learning_rate": 1.1358716589498955e-05, "loss": 1.2231, "step": 15872 }, { "epoch": 4.727712727340419, "grad_norm": 0.28574854135513306, "learning_rate": 1.1357760961771465e-05, "loss": 1.2164, "step": 15873 }, { "epoch": 4.728010573540089, "grad_norm": 0.262829065322876, "learning_rate": 1.1356805321411529e-05, "loss": 1.2111, "step": 15874 }, { "epoch": 4.728308419739757, "grad_norm": 0.2841876447200775, "learning_rate": 1.1355849668428035e-05, "loss": 1.2276, "step": 15875 }, { "epoch": 4.728606265939425, "grad_norm": 0.31921133399009705, "learning_rate": 1.1354894002829875e-05, "loss": 1.2222, "step": 15876 }, { "epoch": 4.7289041121390945, "grad_norm": 0.260716050863266, "learning_rate": 1.1353938324625937e-05, "loss": 1.2122, "step": 15877 }, { "epoch": 4.729201958338763, "grad_norm": 0.30550864338874817, "learning_rate": 1.135298263382512e-05, "loss": 1.2096, "step": 15878 }, { "epoch": 4.729499804538431, "grad_norm": 0.28598499298095703, "learning_rate": 1.1352026930436306e-05, "loss": 1.2129, "step": 15879 }, { "epoch": 4.7297976507381, "grad_norm": 0.249919131398201, "learning_rate": 1.1351071214468394e-05, "loss": 1.2048, "step": 15880 }, { "epoch": 4.730095496937769, "grad_norm": 0.25255289673805237, "learning_rate": 1.1350115485930275e-05, "loss": 1.2145, "step": 15881 }, { "epoch": 4.730393343137438, "grad_norm": 0.266684889793396, "learning_rate": 1.1349159744830842e-05, "loss": 1.2358, "step": 15882 }, { "epoch": 4.730691189337106, "grad_norm": 0.29002997279167175, "learning_rate": 1.1348203991178984e-05, "loss": 1.2279, "step": 15883 }, { "epoch": 4.7309890355367745, "grad_norm": 0.2965460419654846, "learning_rate": 1.134724822498359e-05, "loss": 1.2204, "step": 15884 }, { "epoch": 4.731286881736443, "grad_norm": 0.3081950545310974, "learning_rate": 1.134629244625356e-05, "loss": 1.2087, "step": 15885 }, { "epoch": 4.731584727936112, "grad_norm": 0.5313135981559753, "learning_rate": 1.1345336654997783e-05, "loss": 1.2089, "step": 15886 }, { "epoch": 4.73188257413578, "grad_norm": 0.4403473436832428, "learning_rate": 1.134438085122515e-05, "loss": 1.228, "step": 15887 }, { "epoch": 4.73218042033545, "grad_norm": 0.3934048116207123, "learning_rate": 1.1343425034944557e-05, "loss": 1.2068, "step": 15888 }, { "epoch": 4.732478266535118, "grad_norm": 0.7846232056617737, "learning_rate": 1.1342469206164894e-05, "loss": 1.2095, "step": 15889 }, { "epoch": 4.732776112734786, "grad_norm": 0.3105809986591339, "learning_rate": 1.1341513364895053e-05, "loss": 1.2137, "step": 15890 }, { "epoch": 4.7330739589344555, "grad_norm": 0.42994362115859985, "learning_rate": 1.134055751114393e-05, "loss": 1.2437, "step": 15891 }, { "epoch": 4.733371805134124, "grad_norm": 0.2574695348739624, "learning_rate": 1.133960164492042e-05, "loss": 1.2182, "step": 15892 }, { "epoch": 4.733669651333792, "grad_norm": 0.5125569701194763, "learning_rate": 1.1338645766233412e-05, "loss": 1.2131, "step": 15893 }, { "epoch": 4.733967497533461, "grad_norm": 0.2991676926612854, "learning_rate": 1.13376898750918e-05, "loss": 1.2177, "step": 15894 }, { "epoch": 4.73426534373313, "grad_norm": 0.39358091354370117, "learning_rate": 1.133673397150448e-05, "loss": 1.2391, "step": 15895 }, { "epoch": 4.734563189932798, "grad_norm": 0.3041570484638214, "learning_rate": 1.133577805548034e-05, "loss": 1.2223, "step": 15896 }, { "epoch": 4.734861036132467, "grad_norm": 0.31268954277038574, "learning_rate": 1.1334822127028278e-05, "loss": 1.2178, "step": 15897 }, { "epoch": 4.7351588823321356, "grad_norm": 0.25059276819229126, "learning_rate": 1.133386618615719e-05, "loss": 1.2267, "step": 15898 }, { "epoch": 4.735456728531805, "grad_norm": 0.34685003757476807, "learning_rate": 1.133291023287597e-05, "loss": 1.225, "step": 15899 }, { "epoch": 4.735754574731473, "grad_norm": 0.3498765230178833, "learning_rate": 1.1331954267193504e-05, "loss": 1.2026, "step": 15900 }, { "epoch": 4.736052420931141, "grad_norm": 0.25785017013549805, "learning_rate": 1.1330998289118693e-05, "loss": 1.2059, "step": 15901 }, { "epoch": 4.736350267130811, "grad_norm": 0.29561716318130493, "learning_rate": 1.133004229866043e-05, "loss": 1.2168, "step": 15902 }, { "epoch": 4.736648113330479, "grad_norm": 0.2916998565196991, "learning_rate": 1.1329086295827612e-05, "loss": 1.2183, "step": 15903 }, { "epoch": 4.736945959530147, "grad_norm": 0.335693359375, "learning_rate": 1.1328130280629128e-05, "loss": 1.2368, "step": 15904 }, { "epoch": 4.7372438057298165, "grad_norm": 0.3438325524330139, "learning_rate": 1.1327174253073881e-05, "loss": 1.2136, "step": 15905 }, { "epoch": 4.737541651929485, "grad_norm": 0.43850234150886536, "learning_rate": 1.1326218213170755e-05, "loss": 1.2196, "step": 15906 }, { "epoch": 4.737839498129153, "grad_norm": 0.2512917220592499, "learning_rate": 1.1325262160928651e-05, "loss": 1.2065, "step": 15907 }, { "epoch": 4.738137344328822, "grad_norm": 0.36331161856651306, "learning_rate": 1.1324306096356463e-05, "loss": 1.2303, "step": 15908 }, { "epoch": 4.738435190528491, "grad_norm": 0.29328441619873047, "learning_rate": 1.1323350019463088e-05, "loss": 1.2322, "step": 15909 }, { "epoch": 4.73873303672816, "grad_norm": 0.30946558713912964, "learning_rate": 1.1322393930257418e-05, "loss": 1.2259, "step": 15910 }, { "epoch": 4.739030882927828, "grad_norm": 0.4157577157020569, "learning_rate": 1.132143782874835e-05, "loss": 1.2237, "step": 15911 }, { "epoch": 4.739328729127497, "grad_norm": 0.4143553376197815, "learning_rate": 1.1320481714944782e-05, "loss": 1.2318, "step": 15912 }, { "epoch": 4.739626575327166, "grad_norm": 0.29865482449531555, "learning_rate": 1.1319525588855605e-05, "loss": 1.2082, "step": 15913 }, { "epoch": 4.739924421526834, "grad_norm": 0.5328629612922668, "learning_rate": 1.1318569450489713e-05, "loss": 1.2158, "step": 15914 }, { "epoch": 4.7402222677265025, "grad_norm": 0.39911073446273804, "learning_rate": 1.1317613299856011e-05, "loss": 1.2343, "step": 15915 }, { "epoch": 4.740520113926172, "grad_norm": 0.41975584626197815, "learning_rate": 1.131665713696339e-05, "loss": 1.2061, "step": 15916 }, { "epoch": 4.74081796012584, "grad_norm": 0.3446354269981384, "learning_rate": 1.1315700961820742e-05, "loss": 1.2084, "step": 15917 }, { "epoch": 4.741115806325508, "grad_norm": 0.48269596695899963, "learning_rate": 1.1314744774436968e-05, "loss": 1.2237, "step": 15918 }, { "epoch": 4.7414136525251775, "grad_norm": 0.31334182620048523, "learning_rate": 1.1313788574820963e-05, "loss": 1.2164, "step": 15919 }, { "epoch": 4.741711498724846, "grad_norm": 0.4656231999397278, "learning_rate": 1.1312832362981621e-05, "loss": 1.2142, "step": 15920 }, { "epoch": 4.742009344924515, "grad_norm": 0.2596440315246582, "learning_rate": 1.1311876138927842e-05, "loss": 1.2227, "step": 15921 }, { "epoch": 4.742307191124183, "grad_norm": 0.4730317294597626, "learning_rate": 1.1310919902668522e-05, "loss": 1.2415, "step": 15922 }, { "epoch": 4.742605037323852, "grad_norm": 0.28711432218551636, "learning_rate": 1.1309963654212557e-05, "loss": 1.2153, "step": 15923 }, { "epoch": 4.74290288352352, "grad_norm": 0.5630531311035156, "learning_rate": 1.1309007393568843e-05, "loss": 1.2178, "step": 15924 }, { "epoch": 4.743200729723189, "grad_norm": 0.3179137706756592, "learning_rate": 1.1308051120746277e-05, "loss": 1.2418, "step": 15925 }, { "epoch": 4.743498575922858, "grad_norm": 0.6113380789756775, "learning_rate": 1.1307094835753757e-05, "loss": 1.2045, "step": 15926 }, { "epoch": 4.743796422122527, "grad_norm": 0.46305787563323975, "learning_rate": 1.130613853860018e-05, "loss": 1.2153, "step": 15927 }, { "epoch": 4.744094268322195, "grad_norm": 0.5362480282783508, "learning_rate": 1.1305182229294445e-05, "loss": 1.2194, "step": 15928 }, { "epoch": 4.7443921145218635, "grad_norm": 0.26356229186058044, "learning_rate": 1.1304225907845448e-05, "loss": 1.2298, "step": 15929 }, { "epoch": 4.744689960721533, "grad_norm": 0.5824549794197083, "learning_rate": 1.1303269574262083e-05, "loss": 1.2277, "step": 15930 }, { "epoch": 4.744987806921201, "grad_norm": 0.32579800486564636, "learning_rate": 1.1302313228553253e-05, "loss": 1.222, "step": 15931 }, { "epoch": 4.745285653120869, "grad_norm": 0.3042381703853607, "learning_rate": 1.1301356870727848e-05, "loss": 1.2179, "step": 15932 }, { "epoch": 4.745583499320539, "grad_norm": 0.5567289590835571, "learning_rate": 1.1300400500794779e-05, "loss": 1.2273, "step": 15933 }, { "epoch": 4.745881345520207, "grad_norm": 0.2972211241722107, "learning_rate": 1.1299444118762933e-05, "loss": 1.2292, "step": 15934 }, { "epoch": 4.746179191719875, "grad_norm": 0.3997952342033386, "learning_rate": 1.1298487724641211e-05, "loss": 1.2067, "step": 15935 }, { "epoch": 4.7464770379195445, "grad_norm": 0.3564496338367462, "learning_rate": 1.1297531318438514e-05, "loss": 1.2253, "step": 15936 }, { "epoch": 4.746774884119213, "grad_norm": 0.29910174012184143, "learning_rate": 1.1296574900163734e-05, "loss": 1.226, "step": 15937 }, { "epoch": 4.747072730318882, "grad_norm": 0.2811172902584076, "learning_rate": 1.1295618469825774e-05, "loss": 1.2055, "step": 15938 }, { "epoch": 4.74737057651855, "grad_norm": 0.31470999121665955, "learning_rate": 1.1294662027433532e-05, "loss": 1.2299, "step": 15939 }, { "epoch": 4.747668422718219, "grad_norm": 0.3347685635089874, "learning_rate": 1.1293705572995909e-05, "loss": 1.2147, "step": 15940 }, { "epoch": 4.747966268917888, "grad_norm": 0.3421890139579773, "learning_rate": 1.1292749106521798e-05, "loss": 1.2212, "step": 15941 }, { "epoch": 4.748264115117556, "grad_norm": 0.2701142430305481, "learning_rate": 1.1291792628020103e-05, "loss": 1.2244, "step": 15942 }, { "epoch": 4.7485619613172245, "grad_norm": 0.2640717327594757, "learning_rate": 1.129083613749972e-05, "loss": 1.2176, "step": 15943 }, { "epoch": 4.748859807516894, "grad_norm": 0.29700079560279846, "learning_rate": 1.1289879634969548e-05, "loss": 1.2121, "step": 15944 }, { "epoch": 4.749157653716562, "grad_norm": 0.2528539001941681, "learning_rate": 1.1288923120438486e-05, "loss": 1.2327, "step": 15945 }, { "epoch": 4.74945549991623, "grad_norm": 0.4150124490261078, "learning_rate": 1.1287966593915438e-05, "loss": 1.2154, "step": 15946 }, { "epoch": 4.7497533461159, "grad_norm": 0.38475698232650757, "learning_rate": 1.1287010055409298e-05, "loss": 1.2144, "step": 15947 }, { "epoch": 4.750051192315568, "grad_norm": 0.3754880130290985, "learning_rate": 1.1286053504928966e-05, "loss": 1.2459, "step": 15948 }, { "epoch": 4.750349038515237, "grad_norm": 0.3564351201057434, "learning_rate": 1.1285096942483344e-05, "loss": 1.2186, "step": 15949 }, { "epoch": 4.7506468847149055, "grad_norm": 0.38486358523368835, "learning_rate": 1.128414036808133e-05, "loss": 1.2052, "step": 15950 }, { "epoch": 4.750944730914574, "grad_norm": 0.32017916440963745, "learning_rate": 1.1283183781731827e-05, "loss": 1.2082, "step": 15951 }, { "epoch": 4.751242577114242, "grad_norm": 0.3087635636329651, "learning_rate": 1.128222718344373e-05, "loss": 1.2183, "step": 15952 }, { "epoch": 4.751540423313911, "grad_norm": 0.3792348802089691, "learning_rate": 1.1281270573225947e-05, "loss": 1.2285, "step": 15953 }, { "epoch": 4.75183826951358, "grad_norm": 0.392402321100235, "learning_rate": 1.1280313951087367e-05, "loss": 1.2139, "step": 15954 }, { "epoch": 4.752136115713249, "grad_norm": 0.2829458713531494, "learning_rate": 1.1279357317036897e-05, "loss": 1.2198, "step": 15955 }, { "epoch": 4.752433961912917, "grad_norm": 0.2721094489097595, "learning_rate": 1.1278400671083435e-05, "loss": 1.2092, "step": 15956 }, { "epoch": 4.7527318081125856, "grad_norm": 0.2691056728363037, "learning_rate": 1.1277444013235888e-05, "loss": 1.2215, "step": 15957 }, { "epoch": 4.753029654312255, "grad_norm": 0.25471600890159607, "learning_rate": 1.1276487343503148e-05, "loss": 1.2296, "step": 15958 }, { "epoch": 4.753327500511923, "grad_norm": 0.2551353871822357, "learning_rate": 1.1275530661894121e-05, "loss": 1.231, "step": 15959 }, { "epoch": 4.753625346711591, "grad_norm": 0.3175426721572876, "learning_rate": 1.1274573968417708e-05, "loss": 1.217, "step": 15960 }, { "epoch": 4.753923192911261, "grad_norm": 0.4047291874885559, "learning_rate": 1.1273617263082804e-05, "loss": 1.1934, "step": 15961 }, { "epoch": 4.754221039110929, "grad_norm": 0.2636595070362091, "learning_rate": 1.1272660545898315e-05, "loss": 1.2184, "step": 15962 }, { "epoch": 4.754518885310597, "grad_norm": 0.49120742082595825, "learning_rate": 1.1271703816873147e-05, "loss": 1.2203, "step": 15963 }, { "epoch": 4.7548167315102665, "grad_norm": 0.3593755066394806, "learning_rate": 1.1270747076016191e-05, "loss": 1.2417, "step": 15964 }, { "epoch": 4.755114577709935, "grad_norm": 0.32109713554382324, "learning_rate": 1.1269790323336353e-05, "loss": 1.224, "step": 15965 }, { "epoch": 4.755412423909604, "grad_norm": 0.3326317369937897, "learning_rate": 1.1268833558842536e-05, "loss": 1.2134, "step": 15966 }, { "epoch": 4.755710270109272, "grad_norm": 0.24378105998039246, "learning_rate": 1.126787678254364e-05, "loss": 1.2122, "step": 15967 }, { "epoch": 4.756008116308941, "grad_norm": 0.33965176343917847, "learning_rate": 1.1266919994448567e-05, "loss": 1.2063, "step": 15968 }, { "epoch": 4.75630596250861, "grad_norm": 0.261719286441803, "learning_rate": 1.1265963194566218e-05, "loss": 1.2366, "step": 15969 }, { "epoch": 4.756603808708278, "grad_norm": 0.31250137090682983, "learning_rate": 1.1265006382905499e-05, "loss": 1.2277, "step": 15970 }, { "epoch": 4.756901654907947, "grad_norm": 0.2600827217102051, "learning_rate": 1.1264049559475307e-05, "loss": 1.2337, "step": 15971 }, { "epoch": 4.757199501107616, "grad_norm": 0.2892378568649292, "learning_rate": 1.1263092724284548e-05, "loss": 1.208, "step": 15972 }, { "epoch": 4.757497347307284, "grad_norm": 0.3316152095794678, "learning_rate": 1.126213587734212e-05, "loss": 1.219, "step": 15973 }, { "epoch": 4.7577951935069525, "grad_norm": 0.2919826805591583, "learning_rate": 1.1261179018656926e-05, "loss": 1.2263, "step": 15974 }, { "epoch": 4.758093039706622, "grad_norm": 0.39483243227005005, "learning_rate": 1.1260222148237874e-05, "loss": 1.2107, "step": 15975 }, { "epoch": 4.75839088590629, "grad_norm": 0.27996349334716797, "learning_rate": 1.1259265266093862e-05, "loss": 1.2147, "step": 15976 }, { "epoch": 4.758688732105959, "grad_norm": 0.5439602732658386, "learning_rate": 1.1258308372233794e-05, "loss": 1.2165, "step": 15977 }, { "epoch": 4.7589865783056275, "grad_norm": 0.4502294957637787, "learning_rate": 1.1257351466666572e-05, "loss": 1.2155, "step": 15978 }, { "epoch": 4.759284424505296, "grad_norm": 0.31706276535987854, "learning_rate": 1.1256394549401097e-05, "loss": 1.2151, "step": 15979 }, { "epoch": 4.759582270704965, "grad_norm": 0.5445091724395752, "learning_rate": 1.1255437620446279e-05, "loss": 1.2202, "step": 15980 }, { "epoch": 4.759880116904633, "grad_norm": 0.2917754054069519, "learning_rate": 1.1254480679811012e-05, "loss": 1.2174, "step": 15981 }, { "epoch": 4.760177963104302, "grad_norm": 0.3354133367538452, "learning_rate": 1.1253523727504207e-05, "loss": 1.2177, "step": 15982 }, { "epoch": 4.760475809303971, "grad_norm": 0.303833931684494, "learning_rate": 1.1252566763534762e-05, "loss": 1.2203, "step": 15983 }, { "epoch": 4.760773655503639, "grad_norm": 0.35081231594085693, "learning_rate": 1.1251609787911584e-05, "loss": 1.2156, "step": 15984 }, { "epoch": 4.761071501703308, "grad_norm": 0.29519397020339966, "learning_rate": 1.1250652800643576e-05, "loss": 1.2056, "step": 15985 }, { "epoch": 4.761369347902977, "grad_norm": 0.27962592244148254, "learning_rate": 1.1249695801739639e-05, "loss": 1.2172, "step": 15986 }, { "epoch": 4.761667194102645, "grad_norm": 0.2990504503250122, "learning_rate": 1.1248738791208682e-05, "loss": 1.2227, "step": 15987 }, { "epoch": 4.761965040302314, "grad_norm": 0.3045368790626526, "learning_rate": 1.1247781769059604e-05, "loss": 1.2197, "step": 15988 }, { "epoch": 4.762262886501983, "grad_norm": 0.2627740800380707, "learning_rate": 1.124682473530131e-05, "loss": 1.212, "step": 15989 }, { "epoch": 4.762560732701651, "grad_norm": 0.44368502497673035, "learning_rate": 1.1245867689942705e-05, "loss": 1.2142, "step": 15990 }, { "epoch": 4.762858578901319, "grad_norm": 0.2825852632522583, "learning_rate": 1.1244910632992694e-05, "loss": 1.2104, "step": 15991 }, { "epoch": 4.763156425100989, "grad_norm": 0.4482436776161194, "learning_rate": 1.1243953564460179e-05, "loss": 1.2277, "step": 15992 }, { "epoch": 4.763454271300657, "grad_norm": 0.28264114260673523, "learning_rate": 1.1242996484354068e-05, "loss": 1.2239, "step": 15993 }, { "epoch": 4.763752117500326, "grad_norm": 0.3293389081954956, "learning_rate": 1.1242039392683263e-05, "loss": 1.2199, "step": 15994 }, { "epoch": 4.7640499636999944, "grad_norm": 0.3082025945186615, "learning_rate": 1.1241082289456668e-05, "loss": 1.2353, "step": 15995 }, { "epoch": 4.764347809899663, "grad_norm": 0.24573594331741333, "learning_rate": 1.1240125174683189e-05, "loss": 1.2156, "step": 15996 }, { "epoch": 4.764645656099332, "grad_norm": 0.31183791160583496, "learning_rate": 1.1239168048371729e-05, "loss": 1.2313, "step": 15997 }, { "epoch": 4.764943502299, "grad_norm": 0.3353278338909149, "learning_rate": 1.12382109105312e-05, "loss": 1.2171, "step": 15998 }, { "epoch": 4.765241348498669, "grad_norm": 0.29151424765586853, "learning_rate": 1.12372537611705e-05, "loss": 1.2206, "step": 15999 }, { "epoch": 4.765539194698338, "grad_norm": 0.37144967913627625, "learning_rate": 1.1236296600298533e-05, "loss": 1.2322, "step": 16000 }, { "epoch": 4.765539194698338, "eval_loss": 1.321445107460022, "eval_runtime": 22.1535, "eval_samples_per_second": 78.272, "eval_steps_per_second": 4.92, "step": 16000 }, { "epoch": 4.765837040898006, "grad_norm": 0.24773110449314117, "learning_rate": 1.1235339427924212e-05, "loss": 1.2145, "step": 16001 }, { "epoch": 4.7661348870976745, "grad_norm": 0.31285032629966736, "learning_rate": 1.1234382244056433e-05, "loss": 1.2346, "step": 16002 }, { "epoch": 4.766432733297344, "grad_norm": 0.27079176902770996, "learning_rate": 1.1233425048704108e-05, "loss": 1.194, "step": 16003 }, { "epoch": 4.766730579497012, "grad_norm": 0.27916616201400757, "learning_rate": 1.1232467841876142e-05, "loss": 1.2209, "step": 16004 }, { "epoch": 4.767028425696681, "grad_norm": 0.2608424425125122, "learning_rate": 1.123151062358144e-05, "loss": 1.2204, "step": 16005 }, { "epoch": 4.76732627189635, "grad_norm": 0.2661018371582031, "learning_rate": 1.1230553393828908e-05, "loss": 1.2101, "step": 16006 }, { "epoch": 4.767624118096018, "grad_norm": 0.3621608316898346, "learning_rate": 1.1229596152627449e-05, "loss": 1.2297, "step": 16007 }, { "epoch": 4.767921964295687, "grad_norm": 0.3018997609615326, "learning_rate": 1.1228638899985973e-05, "loss": 1.2241, "step": 16008 }, { "epoch": 4.7682198104953555, "grad_norm": 0.4123643934726715, "learning_rate": 1.1227681635913384e-05, "loss": 1.2085, "step": 16009 }, { "epoch": 4.768517656695024, "grad_norm": 0.5294069051742554, "learning_rate": 1.1226724360418588e-05, "loss": 1.2186, "step": 16010 }, { "epoch": 4.768815502894693, "grad_norm": 0.3427787721157074, "learning_rate": 1.1225767073510496e-05, "loss": 1.2175, "step": 16011 }, { "epoch": 4.769113349094361, "grad_norm": 0.6941683292388916, "learning_rate": 1.1224809775198012e-05, "loss": 1.216, "step": 16012 }, { "epoch": 4.76941119529403, "grad_norm": 0.2954423427581787, "learning_rate": 1.1223852465490037e-05, "loss": 1.2246, "step": 16013 }, { "epoch": 4.769709041493699, "grad_norm": 0.3511301279067993, "learning_rate": 1.1222895144395482e-05, "loss": 1.2324, "step": 16014 }, { "epoch": 4.770006887693367, "grad_norm": 0.3313235342502594, "learning_rate": 1.1221937811923255e-05, "loss": 1.2298, "step": 16015 }, { "epoch": 4.770304733893036, "grad_norm": 0.29304108023643494, "learning_rate": 1.1220980468082264e-05, "loss": 1.2182, "step": 16016 }, { "epoch": 4.770602580092705, "grad_norm": 0.40669217705726624, "learning_rate": 1.1220023112881412e-05, "loss": 1.2177, "step": 16017 }, { "epoch": 4.770900426292373, "grad_norm": 0.3019462525844574, "learning_rate": 1.121906574632961e-05, "loss": 1.2168, "step": 16018 }, { "epoch": 4.771198272492041, "grad_norm": 0.3180030584335327, "learning_rate": 1.1218108368435761e-05, "loss": 1.208, "step": 16019 }, { "epoch": 4.771496118691711, "grad_norm": 0.290020227432251, "learning_rate": 1.1217150979208773e-05, "loss": 1.2181, "step": 16020 }, { "epoch": 4.771793964891379, "grad_norm": 0.28905245661735535, "learning_rate": 1.121619357865756e-05, "loss": 1.2102, "step": 16021 }, { "epoch": 4.772091811091048, "grad_norm": 0.3011939823627472, "learning_rate": 1.1215236166791022e-05, "loss": 1.2265, "step": 16022 }, { "epoch": 4.7723896572907165, "grad_norm": 0.29782941937446594, "learning_rate": 1.1214278743618068e-05, "loss": 1.2055, "step": 16023 }, { "epoch": 4.772687503490385, "grad_norm": 0.32444489002227783, "learning_rate": 1.1213321309147609e-05, "loss": 1.2217, "step": 16024 }, { "epoch": 4.772985349690054, "grad_norm": 0.41676148772239685, "learning_rate": 1.1212363863388549e-05, "loss": 1.215, "step": 16025 }, { "epoch": 4.773283195889722, "grad_norm": 0.27466535568237305, "learning_rate": 1.1211406406349798e-05, "loss": 1.2332, "step": 16026 }, { "epoch": 4.773581042089391, "grad_norm": 0.3036925494670868, "learning_rate": 1.1210448938040264e-05, "loss": 1.2242, "step": 16027 }, { "epoch": 4.77387888828906, "grad_norm": 0.41110652685165405, "learning_rate": 1.1209491458468858e-05, "loss": 1.2247, "step": 16028 }, { "epoch": 4.774176734488728, "grad_norm": 0.2643027901649475, "learning_rate": 1.1208533967644482e-05, "loss": 1.2155, "step": 16029 }, { "epoch": 4.774474580688397, "grad_norm": 0.37844428420066833, "learning_rate": 1.1207576465576049e-05, "loss": 1.2205, "step": 16030 }, { "epoch": 4.774772426888066, "grad_norm": 0.30798405408859253, "learning_rate": 1.1206618952272466e-05, "loss": 1.22, "step": 16031 }, { "epoch": 4.775070273087734, "grad_norm": 0.5551376938819885, "learning_rate": 1.120566142774264e-05, "loss": 1.217, "step": 16032 }, { "epoch": 4.775368119287403, "grad_norm": 0.4104938805103302, "learning_rate": 1.1204703891995483e-05, "loss": 1.2127, "step": 16033 }, { "epoch": 4.775665965487072, "grad_norm": 0.36480623483657837, "learning_rate": 1.1203746345039903e-05, "loss": 1.2151, "step": 16034 }, { "epoch": 4.77596381168674, "grad_norm": 0.39886218309402466, "learning_rate": 1.1202788786884808e-05, "loss": 1.2292, "step": 16035 }, { "epoch": 4.776261657886409, "grad_norm": 0.3694417178630829, "learning_rate": 1.1201831217539108e-05, "loss": 1.2249, "step": 16036 }, { "epoch": 4.7765595040860775, "grad_norm": 0.42429476976394653, "learning_rate": 1.1200873637011708e-05, "loss": 1.2301, "step": 16037 }, { "epoch": 4.776857350285746, "grad_norm": 0.3157731294631958, "learning_rate": 1.1199916045311522e-05, "loss": 1.2182, "step": 16038 }, { "epoch": 4.777155196485415, "grad_norm": 0.5065358877182007, "learning_rate": 1.119895844244746e-05, "loss": 1.2333, "step": 16039 }, { "epoch": 4.777453042685083, "grad_norm": 0.2756097614765167, "learning_rate": 1.1198000828428426e-05, "loss": 1.2055, "step": 16040 }, { "epoch": 4.777750888884752, "grad_norm": 0.8761605620384216, "learning_rate": 1.1197043203263334e-05, "loss": 1.2283, "step": 16041 }, { "epoch": 4.778048735084421, "grad_norm": 0.4738672077655792, "learning_rate": 1.1196085566961095e-05, "loss": 1.2189, "step": 16042 }, { "epoch": 4.778346581284089, "grad_norm": 0.5599620342254639, "learning_rate": 1.119512791953061e-05, "loss": 1.2101, "step": 16043 }, { "epoch": 4.7786444274837585, "grad_norm": 0.3681389093399048, "learning_rate": 1.1194170260980799e-05, "loss": 1.211, "step": 16044 }, { "epoch": 4.778942273683427, "grad_norm": 0.5590227246284485, "learning_rate": 1.1193212591320571e-05, "loss": 1.2064, "step": 16045 }, { "epoch": 4.779240119883095, "grad_norm": 0.30715882778167725, "learning_rate": 1.1192254910558828e-05, "loss": 1.2135, "step": 16046 }, { "epoch": 4.779537966082764, "grad_norm": 0.6409532427787781, "learning_rate": 1.1191297218704487e-05, "loss": 1.2243, "step": 16047 }, { "epoch": 4.779835812282433, "grad_norm": 0.3171904981136322, "learning_rate": 1.1190339515766454e-05, "loss": 1.204, "step": 16048 }, { "epoch": 4.780133658482101, "grad_norm": 0.5026242733001709, "learning_rate": 1.1189381801753645e-05, "loss": 1.2313, "step": 16049 }, { "epoch": 4.78043150468177, "grad_norm": 0.2544401288032532, "learning_rate": 1.1188424076674966e-05, "loss": 1.2163, "step": 16050 }, { "epoch": 4.780729350881439, "grad_norm": 0.3725774884223938, "learning_rate": 1.1187466340539328e-05, "loss": 1.2069, "step": 16051 }, { "epoch": 4.781027197081107, "grad_norm": 0.2977886497974396, "learning_rate": 1.1186508593355645e-05, "loss": 1.2289, "step": 16052 }, { "epoch": 4.781325043280776, "grad_norm": 0.3489167094230652, "learning_rate": 1.1185550835132824e-05, "loss": 1.2218, "step": 16053 }, { "epoch": 4.7816228894804444, "grad_norm": 0.31889593601226807, "learning_rate": 1.1184593065879773e-05, "loss": 1.2256, "step": 16054 }, { "epoch": 4.781920735680114, "grad_norm": 0.27601557970046997, "learning_rate": 1.1183635285605412e-05, "loss": 1.2222, "step": 16055 }, { "epoch": 4.782218581879782, "grad_norm": 0.449756920337677, "learning_rate": 1.1182677494318646e-05, "loss": 1.2291, "step": 16056 }, { "epoch": 4.78251642807945, "grad_norm": 0.2948785424232483, "learning_rate": 1.1181719692028385e-05, "loss": 1.2347, "step": 16057 }, { "epoch": 4.782814274279119, "grad_norm": 0.3799836337566376, "learning_rate": 1.1180761878743545e-05, "loss": 1.2174, "step": 16058 }, { "epoch": 4.783112120478788, "grad_norm": 0.254565954208374, "learning_rate": 1.1179804054473036e-05, "loss": 1.2159, "step": 16059 }, { "epoch": 4.783409966678456, "grad_norm": 0.35955533385276794, "learning_rate": 1.1178846219225767e-05, "loss": 1.2216, "step": 16060 }, { "epoch": 4.783707812878125, "grad_norm": 0.2900918126106262, "learning_rate": 1.1177888373010647e-05, "loss": 1.2218, "step": 16061 }, { "epoch": 4.784005659077794, "grad_norm": 0.3033221960067749, "learning_rate": 1.1176930515836599e-05, "loss": 1.2252, "step": 16062 }, { "epoch": 4.784303505277462, "grad_norm": 0.32291966676712036, "learning_rate": 1.1175972647712523e-05, "loss": 1.2185, "step": 16063 }, { "epoch": 4.784601351477131, "grad_norm": 0.40774357318878174, "learning_rate": 1.1175014768647336e-05, "loss": 1.2159, "step": 16064 }, { "epoch": 4.7848991976768, "grad_norm": 0.25964364409446716, "learning_rate": 1.1174056878649951e-05, "loss": 1.249, "step": 16065 }, { "epoch": 4.785197043876468, "grad_norm": 0.44589051604270935, "learning_rate": 1.117309897772928e-05, "loss": 1.2218, "step": 16066 }, { "epoch": 4.785494890076137, "grad_norm": 0.362463116645813, "learning_rate": 1.1172141065894227e-05, "loss": 1.2192, "step": 16067 }, { "epoch": 4.7857927362758055, "grad_norm": 0.3140842914581299, "learning_rate": 1.1171183143153714e-05, "loss": 1.229, "step": 16068 }, { "epoch": 4.786090582475474, "grad_norm": 0.5171372890472412, "learning_rate": 1.1170225209516654e-05, "loss": 1.2359, "step": 16069 }, { "epoch": 4.786388428675143, "grad_norm": 0.35875049233436584, "learning_rate": 1.1169267264991952e-05, "loss": 1.2267, "step": 16070 }, { "epoch": 4.786686274874811, "grad_norm": 0.29174157977104187, "learning_rate": 1.1168309309588525e-05, "loss": 1.2158, "step": 16071 }, { "epoch": 4.786984121074481, "grad_norm": 0.2554702162742615, "learning_rate": 1.1167351343315286e-05, "loss": 1.2288, "step": 16072 }, { "epoch": 4.787281967274149, "grad_norm": 0.35019010305404663, "learning_rate": 1.1166393366181147e-05, "loss": 1.2252, "step": 16073 }, { "epoch": 4.787579813473817, "grad_norm": 0.2655666470527649, "learning_rate": 1.116543537819502e-05, "loss": 1.2226, "step": 16074 }, { "epoch": 4.787877659673486, "grad_norm": 0.32722175121307373, "learning_rate": 1.1164477379365821e-05, "loss": 1.2015, "step": 16075 }, { "epoch": 4.788175505873155, "grad_norm": 0.33848732709884644, "learning_rate": 1.116351936970246e-05, "loss": 1.222, "step": 16076 }, { "epoch": 4.788473352072823, "grad_norm": 0.2932543158531189, "learning_rate": 1.1162561349213851e-05, "loss": 1.2187, "step": 16077 }, { "epoch": 4.788771198272492, "grad_norm": 0.450366348028183, "learning_rate": 1.1161603317908907e-05, "loss": 1.2189, "step": 16078 }, { "epoch": 4.789069044472161, "grad_norm": 0.3034180700778961, "learning_rate": 1.1160645275796543e-05, "loss": 1.2144, "step": 16079 }, { "epoch": 4.789366890671829, "grad_norm": 0.3472573459148407, "learning_rate": 1.1159687222885672e-05, "loss": 1.2085, "step": 16080 }, { "epoch": 4.789664736871498, "grad_norm": 0.3495369851589203, "learning_rate": 1.1158729159185204e-05, "loss": 1.2258, "step": 16081 }, { "epoch": 4.7899625830711665, "grad_norm": 0.2551051676273346, "learning_rate": 1.1157771084704056e-05, "loss": 1.2269, "step": 16082 }, { "epoch": 4.790260429270836, "grad_norm": 0.30192965269088745, "learning_rate": 1.1156812999451145e-05, "loss": 1.2, "step": 16083 }, { "epoch": 4.790558275470504, "grad_norm": 0.2525578439235687, "learning_rate": 1.115585490343538e-05, "loss": 1.2162, "step": 16084 }, { "epoch": 4.790856121670172, "grad_norm": 0.26708728075027466, "learning_rate": 1.1154896796665676e-05, "loss": 1.2413, "step": 16085 }, { "epoch": 4.791153967869841, "grad_norm": 0.28091707825660706, "learning_rate": 1.1153938679150948e-05, "loss": 1.2199, "step": 16086 }, { "epoch": 4.79145181406951, "grad_norm": 0.2770179212093353, "learning_rate": 1.115298055090011e-05, "loss": 1.2501, "step": 16087 }, { "epoch": 4.791749660269178, "grad_norm": 0.2677095830440521, "learning_rate": 1.1152022411922076e-05, "loss": 1.2258, "step": 16088 }, { "epoch": 4.7920475064688475, "grad_norm": 0.25663822889328003, "learning_rate": 1.115106426222576e-05, "loss": 1.2029, "step": 16089 }, { "epoch": 4.792345352668516, "grad_norm": 0.2832064628601074, "learning_rate": 1.1150106101820077e-05, "loss": 1.2244, "step": 16090 }, { "epoch": 4.792643198868184, "grad_norm": 0.2537904977798462, "learning_rate": 1.1149147930713941e-05, "loss": 1.2245, "step": 16091 }, { "epoch": 4.792941045067853, "grad_norm": 0.28968295454978943, "learning_rate": 1.114818974891627e-05, "loss": 1.2259, "step": 16092 }, { "epoch": 4.793238891267522, "grad_norm": 0.24708563089370728, "learning_rate": 1.1147231556435976e-05, "loss": 1.2031, "step": 16093 }, { "epoch": 4.79353673746719, "grad_norm": 0.4056083559989929, "learning_rate": 1.1146273353281974e-05, "loss": 1.2181, "step": 16094 }, { "epoch": 4.793834583666859, "grad_norm": 0.498331218957901, "learning_rate": 1.1145315139463178e-05, "loss": 1.2073, "step": 16095 }, { "epoch": 4.7941324298665275, "grad_norm": 0.4491882920265198, "learning_rate": 1.1144356914988504e-05, "loss": 1.2151, "step": 16096 }, { "epoch": 4.794430276066196, "grad_norm": 0.3053653836250305, "learning_rate": 1.1143398679866869e-05, "loss": 1.2184, "step": 16097 }, { "epoch": 4.794728122265865, "grad_norm": 0.3851979076862335, "learning_rate": 1.1142440434107182e-05, "loss": 1.2135, "step": 16098 }, { "epoch": 4.795025968465533, "grad_norm": 0.4008645713329315, "learning_rate": 1.1141482177718369e-05, "loss": 1.2086, "step": 16099 }, { "epoch": 4.795323814665203, "grad_norm": 0.28745028376579285, "learning_rate": 1.1140523910709338e-05, "loss": 1.2355, "step": 16100 }, { "epoch": 4.795621660864871, "grad_norm": 0.3781936466693878, "learning_rate": 1.1139565633089006e-05, "loss": 1.2294, "step": 16101 }, { "epoch": 4.795919507064539, "grad_norm": 0.2987149655818939, "learning_rate": 1.1138607344866286e-05, "loss": 1.2111, "step": 16102 }, { "epoch": 4.7962173532642085, "grad_norm": 0.27935880422592163, "learning_rate": 1.11376490460501e-05, "loss": 1.2187, "step": 16103 }, { "epoch": 4.796515199463877, "grad_norm": 0.36832916736602783, "learning_rate": 1.1136690736649362e-05, "loss": 1.2422, "step": 16104 }, { "epoch": 4.796813045663545, "grad_norm": 0.30461856722831726, "learning_rate": 1.1135732416672984e-05, "loss": 1.2087, "step": 16105 }, { "epoch": 4.797110891863214, "grad_norm": 0.3361176550388336, "learning_rate": 1.1134774086129884e-05, "loss": 1.2143, "step": 16106 }, { "epoch": 4.797408738062883, "grad_norm": 0.3742274343967438, "learning_rate": 1.1133815745028984e-05, "loss": 1.2165, "step": 16107 }, { "epoch": 4.797706584262551, "grad_norm": 0.2652067542076111, "learning_rate": 1.1132857393379191e-05, "loss": 1.226, "step": 16108 }, { "epoch": 4.79800443046222, "grad_norm": 0.3102246820926666, "learning_rate": 1.1131899031189427e-05, "loss": 1.2358, "step": 16109 }, { "epoch": 4.798302276661889, "grad_norm": 0.6317980885505676, "learning_rate": 1.1130940658468607e-05, "loss": 1.2431, "step": 16110 }, { "epoch": 4.798600122861558, "grad_norm": 0.6439951658248901, "learning_rate": 1.1129982275225646e-05, "loss": 1.2208, "step": 16111 }, { "epoch": 4.798897969061226, "grad_norm": 0.3831596374511719, "learning_rate": 1.1129023881469466e-05, "loss": 1.2113, "step": 16112 }, { "epoch": 4.7991958152608944, "grad_norm": 0.37428316473960876, "learning_rate": 1.1128065477208976e-05, "loss": 1.222, "step": 16113 }, { "epoch": 4.799493661460564, "grad_norm": 0.31380048394203186, "learning_rate": 1.11271070624531e-05, "loss": 1.2214, "step": 16114 }, { "epoch": 4.799791507660232, "grad_norm": 0.43900665640830994, "learning_rate": 1.112614863721075e-05, "loss": 1.2324, "step": 16115 }, { "epoch": 4.8000893538599, "grad_norm": 0.4185546338558197, "learning_rate": 1.1125190201490845e-05, "loss": 1.2354, "step": 16116 }, { "epoch": 4.8003872000595695, "grad_norm": 0.2961720824241638, "learning_rate": 1.1124231755302305e-05, "loss": 1.2194, "step": 16117 }, { "epoch": 4.800685046259238, "grad_norm": 0.4965033531188965, "learning_rate": 1.1123273298654044e-05, "loss": 1.2112, "step": 16118 }, { "epoch": 4.800982892458906, "grad_norm": 0.2967652678489685, "learning_rate": 1.1122314831554979e-05, "loss": 1.2163, "step": 16119 }, { "epoch": 4.801280738658575, "grad_norm": 0.4445110559463501, "learning_rate": 1.1121356354014028e-05, "loss": 1.2164, "step": 16120 }, { "epoch": 4.801578584858244, "grad_norm": 0.27836817502975464, "learning_rate": 1.1120397866040112e-05, "loss": 1.2389, "step": 16121 }, { "epoch": 4.801876431057913, "grad_norm": 0.4156196117401123, "learning_rate": 1.1119439367642142e-05, "loss": 1.2135, "step": 16122 }, { "epoch": 4.802174277257581, "grad_norm": 0.25994083285331726, "learning_rate": 1.1118480858829039e-05, "loss": 1.2088, "step": 16123 }, { "epoch": 4.80247212345725, "grad_norm": 0.37323346734046936, "learning_rate": 1.1117522339609725e-05, "loss": 1.2081, "step": 16124 }, { "epoch": 4.802769969656918, "grad_norm": 0.2517227828502655, "learning_rate": 1.1116563809993113e-05, "loss": 1.2275, "step": 16125 }, { "epoch": 4.803067815856587, "grad_norm": 0.28889697790145874, "learning_rate": 1.111560526998812e-05, "loss": 1.2276, "step": 16126 }, { "epoch": 4.8033656620562555, "grad_norm": 0.2838118374347687, "learning_rate": 1.1114646719603669e-05, "loss": 1.2244, "step": 16127 }, { "epoch": 4.803663508255925, "grad_norm": 0.26490119099617004, "learning_rate": 1.1113688158848672e-05, "loss": 1.2281, "step": 16128 }, { "epoch": 4.803961354455593, "grad_norm": 0.3336222469806671, "learning_rate": 1.1112729587732054e-05, "loss": 1.2238, "step": 16129 }, { "epoch": 4.804259200655261, "grad_norm": 0.285384863615036, "learning_rate": 1.1111771006262728e-05, "loss": 1.2293, "step": 16130 }, { "epoch": 4.804557046854931, "grad_norm": 0.32861578464508057, "learning_rate": 1.111081241444962e-05, "loss": 1.2262, "step": 16131 }, { "epoch": 4.804854893054599, "grad_norm": 0.24425429105758667, "learning_rate": 1.1109853812301638e-05, "loss": 1.2266, "step": 16132 }, { "epoch": 4.805152739254267, "grad_norm": 0.4054061770439148, "learning_rate": 1.1108895199827708e-05, "loss": 1.2304, "step": 16133 }, { "epoch": 4.805450585453936, "grad_norm": 0.4688212275505066, "learning_rate": 1.110793657703675e-05, "loss": 1.223, "step": 16134 }, { "epoch": 4.805748431653605, "grad_norm": 0.27641046047210693, "learning_rate": 1.1106977943937676e-05, "loss": 1.2241, "step": 16135 }, { "epoch": 4.806046277853273, "grad_norm": 0.6944801211357117, "learning_rate": 1.110601930053941e-05, "loss": 1.2068, "step": 16136 }, { "epoch": 4.806344124052942, "grad_norm": 0.402495801448822, "learning_rate": 1.1105060646850871e-05, "loss": 1.2159, "step": 16137 }, { "epoch": 4.806641970252611, "grad_norm": 0.457445353269577, "learning_rate": 1.1104101982880978e-05, "loss": 1.2148, "step": 16138 }, { "epoch": 4.80693981645228, "grad_norm": 0.3986983001232147, "learning_rate": 1.1103143308638648e-05, "loss": 1.2056, "step": 16139 }, { "epoch": 4.807237662651948, "grad_norm": 0.3038369119167328, "learning_rate": 1.1102184624132802e-05, "loss": 1.2168, "step": 16140 }, { "epoch": 4.8075355088516165, "grad_norm": 0.44019803404808044, "learning_rate": 1.1101225929372364e-05, "loss": 1.2097, "step": 16141 }, { "epoch": 4.807833355051286, "grad_norm": 0.2620181739330292, "learning_rate": 1.1100267224366243e-05, "loss": 1.223, "step": 16142 }, { "epoch": 4.808131201250954, "grad_norm": 0.3152141273021698, "learning_rate": 1.1099308509123367e-05, "loss": 1.2292, "step": 16143 }, { "epoch": 4.808429047450622, "grad_norm": 0.27323970198631287, "learning_rate": 1.1098349783652654e-05, "loss": 1.2153, "step": 16144 }, { "epoch": 4.808726893650292, "grad_norm": 0.2811146378517151, "learning_rate": 1.1097391047963022e-05, "loss": 1.2279, "step": 16145 }, { "epoch": 4.80902473984996, "grad_norm": 0.31005099415779114, "learning_rate": 1.1096432302063394e-05, "loss": 1.217, "step": 16146 }, { "epoch": 4.809322586049628, "grad_norm": 0.2507877051830292, "learning_rate": 1.1095473545962688e-05, "loss": 1.2057, "step": 16147 }, { "epoch": 4.8096204322492975, "grad_norm": 0.37099072337150574, "learning_rate": 1.1094514779669825e-05, "loss": 1.2328, "step": 16148 }, { "epoch": 4.809918278448966, "grad_norm": 0.27322760224342346, "learning_rate": 1.1093556003193722e-05, "loss": 1.2211, "step": 16149 }, { "epoch": 4.810216124648635, "grad_norm": 0.3571750521659851, "learning_rate": 1.1092597216543303e-05, "loss": 1.2116, "step": 16150 }, { "epoch": 4.810513970848303, "grad_norm": 0.3114352524280548, "learning_rate": 1.1091638419727493e-05, "loss": 1.2199, "step": 16151 }, { "epoch": 4.810811817047972, "grad_norm": 0.32958173751831055, "learning_rate": 1.1090679612755202e-05, "loss": 1.2013, "step": 16152 }, { "epoch": 4.81110966324764, "grad_norm": 0.3899846374988556, "learning_rate": 1.1089720795635357e-05, "loss": 1.2131, "step": 16153 }, { "epoch": 4.811407509447309, "grad_norm": 0.3302745223045349, "learning_rate": 1.1088761968376878e-05, "loss": 1.2019, "step": 16154 }, { "epoch": 4.8117053556469775, "grad_norm": 0.5378304719924927, "learning_rate": 1.1087803130988684e-05, "loss": 1.2104, "step": 16155 }, { "epoch": 4.812003201846647, "grad_norm": 0.2669019401073456, "learning_rate": 1.1086844283479699e-05, "loss": 1.1935, "step": 16156 }, { "epoch": 4.812301048046315, "grad_norm": 0.8667817711830139, "learning_rate": 1.108588542585884e-05, "loss": 1.219, "step": 16157 }, { "epoch": 4.812598894245983, "grad_norm": 0.5090213418006897, "learning_rate": 1.1084926558135034e-05, "loss": 1.2206, "step": 16158 }, { "epoch": 4.812896740445653, "grad_norm": 0.5283071994781494, "learning_rate": 1.1083967680317196e-05, "loss": 1.2225, "step": 16159 }, { "epoch": 4.813194586645321, "grad_norm": 0.3246292471885681, "learning_rate": 1.108300879241425e-05, "loss": 1.2135, "step": 16160 }, { "epoch": 4.813492432844989, "grad_norm": 0.4943784177303314, "learning_rate": 1.1082049894435116e-05, "loss": 1.2283, "step": 16161 }, { "epoch": 4.8137902790446585, "grad_norm": 0.3359585702419281, "learning_rate": 1.1081090986388718e-05, "loss": 1.2248, "step": 16162 }, { "epoch": 4.814088125244327, "grad_norm": 0.4059714078903198, "learning_rate": 1.1080132068283974e-05, "loss": 1.2143, "step": 16163 }, { "epoch": 4.814385971443995, "grad_norm": 0.45377418398857117, "learning_rate": 1.107917314012981e-05, "loss": 1.2078, "step": 16164 }, { "epoch": 4.814683817643664, "grad_norm": 0.34101006388664246, "learning_rate": 1.1078214201935148e-05, "loss": 1.2144, "step": 16165 }, { "epoch": 4.814981663843333, "grad_norm": 0.4971599280834198, "learning_rate": 1.1077255253708906e-05, "loss": 1.2322, "step": 16166 }, { "epoch": 4.815279510043002, "grad_norm": 0.26410800218582153, "learning_rate": 1.1076296295460003e-05, "loss": 1.2216, "step": 16167 }, { "epoch": 4.81557735624267, "grad_norm": 0.44251903891563416, "learning_rate": 1.107533732719737e-05, "loss": 1.2223, "step": 16168 }, { "epoch": 4.815875202442339, "grad_norm": 0.3852265179157257, "learning_rate": 1.1074378348929924e-05, "loss": 1.2287, "step": 16169 }, { "epoch": 4.816173048642008, "grad_norm": 0.3347865641117096, "learning_rate": 1.1073419360666588e-05, "loss": 1.222, "step": 16170 }, { "epoch": 4.816470894841676, "grad_norm": 0.3753680884838104, "learning_rate": 1.1072460362416284e-05, "loss": 1.2304, "step": 16171 }, { "epoch": 4.8167687410413444, "grad_norm": 0.2690870761871338, "learning_rate": 1.1071501354187933e-05, "loss": 1.2207, "step": 16172 }, { "epoch": 4.817066587241014, "grad_norm": 0.34019342064857483, "learning_rate": 1.107054233599046e-05, "loss": 1.2251, "step": 16173 }, { "epoch": 4.817364433440682, "grad_norm": 0.24536536633968353, "learning_rate": 1.1069583307832788e-05, "loss": 1.2086, "step": 16174 }, { "epoch": 4.81766227964035, "grad_norm": 0.5281050801277161, "learning_rate": 1.1068624269723837e-05, "loss": 1.2158, "step": 16175 }, { "epoch": 4.8179601258400195, "grad_norm": 0.45852169394493103, "learning_rate": 1.1067665221672532e-05, "loss": 1.2217, "step": 16176 }, { "epoch": 4.818257972039688, "grad_norm": 0.3672395944595337, "learning_rate": 1.1066706163687794e-05, "loss": 1.2332, "step": 16177 }, { "epoch": 4.818555818239357, "grad_norm": 0.582095205783844, "learning_rate": 1.1065747095778547e-05, "loss": 1.2274, "step": 16178 }, { "epoch": 4.818853664439025, "grad_norm": 0.2974826693534851, "learning_rate": 1.1064788017953714e-05, "loss": 1.2259, "step": 16179 }, { "epoch": 4.819151510638694, "grad_norm": 0.5012524724006653, "learning_rate": 1.1063828930222219e-05, "loss": 1.2325, "step": 16180 }, { "epoch": 4.819449356838363, "grad_norm": 0.3937076926231384, "learning_rate": 1.1062869832592981e-05, "loss": 1.2267, "step": 16181 }, { "epoch": 4.819747203038031, "grad_norm": 0.520210862159729, "learning_rate": 1.1061910725074933e-05, "loss": 1.219, "step": 16182 }, { "epoch": 4.8200450492377, "grad_norm": 0.39134225249290466, "learning_rate": 1.1060951607676989e-05, "loss": 1.2076, "step": 16183 }, { "epoch": 4.820342895437369, "grad_norm": 0.35149261355400085, "learning_rate": 1.1059992480408076e-05, "loss": 1.2023, "step": 16184 }, { "epoch": 4.820640741637037, "grad_norm": 0.33472779393196106, "learning_rate": 1.1059033343277114e-05, "loss": 1.2119, "step": 16185 }, { "epoch": 4.8209385878367055, "grad_norm": 0.3002585172653198, "learning_rate": 1.1058074196293034e-05, "loss": 1.2218, "step": 16186 }, { "epoch": 4.821236434036375, "grad_norm": 0.5443071126937866, "learning_rate": 1.1057115039464751e-05, "loss": 1.2247, "step": 16187 }, { "epoch": 4.821534280236043, "grad_norm": 0.28928321599960327, "learning_rate": 1.10561558728012e-05, "loss": 1.2367, "step": 16188 }, { "epoch": 4.821832126435712, "grad_norm": 0.41158461570739746, "learning_rate": 1.1055196696311296e-05, "loss": 1.2314, "step": 16189 }, { "epoch": 4.822129972635381, "grad_norm": 0.34556692838668823, "learning_rate": 1.1054237510003961e-05, "loss": 1.225, "step": 16190 }, { "epoch": 4.822427818835049, "grad_norm": 0.5000821948051453, "learning_rate": 1.105327831388813e-05, "loss": 1.2268, "step": 16191 }, { "epoch": 4.822725665034717, "grad_norm": 0.323465496301651, "learning_rate": 1.1052319107972719e-05, "loss": 1.2025, "step": 16192 }, { "epoch": 4.823023511234386, "grad_norm": 0.3453395962715149, "learning_rate": 1.1051359892266655e-05, "loss": 1.1967, "step": 16193 }, { "epoch": 4.823321357434055, "grad_norm": 0.34367984533309937, "learning_rate": 1.1050400666778863e-05, "loss": 1.2116, "step": 16194 }, { "epoch": 4.823619203633724, "grad_norm": 0.3218033015727997, "learning_rate": 1.1049441431518267e-05, "loss": 1.2118, "step": 16195 }, { "epoch": 4.823917049833392, "grad_norm": 0.269066721200943, "learning_rate": 1.1048482186493785e-05, "loss": 1.2078, "step": 16196 }, { "epoch": 4.824214896033061, "grad_norm": 0.28650161623954773, "learning_rate": 1.1047522931714352e-05, "loss": 1.2108, "step": 16197 }, { "epoch": 4.82451274223273, "grad_norm": 0.3001042306423187, "learning_rate": 1.1046563667188888e-05, "loss": 1.2141, "step": 16198 }, { "epoch": 4.824810588432398, "grad_norm": 0.4155952036380768, "learning_rate": 1.1045604392926318e-05, "loss": 1.2293, "step": 16199 }, { "epoch": 4.8251084346320665, "grad_norm": 0.29334381222724915, "learning_rate": 1.1044645108935568e-05, "loss": 1.2134, "step": 16200 }, { "epoch": 4.825406280831736, "grad_norm": 0.32471799850463867, "learning_rate": 1.1043685815225562e-05, "loss": 1.223, "step": 16201 }, { "epoch": 4.825704127031404, "grad_norm": 0.2967764437198639, "learning_rate": 1.1042726511805226e-05, "loss": 1.2231, "step": 16202 }, { "epoch": 4.826001973231072, "grad_norm": 0.3156512379646301, "learning_rate": 1.1041767198683483e-05, "loss": 1.2272, "step": 16203 }, { "epoch": 4.826299819430742, "grad_norm": 0.27915698289871216, "learning_rate": 1.1040807875869261e-05, "loss": 1.2229, "step": 16204 }, { "epoch": 4.82659766563041, "grad_norm": 0.3005918562412262, "learning_rate": 1.1039848543371485e-05, "loss": 1.2285, "step": 16205 }, { "epoch": 4.826895511830079, "grad_norm": 0.2813635468482971, "learning_rate": 1.1038889201199084e-05, "loss": 1.212, "step": 16206 }, { "epoch": 4.8271933580297475, "grad_norm": 0.45619508624076843, "learning_rate": 1.1037929849360976e-05, "loss": 1.2311, "step": 16207 }, { "epoch": 4.827491204229416, "grad_norm": 0.3799247145652771, "learning_rate": 1.1036970487866086e-05, "loss": 1.2272, "step": 16208 }, { "epoch": 4.827789050429085, "grad_norm": 0.30754658579826355, "learning_rate": 1.103601111672335e-05, "loss": 1.2273, "step": 16209 }, { "epoch": 4.828086896628753, "grad_norm": 0.33981654047966003, "learning_rate": 1.1035051735941686e-05, "loss": 1.2129, "step": 16210 }, { "epoch": 4.828384742828422, "grad_norm": 0.2944939136505127, "learning_rate": 1.1034092345530023e-05, "loss": 1.216, "step": 16211 }, { "epoch": 4.828682589028091, "grad_norm": 0.5903461575508118, "learning_rate": 1.1033132945497287e-05, "loss": 1.1992, "step": 16212 }, { "epoch": 4.828980435227759, "grad_norm": 0.46102413535118103, "learning_rate": 1.10321735358524e-05, "loss": 1.2253, "step": 16213 }, { "epoch": 4.8292782814274275, "grad_norm": 0.31985360383987427, "learning_rate": 1.1031214116604294e-05, "loss": 1.2162, "step": 16214 }, { "epoch": 4.829576127627097, "grad_norm": 0.31031545996665955, "learning_rate": 1.1030254687761892e-05, "loss": 1.2283, "step": 16215 }, { "epoch": 4.829873973826765, "grad_norm": 0.3633049726486206, "learning_rate": 1.1029295249334122e-05, "loss": 1.231, "step": 16216 }, { "epoch": 4.830171820026434, "grad_norm": 0.2707124352455139, "learning_rate": 1.1028335801329907e-05, "loss": 1.2105, "step": 16217 }, { "epoch": 4.830469666226103, "grad_norm": 0.2790050804615021, "learning_rate": 1.102737634375818e-05, "loss": 1.199, "step": 16218 }, { "epoch": 4.830767512425771, "grad_norm": 0.294877290725708, "learning_rate": 1.102641687662786e-05, "loss": 1.224, "step": 16219 }, { "epoch": 4.83106535862544, "grad_norm": 0.4410344064235687, "learning_rate": 1.102545739994788e-05, "loss": 1.215, "step": 16220 }, { "epoch": 4.8313632048251085, "grad_norm": 0.25842905044555664, "learning_rate": 1.1024497913727166e-05, "loss": 1.2171, "step": 16221 }, { "epoch": 4.831661051024777, "grad_norm": 0.44133102893829346, "learning_rate": 1.1023538417974641e-05, "loss": 1.2074, "step": 16222 }, { "epoch": 4.831958897224446, "grad_norm": 0.2675754427909851, "learning_rate": 1.1022578912699237e-05, "loss": 1.2259, "step": 16223 }, { "epoch": 4.832256743424114, "grad_norm": 0.5445632934570312, "learning_rate": 1.102161939790988e-05, "loss": 1.2173, "step": 16224 }, { "epoch": 4.832554589623783, "grad_norm": 0.3402521014213562, "learning_rate": 1.1020659873615491e-05, "loss": 1.2052, "step": 16225 }, { "epoch": 4.832852435823452, "grad_norm": 0.3629879653453827, "learning_rate": 1.1019700339825006e-05, "loss": 1.2254, "step": 16226 }, { "epoch": 4.83315028202312, "grad_norm": 0.27562153339385986, "learning_rate": 1.1018740796547347e-05, "loss": 1.2034, "step": 16227 }, { "epoch": 4.833448128222789, "grad_norm": 0.46772968769073486, "learning_rate": 1.1017781243791443e-05, "loss": 1.223, "step": 16228 }, { "epoch": 4.833745974422458, "grad_norm": 0.3419756591320038, "learning_rate": 1.1016821681566222e-05, "loss": 1.2179, "step": 16229 }, { "epoch": 4.834043820622126, "grad_norm": 0.35203564167022705, "learning_rate": 1.1015862109880612e-05, "loss": 1.2265, "step": 16230 }, { "epoch": 4.8343416668217944, "grad_norm": 0.2615616023540497, "learning_rate": 1.1014902528743537e-05, "loss": 1.2261, "step": 16231 }, { "epoch": 4.834639513021464, "grad_norm": 0.32637733221054077, "learning_rate": 1.101394293816393e-05, "loss": 1.2284, "step": 16232 }, { "epoch": 4.834937359221132, "grad_norm": 0.25350940227508545, "learning_rate": 1.101298333815072e-05, "loss": 1.219, "step": 16233 }, { "epoch": 4.835235205420801, "grad_norm": 0.3241324722766876, "learning_rate": 1.1012023728712828e-05, "loss": 1.2224, "step": 16234 }, { "epoch": 4.8355330516204695, "grad_norm": 0.2580425441265106, "learning_rate": 1.1011064109859186e-05, "loss": 1.212, "step": 16235 }, { "epoch": 4.835830897820138, "grad_norm": 0.26354455947875977, "learning_rate": 1.1010104481598725e-05, "loss": 1.2234, "step": 16236 }, { "epoch": 4.836128744019807, "grad_norm": 0.26966482400894165, "learning_rate": 1.1009144843940365e-05, "loss": 1.2114, "step": 16237 }, { "epoch": 4.836426590219475, "grad_norm": 0.2623552680015564, "learning_rate": 1.1008185196893044e-05, "loss": 1.2185, "step": 16238 }, { "epoch": 4.836724436419144, "grad_norm": 0.3894292712211609, "learning_rate": 1.1007225540465684e-05, "loss": 1.2181, "step": 16239 }, { "epoch": 4.837022282618813, "grad_norm": 0.27377384901046753, "learning_rate": 1.1006265874667219e-05, "loss": 1.2047, "step": 16240 }, { "epoch": 4.837320128818481, "grad_norm": 0.5420881509780884, "learning_rate": 1.1005306199506572e-05, "loss": 1.2085, "step": 16241 }, { "epoch": 4.83761797501815, "grad_norm": 0.4181584119796753, "learning_rate": 1.1004346514992674e-05, "loss": 1.2085, "step": 16242 }, { "epoch": 4.837915821217819, "grad_norm": 0.42847493290901184, "learning_rate": 1.1003386821134451e-05, "loss": 1.2315, "step": 16243 }, { "epoch": 4.838213667417487, "grad_norm": 0.505787193775177, "learning_rate": 1.100242711794084e-05, "loss": 1.223, "step": 16244 }, { "epoch": 4.838511513617156, "grad_norm": 0.2728482484817505, "learning_rate": 1.1001467405420761e-05, "loss": 1.1958, "step": 16245 }, { "epoch": 4.838809359816825, "grad_norm": 0.36392879486083984, "learning_rate": 1.1000507683583147e-05, "loss": 1.2007, "step": 16246 }, { "epoch": 4.839107206016493, "grad_norm": 0.32511842250823975, "learning_rate": 1.0999547952436932e-05, "loss": 1.2105, "step": 16247 }, { "epoch": 4.839405052216162, "grad_norm": 0.4247780442237854, "learning_rate": 1.0998588211991034e-05, "loss": 1.2135, "step": 16248 }, { "epoch": 4.839702898415831, "grad_norm": 0.29443636536598206, "learning_rate": 1.0997628462254388e-05, "loss": 1.2139, "step": 16249 }, { "epoch": 4.840000744615499, "grad_norm": 0.32411524653434753, "learning_rate": 1.0996668703235926e-05, "loss": 1.2366, "step": 16250 }, { "epoch": 4.840298590815168, "grad_norm": 0.3381761908531189, "learning_rate": 1.0995708934944575e-05, "loss": 1.2161, "step": 16251 }, { "epoch": 4.840596437014836, "grad_norm": 0.24481159448623657, "learning_rate": 1.0994749157389266e-05, "loss": 1.2224, "step": 16252 }, { "epoch": 4.840894283214505, "grad_norm": 0.253580778837204, "learning_rate": 1.0993789370578928e-05, "loss": 1.212, "step": 16253 }, { "epoch": 4.841192129414174, "grad_norm": 0.27224865555763245, "learning_rate": 1.099282957452249e-05, "loss": 1.2058, "step": 16254 }, { "epoch": 4.841489975613842, "grad_norm": 0.2752995193004608, "learning_rate": 1.0991869769228877e-05, "loss": 1.22, "step": 16255 }, { "epoch": 4.8417878218135115, "grad_norm": 0.30049604177474976, "learning_rate": 1.0990909954707028e-05, "loss": 1.2195, "step": 16256 }, { "epoch": 4.84208566801318, "grad_norm": 0.2794896364212036, "learning_rate": 1.0989950130965869e-05, "loss": 1.2089, "step": 16257 }, { "epoch": 4.842383514212848, "grad_norm": 0.31548571586608887, "learning_rate": 1.098899029801433e-05, "loss": 1.227, "step": 16258 }, { "epoch": 4.8426813604125165, "grad_norm": 0.2630378305912018, "learning_rate": 1.0988030455861342e-05, "loss": 1.217, "step": 16259 }, { "epoch": 4.842979206612186, "grad_norm": 0.32502326369285583, "learning_rate": 1.0987070604515832e-05, "loss": 1.2218, "step": 16260 }, { "epoch": 4.843277052811854, "grad_norm": 0.2506658732891083, "learning_rate": 1.0986110743986736e-05, "loss": 1.233, "step": 16261 }, { "epoch": 4.843574899011523, "grad_norm": 0.38177919387817383, "learning_rate": 1.0985150874282979e-05, "loss": 1.2318, "step": 16262 }, { "epoch": 4.843872745211192, "grad_norm": 0.3221963942050934, "learning_rate": 1.0984190995413495e-05, "loss": 1.2031, "step": 16263 }, { "epoch": 4.84417059141086, "grad_norm": 0.30547401309013367, "learning_rate": 1.0983231107387213e-05, "loss": 1.218, "step": 16264 }, { "epoch": 4.844468437610529, "grad_norm": 0.39576584100723267, "learning_rate": 1.0982271210213065e-05, "loss": 1.2191, "step": 16265 }, { "epoch": 4.8447662838101975, "grad_norm": 0.2645990252494812, "learning_rate": 1.098131130389998e-05, "loss": 1.2242, "step": 16266 }, { "epoch": 4.845064130009866, "grad_norm": 0.3115106225013733, "learning_rate": 1.0980351388456889e-05, "loss": 1.2205, "step": 16267 }, { "epoch": 4.845361976209535, "grad_norm": 0.25013476610183716, "learning_rate": 1.0979391463892724e-05, "loss": 1.2219, "step": 16268 }, { "epoch": 4.845659822409203, "grad_norm": 0.3354990482330322, "learning_rate": 1.0978431530216416e-05, "loss": 1.2341, "step": 16269 }, { "epoch": 4.845957668608872, "grad_norm": 0.3058474361896515, "learning_rate": 1.0977471587436897e-05, "loss": 1.2157, "step": 16270 }, { "epoch": 4.846255514808541, "grad_norm": 0.2506557106971741, "learning_rate": 1.0976511635563094e-05, "loss": 1.2137, "step": 16271 }, { "epoch": 4.846553361008209, "grad_norm": 0.23520879447460175, "learning_rate": 1.0975551674603944e-05, "loss": 1.2154, "step": 16272 }, { "epoch": 4.846851207207878, "grad_norm": 0.24506239593029022, "learning_rate": 1.0974591704568372e-05, "loss": 1.2026, "step": 16273 }, { "epoch": 4.847149053407547, "grad_norm": 0.2806796133518219, "learning_rate": 1.0973631725465318e-05, "loss": 1.2286, "step": 16274 }, { "epoch": 4.847446899607215, "grad_norm": 0.27367648482322693, "learning_rate": 1.0972671737303705e-05, "loss": 1.2304, "step": 16275 }, { "epoch": 4.847744745806884, "grad_norm": 0.28956472873687744, "learning_rate": 1.0971711740092468e-05, "loss": 1.2177, "step": 16276 }, { "epoch": 4.848042592006553, "grad_norm": 0.4239387512207031, "learning_rate": 1.0970751733840544e-05, "loss": 1.2135, "step": 16277 }, { "epoch": 4.848340438206221, "grad_norm": 0.2668099105358124, "learning_rate": 1.0969791718556853e-05, "loss": 1.1985, "step": 16278 }, { "epoch": 4.84863828440589, "grad_norm": 0.2952024042606354, "learning_rate": 1.0968831694250335e-05, "loss": 1.2088, "step": 16279 }, { "epoch": 4.8489361306055585, "grad_norm": 0.2664113938808441, "learning_rate": 1.0967871660929923e-05, "loss": 1.2339, "step": 16280 }, { "epoch": 4.849233976805227, "grad_norm": 0.3338851034641266, "learning_rate": 1.0966911618604548e-05, "loss": 1.2059, "step": 16281 }, { "epoch": 4.849531823004896, "grad_norm": 0.3878779411315918, "learning_rate": 1.0965951567283138e-05, "loss": 1.2103, "step": 16282 }, { "epoch": 4.849829669204564, "grad_norm": 0.3286162316799164, "learning_rate": 1.0964991506974628e-05, "loss": 1.2155, "step": 16283 }, { "epoch": 4.850127515404234, "grad_norm": 0.2817932665348053, "learning_rate": 1.096403143768795e-05, "loss": 1.2107, "step": 16284 }, { "epoch": 4.850425361603902, "grad_norm": 0.2845592200756073, "learning_rate": 1.0963071359432035e-05, "loss": 1.2231, "step": 16285 }, { "epoch": 4.85072320780357, "grad_norm": 0.46321550011634827, "learning_rate": 1.0962111272215818e-05, "loss": 1.2061, "step": 16286 }, { "epoch": 4.8510210540032395, "grad_norm": 0.5695003867149353, "learning_rate": 1.0961151176048233e-05, "loss": 1.2365, "step": 16287 }, { "epoch": 4.851318900202908, "grad_norm": 0.24942146241664886, "learning_rate": 1.096019107093821e-05, "loss": 1.2143, "step": 16288 }, { "epoch": 4.851616746402576, "grad_norm": 0.4089578092098236, "learning_rate": 1.095923095689468e-05, "loss": 1.2036, "step": 16289 }, { "epoch": 4.851914592602245, "grad_norm": 0.2610035836696625, "learning_rate": 1.0958270833926579e-05, "loss": 1.2214, "step": 16290 }, { "epoch": 4.852212438801914, "grad_norm": 0.3350369930267334, "learning_rate": 1.0957310702042836e-05, "loss": 1.2054, "step": 16291 }, { "epoch": 4.852510285001582, "grad_norm": 0.3857544958591461, "learning_rate": 1.0956350561252388e-05, "loss": 1.2236, "step": 16292 }, { "epoch": 4.852808131201251, "grad_norm": 0.2987971603870392, "learning_rate": 1.0955390411564166e-05, "loss": 1.2152, "step": 16293 }, { "epoch": 4.8531059774009195, "grad_norm": 0.2751263678073883, "learning_rate": 1.0954430252987107e-05, "loss": 1.2262, "step": 16294 }, { "epoch": 4.853403823600588, "grad_norm": 0.29420581459999084, "learning_rate": 1.0953470085530138e-05, "loss": 1.2135, "step": 16295 }, { "epoch": 4.853701669800257, "grad_norm": 0.25773486495018005, "learning_rate": 1.0952509909202194e-05, "loss": 1.2047, "step": 16296 }, { "epoch": 4.853999515999925, "grad_norm": 0.2723999321460724, "learning_rate": 1.0951549724012209e-05, "loss": 1.2172, "step": 16297 }, { "epoch": 4.854297362199594, "grad_norm": 0.2643665075302124, "learning_rate": 1.0950589529969123e-05, "loss": 1.2179, "step": 16298 }, { "epoch": 4.854595208399263, "grad_norm": 0.4759957790374756, "learning_rate": 1.0949629327081856e-05, "loss": 1.2171, "step": 16299 }, { "epoch": 4.854893054598931, "grad_norm": 0.40444934368133545, "learning_rate": 1.0948669115359354e-05, "loss": 1.2137, "step": 16300 }, { "epoch": 4.8551909007986005, "grad_norm": 0.3371957540512085, "learning_rate": 1.0947708894810542e-05, "loss": 1.2096, "step": 16301 }, { "epoch": 4.855488746998269, "grad_norm": 0.4310877323150635, "learning_rate": 1.0946748665444362e-05, "loss": 1.2196, "step": 16302 }, { "epoch": 4.855786593197937, "grad_norm": 0.28807327151298523, "learning_rate": 1.094578842726974e-05, "loss": 1.2155, "step": 16303 }, { "epoch": 4.856084439397606, "grad_norm": 0.3602162301540375, "learning_rate": 1.0944828180295615e-05, "loss": 1.248, "step": 16304 }, { "epoch": 4.856382285597275, "grad_norm": 0.4068543314933777, "learning_rate": 1.0943867924530919e-05, "loss": 1.2225, "step": 16305 }, { "epoch": 4.856680131796943, "grad_norm": 0.6764812469482422, "learning_rate": 1.0942907659984587e-05, "loss": 1.2324, "step": 16306 }, { "epoch": 4.856977977996612, "grad_norm": 0.26010850071907043, "learning_rate": 1.0941947386665553e-05, "loss": 1.2259, "step": 16307 }, { "epoch": 4.857275824196281, "grad_norm": 0.6575981974601746, "learning_rate": 1.094098710458275e-05, "loss": 1.2418, "step": 16308 }, { "epoch": 4.857573670395949, "grad_norm": 0.26086241006851196, "learning_rate": 1.0940026813745113e-05, "loss": 1.2003, "step": 16309 }, { "epoch": 4.857871516595618, "grad_norm": 0.6123746633529663, "learning_rate": 1.0939066514161576e-05, "loss": 1.2099, "step": 16310 }, { "epoch": 4.858169362795286, "grad_norm": 0.3198632597923279, "learning_rate": 1.0938106205841077e-05, "loss": 1.2182, "step": 16311 }, { "epoch": 4.858467208994956, "grad_norm": 0.5020037889480591, "learning_rate": 1.0937145888792546e-05, "loss": 1.2073, "step": 16312 }, { "epoch": 4.858765055194624, "grad_norm": 0.2920586168766022, "learning_rate": 1.0936185563024922e-05, "loss": 1.2143, "step": 16313 }, { "epoch": 4.859062901394292, "grad_norm": 0.4286530613899231, "learning_rate": 1.0935225228547133e-05, "loss": 1.2121, "step": 16314 }, { "epoch": 4.8593607475939615, "grad_norm": 0.35088905692100525, "learning_rate": 1.093426488536812e-05, "loss": 1.2192, "step": 16315 }, { "epoch": 4.85965859379363, "grad_norm": 0.38091519474983215, "learning_rate": 1.0933304533496817e-05, "loss": 1.2157, "step": 16316 }, { "epoch": 4.859956439993298, "grad_norm": 0.39246076345443726, "learning_rate": 1.0932344172942158e-05, "loss": 1.2127, "step": 16317 }, { "epoch": 4.860254286192967, "grad_norm": 0.43078768253326416, "learning_rate": 1.0931383803713079e-05, "loss": 1.2069, "step": 16318 }, { "epoch": 4.860552132392636, "grad_norm": 0.32488927245140076, "learning_rate": 1.093042342581851e-05, "loss": 1.2297, "step": 16319 }, { "epoch": 4.860849978592304, "grad_norm": 0.35250064730644226, "learning_rate": 1.0929463039267393e-05, "loss": 1.2218, "step": 16320 }, { "epoch": 4.861147824791973, "grad_norm": 0.27092909812927246, "learning_rate": 1.0928502644068662e-05, "loss": 1.226, "step": 16321 }, { "epoch": 4.861445670991642, "grad_norm": 0.43906137347221375, "learning_rate": 1.0927542240231253e-05, "loss": 1.2118, "step": 16322 }, { "epoch": 4.861743517191311, "grad_norm": 0.3613179922103882, "learning_rate": 1.0926581827764097e-05, "loss": 1.2143, "step": 16323 }, { "epoch": 4.862041363390979, "grad_norm": 0.3018326163291931, "learning_rate": 1.0925621406676132e-05, "loss": 1.2093, "step": 16324 }, { "epoch": 4.8623392095906475, "grad_norm": 0.30221664905548096, "learning_rate": 1.0924660976976295e-05, "loss": 1.2193, "step": 16325 }, { "epoch": 4.862637055790316, "grad_norm": 0.40009406208992004, "learning_rate": 1.092370053867352e-05, "loss": 1.201, "step": 16326 }, { "epoch": 4.862934901989985, "grad_norm": 0.4788067936897278, "learning_rate": 1.0922740091776744e-05, "loss": 1.2159, "step": 16327 }, { "epoch": 4.863232748189653, "grad_norm": 0.3238835632801056, "learning_rate": 1.0921779636294904e-05, "loss": 1.2291, "step": 16328 }, { "epoch": 4.863530594389323, "grad_norm": 0.3570665419101715, "learning_rate": 1.0920819172236932e-05, "loss": 1.2074, "step": 16329 }, { "epoch": 4.863828440588991, "grad_norm": 0.3871474266052246, "learning_rate": 1.0919858699611767e-05, "loss": 1.2091, "step": 16330 }, { "epoch": 4.864126286788659, "grad_norm": 0.4075256884098053, "learning_rate": 1.0918898218428344e-05, "loss": 1.233, "step": 16331 }, { "epoch": 4.864424132988328, "grad_norm": 0.2669723927974701, "learning_rate": 1.09179377286956e-05, "loss": 1.2266, "step": 16332 }, { "epoch": 4.864721979187997, "grad_norm": 0.2555624842643738, "learning_rate": 1.0916977230422472e-05, "loss": 1.2367, "step": 16333 }, { "epoch": 4.865019825387665, "grad_norm": 0.3796085715293884, "learning_rate": 1.0916016723617894e-05, "loss": 1.22, "step": 16334 }, { "epoch": 4.865317671587334, "grad_norm": 0.26811715960502625, "learning_rate": 1.0915056208290807e-05, "loss": 1.2075, "step": 16335 }, { "epoch": 4.865615517787003, "grad_norm": 0.32306674122810364, "learning_rate": 1.091409568445014e-05, "loss": 1.2161, "step": 16336 }, { "epoch": 4.865913363986671, "grad_norm": 0.30818474292755127, "learning_rate": 1.0913135152104835e-05, "loss": 1.2176, "step": 16337 }, { "epoch": 4.86621121018634, "grad_norm": 0.6810904145240784, "learning_rate": 1.0912174611263828e-05, "loss": 1.2138, "step": 16338 }, { "epoch": 4.8665090563860085, "grad_norm": 0.5148798227310181, "learning_rate": 1.0911214061936057e-05, "loss": 1.225, "step": 16339 }, { "epoch": 4.866806902585678, "grad_norm": 0.41127029061317444, "learning_rate": 1.0910253504130457e-05, "loss": 1.2184, "step": 16340 }, { "epoch": 4.867104748785346, "grad_norm": 0.2449532151222229, "learning_rate": 1.0909292937855964e-05, "loss": 1.2153, "step": 16341 }, { "epoch": 4.867402594985014, "grad_norm": 0.5934796929359436, "learning_rate": 1.0908332363121516e-05, "loss": 1.2326, "step": 16342 }, { "epoch": 4.867700441184684, "grad_norm": 0.2860567569732666, "learning_rate": 1.0907371779936051e-05, "loss": 1.2216, "step": 16343 }, { "epoch": 4.867998287384352, "grad_norm": 0.558010995388031, "learning_rate": 1.0906411188308504e-05, "loss": 1.2263, "step": 16344 }, { "epoch": 4.86829613358402, "grad_norm": 0.2574203312397003, "learning_rate": 1.0905450588247815e-05, "loss": 1.2233, "step": 16345 }, { "epoch": 4.8685939797836895, "grad_norm": 0.424591600894928, "learning_rate": 1.0904489979762922e-05, "loss": 1.2141, "step": 16346 }, { "epoch": 4.868891825983358, "grad_norm": 0.2902675271034241, "learning_rate": 1.0903529362862758e-05, "loss": 1.2281, "step": 16347 }, { "epoch": 4.869189672183026, "grad_norm": 0.28373557329177856, "learning_rate": 1.090256873755626e-05, "loss": 1.206, "step": 16348 }, { "epoch": 4.869487518382695, "grad_norm": 0.3348288834095001, "learning_rate": 1.0901608103852373e-05, "loss": 1.2062, "step": 16349 }, { "epoch": 4.869785364582364, "grad_norm": 0.34329766035079956, "learning_rate": 1.0900647461760025e-05, "loss": 1.2268, "step": 16350 }, { "epoch": 4.870083210782033, "grad_norm": 0.36737751960754395, "learning_rate": 1.0899686811288162e-05, "loss": 1.2409, "step": 16351 }, { "epoch": 4.870381056981701, "grad_norm": 0.42342904210090637, "learning_rate": 1.089872615244572e-05, "loss": 1.2172, "step": 16352 }, { "epoch": 4.8706789031813695, "grad_norm": 0.3289167284965515, "learning_rate": 1.0897765485241632e-05, "loss": 1.2026, "step": 16353 }, { "epoch": 4.870976749381039, "grad_norm": 0.33885636925697327, "learning_rate": 1.089680480968484e-05, "loss": 1.2022, "step": 16354 }, { "epoch": 4.871274595580707, "grad_norm": 0.2676118016242981, "learning_rate": 1.0895844125784278e-05, "loss": 1.2043, "step": 16355 }, { "epoch": 4.871572441780375, "grad_norm": 0.3302132785320282, "learning_rate": 1.0894883433548894e-05, "loss": 1.2302, "step": 16356 }, { "epoch": 4.871870287980045, "grad_norm": 0.26079061627388, "learning_rate": 1.0893922732987616e-05, "loss": 1.2163, "step": 16357 }, { "epoch": 4.872168134179713, "grad_norm": 0.3585273325443268, "learning_rate": 1.0892962024109385e-05, "loss": 1.2191, "step": 16358 }, { "epoch": 4.872465980379381, "grad_norm": 0.3633761405944824, "learning_rate": 1.0892001306923143e-05, "loss": 1.2327, "step": 16359 }, { "epoch": 4.8727638265790505, "grad_norm": 0.2803542912006378, "learning_rate": 1.0891040581437822e-05, "loss": 1.2075, "step": 16360 }, { "epoch": 4.873061672778719, "grad_norm": 0.3831990659236908, "learning_rate": 1.0890079847662364e-05, "loss": 1.2139, "step": 16361 }, { "epoch": 4.873359518978387, "grad_norm": 0.2607085108757019, "learning_rate": 1.088911910560571e-05, "loss": 1.2367, "step": 16362 }, { "epoch": 4.873657365178056, "grad_norm": 0.302704781293869, "learning_rate": 1.0888158355276796e-05, "loss": 1.2179, "step": 16363 }, { "epoch": 4.873955211377725, "grad_norm": 0.2732965648174286, "learning_rate": 1.088719759668456e-05, "loss": 1.2133, "step": 16364 }, { "epoch": 4.874253057577393, "grad_norm": 0.2925439774990082, "learning_rate": 1.0886236829837942e-05, "loss": 1.2175, "step": 16365 }, { "epoch": 4.874550903777062, "grad_norm": 0.27101626992225647, "learning_rate": 1.0885276054745879e-05, "loss": 1.2269, "step": 16366 }, { "epoch": 4.874848749976731, "grad_norm": 0.3258912265300751, "learning_rate": 1.0884315271417311e-05, "loss": 1.2213, "step": 16367 }, { "epoch": 4.8751465961764, "grad_norm": 0.2819480895996094, "learning_rate": 1.0883354479861179e-05, "loss": 1.2209, "step": 16368 }, { "epoch": 4.875444442376068, "grad_norm": 0.28845056891441345, "learning_rate": 1.0882393680086423e-05, "loss": 1.2409, "step": 16369 }, { "epoch": 4.875742288575736, "grad_norm": 0.3165302574634552, "learning_rate": 1.0881432872101976e-05, "loss": 1.2406, "step": 16370 }, { "epoch": 4.876040134775406, "grad_norm": 0.3963184952735901, "learning_rate": 1.0880472055916782e-05, "loss": 1.2119, "step": 16371 }, { "epoch": 4.876337980975074, "grad_norm": 0.2877182364463806, "learning_rate": 1.0879511231539778e-05, "loss": 1.2265, "step": 16372 }, { "epoch": 4.876635827174742, "grad_norm": 0.3519115447998047, "learning_rate": 1.0878550398979905e-05, "loss": 1.2153, "step": 16373 }, { "epoch": 4.8769336733744115, "grad_norm": 0.25945916771888733, "learning_rate": 1.0877589558246102e-05, "loss": 1.2227, "step": 16374 }, { "epoch": 4.87723151957408, "grad_norm": 0.6392548680305481, "learning_rate": 1.087662870934731e-05, "loss": 1.2284, "step": 16375 }, { "epoch": 4.877529365773748, "grad_norm": 0.5318585634231567, "learning_rate": 1.087566785229247e-05, "loss": 1.2234, "step": 16376 }, { "epoch": 4.877827211973417, "grad_norm": 0.3100576400756836, "learning_rate": 1.0874706987090513e-05, "loss": 1.2355, "step": 16377 }, { "epoch": 4.878125058173086, "grad_norm": 0.30222323536872864, "learning_rate": 1.0873746113750385e-05, "loss": 1.2178, "step": 16378 }, { "epoch": 4.878422904372755, "grad_norm": 0.260503351688385, "learning_rate": 1.087278523228103e-05, "loss": 1.2172, "step": 16379 }, { "epoch": 4.878720750572423, "grad_norm": 0.29023849964141846, "learning_rate": 1.0871824342691382e-05, "loss": 1.2371, "step": 16380 }, { "epoch": 4.879018596772092, "grad_norm": 0.28010281920433044, "learning_rate": 1.0870863444990383e-05, "loss": 1.217, "step": 16381 }, { "epoch": 4.879316442971761, "grad_norm": 0.2529861330986023, "learning_rate": 1.0869902539186972e-05, "loss": 1.2127, "step": 16382 }, { "epoch": 4.879614289171429, "grad_norm": 0.2620205879211426, "learning_rate": 1.086894162529009e-05, "loss": 1.2276, "step": 16383 }, { "epoch": 4.8799121353710975, "grad_norm": 0.27573156356811523, "learning_rate": 1.0867980703308673e-05, "loss": 1.2225, "step": 16384 }, { "epoch": 4.880209981570767, "grad_norm": 0.2804858684539795, "learning_rate": 1.086701977325167e-05, "loss": 1.2205, "step": 16385 }, { "epoch": 4.880507827770435, "grad_norm": 0.2714120149612427, "learning_rate": 1.0866058835128017e-05, "loss": 1.2305, "step": 16386 }, { "epoch": 4.880805673970103, "grad_norm": 0.34036484360694885, "learning_rate": 1.0865097888946654e-05, "loss": 1.2178, "step": 16387 }, { "epoch": 4.881103520169773, "grad_norm": 0.2547363340854645, "learning_rate": 1.086413693471652e-05, "loss": 1.2093, "step": 16388 }, { "epoch": 4.881401366369441, "grad_norm": 0.29458507895469666, "learning_rate": 1.0863175972446556e-05, "loss": 1.222, "step": 16389 }, { "epoch": 4.88169921256911, "grad_norm": 0.41369563341140747, "learning_rate": 1.0862215002145706e-05, "loss": 1.2143, "step": 16390 }, { "epoch": 4.881997058768778, "grad_norm": 0.3324941098690033, "learning_rate": 1.086125402382291e-05, "loss": 1.2142, "step": 16391 }, { "epoch": 4.882294904968447, "grad_norm": 0.25271379947662354, "learning_rate": 1.0860293037487104e-05, "loss": 1.2111, "step": 16392 }, { "epoch": 4.882592751168115, "grad_norm": 0.27145299315452576, "learning_rate": 1.0859332043147237e-05, "loss": 1.2071, "step": 16393 }, { "epoch": 4.882890597367784, "grad_norm": 0.2988566756248474, "learning_rate": 1.0858371040812243e-05, "loss": 1.2026, "step": 16394 }, { "epoch": 4.883188443567453, "grad_norm": 0.31120866537094116, "learning_rate": 1.0857410030491065e-05, "loss": 1.2241, "step": 16395 }, { "epoch": 4.883486289767122, "grad_norm": 0.37695109844207764, "learning_rate": 1.0856449012192642e-05, "loss": 1.2189, "step": 16396 }, { "epoch": 4.88378413596679, "grad_norm": 0.6909080147743225, "learning_rate": 1.0855487985925923e-05, "loss": 1.214, "step": 16397 }, { "epoch": 4.8840819821664585, "grad_norm": 0.44229087233543396, "learning_rate": 1.0854526951699842e-05, "loss": 1.2097, "step": 16398 }, { "epoch": 4.884379828366128, "grad_norm": 0.39378392696380615, "learning_rate": 1.0853565909523343e-05, "loss": 1.2238, "step": 16399 }, { "epoch": 4.884677674565796, "grad_norm": 0.48348134756088257, "learning_rate": 1.0852604859405367e-05, "loss": 1.227, "step": 16400 }, { "epoch": 4.884975520765464, "grad_norm": 0.27815037965774536, "learning_rate": 1.0851643801354855e-05, "loss": 1.2222, "step": 16401 }, { "epoch": 4.885273366965134, "grad_norm": 0.40838098526000977, "learning_rate": 1.0850682735380744e-05, "loss": 1.2166, "step": 16402 }, { "epoch": 4.885571213164802, "grad_norm": 0.3574656546115875, "learning_rate": 1.0849721661491986e-05, "loss": 1.2394, "step": 16403 }, { "epoch": 4.88586905936447, "grad_norm": 0.653743326663971, "learning_rate": 1.0848760579697519e-05, "loss": 1.2254, "step": 16404 }, { "epoch": 4.8861669055641395, "grad_norm": 0.26477012038230896, "learning_rate": 1.0847799490006278e-05, "loss": 1.2147, "step": 16405 }, { "epoch": 4.886464751763808, "grad_norm": 0.3457360863685608, "learning_rate": 1.0846838392427215e-05, "loss": 1.2195, "step": 16406 }, { "epoch": 4.886762597963477, "grad_norm": 0.2645303010940552, "learning_rate": 1.0845877286969265e-05, "loss": 1.236, "step": 16407 }, { "epoch": 4.887060444163145, "grad_norm": 0.28887856006622314, "learning_rate": 1.0844916173641369e-05, "loss": 1.2138, "step": 16408 }, { "epoch": 4.887358290362814, "grad_norm": 0.32342734932899475, "learning_rate": 1.0843955052452475e-05, "loss": 1.2403, "step": 16409 }, { "epoch": 4.887656136562483, "grad_norm": 0.3168574273586273, "learning_rate": 1.0842993923411523e-05, "loss": 1.2267, "step": 16410 }, { "epoch": 4.887953982762151, "grad_norm": 0.3962790369987488, "learning_rate": 1.0842032786527452e-05, "loss": 1.2319, "step": 16411 }, { "epoch": 4.8882518289618195, "grad_norm": 0.42341771721839905, "learning_rate": 1.0841071641809208e-05, "loss": 1.2311, "step": 16412 }, { "epoch": 4.888549675161489, "grad_norm": 0.26488226652145386, "learning_rate": 1.0840110489265731e-05, "loss": 1.2183, "step": 16413 }, { "epoch": 4.888847521361157, "grad_norm": 0.3394005298614502, "learning_rate": 1.0839149328905965e-05, "loss": 1.2168, "step": 16414 }, { "epoch": 4.889145367560825, "grad_norm": 0.29203420877456665, "learning_rate": 1.0838188160738852e-05, "loss": 1.2121, "step": 16415 }, { "epoch": 4.889443213760495, "grad_norm": 0.5804629325866699, "learning_rate": 1.0837226984773335e-05, "loss": 1.2104, "step": 16416 }, { "epoch": 4.889741059960163, "grad_norm": 0.5409330129623413, "learning_rate": 1.0836265801018358e-05, "loss": 1.219, "step": 16417 }, { "epoch": 4.890038906159832, "grad_norm": 0.2707678973674774, "learning_rate": 1.0835304609482859e-05, "loss": 1.2225, "step": 16418 }, { "epoch": 4.8903367523595005, "grad_norm": 0.46385636925697327, "learning_rate": 1.0834343410175784e-05, "loss": 1.2179, "step": 16419 }, { "epoch": 4.890634598559169, "grad_norm": 0.3915536105632782, "learning_rate": 1.0833382203106076e-05, "loss": 1.2159, "step": 16420 }, { "epoch": 4.890932444758838, "grad_norm": 0.2822267711162567, "learning_rate": 1.0832420988282682e-05, "loss": 1.2154, "step": 16421 }, { "epoch": 4.891230290958506, "grad_norm": 0.441423237323761, "learning_rate": 1.0831459765714536e-05, "loss": 1.2016, "step": 16422 }, { "epoch": 4.891528137158175, "grad_norm": 0.2737753987312317, "learning_rate": 1.0830498535410587e-05, "loss": 1.2177, "step": 16423 }, { "epoch": 4.891825983357844, "grad_norm": 0.625218391418457, "learning_rate": 1.082953729737978e-05, "loss": 1.2245, "step": 16424 }, { "epoch": 4.892123829557512, "grad_norm": 0.38089168071746826, "learning_rate": 1.0828576051631048e-05, "loss": 1.2164, "step": 16425 }, { "epoch": 4.892421675757181, "grad_norm": 0.2916334569454193, "learning_rate": 1.0827614798173347e-05, "loss": 1.223, "step": 16426 }, { "epoch": 4.89271952195685, "grad_norm": 0.32069745659828186, "learning_rate": 1.0826653537015615e-05, "loss": 1.2186, "step": 16427 }, { "epoch": 4.893017368156518, "grad_norm": 0.28550082445144653, "learning_rate": 1.0825692268166794e-05, "loss": 1.2155, "step": 16428 }, { "epoch": 4.893315214356186, "grad_norm": 0.3154747784137726, "learning_rate": 1.082473099163583e-05, "loss": 1.2046, "step": 16429 }, { "epoch": 4.893613060555856, "grad_norm": 0.35802021622657776, "learning_rate": 1.0823769707431663e-05, "loss": 1.2329, "step": 16430 }, { "epoch": 4.893910906755524, "grad_norm": 0.3719880282878876, "learning_rate": 1.0822808415563242e-05, "loss": 1.2279, "step": 16431 }, { "epoch": 4.894208752955192, "grad_norm": 0.2961256206035614, "learning_rate": 1.0821847116039508e-05, "loss": 1.2018, "step": 16432 }, { "epoch": 4.8945065991548615, "grad_norm": 0.352282851934433, "learning_rate": 1.0820885808869404e-05, "loss": 1.2299, "step": 16433 }, { "epoch": 4.89480444535453, "grad_norm": 0.28779590129852295, "learning_rate": 1.0819924494061875e-05, "loss": 1.2112, "step": 16434 }, { "epoch": 4.895102291554199, "grad_norm": 0.2735520899295807, "learning_rate": 1.0818963171625865e-05, "loss": 1.2135, "step": 16435 }, { "epoch": 4.895400137753867, "grad_norm": 0.335479199886322, "learning_rate": 1.0818001841570316e-05, "loss": 1.2188, "step": 16436 }, { "epoch": 4.895697983953536, "grad_norm": 0.2797480523586273, "learning_rate": 1.0817040503904172e-05, "loss": 1.2282, "step": 16437 }, { "epoch": 4.895995830153205, "grad_norm": 0.2558453679084778, "learning_rate": 1.0816079158636383e-05, "loss": 1.2277, "step": 16438 }, { "epoch": 4.896293676352873, "grad_norm": 0.32490307092666626, "learning_rate": 1.0815117805775889e-05, "loss": 1.2111, "step": 16439 }, { "epoch": 4.896591522552542, "grad_norm": 0.25975948572158813, "learning_rate": 1.0814156445331635e-05, "loss": 1.2087, "step": 16440 }, { "epoch": 4.896889368752211, "grad_norm": 0.3289744555950165, "learning_rate": 1.0813195077312564e-05, "loss": 1.206, "step": 16441 }, { "epoch": 4.897187214951879, "grad_norm": 0.2599387466907501, "learning_rate": 1.081223370172762e-05, "loss": 1.2208, "step": 16442 }, { "epoch": 4.8974850611515475, "grad_norm": 0.49145135283470154, "learning_rate": 1.0811272318585745e-05, "loss": 1.2194, "step": 16443 }, { "epoch": 4.897782907351217, "grad_norm": 0.28819048404693604, "learning_rate": 1.0810310927895897e-05, "loss": 1.1981, "step": 16444 }, { "epoch": 4.898080753550885, "grad_norm": 0.40600836277008057, "learning_rate": 1.0809349529667006e-05, "loss": 1.2087, "step": 16445 }, { "epoch": 4.898378599750554, "grad_norm": 0.4209963083267212, "learning_rate": 1.080838812390802e-05, "loss": 1.2194, "step": 16446 }, { "epoch": 4.898676445950223, "grad_norm": 0.2710931599140167, "learning_rate": 1.0807426710627888e-05, "loss": 1.2169, "step": 16447 }, { "epoch": 4.898974292149891, "grad_norm": 0.3259083926677704, "learning_rate": 1.0806465289835552e-05, "loss": 1.2131, "step": 16448 }, { "epoch": 4.89927213834956, "grad_norm": 0.3095962107181549, "learning_rate": 1.0805503861539959e-05, "loss": 1.2287, "step": 16449 }, { "epoch": 4.899569984549228, "grad_norm": 0.32337522506713867, "learning_rate": 1.0804542425750052e-05, "loss": 1.2082, "step": 16450 }, { "epoch": 4.899867830748897, "grad_norm": 0.4136713147163391, "learning_rate": 1.0803580982474777e-05, "loss": 1.2041, "step": 16451 }, { "epoch": 4.900165676948566, "grad_norm": 0.28453126549720764, "learning_rate": 1.0802619531723077e-05, "loss": 1.2072, "step": 16452 }, { "epoch": 4.900463523148234, "grad_norm": 0.46619874238967896, "learning_rate": 1.08016580735039e-05, "loss": 1.2068, "step": 16453 }, { "epoch": 4.900761369347903, "grad_norm": 0.4028174877166748, "learning_rate": 1.0800696607826188e-05, "loss": 1.2168, "step": 16454 }, { "epoch": 4.901059215547572, "grad_norm": 0.3680400252342224, "learning_rate": 1.0799735134698893e-05, "loss": 1.2184, "step": 16455 }, { "epoch": 4.90135706174724, "grad_norm": 0.613753080368042, "learning_rate": 1.0798773654130951e-05, "loss": 1.227, "step": 16456 }, { "epoch": 4.901654907946909, "grad_norm": 0.32443124055862427, "learning_rate": 1.0797812166131316e-05, "loss": 1.2075, "step": 16457 }, { "epoch": 4.901952754146578, "grad_norm": 0.409507155418396, "learning_rate": 1.0796850670708934e-05, "loss": 1.2139, "step": 16458 }, { "epoch": 4.902250600346246, "grad_norm": 0.2848486602306366, "learning_rate": 1.0795889167872743e-05, "loss": 1.22, "step": 16459 }, { "epoch": 4.902548446545914, "grad_norm": 0.2892007529735565, "learning_rate": 1.0794927657631689e-05, "loss": 1.1935, "step": 16460 }, { "epoch": 4.902846292745584, "grad_norm": 0.3312188684940338, "learning_rate": 1.0793966139994723e-05, "loss": 1.226, "step": 16461 }, { "epoch": 4.903144138945252, "grad_norm": 0.4444887936115265, "learning_rate": 1.0793004614970793e-05, "loss": 1.2277, "step": 16462 }, { "epoch": 4.903441985144921, "grad_norm": 0.2989336848258972, "learning_rate": 1.0792043082568836e-05, "loss": 1.2244, "step": 16463 }, { "epoch": 4.9037398313445895, "grad_norm": 0.3596751093864441, "learning_rate": 1.0791081542797805e-05, "loss": 1.2229, "step": 16464 }, { "epoch": 4.904037677544258, "grad_norm": 0.39726847410202026, "learning_rate": 1.0790119995666649e-05, "loss": 1.2193, "step": 16465 }, { "epoch": 4.904335523743927, "grad_norm": 0.37603044509887695, "learning_rate": 1.0789158441184302e-05, "loss": 1.2189, "step": 16466 }, { "epoch": 4.904633369943595, "grad_norm": 0.4337458312511444, "learning_rate": 1.0788196879359718e-05, "loss": 1.2165, "step": 16467 }, { "epoch": 4.904931216143264, "grad_norm": 0.5279236435890198, "learning_rate": 1.0787235310201847e-05, "loss": 1.2166, "step": 16468 }, { "epoch": 4.905229062342933, "grad_norm": 0.25897216796875, "learning_rate": 1.078627373371963e-05, "loss": 1.2118, "step": 16469 }, { "epoch": 4.905526908542601, "grad_norm": 0.27286821603775024, "learning_rate": 1.0785312149922012e-05, "loss": 1.2259, "step": 16470 }, { "epoch": 4.9058247547422695, "grad_norm": 0.28314799070358276, "learning_rate": 1.0784350558817942e-05, "loss": 1.2219, "step": 16471 }, { "epoch": 4.906122600941939, "grad_norm": 0.30134689807891846, "learning_rate": 1.0783388960416367e-05, "loss": 1.2227, "step": 16472 }, { "epoch": 4.906420447141607, "grad_norm": 0.3107009828090668, "learning_rate": 1.0782427354726235e-05, "loss": 1.2111, "step": 16473 }, { "epoch": 4.906718293341276, "grad_norm": 0.45756450295448303, "learning_rate": 1.0781465741756487e-05, "loss": 1.2211, "step": 16474 }, { "epoch": 4.907016139540945, "grad_norm": 0.2705390453338623, "learning_rate": 1.0780504121516077e-05, "loss": 1.2275, "step": 16475 }, { "epoch": 4.907313985740613, "grad_norm": 0.6505392789840698, "learning_rate": 1.0779542494013947e-05, "loss": 1.2343, "step": 16476 }, { "epoch": 4.907611831940282, "grad_norm": 0.32612910866737366, "learning_rate": 1.0778580859259045e-05, "loss": 1.2083, "step": 16477 }, { "epoch": 4.9079096781399505, "grad_norm": 0.4365836977958679, "learning_rate": 1.0777619217260317e-05, "loss": 1.2202, "step": 16478 }, { "epoch": 4.908207524339619, "grad_norm": 0.2678183615207672, "learning_rate": 1.0776657568026712e-05, "loss": 1.2287, "step": 16479 }, { "epoch": 4.908505370539288, "grad_norm": 0.5058385133743286, "learning_rate": 1.0775695911567175e-05, "loss": 1.2089, "step": 16480 }, { "epoch": 4.908803216738956, "grad_norm": 0.3062038719654083, "learning_rate": 1.0774734247890655e-05, "loss": 1.2147, "step": 16481 }, { "epoch": 4.909101062938625, "grad_norm": 0.3651920258998871, "learning_rate": 1.0773772577006101e-05, "loss": 1.2263, "step": 16482 }, { "epoch": 4.909398909138294, "grad_norm": 0.2616761028766632, "learning_rate": 1.0772810898922455e-05, "loss": 1.2088, "step": 16483 }, { "epoch": 4.909696755337962, "grad_norm": 0.3940376043319702, "learning_rate": 1.0771849213648663e-05, "loss": 1.217, "step": 16484 }, { "epoch": 4.9099946015376315, "grad_norm": 0.24980327486991882, "learning_rate": 1.0770887521193682e-05, "loss": 1.2212, "step": 16485 }, { "epoch": 4.9102924477373, "grad_norm": 0.4014640748500824, "learning_rate": 1.0769925821566455e-05, "loss": 1.2278, "step": 16486 }, { "epoch": 4.910590293936968, "grad_norm": 0.3518194556236267, "learning_rate": 1.0768964114775923e-05, "loss": 1.2264, "step": 16487 }, { "epoch": 4.910888140136637, "grad_norm": 0.3278154730796814, "learning_rate": 1.0768002400831042e-05, "loss": 1.2092, "step": 16488 }, { "epoch": 4.911185986336306, "grad_norm": 0.2689017653465271, "learning_rate": 1.0767040679740757e-05, "loss": 1.2237, "step": 16489 }, { "epoch": 4.911483832535974, "grad_norm": 0.39677876234054565, "learning_rate": 1.0766078951514014e-05, "loss": 1.2147, "step": 16490 }, { "epoch": 4.911781678735643, "grad_norm": 0.4278595745563507, "learning_rate": 1.0765117216159763e-05, "loss": 1.2331, "step": 16491 }, { "epoch": 4.9120795249353115, "grad_norm": 0.30160701274871826, "learning_rate": 1.0764155473686955e-05, "loss": 1.2353, "step": 16492 }, { "epoch": 4.91237737113498, "grad_norm": 0.33897310495376587, "learning_rate": 1.0763193724104531e-05, "loss": 1.2116, "step": 16493 }, { "epoch": 4.912675217334649, "grad_norm": 0.2941698133945465, "learning_rate": 1.076223196742144e-05, "loss": 1.2156, "step": 16494 }, { "epoch": 4.912973063534317, "grad_norm": 0.37687936425209045, "learning_rate": 1.0761270203646632e-05, "loss": 1.2231, "step": 16495 }, { "epoch": 4.913270909733986, "grad_norm": 0.2899411916732788, "learning_rate": 1.0760308432789058e-05, "loss": 1.2426, "step": 16496 }, { "epoch": 4.913568755933655, "grad_norm": 0.33309483528137207, "learning_rate": 1.0759346654857663e-05, "loss": 1.2249, "step": 16497 }, { "epoch": 4.913866602133323, "grad_norm": 0.27459484338760376, "learning_rate": 1.0758384869861393e-05, "loss": 1.2103, "step": 16498 }, { "epoch": 4.914164448332992, "grad_norm": 0.41233590245246887, "learning_rate": 1.0757423077809203e-05, "loss": 1.2136, "step": 16499 }, { "epoch": 4.914462294532661, "grad_norm": 0.2641564905643463, "learning_rate": 1.0756461278710038e-05, "loss": 1.2015, "step": 16500 }, { "epoch": 4.914462294532661, "eval_loss": 1.3246026039123535, "eval_runtime": 21.3127, "eval_samples_per_second": 81.36, "eval_steps_per_second": 5.114, "step": 16500 }, { "epoch": 4.914760140732329, "grad_norm": 0.4468182921409607, "learning_rate": 1.0755499472572844e-05, "loss": 1.2283, "step": 16501 }, { "epoch": 4.915057986931998, "grad_norm": 0.3703464865684509, "learning_rate": 1.075453765940657e-05, "loss": 1.2123, "step": 16502 }, { "epoch": 4.915355833131667, "grad_norm": 0.34836113452911377, "learning_rate": 1.0753575839220168e-05, "loss": 1.1944, "step": 16503 }, { "epoch": 4.915653679331335, "grad_norm": 0.4428093731403351, "learning_rate": 1.0752614012022583e-05, "loss": 1.2353, "step": 16504 }, { "epoch": 4.915951525531004, "grad_norm": 0.3108443319797516, "learning_rate": 1.0751652177822768e-05, "loss": 1.228, "step": 16505 }, { "epoch": 4.916249371730673, "grad_norm": 0.5079678297042847, "learning_rate": 1.075069033662967e-05, "loss": 1.2157, "step": 16506 }, { "epoch": 4.916547217930341, "grad_norm": 0.2944551706314087, "learning_rate": 1.0749728488452231e-05, "loss": 1.2403, "step": 16507 }, { "epoch": 4.91684506413001, "grad_norm": 0.4437999725341797, "learning_rate": 1.074876663329941e-05, "loss": 1.2133, "step": 16508 }, { "epoch": 4.917142910329678, "grad_norm": 0.3734782636165619, "learning_rate": 1.0747804771180154e-05, "loss": 1.2097, "step": 16509 }, { "epoch": 4.917440756529347, "grad_norm": 0.5850622057914734, "learning_rate": 1.074684290210341e-05, "loss": 1.2154, "step": 16510 }, { "epoch": 4.917738602729016, "grad_norm": 0.2878182530403137, "learning_rate": 1.0745881026078125e-05, "loss": 1.217, "step": 16511 }, { "epoch": 4.918036448928684, "grad_norm": 0.5440872311592102, "learning_rate": 1.074491914311325e-05, "loss": 1.2169, "step": 16512 }, { "epoch": 4.9183342951283535, "grad_norm": 0.2729608714580536, "learning_rate": 1.0743957253217736e-05, "loss": 1.2265, "step": 16513 }, { "epoch": 4.918632141328022, "grad_norm": 0.45563188195228577, "learning_rate": 1.0742995356400529e-05, "loss": 1.2315, "step": 16514 }, { "epoch": 4.91892998752769, "grad_norm": 0.2730143666267395, "learning_rate": 1.0742033452670582e-05, "loss": 1.2226, "step": 16515 }, { "epoch": 4.919227833727359, "grad_norm": 0.3658234179019928, "learning_rate": 1.0741071542036846e-05, "loss": 1.2184, "step": 16516 }, { "epoch": 4.919525679927028, "grad_norm": 0.3419719338417053, "learning_rate": 1.0740109624508263e-05, "loss": 1.2156, "step": 16517 }, { "epoch": 4.919823526126696, "grad_norm": 0.4722875654697418, "learning_rate": 1.0739147700093789e-05, "loss": 1.2074, "step": 16518 }, { "epoch": 4.920121372326365, "grad_norm": 0.27891281247138977, "learning_rate": 1.0738185768802369e-05, "loss": 1.2062, "step": 16519 }, { "epoch": 4.920419218526034, "grad_norm": 0.34261152148246765, "learning_rate": 1.0737223830642955e-05, "loss": 1.2229, "step": 16520 }, { "epoch": 4.920717064725702, "grad_norm": 0.2668758034706116, "learning_rate": 1.07362618856245e-05, "loss": 1.2226, "step": 16521 }, { "epoch": 4.921014910925371, "grad_norm": 0.45938822627067566, "learning_rate": 1.0735299933755947e-05, "loss": 1.2157, "step": 16522 }, { "epoch": 4.9213127571250395, "grad_norm": 0.2678782343864441, "learning_rate": 1.0734337975046254e-05, "loss": 1.2348, "step": 16523 }, { "epoch": 4.921610603324709, "grad_norm": 0.3673868179321289, "learning_rate": 1.0733376009504364e-05, "loss": 1.217, "step": 16524 }, { "epoch": 4.921908449524377, "grad_norm": 0.27473393082618713, "learning_rate": 1.0732414037139228e-05, "loss": 1.2059, "step": 16525 }, { "epoch": 4.922206295724045, "grad_norm": 0.4587046504020691, "learning_rate": 1.07314520579598e-05, "loss": 1.2276, "step": 16526 }, { "epoch": 4.922504141923714, "grad_norm": 0.37563133239746094, "learning_rate": 1.0730490071975028e-05, "loss": 1.2076, "step": 16527 }, { "epoch": 4.922801988123383, "grad_norm": 0.4534524977207184, "learning_rate": 1.0729528079193863e-05, "loss": 1.2489, "step": 16528 }, { "epoch": 4.923099834323051, "grad_norm": 0.5472278594970703, "learning_rate": 1.072856607962525e-05, "loss": 1.2077, "step": 16529 }, { "epoch": 4.92339768052272, "grad_norm": 0.2616935074329376, "learning_rate": 1.0727604073278148e-05, "loss": 1.2255, "step": 16530 }, { "epoch": 4.923695526722389, "grad_norm": 0.31283023953437805, "learning_rate": 1.07266420601615e-05, "loss": 1.2174, "step": 16531 }, { "epoch": 4.923993372922057, "grad_norm": 0.2985352575778961, "learning_rate": 1.0725680040284263e-05, "loss": 1.2149, "step": 16532 }, { "epoch": 4.924291219121726, "grad_norm": 0.3567928671836853, "learning_rate": 1.0724718013655384e-05, "loss": 1.2367, "step": 16533 }, { "epoch": 4.924589065321395, "grad_norm": 0.2781184911727905, "learning_rate": 1.0723755980283809e-05, "loss": 1.2157, "step": 16534 }, { "epoch": 4.924886911521063, "grad_norm": 0.3470194339752197, "learning_rate": 1.0722793940178498e-05, "loss": 1.2342, "step": 16535 }, { "epoch": 4.925184757720732, "grad_norm": 0.2665918469429016, "learning_rate": 1.0721831893348393e-05, "loss": 1.2242, "step": 16536 }, { "epoch": 4.9254826039204005, "grad_norm": 0.30841633677482605, "learning_rate": 1.072086983980245e-05, "loss": 1.2195, "step": 16537 }, { "epoch": 4.925780450120069, "grad_norm": 0.3293060064315796, "learning_rate": 1.0719907779549619e-05, "loss": 1.2148, "step": 16538 }, { "epoch": 4.926078296319738, "grad_norm": 0.30947989225387573, "learning_rate": 1.0718945712598852e-05, "loss": 1.2124, "step": 16539 }, { "epoch": 4.926376142519406, "grad_norm": 0.3894166052341461, "learning_rate": 1.0717983638959097e-05, "loss": 1.2194, "step": 16540 }, { "epoch": 4.926673988719076, "grad_norm": 0.32728278636932373, "learning_rate": 1.0717021558639306e-05, "loss": 1.2194, "step": 16541 }, { "epoch": 4.926971834918744, "grad_norm": 0.3462470471858978, "learning_rate": 1.071605947164843e-05, "loss": 1.2083, "step": 16542 }, { "epoch": 4.927269681118412, "grad_norm": 0.26606592535972595, "learning_rate": 1.0715097377995422e-05, "loss": 1.2075, "step": 16543 }, { "epoch": 4.9275675273180815, "grad_norm": 0.34016546607017517, "learning_rate": 1.071413527768923e-05, "loss": 1.2253, "step": 16544 }, { "epoch": 4.92786537351775, "grad_norm": 0.2924135625362396, "learning_rate": 1.0713173170738808e-05, "loss": 1.2282, "step": 16545 }, { "epoch": 4.928163219717418, "grad_norm": 0.31891173124313354, "learning_rate": 1.0712211057153108e-05, "loss": 1.2275, "step": 16546 }, { "epoch": 4.928461065917087, "grad_norm": 0.3394494354724884, "learning_rate": 1.0711248936941081e-05, "loss": 1.2095, "step": 16547 }, { "epoch": 4.928758912116756, "grad_norm": 0.2725721299648285, "learning_rate": 1.0710286810111672e-05, "loss": 1.2046, "step": 16548 }, { "epoch": 4.929056758316424, "grad_norm": 0.2920966148376465, "learning_rate": 1.070932467667384e-05, "loss": 1.2413, "step": 16549 }, { "epoch": 4.929354604516093, "grad_norm": 0.4335156977176666, "learning_rate": 1.0708362536636538e-05, "loss": 1.2211, "step": 16550 }, { "epoch": 4.9296524507157615, "grad_norm": 0.46687009930610657, "learning_rate": 1.0707400390008711e-05, "loss": 1.225, "step": 16551 }, { "epoch": 4.929950296915431, "grad_norm": 0.3849765658378601, "learning_rate": 1.0706438236799313e-05, "loss": 1.2245, "step": 16552 }, { "epoch": 4.930248143115099, "grad_norm": 0.3573441207408905, "learning_rate": 1.0705476077017298e-05, "loss": 1.2113, "step": 16553 }, { "epoch": 4.930545989314767, "grad_norm": 0.4444035291671753, "learning_rate": 1.0704513910671615e-05, "loss": 1.2131, "step": 16554 }, { "epoch": 4.930843835514437, "grad_norm": 0.33163416385650635, "learning_rate": 1.0703551737771218e-05, "loss": 1.2255, "step": 16555 }, { "epoch": 4.931141681714105, "grad_norm": 0.4915383458137512, "learning_rate": 1.0702589558325057e-05, "loss": 1.2123, "step": 16556 }, { "epoch": 4.931439527913773, "grad_norm": 0.29612472653388977, "learning_rate": 1.0701627372342089e-05, "loss": 1.2274, "step": 16557 }, { "epoch": 4.9317373741134425, "grad_norm": 0.539953887462616, "learning_rate": 1.0700665179831258e-05, "loss": 1.2099, "step": 16558 }, { "epoch": 4.932035220313111, "grad_norm": 0.35313206911087036, "learning_rate": 1.0699702980801522e-05, "loss": 1.2357, "step": 16559 }, { "epoch": 4.932333066512779, "grad_norm": 0.43425267934799194, "learning_rate": 1.069874077526183e-05, "loss": 1.2249, "step": 16560 }, { "epoch": 4.932630912712448, "grad_norm": 0.2519639730453491, "learning_rate": 1.0697778563221137e-05, "loss": 1.2166, "step": 16561 }, { "epoch": 4.932928758912117, "grad_norm": 0.7543262839317322, "learning_rate": 1.0696816344688394e-05, "loss": 1.213, "step": 16562 }, { "epoch": 4.933226605111786, "grad_norm": 0.37575799226760864, "learning_rate": 1.0695854119672553e-05, "loss": 1.2345, "step": 16563 }, { "epoch": 4.933524451311454, "grad_norm": 0.36105549335479736, "learning_rate": 1.069489188818257e-05, "loss": 1.2265, "step": 16564 }, { "epoch": 4.933822297511123, "grad_norm": 0.32097697257995605, "learning_rate": 1.0693929650227392e-05, "loss": 1.2115, "step": 16565 }, { "epoch": 4.934120143710791, "grad_norm": 0.3280133008956909, "learning_rate": 1.069296740581597e-05, "loss": 1.2306, "step": 16566 }, { "epoch": 4.93441798991046, "grad_norm": 0.38967204093933105, "learning_rate": 1.0692005154957265e-05, "loss": 1.2378, "step": 16567 }, { "epoch": 4.934715836110128, "grad_norm": 0.26550593972206116, "learning_rate": 1.0691042897660226e-05, "loss": 1.2188, "step": 16568 }, { "epoch": 4.935013682309798, "grad_norm": 0.34446680545806885, "learning_rate": 1.0690080633933803e-05, "loss": 1.2203, "step": 16569 }, { "epoch": 4.935311528509466, "grad_norm": 0.3465847969055176, "learning_rate": 1.068911836378695e-05, "loss": 1.2089, "step": 16570 }, { "epoch": 4.935609374709134, "grad_norm": 0.3076886236667633, "learning_rate": 1.0688156087228625e-05, "loss": 1.2128, "step": 16571 }, { "epoch": 4.9359072209088035, "grad_norm": 0.43502846360206604, "learning_rate": 1.068719380426777e-05, "loss": 1.2222, "step": 16572 }, { "epoch": 4.936205067108472, "grad_norm": 0.37854209542274475, "learning_rate": 1.0686231514913347e-05, "loss": 1.2309, "step": 16573 }, { "epoch": 4.93650291330814, "grad_norm": 0.49233517050743103, "learning_rate": 1.0685269219174308e-05, "loss": 1.2329, "step": 16574 }, { "epoch": 4.936800759507809, "grad_norm": 0.4589947462081909, "learning_rate": 1.0684306917059604e-05, "loss": 1.2298, "step": 16575 }, { "epoch": 4.937098605707478, "grad_norm": 0.41953331232070923, "learning_rate": 1.0683344608578187e-05, "loss": 1.2264, "step": 16576 }, { "epoch": 4.937396451907146, "grad_norm": 0.31615251302719116, "learning_rate": 1.0682382293739014e-05, "loss": 1.2115, "step": 16577 }, { "epoch": 4.937694298106815, "grad_norm": 0.37679243087768555, "learning_rate": 1.0681419972551036e-05, "loss": 1.2176, "step": 16578 }, { "epoch": 4.937992144306484, "grad_norm": 0.4259312152862549, "learning_rate": 1.0680457645023204e-05, "loss": 1.2218, "step": 16579 }, { "epoch": 4.938289990506153, "grad_norm": 0.29420703649520874, "learning_rate": 1.0679495311164473e-05, "loss": 1.2267, "step": 16580 }, { "epoch": 4.938587836705821, "grad_norm": 0.2868013083934784, "learning_rate": 1.0678532970983804e-05, "loss": 1.2311, "step": 16581 }, { "epoch": 4.9388856829054895, "grad_norm": 0.3442378044128418, "learning_rate": 1.0677570624490138e-05, "loss": 1.2353, "step": 16582 }, { "epoch": 4.939183529105159, "grad_norm": 0.2716892659664154, "learning_rate": 1.0676608271692437e-05, "loss": 1.2102, "step": 16583 }, { "epoch": 4.939481375304827, "grad_norm": 0.2793993055820465, "learning_rate": 1.067564591259965e-05, "loss": 1.2019, "step": 16584 }, { "epoch": 4.939779221504495, "grad_norm": 0.33249780535697937, "learning_rate": 1.0674683547220734e-05, "loss": 1.2233, "step": 16585 }, { "epoch": 4.9400770677041645, "grad_norm": 0.28556743264198303, "learning_rate": 1.067372117556464e-05, "loss": 1.2233, "step": 16586 }, { "epoch": 4.940374913903833, "grad_norm": 0.2517443299293518, "learning_rate": 1.0672758797640324e-05, "loss": 1.2298, "step": 16587 }, { "epoch": 4.940672760103501, "grad_norm": 0.2680200934410095, "learning_rate": 1.067179641345674e-05, "loss": 1.2179, "step": 16588 }, { "epoch": 4.94097060630317, "grad_norm": 0.26771560311317444, "learning_rate": 1.0670834023022843e-05, "loss": 1.2148, "step": 16589 }, { "epoch": 4.941268452502839, "grad_norm": 0.3182801902294159, "learning_rate": 1.066987162634758e-05, "loss": 1.2379, "step": 16590 }, { "epoch": 4.941566298702508, "grad_norm": 0.29841336607933044, "learning_rate": 1.0668909223439912e-05, "loss": 1.2207, "step": 16591 }, { "epoch": 4.941864144902176, "grad_norm": 0.2742636203765869, "learning_rate": 1.0667946814308792e-05, "loss": 1.2226, "step": 16592 }, { "epoch": 4.942161991101845, "grad_norm": 0.2781110107898712, "learning_rate": 1.0666984398963171e-05, "loss": 1.2145, "step": 16593 }, { "epoch": 4.942459837301513, "grad_norm": 0.36052560806274414, "learning_rate": 1.0666021977412007e-05, "loss": 1.2371, "step": 16594 }, { "epoch": 4.942757683501182, "grad_norm": 0.41813093423843384, "learning_rate": 1.0665059549664252e-05, "loss": 1.2129, "step": 16595 }, { "epoch": 4.9430555297008505, "grad_norm": 0.2921593487262726, "learning_rate": 1.066409711572886e-05, "loss": 1.2238, "step": 16596 }, { "epoch": 4.94335337590052, "grad_norm": 0.5107043981552124, "learning_rate": 1.0663134675614788e-05, "loss": 1.2128, "step": 16597 }, { "epoch": 4.943651222100188, "grad_norm": 0.7092805504798889, "learning_rate": 1.066217222933099e-05, "loss": 1.2299, "step": 16598 }, { "epoch": 4.943949068299856, "grad_norm": 0.3690491318702698, "learning_rate": 1.0661209776886417e-05, "loss": 1.2131, "step": 16599 }, { "epoch": 4.944246914499526, "grad_norm": 0.40929409861564636, "learning_rate": 1.0660247318290027e-05, "loss": 1.2132, "step": 16600 }, { "epoch": 4.944544760699194, "grad_norm": 0.25750359892845154, "learning_rate": 1.065928485355077e-05, "loss": 1.2123, "step": 16601 }, { "epoch": 4.944842606898862, "grad_norm": 0.3441201150417328, "learning_rate": 1.0658322382677608e-05, "loss": 1.1992, "step": 16602 }, { "epoch": 4.9451404530985315, "grad_norm": 0.28091517090797424, "learning_rate": 1.065735990567949e-05, "loss": 1.2086, "step": 16603 }, { "epoch": 4.9454382992982, "grad_norm": 0.32239842414855957, "learning_rate": 1.0656397422565373e-05, "loss": 1.2272, "step": 16604 }, { "epoch": 4.945736145497868, "grad_norm": 0.34925195574760437, "learning_rate": 1.0655434933344213e-05, "loss": 1.2248, "step": 16605 }, { "epoch": 4.946033991697537, "grad_norm": 0.3011159896850586, "learning_rate": 1.0654472438024962e-05, "loss": 1.2232, "step": 16606 }, { "epoch": 4.946331837897206, "grad_norm": 0.4502594470977783, "learning_rate": 1.0653509936616575e-05, "loss": 1.2162, "step": 16607 }, { "epoch": 4.946629684096875, "grad_norm": 0.3334161639213562, "learning_rate": 1.0652547429128008e-05, "loss": 1.2277, "step": 16608 }, { "epoch": 4.946927530296543, "grad_norm": 0.2528218924999237, "learning_rate": 1.0651584915568215e-05, "loss": 1.2299, "step": 16609 }, { "epoch": 4.9472253764962115, "grad_norm": 0.31391286849975586, "learning_rate": 1.0650622395946155e-05, "loss": 1.2095, "step": 16610 }, { "epoch": 4.947523222695881, "grad_norm": 0.3338140845298767, "learning_rate": 1.0649659870270781e-05, "loss": 1.2225, "step": 16611 }, { "epoch": 4.947821068895549, "grad_norm": 0.390227735042572, "learning_rate": 1.0648697338551048e-05, "loss": 1.2077, "step": 16612 }, { "epoch": 4.948118915095217, "grad_norm": 0.3159486949443817, "learning_rate": 1.0647734800795908e-05, "loss": 1.2334, "step": 16613 }, { "epoch": 4.948416761294887, "grad_norm": 0.4039250910282135, "learning_rate": 1.0646772257014319e-05, "loss": 1.2294, "step": 16614 }, { "epoch": 4.948714607494555, "grad_norm": 0.2981310784816742, "learning_rate": 1.0645809707215242e-05, "loss": 1.2164, "step": 16615 }, { "epoch": 4.949012453694223, "grad_norm": 0.2539900541305542, "learning_rate": 1.0644847151407624e-05, "loss": 1.2048, "step": 16616 }, { "epoch": 4.9493102998938925, "grad_norm": 0.257242888212204, "learning_rate": 1.0643884589600423e-05, "loss": 1.2153, "step": 16617 }, { "epoch": 4.949608146093561, "grad_norm": 0.33412307500839233, "learning_rate": 1.0642922021802597e-05, "loss": 1.205, "step": 16618 }, { "epoch": 4.94990599229323, "grad_norm": 0.3598838150501251, "learning_rate": 1.0641959448023099e-05, "loss": 1.2251, "step": 16619 }, { "epoch": 4.950203838492898, "grad_norm": 0.23968324065208435, "learning_rate": 1.0640996868270885e-05, "loss": 1.2164, "step": 16620 }, { "epoch": 4.950501684692567, "grad_norm": 0.3048483431339264, "learning_rate": 1.0640034282554912e-05, "loss": 1.2091, "step": 16621 }, { "epoch": 4.950799530892236, "grad_norm": 0.2794269323348999, "learning_rate": 1.0639071690884138e-05, "loss": 1.2181, "step": 16622 }, { "epoch": 4.951097377091904, "grad_norm": 0.26282474398612976, "learning_rate": 1.063810909326751e-05, "loss": 1.2193, "step": 16623 }, { "epoch": 4.951395223291573, "grad_norm": 0.3358785808086395, "learning_rate": 1.0637146489713993e-05, "loss": 1.2326, "step": 16624 }, { "epoch": 4.951693069491242, "grad_norm": 0.28418686985969543, "learning_rate": 1.0636183880232541e-05, "loss": 1.2107, "step": 16625 }, { "epoch": 4.95199091569091, "grad_norm": 0.28371039032936096, "learning_rate": 1.0635221264832105e-05, "loss": 1.213, "step": 16626 }, { "epoch": 4.952288761890578, "grad_norm": 0.26671627163887024, "learning_rate": 1.063425864352165e-05, "loss": 1.1987, "step": 16627 }, { "epoch": 4.952586608090248, "grad_norm": 0.284598708152771, "learning_rate": 1.0633296016310122e-05, "loss": 1.2128, "step": 16628 }, { "epoch": 4.952884454289916, "grad_norm": 0.47250860929489136, "learning_rate": 1.0632333383206486e-05, "loss": 1.2312, "step": 16629 }, { "epoch": 4.953182300489585, "grad_norm": 0.4156145751476288, "learning_rate": 1.0631370744219694e-05, "loss": 1.2304, "step": 16630 }, { "epoch": 4.9534801466892535, "grad_norm": 0.3234199285507202, "learning_rate": 1.0630408099358697e-05, "loss": 1.218, "step": 16631 }, { "epoch": 4.953777992888922, "grad_norm": 0.4637068510055542, "learning_rate": 1.0629445448632462e-05, "loss": 1.2133, "step": 16632 }, { "epoch": 4.95407583908859, "grad_norm": 0.2520059049129486, "learning_rate": 1.062848279204994e-05, "loss": 1.211, "step": 16633 }, { "epoch": 4.954373685288259, "grad_norm": 0.3065948486328125, "learning_rate": 1.0627520129620087e-05, "loss": 1.2251, "step": 16634 }, { "epoch": 4.954671531487928, "grad_norm": 0.2537427544593811, "learning_rate": 1.0626557461351862e-05, "loss": 1.204, "step": 16635 }, { "epoch": 4.954969377687597, "grad_norm": 0.2744297385215759, "learning_rate": 1.0625594787254216e-05, "loss": 1.2239, "step": 16636 }, { "epoch": 4.955267223887265, "grad_norm": 0.26712867617607117, "learning_rate": 1.0624632107336112e-05, "loss": 1.2185, "step": 16637 }, { "epoch": 4.955565070086934, "grad_norm": 0.28278082609176636, "learning_rate": 1.0623669421606504e-05, "loss": 1.2089, "step": 16638 }, { "epoch": 4.955862916286603, "grad_norm": 0.2572941780090332, "learning_rate": 1.0622706730074352e-05, "loss": 1.221, "step": 16639 }, { "epoch": 4.956160762486271, "grad_norm": 0.2989979684352875, "learning_rate": 1.0621744032748607e-05, "loss": 1.2103, "step": 16640 }, { "epoch": 4.9564586086859395, "grad_norm": 0.2684837281703949, "learning_rate": 1.0620781329638228e-05, "loss": 1.2171, "step": 16641 }, { "epoch": 4.956756454885609, "grad_norm": 0.40119245648384094, "learning_rate": 1.0619818620752172e-05, "loss": 1.2245, "step": 16642 }, { "epoch": 4.957054301085277, "grad_norm": 0.37163183093070984, "learning_rate": 1.0618855906099395e-05, "loss": 1.2182, "step": 16643 }, { "epoch": 4.957352147284945, "grad_norm": 0.26145032048225403, "learning_rate": 1.0617893185688856e-05, "loss": 1.2386, "step": 16644 }, { "epoch": 4.9576499934846145, "grad_norm": 0.45826685428619385, "learning_rate": 1.0616930459529513e-05, "loss": 1.2294, "step": 16645 }, { "epoch": 4.957947839684283, "grad_norm": 0.5881915092468262, "learning_rate": 1.061596772763032e-05, "loss": 1.2201, "step": 16646 }, { "epoch": 4.958245685883952, "grad_norm": 0.4020099639892578, "learning_rate": 1.0615004990000237e-05, "loss": 1.2128, "step": 16647 }, { "epoch": 4.95854353208362, "grad_norm": 0.2901490330696106, "learning_rate": 1.0614042246648217e-05, "loss": 1.2284, "step": 16648 }, { "epoch": 4.958841378283289, "grad_norm": 0.33594927191734314, "learning_rate": 1.0613079497583223e-05, "loss": 1.2197, "step": 16649 }, { "epoch": 4.959139224482958, "grad_norm": 0.2905789613723755, "learning_rate": 1.0612116742814207e-05, "loss": 1.2082, "step": 16650 }, { "epoch": 4.959437070682626, "grad_norm": 0.3242633044719696, "learning_rate": 1.0611153982350129e-05, "loss": 1.2226, "step": 16651 }, { "epoch": 4.959734916882295, "grad_norm": 0.30701521039009094, "learning_rate": 1.0610191216199945e-05, "loss": 1.217, "step": 16652 }, { "epoch": 4.960032763081964, "grad_norm": 0.5019434690475464, "learning_rate": 1.0609228444372617e-05, "loss": 1.2233, "step": 16653 }, { "epoch": 4.960330609281632, "grad_norm": 0.688578188419342, "learning_rate": 1.0608265666877095e-05, "loss": 1.2137, "step": 16654 }, { "epoch": 4.9606284554813005, "grad_norm": 0.32056665420532227, "learning_rate": 1.0607302883722342e-05, "loss": 1.2392, "step": 16655 }, { "epoch": 4.96092630168097, "grad_norm": 0.7792702317237854, "learning_rate": 1.0606340094917318e-05, "loss": 1.2381, "step": 16656 }, { "epoch": 4.961224147880638, "grad_norm": 0.3579455316066742, "learning_rate": 1.0605377300470971e-05, "loss": 1.219, "step": 16657 }, { "epoch": 4.961521994080307, "grad_norm": 0.9020829200744629, "learning_rate": 1.0604414500392269e-05, "loss": 1.2164, "step": 16658 }, { "epoch": 4.961819840279976, "grad_norm": 0.6582271456718445, "learning_rate": 1.060345169469016e-05, "loss": 1.2373, "step": 16659 }, { "epoch": 4.962117686479644, "grad_norm": 0.5560967922210693, "learning_rate": 1.0602488883373611e-05, "loss": 1.2232, "step": 16660 }, { "epoch": 4.962415532679312, "grad_norm": 0.37864935398101807, "learning_rate": 1.0601526066451577e-05, "loss": 1.2243, "step": 16661 }, { "epoch": 4.9627133788789815, "grad_norm": 0.4254647195339203, "learning_rate": 1.0600563243933012e-05, "loss": 1.2283, "step": 16662 }, { "epoch": 4.96301122507865, "grad_norm": 0.3214024603366852, "learning_rate": 1.059960041582688e-05, "loss": 1.2108, "step": 16663 }, { "epoch": 4.963309071278319, "grad_norm": 0.37460124492645264, "learning_rate": 1.0598637582142134e-05, "loss": 1.2047, "step": 16664 }, { "epoch": 4.963606917477987, "grad_norm": 0.5097197890281677, "learning_rate": 1.0597674742887737e-05, "loss": 1.234, "step": 16665 }, { "epoch": 4.963904763677656, "grad_norm": 0.29781848192214966, "learning_rate": 1.0596711898072642e-05, "loss": 1.2104, "step": 16666 }, { "epoch": 4.964202609877325, "grad_norm": 0.4486392140388489, "learning_rate": 1.0595749047705809e-05, "loss": 1.2232, "step": 16667 }, { "epoch": 4.964500456076993, "grad_norm": 0.2860359251499176, "learning_rate": 1.0594786191796195e-05, "loss": 1.2169, "step": 16668 }, { "epoch": 4.9647983022766615, "grad_norm": 0.36968132853507996, "learning_rate": 1.0593823330352764e-05, "loss": 1.2283, "step": 16669 }, { "epoch": 4.965096148476331, "grad_norm": 0.3152136504650116, "learning_rate": 1.0592860463384472e-05, "loss": 1.2331, "step": 16670 }, { "epoch": 4.965393994675999, "grad_norm": 0.24299803376197815, "learning_rate": 1.0591897590900272e-05, "loss": 1.2054, "step": 16671 }, { "epoch": 4.965691840875667, "grad_norm": 0.3697117269039154, "learning_rate": 1.0590934712909122e-05, "loss": 1.2058, "step": 16672 }, { "epoch": 4.965989687075337, "grad_norm": 0.3239505887031555, "learning_rate": 1.0589971829419992e-05, "loss": 1.2104, "step": 16673 }, { "epoch": 4.966287533275005, "grad_norm": 0.3296203911304474, "learning_rate": 1.0589008940441831e-05, "loss": 1.2199, "step": 16674 }, { "epoch": 4.966585379474674, "grad_norm": 0.3282039761543274, "learning_rate": 1.05880460459836e-05, "loss": 1.2145, "step": 16675 }, { "epoch": 4.9668832256743425, "grad_norm": 0.2871517539024353, "learning_rate": 1.0587083146054258e-05, "loss": 1.2284, "step": 16676 }, { "epoch": 4.967181071874011, "grad_norm": 0.2773374915122986, "learning_rate": 1.058612024066276e-05, "loss": 1.2132, "step": 16677 }, { "epoch": 4.96747891807368, "grad_norm": 0.264411985874176, "learning_rate": 1.0585157329818069e-05, "loss": 1.2254, "step": 16678 }, { "epoch": 4.967776764273348, "grad_norm": 0.2909695506095886, "learning_rate": 1.0584194413529145e-05, "loss": 1.2263, "step": 16679 }, { "epoch": 4.968074610473017, "grad_norm": 0.2745228707790375, "learning_rate": 1.0583231491804946e-05, "loss": 1.2186, "step": 16680 }, { "epoch": 4.968372456672686, "grad_norm": 0.27479180693626404, "learning_rate": 1.0582268564654427e-05, "loss": 1.2119, "step": 16681 }, { "epoch": 4.968670302872354, "grad_norm": 0.2838924825191498, "learning_rate": 1.058130563208655e-05, "loss": 1.2305, "step": 16682 }, { "epoch": 4.9689681490720226, "grad_norm": 0.2745339572429657, "learning_rate": 1.0580342694110272e-05, "loss": 1.2148, "step": 16683 }, { "epoch": 4.969265995271692, "grad_norm": 0.3092835545539856, "learning_rate": 1.0579379750734557e-05, "loss": 1.2076, "step": 16684 }, { "epoch": 4.96956384147136, "grad_norm": 0.29313743114471436, "learning_rate": 1.0578416801968359e-05, "loss": 1.2418, "step": 16685 }, { "epoch": 4.969861687671029, "grad_norm": 0.260153204202652, "learning_rate": 1.0577453847820638e-05, "loss": 1.2173, "step": 16686 }, { "epoch": 4.970159533870698, "grad_norm": 0.28208646178245544, "learning_rate": 1.0576490888300357e-05, "loss": 1.2278, "step": 16687 }, { "epoch": 4.970457380070366, "grad_norm": 0.27743834257125854, "learning_rate": 1.057552792341647e-05, "loss": 1.2268, "step": 16688 }, { "epoch": 4.970755226270035, "grad_norm": 0.278525710105896, "learning_rate": 1.057456495317794e-05, "loss": 1.2212, "step": 16689 }, { "epoch": 4.9710530724697035, "grad_norm": 0.3726790249347687, "learning_rate": 1.0573601977593724e-05, "loss": 1.221, "step": 16690 }, { "epoch": 4.971350918669372, "grad_norm": 0.41416075825691223, "learning_rate": 1.0572638996672781e-05, "loss": 1.2207, "step": 16691 }, { "epoch": 4.971648764869041, "grad_norm": 0.2695554494857788, "learning_rate": 1.0571676010424072e-05, "loss": 1.221, "step": 16692 }, { "epoch": 4.971946611068709, "grad_norm": 0.36792466044425964, "learning_rate": 1.057071301885656e-05, "loss": 1.2063, "step": 16693 }, { "epoch": 4.972244457268378, "grad_norm": 0.28067547082901, "learning_rate": 1.0569750021979199e-05, "loss": 1.2353, "step": 16694 }, { "epoch": 4.972542303468047, "grad_norm": 0.4062131643295288, "learning_rate": 1.0568787019800951e-05, "loss": 1.2273, "step": 16695 }, { "epoch": 4.972840149667715, "grad_norm": 0.31503891944885254, "learning_rate": 1.0567824012330772e-05, "loss": 1.2109, "step": 16696 }, { "epoch": 4.9731379958673845, "grad_norm": 0.2861665189266205, "learning_rate": 1.056686099957763e-05, "loss": 1.2136, "step": 16697 }, { "epoch": 4.973435842067053, "grad_norm": 0.2808992266654968, "learning_rate": 1.0565897981550477e-05, "loss": 1.2074, "step": 16698 }, { "epoch": 4.973733688266721, "grad_norm": 0.3202911615371704, "learning_rate": 1.0564934958258278e-05, "loss": 1.2055, "step": 16699 }, { "epoch": 4.9740315344663895, "grad_norm": 0.5615058541297913, "learning_rate": 1.0563971929709988e-05, "loss": 1.2232, "step": 16700 }, { "epoch": 4.974329380666059, "grad_norm": 0.5817986130714417, "learning_rate": 1.0563008895914569e-05, "loss": 1.2207, "step": 16701 }, { "epoch": 4.974627226865727, "grad_norm": 0.30600810050964355, "learning_rate": 1.056204585688098e-05, "loss": 1.1981, "step": 16702 }, { "epoch": 4.974925073065396, "grad_norm": 0.6330814361572266, "learning_rate": 1.0561082812618184e-05, "loss": 1.2219, "step": 16703 }, { "epoch": 4.9752229192650645, "grad_norm": 0.3917084038257599, "learning_rate": 1.0560119763135143e-05, "loss": 1.2194, "step": 16704 }, { "epoch": 4.975520765464733, "grad_norm": 0.5173817276954651, "learning_rate": 1.055915670844081e-05, "loss": 1.201, "step": 16705 }, { "epoch": 4.975818611664402, "grad_norm": 0.4674241244792938, "learning_rate": 1.0558193648544148e-05, "loss": 1.2212, "step": 16706 }, { "epoch": 4.97611645786407, "grad_norm": 0.48664671182632446, "learning_rate": 1.0557230583454119e-05, "loss": 1.2251, "step": 16707 }, { "epoch": 4.976414304063739, "grad_norm": 0.4714708924293518, "learning_rate": 1.055626751317968e-05, "loss": 1.2109, "step": 16708 }, { "epoch": 4.976712150263408, "grad_norm": 0.5601451396942139, "learning_rate": 1.0555304437729795e-05, "loss": 1.2094, "step": 16709 }, { "epoch": 4.977009996463076, "grad_norm": 0.6209285259246826, "learning_rate": 1.0554341357113423e-05, "loss": 1.2182, "step": 16710 }, { "epoch": 4.977307842662745, "grad_norm": 0.3429107367992401, "learning_rate": 1.0553378271339523e-05, "loss": 1.2225, "step": 16711 }, { "epoch": 4.977605688862414, "grad_norm": 0.32052913308143616, "learning_rate": 1.0552415180417057e-05, "loss": 1.2083, "step": 16712 }, { "epoch": 4.977903535062082, "grad_norm": 0.3625233471393585, "learning_rate": 1.0551452084354982e-05, "loss": 1.23, "step": 16713 }, { "epoch": 4.978201381261751, "grad_norm": 0.2513951361179352, "learning_rate": 1.0550488983162266e-05, "loss": 1.2109, "step": 16714 }, { "epoch": 4.97849922746142, "grad_norm": 0.3679635226726532, "learning_rate": 1.0549525876847863e-05, "loss": 1.222, "step": 16715 }, { "epoch": 4.978797073661088, "grad_norm": 0.2886575758457184, "learning_rate": 1.0548562765420735e-05, "loss": 1.1996, "step": 16716 }, { "epoch": 4.979094919860757, "grad_norm": 0.2606308162212372, "learning_rate": 1.0547599648889846e-05, "loss": 1.2123, "step": 16717 }, { "epoch": 4.979392766060426, "grad_norm": 0.38203656673431396, "learning_rate": 1.0546636527264154e-05, "loss": 1.2061, "step": 16718 }, { "epoch": 4.979690612260094, "grad_norm": 0.26509690284729004, "learning_rate": 1.0545673400552613e-05, "loss": 1.2082, "step": 16719 }, { "epoch": 4.979988458459763, "grad_norm": 0.4086620807647705, "learning_rate": 1.0544710268764198e-05, "loss": 1.2126, "step": 16720 }, { "epoch": 4.9802863046594315, "grad_norm": 0.3342956006526947, "learning_rate": 1.0543747131907862e-05, "loss": 1.2151, "step": 16721 }, { "epoch": 4.9805841508591, "grad_norm": 0.303300678730011, "learning_rate": 1.0542783989992565e-05, "loss": 1.2292, "step": 16722 }, { "epoch": 4.980881997058769, "grad_norm": 0.322784423828125, "learning_rate": 1.0541820843027268e-05, "loss": 1.2103, "step": 16723 }, { "epoch": 4.981179843258437, "grad_norm": 0.30369865894317627, "learning_rate": 1.0540857691020934e-05, "loss": 1.2106, "step": 16724 }, { "epoch": 4.9814776894581065, "grad_norm": 0.2760009765625, "learning_rate": 1.0539894533982524e-05, "loss": 1.213, "step": 16725 }, { "epoch": 4.981775535657775, "grad_norm": 0.2748645842075348, "learning_rate": 1.0538931371921e-05, "loss": 1.1896, "step": 16726 }, { "epoch": 4.982073381857443, "grad_norm": 0.29580578207969666, "learning_rate": 1.0537968204845319e-05, "loss": 1.2122, "step": 16727 }, { "epoch": 4.9823712280571115, "grad_norm": 0.266201376914978, "learning_rate": 1.0537005032764447e-05, "loss": 1.2032, "step": 16728 }, { "epoch": 4.982669074256781, "grad_norm": 0.3600304126739502, "learning_rate": 1.0536041855687343e-05, "loss": 1.2158, "step": 16729 }, { "epoch": 4.982966920456449, "grad_norm": 0.41808775067329407, "learning_rate": 1.0535078673622967e-05, "loss": 1.2106, "step": 16730 }, { "epoch": 4.983264766656118, "grad_norm": 0.3674350380897522, "learning_rate": 1.0534115486580283e-05, "loss": 1.22, "step": 16731 }, { "epoch": 4.983562612855787, "grad_norm": 0.496549129486084, "learning_rate": 1.053315229456825e-05, "loss": 1.2233, "step": 16732 }, { "epoch": 4.983860459055455, "grad_norm": 0.4117461144924164, "learning_rate": 1.0532189097595831e-05, "loss": 1.2182, "step": 16733 }, { "epoch": 4.984158305255124, "grad_norm": 0.5589625239372253, "learning_rate": 1.053122589567199e-05, "loss": 1.2178, "step": 16734 }, { "epoch": 4.9844561514547925, "grad_norm": 0.4036214351654053, "learning_rate": 1.0530262688805684e-05, "loss": 1.2201, "step": 16735 }, { "epoch": 4.984753997654461, "grad_norm": 0.38442564010620117, "learning_rate": 1.0529299477005874e-05, "loss": 1.2195, "step": 16736 }, { "epoch": 4.98505184385413, "grad_norm": 0.4841512143611908, "learning_rate": 1.0528336260281523e-05, "loss": 1.2209, "step": 16737 }, { "epoch": 4.985349690053798, "grad_norm": 0.2693743109703064, "learning_rate": 1.0527373038641595e-05, "loss": 1.2218, "step": 16738 }, { "epoch": 4.985647536253467, "grad_norm": 0.5425184965133667, "learning_rate": 1.0526409812095052e-05, "loss": 1.2191, "step": 16739 }, { "epoch": 4.985945382453136, "grad_norm": 0.33673861622810364, "learning_rate": 1.0525446580650852e-05, "loss": 1.2281, "step": 16740 }, { "epoch": 4.986243228652804, "grad_norm": 0.4996735453605652, "learning_rate": 1.0524483344317959e-05, "loss": 1.2295, "step": 16741 }, { "epoch": 4.986541074852473, "grad_norm": 0.3482604920864105, "learning_rate": 1.0523520103105331e-05, "loss": 1.2104, "step": 16742 }, { "epoch": 4.986838921052142, "grad_norm": 0.26863017678260803, "learning_rate": 1.0522556857021937e-05, "loss": 1.2201, "step": 16743 }, { "epoch": 4.98713676725181, "grad_norm": 0.31027543544769287, "learning_rate": 1.0521593606076734e-05, "loss": 1.2266, "step": 16744 }, { "epoch": 4.987434613451479, "grad_norm": 0.26575297117233276, "learning_rate": 1.0520630350278689e-05, "loss": 1.2221, "step": 16745 }, { "epoch": 4.987732459651148, "grad_norm": 0.27818453311920166, "learning_rate": 1.0519667089636758e-05, "loss": 1.2135, "step": 16746 }, { "epoch": 4.988030305850816, "grad_norm": 0.3446889817714691, "learning_rate": 1.0518703824159903e-05, "loss": 1.2133, "step": 16747 }, { "epoch": 4.988328152050485, "grad_norm": 0.2743852734565735, "learning_rate": 1.0517740553857089e-05, "loss": 1.2231, "step": 16748 }, { "epoch": 4.9886259982501535, "grad_norm": 0.3398355543613434, "learning_rate": 1.051677727873728e-05, "loss": 1.2129, "step": 16749 }, { "epoch": 4.988923844449822, "grad_norm": 0.3796647787094116, "learning_rate": 1.0515813998809434e-05, "loss": 1.2016, "step": 16750 }, { "epoch": 4.989221690649491, "grad_norm": 0.30225494503974915, "learning_rate": 1.0514850714082517e-05, "loss": 1.219, "step": 16751 }, { "epoch": 4.989519536849159, "grad_norm": 0.8262394070625305, "learning_rate": 1.051388742456549e-05, "loss": 1.239, "step": 16752 }, { "epoch": 4.989817383048829, "grad_norm": 0.515708327293396, "learning_rate": 1.051292413026731e-05, "loss": 1.2234, "step": 16753 }, { "epoch": 4.990115229248497, "grad_norm": 0.45032769441604614, "learning_rate": 1.0511960831196946e-05, "loss": 1.2183, "step": 16754 }, { "epoch": 4.990413075448165, "grad_norm": 0.4615631103515625, "learning_rate": 1.051099752736336e-05, "loss": 1.2119, "step": 16755 }, { "epoch": 4.9907109216478345, "grad_norm": 0.4467100203037262, "learning_rate": 1.0510034218775514e-05, "loss": 1.2253, "step": 16756 }, { "epoch": 4.991008767847503, "grad_norm": 0.3474428951740265, "learning_rate": 1.0509070905442364e-05, "loss": 1.2064, "step": 16757 }, { "epoch": 4.991306614047171, "grad_norm": 0.40014272928237915, "learning_rate": 1.0508107587372884e-05, "loss": 1.2208, "step": 16758 }, { "epoch": 4.99160446024684, "grad_norm": 0.321129709482193, "learning_rate": 1.0507144264576028e-05, "loss": 1.2146, "step": 16759 }, { "epoch": 4.991902306446509, "grad_norm": 0.2836153209209442, "learning_rate": 1.0506180937060759e-05, "loss": 1.1964, "step": 16760 }, { "epoch": 4.992200152646177, "grad_norm": 0.44101935625076294, "learning_rate": 1.0505217604836045e-05, "loss": 1.2227, "step": 16761 }, { "epoch": 4.992497998845846, "grad_norm": 0.2814334034919739, "learning_rate": 1.0504254267910847e-05, "loss": 1.2179, "step": 16762 }, { "epoch": 4.9927958450455145, "grad_norm": 0.3615776300430298, "learning_rate": 1.0503290926294122e-05, "loss": 1.2043, "step": 16763 }, { "epoch": 4.993093691245184, "grad_norm": 0.30053389072418213, "learning_rate": 1.050232757999484e-05, "loss": 1.2132, "step": 16764 }, { "epoch": 4.993391537444852, "grad_norm": 0.3011837303638458, "learning_rate": 1.0501364229021962e-05, "loss": 1.202, "step": 16765 }, { "epoch": 4.99368938364452, "grad_norm": 0.26689302921295166, "learning_rate": 1.0500400873384446e-05, "loss": 1.2103, "step": 16766 }, { "epoch": 4.993987229844189, "grad_norm": 0.3297526240348816, "learning_rate": 1.0499437513091263e-05, "loss": 1.2247, "step": 16767 }, { "epoch": 4.994285076043858, "grad_norm": 0.30646270513534546, "learning_rate": 1.049847414815137e-05, "loss": 1.2096, "step": 16768 }, { "epoch": 4.994582922243526, "grad_norm": 0.31208336353302, "learning_rate": 1.0497510778573733e-05, "loss": 1.2111, "step": 16769 }, { "epoch": 4.9948807684431955, "grad_norm": 0.2566390335559845, "learning_rate": 1.0496547404367314e-05, "loss": 1.2269, "step": 16770 }, { "epoch": 4.995178614642864, "grad_norm": 0.3259469270706177, "learning_rate": 1.0495584025541077e-05, "loss": 1.2038, "step": 16771 }, { "epoch": 4.995476460842532, "grad_norm": 0.2688966393470764, "learning_rate": 1.049462064210398e-05, "loss": 1.2179, "step": 16772 }, { "epoch": 4.995774307042201, "grad_norm": 0.24849647283554077, "learning_rate": 1.0493657254064995e-05, "loss": 1.215, "step": 16773 }, { "epoch": 4.99607215324187, "grad_norm": 0.2939499616622925, "learning_rate": 1.049269386143308e-05, "loss": 1.1907, "step": 16774 }, { "epoch": 4.996369999441538, "grad_norm": 0.2791326642036438, "learning_rate": 1.04917304642172e-05, "loss": 1.2196, "step": 16775 }, { "epoch": 4.996667845641207, "grad_norm": 0.28479069471359253, "learning_rate": 1.0490767062426314e-05, "loss": 1.2281, "step": 16776 }, { "epoch": 4.996965691840876, "grad_norm": 0.5685864686965942, "learning_rate": 1.0489803656069392e-05, "loss": 1.2241, "step": 16777 }, { "epoch": 4.997263538040544, "grad_norm": 0.43502071499824524, "learning_rate": 1.0488840245155392e-05, "loss": 1.2231, "step": 16778 }, { "epoch": 4.997561384240213, "grad_norm": 0.420467346906662, "learning_rate": 1.0487876829693283e-05, "loss": 1.2031, "step": 16779 }, { "epoch": 4.9978592304398815, "grad_norm": 0.4691842198371887, "learning_rate": 1.0486913409692022e-05, "loss": 1.2187, "step": 16780 }, { "epoch": 4.998157076639551, "grad_norm": 0.3853599727153778, "learning_rate": 1.0485949985160575e-05, "loss": 1.221, "step": 16781 }, { "epoch": 4.998454922839219, "grad_norm": 0.5169789791107178, "learning_rate": 1.048498655610791e-05, "loss": 1.2123, "step": 16782 }, { "epoch": 4.998752769038887, "grad_norm": 0.30979102849960327, "learning_rate": 1.0484023122542983e-05, "loss": 1.2158, "step": 16783 }, { "epoch": 4.9990506152385565, "grad_norm": 0.35916584730148315, "learning_rate": 1.0483059684474764e-05, "loss": 1.1891, "step": 16784 }, { "epoch": 4.999348461438225, "grad_norm": 0.3863771855831146, "learning_rate": 1.0482096241912211e-05, "loss": 1.2279, "step": 16785 }, { "epoch": 4.999646307637893, "grad_norm": 0.32903555035591125, "learning_rate": 1.0481132794864298e-05, "loss": 1.2017, "step": 16786 }, { "epoch": 4.999944153837562, "grad_norm": 0.3094794452190399, "learning_rate": 1.0480169343339976e-05, "loss": 1.2045, "step": 16787 }, { "epoch": 5.000242000037231, "grad_norm": 0.2456616908311844, "learning_rate": 1.0479205887348216e-05, "loss": 1.2252, "step": 16788 }, { "epoch": 5.000539846236899, "grad_norm": 0.3059477210044861, "learning_rate": 1.0478242426897982e-05, "loss": 1.2404, "step": 16789 }, { "epoch": 5.000837692436568, "grad_norm": 0.2795867621898651, "learning_rate": 1.0477278961998236e-05, "loss": 1.197, "step": 16790 }, { "epoch": 5.001135538636237, "grad_norm": 0.35036084055900574, "learning_rate": 1.047631549265794e-05, "loss": 1.2122, "step": 16791 }, { "epoch": 5.001433384835905, "grad_norm": 0.24780143797397614, "learning_rate": 1.0475352018886067e-05, "loss": 1.2192, "step": 16792 }, { "epoch": 5.001731231035574, "grad_norm": 0.2544531524181366, "learning_rate": 1.0474388540691569e-05, "loss": 1.2197, "step": 16793 }, { "epoch": 5.0020290772352425, "grad_norm": 0.35282015800476074, "learning_rate": 1.0473425058083418e-05, "loss": 1.2164, "step": 16794 }, { "epoch": 5.002326923434912, "grad_norm": 0.33501848578453064, "learning_rate": 1.0472461571070574e-05, "loss": 1.2349, "step": 16795 }, { "epoch": 5.00262476963458, "grad_norm": 0.24779658019542694, "learning_rate": 1.0471498079662001e-05, "loss": 1.2159, "step": 16796 }, { "epoch": 5.002922615834248, "grad_norm": 0.3270301818847656, "learning_rate": 1.0470534583866669e-05, "loss": 1.2143, "step": 16797 }, { "epoch": 5.003220462033918, "grad_norm": 0.3353484570980072, "learning_rate": 1.0469571083693538e-05, "loss": 1.2297, "step": 16798 }, { "epoch": 5.003518308233586, "grad_norm": 0.3318248689174652, "learning_rate": 1.0468607579151574e-05, "loss": 1.213, "step": 16799 }, { "epoch": 5.003816154433254, "grad_norm": 0.5154165625572205, "learning_rate": 1.0467644070249736e-05, "loss": 1.2114, "step": 16800 }, { "epoch": 5.004114000632923, "grad_norm": 0.2916905879974365, "learning_rate": 1.0466680556996994e-05, "loss": 1.2323, "step": 16801 }, { "epoch": 5.004411846832592, "grad_norm": 0.7062249183654785, "learning_rate": 1.046571703940231e-05, "loss": 1.2052, "step": 16802 }, { "epoch": 5.00470969303226, "grad_norm": 0.28323349356651306, "learning_rate": 1.046475351747465e-05, "loss": 1.2076, "step": 16803 }, { "epoch": 5.005007539231929, "grad_norm": 0.6860963702201843, "learning_rate": 1.046378999122298e-05, "loss": 1.2258, "step": 16804 }, { "epoch": 5.005305385431598, "grad_norm": 0.45981213450431824, "learning_rate": 1.046282646065626e-05, "loss": 1.2151, "step": 16805 }, { "epoch": 5.005603231631266, "grad_norm": 0.4969659149646759, "learning_rate": 1.0461862925783457e-05, "loss": 1.2304, "step": 16806 }, { "epoch": 5.005901077830935, "grad_norm": 0.5255463123321533, "learning_rate": 1.0460899386613537e-05, "loss": 1.2139, "step": 16807 }, { "epoch": 5.0061989240306035, "grad_norm": 0.31206822395324707, "learning_rate": 1.045993584315546e-05, "loss": 1.2146, "step": 16808 }, { "epoch": 5.006496770230273, "grad_norm": 0.36263447999954224, "learning_rate": 1.0458972295418196e-05, "loss": 1.2387, "step": 16809 }, { "epoch": 5.006794616429941, "grad_norm": 0.31523358821868896, "learning_rate": 1.045800874341071e-05, "loss": 1.2408, "step": 16810 }, { "epoch": 5.007092462629609, "grad_norm": 0.26450997591018677, "learning_rate": 1.0457045187141962e-05, "loss": 1.2104, "step": 16811 }, { "epoch": 5.007390308829279, "grad_norm": 0.33075958490371704, "learning_rate": 1.0456081626620918e-05, "loss": 1.2293, "step": 16812 }, { "epoch": 5.007688155028947, "grad_norm": 0.34704312682151794, "learning_rate": 1.0455118061856546e-05, "loss": 1.2166, "step": 16813 }, { "epoch": 5.007986001228615, "grad_norm": 0.40252599120140076, "learning_rate": 1.0454154492857806e-05, "loss": 1.22, "step": 16814 }, { "epoch": 5.0082838474282845, "grad_norm": 0.31370943784713745, "learning_rate": 1.0453190919633669e-05, "loss": 1.2253, "step": 16815 }, { "epoch": 5.008581693627953, "grad_norm": 0.28903695940971375, "learning_rate": 1.0452227342193098e-05, "loss": 1.2191, "step": 16816 }, { "epoch": 5.008879539827621, "grad_norm": 0.3101347088813782, "learning_rate": 1.0451263760545054e-05, "loss": 1.2165, "step": 16817 }, { "epoch": 5.00917738602729, "grad_norm": 0.3071835935115814, "learning_rate": 1.0450300174698505e-05, "loss": 1.1955, "step": 16818 }, { "epoch": 5.009475232226959, "grad_norm": 0.38228222727775574, "learning_rate": 1.0449336584662413e-05, "loss": 1.2072, "step": 16819 }, { "epoch": 5.009773078426628, "grad_norm": 0.4122900664806366, "learning_rate": 1.0448372990445752e-05, "loss": 1.1978, "step": 16820 }, { "epoch": 5.010070924626296, "grad_norm": 0.3079754412174225, "learning_rate": 1.0447409392057479e-05, "loss": 1.2248, "step": 16821 }, { "epoch": 5.0103687708259645, "grad_norm": 0.3712056577205658, "learning_rate": 1.0446445789506561e-05, "loss": 1.213, "step": 16822 }, { "epoch": 5.010666617025634, "grad_norm": 0.26759687066078186, "learning_rate": 1.0445482182801964e-05, "loss": 1.2028, "step": 16823 }, { "epoch": 5.010964463225302, "grad_norm": 0.5693494081497192, "learning_rate": 1.044451857195265e-05, "loss": 1.2246, "step": 16824 }, { "epoch": 5.01126230942497, "grad_norm": 0.2638775706291199, "learning_rate": 1.0443554956967592e-05, "loss": 1.2216, "step": 16825 }, { "epoch": 5.01156015562464, "grad_norm": 0.4996988773345947, "learning_rate": 1.044259133785575e-05, "loss": 1.2302, "step": 16826 }, { "epoch": 5.011858001824308, "grad_norm": 0.3001219928264618, "learning_rate": 1.044162771462609e-05, "loss": 1.2087, "step": 16827 }, { "epoch": 5.012155848023976, "grad_norm": 0.3501885235309601, "learning_rate": 1.0440664087287575e-05, "loss": 1.2154, "step": 16828 }, { "epoch": 5.0124536942236455, "grad_norm": 0.2479260414838791, "learning_rate": 1.0439700455849176e-05, "loss": 1.2149, "step": 16829 }, { "epoch": 5.012751540423314, "grad_norm": 0.3943127691745758, "learning_rate": 1.0438736820319855e-05, "loss": 1.2112, "step": 16830 }, { "epoch": 5.013049386622982, "grad_norm": 0.3294520676136017, "learning_rate": 1.0437773180708575e-05, "loss": 1.215, "step": 16831 }, { "epoch": 5.013347232822651, "grad_norm": 0.25596147775650024, "learning_rate": 1.0436809537024309e-05, "loss": 1.2236, "step": 16832 }, { "epoch": 5.01364507902232, "grad_norm": 0.2845492362976074, "learning_rate": 1.0435845889276018e-05, "loss": 1.2239, "step": 16833 }, { "epoch": 5.013942925221989, "grad_norm": 0.2554742097854614, "learning_rate": 1.0434882237472666e-05, "loss": 1.219, "step": 16834 }, { "epoch": 5.014240771421657, "grad_norm": 0.28055235743522644, "learning_rate": 1.043391858162322e-05, "loss": 1.2077, "step": 16835 }, { "epoch": 5.014538617621326, "grad_norm": 0.290095716714859, "learning_rate": 1.0432954921736646e-05, "loss": 1.2185, "step": 16836 }, { "epoch": 5.014836463820995, "grad_norm": 0.26479220390319824, "learning_rate": 1.0431991257821911e-05, "loss": 1.2041, "step": 16837 }, { "epoch": 5.015134310020663, "grad_norm": 0.27861976623535156, "learning_rate": 1.043102758988798e-05, "loss": 1.2362, "step": 16838 }, { "epoch": 5.0154321562203314, "grad_norm": 0.29868316650390625, "learning_rate": 1.043006391794382e-05, "loss": 1.2242, "step": 16839 }, { "epoch": 5.015730002420001, "grad_norm": 0.28946223855018616, "learning_rate": 1.0429100241998395e-05, "loss": 1.2185, "step": 16840 }, { "epoch": 5.016027848619669, "grad_norm": 0.3949662148952484, "learning_rate": 1.0428136562060673e-05, "loss": 1.2255, "step": 16841 }, { "epoch": 5.016325694819337, "grad_norm": 0.32382822036743164, "learning_rate": 1.0427172878139615e-05, "loss": 1.2142, "step": 16842 }, { "epoch": 5.0166235410190065, "grad_norm": 0.2649022936820984, "learning_rate": 1.0426209190244193e-05, "loss": 1.2183, "step": 16843 }, { "epoch": 5.016921387218675, "grad_norm": 0.2537255585193634, "learning_rate": 1.0425245498383372e-05, "loss": 1.2207, "step": 16844 }, { "epoch": 5.017219233418343, "grad_norm": 0.25246289372444153, "learning_rate": 1.0424281802566114e-05, "loss": 1.2226, "step": 16845 }, { "epoch": 5.017517079618012, "grad_norm": 0.29220494627952576, "learning_rate": 1.042331810280139e-05, "loss": 1.2143, "step": 16846 }, { "epoch": 5.017814925817681, "grad_norm": 0.25921738147735596, "learning_rate": 1.0422354399098165e-05, "loss": 1.2365, "step": 16847 }, { "epoch": 5.01811277201735, "grad_norm": 0.3867584466934204, "learning_rate": 1.04213906914654e-05, "loss": 1.2081, "step": 16848 }, { "epoch": 5.018410618217018, "grad_norm": 0.3751278817653656, "learning_rate": 1.0420426979912068e-05, "loss": 1.2161, "step": 16849 }, { "epoch": 5.018708464416687, "grad_norm": 0.329056054353714, "learning_rate": 1.0419463264447133e-05, "loss": 1.2059, "step": 16850 }, { "epoch": 5.019006310616356, "grad_norm": 0.3369464576244354, "learning_rate": 1.0418499545079562e-05, "loss": 1.2106, "step": 16851 }, { "epoch": 5.019304156816024, "grad_norm": 0.27653270959854126, "learning_rate": 1.0417535821818319e-05, "loss": 1.2201, "step": 16852 }, { "epoch": 5.0196020030156925, "grad_norm": 0.2683281898498535, "learning_rate": 1.041657209467237e-05, "loss": 1.2077, "step": 16853 }, { "epoch": 5.019899849215362, "grad_norm": 0.4458148181438446, "learning_rate": 1.0415608363650685e-05, "loss": 1.2174, "step": 16854 }, { "epoch": 5.02019769541503, "grad_norm": 0.30316099524497986, "learning_rate": 1.0414644628762227e-05, "loss": 1.2164, "step": 16855 }, { "epoch": 5.020495541614698, "grad_norm": 0.40766316652297974, "learning_rate": 1.0413680890015965e-05, "loss": 1.2128, "step": 16856 }, { "epoch": 5.020793387814368, "grad_norm": 0.500697672367096, "learning_rate": 1.0412717147420866e-05, "loss": 1.2112, "step": 16857 }, { "epoch": 5.021091234014036, "grad_norm": 0.2633652985095978, "learning_rate": 1.0411753400985894e-05, "loss": 1.2026, "step": 16858 }, { "epoch": 5.021389080213704, "grad_norm": 0.42190057039260864, "learning_rate": 1.0410789650720016e-05, "loss": 1.2165, "step": 16859 }, { "epoch": 5.021686926413373, "grad_norm": 0.2626347541809082, "learning_rate": 1.0409825896632198e-05, "loss": 1.2153, "step": 16860 }, { "epoch": 5.021984772613042, "grad_norm": 0.5549313426017761, "learning_rate": 1.040886213873141e-05, "loss": 1.2187, "step": 16861 }, { "epoch": 5.022282618812711, "grad_norm": 0.36056095361709595, "learning_rate": 1.0407898377026615e-05, "loss": 1.2166, "step": 16862 }, { "epoch": 5.022580465012379, "grad_norm": 0.4121907353401184, "learning_rate": 1.0406934611526785e-05, "loss": 1.2091, "step": 16863 }, { "epoch": 5.022878311212048, "grad_norm": 0.29632169008255005, "learning_rate": 1.040597084224088e-05, "loss": 1.2185, "step": 16864 }, { "epoch": 5.023176157411717, "grad_norm": 0.4148094654083252, "learning_rate": 1.0405007069177869e-05, "loss": 1.2176, "step": 16865 }, { "epoch": 5.023474003611385, "grad_norm": 0.31199824810028076, "learning_rate": 1.0404043292346722e-05, "loss": 1.2217, "step": 16866 }, { "epoch": 5.0237718498110535, "grad_norm": 0.39971500635147095, "learning_rate": 1.04030795117564e-05, "loss": 1.2261, "step": 16867 }, { "epoch": 5.024069696010723, "grad_norm": 0.35686275362968445, "learning_rate": 1.040211572741588e-05, "loss": 1.2113, "step": 16868 }, { "epoch": 5.024367542210391, "grad_norm": 0.36970195174217224, "learning_rate": 1.0401151939334118e-05, "loss": 1.1995, "step": 16869 }, { "epoch": 5.024665388410059, "grad_norm": 0.34617671370506287, "learning_rate": 1.0400188147520084e-05, "loss": 1.1871, "step": 16870 }, { "epoch": 5.024963234609729, "grad_norm": 0.3224303424358368, "learning_rate": 1.0399224351982748e-05, "loss": 1.2256, "step": 16871 }, { "epoch": 5.025261080809397, "grad_norm": 0.3524259328842163, "learning_rate": 1.0398260552731076e-05, "loss": 1.2142, "step": 16872 }, { "epoch": 5.025558927009065, "grad_norm": 0.3058825433254242, "learning_rate": 1.0397296749774034e-05, "loss": 1.2107, "step": 16873 }, { "epoch": 5.0258567732087345, "grad_norm": 0.34864315390586853, "learning_rate": 1.0396332943120593e-05, "loss": 1.2073, "step": 16874 }, { "epoch": 5.026154619408403, "grad_norm": 0.42834699153900146, "learning_rate": 1.0395369132779713e-05, "loss": 1.2065, "step": 16875 }, { "epoch": 5.026452465608072, "grad_norm": 0.25437748432159424, "learning_rate": 1.0394405318760365e-05, "loss": 1.2193, "step": 16876 }, { "epoch": 5.02675031180774, "grad_norm": 0.3262318968772888, "learning_rate": 1.0393441501071517e-05, "loss": 1.2304, "step": 16877 }, { "epoch": 5.027048158007409, "grad_norm": 0.3492392897605896, "learning_rate": 1.0392477679722135e-05, "loss": 1.2346, "step": 16878 }, { "epoch": 5.027346004207078, "grad_norm": 0.31459948420524597, "learning_rate": 1.0391513854721187e-05, "loss": 1.2128, "step": 16879 }, { "epoch": 5.027643850406746, "grad_norm": 0.4275921881198883, "learning_rate": 1.0390550026077642e-05, "loss": 1.2092, "step": 16880 }, { "epoch": 5.0279416966064145, "grad_norm": 0.27070149779319763, "learning_rate": 1.0389586193800462e-05, "loss": 1.2135, "step": 16881 }, { "epoch": 5.028239542806084, "grad_norm": 0.7566307187080383, "learning_rate": 1.0388622357898621e-05, "loss": 1.2071, "step": 16882 }, { "epoch": 5.028537389005752, "grad_norm": 0.4427952170372009, "learning_rate": 1.0387658518381078e-05, "loss": 1.2148, "step": 16883 }, { "epoch": 5.02883523520542, "grad_norm": 0.6872209906578064, "learning_rate": 1.038669467525681e-05, "loss": 1.2323, "step": 16884 }, { "epoch": 5.02913308140509, "grad_norm": 0.30873167514801025, "learning_rate": 1.038573082853478e-05, "loss": 1.1977, "step": 16885 }, { "epoch": 5.029430927604758, "grad_norm": 0.7701168060302734, "learning_rate": 1.0384766978223954e-05, "loss": 1.206, "step": 16886 }, { "epoch": 5.029728773804427, "grad_norm": 0.3259108066558838, "learning_rate": 1.0383803124333302e-05, "loss": 1.2131, "step": 16887 }, { "epoch": 5.0300266200040955, "grad_norm": 0.4072898030281067, "learning_rate": 1.038283926687179e-05, "loss": 1.2214, "step": 16888 }, { "epoch": 5.030324466203764, "grad_norm": 0.31779375672340393, "learning_rate": 1.0381875405848387e-05, "loss": 1.2164, "step": 16889 }, { "epoch": 5.030622312403433, "grad_norm": 0.27759429812431335, "learning_rate": 1.0380911541272059e-05, "loss": 1.208, "step": 16890 }, { "epoch": 5.030920158603101, "grad_norm": 0.31237512826919556, "learning_rate": 1.0379947673151778e-05, "loss": 1.2114, "step": 16891 }, { "epoch": 5.03121800480277, "grad_norm": 0.3162155747413635, "learning_rate": 1.0378983801496505e-05, "loss": 1.2206, "step": 16892 }, { "epoch": 5.031515851002439, "grad_norm": 0.2899010479450226, "learning_rate": 1.0378019926315212e-05, "loss": 1.2239, "step": 16893 }, { "epoch": 5.031813697202107, "grad_norm": 0.39960283041000366, "learning_rate": 1.0377056047616864e-05, "loss": 1.2271, "step": 16894 }, { "epoch": 5.032111543401776, "grad_norm": 0.26622918248176575, "learning_rate": 1.0376092165410434e-05, "loss": 1.2148, "step": 16895 }, { "epoch": 5.032409389601445, "grad_norm": 0.43999436497688293, "learning_rate": 1.0375128279704885e-05, "loss": 1.2341, "step": 16896 }, { "epoch": 5.032707235801113, "grad_norm": 0.2580682635307312, "learning_rate": 1.0374164390509187e-05, "loss": 1.2047, "step": 16897 }, { "epoch": 5.0330050820007814, "grad_norm": 0.3405083417892456, "learning_rate": 1.037320049783231e-05, "loss": 1.21, "step": 16898 }, { "epoch": 5.033302928200451, "grad_norm": 0.34114450216293335, "learning_rate": 1.0372236601683214e-05, "loss": 1.2241, "step": 16899 }, { "epoch": 5.033600774400119, "grad_norm": 0.37767964601516724, "learning_rate": 1.0371272702070875e-05, "loss": 1.2111, "step": 16900 }, { "epoch": 5.033898620599788, "grad_norm": 0.3744494915008545, "learning_rate": 1.0370308799004256e-05, "loss": 1.2207, "step": 16901 }, { "epoch": 5.0341964667994565, "grad_norm": 0.27336758375167847, "learning_rate": 1.0369344892492333e-05, "loss": 1.2101, "step": 16902 }, { "epoch": 5.034494312999125, "grad_norm": 0.472644567489624, "learning_rate": 1.0368380982544064e-05, "loss": 1.2285, "step": 16903 }, { "epoch": 5.034792159198794, "grad_norm": 0.2490297257900238, "learning_rate": 1.0367417069168422e-05, "loss": 1.2266, "step": 16904 }, { "epoch": 5.035090005398462, "grad_norm": 0.5181571841239929, "learning_rate": 1.0366453152374376e-05, "loss": 1.2284, "step": 16905 }, { "epoch": 5.035387851598131, "grad_norm": 0.3122056722640991, "learning_rate": 1.0365489232170893e-05, "loss": 1.2183, "step": 16906 }, { "epoch": 5.0356856977978, "grad_norm": 0.4880363941192627, "learning_rate": 1.0364525308566937e-05, "loss": 1.2227, "step": 16907 }, { "epoch": 5.035983543997468, "grad_norm": 0.47182410955429077, "learning_rate": 1.0363561381571485e-05, "loss": 1.2265, "step": 16908 }, { "epoch": 5.036281390197137, "grad_norm": 0.36670053005218506, "learning_rate": 1.0362597451193499e-05, "loss": 1.2301, "step": 16909 }, { "epoch": 5.036579236396806, "grad_norm": 0.64680016040802, "learning_rate": 1.0361633517441949e-05, "loss": 1.2088, "step": 16910 }, { "epoch": 5.036877082596474, "grad_norm": 0.3548673689365387, "learning_rate": 1.0360669580325802e-05, "loss": 1.2177, "step": 16911 }, { "epoch": 5.0371749287961425, "grad_norm": 0.4463796317577362, "learning_rate": 1.0359705639854027e-05, "loss": 1.2258, "step": 16912 }, { "epoch": 5.037472774995812, "grad_norm": 0.3287397623062134, "learning_rate": 1.0358741696035594e-05, "loss": 1.2183, "step": 16913 }, { "epoch": 5.03777062119548, "grad_norm": 0.40949130058288574, "learning_rate": 1.0357777748879472e-05, "loss": 1.2312, "step": 16914 }, { "epoch": 5.038068467395149, "grad_norm": 0.4259876012802124, "learning_rate": 1.0356813798394628e-05, "loss": 1.1945, "step": 16915 }, { "epoch": 5.038366313594818, "grad_norm": 0.27881747484207153, "learning_rate": 1.0355849844590029e-05, "loss": 1.2139, "step": 16916 }, { "epoch": 5.038664159794486, "grad_norm": 0.3707573711872101, "learning_rate": 1.0354885887474644e-05, "loss": 1.2116, "step": 16917 }, { "epoch": 5.038962005994155, "grad_norm": 0.31895044445991516, "learning_rate": 1.0353921927057442e-05, "loss": 1.2255, "step": 16918 }, { "epoch": 5.039259852193823, "grad_norm": 0.2716042995452881, "learning_rate": 1.0352957963347393e-05, "loss": 1.2233, "step": 16919 }, { "epoch": 5.039557698393492, "grad_norm": 0.3826630115509033, "learning_rate": 1.0351993996353463e-05, "loss": 1.2243, "step": 16920 }, { "epoch": 5.039855544593161, "grad_norm": 0.2522062063217163, "learning_rate": 1.0351030026084624e-05, "loss": 1.2282, "step": 16921 }, { "epoch": 5.040153390792829, "grad_norm": 0.3618514835834503, "learning_rate": 1.0350066052549842e-05, "loss": 1.2363, "step": 16922 }, { "epoch": 5.040451236992498, "grad_norm": 0.29095321893692017, "learning_rate": 1.0349102075758089e-05, "loss": 1.2134, "step": 16923 }, { "epoch": 5.040749083192167, "grad_norm": 0.30500462651252747, "learning_rate": 1.0348138095718327e-05, "loss": 1.1962, "step": 16924 }, { "epoch": 5.041046929391835, "grad_norm": 0.35836461186408997, "learning_rate": 1.0347174112439528e-05, "loss": 1.2178, "step": 16925 }, { "epoch": 5.0413447755915035, "grad_norm": 0.3042157292366028, "learning_rate": 1.0346210125930667e-05, "loss": 1.2136, "step": 16926 }, { "epoch": 5.041642621791173, "grad_norm": 0.35214394330978394, "learning_rate": 1.0345246136200704e-05, "loss": 1.2086, "step": 16927 }, { "epoch": 5.041940467990841, "grad_norm": 0.2565341889858246, "learning_rate": 1.0344282143258612e-05, "loss": 1.2281, "step": 16928 }, { "epoch": 5.04223831419051, "grad_norm": 0.36944258213043213, "learning_rate": 1.0343318147113364e-05, "loss": 1.2198, "step": 16929 }, { "epoch": 5.042536160390179, "grad_norm": 0.27086594700813293, "learning_rate": 1.0342354147773917e-05, "loss": 1.1991, "step": 16930 }, { "epoch": 5.042834006589847, "grad_norm": 0.3638143837451935, "learning_rate": 1.034139014524925e-05, "loss": 1.2186, "step": 16931 }, { "epoch": 5.043131852789516, "grad_norm": 0.34115177392959595, "learning_rate": 1.0340426139548332e-05, "loss": 1.2135, "step": 16932 }, { "epoch": 5.0434296989891845, "grad_norm": 0.4475928843021393, "learning_rate": 1.0339462130680125e-05, "loss": 1.2159, "step": 16933 }, { "epoch": 5.043727545188853, "grad_norm": 0.33746838569641113, "learning_rate": 1.0338498118653604e-05, "loss": 1.2213, "step": 16934 }, { "epoch": 5.044025391388522, "grad_norm": 0.26538002490997314, "learning_rate": 1.0337534103477736e-05, "loss": 1.2115, "step": 16935 }, { "epoch": 5.04432323758819, "grad_norm": 0.30209416151046753, "learning_rate": 1.0336570085161489e-05, "loss": 1.2304, "step": 16936 }, { "epoch": 5.044621083787859, "grad_norm": 0.2903737723827362, "learning_rate": 1.0335606063713835e-05, "loss": 1.2124, "step": 16937 }, { "epoch": 5.044918929987528, "grad_norm": 0.3577030599117279, "learning_rate": 1.0334642039143741e-05, "loss": 1.2226, "step": 16938 }, { "epoch": 5.045216776187196, "grad_norm": 0.29479989409446716, "learning_rate": 1.0333678011460178e-05, "loss": 1.221, "step": 16939 }, { "epoch": 5.0455146223868645, "grad_norm": 0.32492542266845703, "learning_rate": 1.0332713980672113e-05, "loss": 1.2151, "step": 16940 }, { "epoch": 5.045812468586534, "grad_norm": 0.2970866560935974, "learning_rate": 1.0331749946788514e-05, "loss": 1.2135, "step": 16941 }, { "epoch": 5.046110314786202, "grad_norm": 0.29935309290885925, "learning_rate": 1.0330785909818355e-05, "loss": 1.199, "step": 16942 }, { "epoch": 5.046408160985871, "grad_norm": 0.39471688866615295, "learning_rate": 1.0329821869770603e-05, "loss": 1.2133, "step": 16943 }, { "epoch": 5.04670600718554, "grad_norm": 0.3253411650657654, "learning_rate": 1.0328857826654223e-05, "loss": 1.2164, "step": 16944 }, { "epoch": 5.047003853385208, "grad_norm": 0.25424107909202576, "learning_rate": 1.0327893780478191e-05, "loss": 1.2234, "step": 16945 }, { "epoch": 5.047301699584877, "grad_norm": 0.3143296539783478, "learning_rate": 1.0326929731251475e-05, "loss": 1.2152, "step": 16946 }, { "epoch": 5.0475995457845455, "grad_norm": 0.25473228096961975, "learning_rate": 1.0325965678983044e-05, "loss": 1.201, "step": 16947 }, { "epoch": 5.047897391984214, "grad_norm": 0.33592459559440613, "learning_rate": 1.0325001623681863e-05, "loss": 1.2213, "step": 16948 }, { "epoch": 5.048195238183883, "grad_norm": 0.3559499979019165, "learning_rate": 1.0324037565356909e-05, "loss": 1.2004, "step": 16949 }, { "epoch": 5.048493084383551, "grad_norm": 0.28112372756004333, "learning_rate": 1.0323073504017142e-05, "loss": 1.1925, "step": 16950 }, { "epoch": 5.04879093058322, "grad_norm": 0.29977861046791077, "learning_rate": 1.0322109439671542e-05, "loss": 1.2019, "step": 16951 }, { "epoch": 5.049088776782889, "grad_norm": 0.4374518394470215, "learning_rate": 1.0321145372329071e-05, "loss": 1.2181, "step": 16952 }, { "epoch": 5.049386622982557, "grad_norm": 0.3195934295654297, "learning_rate": 1.0320181301998702e-05, "loss": 1.2192, "step": 16953 }, { "epoch": 5.0496844691822265, "grad_norm": 0.37285518646240234, "learning_rate": 1.0319217228689401e-05, "loss": 1.2041, "step": 16954 }, { "epoch": 5.049982315381895, "grad_norm": 0.341904878616333, "learning_rate": 1.0318253152410145e-05, "loss": 1.2203, "step": 16955 }, { "epoch": 5.050280161581563, "grad_norm": 0.41418173909187317, "learning_rate": 1.0317289073169898e-05, "loss": 1.2049, "step": 16956 }, { "epoch": 5.050578007781232, "grad_norm": 0.4288500249385834, "learning_rate": 1.0316324990977629e-05, "loss": 1.2099, "step": 16957 }, { "epoch": 5.050875853980901, "grad_norm": 0.31045854091644287, "learning_rate": 1.031536090584231e-05, "loss": 1.2121, "step": 16958 }, { "epoch": 5.051173700180569, "grad_norm": 0.3993091285228729, "learning_rate": 1.0314396817772909e-05, "loss": 1.2185, "step": 16959 }, { "epoch": 5.051471546380238, "grad_norm": 0.26940515637397766, "learning_rate": 1.0313432726778398e-05, "loss": 1.2201, "step": 16960 }, { "epoch": 5.0517693925799065, "grad_norm": 0.2706749439239502, "learning_rate": 1.0312468632867745e-05, "loss": 1.2141, "step": 16961 }, { "epoch": 5.052067238779575, "grad_norm": 0.35923677682876587, "learning_rate": 1.031150453604992e-05, "loss": 1.2192, "step": 16962 }, { "epoch": 5.052365084979244, "grad_norm": 0.31290218234062195, "learning_rate": 1.0310540436333898e-05, "loss": 1.2273, "step": 16963 }, { "epoch": 5.052662931178912, "grad_norm": 0.3103160858154297, "learning_rate": 1.030957633372864e-05, "loss": 1.2115, "step": 16964 }, { "epoch": 5.052960777378581, "grad_norm": 0.330236554145813, "learning_rate": 1.030861222824312e-05, "loss": 1.2031, "step": 16965 }, { "epoch": 5.05325862357825, "grad_norm": 0.2710285782814026, "learning_rate": 1.0307648119886308e-05, "loss": 1.2227, "step": 16966 }, { "epoch": 5.053556469777918, "grad_norm": 0.329816609621048, "learning_rate": 1.0306684008667173e-05, "loss": 1.2153, "step": 16967 }, { "epoch": 5.0538543159775875, "grad_norm": 0.3519997000694275, "learning_rate": 1.0305719894594688e-05, "loss": 1.2145, "step": 16968 }, { "epoch": 5.054152162177256, "grad_norm": 0.25790297985076904, "learning_rate": 1.0304755777677822e-05, "loss": 1.2149, "step": 16969 }, { "epoch": 5.054450008376924, "grad_norm": 0.3011535704135895, "learning_rate": 1.0303791657925544e-05, "loss": 1.2109, "step": 16970 }, { "epoch": 5.054747854576593, "grad_norm": 0.25497984886169434, "learning_rate": 1.030282753534682e-05, "loss": 1.199, "step": 16971 }, { "epoch": 5.055045700776262, "grad_norm": 0.32787832617759705, "learning_rate": 1.0301863409950625e-05, "loss": 1.2055, "step": 16972 }, { "epoch": 5.05534354697593, "grad_norm": 0.2479628622531891, "learning_rate": 1.0300899281745934e-05, "loss": 1.216, "step": 16973 }, { "epoch": 5.055641393175599, "grad_norm": 0.3591461181640625, "learning_rate": 1.0299935150741707e-05, "loss": 1.2179, "step": 16974 }, { "epoch": 5.055939239375268, "grad_norm": 0.26028865575790405, "learning_rate": 1.0298971016946917e-05, "loss": 1.2252, "step": 16975 }, { "epoch": 5.056237085574936, "grad_norm": 0.4700208306312561, "learning_rate": 1.0298006880370536e-05, "loss": 1.2343, "step": 16976 }, { "epoch": 5.056534931774605, "grad_norm": 0.3031938970088959, "learning_rate": 1.0297042741021537e-05, "loss": 1.1976, "step": 16977 }, { "epoch": 5.056832777974273, "grad_norm": 0.29954105615615845, "learning_rate": 1.0296078598908885e-05, "loss": 1.2042, "step": 16978 }, { "epoch": 5.057130624173942, "grad_norm": 0.2670590579509735, "learning_rate": 1.0295114454041553e-05, "loss": 1.2224, "step": 16979 }, { "epoch": 5.057428470373611, "grad_norm": 0.44915083050727844, "learning_rate": 1.0294150306428513e-05, "loss": 1.2038, "step": 16980 }, { "epoch": 5.057726316573279, "grad_norm": 0.40954411029815674, "learning_rate": 1.029318615607873e-05, "loss": 1.2036, "step": 16981 }, { "epoch": 5.0580241627729485, "grad_norm": 0.2915779948234558, "learning_rate": 1.0292222003001178e-05, "loss": 1.2119, "step": 16982 }, { "epoch": 5.058322008972617, "grad_norm": 0.4107596278190613, "learning_rate": 1.0291257847204827e-05, "loss": 1.2288, "step": 16983 }, { "epoch": 5.058619855172285, "grad_norm": 0.27059832215309143, "learning_rate": 1.0290293688698647e-05, "loss": 1.2141, "step": 16984 }, { "epoch": 5.058917701371954, "grad_norm": 0.3820253908634186, "learning_rate": 1.0289329527491607e-05, "loss": 1.2276, "step": 16985 }, { "epoch": 5.059215547571623, "grad_norm": 0.2683317959308624, "learning_rate": 1.0288365363592681e-05, "loss": 1.2337, "step": 16986 }, { "epoch": 5.059513393771291, "grad_norm": 0.35911110043525696, "learning_rate": 1.028740119701084e-05, "loss": 1.2133, "step": 16987 }, { "epoch": 5.05981123997096, "grad_norm": 0.2821185290813446, "learning_rate": 1.0286437027755047e-05, "loss": 1.2327, "step": 16988 }, { "epoch": 5.060109086170629, "grad_norm": 0.31514233350753784, "learning_rate": 1.0285472855834277e-05, "loss": 1.2127, "step": 16989 }, { "epoch": 5.060406932370297, "grad_norm": 0.36523517966270447, "learning_rate": 1.0284508681257508e-05, "loss": 1.2308, "step": 16990 }, { "epoch": 5.060704778569966, "grad_norm": 0.36553582549095154, "learning_rate": 1.0283544504033697e-05, "loss": 1.212, "step": 16991 }, { "epoch": 5.0610026247696345, "grad_norm": 0.34784194827079773, "learning_rate": 1.0282580324171826e-05, "loss": 1.2338, "step": 16992 }, { "epoch": 5.061300470969303, "grad_norm": 0.34247714281082153, "learning_rate": 1.0281616141680857e-05, "loss": 1.2258, "step": 16993 }, { "epoch": 5.061598317168972, "grad_norm": 0.40866032242774963, "learning_rate": 1.0280651956569765e-05, "loss": 1.215, "step": 16994 }, { "epoch": 5.06189616336864, "grad_norm": 0.3296501636505127, "learning_rate": 1.0279687768847519e-05, "loss": 1.2232, "step": 16995 }, { "epoch": 5.06219400956831, "grad_norm": 0.543762743473053, "learning_rate": 1.0278723578523093e-05, "loss": 1.1987, "step": 16996 }, { "epoch": 5.062491855767978, "grad_norm": 0.5276671648025513, "learning_rate": 1.0277759385605457e-05, "loss": 1.2185, "step": 16997 }, { "epoch": 5.062789701967646, "grad_norm": 0.3279116451740265, "learning_rate": 1.027679519010358e-05, "loss": 1.2224, "step": 16998 }, { "epoch": 5.063087548167315, "grad_norm": 0.431532621383667, "learning_rate": 1.027583099202643e-05, "loss": 1.2311, "step": 16999 }, { "epoch": 5.063385394366984, "grad_norm": 0.26742449402809143, "learning_rate": 1.027486679138298e-05, "loss": 1.208, "step": 17000 }, { "epoch": 5.063385394366984, "eval_loss": 1.321560263633728, "eval_runtime": 22.8145, "eval_samples_per_second": 76.004, "eval_steps_per_second": 4.778, "step": 17000 }, { "epoch": 5.063683240566652, "grad_norm": 0.33452680706977844, "learning_rate": 1.0273902588182205e-05, "loss": 1.2093, "step": 17001 }, { "epoch": 5.063981086766321, "grad_norm": 0.31214168667793274, "learning_rate": 1.027293838243307e-05, "loss": 1.2083, "step": 17002 }, { "epoch": 5.06427893296599, "grad_norm": 0.2911657392978668, "learning_rate": 1.027197417414455e-05, "loss": 1.2323, "step": 17003 }, { "epoch": 5.064576779165658, "grad_norm": 0.504849374294281, "learning_rate": 1.0271009963325616e-05, "loss": 1.2218, "step": 17004 }, { "epoch": 5.064874625365327, "grad_norm": 0.35744398832321167, "learning_rate": 1.0270045749985233e-05, "loss": 1.2135, "step": 17005 }, { "epoch": 5.0651724715649955, "grad_norm": 0.3834766447544098, "learning_rate": 1.0269081534132377e-05, "loss": 1.2088, "step": 17006 }, { "epoch": 5.065470317764664, "grad_norm": 0.30820387601852417, "learning_rate": 1.0268117315776018e-05, "loss": 1.2363, "step": 17007 }, { "epoch": 5.065768163964333, "grad_norm": 0.41436249017715454, "learning_rate": 1.026715309492513e-05, "loss": 1.2097, "step": 17008 }, { "epoch": 5.066066010164001, "grad_norm": 0.28900203108787537, "learning_rate": 1.0266188871588675e-05, "loss": 1.2063, "step": 17009 }, { "epoch": 5.066363856363671, "grad_norm": 0.5163151025772095, "learning_rate": 1.0265224645775636e-05, "loss": 1.2129, "step": 17010 }, { "epoch": 5.066661702563339, "grad_norm": 0.28551915287971497, "learning_rate": 1.0264260417494976e-05, "loss": 1.2118, "step": 17011 }, { "epoch": 5.066959548763007, "grad_norm": 0.6795360445976257, "learning_rate": 1.0263296186755665e-05, "loss": 1.2144, "step": 17012 }, { "epoch": 5.0672573949626765, "grad_norm": 0.3976183831691742, "learning_rate": 1.026233195356668e-05, "loss": 1.224, "step": 17013 }, { "epoch": 5.067555241162345, "grad_norm": 0.524844229221344, "learning_rate": 1.0261367717936992e-05, "loss": 1.2074, "step": 17014 }, { "epoch": 5.067853087362013, "grad_norm": 0.350325345993042, "learning_rate": 1.0260403479875565e-05, "loss": 1.2154, "step": 17015 }, { "epoch": 5.068150933561682, "grad_norm": 0.47722166776657104, "learning_rate": 1.0259439239391376e-05, "loss": 1.2163, "step": 17016 }, { "epoch": 5.068448779761351, "grad_norm": 0.3014627695083618, "learning_rate": 1.0258474996493395e-05, "loss": 1.2247, "step": 17017 }, { "epoch": 5.068746625961019, "grad_norm": 0.36576247215270996, "learning_rate": 1.0257510751190592e-05, "loss": 1.2153, "step": 17018 }, { "epoch": 5.069044472160688, "grad_norm": 0.28036728501319885, "learning_rate": 1.025654650349194e-05, "loss": 1.2096, "step": 17019 }, { "epoch": 5.0693423183603565, "grad_norm": 0.437121719121933, "learning_rate": 1.025558225340641e-05, "loss": 1.2081, "step": 17020 }, { "epoch": 5.069640164560026, "grad_norm": 0.278429239988327, "learning_rate": 1.025461800094297e-05, "loss": 1.2183, "step": 17021 }, { "epoch": 5.069938010759694, "grad_norm": 0.4991644620895386, "learning_rate": 1.0253653746110597e-05, "loss": 1.213, "step": 17022 }, { "epoch": 5.070235856959362, "grad_norm": 0.2669813334941864, "learning_rate": 1.0252689488918257e-05, "loss": 1.2264, "step": 17023 }, { "epoch": 5.070533703159032, "grad_norm": 0.3850683867931366, "learning_rate": 1.0251725229374925e-05, "loss": 1.2246, "step": 17024 }, { "epoch": 5.0708315493587, "grad_norm": 0.299041748046875, "learning_rate": 1.025076096748957e-05, "loss": 1.2083, "step": 17025 }, { "epoch": 5.071129395558368, "grad_norm": 0.3268550634384155, "learning_rate": 1.0249796703271167e-05, "loss": 1.2006, "step": 17026 }, { "epoch": 5.0714272417580375, "grad_norm": 0.340476393699646, "learning_rate": 1.0248832436728682e-05, "loss": 1.2162, "step": 17027 }, { "epoch": 5.071725087957706, "grad_norm": 0.288663387298584, "learning_rate": 1.0247868167871091e-05, "loss": 1.2188, "step": 17028 }, { "epoch": 5.072022934157374, "grad_norm": 0.41877585649490356, "learning_rate": 1.0246903896707364e-05, "loss": 1.2143, "step": 17029 }, { "epoch": 5.072320780357043, "grad_norm": 0.26447010040283203, "learning_rate": 1.0245939623246468e-05, "loss": 1.2057, "step": 17030 }, { "epoch": 5.072618626556712, "grad_norm": 0.32551413774490356, "learning_rate": 1.0244975347497383e-05, "loss": 1.2274, "step": 17031 }, { "epoch": 5.07291647275638, "grad_norm": 0.3196723163127899, "learning_rate": 1.0244011069469072e-05, "loss": 1.2249, "step": 17032 }, { "epoch": 5.073214318956049, "grad_norm": 0.2677357792854309, "learning_rate": 1.0243046789170515e-05, "loss": 1.2089, "step": 17033 }, { "epoch": 5.073512165155718, "grad_norm": 0.2594597637653351, "learning_rate": 1.0242082506610677e-05, "loss": 1.2218, "step": 17034 }, { "epoch": 5.073810011355387, "grad_norm": 0.37905433773994446, "learning_rate": 1.0241118221798534e-05, "loss": 1.2289, "step": 17035 }, { "epoch": 5.074107857555055, "grad_norm": 0.25232434272766113, "learning_rate": 1.0240153934743052e-05, "loss": 1.2154, "step": 17036 }, { "epoch": 5.074405703754723, "grad_norm": 0.4842188060283661, "learning_rate": 1.0239189645453204e-05, "loss": 1.2251, "step": 17037 }, { "epoch": 5.074703549954393, "grad_norm": 0.2551998794078827, "learning_rate": 1.0238225353937972e-05, "loss": 1.2072, "step": 17038 }, { "epoch": 5.075001396154061, "grad_norm": 0.4909974932670593, "learning_rate": 1.0237261060206312e-05, "loss": 1.211, "step": 17039 }, { "epoch": 5.075299242353729, "grad_norm": 0.2678253650665283, "learning_rate": 1.0236296764267206e-05, "loss": 1.218, "step": 17040 }, { "epoch": 5.0755970885533985, "grad_norm": 0.42401137948036194, "learning_rate": 1.0235332466129621e-05, "loss": 1.2196, "step": 17041 }, { "epoch": 5.075894934753067, "grad_norm": 0.26835349202156067, "learning_rate": 1.0234368165802531e-05, "loss": 1.2153, "step": 17042 }, { "epoch": 5.076192780952735, "grad_norm": 0.30196458101272583, "learning_rate": 1.0233403863294907e-05, "loss": 1.2211, "step": 17043 }, { "epoch": 5.076490627152404, "grad_norm": 0.25685325264930725, "learning_rate": 1.023243955861572e-05, "loss": 1.2164, "step": 17044 }, { "epoch": 5.076788473352073, "grad_norm": 0.32649120688438416, "learning_rate": 1.0231475251773945e-05, "loss": 1.2201, "step": 17045 }, { "epoch": 5.077086319551741, "grad_norm": 0.26368629932403564, "learning_rate": 1.023051094277855e-05, "loss": 1.2056, "step": 17046 }, { "epoch": 5.07738416575141, "grad_norm": 0.3373166024684906, "learning_rate": 1.0229546631638506e-05, "loss": 1.2039, "step": 17047 }, { "epoch": 5.077682011951079, "grad_norm": 0.262728214263916, "learning_rate": 1.0228582318362791e-05, "loss": 1.2269, "step": 17048 }, { "epoch": 5.077979858150748, "grad_norm": 0.40728479623794556, "learning_rate": 1.0227618002960371e-05, "loss": 1.2304, "step": 17049 }, { "epoch": 5.078277704350416, "grad_norm": 0.2462330013513565, "learning_rate": 1.022665368544022e-05, "loss": 1.1993, "step": 17050 }, { "epoch": 5.0785755505500845, "grad_norm": 0.44621655344963074, "learning_rate": 1.0225689365811308e-05, "loss": 1.2073, "step": 17051 }, { "epoch": 5.078873396749754, "grad_norm": 0.3645736575126648, "learning_rate": 1.0224725044082612e-05, "loss": 1.222, "step": 17052 }, { "epoch": 5.079171242949422, "grad_norm": 0.26708006858825684, "learning_rate": 1.0223760720263099e-05, "loss": 1.2282, "step": 17053 }, { "epoch": 5.07946908914909, "grad_norm": 0.3193638026714325, "learning_rate": 1.022279639436174e-05, "loss": 1.2154, "step": 17054 }, { "epoch": 5.07976693534876, "grad_norm": 0.3130495846271515, "learning_rate": 1.0221832066387514e-05, "loss": 1.2205, "step": 17055 }, { "epoch": 5.080064781548428, "grad_norm": 0.2514370083808899, "learning_rate": 1.0220867736349384e-05, "loss": 1.22, "step": 17056 }, { "epoch": 5.080362627748096, "grad_norm": 0.35595253109931946, "learning_rate": 1.021990340425633e-05, "loss": 1.2175, "step": 17057 }, { "epoch": 5.080660473947765, "grad_norm": 0.34136271476745605, "learning_rate": 1.021893907011732e-05, "loss": 1.2155, "step": 17058 }, { "epoch": 5.080958320147434, "grad_norm": 0.3032478094100952, "learning_rate": 1.0217974733941325e-05, "loss": 1.2271, "step": 17059 }, { "epoch": 5.081256166347102, "grad_norm": 0.5057386159896851, "learning_rate": 1.021701039573732e-05, "loss": 1.2074, "step": 17060 }, { "epoch": 5.081554012546771, "grad_norm": 0.30139559507369995, "learning_rate": 1.0216046055514273e-05, "loss": 1.2203, "step": 17061 }, { "epoch": 5.08185185874644, "grad_norm": 0.3969147801399231, "learning_rate": 1.0215081713281162e-05, "loss": 1.2123, "step": 17062 }, { "epoch": 5.082149704946109, "grad_norm": 0.4047640860080719, "learning_rate": 1.0214117369046957e-05, "loss": 1.2153, "step": 17063 }, { "epoch": 5.082447551145777, "grad_norm": 0.2553812265396118, "learning_rate": 1.0213153022820625e-05, "loss": 1.2176, "step": 17064 }, { "epoch": 5.0827453973454455, "grad_norm": 0.2984394133090973, "learning_rate": 1.0212188674611145e-05, "loss": 1.2265, "step": 17065 }, { "epoch": 5.083043243545115, "grad_norm": 0.27930763363838196, "learning_rate": 1.0211224324427485e-05, "loss": 1.2174, "step": 17066 }, { "epoch": 5.083341089744783, "grad_norm": 0.2714507281780243, "learning_rate": 1.021025997227862e-05, "loss": 1.2113, "step": 17067 }, { "epoch": 5.083638935944451, "grad_norm": 0.3868211507797241, "learning_rate": 1.020929561817352e-05, "loss": 1.2325, "step": 17068 }, { "epoch": 5.083936782144121, "grad_norm": 0.26492103934288025, "learning_rate": 1.020833126212116e-05, "loss": 1.1994, "step": 17069 }, { "epoch": 5.084234628343789, "grad_norm": 0.37112167477607727, "learning_rate": 1.0207366904130508e-05, "loss": 1.1966, "step": 17070 }, { "epoch": 5.084532474543457, "grad_norm": 0.29438942670822144, "learning_rate": 1.0206402544210537e-05, "loss": 1.2228, "step": 17071 }, { "epoch": 5.0848303207431265, "grad_norm": 0.31094077229499817, "learning_rate": 1.0205438182370225e-05, "loss": 1.1953, "step": 17072 }, { "epoch": 5.085128166942795, "grad_norm": 0.3183203339576721, "learning_rate": 1.0204473818618539e-05, "loss": 1.2291, "step": 17073 }, { "epoch": 5.085426013142463, "grad_norm": 0.25317835807800293, "learning_rate": 1.0203509452964451e-05, "loss": 1.2387, "step": 17074 }, { "epoch": 5.085723859342132, "grad_norm": 0.2823701500892639, "learning_rate": 1.0202545085416938e-05, "loss": 1.2124, "step": 17075 }, { "epoch": 5.086021705541801, "grad_norm": 0.3418533205986023, "learning_rate": 1.0201580715984968e-05, "loss": 1.2131, "step": 17076 }, { "epoch": 5.08631955174147, "grad_norm": 0.2873172461986542, "learning_rate": 1.0200616344677512e-05, "loss": 1.2091, "step": 17077 }, { "epoch": 5.086617397941138, "grad_norm": 0.3285514712333679, "learning_rate": 1.0199651971503545e-05, "loss": 1.2142, "step": 17078 }, { "epoch": 5.0869152441408065, "grad_norm": 0.3224962055683136, "learning_rate": 1.0198687596472044e-05, "loss": 1.2327, "step": 17079 }, { "epoch": 5.087213090340476, "grad_norm": 0.3088357448577881, "learning_rate": 1.0197723219591975e-05, "loss": 1.2268, "step": 17080 }, { "epoch": 5.087510936540144, "grad_norm": 0.2855260372161865, "learning_rate": 1.0196758840872311e-05, "loss": 1.2197, "step": 17081 }, { "epoch": 5.087808782739812, "grad_norm": 0.31933191418647766, "learning_rate": 1.0195794460322026e-05, "loss": 1.2124, "step": 17082 }, { "epoch": 5.088106628939482, "grad_norm": 0.31816890835762024, "learning_rate": 1.0194830077950092e-05, "loss": 1.2271, "step": 17083 }, { "epoch": 5.08840447513915, "grad_norm": 0.49468258023262024, "learning_rate": 1.0193865693765483e-05, "loss": 1.2125, "step": 17084 }, { "epoch": 5.088702321338818, "grad_norm": 0.26526886224746704, "learning_rate": 1.019290130777717e-05, "loss": 1.2154, "step": 17085 }, { "epoch": 5.0890001675384875, "grad_norm": 0.4613696038722992, "learning_rate": 1.0191936919994127e-05, "loss": 1.2211, "step": 17086 }, { "epoch": 5.089298013738156, "grad_norm": 0.28655552864074707, "learning_rate": 1.0190972530425324e-05, "loss": 1.2128, "step": 17087 }, { "epoch": 5.089595859937825, "grad_norm": 0.45737722516059875, "learning_rate": 1.0190008139079734e-05, "loss": 1.2246, "step": 17088 }, { "epoch": 5.089893706137493, "grad_norm": 0.3493267893791199, "learning_rate": 1.0189043745966334e-05, "loss": 1.2185, "step": 17089 }, { "epoch": 5.090191552337162, "grad_norm": 0.2617943286895752, "learning_rate": 1.0188079351094088e-05, "loss": 1.2251, "step": 17090 }, { "epoch": 5.090489398536831, "grad_norm": 0.30303752422332764, "learning_rate": 1.0187114954471978e-05, "loss": 1.2262, "step": 17091 }, { "epoch": 5.090787244736499, "grad_norm": 0.3064664304256439, "learning_rate": 1.018615055610897e-05, "loss": 1.2159, "step": 17092 }, { "epoch": 5.091085090936168, "grad_norm": 0.2800910174846649, "learning_rate": 1.0185186156014043e-05, "loss": 1.2074, "step": 17093 }, { "epoch": 5.091382937135837, "grad_norm": 0.30335918068885803, "learning_rate": 1.0184221754196162e-05, "loss": 1.2294, "step": 17094 }, { "epoch": 5.091680783335505, "grad_norm": 0.25953903794288635, "learning_rate": 1.0183257350664302e-05, "loss": 1.2077, "step": 17095 }, { "epoch": 5.091978629535173, "grad_norm": 0.28466346859931946, "learning_rate": 1.018229294542744e-05, "loss": 1.2151, "step": 17096 }, { "epoch": 5.092276475734843, "grad_norm": 0.26710572838783264, "learning_rate": 1.0181328538494545e-05, "loss": 1.2169, "step": 17097 }, { "epoch": 5.092574321934511, "grad_norm": 0.4515911042690277, "learning_rate": 1.0180364129874591e-05, "loss": 1.2093, "step": 17098 }, { "epoch": 5.092872168134179, "grad_norm": 0.3598514497280121, "learning_rate": 1.017939971957655e-05, "loss": 1.2221, "step": 17099 }, { "epoch": 5.0931700143338485, "grad_norm": 0.42421722412109375, "learning_rate": 1.0178435307609396e-05, "loss": 1.2121, "step": 17100 }, { "epoch": 5.093467860533517, "grad_norm": 0.3820800483226776, "learning_rate": 1.01774708939821e-05, "loss": 1.215, "step": 17101 }, { "epoch": 5.093765706733186, "grad_norm": 0.4031168818473816, "learning_rate": 1.0176506478703634e-05, "loss": 1.2076, "step": 17102 }, { "epoch": 5.094063552932854, "grad_norm": 0.4623296856880188, "learning_rate": 1.0175542061782977e-05, "loss": 1.2228, "step": 17103 }, { "epoch": 5.094361399132523, "grad_norm": 0.24452053010463715, "learning_rate": 1.0174577643229093e-05, "loss": 1.2068, "step": 17104 }, { "epoch": 5.094659245332192, "grad_norm": 0.3492816090583801, "learning_rate": 1.017361322305096e-05, "loss": 1.2233, "step": 17105 }, { "epoch": 5.09495709153186, "grad_norm": 0.30999088287353516, "learning_rate": 1.017264880125755e-05, "loss": 1.2133, "step": 17106 }, { "epoch": 5.095254937731529, "grad_norm": 0.458389550447464, "learning_rate": 1.0171684377857836e-05, "loss": 1.2108, "step": 17107 }, { "epoch": 5.095552783931198, "grad_norm": 0.2613278925418854, "learning_rate": 1.017071995286079e-05, "loss": 1.2163, "step": 17108 }, { "epoch": 5.095850630130866, "grad_norm": 0.7034139037132263, "learning_rate": 1.0169755526275386e-05, "loss": 1.2313, "step": 17109 }, { "epoch": 5.0961484763305345, "grad_norm": 0.4257388412952423, "learning_rate": 1.01687910981106e-05, "loss": 1.22, "step": 17110 }, { "epoch": 5.096446322530204, "grad_norm": 0.363075315952301, "learning_rate": 1.0167826668375398e-05, "loss": 1.2138, "step": 17111 }, { "epoch": 5.096744168729872, "grad_norm": 0.26515504717826843, "learning_rate": 1.0166862237078756e-05, "loss": 1.2068, "step": 17112 }, { "epoch": 5.09704201492954, "grad_norm": 0.6181104779243469, "learning_rate": 1.0165897804229647e-05, "loss": 1.1972, "step": 17113 }, { "epoch": 5.09733986112921, "grad_norm": 0.32947444915771484, "learning_rate": 1.0164933369837045e-05, "loss": 1.2239, "step": 17114 }, { "epoch": 5.097637707328878, "grad_norm": 0.4215790033340454, "learning_rate": 1.0163968933909922e-05, "loss": 1.2431, "step": 17115 }, { "epoch": 5.097935553528547, "grad_norm": 0.2851150333881378, "learning_rate": 1.0163004496457252e-05, "loss": 1.2091, "step": 17116 }, { "epoch": 5.098233399728215, "grad_norm": 0.4571061432361603, "learning_rate": 1.0162040057488007e-05, "loss": 1.2235, "step": 17117 }, { "epoch": 5.098531245927884, "grad_norm": 0.2850176692008972, "learning_rate": 1.0161075617011159e-05, "loss": 1.1987, "step": 17118 }, { "epoch": 5.098829092127553, "grad_norm": 0.3262980580329895, "learning_rate": 1.0160111175035684e-05, "loss": 1.2172, "step": 17119 }, { "epoch": 5.099126938327221, "grad_norm": 0.3843151032924652, "learning_rate": 1.0159146731570556e-05, "loss": 1.2077, "step": 17120 }, { "epoch": 5.09942478452689, "grad_norm": 0.2500159740447998, "learning_rate": 1.0158182286624741e-05, "loss": 1.2274, "step": 17121 }, { "epoch": 5.099722630726559, "grad_norm": 0.4776756465435028, "learning_rate": 1.0157217840207217e-05, "loss": 1.2176, "step": 17122 }, { "epoch": 5.100020476926227, "grad_norm": 0.32060763239860535, "learning_rate": 1.0156253392326958e-05, "loss": 1.2113, "step": 17123 }, { "epoch": 5.1003183231258955, "grad_norm": 0.49617981910705566, "learning_rate": 1.0155288942992933e-05, "loss": 1.2133, "step": 17124 }, { "epoch": 5.100616169325565, "grad_norm": 0.30920445919036865, "learning_rate": 1.0154324492214123e-05, "loss": 1.2167, "step": 17125 }, { "epoch": 5.100914015525233, "grad_norm": 0.5329804420471191, "learning_rate": 1.0153360039999492e-05, "loss": 1.2076, "step": 17126 }, { "epoch": 5.101211861724901, "grad_norm": 0.38505634665489197, "learning_rate": 1.015239558635802e-05, "loss": 1.2309, "step": 17127 }, { "epoch": 5.101509707924571, "grad_norm": 0.46117085218429565, "learning_rate": 1.0151431131298674e-05, "loss": 1.2127, "step": 17128 }, { "epoch": 5.101807554124239, "grad_norm": 0.3312280774116516, "learning_rate": 1.0150466674830433e-05, "loss": 1.2096, "step": 17129 }, { "epoch": 5.102105400323908, "grad_norm": 0.6624826192855835, "learning_rate": 1.0149502216962269e-05, "loss": 1.2088, "step": 17130 }, { "epoch": 5.1024032465235765, "grad_norm": 0.39815059304237366, "learning_rate": 1.0148537757703149e-05, "loss": 1.2267, "step": 17131 }, { "epoch": 5.102701092723245, "grad_norm": 0.6149324774742126, "learning_rate": 1.0147573297062055e-05, "loss": 1.2041, "step": 17132 }, { "epoch": 5.102998938922914, "grad_norm": 0.4168391525745392, "learning_rate": 1.0146608835047955e-05, "loss": 1.2109, "step": 17133 }, { "epoch": 5.103296785122582, "grad_norm": 0.5807532072067261, "learning_rate": 1.0145644371669824e-05, "loss": 1.2124, "step": 17134 }, { "epoch": 5.103594631322251, "grad_norm": 0.25392472743988037, "learning_rate": 1.0144679906936636e-05, "loss": 1.2329, "step": 17135 }, { "epoch": 5.10389247752192, "grad_norm": 1.1156970262527466, "learning_rate": 1.0143715440857361e-05, "loss": 1.2118, "step": 17136 }, { "epoch": 5.104190323721588, "grad_norm": 0.3196142911911011, "learning_rate": 1.0142750973440975e-05, "loss": 1.2248, "step": 17137 }, { "epoch": 5.1044881699212565, "grad_norm": 0.9182085990905762, "learning_rate": 1.0141786504696452e-05, "loss": 1.2096, "step": 17138 }, { "epoch": 5.104786016120926, "grad_norm": 0.25468823313713074, "learning_rate": 1.0140822034632763e-05, "loss": 1.2209, "step": 17139 }, { "epoch": 5.105083862320594, "grad_norm": 0.8345251083374023, "learning_rate": 1.0139857563258883e-05, "loss": 1.2238, "step": 17140 }, { "epoch": 5.105381708520262, "grad_norm": 0.3281252384185791, "learning_rate": 1.0138893090583783e-05, "loss": 1.2343, "step": 17141 }, { "epoch": 5.105679554719932, "grad_norm": 0.46737855672836304, "learning_rate": 1.013792861661644e-05, "loss": 1.2216, "step": 17142 }, { "epoch": 5.1059774009196, "grad_norm": 0.45355433225631714, "learning_rate": 1.013696414136582e-05, "loss": 1.1997, "step": 17143 }, { "epoch": 5.106275247119269, "grad_norm": 0.3648901879787445, "learning_rate": 1.013599966484091e-05, "loss": 1.2295, "step": 17144 }, { "epoch": 5.1065730933189375, "grad_norm": 0.3974287211894989, "learning_rate": 1.013503518705067e-05, "loss": 1.2223, "step": 17145 }, { "epoch": 5.106870939518606, "grad_norm": 0.34418654441833496, "learning_rate": 1.013407070800408e-05, "loss": 1.1982, "step": 17146 }, { "epoch": 5.107168785718275, "grad_norm": 0.3958006501197815, "learning_rate": 1.0133106227710113e-05, "loss": 1.2162, "step": 17147 }, { "epoch": 5.107466631917943, "grad_norm": 0.2721463143825531, "learning_rate": 1.0132141746177737e-05, "loss": 1.2272, "step": 17148 }, { "epoch": 5.107764478117612, "grad_norm": 0.3730507493019104, "learning_rate": 1.0131177263415933e-05, "loss": 1.2221, "step": 17149 }, { "epoch": 5.108062324317281, "grad_norm": 0.35416150093078613, "learning_rate": 1.013021277943367e-05, "loss": 1.2154, "step": 17150 }, { "epoch": 5.108360170516949, "grad_norm": 0.28584015369415283, "learning_rate": 1.0129248294239924e-05, "loss": 1.2311, "step": 17151 }, { "epoch": 5.108658016716618, "grad_norm": 0.5706966519355774, "learning_rate": 1.0128283807843666e-05, "loss": 1.2268, "step": 17152 }, { "epoch": 5.108955862916287, "grad_norm": 0.340482622385025, "learning_rate": 1.012731932025387e-05, "loss": 1.2147, "step": 17153 }, { "epoch": 5.109253709115955, "grad_norm": 0.42856964468955994, "learning_rate": 1.012635483147951e-05, "loss": 1.2214, "step": 17154 }, { "epoch": 5.109551555315624, "grad_norm": 0.2704993486404419, "learning_rate": 1.012539034152956e-05, "loss": 1.2267, "step": 17155 }, { "epoch": 5.109849401515293, "grad_norm": 0.3846414387226105, "learning_rate": 1.0124425850412992e-05, "loss": 1.221, "step": 17156 }, { "epoch": 5.110147247714961, "grad_norm": 0.35049378871917725, "learning_rate": 1.0123461358138782e-05, "loss": 1.2149, "step": 17157 }, { "epoch": 5.11044509391463, "grad_norm": 0.2807372212409973, "learning_rate": 1.0122496864715903e-05, "loss": 1.2131, "step": 17158 }, { "epoch": 5.1107429401142985, "grad_norm": 0.3807115852832794, "learning_rate": 1.0121532370153324e-05, "loss": 1.2065, "step": 17159 }, { "epoch": 5.111040786313967, "grad_norm": 0.271686315536499, "learning_rate": 1.0120567874460024e-05, "loss": 1.2185, "step": 17160 }, { "epoch": 5.111338632513636, "grad_norm": 0.4014586806297302, "learning_rate": 1.0119603377644976e-05, "loss": 1.2171, "step": 17161 }, { "epoch": 5.111636478713304, "grad_norm": 0.2593546509742737, "learning_rate": 1.011863887971715e-05, "loss": 1.2111, "step": 17162 }, { "epoch": 5.111934324912973, "grad_norm": 0.2558513581752777, "learning_rate": 1.0117674380685523e-05, "loss": 1.1981, "step": 17163 }, { "epoch": 5.112232171112642, "grad_norm": 0.28646785020828247, "learning_rate": 1.0116709880559071e-05, "loss": 1.2059, "step": 17164 }, { "epoch": 5.11253001731231, "grad_norm": 0.3175455331802368, "learning_rate": 1.0115745379346755e-05, "loss": 1.2068, "step": 17165 }, { "epoch": 5.112827863511979, "grad_norm": 0.49774348735809326, "learning_rate": 1.0114780877057562e-05, "loss": 1.226, "step": 17166 }, { "epoch": 5.113125709711648, "grad_norm": 0.31071609258651733, "learning_rate": 1.0113816373700462e-05, "loss": 1.197, "step": 17167 }, { "epoch": 5.113423555911316, "grad_norm": 0.2739827632904053, "learning_rate": 1.011285186928443e-05, "loss": 1.23, "step": 17168 }, { "epoch": 5.113721402110985, "grad_norm": 0.2822209596633911, "learning_rate": 1.0111887363818433e-05, "loss": 1.2026, "step": 17169 }, { "epoch": 5.114019248310654, "grad_norm": 0.37194135785102844, "learning_rate": 1.0110922857311452e-05, "loss": 1.2078, "step": 17170 }, { "epoch": 5.114317094510322, "grad_norm": 0.388312429189682, "learning_rate": 1.0109958349772457e-05, "loss": 1.2242, "step": 17171 }, { "epoch": 5.114614940709991, "grad_norm": 0.2602137625217438, "learning_rate": 1.0108993841210421e-05, "loss": 1.2086, "step": 17172 }, { "epoch": 5.11491278690966, "grad_norm": 0.2920834422111511, "learning_rate": 1.0108029331634322e-05, "loss": 1.2221, "step": 17173 }, { "epoch": 5.115210633109328, "grad_norm": 0.24184873700141907, "learning_rate": 1.0107064821053127e-05, "loss": 1.2071, "step": 17174 }, { "epoch": 5.115508479308997, "grad_norm": 0.26793280243873596, "learning_rate": 1.0106100309475816e-05, "loss": 1.211, "step": 17175 }, { "epoch": 5.115806325508665, "grad_norm": 0.2672754228115082, "learning_rate": 1.010513579691136e-05, "loss": 1.1999, "step": 17176 }, { "epoch": 5.116104171708334, "grad_norm": 0.28690797090530396, "learning_rate": 1.0104171283368731e-05, "loss": 1.2122, "step": 17177 }, { "epoch": 5.116402017908003, "grad_norm": 0.3485107123851776, "learning_rate": 1.0103206768856909e-05, "loss": 1.2072, "step": 17178 }, { "epoch": 5.116699864107671, "grad_norm": 0.27620166540145874, "learning_rate": 1.0102242253384859e-05, "loss": 1.202, "step": 17179 }, { "epoch": 5.11699771030734, "grad_norm": 0.29888781905174255, "learning_rate": 1.0101277736961561e-05, "loss": 1.202, "step": 17180 }, { "epoch": 5.117295556507009, "grad_norm": 0.24393923580646515, "learning_rate": 1.0100313219595988e-05, "loss": 1.2174, "step": 17181 }, { "epoch": 5.117593402706677, "grad_norm": 0.3704393208026886, "learning_rate": 1.0099348701297107e-05, "loss": 1.2037, "step": 17182 }, { "epoch": 5.117891248906346, "grad_norm": 0.24490593373775482, "learning_rate": 1.0098384182073902e-05, "loss": 1.213, "step": 17183 }, { "epoch": 5.118189095106015, "grad_norm": 0.5924113988876343, "learning_rate": 1.0097419661935341e-05, "loss": 1.2157, "step": 17184 }, { "epoch": 5.118486941305683, "grad_norm": 0.44691896438598633, "learning_rate": 1.0096455140890403e-05, "loss": 1.2153, "step": 17185 }, { "epoch": 5.118784787505352, "grad_norm": 0.4877368211746216, "learning_rate": 1.0095490618948052e-05, "loss": 1.2307, "step": 17186 }, { "epoch": 5.119082633705021, "grad_norm": 0.4787835478782654, "learning_rate": 1.009452609611727e-05, "loss": 1.2202, "step": 17187 }, { "epoch": 5.119380479904689, "grad_norm": 0.33387571573257446, "learning_rate": 1.0093561572407027e-05, "loss": 1.2269, "step": 17188 }, { "epoch": 5.119678326104358, "grad_norm": 0.3363417983055115, "learning_rate": 1.0092597047826298e-05, "loss": 1.2053, "step": 17189 }, { "epoch": 5.1199761723040265, "grad_norm": 0.29627305269241333, "learning_rate": 1.0091632522384056e-05, "loss": 1.228, "step": 17190 }, { "epoch": 5.120274018503695, "grad_norm": 0.2782908082008362, "learning_rate": 1.0090667996089278e-05, "loss": 1.2095, "step": 17191 }, { "epoch": 5.120571864703364, "grad_norm": 0.34067395329475403, "learning_rate": 1.0089703468950937e-05, "loss": 1.2176, "step": 17192 }, { "epoch": 5.120869710903032, "grad_norm": 0.27354928851127625, "learning_rate": 1.0088738940978003e-05, "loss": 1.2138, "step": 17193 }, { "epoch": 5.121167557102701, "grad_norm": 0.40996381640434265, "learning_rate": 1.0087774412179452e-05, "loss": 1.2236, "step": 17194 }, { "epoch": 5.12146540330237, "grad_norm": 0.26813942193984985, "learning_rate": 1.008680988256426e-05, "loss": 1.2099, "step": 17195 }, { "epoch": 5.121763249502038, "grad_norm": 0.35690394043922424, "learning_rate": 1.0085845352141397e-05, "loss": 1.2072, "step": 17196 }, { "epoch": 5.122061095701707, "grad_norm": 0.27852919697761536, "learning_rate": 1.008488082091984e-05, "loss": 1.2167, "step": 17197 }, { "epoch": 5.122358941901376, "grad_norm": 0.2842017412185669, "learning_rate": 1.0083916288908565e-05, "loss": 1.2292, "step": 17198 }, { "epoch": 5.122656788101044, "grad_norm": 0.37780824303627014, "learning_rate": 1.0082951756116536e-05, "loss": 1.2029, "step": 17199 }, { "epoch": 5.122954634300713, "grad_norm": 0.2934509515762329, "learning_rate": 1.0081987222552735e-05, "loss": 1.2238, "step": 17200 }, { "epoch": 5.123252480500382, "grad_norm": 0.3488498330116272, "learning_rate": 1.0081022688226136e-05, "loss": 1.2137, "step": 17201 }, { "epoch": 5.12355032670005, "grad_norm": 0.3076275885105133, "learning_rate": 1.0080058153145715e-05, "loss": 1.2141, "step": 17202 }, { "epoch": 5.123848172899719, "grad_norm": 0.2605409026145935, "learning_rate": 1.0079093617320438e-05, "loss": 1.2083, "step": 17203 }, { "epoch": 5.1241460190993875, "grad_norm": 0.3248927593231201, "learning_rate": 1.0078129080759284e-05, "loss": 1.2143, "step": 17204 }, { "epoch": 5.124443865299056, "grad_norm": 0.31479761004447937, "learning_rate": 1.007716454347123e-05, "loss": 1.2165, "step": 17205 }, { "epoch": 5.124741711498725, "grad_norm": 0.42612096667289734, "learning_rate": 1.007620000546524e-05, "loss": 1.222, "step": 17206 }, { "epoch": 5.125039557698393, "grad_norm": 0.34823623299598694, "learning_rate": 1.0075235466750298e-05, "loss": 1.206, "step": 17207 }, { "epoch": 5.125337403898062, "grad_norm": 0.31496137380599976, "learning_rate": 1.007427092733537e-05, "loss": 1.1924, "step": 17208 }, { "epoch": 5.125635250097731, "grad_norm": 0.3191942870616913, "learning_rate": 1.007330638722944e-05, "loss": 1.2107, "step": 17209 }, { "epoch": 5.125933096297399, "grad_norm": 0.3097342550754547, "learning_rate": 1.0072341846441473e-05, "loss": 1.2052, "step": 17210 }, { "epoch": 5.1262309424970685, "grad_norm": 0.2566518187522888, "learning_rate": 1.0071377304980445e-05, "loss": 1.2096, "step": 17211 }, { "epoch": 5.126528788696737, "grad_norm": 0.2853011190891266, "learning_rate": 1.007041276285533e-05, "loss": 1.2021, "step": 17212 }, { "epoch": 5.126826634896405, "grad_norm": 0.4396412670612335, "learning_rate": 1.0069448220075106e-05, "loss": 1.2148, "step": 17213 }, { "epoch": 5.127124481096074, "grad_norm": 0.5236174464225769, "learning_rate": 1.0068483676648742e-05, "loss": 1.2054, "step": 17214 }, { "epoch": 5.127422327295743, "grad_norm": 0.32824137806892395, "learning_rate": 1.0067519132585215e-05, "loss": 1.2092, "step": 17215 }, { "epoch": 5.127720173495411, "grad_norm": 0.6092197895050049, "learning_rate": 1.0066554587893499e-05, "loss": 1.2026, "step": 17216 }, { "epoch": 5.12801801969508, "grad_norm": 0.25081637501716614, "learning_rate": 1.0065590042582565e-05, "loss": 1.202, "step": 17217 }, { "epoch": 5.1283158658947485, "grad_norm": 0.4604050815105438, "learning_rate": 1.0064625496661387e-05, "loss": 1.219, "step": 17218 }, { "epoch": 5.128613712094417, "grad_norm": 0.27120324969291687, "learning_rate": 1.0063660950138945e-05, "loss": 1.2328, "step": 17219 }, { "epoch": 5.128911558294086, "grad_norm": 0.4967385232448578, "learning_rate": 1.0062696403024206e-05, "loss": 1.2301, "step": 17220 }, { "epoch": 5.129209404493754, "grad_norm": 0.26437661051750183, "learning_rate": 1.0061731855326149e-05, "loss": 1.2166, "step": 17221 }, { "epoch": 5.129507250693424, "grad_norm": 0.3599362373352051, "learning_rate": 1.0060767307053746e-05, "loss": 1.2215, "step": 17222 }, { "epoch": 5.129805096893092, "grad_norm": 0.27200520038604736, "learning_rate": 1.0059802758215971e-05, "loss": 1.1992, "step": 17223 }, { "epoch": 5.13010294309276, "grad_norm": 0.5720693469047546, "learning_rate": 1.0058838208821795e-05, "loss": 1.2138, "step": 17224 }, { "epoch": 5.1304007892924295, "grad_norm": 0.3677472174167633, "learning_rate": 1.0057873658880197e-05, "loss": 1.1955, "step": 17225 }, { "epoch": 5.130698635492098, "grad_norm": 0.3923113942146301, "learning_rate": 1.0056909108400153e-05, "loss": 1.2113, "step": 17226 }, { "epoch": 5.130996481691766, "grad_norm": 0.37376493215560913, "learning_rate": 1.0055944557390629e-05, "loss": 1.2135, "step": 17227 }, { "epoch": 5.131294327891435, "grad_norm": 0.29747623205184937, "learning_rate": 1.0054980005860604e-05, "loss": 1.2151, "step": 17228 }, { "epoch": 5.131592174091104, "grad_norm": 0.2658478617668152, "learning_rate": 1.0054015453819052e-05, "loss": 1.2116, "step": 17229 }, { "epoch": 5.131890020290772, "grad_norm": 0.2868218719959259, "learning_rate": 1.0053050901274947e-05, "loss": 1.215, "step": 17230 }, { "epoch": 5.132187866490441, "grad_norm": 0.27445900440216064, "learning_rate": 1.0052086348237262e-05, "loss": 1.2141, "step": 17231 }, { "epoch": 5.13248571269011, "grad_norm": 0.28511521220207214, "learning_rate": 1.0051121794714971e-05, "loss": 1.2288, "step": 17232 }, { "epoch": 5.132783558889778, "grad_norm": 0.3396657705307007, "learning_rate": 1.0050157240717053e-05, "loss": 1.2153, "step": 17233 }, { "epoch": 5.133081405089447, "grad_norm": 0.2693929076194763, "learning_rate": 1.0049192686252474e-05, "loss": 1.2167, "step": 17234 }, { "epoch": 5.133379251289115, "grad_norm": 0.35997962951660156, "learning_rate": 1.0048228131330214e-05, "loss": 1.2178, "step": 17235 }, { "epoch": 5.133677097488785, "grad_norm": 0.25350475311279297, "learning_rate": 1.0047263575959243e-05, "loss": 1.221, "step": 17236 }, { "epoch": 5.133974943688453, "grad_norm": 0.5521942377090454, "learning_rate": 1.0046299020148538e-05, "loss": 1.2327, "step": 17237 }, { "epoch": 5.134272789888121, "grad_norm": 0.37590649724006653, "learning_rate": 1.004533446390707e-05, "loss": 1.2087, "step": 17238 }, { "epoch": 5.1345706360877905, "grad_norm": 0.46460583806037903, "learning_rate": 1.004436990724382e-05, "loss": 1.2286, "step": 17239 }, { "epoch": 5.134868482287459, "grad_norm": 0.4416270852088928, "learning_rate": 1.0043405350167757e-05, "loss": 1.2159, "step": 17240 }, { "epoch": 5.135166328487127, "grad_norm": 0.5738375186920166, "learning_rate": 1.0042440792687854e-05, "loss": 1.2152, "step": 17241 }, { "epoch": 5.135464174686796, "grad_norm": 0.5010666847229004, "learning_rate": 1.0041476234813084e-05, "loss": 1.2202, "step": 17242 }, { "epoch": 5.135762020886465, "grad_norm": 0.6015894412994385, "learning_rate": 1.0040511676552428e-05, "loss": 1.2079, "step": 17243 }, { "epoch": 5.136059867086133, "grad_norm": 0.6593461632728577, "learning_rate": 1.0039547117914856e-05, "loss": 1.2118, "step": 17244 }, { "epoch": 5.136357713285802, "grad_norm": 0.3574669063091278, "learning_rate": 1.0038582558909342e-05, "loss": 1.2159, "step": 17245 }, { "epoch": 5.136655559485471, "grad_norm": 0.3862614035606384, "learning_rate": 1.0037617999544862e-05, "loss": 1.2239, "step": 17246 }, { "epoch": 5.136953405685139, "grad_norm": 0.3918178379535675, "learning_rate": 1.0036653439830384e-05, "loss": 1.223, "step": 17247 }, { "epoch": 5.137251251884808, "grad_norm": 0.27867281436920166, "learning_rate": 1.0035688879774887e-05, "loss": 1.218, "step": 17248 }, { "epoch": 5.1375490980844765, "grad_norm": 0.5913288593292236, "learning_rate": 1.0034724319387349e-05, "loss": 1.2221, "step": 17249 }, { "epoch": 5.137846944284146, "grad_norm": 0.257610023021698, "learning_rate": 1.0033759758676738e-05, "loss": 1.2002, "step": 17250 }, { "epoch": 5.138144790483814, "grad_norm": 0.4588865041732788, "learning_rate": 1.003279519765203e-05, "loss": 1.22, "step": 17251 }, { "epoch": 5.138442636683482, "grad_norm": 0.3236146867275238, "learning_rate": 1.00318306363222e-05, "loss": 1.2074, "step": 17252 }, { "epoch": 5.1387404828831516, "grad_norm": 0.28302136063575745, "learning_rate": 1.003086607469622e-05, "loss": 1.2233, "step": 17253 }, { "epoch": 5.13903832908282, "grad_norm": 0.41775062680244446, "learning_rate": 1.0029901512783067e-05, "loss": 1.2371, "step": 17254 }, { "epoch": 5.139336175282488, "grad_norm": 0.2608824074268341, "learning_rate": 1.0028936950591712e-05, "loss": 1.2233, "step": 17255 }, { "epoch": 5.139634021482157, "grad_norm": 0.3283870816230774, "learning_rate": 1.0027972388131135e-05, "loss": 1.2045, "step": 17256 }, { "epoch": 5.139931867681826, "grad_norm": 0.3376162648200989, "learning_rate": 1.0027007825410305e-05, "loss": 1.194, "step": 17257 }, { "epoch": 5.140229713881494, "grad_norm": 0.3030361533164978, "learning_rate": 1.0026043262438195e-05, "loss": 1.2076, "step": 17258 }, { "epoch": 5.140527560081163, "grad_norm": 0.3645525872707367, "learning_rate": 1.002507869922378e-05, "loss": 1.2226, "step": 17259 }, { "epoch": 5.140825406280832, "grad_norm": 0.30467063188552856, "learning_rate": 1.002411413577604e-05, "loss": 1.2206, "step": 17260 }, { "epoch": 5.141123252480501, "grad_norm": 0.3725603222846985, "learning_rate": 1.0023149572103943e-05, "loss": 1.2415, "step": 17261 }, { "epoch": 5.141421098680169, "grad_norm": 0.2602631151676178, "learning_rate": 1.0022185008216463e-05, "loss": 1.2129, "step": 17262 }, { "epoch": 5.1417189448798375, "grad_norm": 0.5139057636260986, "learning_rate": 1.002122044412258e-05, "loss": 1.2146, "step": 17263 }, { "epoch": 5.142016791079507, "grad_norm": 0.2883543372154236, "learning_rate": 1.0020255879831264e-05, "loss": 1.2277, "step": 17264 }, { "epoch": 5.142314637279175, "grad_norm": 0.36388319730758667, "learning_rate": 1.0019291315351487e-05, "loss": 1.2146, "step": 17265 }, { "epoch": 5.142612483478843, "grad_norm": 0.2720346748828888, "learning_rate": 1.0018326750692226e-05, "loss": 1.2423, "step": 17266 }, { "epoch": 5.142910329678513, "grad_norm": 0.393311470746994, "learning_rate": 1.0017362185862459e-05, "loss": 1.2339, "step": 17267 }, { "epoch": 5.143208175878181, "grad_norm": 0.30610939860343933, "learning_rate": 1.0016397620871154e-05, "loss": 1.2124, "step": 17268 }, { "epoch": 5.143506022077849, "grad_norm": 0.30627381801605225, "learning_rate": 1.0015433055727286e-05, "loss": 1.2222, "step": 17269 }, { "epoch": 5.1438038682775185, "grad_norm": 0.29076746106147766, "learning_rate": 1.0014468490439833e-05, "loss": 1.2219, "step": 17270 }, { "epoch": 5.144101714477187, "grad_norm": 0.2629736065864563, "learning_rate": 1.0013503925017764e-05, "loss": 1.228, "step": 17271 }, { "epoch": 5.144399560676855, "grad_norm": 0.29096749424934387, "learning_rate": 1.0012539359470059e-05, "loss": 1.2282, "step": 17272 }, { "epoch": 5.144697406876524, "grad_norm": 0.3243168294429779, "learning_rate": 1.0011574793805688e-05, "loss": 1.2092, "step": 17273 }, { "epoch": 5.144995253076193, "grad_norm": 0.2627241313457489, "learning_rate": 1.0010610228033629e-05, "loss": 1.2236, "step": 17274 }, { "epoch": 5.145293099275861, "grad_norm": 0.29743027687072754, "learning_rate": 1.000964566216285e-05, "loss": 1.2117, "step": 17275 }, { "epoch": 5.14559094547553, "grad_norm": 0.2733718156814575, "learning_rate": 1.000868109620233e-05, "loss": 1.2265, "step": 17276 }, { "epoch": 5.1458887916751985, "grad_norm": 0.26167747378349304, "learning_rate": 1.0007716530161042e-05, "loss": 1.2074, "step": 17277 }, { "epoch": 5.146186637874868, "grad_norm": 0.25628238916397095, "learning_rate": 1.000675196404796e-05, "loss": 1.2209, "step": 17278 }, { "epoch": 5.146484484074536, "grad_norm": 0.3680971562862396, "learning_rate": 1.000578739787206e-05, "loss": 1.2127, "step": 17279 }, { "epoch": 5.146782330274204, "grad_norm": 0.30542105436325073, "learning_rate": 1.0004822831642316e-05, "loss": 1.208, "step": 17280 }, { "epoch": 5.147080176473874, "grad_norm": 0.3616526424884796, "learning_rate": 1.0003858265367698e-05, "loss": 1.2073, "step": 17281 }, { "epoch": 5.147378022673542, "grad_norm": 0.39930588006973267, "learning_rate": 1.0002893699057184e-05, "loss": 1.2141, "step": 17282 }, { "epoch": 5.14767586887321, "grad_norm": 0.28660163283348083, "learning_rate": 1.0001929132719743e-05, "loss": 1.2291, "step": 17283 }, { "epoch": 5.1479737150728795, "grad_norm": 0.3477869927883148, "learning_rate": 1.0000964566364361e-05, "loss": 1.211, "step": 17284 }, { "epoch": 5.148271561272548, "grad_norm": 0.3396701216697693, "learning_rate": 1e-05, "loss": 1.2195, "step": 17285 }, { "epoch": 5.148569407472216, "grad_norm": 0.4077571630477905, "learning_rate": 9.999035433635644e-06, "loss": 1.2298, "step": 17286 }, { "epoch": 5.148867253671885, "grad_norm": 0.27981382608413696, "learning_rate": 9.998070867280256e-06, "loss": 1.2029, "step": 17287 }, { "epoch": 5.149165099871554, "grad_norm": 0.40182021260261536, "learning_rate": 9.99710630094282e-06, "loss": 1.2286, "step": 17288 }, { "epoch": 5.149462946071223, "grad_norm": 0.31784218549728394, "learning_rate": 9.996141734632308e-06, "loss": 1.2135, "step": 17289 }, { "epoch": 5.149760792270891, "grad_norm": 0.28021320700645447, "learning_rate": 9.995177168357687e-06, "loss": 1.2165, "step": 17290 }, { "epoch": 5.15005863847056, "grad_norm": 0.4634260833263397, "learning_rate": 9.994212602127944e-06, "loss": 1.2148, "step": 17291 }, { "epoch": 5.150356484670229, "grad_norm": 0.4654317796230316, "learning_rate": 9.993248035952045e-06, "loss": 1.2105, "step": 17292 }, { "epoch": 5.150654330869897, "grad_norm": 0.2636204659938812, "learning_rate": 9.99228346983896e-06, "loss": 1.2008, "step": 17293 }, { "epoch": 5.150952177069565, "grad_norm": 0.3003879487514496, "learning_rate": 9.991318903797673e-06, "loss": 1.2342, "step": 17294 }, { "epoch": 5.151250023269235, "grad_norm": 0.5199518203735352, "learning_rate": 9.99035433783715e-06, "loss": 1.2129, "step": 17295 }, { "epoch": 5.151547869468903, "grad_norm": 0.31848806142807007, "learning_rate": 9.989389771966375e-06, "loss": 1.2095, "step": 17296 }, { "epoch": 5.151845715668571, "grad_norm": 0.6944403648376465, "learning_rate": 9.988425206194315e-06, "loss": 1.216, "step": 17297 }, { "epoch": 5.1521435618682405, "grad_norm": 0.5113841891288757, "learning_rate": 9.987460640529943e-06, "loss": 1.2107, "step": 17298 }, { "epoch": 5.152441408067909, "grad_norm": 0.5333827137947083, "learning_rate": 9.986496074982239e-06, "loss": 1.2103, "step": 17299 }, { "epoch": 5.152739254267577, "grad_norm": 0.4030782878398895, "learning_rate": 9.98553150956017e-06, "loss": 1.1972, "step": 17300 }, { "epoch": 5.153037100467246, "grad_norm": 0.28179246187210083, "learning_rate": 9.984566944272714e-06, "loss": 1.2131, "step": 17301 }, { "epoch": 5.153334946666915, "grad_norm": 0.5105195045471191, "learning_rate": 9.98360237912885e-06, "loss": 1.2183, "step": 17302 }, { "epoch": 5.153632792866584, "grad_norm": 0.3332255482673645, "learning_rate": 9.982637814137546e-06, "loss": 1.2217, "step": 17303 }, { "epoch": 5.153930639066252, "grad_norm": 0.3595399260520935, "learning_rate": 9.981673249307774e-06, "loss": 1.2045, "step": 17304 }, { "epoch": 5.154228485265921, "grad_norm": 0.3326016366481781, "learning_rate": 9.980708684648516e-06, "loss": 1.1971, "step": 17305 }, { "epoch": 5.15452633146559, "grad_norm": 0.36100009083747864, "learning_rate": 9.979744120168741e-06, "loss": 1.2112, "step": 17306 }, { "epoch": 5.154824177665258, "grad_norm": 0.28945910930633545, "learning_rate": 9.97877955587742e-06, "loss": 1.1938, "step": 17307 }, { "epoch": 5.1551220238649265, "grad_norm": 0.4248252511024475, "learning_rate": 9.977814991783539e-06, "loss": 1.2257, "step": 17308 }, { "epoch": 5.155419870064596, "grad_norm": 0.27031490206718445, "learning_rate": 9.976850427896063e-06, "loss": 1.225, "step": 17309 }, { "epoch": 5.155717716264264, "grad_norm": 0.44048282504081726, "learning_rate": 9.975885864223965e-06, "loss": 1.231, "step": 17310 }, { "epoch": 5.156015562463932, "grad_norm": 0.3263550102710724, "learning_rate": 9.974921300776222e-06, "loss": 1.2365, "step": 17311 }, { "epoch": 5.1563134086636015, "grad_norm": 0.49528902769088745, "learning_rate": 9.973956737561809e-06, "loss": 1.2331, "step": 17312 }, { "epoch": 5.15661125486327, "grad_norm": 0.2641638219356537, "learning_rate": 9.9729921745897e-06, "loss": 1.209, "step": 17313 }, { "epoch": 5.156909101062938, "grad_norm": 0.36507704854011536, "learning_rate": 9.97202761186887e-06, "loss": 1.2132, "step": 17314 }, { "epoch": 5.157206947262607, "grad_norm": 0.29481709003448486, "learning_rate": 9.971063049408288e-06, "loss": 1.2166, "step": 17315 }, { "epoch": 5.157504793462276, "grad_norm": 0.33992066979408264, "learning_rate": 9.970098487216937e-06, "loss": 1.2195, "step": 17316 }, { "epoch": 5.157802639661945, "grad_norm": 0.3174651861190796, "learning_rate": 9.969133925303783e-06, "loss": 1.2139, "step": 17317 }, { "epoch": 5.158100485861613, "grad_norm": 0.3178800642490387, "learning_rate": 9.968169363677803e-06, "loss": 1.2255, "step": 17318 }, { "epoch": 5.158398332061282, "grad_norm": 0.5044521689414978, "learning_rate": 9.967204802347972e-06, "loss": 1.1994, "step": 17319 }, { "epoch": 5.158696178260951, "grad_norm": 0.2825145125389099, "learning_rate": 9.966240241323267e-06, "loss": 1.2096, "step": 17320 }, { "epoch": 5.158994024460619, "grad_norm": 0.532430112361908, "learning_rate": 9.965275680612653e-06, "loss": 1.2184, "step": 17321 }, { "epoch": 5.1592918706602875, "grad_norm": 0.3720751106739044, "learning_rate": 9.964311120225116e-06, "loss": 1.2142, "step": 17322 }, { "epoch": 5.159589716859957, "grad_norm": 0.47451266646385193, "learning_rate": 9.96334656016962e-06, "loss": 1.219, "step": 17323 }, { "epoch": 5.159887563059625, "grad_norm": 0.4867680072784424, "learning_rate": 9.962382000455142e-06, "loss": 1.2282, "step": 17324 }, { "epoch": 5.160185409259293, "grad_norm": 0.34293991327285767, "learning_rate": 9.961417441090661e-06, "loss": 1.2111, "step": 17325 }, { "epoch": 5.160483255458963, "grad_norm": 0.4056837260723114, "learning_rate": 9.960452882085149e-06, "loss": 1.2323, "step": 17326 }, { "epoch": 5.160781101658631, "grad_norm": 0.33025863766670227, "learning_rate": 9.959488323447573e-06, "loss": 1.2088, "step": 17327 }, { "epoch": 5.1610789478583, "grad_norm": 0.31803029775619507, "learning_rate": 9.958523765186917e-06, "loss": 1.2342, "step": 17328 }, { "epoch": 5.1613767940579685, "grad_norm": 0.3175404667854309, "learning_rate": 9.95755920731215e-06, "loss": 1.2191, "step": 17329 }, { "epoch": 5.161674640257637, "grad_norm": 0.30470433831214905, "learning_rate": 9.956594649832246e-06, "loss": 1.2125, "step": 17330 }, { "epoch": 5.161972486457306, "grad_norm": 0.3382590115070343, "learning_rate": 9.955630092756183e-06, "loss": 1.2256, "step": 17331 }, { "epoch": 5.162270332656974, "grad_norm": 0.27963191270828247, "learning_rate": 9.95466553609293e-06, "loss": 1.2183, "step": 17332 }, { "epoch": 5.162568178856643, "grad_norm": 0.3190222978591919, "learning_rate": 9.953700979851467e-06, "loss": 1.2033, "step": 17333 }, { "epoch": 5.162866025056312, "grad_norm": 0.2902185916900635, "learning_rate": 9.952736424040764e-06, "loss": 1.2371, "step": 17334 }, { "epoch": 5.16316387125598, "grad_norm": 0.31622791290283203, "learning_rate": 9.95177186866979e-06, "loss": 1.2219, "step": 17335 }, { "epoch": 5.1634617174556485, "grad_norm": 0.34580332040786743, "learning_rate": 9.95080731374753e-06, "loss": 1.2018, "step": 17336 }, { "epoch": 5.163759563655318, "grad_norm": 0.430586040019989, "learning_rate": 9.949842759282952e-06, "loss": 1.2038, "step": 17337 }, { "epoch": 5.164057409854986, "grad_norm": 0.265218049287796, "learning_rate": 9.948878205285028e-06, "loss": 1.2268, "step": 17338 }, { "epoch": 5.164355256054654, "grad_norm": 0.4097771644592285, "learning_rate": 9.947913651762741e-06, "loss": 1.2067, "step": 17339 }, { "epoch": 5.164653102254324, "grad_norm": 0.27291157841682434, "learning_rate": 9.946949098725056e-06, "loss": 1.217, "step": 17340 }, { "epoch": 5.164950948453992, "grad_norm": 0.5387572050094604, "learning_rate": 9.94598454618095e-06, "loss": 1.2107, "step": 17341 }, { "epoch": 5.16524879465366, "grad_norm": 0.5005273818969727, "learning_rate": 9.9450199941394e-06, "loss": 1.224, "step": 17342 }, { "epoch": 5.1655466408533295, "grad_norm": 0.2867977023124695, "learning_rate": 9.944055442609371e-06, "loss": 1.1963, "step": 17343 }, { "epoch": 5.165844487052998, "grad_norm": 0.33826544880867004, "learning_rate": 9.94309089159985e-06, "loss": 1.2124, "step": 17344 }, { "epoch": 5.166142333252667, "grad_norm": 0.31648164987564087, "learning_rate": 9.942126341119806e-06, "loss": 1.2157, "step": 17345 }, { "epoch": 5.166440179452335, "grad_norm": 0.3026304841041565, "learning_rate": 9.941161791178206e-06, "loss": 1.2132, "step": 17346 }, { "epoch": 5.166738025652004, "grad_norm": 0.3801819086074829, "learning_rate": 9.940197241784032e-06, "loss": 1.1988, "step": 17347 }, { "epoch": 5.167035871851673, "grad_norm": 0.3131085932254791, "learning_rate": 9.939232692946259e-06, "loss": 1.1996, "step": 17348 }, { "epoch": 5.167333718051341, "grad_norm": 0.3868970572948456, "learning_rate": 9.938268144673853e-06, "loss": 1.2109, "step": 17349 }, { "epoch": 5.16763156425101, "grad_norm": 0.2998538613319397, "learning_rate": 9.937303596975798e-06, "loss": 1.2008, "step": 17350 }, { "epoch": 5.167929410450679, "grad_norm": 0.32032912969589233, "learning_rate": 9.93633904986106e-06, "loss": 1.2054, "step": 17351 }, { "epoch": 5.168227256650347, "grad_norm": 0.2641001045703888, "learning_rate": 9.935374503338614e-06, "loss": 1.1972, "step": 17352 }, { "epoch": 5.168525102850015, "grad_norm": 0.48422569036483765, "learning_rate": 9.934409957417438e-06, "loss": 1.207, "step": 17353 }, { "epoch": 5.168822949049685, "grad_norm": 0.25899428129196167, "learning_rate": 9.933445412106506e-06, "loss": 1.2119, "step": 17354 }, { "epoch": 5.169120795249353, "grad_norm": 0.4854237139225006, "learning_rate": 9.932480867414786e-06, "loss": 1.2002, "step": 17355 }, { "epoch": 5.169418641449022, "grad_norm": 0.25135356187820435, "learning_rate": 9.93151632335126e-06, "loss": 1.2103, "step": 17356 }, { "epoch": 5.1697164876486905, "grad_norm": 0.3396010100841522, "learning_rate": 9.930551779924899e-06, "loss": 1.2187, "step": 17357 }, { "epoch": 5.170014333848359, "grad_norm": 0.3366449177265167, "learning_rate": 9.929587237144671e-06, "loss": 1.2255, "step": 17358 }, { "epoch": 5.170312180048028, "grad_norm": 0.410990834236145, "learning_rate": 9.928622695019558e-06, "loss": 1.2166, "step": 17359 }, { "epoch": 5.170610026247696, "grad_norm": 0.29336434602737427, "learning_rate": 9.927658153558528e-06, "loss": 1.2211, "step": 17360 }, { "epoch": 5.170907872447365, "grad_norm": 0.3034501373767853, "learning_rate": 9.926693612770563e-06, "loss": 1.211, "step": 17361 }, { "epoch": 5.171205718647034, "grad_norm": 0.38883015513420105, "learning_rate": 9.925729072664632e-06, "loss": 1.2069, "step": 17362 }, { "epoch": 5.171503564846702, "grad_norm": 0.2636663317680359, "learning_rate": 9.924764533249704e-06, "loss": 1.208, "step": 17363 }, { "epoch": 5.171801411046371, "grad_norm": 0.33574822545051575, "learning_rate": 9.923799994534764e-06, "loss": 1.2327, "step": 17364 }, { "epoch": 5.17209925724604, "grad_norm": 0.28025588393211365, "learning_rate": 9.922835456528775e-06, "loss": 1.2241, "step": 17365 }, { "epoch": 5.172397103445708, "grad_norm": 0.38576653599739075, "learning_rate": 9.921870919240716e-06, "loss": 1.2103, "step": 17366 }, { "epoch": 5.1726949496453765, "grad_norm": 0.2783429026603699, "learning_rate": 9.920906382679563e-06, "loss": 1.2128, "step": 17367 }, { "epoch": 5.172992795845046, "grad_norm": 0.5975845456123352, "learning_rate": 9.91994184685429e-06, "loss": 1.2154, "step": 17368 }, { "epoch": 5.173290642044714, "grad_norm": 0.4833067059516907, "learning_rate": 9.918977311773863e-06, "loss": 1.2295, "step": 17369 }, { "epoch": 5.173588488244383, "grad_norm": 0.33450496196746826, "learning_rate": 9.918012777447266e-06, "loss": 1.2057, "step": 17370 }, { "epoch": 5.1738863344440515, "grad_norm": 0.28698986768722534, "learning_rate": 9.917048243883469e-06, "loss": 1.2013, "step": 17371 }, { "epoch": 5.17418418064372, "grad_norm": 0.6007641553878784, "learning_rate": 9.91608371109144e-06, "loss": 1.2038, "step": 17372 }, { "epoch": 5.174482026843389, "grad_norm": 0.3245409429073334, "learning_rate": 9.915119179080163e-06, "loss": 1.2287, "step": 17373 }, { "epoch": 5.174779873043057, "grad_norm": 0.3489818274974823, "learning_rate": 9.914154647858608e-06, "loss": 1.2307, "step": 17374 }, { "epoch": 5.175077719242726, "grad_norm": 0.36396849155426025, "learning_rate": 9.913190117435744e-06, "loss": 1.2152, "step": 17375 }, { "epoch": 5.175375565442395, "grad_norm": 0.4610700011253357, "learning_rate": 9.91222558782055e-06, "loss": 1.2149, "step": 17376 }, { "epoch": 5.175673411642063, "grad_norm": 0.3610832393169403, "learning_rate": 9.911261059021999e-06, "loss": 1.2042, "step": 17377 }, { "epoch": 5.175971257841732, "grad_norm": 0.3699630796909332, "learning_rate": 9.910296531049066e-06, "loss": 1.2235, "step": 17378 }, { "epoch": 5.176269104041401, "grad_norm": 0.32827654480934143, "learning_rate": 9.909332003910725e-06, "loss": 1.2286, "step": 17379 }, { "epoch": 5.176566950241069, "grad_norm": 0.4925483167171478, "learning_rate": 9.908367477615944e-06, "loss": 1.2264, "step": 17380 }, { "epoch": 5.1768647964407375, "grad_norm": 0.36989858746528625, "learning_rate": 9.907402952173705e-06, "loss": 1.218, "step": 17381 }, { "epoch": 5.177162642640407, "grad_norm": 0.6347072720527649, "learning_rate": 9.906438427592977e-06, "loss": 1.2149, "step": 17382 }, { "epoch": 5.177460488840075, "grad_norm": 0.2984049916267395, "learning_rate": 9.905473903882732e-06, "loss": 1.2115, "step": 17383 }, { "epoch": 5.177758335039744, "grad_norm": 0.27661293745040894, "learning_rate": 9.904509381051952e-06, "loss": 1.231, "step": 17384 }, { "epoch": 5.178056181239413, "grad_norm": 0.4410458207130432, "learning_rate": 9.903544859109604e-06, "loss": 1.2063, "step": 17385 }, { "epoch": 5.178354027439081, "grad_norm": 0.33633315563201904, "learning_rate": 9.90258033806466e-06, "loss": 1.2144, "step": 17386 }, { "epoch": 5.17865187363875, "grad_norm": 0.32096680998802185, "learning_rate": 9.901615817926101e-06, "loss": 1.2039, "step": 17387 }, { "epoch": 5.1789497198384185, "grad_norm": 0.3884566128253937, "learning_rate": 9.900651298702895e-06, "loss": 1.2143, "step": 17388 }, { "epoch": 5.179247566038087, "grad_norm": 0.27329471707344055, "learning_rate": 9.899686780404015e-06, "loss": 1.2063, "step": 17389 }, { "epoch": 5.179545412237756, "grad_norm": 0.34216269850730896, "learning_rate": 9.898722263038442e-06, "loss": 1.2148, "step": 17390 }, { "epoch": 5.179843258437424, "grad_norm": 0.2769976556301117, "learning_rate": 9.897757746615146e-06, "loss": 1.2117, "step": 17391 }, { "epoch": 5.180141104637093, "grad_norm": 0.33744072914123535, "learning_rate": 9.896793231143095e-06, "loss": 1.2177, "step": 17392 }, { "epoch": 5.180438950836762, "grad_norm": 0.3166804313659668, "learning_rate": 9.895828716631272e-06, "loss": 1.2223, "step": 17393 }, { "epoch": 5.18073679703643, "grad_norm": 0.2969924211502075, "learning_rate": 9.894864203088642e-06, "loss": 1.2149, "step": 17394 }, { "epoch": 5.181034643236099, "grad_norm": 0.449819952249527, "learning_rate": 9.893899690524185e-06, "loss": 1.2107, "step": 17395 }, { "epoch": 5.181332489435768, "grad_norm": 0.2913896441459656, "learning_rate": 9.892935178946875e-06, "loss": 1.2222, "step": 17396 }, { "epoch": 5.181630335635436, "grad_norm": 0.4125060439109802, "learning_rate": 9.89197066836568e-06, "loss": 1.2265, "step": 17397 }, { "epoch": 5.181928181835105, "grad_norm": 0.25802966952323914, "learning_rate": 9.89100615878958e-06, "loss": 1.204, "step": 17398 }, { "epoch": 5.182226028034774, "grad_norm": 0.6255542039871216, "learning_rate": 9.890041650227546e-06, "loss": 1.2259, "step": 17399 }, { "epoch": 5.182523874234442, "grad_norm": 0.3694813549518585, "learning_rate": 9.889077142688552e-06, "loss": 1.2026, "step": 17400 }, { "epoch": 5.182821720434111, "grad_norm": 0.484057754278183, "learning_rate": 9.888112636181568e-06, "loss": 1.2181, "step": 17401 }, { "epoch": 5.1831195666337795, "grad_norm": 0.2654632329940796, "learning_rate": 9.887148130715576e-06, "loss": 1.2032, "step": 17402 }, { "epoch": 5.183417412833448, "grad_norm": 0.4504523277282715, "learning_rate": 9.886183626299538e-06, "loss": 1.2116, "step": 17403 }, { "epoch": 5.183715259033117, "grad_norm": 0.31855273246765137, "learning_rate": 9.88521912294244e-06, "loss": 1.2243, "step": 17404 }, { "epoch": 5.184013105232785, "grad_norm": 0.42272868752479553, "learning_rate": 9.884254620653247e-06, "loss": 1.2259, "step": 17405 }, { "epoch": 5.184310951432454, "grad_norm": 0.4378338158130646, "learning_rate": 9.883290119440934e-06, "loss": 1.2033, "step": 17406 }, { "epoch": 5.184608797632123, "grad_norm": 0.41525697708129883, "learning_rate": 9.88232561931448e-06, "loss": 1.215, "step": 17407 }, { "epoch": 5.184906643831791, "grad_norm": 0.34648171067237854, "learning_rate": 9.881361120282854e-06, "loss": 1.2228, "step": 17408 }, { "epoch": 5.18520449003146, "grad_norm": 0.3187936544418335, "learning_rate": 9.880396622355026e-06, "loss": 1.2164, "step": 17409 }, { "epoch": 5.185502336231129, "grad_norm": 0.31613689661026, "learning_rate": 9.87943212553998e-06, "loss": 1.2156, "step": 17410 }, { "epoch": 5.185800182430797, "grad_norm": 0.35116851329803467, "learning_rate": 9.878467629846676e-06, "loss": 1.2082, "step": 17411 }, { "epoch": 5.186098028630466, "grad_norm": 0.33654022216796875, "learning_rate": 9.877503135284099e-06, "loss": 1.2396, "step": 17412 }, { "epoch": 5.186395874830135, "grad_norm": 0.3258071839809418, "learning_rate": 9.87653864186122e-06, "loss": 1.1907, "step": 17413 }, { "epoch": 5.186693721029803, "grad_norm": 0.31604331731796265, "learning_rate": 9.875574149587007e-06, "loss": 1.2334, "step": 17414 }, { "epoch": 5.186991567229472, "grad_norm": 0.3823065757751465, "learning_rate": 9.874609658470442e-06, "loss": 1.1986, "step": 17415 }, { "epoch": 5.1872894134291405, "grad_norm": 0.30230191349983215, "learning_rate": 9.873645168520494e-06, "loss": 1.2282, "step": 17416 }, { "epoch": 5.187587259628809, "grad_norm": 0.30811795592308044, "learning_rate": 9.872680679746132e-06, "loss": 1.2131, "step": 17417 }, { "epoch": 5.187885105828478, "grad_norm": 0.3960643708705902, "learning_rate": 9.871716192156337e-06, "loss": 1.2352, "step": 17418 }, { "epoch": 5.188182952028146, "grad_norm": 0.2723821997642517, "learning_rate": 9.870751705760079e-06, "loss": 1.2048, "step": 17419 }, { "epoch": 5.188480798227815, "grad_norm": 0.30438661575317383, "learning_rate": 9.869787220566332e-06, "loss": 1.2186, "step": 17420 }, { "epoch": 5.188778644427484, "grad_norm": 0.2908874452114105, "learning_rate": 9.86882273658407e-06, "loss": 1.2127, "step": 17421 }, { "epoch": 5.189076490627152, "grad_norm": 0.24763324856758118, "learning_rate": 9.867858253822265e-06, "loss": 1.2073, "step": 17422 }, { "epoch": 5.1893743368268215, "grad_norm": 0.43248996138572693, "learning_rate": 9.866893772289892e-06, "loss": 1.2118, "step": 17423 }, { "epoch": 5.18967218302649, "grad_norm": 0.2770494520664215, "learning_rate": 9.865929291995922e-06, "loss": 1.2275, "step": 17424 }, { "epoch": 5.189970029226158, "grad_norm": 0.40055957436561584, "learning_rate": 9.864964812949334e-06, "loss": 1.1986, "step": 17425 }, { "epoch": 5.190267875425827, "grad_norm": 0.43054723739624023, "learning_rate": 9.864000335159093e-06, "loss": 1.2248, "step": 17426 }, { "epoch": 5.190565721625496, "grad_norm": 0.2807416319847107, "learning_rate": 9.863035858634181e-06, "loss": 1.2126, "step": 17427 }, { "epoch": 5.190863567825164, "grad_norm": 0.39825111627578735, "learning_rate": 9.862071383383562e-06, "loss": 1.2228, "step": 17428 }, { "epoch": 5.191161414024833, "grad_norm": 0.2781282067298889, "learning_rate": 9.86110690941622e-06, "loss": 1.2319, "step": 17429 }, { "epoch": 5.1914592602245015, "grad_norm": 0.4083573520183563, "learning_rate": 9.860142436741122e-06, "loss": 1.2199, "step": 17430 }, { "epoch": 5.19175710642417, "grad_norm": 0.332750529050827, "learning_rate": 9.859177965367239e-06, "loss": 1.1956, "step": 17431 }, { "epoch": 5.192054952623839, "grad_norm": 0.38965967297554016, "learning_rate": 9.85821349530355e-06, "loss": 1.206, "step": 17432 }, { "epoch": 5.192352798823507, "grad_norm": 0.44553112983703613, "learning_rate": 9.857249026559028e-06, "loss": 1.2231, "step": 17433 }, { "epoch": 5.192650645023176, "grad_norm": 0.26943814754486084, "learning_rate": 9.85628455914264e-06, "loss": 1.2109, "step": 17434 }, { "epoch": 5.192948491222845, "grad_norm": 0.2865230143070221, "learning_rate": 9.855320093063366e-06, "loss": 1.2177, "step": 17435 }, { "epoch": 5.193246337422513, "grad_norm": 0.26364055275917053, "learning_rate": 9.854355628330179e-06, "loss": 1.2138, "step": 17436 }, { "epoch": 5.1935441836221825, "grad_norm": 0.2749254107475281, "learning_rate": 9.853391164952045e-06, "loss": 1.2055, "step": 17437 }, { "epoch": 5.193842029821851, "grad_norm": 0.30862370133399963, "learning_rate": 9.852426702937949e-06, "loss": 1.2385, "step": 17438 }, { "epoch": 5.194139876021519, "grad_norm": 0.23923292756080627, "learning_rate": 9.851462242296856e-06, "loss": 1.2168, "step": 17439 }, { "epoch": 5.194437722221188, "grad_norm": 0.37787410616874695, "learning_rate": 9.850497783037736e-06, "loss": 1.222, "step": 17440 }, { "epoch": 5.194735568420857, "grad_norm": 0.28293028473854065, "learning_rate": 9.849533325169568e-06, "loss": 1.2158, "step": 17441 }, { "epoch": 5.195033414620525, "grad_norm": 0.420856773853302, "learning_rate": 9.848568868701329e-06, "loss": 1.2021, "step": 17442 }, { "epoch": 5.195331260820194, "grad_norm": 0.4423983097076416, "learning_rate": 9.847604413641982e-06, "loss": 1.196, "step": 17443 }, { "epoch": 5.195629107019863, "grad_norm": 0.26101332902908325, "learning_rate": 9.846639960000512e-06, "loss": 1.2212, "step": 17444 }, { "epoch": 5.195926953219531, "grad_norm": 0.38214921951293945, "learning_rate": 9.845675507785879e-06, "loss": 1.2202, "step": 17445 }, { "epoch": 5.1962247994192, "grad_norm": 0.2879766821861267, "learning_rate": 9.844711057007068e-06, "loss": 1.2254, "step": 17446 }, { "epoch": 5.1965226456188685, "grad_norm": 0.4692278206348419, "learning_rate": 9.843746607673045e-06, "loss": 1.2331, "step": 17447 }, { "epoch": 5.196820491818537, "grad_norm": 0.2960914373397827, "learning_rate": 9.842782159792785e-06, "loss": 1.2097, "step": 17448 }, { "epoch": 5.197118338018206, "grad_norm": 0.49584126472473145, "learning_rate": 9.841817713375262e-06, "loss": 1.2179, "step": 17449 }, { "epoch": 5.197416184217874, "grad_norm": 0.354303240776062, "learning_rate": 9.840853268429451e-06, "loss": 1.1989, "step": 17450 }, { "epoch": 5.1977140304175435, "grad_norm": 0.5706049799919128, "learning_rate": 9.839888824964318e-06, "loss": 1.2098, "step": 17451 }, { "epoch": 5.198011876617212, "grad_norm": 0.2618613839149475, "learning_rate": 9.838924382988843e-06, "loss": 1.2201, "step": 17452 }, { "epoch": 5.19830972281688, "grad_norm": 0.30057284235954285, "learning_rate": 9.837959942511996e-06, "loss": 1.2042, "step": 17453 }, { "epoch": 5.198607569016549, "grad_norm": 0.3330328166484833, "learning_rate": 9.83699550354275e-06, "loss": 1.2128, "step": 17454 }, { "epoch": 5.198905415216218, "grad_norm": 0.30026569962501526, "learning_rate": 9.836031066090081e-06, "loss": 1.2241, "step": 17455 }, { "epoch": 5.199203261415886, "grad_norm": 0.2807164788246155, "learning_rate": 9.83506663016296e-06, "loss": 1.2092, "step": 17456 }, { "epoch": 5.199501107615555, "grad_norm": 0.28054577112197876, "learning_rate": 9.834102195770356e-06, "loss": 1.2205, "step": 17457 }, { "epoch": 5.199798953815224, "grad_norm": 0.3039628267288208, "learning_rate": 9.833137762921248e-06, "loss": 1.2155, "step": 17458 }, { "epoch": 5.200096800014892, "grad_norm": 0.256278932094574, "learning_rate": 9.832173331624607e-06, "loss": 1.2158, "step": 17459 }, { "epoch": 5.200394646214561, "grad_norm": 0.2707452178001404, "learning_rate": 9.831208901889405e-06, "loss": 1.2053, "step": 17460 }, { "epoch": 5.2006924924142295, "grad_norm": 0.28423362970352173, "learning_rate": 9.830244473724616e-06, "loss": 1.2057, "step": 17461 }, { "epoch": 5.200990338613899, "grad_norm": 0.25178104639053345, "learning_rate": 9.829280047139211e-06, "loss": 1.2222, "step": 17462 }, { "epoch": 5.201288184813567, "grad_norm": 0.25856050848960876, "learning_rate": 9.828315622142167e-06, "loss": 1.2144, "step": 17463 }, { "epoch": 5.201586031013235, "grad_norm": 0.3158572316169739, "learning_rate": 9.827351198742452e-06, "loss": 1.2236, "step": 17464 }, { "epoch": 5.201883877212905, "grad_norm": 0.3710111975669861, "learning_rate": 9.826386776949041e-06, "loss": 1.2191, "step": 17465 }, { "epoch": 5.202181723412573, "grad_norm": 0.37602880597114563, "learning_rate": 9.82542235677091e-06, "loss": 1.2092, "step": 17466 }, { "epoch": 5.202479569612241, "grad_norm": 0.27766236662864685, "learning_rate": 9.824457938217028e-06, "loss": 1.22, "step": 17467 }, { "epoch": 5.2027774158119104, "grad_norm": 0.2806062400341034, "learning_rate": 9.823493521296366e-06, "loss": 1.2019, "step": 17468 }, { "epoch": 5.203075262011579, "grad_norm": 0.33705195784568787, "learning_rate": 9.822529106017904e-06, "loss": 1.229, "step": 17469 }, { "epoch": 5.203373108211247, "grad_norm": 0.2857660949230194, "learning_rate": 9.821564692390607e-06, "loss": 1.2103, "step": 17470 }, { "epoch": 5.203670954410916, "grad_norm": 0.2896214425563812, "learning_rate": 9.82060028042345e-06, "loss": 1.2071, "step": 17471 }, { "epoch": 5.203968800610585, "grad_norm": 0.3136449456214905, "learning_rate": 9.81963587012541e-06, "loss": 1.2203, "step": 17472 }, { "epoch": 5.204266646810253, "grad_norm": 0.2500639855861664, "learning_rate": 9.818671461505458e-06, "loss": 1.1988, "step": 17473 }, { "epoch": 5.204564493009922, "grad_norm": 0.3904259502887726, "learning_rate": 9.81770705457256e-06, "loss": 1.2142, "step": 17474 }, { "epoch": 5.2048623392095905, "grad_norm": 0.2964465320110321, "learning_rate": 9.8167426493357e-06, "loss": 1.2073, "step": 17475 }, { "epoch": 5.205160185409259, "grad_norm": 0.3154318630695343, "learning_rate": 9.81577824580384e-06, "loss": 1.2363, "step": 17476 }, { "epoch": 5.205458031608928, "grad_norm": 0.3880579173564911, "learning_rate": 9.81481384398596e-06, "loss": 1.2217, "step": 17477 }, { "epoch": 5.205755877808596, "grad_norm": 0.261506050825119, "learning_rate": 9.813849443891031e-06, "loss": 1.2332, "step": 17478 }, { "epoch": 5.206053724008266, "grad_norm": 0.32930904626846313, "learning_rate": 9.812885045528022e-06, "loss": 1.213, "step": 17479 }, { "epoch": 5.206351570207934, "grad_norm": 0.27337881922721863, "learning_rate": 9.811920648905913e-06, "loss": 1.2198, "step": 17480 }, { "epoch": 5.206649416407602, "grad_norm": 0.32574567198753357, "learning_rate": 9.810956254033673e-06, "loss": 1.2195, "step": 17481 }, { "epoch": 5.2069472626072715, "grad_norm": 0.33570176362991333, "learning_rate": 9.809991860920267e-06, "loss": 1.2148, "step": 17482 }, { "epoch": 5.20724510880694, "grad_norm": 0.2787150740623474, "learning_rate": 9.809027469574677e-06, "loss": 1.2398, "step": 17483 }, { "epoch": 5.207542955006608, "grad_norm": 0.3820635676383972, "learning_rate": 9.808063080005878e-06, "loss": 1.2077, "step": 17484 }, { "epoch": 5.207840801206277, "grad_norm": 0.2835637629032135, "learning_rate": 9.80709869222283e-06, "loss": 1.2236, "step": 17485 }, { "epoch": 5.208138647405946, "grad_norm": 0.29232335090637207, "learning_rate": 9.806134306234519e-06, "loss": 1.2161, "step": 17486 }, { "epoch": 5.208436493605614, "grad_norm": 0.2579543888568878, "learning_rate": 9.80516992204991e-06, "loss": 1.221, "step": 17487 }, { "epoch": 5.208734339805283, "grad_norm": 0.32679247856140137, "learning_rate": 9.804205539677976e-06, "loss": 1.2198, "step": 17488 }, { "epoch": 5.2090321860049515, "grad_norm": 0.27821290493011475, "learning_rate": 9.803241159127692e-06, "loss": 1.2147, "step": 17489 }, { "epoch": 5.209330032204621, "grad_norm": 0.3944130837917328, "learning_rate": 9.802276780408031e-06, "loss": 1.2221, "step": 17490 }, { "epoch": 5.209627878404289, "grad_norm": 0.26376447081565857, "learning_rate": 9.801312403527958e-06, "loss": 1.2195, "step": 17491 }, { "epoch": 5.209925724603957, "grad_norm": 0.45310312509536743, "learning_rate": 9.800348028496456e-06, "loss": 1.2403, "step": 17492 }, { "epoch": 5.210223570803627, "grad_norm": 0.28856009244918823, "learning_rate": 9.79938365532249e-06, "loss": 1.2018, "step": 17493 }, { "epoch": 5.210521417003295, "grad_norm": 0.37543657422065735, "learning_rate": 9.798419284015034e-06, "loss": 1.2027, "step": 17494 }, { "epoch": 5.210819263202963, "grad_norm": 0.2522352635860443, "learning_rate": 9.797454914583067e-06, "loss": 1.2192, "step": 17495 }, { "epoch": 5.2111171094026325, "grad_norm": 0.7417417168617249, "learning_rate": 9.796490547035549e-06, "loss": 1.1934, "step": 17496 }, { "epoch": 5.211414955602301, "grad_norm": 0.5598888993263245, "learning_rate": 9.795526181381464e-06, "loss": 1.214, "step": 17497 }, { "epoch": 5.211712801801969, "grad_norm": 0.48745712637901306, "learning_rate": 9.79456181762978e-06, "loss": 1.2204, "step": 17498 }, { "epoch": 5.212010648001638, "grad_norm": 0.4387660026550293, "learning_rate": 9.793597455789463e-06, "loss": 1.2176, "step": 17499 }, { "epoch": 5.212308494201307, "grad_norm": 0.49820026755332947, "learning_rate": 9.792633095869495e-06, "loss": 1.2248, "step": 17500 }, { "epoch": 5.212308494201307, "eval_loss": 1.3193069696426392, "eval_runtime": 24.0255, "eval_samples_per_second": 72.173, "eval_steps_per_second": 4.537, "step": 17500 }, { "epoch": 5.212606340400975, "grad_norm": 0.2895793318748474, "learning_rate": 9.791668737878846e-06, "loss": 1.2119, "step": 17501 }, { "epoch": 5.212904186600644, "grad_norm": 0.38679054379463196, "learning_rate": 9.790704381826481e-06, "loss": 1.2001, "step": 17502 }, { "epoch": 5.213202032800313, "grad_norm": 0.3843201696872711, "learning_rate": 9.789740027721384e-06, "loss": 1.212, "step": 17503 }, { "epoch": 5.213499878999982, "grad_norm": 0.38126033544540405, "learning_rate": 9.78877567557252e-06, "loss": 1.2244, "step": 17504 }, { "epoch": 5.21379772519965, "grad_norm": 0.362287700176239, "learning_rate": 9.787811325388858e-06, "loss": 1.2214, "step": 17505 }, { "epoch": 5.2140955713993185, "grad_norm": 0.2599650025367737, "learning_rate": 9.786846977179377e-06, "loss": 1.2052, "step": 17506 }, { "epoch": 5.214393417598988, "grad_norm": 0.3888007700443268, "learning_rate": 9.785882630953048e-06, "loss": 1.206, "step": 17507 }, { "epoch": 5.214691263798656, "grad_norm": 0.3158775269985199, "learning_rate": 9.78491828671884e-06, "loss": 1.2135, "step": 17508 }, { "epoch": 5.214989109998324, "grad_norm": 0.3141838610172272, "learning_rate": 9.783953944485729e-06, "loss": 1.206, "step": 17509 }, { "epoch": 5.2152869561979935, "grad_norm": 0.3854498565196991, "learning_rate": 9.782989604262682e-06, "loss": 1.2131, "step": 17510 }, { "epoch": 5.215584802397662, "grad_norm": 0.2511853873729706, "learning_rate": 9.782025266058679e-06, "loss": 1.1993, "step": 17511 }, { "epoch": 5.21588264859733, "grad_norm": 0.539507269859314, "learning_rate": 9.781060929882684e-06, "loss": 1.2137, "step": 17512 }, { "epoch": 5.216180494796999, "grad_norm": 0.2853509485721588, "learning_rate": 9.780096595743671e-06, "loss": 1.2201, "step": 17513 }, { "epoch": 5.216478340996668, "grad_norm": 0.5206080079078674, "learning_rate": 9.779132263650618e-06, "loss": 1.2328, "step": 17514 }, { "epoch": 5.216776187196336, "grad_norm": 0.344535768032074, "learning_rate": 9.778167933612492e-06, "loss": 1.2202, "step": 17515 }, { "epoch": 5.217074033396005, "grad_norm": 0.483694851398468, "learning_rate": 9.777203605638261e-06, "loss": 1.2102, "step": 17516 }, { "epoch": 5.217371879595674, "grad_norm": 0.5337632298469543, "learning_rate": 9.776239279736903e-06, "loss": 1.218, "step": 17517 }, { "epoch": 5.217669725795343, "grad_norm": 0.5443182587623596, "learning_rate": 9.775274955917393e-06, "loss": 1.2189, "step": 17518 }, { "epoch": 5.217967571995011, "grad_norm": 0.460360050201416, "learning_rate": 9.774310634188692e-06, "loss": 1.2124, "step": 17519 }, { "epoch": 5.2182654181946795, "grad_norm": 0.36664527654647827, "learning_rate": 9.773346314559784e-06, "loss": 1.2119, "step": 17520 }, { "epoch": 5.218563264394349, "grad_norm": 0.3464254140853882, "learning_rate": 9.772381997039634e-06, "loss": 1.2099, "step": 17521 }, { "epoch": 5.218861110594017, "grad_norm": 0.4611830711364746, "learning_rate": 9.771417681637212e-06, "loss": 1.223, "step": 17522 }, { "epoch": 5.219158956793685, "grad_norm": 0.28207167983055115, "learning_rate": 9.770453368361495e-06, "loss": 1.211, "step": 17523 }, { "epoch": 5.219456802993355, "grad_norm": 0.32353290915489197, "learning_rate": 9.769489057221455e-06, "loss": 1.2154, "step": 17524 }, { "epoch": 5.219754649193023, "grad_norm": 0.3409768342971802, "learning_rate": 9.768524748226056e-06, "loss": 1.227, "step": 17525 }, { "epoch": 5.220052495392691, "grad_norm": 0.29958558082580566, "learning_rate": 9.767560441384283e-06, "loss": 1.2078, "step": 17526 }, { "epoch": 5.22035034159236, "grad_norm": 0.3059716522693634, "learning_rate": 9.766596136705095e-06, "loss": 1.2103, "step": 17527 }, { "epoch": 5.220648187792029, "grad_norm": 0.4833516478538513, "learning_rate": 9.765631834197472e-06, "loss": 1.2104, "step": 17528 }, { "epoch": 5.220946033991698, "grad_norm": 0.2676942050457001, "learning_rate": 9.764667533870382e-06, "loss": 1.2193, "step": 17529 }, { "epoch": 5.221243880191366, "grad_norm": 0.4212324321269989, "learning_rate": 9.763703235732796e-06, "loss": 1.2406, "step": 17530 }, { "epoch": 5.221541726391035, "grad_norm": 0.3484458029270172, "learning_rate": 9.76273893979369e-06, "loss": 1.2358, "step": 17531 }, { "epoch": 5.221839572590704, "grad_norm": 0.4059697389602661, "learning_rate": 9.761774646062035e-06, "loss": 1.2106, "step": 17532 }, { "epoch": 5.222137418790372, "grad_norm": 0.39221394062042236, "learning_rate": 9.760810354546794e-06, "loss": 1.2103, "step": 17533 }, { "epoch": 5.2224352649900405, "grad_norm": 0.5751363039016724, "learning_rate": 9.759846065256953e-06, "loss": 1.215, "step": 17534 }, { "epoch": 5.22273311118971, "grad_norm": 0.267691969871521, "learning_rate": 9.758881778201471e-06, "loss": 1.2276, "step": 17535 }, { "epoch": 5.223030957389378, "grad_norm": 0.41326481103897095, "learning_rate": 9.757917493389324e-06, "loss": 1.2136, "step": 17536 }, { "epoch": 5.223328803589046, "grad_norm": 0.34432196617126465, "learning_rate": 9.756953210829489e-06, "loss": 1.2152, "step": 17537 }, { "epoch": 5.223626649788716, "grad_norm": 0.29226770997047424, "learning_rate": 9.755988930530931e-06, "loss": 1.2106, "step": 17538 }, { "epoch": 5.223924495988384, "grad_norm": 0.2618924379348755, "learning_rate": 9.75502465250262e-06, "loss": 1.2285, "step": 17539 }, { "epoch": 5.224222342188052, "grad_norm": 0.34921810030937195, "learning_rate": 9.754060376753536e-06, "loss": 1.216, "step": 17540 }, { "epoch": 5.2245201883877215, "grad_norm": 0.29432594776153564, "learning_rate": 9.753096103292641e-06, "loss": 1.2133, "step": 17541 }, { "epoch": 5.22481803458739, "grad_norm": 0.2998746335506439, "learning_rate": 9.752131832128912e-06, "loss": 1.2209, "step": 17542 }, { "epoch": 5.225115880787058, "grad_norm": 0.28745922446250916, "learning_rate": 9.751167563271322e-06, "loss": 1.1923, "step": 17543 }, { "epoch": 5.225413726986727, "grad_norm": 0.31892332434654236, "learning_rate": 9.750203296728835e-06, "loss": 1.2131, "step": 17544 }, { "epoch": 5.225711573186396, "grad_norm": 0.26440367102622986, "learning_rate": 9.749239032510432e-06, "loss": 1.2247, "step": 17545 }, { "epoch": 5.226009419386065, "grad_norm": 0.26981377601623535, "learning_rate": 9.748274770625077e-06, "loss": 1.214, "step": 17546 }, { "epoch": 5.226307265585733, "grad_norm": 0.366583913564682, "learning_rate": 9.747310511081745e-06, "loss": 1.2091, "step": 17547 }, { "epoch": 5.2266051117854015, "grad_norm": 0.30591365694999695, "learning_rate": 9.746346253889406e-06, "loss": 1.2096, "step": 17548 }, { "epoch": 5.226902957985071, "grad_norm": 0.3112654387950897, "learning_rate": 9.745381999057033e-06, "loss": 1.211, "step": 17549 }, { "epoch": 5.227200804184739, "grad_norm": 0.35584914684295654, "learning_rate": 9.744417746593592e-06, "loss": 1.2054, "step": 17550 }, { "epoch": 5.227498650384407, "grad_norm": 0.26016563177108765, "learning_rate": 9.743453496508065e-06, "loss": 1.216, "step": 17551 }, { "epoch": 5.227796496584077, "grad_norm": 0.3549031615257263, "learning_rate": 9.742489248809411e-06, "loss": 1.2164, "step": 17552 }, { "epoch": 5.228094342783745, "grad_norm": 0.2862130403518677, "learning_rate": 9.741525003506606e-06, "loss": 1.1969, "step": 17553 }, { "epoch": 5.228392188983413, "grad_norm": 0.3492962121963501, "learning_rate": 9.740560760608627e-06, "loss": 1.2084, "step": 17554 }, { "epoch": 5.2286900351830825, "grad_norm": 0.4583175480365753, "learning_rate": 9.73959652012444e-06, "loss": 1.2102, "step": 17555 }, { "epoch": 5.228987881382751, "grad_norm": 0.24967622756958008, "learning_rate": 9.738632282063013e-06, "loss": 1.2008, "step": 17556 }, { "epoch": 5.22928572758242, "grad_norm": 0.4711858630180359, "learning_rate": 9.737668046433322e-06, "loss": 1.203, "step": 17557 }, { "epoch": 5.229583573782088, "grad_norm": 0.28161999583244324, "learning_rate": 9.736703813244336e-06, "loss": 1.2067, "step": 17558 }, { "epoch": 5.229881419981757, "grad_norm": 0.4732733368873596, "learning_rate": 9.735739582505026e-06, "loss": 1.2155, "step": 17559 }, { "epoch": 5.230179266181426, "grad_norm": 0.366251677274704, "learning_rate": 9.734775354224368e-06, "loss": 1.1981, "step": 17560 }, { "epoch": 5.230477112381094, "grad_norm": 0.3029963970184326, "learning_rate": 9.733811128411323e-06, "loss": 1.2099, "step": 17561 }, { "epoch": 5.230774958580763, "grad_norm": 0.33303940296173096, "learning_rate": 9.732846905074874e-06, "loss": 1.2302, "step": 17562 }, { "epoch": 5.231072804780432, "grad_norm": 0.3316311240196228, "learning_rate": 9.731882684223985e-06, "loss": 1.2288, "step": 17563 }, { "epoch": 5.2313706509801, "grad_norm": 0.2685508728027344, "learning_rate": 9.730918465867624e-06, "loss": 1.2142, "step": 17564 }, { "epoch": 5.2316684971797685, "grad_norm": 0.2979573905467987, "learning_rate": 9.729954250014769e-06, "loss": 1.2045, "step": 17565 }, { "epoch": 5.231966343379438, "grad_norm": 0.24490252137184143, "learning_rate": 9.728990036674391e-06, "loss": 1.224, "step": 17566 }, { "epoch": 5.232264189579106, "grad_norm": 0.2672775089740753, "learning_rate": 9.728025825855452e-06, "loss": 1.2051, "step": 17567 }, { "epoch": 5.232562035778774, "grad_norm": 0.2893596887588501, "learning_rate": 9.727061617566933e-06, "loss": 1.2056, "step": 17568 }, { "epoch": 5.2328598819784435, "grad_norm": 0.2536107301712036, "learning_rate": 9.726097411817798e-06, "loss": 1.219, "step": 17569 }, { "epoch": 5.233157728178112, "grad_norm": 0.3155689835548401, "learning_rate": 9.725133208617023e-06, "loss": 1.2312, "step": 17570 }, { "epoch": 5.233455574377781, "grad_norm": 0.3005101680755615, "learning_rate": 9.724169007973575e-06, "loss": 1.2119, "step": 17571 }, { "epoch": 5.233753420577449, "grad_norm": 0.3837868273258209, "learning_rate": 9.723204809896427e-06, "loss": 1.2167, "step": 17572 }, { "epoch": 5.234051266777118, "grad_norm": 0.4749842882156372, "learning_rate": 9.722240614394546e-06, "loss": 1.2254, "step": 17573 }, { "epoch": 5.234349112976787, "grad_norm": 1.0652549266815186, "learning_rate": 9.72127642147691e-06, "loss": 1.2283, "step": 17574 }, { "epoch": 5.234646959176455, "grad_norm": 0.4877004027366638, "learning_rate": 9.720312231152481e-06, "loss": 1.1977, "step": 17575 }, { "epoch": 5.234944805376124, "grad_norm": 0.5479477643966675, "learning_rate": 9.719348043430239e-06, "loss": 1.209, "step": 17576 }, { "epoch": 5.235242651575793, "grad_norm": 0.26567378640174866, "learning_rate": 9.718383858319146e-06, "loss": 1.212, "step": 17577 }, { "epoch": 5.235540497775461, "grad_norm": 0.5459462404251099, "learning_rate": 9.717419675828176e-06, "loss": 1.2013, "step": 17578 }, { "epoch": 5.2358383439751295, "grad_norm": 0.4284096360206604, "learning_rate": 9.716455495966305e-06, "loss": 1.2098, "step": 17579 }, { "epoch": 5.236136190174799, "grad_norm": 0.34207162261009216, "learning_rate": 9.715491318742499e-06, "loss": 1.214, "step": 17580 }, { "epoch": 5.236434036374467, "grad_norm": 0.3737085461616516, "learning_rate": 9.714527144165721e-06, "loss": 1.2201, "step": 17581 }, { "epoch": 5.236731882574135, "grad_norm": 0.3189679980278015, "learning_rate": 9.713562972244955e-06, "loss": 1.2147, "step": 17582 }, { "epoch": 5.237029728773805, "grad_norm": 0.4002085030078888, "learning_rate": 9.712598802989166e-06, "loss": 1.2063, "step": 17583 }, { "epoch": 5.237327574973473, "grad_norm": 0.2849947214126587, "learning_rate": 9.711634636407319e-06, "loss": 1.2209, "step": 17584 }, { "epoch": 5.237625421173142, "grad_norm": 0.31420210003852844, "learning_rate": 9.710670472508395e-06, "loss": 1.2154, "step": 17585 }, { "epoch": 5.23792326737281, "grad_norm": 0.3056415915489197, "learning_rate": 9.709706311301358e-06, "loss": 1.2037, "step": 17586 }, { "epoch": 5.238221113572479, "grad_norm": 0.24837148189544678, "learning_rate": 9.708742152795176e-06, "loss": 1.1954, "step": 17587 }, { "epoch": 5.238518959772148, "grad_norm": 0.2823173403739929, "learning_rate": 9.707777996998825e-06, "loss": 1.2145, "step": 17588 }, { "epoch": 5.238816805971816, "grad_norm": 0.33144593238830566, "learning_rate": 9.706813843921274e-06, "loss": 1.2109, "step": 17589 }, { "epoch": 5.239114652171485, "grad_norm": 0.2580885887145996, "learning_rate": 9.70584969357149e-06, "loss": 1.2253, "step": 17590 }, { "epoch": 5.239412498371154, "grad_norm": 0.2694717347621918, "learning_rate": 9.70488554595845e-06, "loss": 1.2105, "step": 17591 }, { "epoch": 5.239710344570822, "grad_norm": 0.3748553991317749, "learning_rate": 9.703921401091115e-06, "loss": 1.2233, "step": 17592 }, { "epoch": 5.2400081907704905, "grad_norm": 0.35062405467033386, "learning_rate": 9.702957258978466e-06, "loss": 1.2265, "step": 17593 }, { "epoch": 5.24030603697016, "grad_norm": 0.31125640869140625, "learning_rate": 9.701993119629465e-06, "loss": 1.2274, "step": 17594 }, { "epoch": 5.240603883169828, "grad_norm": 0.4129265248775482, "learning_rate": 9.701028983053083e-06, "loss": 1.217, "step": 17595 }, { "epoch": 5.240901729369497, "grad_norm": 0.2553410530090332, "learning_rate": 9.700064849258298e-06, "loss": 1.2156, "step": 17596 }, { "epoch": 5.241199575569166, "grad_norm": 0.36992135643959045, "learning_rate": 9.699100718254071e-06, "loss": 1.2004, "step": 17597 }, { "epoch": 5.241497421768834, "grad_norm": 0.2936934530735016, "learning_rate": 9.698136590049375e-06, "loss": 1.2098, "step": 17598 }, { "epoch": 5.241795267968503, "grad_norm": 0.4095652997493744, "learning_rate": 9.697172464653184e-06, "loss": 1.2192, "step": 17599 }, { "epoch": 5.2420931141681715, "grad_norm": 0.27325400710105896, "learning_rate": 9.696208342074461e-06, "loss": 1.2032, "step": 17600 }, { "epoch": 5.24239096036784, "grad_norm": 0.4788769483566284, "learning_rate": 9.69524422232218e-06, "loss": 1.2196, "step": 17601 }, { "epoch": 5.242688806567509, "grad_norm": 0.2858133912086487, "learning_rate": 9.694280105405314e-06, "loss": 1.2188, "step": 17602 }, { "epoch": 5.242986652767177, "grad_norm": 0.3255537748336792, "learning_rate": 9.69331599133283e-06, "loss": 1.2285, "step": 17603 }, { "epoch": 5.243284498966846, "grad_norm": 0.31990641355514526, "learning_rate": 9.692351880113695e-06, "loss": 1.2128, "step": 17604 }, { "epoch": 5.243582345166515, "grad_norm": 0.25648993253707886, "learning_rate": 9.691387771756883e-06, "loss": 1.2196, "step": 17605 }, { "epoch": 5.243880191366183, "grad_norm": 0.2624090909957886, "learning_rate": 9.690423666271365e-06, "loss": 1.2048, "step": 17606 }, { "epoch": 5.2441780375658515, "grad_norm": 0.2932678163051605, "learning_rate": 9.689459563666105e-06, "loss": 1.2028, "step": 17607 }, { "epoch": 5.244475883765521, "grad_norm": 0.27394574880599976, "learning_rate": 9.688495463950081e-06, "loss": 1.2082, "step": 17608 }, { "epoch": 5.244773729965189, "grad_norm": 0.2619938850402832, "learning_rate": 9.687531367132257e-06, "loss": 1.1827, "step": 17609 }, { "epoch": 5.245071576164857, "grad_norm": 0.3157538175582886, "learning_rate": 9.686567273221605e-06, "loss": 1.2123, "step": 17610 }, { "epoch": 5.245369422364527, "grad_norm": 0.38306286931037903, "learning_rate": 9.685603182227093e-06, "loss": 1.2202, "step": 17611 }, { "epoch": 5.245667268564195, "grad_norm": 0.3174692988395691, "learning_rate": 9.68463909415769e-06, "loss": 1.2023, "step": 17612 }, { "epoch": 5.245965114763864, "grad_norm": 0.2601618766784668, "learning_rate": 9.683675009022375e-06, "loss": 1.2242, "step": 17613 }, { "epoch": 5.2462629609635325, "grad_norm": 0.30510395765304565, "learning_rate": 9.682710926830107e-06, "loss": 1.2169, "step": 17614 }, { "epoch": 5.246560807163201, "grad_norm": 0.2462833821773529, "learning_rate": 9.681746847589856e-06, "loss": 1.221, "step": 17615 }, { "epoch": 5.24685865336287, "grad_norm": 0.3740920424461365, "learning_rate": 9.6807827713106e-06, "loss": 1.1939, "step": 17616 }, { "epoch": 5.247156499562538, "grad_norm": 0.33551445603370667, "learning_rate": 9.679818698001303e-06, "loss": 1.208, "step": 17617 }, { "epoch": 5.247454345762207, "grad_norm": 0.3807724118232727, "learning_rate": 9.67885462767093e-06, "loss": 1.2273, "step": 17618 }, { "epoch": 5.247752191961876, "grad_norm": 0.34905049204826355, "learning_rate": 9.677890560328463e-06, "loss": 1.1955, "step": 17619 }, { "epoch": 5.248050038161544, "grad_norm": 0.41026571393013, "learning_rate": 9.676926495982861e-06, "loss": 1.2262, "step": 17620 }, { "epoch": 5.248347884361213, "grad_norm": 0.4517008662223816, "learning_rate": 9.675962434643096e-06, "loss": 1.2145, "step": 17621 }, { "epoch": 5.248645730560882, "grad_norm": 0.3837314546108246, "learning_rate": 9.67499837631814e-06, "loss": 1.2141, "step": 17622 }, { "epoch": 5.24894357676055, "grad_norm": 0.7110933661460876, "learning_rate": 9.674034321016961e-06, "loss": 1.2122, "step": 17623 }, { "epoch": 5.249241422960219, "grad_norm": 0.2553151547908783, "learning_rate": 9.673070268748526e-06, "loss": 1.2309, "step": 17624 }, { "epoch": 5.249539269159888, "grad_norm": 0.6351589560508728, "learning_rate": 9.67210621952181e-06, "loss": 1.2229, "step": 17625 }, { "epoch": 5.249837115359556, "grad_norm": 0.2685965299606323, "learning_rate": 9.671142173345777e-06, "loss": 1.2156, "step": 17626 }, { "epoch": 5.250134961559225, "grad_norm": 0.5644192695617676, "learning_rate": 9.670178130229402e-06, "loss": 1.2083, "step": 17627 }, { "epoch": 5.2504328077588935, "grad_norm": 0.30944108963012695, "learning_rate": 9.669214090181649e-06, "loss": 1.2108, "step": 17628 }, { "epoch": 5.250730653958562, "grad_norm": 0.6028257012367249, "learning_rate": 9.668250053211487e-06, "loss": 1.2264, "step": 17629 }, { "epoch": 5.251028500158231, "grad_norm": 0.426084041595459, "learning_rate": 9.66728601932789e-06, "loss": 1.2186, "step": 17630 }, { "epoch": 5.251326346357899, "grad_norm": 0.4742308557033539, "learning_rate": 9.666321988539827e-06, "loss": 1.2166, "step": 17631 }, { "epoch": 5.251624192557568, "grad_norm": 0.5128064751625061, "learning_rate": 9.66535796085626e-06, "loss": 1.2101, "step": 17632 }, { "epoch": 5.251922038757237, "grad_norm": 0.28773415088653564, "learning_rate": 9.664393936286169e-06, "loss": 1.2323, "step": 17633 }, { "epoch": 5.252219884956905, "grad_norm": 0.8632734417915344, "learning_rate": 9.663429914838513e-06, "loss": 1.2105, "step": 17634 }, { "epoch": 5.252517731156574, "grad_norm": 0.32639697194099426, "learning_rate": 9.662465896522267e-06, "loss": 1.2116, "step": 17635 }, { "epoch": 5.252815577356243, "grad_norm": 0.653701901435852, "learning_rate": 9.6615018813464e-06, "loss": 1.2181, "step": 17636 }, { "epoch": 5.253113423555911, "grad_norm": 0.3668355345726013, "learning_rate": 9.66053786931988e-06, "loss": 1.2244, "step": 17637 }, { "epoch": 5.25341126975558, "grad_norm": 0.3794378340244293, "learning_rate": 9.659573860451671e-06, "loss": 1.2342, "step": 17638 }, { "epoch": 5.253709115955249, "grad_norm": 0.9495980739593506, "learning_rate": 9.658609854750753e-06, "loss": 1.2059, "step": 17639 }, { "epoch": 5.254006962154917, "grad_norm": 0.3066650927066803, "learning_rate": 9.657645852226086e-06, "loss": 1.208, "step": 17640 }, { "epoch": 5.254304808354586, "grad_norm": 0.6032782793045044, "learning_rate": 9.65668185288664e-06, "loss": 1.2145, "step": 17641 }, { "epoch": 5.254602654554255, "grad_norm": 0.5128459334373474, "learning_rate": 9.65571785674139e-06, "loss": 1.2139, "step": 17642 }, { "epoch": 5.254900500753923, "grad_norm": 0.5280553102493286, "learning_rate": 9.654753863799296e-06, "loss": 1.231, "step": 17643 }, { "epoch": 5.255198346953592, "grad_norm": 0.6278181672096252, "learning_rate": 9.653789874069337e-06, "loss": 1.2198, "step": 17644 }, { "epoch": 5.25549619315326, "grad_norm": 0.49107906222343445, "learning_rate": 9.652825887560474e-06, "loss": 1.2191, "step": 17645 }, { "epoch": 5.255794039352929, "grad_norm": 0.5377057194709778, "learning_rate": 9.651861904281675e-06, "loss": 1.208, "step": 17646 }, { "epoch": 5.256091885552598, "grad_norm": 0.3466222286224365, "learning_rate": 9.650897924241916e-06, "loss": 1.2331, "step": 17647 }, { "epoch": 5.256389731752266, "grad_norm": 0.4054228663444519, "learning_rate": 9.649933947450163e-06, "loss": 1.2129, "step": 17648 }, { "epoch": 5.256687577951935, "grad_norm": 0.29387709498405457, "learning_rate": 9.648969973915378e-06, "loss": 1.2155, "step": 17649 }, { "epoch": 5.256985424151604, "grad_norm": 0.4338807165622711, "learning_rate": 9.648006003646539e-06, "loss": 1.2124, "step": 17650 }, { "epoch": 5.257283270351272, "grad_norm": 0.2910458445549011, "learning_rate": 9.647042036652612e-06, "loss": 1.2221, "step": 17651 }, { "epoch": 5.257581116550941, "grad_norm": 0.2524261474609375, "learning_rate": 9.646078072942561e-06, "loss": 1.2138, "step": 17652 }, { "epoch": 5.25787896275061, "grad_norm": 0.28109875321388245, "learning_rate": 9.64511411252536e-06, "loss": 1.2215, "step": 17653 }, { "epoch": 5.258176808950278, "grad_norm": 0.272085964679718, "learning_rate": 9.644150155409976e-06, "loss": 1.2126, "step": 17654 }, { "epoch": 5.258474655149947, "grad_norm": 0.26784154772758484, "learning_rate": 9.643186201605374e-06, "loss": 1.2166, "step": 17655 }, { "epoch": 5.258772501349616, "grad_norm": 0.2522640824317932, "learning_rate": 9.642222251120531e-06, "loss": 1.2058, "step": 17656 }, { "epoch": 5.259070347549284, "grad_norm": 0.26999029517173767, "learning_rate": 9.641258303964408e-06, "loss": 1.2084, "step": 17657 }, { "epoch": 5.259368193748953, "grad_norm": 0.2779795229434967, "learning_rate": 9.640294360145975e-06, "loss": 1.2173, "step": 17658 }, { "epoch": 5.2596660399486215, "grad_norm": 0.29149943590164185, "learning_rate": 9.639330419674201e-06, "loss": 1.2034, "step": 17659 }, { "epoch": 5.25996388614829, "grad_norm": 0.278231680393219, "learning_rate": 9.638366482558052e-06, "loss": 1.2171, "step": 17660 }, { "epoch": 5.260261732347959, "grad_norm": 0.30518946051597595, "learning_rate": 9.637402548806503e-06, "loss": 1.2219, "step": 17661 }, { "epoch": 5.260559578547627, "grad_norm": 0.2650560736656189, "learning_rate": 9.63643861842852e-06, "loss": 1.2044, "step": 17662 }, { "epoch": 5.260857424747297, "grad_norm": 0.28559422492980957, "learning_rate": 9.635474691433063e-06, "loss": 1.2218, "step": 17663 }, { "epoch": 5.261155270946965, "grad_norm": 0.2490164190530777, "learning_rate": 9.63451076782911e-06, "loss": 1.2148, "step": 17664 }, { "epoch": 5.261453117146633, "grad_norm": 0.2811809480190277, "learning_rate": 9.633546847625627e-06, "loss": 1.232, "step": 17665 }, { "epoch": 5.261750963346302, "grad_norm": 0.3814217150211334, "learning_rate": 9.63258293083158e-06, "loss": 1.2042, "step": 17666 }, { "epoch": 5.262048809545971, "grad_norm": 0.29409468173980713, "learning_rate": 9.63161901745594e-06, "loss": 1.2282, "step": 17667 }, { "epoch": 5.262346655745639, "grad_norm": 0.2859947681427002, "learning_rate": 9.630655107507674e-06, "loss": 1.2236, "step": 17668 }, { "epoch": 5.262644501945308, "grad_norm": 0.32990679144859314, "learning_rate": 9.629691200995744e-06, "loss": 1.2226, "step": 17669 }, { "epoch": 5.262942348144977, "grad_norm": 0.25081712007522583, "learning_rate": 9.628727297929127e-06, "loss": 1.2071, "step": 17670 }, { "epoch": 5.263240194344645, "grad_norm": 0.357969731092453, "learning_rate": 9.62776339831679e-06, "loss": 1.2039, "step": 17671 }, { "epoch": 5.263538040544314, "grad_norm": 0.2931002676486969, "learning_rate": 9.626799502167694e-06, "loss": 1.2117, "step": 17672 }, { "epoch": 5.2638358867439825, "grad_norm": 0.33453667163848877, "learning_rate": 9.625835609490817e-06, "loss": 1.2101, "step": 17673 }, { "epoch": 5.264133732943651, "grad_norm": 0.4000210165977478, "learning_rate": 9.62487172029512e-06, "loss": 1.2081, "step": 17674 }, { "epoch": 5.26443157914332, "grad_norm": 0.25864914059638977, "learning_rate": 9.62390783458957e-06, "loss": 1.2191, "step": 17675 }, { "epoch": 5.264729425342988, "grad_norm": 0.460506796836853, "learning_rate": 9.622943952383138e-06, "loss": 1.2148, "step": 17676 }, { "epoch": 5.265027271542657, "grad_norm": 0.41654765605926514, "learning_rate": 9.62198007368479e-06, "loss": 1.2235, "step": 17677 }, { "epoch": 5.265325117742326, "grad_norm": 0.26073628664016724, "learning_rate": 9.621016198503498e-06, "loss": 1.2126, "step": 17678 }, { "epoch": 5.265622963941994, "grad_norm": 0.34627699851989746, "learning_rate": 9.620052326848229e-06, "loss": 1.2296, "step": 17679 }, { "epoch": 5.2659208101416635, "grad_norm": 0.25202104449272156, "learning_rate": 9.619088458727943e-06, "loss": 1.2184, "step": 17680 }, { "epoch": 5.266218656341332, "grad_norm": 0.263837605714798, "learning_rate": 9.618124594151616e-06, "loss": 1.2041, "step": 17681 }, { "epoch": 5.266516502541, "grad_norm": 0.30499914288520813, "learning_rate": 9.617160733128214e-06, "loss": 1.223, "step": 17682 }, { "epoch": 5.266814348740669, "grad_norm": 0.36672741174697876, "learning_rate": 9.6161968756667e-06, "loss": 1.196, "step": 17683 }, { "epoch": 5.267112194940338, "grad_norm": 0.2590443789958954, "learning_rate": 9.615233021776049e-06, "loss": 1.2014, "step": 17684 }, { "epoch": 5.267410041140006, "grad_norm": 0.380149781703949, "learning_rate": 9.614269171465224e-06, "loss": 1.2197, "step": 17685 }, { "epoch": 5.267707887339675, "grad_norm": 0.25747448205947876, "learning_rate": 9.613305324743191e-06, "loss": 1.2121, "step": 17686 }, { "epoch": 5.2680057335393435, "grad_norm": 0.4346485137939453, "learning_rate": 9.612341481618924e-06, "loss": 1.215, "step": 17687 }, { "epoch": 5.268303579739012, "grad_norm": 0.3029664158821106, "learning_rate": 9.611377642101386e-06, "loss": 1.2256, "step": 17688 }, { "epoch": 5.268601425938681, "grad_norm": 0.31039392948150635, "learning_rate": 9.61041380619954e-06, "loss": 1.2062, "step": 17689 }, { "epoch": 5.268899272138349, "grad_norm": 0.3747265636920929, "learning_rate": 9.609449973922363e-06, "loss": 1.2106, "step": 17690 }, { "epoch": 5.269197118338019, "grad_norm": 0.28945842385292053, "learning_rate": 9.608486145278813e-06, "loss": 1.2366, "step": 17691 }, { "epoch": 5.269494964537687, "grad_norm": 0.2742080092430115, "learning_rate": 9.607522320277866e-06, "loss": 1.2289, "step": 17692 }, { "epoch": 5.269792810737355, "grad_norm": 0.23953720927238464, "learning_rate": 9.606558498928485e-06, "loss": 1.214, "step": 17693 }, { "epoch": 5.2700906569370245, "grad_norm": 0.30654656887054443, "learning_rate": 9.605594681239636e-06, "loss": 1.2046, "step": 17694 }, { "epoch": 5.270388503136693, "grad_norm": 0.27098795771598816, "learning_rate": 9.604630867220288e-06, "loss": 1.2117, "step": 17695 }, { "epoch": 5.270686349336361, "grad_norm": 0.3117823600769043, "learning_rate": 9.603667056879412e-06, "loss": 1.2012, "step": 17696 }, { "epoch": 5.27098419553603, "grad_norm": 0.3452744781970978, "learning_rate": 9.602703250225966e-06, "loss": 1.2038, "step": 17697 }, { "epoch": 5.271282041735699, "grad_norm": 0.2786961793899536, "learning_rate": 9.601739447268926e-06, "loss": 1.2262, "step": 17698 }, { "epoch": 5.271579887935367, "grad_norm": 0.3331828713417053, "learning_rate": 9.600775648017253e-06, "loss": 1.2126, "step": 17699 }, { "epoch": 5.271877734135036, "grad_norm": 0.26668357849121094, "learning_rate": 9.599811852479916e-06, "loss": 1.2136, "step": 17700 }, { "epoch": 5.272175580334705, "grad_norm": 0.27471211552619934, "learning_rate": 9.598848060665885e-06, "loss": 1.2303, "step": 17701 }, { "epoch": 5.272473426534373, "grad_norm": 0.2641032338142395, "learning_rate": 9.597884272584126e-06, "loss": 1.2158, "step": 17702 }, { "epoch": 5.272771272734042, "grad_norm": 0.28214526176452637, "learning_rate": 9.5969204882436e-06, "loss": 1.2057, "step": 17703 }, { "epoch": 5.27306911893371, "grad_norm": 0.37983354926109314, "learning_rate": 9.595956707653282e-06, "loss": 1.2041, "step": 17704 }, { "epoch": 5.27336696513338, "grad_norm": 0.4316655397415161, "learning_rate": 9.594992930822134e-06, "loss": 1.2116, "step": 17705 }, { "epoch": 5.273664811333048, "grad_norm": 0.285697340965271, "learning_rate": 9.594029157759122e-06, "loss": 1.2097, "step": 17706 }, { "epoch": 5.273962657532716, "grad_norm": 0.3625587522983551, "learning_rate": 9.59306538847322e-06, "loss": 1.2089, "step": 17707 }, { "epoch": 5.2742605037323855, "grad_norm": 0.33040204644203186, "learning_rate": 9.592101622973385e-06, "loss": 1.2388, "step": 17708 }, { "epoch": 5.274558349932054, "grad_norm": 0.26028236746788025, "learning_rate": 9.591137861268593e-06, "loss": 1.2072, "step": 17709 }, { "epoch": 5.274856196131722, "grad_norm": 0.27688753604888916, "learning_rate": 9.590174103367807e-06, "loss": 1.2125, "step": 17710 }, { "epoch": 5.275154042331391, "grad_norm": 0.29101675748825073, "learning_rate": 9.589210349279987e-06, "loss": 1.1932, "step": 17711 }, { "epoch": 5.27545188853106, "grad_norm": 0.4670671224594116, "learning_rate": 9.588246599014109e-06, "loss": 1.2191, "step": 17712 }, { "epoch": 5.275749734730728, "grad_norm": 0.6348828077316284, "learning_rate": 9.587282852579139e-06, "loss": 1.1951, "step": 17713 }, { "epoch": 5.276047580930397, "grad_norm": 0.3590271770954132, "learning_rate": 9.586319109984035e-06, "loss": 1.2174, "step": 17714 }, { "epoch": 5.276345427130066, "grad_norm": 0.2943142056465149, "learning_rate": 9.585355371237776e-06, "loss": 1.2091, "step": 17715 }, { "epoch": 5.276643273329734, "grad_norm": 0.3032686710357666, "learning_rate": 9.584391636349319e-06, "loss": 1.2194, "step": 17716 }, { "epoch": 5.276941119529403, "grad_norm": 0.28097230195999146, "learning_rate": 9.583427905327634e-06, "loss": 1.2386, "step": 17717 }, { "epoch": 5.2772389657290715, "grad_norm": 0.36082130670547485, "learning_rate": 9.582464178181685e-06, "loss": 1.2099, "step": 17718 }, { "epoch": 5.277536811928741, "grad_norm": 0.2619275152683258, "learning_rate": 9.581500454920443e-06, "loss": 1.2171, "step": 17719 }, { "epoch": 5.277834658128409, "grad_norm": 0.2929697334766388, "learning_rate": 9.580536735552869e-06, "loss": 1.1921, "step": 17720 }, { "epoch": 5.278132504328077, "grad_norm": 0.2946698069572449, "learning_rate": 9.579573020087935e-06, "loss": 1.2141, "step": 17721 }, { "epoch": 5.278430350527747, "grad_norm": 0.3796788156032562, "learning_rate": 9.578609308534604e-06, "loss": 1.2278, "step": 17722 }, { "epoch": 5.278728196727415, "grad_norm": 0.2787896692752838, "learning_rate": 9.577645600901838e-06, "loss": 1.215, "step": 17723 }, { "epoch": 5.279026042927083, "grad_norm": 0.5407164096832275, "learning_rate": 9.576681897198613e-06, "loss": 1.2331, "step": 17724 }, { "epoch": 5.279323889126752, "grad_norm": 0.31933292746543884, "learning_rate": 9.575718197433886e-06, "loss": 1.2001, "step": 17725 }, { "epoch": 5.279621735326421, "grad_norm": 0.3746192455291748, "learning_rate": 9.574754501616631e-06, "loss": 1.2093, "step": 17726 }, { "epoch": 5.279919581526089, "grad_norm": 0.3737927973270416, "learning_rate": 9.57379080975581e-06, "loss": 1.2042, "step": 17727 }, { "epoch": 5.280217427725758, "grad_norm": 0.2543933093547821, "learning_rate": 9.572827121860387e-06, "loss": 1.2233, "step": 17728 }, { "epoch": 5.280515273925427, "grad_norm": 0.27333301305770874, "learning_rate": 9.57186343793933e-06, "loss": 1.225, "step": 17729 }, { "epoch": 5.280813120125096, "grad_norm": 0.2692798376083374, "learning_rate": 9.570899758001608e-06, "loss": 1.2261, "step": 17730 }, { "epoch": 5.281110966324764, "grad_norm": 0.25104212760925293, "learning_rate": 9.569936082056181e-06, "loss": 1.2219, "step": 17731 }, { "epoch": 5.2814088125244325, "grad_norm": 0.3911302089691162, "learning_rate": 9.568972410112023e-06, "loss": 1.2354, "step": 17732 }, { "epoch": 5.281706658724102, "grad_norm": 0.37411147356033325, "learning_rate": 9.568008742178094e-06, "loss": 1.2028, "step": 17733 }, { "epoch": 5.28200450492377, "grad_norm": 0.290594220161438, "learning_rate": 9.567045078263357e-06, "loss": 1.2115, "step": 17734 }, { "epoch": 5.282302351123438, "grad_norm": 0.3030377924442291, "learning_rate": 9.566081418376784e-06, "loss": 1.2088, "step": 17735 }, { "epoch": 5.282600197323108, "grad_norm": 0.41590309143066406, "learning_rate": 9.56511776252734e-06, "loss": 1.2045, "step": 17736 }, { "epoch": 5.282898043522776, "grad_norm": 0.4200705587863922, "learning_rate": 9.564154110723986e-06, "loss": 1.2218, "step": 17737 }, { "epoch": 5.283195889722444, "grad_norm": 0.35149914026260376, "learning_rate": 9.563190462975696e-06, "loss": 1.2257, "step": 17738 }, { "epoch": 5.2834937359221135, "grad_norm": 0.39634302258491516, "learning_rate": 9.562226819291426e-06, "loss": 1.2153, "step": 17739 }, { "epoch": 5.283791582121782, "grad_norm": 0.27166733145713806, "learning_rate": 9.561263179680149e-06, "loss": 1.2203, "step": 17740 }, { "epoch": 5.28408942832145, "grad_norm": 0.25245028734207153, "learning_rate": 9.560299544150828e-06, "loss": 1.2002, "step": 17741 }, { "epoch": 5.284387274521119, "grad_norm": 0.3067881166934967, "learning_rate": 9.559335912712425e-06, "loss": 1.2105, "step": 17742 }, { "epoch": 5.284685120720788, "grad_norm": 0.35038381814956665, "learning_rate": 9.558372285373913e-06, "loss": 1.1963, "step": 17743 }, { "epoch": 5.284982966920456, "grad_norm": 0.2612840235233307, "learning_rate": 9.557408662144254e-06, "loss": 1.2286, "step": 17744 }, { "epoch": 5.285280813120125, "grad_norm": 0.3415238559246063, "learning_rate": 9.556445043032408e-06, "loss": 1.2139, "step": 17745 }, { "epoch": 5.2855786593197935, "grad_norm": 0.3501487672328949, "learning_rate": 9.555481428047351e-06, "loss": 1.1985, "step": 17746 }, { "epoch": 5.285876505519463, "grad_norm": 0.2651747465133667, "learning_rate": 9.55451781719804e-06, "loss": 1.1988, "step": 17747 }, { "epoch": 5.286174351719131, "grad_norm": 0.38092848658561707, "learning_rate": 9.553554210493439e-06, "loss": 1.2156, "step": 17748 }, { "epoch": 5.286472197918799, "grad_norm": 0.26460573077201843, "learning_rate": 9.552590607942524e-06, "loss": 1.2366, "step": 17749 }, { "epoch": 5.286770044118469, "grad_norm": 0.37430649995803833, "learning_rate": 9.551627009554253e-06, "loss": 1.2026, "step": 17750 }, { "epoch": 5.287067890318137, "grad_norm": 0.30642634630203247, "learning_rate": 9.550663415337587e-06, "loss": 1.2131, "step": 17751 }, { "epoch": 5.287365736517805, "grad_norm": 0.32907119393348694, "learning_rate": 9.549699825301499e-06, "loss": 1.2165, "step": 17752 }, { "epoch": 5.2876635827174745, "grad_norm": 0.29158130288124084, "learning_rate": 9.548736239454953e-06, "loss": 1.2203, "step": 17753 }, { "epoch": 5.287961428917143, "grad_norm": 0.4016822874546051, "learning_rate": 9.547772657806905e-06, "loss": 1.2239, "step": 17754 }, { "epoch": 5.288259275116811, "grad_norm": 0.381194531917572, "learning_rate": 9.546809080366335e-06, "loss": 1.2095, "step": 17755 }, { "epoch": 5.28855712131648, "grad_norm": 0.33159124851226807, "learning_rate": 9.5458455071422e-06, "loss": 1.2109, "step": 17756 }, { "epoch": 5.288854967516149, "grad_norm": 0.4588654041290283, "learning_rate": 9.544881938143457e-06, "loss": 1.1984, "step": 17757 }, { "epoch": 5.289152813715818, "grad_norm": 0.2766730487346649, "learning_rate": 9.543918373379084e-06, "loss": 1.2229, "step": 17758 }, { "epoch": 5.289450659915486, "grad_norm": 0.3315102159976959, "learning_rate": 9.54295481285804e-06, "loss": 1.2087, "step": 17759 }, { "epoch": 5.289748506115155, "grad_norm": 0.34840163588523865, "learning_rate": 9.541991256589292e-06, "loss": 1.2084, "step": 17760 }, { "epoch": 5.290046352314824, "grad_norm": 0.33905497193336487, "learning_rate": 9.541027704581807e-06, "loss": 1.1986, "step": 17761 }, { "epoch": 5.290344198514492, "grad_norm": 0.4713587760925293, "learning_rate": 9.540064156844539e-06, "loss": 1.2222, "step": 17762 }, { "epoch": 5.29064204471416, "grad_norm": 0.35660380125045776, "learning_rate": 9.539100613386468e-06, "loss": 1.2171, "step": 17763 }, { "epoch": 5.29093989091383, "grad_norm": 0.4753265976905823, "learning_rate": 9.538137074216546e-06, "loss": 1.2329, "step": 17764 }, { "epoch": 5.291237737113498, "grad_norm": 0.3240477442741394, "learning_rate": 9.53717353934374e-06, "loss": 1.1892, "step": 17765 }, { "epoch": 5.291535583313166, "grad_norm": 0.39129161834716797, "learning_rate": 9.536210008777022e-06, "loss": 1.2146, "step": 17766 }, { "epoch": 5.2918334295128355, "grad_norm": 0.250642865896225, "learning_rate": 9.535246482525353e-06, "loss": 1.2152, "step": 17767 }, { "epoch": 5.292131275712504, "grad_norm": 0.3610994219779968, "learning_rate": 9.534282960597692e-06, "loss": 1.2171, "step": 17768 }, { "epoch": 5.292429121912173, "grad_norm": 0.25481879711151123, "learning_rate": 9.533319443003011e-06, "loss": 1.2119, "step": 17769 }, { "epoch": 5.292726968111841, "grad_norm": 0.30706268548965454, "learning_rate": 9.532355929750269e-06, "loss": 1.2121, "step": 17770 }, { "epoch": 5.29302481431151, "grad_norm": 0.28423357009887695, "learning_rate": 9.53139242084843e-06, "loss": 1.2116, "step": 17771 }, { "epoch": 5.293322660511179, "grad_norm": 0.35022106766700745, "learning_rate": 9.530428916306466e-06, "loss": 1.1977, "step": 17772 }, { "epoch": 5.293620506710847, "grad_norm": 0.2840099632740021, "learning_rate": 9.529465416133336e-06, "loss": 1.2053, "step": 17773 }, { "epoch": 5.293918352910516, "grad_norm": 0.3796977400779724, "learning_rate": 9.528501920338e-06, "loss": 1.2172, "step": 17774 }, { "epoch": 5.294216199110185, "grad_norm": 0.2766299843788147, "learning_rate": 9.52753842892943e-06, "loss": 1.1984, "step": 17775 }, { "epoch": 5.294514045309853, "grad_norm": 0.4037684500217438, "learning_rate": 9.526574941916587e-06, "loss": 1.2149, "step": 17776 }, { "epoch": 5.2948118915095215, "grad_norm": 0.25487518310546875, "learning_rate": 9.525611459308434e-06, "loss": 1.2085, "step": 17777 }, { "epoch": 5.295109737709191, "grad_norm": 0.4670896530151367, "learning_rate": 9.524647981113938e-06, "loss": 1.2128, "step": 17778 }, { "epoch": 5.295407583908859, "grad_norm": 0.2758674621582031, "learning_rate": 9.523684507342059e-06, "loss": 1.2209, "step": 17779 }, { "epoch": 5.295705430108527, "grad_norm": 0.40095141530036926, "learning_rate": 9.522721038001768e-06, "loss": 1.2096, "step": 17780 }, { "epoch": 5.296003276308197, "grad_norm": 0.29094254970550537, "learning_rate": 9.521757573102021e-06, "loss": 1.209, "step": 17781 }, { "epoch": 5.296301122507865, "grad_norm": 0.2720484137535095, "learning_rate": 9.520794112651786e-06, "loss": 1.2176, "step": 17782 }, { "epoch": 5.296598968707533, "grad_norm": 0.45520949363708496, "learning_rate": 9.519830656660026e-06, "loss": 1.2073, "step": 17783 }, { "epoch": 5.296896814907202, "grad_norm": 0.27867141366004944, "learning_rate": 9.518867205135707e-06, "loss": 1.2325, "step": 17784 }, { "epoch": 5.297194661106871, "grad_norm": 0.4318096339702606, "learning_rate": 9.517903758087788e-06, "loss": 1.2067, "step": 17785 }, { "epoch": 5.29749250730654, "grad_norm": 0.29537105560302734, "learning_rate": 9.516940315525241e-06, "loss": 1.2041, "step": 17786 }, { "epoch": 5.297790353506208, "grad_norm": 0.5036028027534485, "learning_rate": 9.51597687745702e-06, "loss": 1.2101, "step": 17787 }, { "epoch": 5.298088199705877, "grad_norm": 0.4049065411090851, "learning_rate": 9.515013443892094e-06, "loss": 1.217, "step": 17788 }, { "epoch": 5.298386045905546, "grad_norm": 0.3740525245666504, "learning_rate": 9.514050014839429e-06, "loss": 1.2225, "step": 17789 }, { "epoch": 5.298683892105214, "grad_norm": 0.3414017856121063, "learning_rate": 9.513086590307983e-06, "loss": 1.204, "step": 17790 }, { "epoch": 5.2989817383048825, "grad_norm": 0.35962656140327454, "learning_rate": 9.512123170306722e-06, "loss": 1.1991, "step": 17791 }, { "epoch": 5.299279584504552, "grad_norm": 0.32481124997138977, "learning_rate": 9.511159754844613e-06, "loss": 1.2131, "step": 17792 }, { "epoch": 5.29957743070422, "grad_norm": 0.3517327904701233, "learning_rate": 9.510196343930611e-06, "loss": 1.2195, "step": 17793 }, { "epoch": 5.299875276903888, "grad_norm": 0.27785369753837585, "learning_rate": 9.509232937573688e-06, "loss": 1.2305, "step": 17794 }, { "epoch": 5.300173123103558, "grad_norm": 0.5341268181800842, "learning_rate": 9.508269535782805e-06, "loss": 1.208, "step": 17795 }, { "epoch": 5.300470969303226, "grad_norm": 0.2722247242927551, "learning_rate": 9.50730613856692e-06, "loss": 1.2207, "step": 17796 }, { "epoch": 5.300768815502895, "grad_norm": 0.49601635336875916, "learning_rate": 9.506342745935007e-06, "loss": 1.198, "step": 17797 }, { "epoch": 5.3010666617025635, "grad_norm": 0.2821829915046692, "learning_rate": 9.505379357896023e-06, "loss": 1.2195, "step": 17798 }, { "epoch": 5.301364507902232, "grad_norm": 0.34916460514068604, "learning_rate": 9.504415974458926e-06, "loss": 1.2172, "step": 17799 }, { "epoch": 5.301662354101901, "grad_norm": 0.2600118815898895, "learning_rate": 9.503452595632688e-06, "loss": 1.213, "step": 17800 }, { "epoch": 5.301960200301569, "grad_norm": 0.37544959783554077, "learning_rate": 9.502489221426272e-06, "loss": 1.2135, "step": 17801 }, { "epoch": 5.302258046501238, "grad_norm": 0.25689440965652466, "learning_rate": 9.501525851848632e-06, "loss": 1.2068, "step": 17802 }, { "epoch": 5.302555892700907, "grad_norm": 0.2783738076686859, "learning_rate": 9.500562486908739e-06, "loss": 1.2299, "step": 17803 }, { "epoch": 5.302853738900575, "grad_norm": 0.38854414224624634, "learning_rate": 9.499599126615556e-06, "loss": 1.2184, "step": 17804 }, { "epoch": 5.3031515851002435, "grad_norm": 0.4283054769039154, "learning_rate": 9.498635770978042e-06, "loss": 1.2103, "step": 17805 }, { "epoch": 5.303449431299913, "grad_norm": 0.3234732449054718, "learning_rate": 9.497672420005164e-06, "loss": 1.2152, "step": 17806 }, { "epoch": 5.303747277499581, "grad_norm": 0.408430278301239, "learning_rate": 9.496709073705878e-06, "loss": 1.2168, "step": 17807 }, { "epoch": 5.304045123699249, "grad_norm": 0.26550015807151794, "learning_rate": 9.495745732089156e-06, "loss": 1.2022, "step": 17808 }, { "epoch": 5.304342969898919, "grad_norm": 0.5079704523086548, "learning_rate": 9.494782395163958e-06, "loss": 1.2119, "step": 17809 }, { "epoch": 5.304640816098587, "grad_norm": 0.30164191126823425, "learning_rate": 9.493819062939241e-06, "loss": 1.2243, "step": 17810 }, { "epoch": 5.304938662298255, "grad_norm": 0.3827284574508667, "learning_rate": 9.492855735423973e-06, "loss": 1.2108, "step": 17811 }, { "epoch": 5.3052365084979245, "grad_norm": 0.4162214696407318, "learning_rate": 9.491892412627121e-06, "loss": 1.226, "step": 17812 }, { "epoch": 5.305534354697593, "grad_norm": 0.27671313285827637, "learning_rate": 9.490929094557636e-06, "loss": 1.2261, "step": 17813 }, { "epoch": 5.305832200897262, "grad_norm": 0.725034236907959, "learning_rate": 9.489965781224491e-06, "loss": 1.2205, "step": 17814 }, { "epoch": 5.30613004709693, "grad_norm": 0.6562210321426392, "learning_rate": 9.489002472636645e-06, "loss": 1.2204, "step": 17815 }, { "epoch": 5.306427893296599, "grad_norm": 0.2668152451515198, "learning_rate": 9.488039168803054e-06, "loss": 1.1972, "step": 17816 }, { "epoch": 5.306725739496268, "grad_norm": 0.5385468006134033, "learning_rate": 9.487075869732691e-06, "loss": 1.2154, "step": 17817 }, { "epoch": 5.307023585695936, "grad_norm": 0.30791765451431274, "learning_rate": 9.486112575434516e-06, "loss": 1.2202, "step": 17818 }, { "epoch": 5.307321431895605, "grad_norm": 0.2882668375968933, "learning_rate": 9.485149285917485e-06, "loss": 1.2193, "step": 17819 }, { "epoch": 5.307619278095274, "grad_norm": 0.2871219515800476, "learning_rate": 9.484186001190569e-06, "loss": 1.2082, "step": 17820 }, { "epoch": 5.307917124294942, "grad_norm": 0.32335707545280457, "learning_rate": 9.483222721262725e-06, "loss": 1.2187, "step": 17821 }, { "epoch": 5.30821497049461, "grad_norm": 0.3050103187561035, "learning_rate": 9.482259446142913e-06, "loss": 1.2008, "step": 17822 }, { "epoch": 5.30851281669428, "grad_norm": 0.33260276913642883, "learning_rate": 9.481296175840099e-06, "loss": 1.2144, "step": 17823 }, { "epoch": 5.308810662893948, "grad_norm": 0.3357332944869995, "learning_rate": 9.480332910363243e-06, "loss": 1.23, "step": 17824 }, { "epoch": 5.309108509093617, "grad_norm": 0.2864167392253876, "learning_rate": 9.479369649721314e-06, "loss": 1.2138, "step": 17825 }, { "epoch": 5.3094063552932855, "grad_norm": 0.42594629526138306, "learning_rate": 9.478406393923267e-06, "loss": 1.2064, "step": 17826 }, { "epoch": 5.309704201492954, "grad_norm": 0.3915179669857025, "learning_rate": 9.477443142978063e-06, "loss": 1.2205, "step": 17827 }, { "epoch": 5.310002047692623, "grad_norm": 0.27900174260139465, "learning_rate": 9.47647989689467e-06, "loss": 1.2366, "step": 17828 }, { "epoch": 5.310299893892291, "grad_norm": 0.25326424837112427, "learning_rate": 9.475516655682046e-06, "loss": 1.21, "step": 17829 }, { "epoch": 5.31059774009196, "grad_norm": 0.4116702973842621, "learning_rate": 9.47455341934915e-06, "loss": 1.2168, "step": 17830 }, { "epoch": 5.310895586291629, "grad_norm": 0.3074418902397156, "learning_rate": 9.473590187904952e-06, "loss": 1.2171, "step": 17831 }, { "epoch": 5.311193432491297, "grad_norm": 0.31306183338165283, "learning_rate": 9.472626961358408e-06, "loss": 1.2301, "step": 17832 }, { "epoch": 5.311491278690966, "grad_norm": 0.2802980840206146, "learning_rate": 9.471663739718479e-06, "loss": 1.2108, "step": 17833 }, { "epoch": 5.311789124890635, "grad_norm": 0.2544321119785309, "learning_rate": 9.47070052299413e-06, "loss": 1.2143, "step": 17834 }, { "epoch": 5.312086971090303, "grad_norm": 0.292193740606308, "learning_rate": 9.469737311194323e-06, "loss": 1.2123, "step": 17835 }, { "epoch": 5.312384817289972, "grad_norm": 0.3129338324069977, "learning_rate": 9.468774104328013e-06, "loss": 1.2038, "step": 17836 }, { "epoch": 5.312682663489641, "grad_norm": 0.252285897731781, "learning_rate": 9.46781090240417e-06, "loss": 1.2071, "step": 17837 }, { "epoch": 5.312980509689309, "grad_norm": 0.4913385212421417, "learning_rate": 9.466847705431755e-06, "loss": 1.2365, "step": 17838 }, { "epoch": 5.313278355888978, "grad_norm": 0.34144484996795654, "learning_rate": 9.46588451341972e-06, "loss": 1.2136, "step": 17839 }, { "epoch": 5.313576202088647, "grad_norm": 0.46132946014404297, "learning_rate": 9.464921326377035e-06, "loss": 1.222, "step": 17840 }, { "epoch": 5.313874048288315, "grad_norm": 0.3557392954826355, "learning_rate": 9.463958144312659e-06, "loss": 1.2168, "step": 17841 }, { "epoch": 5.314171894487984, "grad_norm": 0.4408482313156128, "learning_rate": 9.462994967235555e-06, "loss": 1.213, "step": 17842 }, { "epoch": 5.314469740687652, "grad_norm": 0.3457951545715332, "learning_rate": 9.462031795154685e-06, "loss": 1.2156, "step": 17843 }, { "epoch": 5.314767586887321, "grad_norm": 0.34607723355293274, "learning_rate": 9.461068628079002e-06, "loss": 1.2225, "step": 17844 }, { "epoch": 5.31506543308699, "grad_norm": 0.2906036674976349, "learning_rate": 9.460105466017478e-06, "loss": 1.2226, "step": 17845 }, { "epoch": 5.315363279286658, "grad_norm": 0.4787485599517822, "learning_rate": 9.45914230897907e-06, "loss": 1.1961, "step": 17846 }, { "epoch": 5.315661125486327, "grad_norm": 0.37462329864501953, "learning_rate": 9.458179156972733e-06, "loss": 1.2196, "step": 17847 }, { "epoch": 5.315958971685996, "grad_norm": 0.6380683779716492, "learning_rate": 9.457216010007439e-06, "loss": 1.2072, "step": 17848 }, { "epoch": 5.316256817885664, "grad_norm": 0.3334093689918518, "learning_rate": 9.456252868092145e-06, "loss": 1.2152, "step": 17849 }, { "epoch": 5.3165546640853325, "grad_norm": 0.622526228427887, "learning_rate": 9.455289731235804e-06, "loss": 1.2119, "step": 17850 }, { "epoch": 5.316852510285002, "grad_norm": 0.3051876723766327, "learning_rate": 9.454326599447388e-06, "loss": 1.2189, "step": 17851 }, { "epoch": 5.31715035648467, "grad_norm": 0.48956653475761414, "learning_rate": 9.453363472735853e-06, "loss": 1.2174, "step": 17852 }, { "epoch": 5.317448202684339, "grad_norm": 0.2591938376426697, "learning_rate": 9.452400351110155e-06, "loss": 1.2161, "step": 17853 }, { "epoch": 5.317746048884008, "grad_norm": 0.36875611543655396, "learning_rate": 9.451437234579266e-06, "loss": 1.1962, "step": 17854 }, { "epoch": 5.318043895083676, "grad_norm": 0.33657798171043396, "learning_rate": 9.450474123152142e-06, "loss": 1.2126, "step": 17855 }, { "epoch": 5.318341741283345, "grad_norm": 0.2821645736694336, "learning_rate": 9.449511016837738e-06, "loss": 1.2124, "step": 17856 }, { "epoch": 5.3186395874830135, "grad_norm": 0.42428329586982727, "learning_rate": 9.448547915645021e-06, "loss": 1.217, "step": 17857 }, { "epoch": 5.318937433682682, "grad_norm": 0.2546981871128082, "learning_rate": 9.447584819582946e-06, "loss": 1.2193, "step": 17858 }, { "epoch": 5.319235279882351, "grad_norm": 0.44893303513526917, "learning_rate": 9.446621728660479e-06, "loss": 1.2146, "step": 17859 }, { "epoch": 5.319533126082019, "grad_norm": 0.30277305841445923, "learning_rate": 9.445658642886582e-06, "loss": 1.2293, "step": 17860 }, { "epoch": 5.319830972281688, "grad_norm": 0.30651018023490906, "learning_rate": 9.444695562270207e-06, "loss": 1.2051, "step": 17861 }, { "epoch": 5.320128818481357, "grad_norm": 0.26027822494506836, "learning_rate": 9.443732486820323e-06, "loss": 1.22, "step": 17862 }, { "epoch": 5.320426664681025, "grad_norm": 0.4425266981124878, "learning_rate": 9.442769416545884e-06, "loss": 1.2201, "step": 17863 }, { "epoch": 5.320724510880694, "grad_norm": 0.24000824987888336, "learning_rate": 9.441806351455855e-06, "loss": 1.2179, "step": 17864 }, { "epoch": 5.321022357080363, "grad_norm": 0.6749671697616577, "learning_rate": 9.440843291559193e-06, "loss": 1.2134, "step": 17865 }, { "epoch": 5.321320203280031, "grad_norm": 0.337689608335495, "learning_rate": 9.439880236864862e-06, "loss": 1.2113, "step": 17866 }, { "epoch": 5.3216180494797, "grad_norm": 0.4312625825405121, "learning_rate": 9.438917187381815e-06, "loss": 1.2298, "step": 17867 }, { "epoch": 5.321915895679369, "grad_norm": 0.28401559591293335, "learning_rate": 9.437954143119021e-06, "loss": 1.233, "step": 17868 }, { "epoch": 5.322213741879037, "grad_norm": 0.3657039403915405, "learning_rate": 9.436991104085435e-06, "loss": 1.2235, "step": 17869 }, { "epoch": 5.322511588078706, "grad_norm": 0.3694859743118286, "learning_rate": 9.436028070290013e-06, "loss": 1.226, "step": 17870 }, { "epoch": 5.3228094342783745, "grad_norm": 0.32885557413101196, "learning_rate": 9.435065041741727e-06, "loss": 1.246, "step": 17871 }, { "epoch": 5.323107280478043, "grad_norm": 0.4674130380153656, "learning_rate": 9.434102018449527e-06, "loss": 1.2229, "step": 17872 }, { "epoch": 5.323405126677712, "grad_norm": 0.5305304527282715, "learning_rate": 9.433139000422373e-06, "loss": 1.2012, "step": 17873 }, { "epoch": 5.32370297287738, "grad_norm": 0.36852362751960754, "learning_rate": 9.43217598766923e-06, "loss": 1.2154, "step": 17874 }, { "epoch": 5.324000819077049, "grad_norm": 0.3169606626033783, "learning_rate": 9.43121298019905e-06, "loss": 1.2203, "step": 17875 }, { "epoch": 5.324298665276718, "grad_norm": 0.5542791485786438, "learning_rate": 9.430249978020803e-06, "loss": 1.2147, "step": 17876 }, { "epoch": 5.324596511476386, "grad_norm": 0.3020332455635071, "learning_rate": 9.429286981143445e-06, "loss": 1.2097, "step": 17877 }, { "epoch": 5.324894357676055, "grad_norm": 0.43916815519332886, "learning_rate": 9.428323989575928e-06, "loss": 1.2228, "step": 17878 }, { "epoch": 5.325192203875724, "grad_norm": 0.307855486869812, "learning_rate": 9.42736100332722e-06, "loss": 1.2083, "step": 17879 }, { "epoch": 5.325490050075392, "grad_norm": 0.38172781467437744, "learning_rate": 9.426398022406283e-06, "loss": 1.189, "step": 17880 }, { "epoch": 5.325787896275061, "grad_norm": 0.4009075164794922, "learning_rate": 9.425435046822064e-06, "loss": 1.2176, "step": 17881 }, { "epoch": 5.32608574247473, "grad_norm": 0.39744630455970764, "learning_rate": 9.424472076583533e-06, "loss": 1.2135, "step": 17882 }, { "epoch": 5.326383588674398, "grad_norm": 0.4499882757663727, "learning_rate": 9.423509111699648e-06, "loss": 1.2094, "step": 17883 }, { "epoch": 5.326681434874067, "grad_norm": 0.2811439335346222, "learning_rate": 9.422546152179363e-06, "loss": 1.2125, "step": 17884 }, { "epoch": 5.3269792810737355, "grad_norm": 0.7157266139984131, "learning_rate": 9.421583198031644e-06, "loss": 1.2169, "step": 17885 }, { "epoch": 5.327277127273404, "grad_norm": 0.3518158793449402, "learning_rate": 9.420620249265446e-06, "loss": 1.2124, "step": 17886 }, { "epoch": 5.327574973473073, "grad_norm": 0.5372270941734314, "learning_rate": 9.41965730588973e-06, "loss": 1.2254, "step": 17887 }, { "epoch": 5.327872819672741, "grad_norm": 0.24127618968486786, "learning_rate": 9.418694367913452e-06, "loss": 1.2182, "step": 17888 }, { "epoch": 5.32817066587241, "grad_norm": 0.4268087148666382, "learning_rate": 9.417731435345578e-06, "loss": 1.2406, "step": 17889 }, { "epoch": 5.328468512072079, "grad_norm": 0.35385340452194214, "learning_rate": 9.416768508195057e-06, "loss": 1.2134, "step": 17890 }, { "epoch": 5.328766358271747, "grad_norm": 0.41965508460998535, "learning_rate": 9.415805586470858e-06, "loss": 1.2291, "step": 17891 }, { "epoch": 5.3290642044714165, "grad_norm": 0.43825778365135193, "learning_rate": 9.414842670181931e-06, "loss": 1.2106, "step": 17892 }, { "epoch": 5.329362050671085, "grad_norm": 0.41483011841773987, "learning_rate": 9.413879759337242e-06, "loss": 1.2263, "step": 17893 }, { "epoch": 5.329659896870753, "grad_norm": 0.43688637018203735, "learning_rate": 9.412916853945747e-06, "loss": 1.2183, "step": 17894 }, { "epoch": 5.329957743070422, "grad_norm": 0.31809523701667786, "learning_rate": 9.411953954016402e-06, "loss": 1.213, "step": 17895 }, { "epoch": 5.330255589270091, "grad_norm": 0.37871140241622925, "learning_rate": 9.410991059558172e-06, "loss": 1.2118, "step": 17896 }, { "epoch": 5.330553435469759, "grad_norm": 0.28996196389198303, "learning_rate": 9.410028170580013e-06, "loss": 1.2075, "step": 17897 }, { "epoch": 5.330851281669428, "grad_norm": 0.5004257559776306, "learning_rate": 9.409065287090878e-06, "loss": 1.2207, "step": 17898 }, { "epoch": 5.331149127869097, "grad_norm": 0.2947828471660614, "learning_rate": 9.408102409099732e-06, "loss": 1.2302, "step": 17899 }, { "epoch": 5.331446974068765, "grad_norm": 0.538334846496582, "learning_rate": 9.407139536615535e-06, "loss": 1.2083, "step": 17900 }, { "epoch": 5.331744820268434, "grad_norm": 0.29365020990371704, "learning_rate": 9.406176669647237e-06, "loss": 1.2143, "step": 17901 }, { "epoch": 5.332042666468102, "grad_norm": 0.38196516036987305, "learning_rate": 9.405213808203807e-06, "loss": 1.202, "step": 17902 }, { "epoch": 5.332340512667772, "grad_norm": 0.37977781891822815, "learning_rate": 9.404250952294196e-06, "loss": 1.2042, "step": 17903 }, { "epoch": 5.33263835886744, "grad_norm": 0.37083926796913147, "learning_rate": 9.403288101927361e-06, "loss": 1.2232, "step": 17904 }, { "epoch": 5.332936205067108, "grad_norm": 0.545269787311554, "learning_rate": 9.402325257112265e-06, "loss": 1.2124, "step": 17905 }, { "epoch": 5.3332340512667775, "grad_norm": 0.4284341335296631, "learning_rate": 9.401362417857869e-06, "loss": 1.2099, "step": 17906 }, { "epoch": 5.333531897466446, "grad_norm": 0.4500286877155304, "learning_rate": 9.40039958417312e-06, "loss": 1.2032, "step": 17907 }, { "epoch": 5.333829743666114, "grad_norm": 0.40348461270332336, "learning_rate": 9.39943675606699e-06, "loss": 1.209, "step": 17908 }, { "epoch": 5.334127589865783, "grad_norm": 0.3139120042324066, "learning_rate": 9.398473933548424e-06, "loss": 1.2135, "step": 17909 }, { "epoch": 5.334425436065452, "grad_norm": 0.34571802616119385, "learning_rate": 9.39751111662639e-06, "loss": 1.2213, "step": 17910 }, { "epoch": 5.33472328226512, "grad_norm": 0.3927350342273712, "learning_rate": 9.396548305309841e-06, "loss": 1.208, "step": 17911 }, { "epoch": 5.335021128464789, "grad_norm": 0.3874667286872864, "learning_rate": 9.395585499607733e-06, "loss": 1.2036, "step": 17912 }, { "epoch": 5.335318974664458, "grad_norm": 0.3238022029399872, "learning_rate": 9.39462269952903e-06, "loss": 1.2117, "step": 17913 }, { "epoch": 5.335616820864126, "grad_norm": 0.4481665790081024, "learning_rate": 9.393659905082687e-06, "loss": 1.2279, "step": 17914 }, { "epoch": 5.335914667063795, "grad_norm": 0.34377190470695496, "learning_rate": 9.392697116277658e-06, "loss": 1.2223, "step": 17915 }, { "epoch": 5.3362125132634635, "grad_norm": 0.510254442691803, "learning_rate": 9.391734333122908e-06, "loss": 1.2092, "step": 17916 }, { "epoch": 5.336510359463132, "grad_norm": 0.29175788164138794, "learning_rate": 9.390771555627386e-06, "loss": 1.2047, "step": 17917 }, { "epoch": 5.336808205662801, "grad_norm": 0.3241147994995117, "learning_rate": 9.389808783800054e-06, "loss": 1.206, "step": 17918 }, { "epoch": 5.337106051862469, "grad_norm": 0.278617799282074, "learning_rate": 9.388846017649874e-06, "loss": 1.2159, "step": 17919 }, { "epoch": 5.3374038980621386, "grad_norm": 0.29171955585479736, "learning_rate": 9.387883257185798e-06, "loss": 1.2128, "step": 17920 }, { "epoch": 5.337701744261807, "grad_norm": 0.2922581434249878, "learning_rate": 9.38692050241678e-06, "loss": 1.2119, "step": 17921 }, { "epoch": 5.337999590461475, "grad_norm": 0.38868823647499084, "learning_rate": 9.385957753351785e-06, "loss": 1.2143, "step": 17922 }, { "epoch": 5.338297436661144, "grad_norm": 0.4154965877532959, "learning_rate": 9.384995009999765e-06, "loss": 1.2204, "step": 17923 }, { "epoch": 5.338595282860813, "grad_norm": 0.3261200487613678, "learning_rate": 9.384032272369681e-06, "loss": 1.2043, "step": 17924 }, { "epoch": 5.338893129060481, "grad_norm": 0.5034027695655823, "learning_rate": 9.38306954047049e-06, "loss": 1.2086, "step": 17925 }, { "epoch": 5.33919097526015, "grad_norm": 0.267456591129303, "learning_rate": 9.382106814311144e-06, "loss": 1.2194, "step": 17926 }, { "epoch": 5.339488821459819, "grad_norm": 0.311736524105072, "learning_rate": 9.381144093900608e-06, "loss": 1.2, "step": 17927 }, { "epoch": 5.339786667659487, "grad_norm": 0.35112830996513367, "learning_rate": 9.380181379247833e-06, "loss": 1.2188, "step": 17928 }, { "epoch": 5.340084513859156, "grad_norm": 0.30734434723854065, "learning_rate": 9.379218670361775e-06, "loss": 1.2018, "step": 17929 }, { "epoch": 5.3403823600588245, "grad_norm": 0.3171696662902832, "learning_rate": 9.378255967251397e-06, "loss": 1.2034, "step": 17930 }, { "epoch": 5.340680206258494, "grad_norm": 0.38718852400779724, "learning_rate": 9.377293269925653e-06, "loss": 1.1832, "step": 17931 }, { "epoch": 5.340978052458162, "grad_norm": 0.3817012310028076, "learning_rate": 9.376330578393496e-06, "loss": 1.2307, "step": 17932 }, { "epoch": 5.34127589865783, "grad_norm": 0.28209900856018066, "learning_rate": 9.375367892663891e-06, "loss": 1.2172, "step": 17933 }, { "epoch": 5.3415737448575, "grad_norm": 0.5042916536331177, "learning_rate": 9.374405212745786e-06, "loss": 1.2226, "step": 17934 }, { "epoch": 5.341871591057168, "grad_norm": 0.3044802248477936, "learning_rate": 9.37344253864814e-06, "loss": 1.2117, "step": 17935 }, { "epoch": 5.342169437256836, "grad_norm": 0.3037538230419159, "learning_rate": 9.372479870379916e-06, "loss": 1.2138, "step": 17936 }, { "epoch": 5.3424672834565055, "grad_norm": 0.2792463004589081, "learning_rate": 9.371517207950065e-06, "loss": 1.2184, "step": 17937 }, { "epoch": 5.342765129656174, "grad_norm": 0.3734724521636963, "learning_rate": 9.37055455136754e-06, "loss": 1.2171, "step": 17938 }, { "epoch": 5.343062975855842, "grad_norm": 0.26259225606918335, "learning_rate": 9.369591900641306e-06, "loss": 1.2193, "step": 17939 }, { "epoch": 5.343360822055511, "grad_norm": 0.5744872093200684, "learning_rate": 9.36862925578031e-06, "loss": 1.2346, "step": 17940 }, { "epoch": 5.34365866825518, "grad_norm": 0.48147451877593994, "learning_rate": 9.367666616793518e-06, "loss": 1.2165, "step": 17941 }, { "epoch": 5.343956514454848, "grad_norm": 0.43163394927978516, "learning_rate": 9.366703983689881e-06, "loss": 1.2116, "step": 17942 }, { "epoch": 5.344254360654517, "grad_norm": 0.5540128946304321, "learning_rate": 9.365741356478352e-06, "loss": 1.2129, "step": 17943 }, { "epoch": 5.3445522068541855, "grad_norm": 0.26069483160972595, "learning_rate": 9.364778735167896e-06, "loss": 1.1946, "step": 17944 }, { "epoch": 5.344850053053854, "grad_norm": 0.3064899742603302, "learning_rate": 9.363816119767462e-06, "loss": 1.2158, "step": 17945 }, { "epoch": 5.345147899253523, "grad_norm": 0.2740856111049652, "learning_rate": 9.36285351028601e-06, "loss": 1.2178, "step": 17946 }, { "epoch": 5.345445745453191, "grad_norm": 0.276441365480423, "learning_rate": 9.361890906732492e-06, "loss": 1.2188, "step": 17947 }, { "epoch": 5.345743591652861, "grad_norm": 0.2677435576915741, "learning_rate": 9.360928309115869e-06, "loss": 1.2214, "step": 17948 }, { "epoch": 5.346041437852529, "grad_norm": 0.2829286754131317, "learning_rate": 9.359965717445088e-06, "loss": 1.2325, "step": 17949 }, { "epoch": 5.346339284052197, "grad_norm": 0.26164767146110535, "learning_rate": 9.359003131729117e-06, "loss": 1.2073, "step": 17950 }, { "epoch": 5.3466371302518665, "grad_norm": 0.2733970880508423, "learning_rate": 9.358040551976905e-06, "loss": 1.2282, "step": 17951 }, { "epoch": 5.346934976451535, "grad_norm": 0.25731900334358215, "learning_rate": 9.357077978197406e-06, "loss": 1.2132, "step": 17952 }, { "epoch": 5.347232822651203, "grad_norm": 0.2532481849193573, "learning_rate": 9.356115410399578e-06, "loss": 1.1961, "step": 17953 }, { "epoch": 5.347530668850872, "grad_norm": 0.24681970477104187, "learning_rate": 9.35515284859238e-06, "loss": 1.2083, "step": 17954 }, { "epoch": 5.347828515050541, "grad_norm": 0.26690852642059326, "learning_rate": 9.35419029278476e-06, "loss": 1.2168, "step": 17955 }, { "epoch": 5.348126361250209, "grad_norm": 0.3608032464981079, "learning_rate": 9.353227742985683e-06, "loss": 1.2013, "step": 17956 }, { "epoch": 5.348424207449878, "grad_norm": 0.3427022099494934, "learning_rate": 9.352265199204093e-06, "loss": 1.2331, "step": 17957 }, { "epoch": 5.348722053649547, "grad_norm": 0.27666598558425903, "learning_rate": 9.351302661448954e-06, "loss": 1.2087, "step": 17958 }, { "epoch": 5.349019899849216, "grad_norm": 0.30473482608795166, "learning_rate": 9.350340129729222e-06, "loss": 1.2099, "step": 17959 }, { "epoch": 5.349317746048884, "grad_norm": 0.34101781249046326, "learning_rate": 9.349377604053844e-06, "loss": 1.2122, "step": 17960 }, { "epoch": 5.349615592248552, "grad_norm": 0.5237231850624084, "learning_rate": 9.348415084431786e-06, "loss": 1.2027, "step": 17961 }, { "epoch": 5.349913438448222, "grad_norm": 0.2598274350166321, "learning_rate": 9.347452570871997e-06, "loss": 1.2104, "step": 17962 }, { "epoch": 5.35021128464789, "grad_norm": 0.629736602306366, "learning_rate": 9.346490063383428e-06, "loss": 1.2289, "step": 17963 }, { "epoch": 5.350509130847558, "grad_norm": 0.3449530005455017, "learning_rate": 9.345527561975042e-06, "loss": 1.215, "step": 17964 }, { "epoch": 5.3508069770472275, "grad_norm": 0.5009580254554749, "learning_rate": 9.344565066655794e-06, "loss": 1.22, "step": 17965 }, { "epoch": 5.351104823246896, "grad_norm": 0.3275929093360901, "learning_rate": 9.343602577434629e-06, "loss": 1.2253, "step": 17966 }, { "epoch": 5.351402669446564, "grad_norm": 0.45423343777656555, "learning_rate": 9.342640094320512e-06, "loss": 1.1974, "step": 17967 }, { "epoch": 5.351700515646233, "grad_norm": 0.3593066930770874, "learning_rate": 9.341677617322397e-06, "loss": 1.2035, "step": 17968 }, { "epoch": 5.351998361845902, "grad_norm": 0.3311804234981537, "learning_rate": 9.340715146449231e-06, "loss": 1.2092, "step": 17969 }, { "epoch": 5.352296208045571, "grad_norm": 0.26455381512641907, "learning_rate": 9.339752681709977e-06, "loss": 1.2205, "step": 17970 }, { "epoch": 5.352594054245239, "grad_norm": 0.5381209254264832, "learning_rate": 9.338790223113588e-06, "loss": 1.2136, "step": 17971 }, { "epoch": 5.352891900444908, "grad_norm": 0.36212044954299927, "learning_rate": 9.337827770669013e-06, "loss": 1.2138, "step": 17972 }, { "epoch": 5.353189746644577, "grad_norm": 0.37515610456466675, "learning_rate": 9.336865324385216e-06, "loss": 1.2114, "step": 17973 }, { "epoch": 5.353487592844245, "grad_norm": 0.27836519479751587, "learning_rate": 9.33590288427114e-06, "loss": 1.2183, "step": 17974 }, { "epoch": 5.3537854390439135, "grad_norm": 0.6861031651496887, "learning_rate": 9.334940450335751e-06, "loss": 1.2132, "step": 17975 }, { "epoch": 5.354083285243583, "grad_norm": 0.3119991719722748, "learning_rate": 9.333978022587996e-06, "loss": 1.2244, "step": 17976 }, { "epoch": 5.354381131443251, "grad_norm": 0.46049776673316956, "learning_rate": 9.33301560103683e-06, "loss": 1.2165, "step": 17977 }, { "epoch": 5.354678977642919, "grad_norm": 0.30692294239997864, "learning_rate": 9.332053185691213e-06, "loss": 1.2065, "step": 17978 }, { "epoch": 5.3549768238425886, "grad_norm": 0.4809039235115051, "learning_rate": 9.331090776560093e-06, "loss": 1.2108, "step": 17979 }, { "epoch": 5.355274670042257, "grad_norm": 0.37557461857795715, "learning_rate": 9.330128373652422e-06, "loss": 1.1953, "step": 17980 }, { "epoch": 5.355572516241925, "grad_norm": 0.3039226830005646, "learning_rate": 9.32916597697716e-06, "loss": 1.2384, "step": 17981 }, { "epoch": 5.355870362441594, "grad_norm": 0.368528813123703, "learning_rate": 9.328203586543263e-06, "loss": 1.1951, "step": 17982 }, { "epoch": 5.356168208641263, "grad_norm": 0.32374516129493713, "learning_rate": 9.327241202359676e-06, "loss": 1.2297, "step": 17983 }, { "epoch": 5.356466054840931, "grad_norm": 0.3582445979118347, "learning_rate": 9.326278824435362e-06, "loss": 1.2151, "step": 17984 }, { "epoch": 5.3567639010406, "grad_norm": 0.5023511052131653, "learning_rate": 9.325316452779272e-06, "loss": 1.213, "step": 17985 }, { "epoch": 5.357061747240269, "grad_norm": 0.28617364168167114, "learning_rate": 9.324354087400352e-06, "loss": 1.2169, "step": 17986 }, { "epoch": 5.357359593439938, "grad_norm": 0.5104385018348694, "learning_rate": 9.323391728307566e-06, "loss": 1.2146, "step": 17987 }, { "epoch": 5.357657439639606, "grad_norm": 0.2949276268482208, "learning_rate": 9.322429375509867e-06, "loss": 1.2286, "step": 17988 }, { "epoch": 5.3579552858392745, "grad_norm": 0.370175838470459, "learning_rate": 9.3214670290162e-06, "loss": 1.2187, "step": 17989 }, { "epoch": 5.358253132038944, "grad_norm": 0.36740320920944214, "learning_rate": 9.320504688835529e-06, "loss": 1.2088, "step": 17990 }, { "epoch": 5.358550978238612, "grad_norm": 0.34314897656440735, "learning_rate": 9.319542354976798e-06, "loss": 1.2278, "step": 17991 }, { "epoch": 5.35884882443828, "grad_norm": 0.42298799753189087, "learning_rate": 9.31858002744897e-06, "loss": 1.2119, "step": 17992 }, { "epoch": 5.35914667063795, "grad_norm": 0.2853969931602478, "learning_rate": 9.31761770626099e-06, "loss": 1.2096, "step": 17993 }, { "epoch": 5.359444516837618, "grad_norm": 0.41448962688446045, "learning_rate": 9.316655391421813e-06, "loss": 1.2232, "step": 17994 }, { "epoch": 5.359742363037286, "grad_norm": 0.2936389744281769, "learning_rate": 9.3156930829404e-06, "loss": 1.2195, "step": 17995 }, { "epoch": 5.3600402092369555, "grad_norm": 0.4390643835067749, "learning_rate": 9.314730780825696e-06, "loss": 1.2135, "step": 17996 }, { "epoch": 5.360338055436624, "grad_norm": 0.3160460889339447, "learning_rate": 9.313768485086654e-06, "loss": 1.2192, "step": 17997 }, { "epoch": 5.360635901636293, "grad_norm": 0.3259376287460327, "learning_rate": 9.312806195732234e-06, "loss": 1.2287, "step": 17998 }, { "epoch": 5.360933747835961, "grad_norm": 0.3526167571544647, "learning_rate": 9.311843912771381e-06, "loss": 1.203, "step": 17999 }, { "epoch": 5.36123159403563, "grad_norm": 0.24719078838825226, "learning_rate": 9.310881636213049e-06, "loss": 1.2155, "step": 18000 }, { "epoch": 5.36123159403563, "eval_loss": 1.3248833417892456, "eval_runtime": 23.5169, "eval_samples_per_second": 73.734, "eval_steps_per_second": 4.635, "step": 18000 }, { "epoch": 5.361529440235299, "grad_norm": 0.6050386428833008, "learning_rate": 9.3099193660662e-06, "loss": 1.2225, "step": 18001 }, { "epoch": 5.361827286434967, "grad_norm": 0.3603726625442505, "learning_rate": 9.30895710233978e-06, "loss": 1.2119, "step": 18002 }, { "epoch": 5.3621251326346355, "grad_norm": 0.49911609292030334, "learning_rate": 9.307994845042737e-06, "loss": 1.2209, "step": 18003 }, { "epoch": 5.362422978834305, "grad_norm": 0.3440111577510834, "learning_rate": 9.307032594184033e-06, "loss": 1.207, "step": 18004 }, { "epoch": 5.362720825033973, "grad_norm": 0.48627927899360657, "learning_rate": 9.306070349772613e-06, "loss": 1.2057, "step": 18005 }, { "epoch": 5.363018671233641, "grad_norm": 0.29417282342910767, "learning_rate": 9.305108111817433e-06, "loss": 1.2203, "step": 18006 }, { "epoch": 5.363316517433311, "grad_norm": 0.5244109630584717, "learning_rate": 9.304145880327449e-06, "loss": 1.2114, "step": 18007 }, { "epoch": 5.363614363632979, "grad_norm": 0.3801937401294708, "learning_rate": 9.303183655311606e-06, "loss": 1.2046, "step": 18008 }, { "epoch": 5.363912209832647, "grad_norm": 0.3669234812259674, "learning_rate": 9.302221436778866e-06, "loss": 1.1938, "step": 18009 }, { "epoch": 5.3642100560323165, "grad_norm": 0.3036079704761505, "learning_rate": 9.301259224738171e-06, "loss": 1.2144, "step": 18010 }, { "epoch": 5.364507902231985, "grad_norm": 0.29160866141319275, "learning_rate": 9.300297019198481e-06, "loss": 1.2146, "step": 18011 }, { "epoch": 5.364805748431653, "grad_norm": 0.2609872817993164, "learning_rate": 9.299334820168744e-06, "loss": 1.2157, "step": 18012 }, { "epoch": 5.365103594631322, "grad_norm": 0.31827905774116516, "learning_rate": 9.298372627657916e-06, "loss": 1.2018, "step": 18013 }, { "epoch": 5.365401440830991, "grad_norm": 0.2499566376209259, "learning_rate": 9.297410441674943e-06, "loss": 1.218, "step": 18014 }, { "epoch": 5.36569928703066, "grad_norm": 0.2731861174106598, "learning_rate": 9.296448262228786e-06, "loss": 1.2339, "step": 18015 }, { "epoch": 5.365997133230328, "grad_norm": 0.31170159578323364, "learning_rate": 9.295486089328389e-06, "loss": 1.2083, "step": 18016 }, { "epoch": 5.366294979429997, "grad_norm": 0.3354482352733612, "learning_rate": 9.294523922982704e-06, "loss": 1.2025, "step": 18017 }, { "epoch": 5.366592825629666, "grad_norm": 0.24018102884292603, "learning_rate": 9.293561763200689e-06, "loss": 1.2288, "step": 18018 }, { "epoch": 5.366890671829334, "grad_norm": 0.2768344283103943, "learning_rate": 9.292599609991294e-06, "loss": 1.216, "step": 18019 }, { "epoch": 5.367188518029002, "grad_norm": 0.2534888684749603, "learning_rate": 9.291637463363465e-06, "loss": 1.2002, "step": 18020 }, { "epoch": 5.367486364228672, "grad_norm": 0.2585897743701935, "learning_rate": 9.290675323326163e-06, "loss": 1.201, "step": 18021 }, { "epoch": 5.36778421042834, "grad_norm": 0.25801509618759155, "learning_rate": 9.289713189888331e-06, "loss": 1.2112, "step": 18022 }, { "epoch": 5.368082056628008, "grad_norm": 0.2865597903728485, "learning_rate": 9.288751063058922e-06, "loss": 1.2266, "step": 18023 }, { "epoch": 5.3683799028276775, "grad_norm": 0.3403066098690033, "learning_rate": 9.287788942846895e-06, "loss": 1.2114, "step": 18024 }, { "epoch": 5.368677749027346, "grad_norm": 0.3064972162246704, "learning_rate": 9.286826829261191e-06, "loss": 1.227, "step": 18025 }, { "epoch": 5.368975595227015, "grad_norm": 0.2508450448513031, "learning_rate": 9.285864722310771e-06, "loss": 1.2192, "step": 18026 }, { "epoch": 5.369273441426683, "grad_norm": 0.2641198933124542, "learning_rate": 9.284902622004583e-06, "loss": 1.2297, "step": 18027 }, { "epoch": 5.369571287626352, "grad_norm": 0.270623117685318, "learning_rate": 9.283940528351572e-06, "loss": 1.1957, "step": 18028 }, { "epoch": 5.369869133826021, "grad_norm": 0.2816285192966461, "learning_rate": 9.282978441360695e-06, "loss": 1.2281, "step": 18029 }, { "epoch": 5.370166980025689, "grad_norm": 0.265140563249588, "learning_rate": 9.282016361040908e-06, "loss": 1.219, "step": 18030 }, { "epoch": 5.370464826225358, "grad_norm": 0.259594589471817, "learning_rate": 9.28105428740115e-06, "loss": 1.2095, "step": 18031 }, { "epoch": 5.370762672425027, "grad_norm": 0.297926664352417, "learning_rate": 9.280092220450383e-06, "loss": 1.2337, "step": 18032 }, { "epoch": 5.371060518624695, "grad_norm": 0.27153486013412476, "learning_rate": 9.279130160197552e-06, "loss": 1.2038, "step": 18033 }, { "epoch": 5.3713583648243635, "grad_norm": 0.28715774416923523, "learning_rate": 9.278168106651609e-06, "loss": 1.1976, "step": 18034 }, { "epoch": 5.371656211024033, "grad_norm": 0.3049171268939972, "learning_rate": 9.277206059821505e-06, "loss": 1.2263, "step": 18035 }, { "epoch": 5.371954057223701, "grad_norm": 0.3039532005786896, "learning_rate": 9.276244019716194e-06, "loss": 1.2125, "step": 18036 }, { "epoch": 5.37225190342337, "grad_norm": 0.2982765734195709, "learning_rate": 9.27528198634462e-06, "loss": 1.2146, "step": 18037 }, { "epoch": 5.3725497496230386, "grad_norm": 0.253120094537735, "learning_rate": 9.274319959715742e-06, "loss": 1.2187, "step": 18038 }, { "epoch": 5.372847595822707, "grad_norm": 0.26992347836494446, "learning_rate": 9.273357939838499e-06, "loss": 1.2178, "step": 18039 }, { "epoch": 5.373145442022376, "grad_norm": 0.2781268060207367, "learning_rate": 9.272395926721855e-06, "loss": 1.2198, "step": 18040 }, { "epoch": 5.373443288222044, "grad_norm": 0.2756299376487732, "learning_rate": 9.271433920374751e-06, "loss": 1.214, "step": 18041 }, { "epoch": 5.373741134421713, "grad_norm": 0.2943572700023651, "learning_rate": 9.270471920806138e-06, "loss": 1.2098, "step": 18042 }, { "epoch": 5.374038980621382, "grad_norm": 0.2673749625682831, "learning_rate": 9.269509928024973e-06, "loss": 1.1977, "step": 18043 }, { "epoch": 5.37433682682105, "grad_norm": 0.2815070152282715, "learning_rate": 9.268547942040204e-06, "loss": 1.2069, "step": 18044 }, { "epoch": 5.374634673020719, "grad_norm": 0.26712632179260254, "learning_rate": 9.267585962860772e-06, "loss": 1.2251, "step": 18045 }, { "epoch": 5.374932519220388, "grad_norm": 0.2705835998058319, "learning_rate": 9.266623990495637e-06, "loss": 1.2116, "step": 18046 }, { "epoch": 5.375230365420056, "grad_norm": 0.2494700849056244, "learning_rate": 9.26566202495375e-06, "loss": 1.214, "step": 18047 }, { "epoch": 5.3755282116197245, "grad_norm": 0.29077938199043274, "learning_rate": 9.264700066244053e-06, "loss": 1.2048, "step": 18048 }, { "epoch": 5.375826057819394, "grad_norm": 0.2771974802017212, "learning_rate": 9.263738114375504e-06, "loss": 1.1968, "step": 18049 }, { "epoch": 5.376123904019062, "grad_norm": 0.3069455623626709, "learning_rate": 9.26277616935705e-06, "loss": 1.2043, "step": 18050 }, { "epoch": 5.37642175021873, "grad_norm": 0.5123701691627502, "learning_rate": 9.261814231197634e-06, "loss": 1.2239, "step": 18051 }, { "epoch": 5.3767195964184, "grad_norm": 0.461436003446579, "learning_rate": 9.260852299906216e-06, "loss": 1.2143, "step": 18052 }, { "epoch": 5.377017442618068, "grad_norm": 0.3305855691432953, "learning_rate": 9.259890375491742e-06, "loss": 1.2199, "step": 18053 }, { "epoch": 5.377315288817737, "grad_norm": 0.2641879618167877, "learning_rate": 9.258928457963158e-06, "loss": 1.2123, "step": 18054 }, { "epoch": 5.3776131350174055, "grad_norm": 0.28682488203048706, "learning_rate": 9.25796654732942e-06, "loss": 1.204, "step": 18055 }, { "epoch": 5.377910981217074, "grad_norm": 0.3019677698612213, "learning_rate": 9.257004643599471e-06, "loss": 1.1973, "step": 18056 }, { "epoch": 5.378208827416743, "grad_norm": 0.24850806593894958, "learning_rate": 9.256042746782267e-06, "loss": 1.1932, "step": 18057 }, { "epoch": 5.378506673616411, "grad_norm": 0.25137320160865784, "learning_rate": 9.255080856886752e-06, "loss": 1.1915, "step": 18058 }, { "epoch": 5.37880451981608, "grad_norm": 0.29007431864738464, "learning_rate": 9.254118973921877e-06, "loss": 1.2136, "step": 18059 }, { "epoch": 5.379102366015749, "grad_norm": 0.2802312672138214, "learning_rate": 9.253157097896594e-06, "loss": 1.1988, "step": 18060 }, { "epoch": 5.379400212215417, "grad_norm": 0.36415979266166687, "learning_rate": 9.252195228819851e-06, "loss": 1.2182, "step": 18061 }, { "epoch": 5.3796980584150855, "grad_norm": 0.5965977907180786, "learning_rate": 9.251233366700591e-06, "loss": 1.235, "step": 18062 }, { "epoch": 5.379995904614755, "grad_norm": 0.38883692026138306, "learning_rate": 9.250271511547772e-06, "loss": 1.193, "step": 18063 }, { "epoch": 5.380293750814423, "grad_norm": 0.3269845247268677, "learning_rate": 9.249309663370337e-06, "loss": 1.2292, "step": 18064 }, { "epoch": 5.380591597014092, "grad_norm": 0.34158313274383545, "learning_rate": 9.248347822177234e-06, "loss": 1.2024, "step": 18065 }, { "epoch": 5.380889443213761, "grad_norm": 0.31512251496315, "learning_rate": 9.24738598797742e-06, "loss": 1.2162, "step": 18066 }, { "epoch": 5.381187289413429, "grad_norm": 0.2715738117694855, "learning_rate": 9.246424160779837e-06, "loss": 1.2141, "step": 18067 }, { "epoch": 5.381485135613098, "grad_norm": 0.3289284110069275, "learning_rate": 9.245462340593433e-06, "loss": 1.2114, "step": 18068 }, { "epoch": 5.3817829818127665, "grad_norm": 0.30728328227996826, "learning_rate": 9.24450052742716e-06, "loss": 1.2002, "step": 18069 }, { "epoch": 5.382080828012435, "grad_norm": 0.2602955102920532, "learning_rate": 9.243538721289967e-06, "loss": 1.2094, "step": 18070 }, { "epoch": 5.382378674212104, "grad_norm": 0.3036993145942688, "learning_rate": 9.242576922190799e-06, "loss": 1.2198, "step": 18071 }, { "epoch": 5.382676520411772, "grad_norm": 0.27439242601394653, "learning_rate": 9.241615130138609e-06, "loss": 1.2206, "step": 18072 }, { "epoch": 5.382974366611441, "grad_norm": 0.41943836212158203, "learning_rate": 9.240653345142339e-06, "loss": 1.2101, "step": 18073 }, { "epoch": 5.38327221281111, "grad_norm": 0.30936744809150696, "learning_rate": 9.239691567210944e-06, "loss": 1.2138, "step": 18074 }, { "epoch": 5.383570059010778, "grad_norm": 0.4905613958835602, "learning_rate": 9.23872979635337e-06, "loss": 1.2157, "step": 18075 }, { "epoch": 5.383867905210447, "grad_norm": 0.461593359708786, "learning_rate": 9.237768032578562e-06, "loss": 1.2015, "step": 18076 }, { "epoch": 5.384165751410116, "grad_norm": 0.2584933042526245, "learning_rate": 9.236806275895474e-06, "loss": 1.2083, "step": 18077 }, { "epoch": 5.384463597609784, "grad_norm": 0.3259392976760864, "learning_rate": 9.235844526313052e-06, "loss": 1.2151, "step": 18078 }, { "epoch": 5.384761443809452, "grad_norm": 0.26622921228408813, "learning_rate": 9.234882783840237e-06, "loss": 1.2024, "step": 18079 }, { "epoch": 5.385059290009122, "grad_norm": 0.32340699434280396, "learning_rate": 9.23392104848599e-06, "loss": 1.2195, "step": 18080 }, { "epoch": 5.38535713620879, "grad_norm": 0.26871415972709656, "learning_rate": 9.232959320259246e-06, "loss": 1.212, "step": 18081 }, { "epoch": 5.385654982408459, "grad_norm": 0.2671147286891937, "learning_rate": 9.231997599168958e-06, "loss": 1.2365, "step": 18082 }, { "epoch": 5.3859528286081275, "grad_norm": 0.2677510678768158, "learning_rate": 9.231035885224078e-06, "loss": 1.2061, "step": 18083 }, { "epoch": 5.386250674807796, "grad_norm": 0.2720658779144287, "learning_rate": 9.230074178433552e-06, "loss": 1.2226, "step": 18084 }, { "epoch": 5.386548521007465, "grad_norm": 0.29764074087142944, "learning_rate": 9.22911247880632e-06, "loss": 1.2092, "step": 18085 }, { "epoch": 5.386846367207133, "grad_norm": 0.2718268632888794, "learning_rate": 9.228150786351338e-06, "loss": 1.2151, "step": 18086 }, { "epoch": 5.387144213406802, "grad_norm": 0.2869923412799835, "learning_rate": 9.22718910107755e-06, "loss": 1.2134, "step": 18087 }, { "epoch": 5.387442059606471, "grad_norm": 0.32346364855766296, "learning_rate": 9.226227422993902e-06, "loss": 1.2153, "step": 18088 }, { "epoch": 5.387739905806139, "grad_norm": 0.29786375164985657, "learning_rate": 9.225265752109348e-06, "loss": 1.2364, "step": 18089 }, { "epoch": 5.388037752005808, "grad_norm": 0.34467774629592896, "learning_rate": 9.224304088432825e-06, "loss": 1.2188, "step": 18090 }, { "epoch": 5.388335598205477, "grad_norm": 0.3004086911678314, "learning_rate": 9.223342431973291e-06, "loss": 1.2172, "step": 18091 }, { "epoch": 5.388633444405145, "grad_norm": 0.26614007353782654, "learning_rate": 9.222380782739684e-06, "loss": 1.2121, "step": 18092 }, { "epoch": 5.388931290604814, "grad_norm": 0.33353137969970703, "learning_rate": 9.221419140740956e-06, "loss": 1.2229, "step": 18093 }, { "epoch": 5.389229136804483, "grad_norm": 0.3915778696537018, "learning_rate": 9.220457505986054e-06, "loss": 1.2106, "step": 18094 }, { "epoch": 5.389526983004151, "grad_norm": 0.4370432198047638, "learning_rate": 9.219495878483926e-06, "loss": 1.2036, "step": 18095 }, { "epoch": 5.38982482920382, "grad_norm": 0.29130157828330994, "learning_rate": 9.218534258243513e-06, "loss": 1.1929, "step": 18096 }, { "epoch": 5.3901226754034886, "grad_norm": 0.8444786667823792, "learning_rate": 9.21757264527377e-06, "loss": 1.2169, "step": 18097 }, { "epoch": 5.390420521603157, "grad_norm": 0.5754273533821106, "learning_rate": 9.216611039583634e-06, "loss": 1.2039, "step": 18098 }, { "epoch": 5.390718367802826, "grad_norm": 0.48834800720214844, "learning_rate": 9.21564944118206e-06, "loss": 1.2149, "step": 18099 }, { "epoch": 5.391016214002494, "grad_norm": 0.5807204246520996, "learning_rate": 9.21468785007799e-06, "loss": 1.2197, "step": 18100 }, { "epoch": 5.391314060202163, "grad_norm": 0.31268003582954407, "learning_rate": 9.213726266280376e-06, "loss": 1.2186, "step": 18101 }, { "epoch": 5.391611906401832, "grad_norm": 0.3523191809654236, "learning_rate": 9.212764689798154e-06, "loss": 1.2136, "step": 18102 }, { "epoch": 5.3919097526015, "grad_norm": 0.42938998341560364, "learning_rate": 9.211803120640284e-06, "loss": 1.1947, "step": 18103 }, { "epoch": 5.3922075988011695, "grad_norm": 0.3200710117816925, "learning_rate": 9.210841558815701e-06, "loss": 1.1886, "step": 18104 }, { "epoch": 5.392505445000838, "grad_norm": 0.5177934765815735, "learning_rate": 9.209880004333356e-06, "loss": 1.2166, "step": 18105 }, { "epoch": 5.392803291200506, "grad_norm": 0.3521498143672943, "learning_rate": 9.208918457202196e-06, "loss": 1.2151, "step": 18106 }, { "epoch": 5.393101137400175, "grad_norm": 0.34822359681129456, "learning_rate": 9.207956917431164e-06, "loss": 1.2223, "step": 18107 }, { "epoch": 5.393398983599844, "grad_norm": 0.26184678077697754, "learning_rate": 9.206995385029212e-06, "loss": 1.2318, "step": 18108 }, { "epoch": 5.393696829799512, "grad_norm": 0.37167298793792725, "learning_rate": 9.20603386000528e-06, "loss": 1.2011, "step": 18109 }, { "epoch": 5.393994675999181, "grad_norm": 0.3147827088832855, "learning_rate": 9.205072342368313e-06, "loss": 1.1932, "step": 18110 }, { "epoch": 5.39429252219885, "grad_norm": 0.3623134195804596, "learning_rate": 9.204110832127262e-06, "loss": 1.2324, "step": 18111 }, { "epoch": 5.394590368398518, "grad_norm": 0.2865893244743347, "learning_rate": 9.203149329291072e-06, "loss": 1.2222, "step": 18112 }, { "epoch": 5.394888214598187, "grad_norm": 0.3469681441783905, "learning_rate": 9.202187833868684e-06, "loss": 1.206, "step": 18113 }, { "epoch": 5.3951860607978555, "grad_norm": 0.3164348304271698, "learning_rate": 9.20122634586905e-06, "loss": 1.2132, "step": 18114 }, { "epoch": 5.395483906997524, "grad_norm": 0.49953708052635193, "learning_rate": 9.200264865301112e-06, "loss": 1.2122, "step": 18115 }, { "epoch": 5.395781753197193, "grad_norm": 0.25462424755096436, "learning_rate": 9.199303392173813e-06, "loss": 1.2072, "step": 18116 }, { "epoch": 5.396079599396861, "grad_norm": 0.526871919631958, "learning_rate": 9.198341926496103e-06, "loss": 1.2332, "step": 18117 }, { "epoch": 5.39637744559653, "grad_norm": 0.3667275607585907, "learning_rate": 9.197380468276928e-06, "loss": 1.2296, "step": 18118 }, { "epoch": 5.396675291796199, "grad_norm": 0.4166170358657837, "learning_rate": 9.196419017525226e-06, "loss": 1.2276, "step": 18119 }, { "epoch": 5.396973137995867, "grad_norm": 0.4676266610622406, "learning_rate": 9.195457574249952e-06, "loss": 1.2217, "step": 18120 }, { "epoch": 5.397270984195536, "grad_norm": 0.26535314321517944, "learning_rate": 9.194496138460045e-06, "loss": 1.2196, "step": 18121 }, { "epoch": 5.397568830395205, "grad_norm": 0.3117094039916992, "learning_rate": 9.19353471016445e-06, "loss": 1.2165, "step": 18122 }, { "epoch": 5.397866676594873, "grad_norm": 0.2659699022769928, "learning_rate": 9.192573289372115e-06, "loss": 1.2111, "step": 18123 }, { "epoch": 5.398164522794542, "grad_norm": 0.290687620639801, "learning_rate": 9.19161187609198e-06, "loss": 1.2025, "step": 18124 }, { "epoch": 5.398462368994211, "grad_norm": 0.2657442092895508, "learning_rate": 9.190650470332998e-06, "loss": 1.1948, "step": 18125 }, { "epoch": 5.398760215193879, "grad_norm": 0.27361974120140076, "learning_rate": 9.18968907210411e-06, "loss": 1.2114, "step": 18126 }, { "epoch": 5.399058061393548, "grad_norm": 0.2510913908481598, "learning_rate": 9.188727681414253e-06, "loss": 1.2179, "step": 18127 }, { "epoch": 5.3993559075932165, "grad_norm": 0.27142348885536194, "learning_rate": 9.187766298272382e-06, "loss": 1.2206, "step": 18128 }, { "epoch": 5.399653753792885, "grad_norm": 0.29261237382888794, "learning_rate": 9.18680492268744e-06, "loss": 1.217, "step": 18129 }, { "epoch": 5.399951599992554, "grad_norm": 0.26918327808380127, "learning_rate": 9.185843554668367e-06, "loss": 1.2178, "step": 18130 }, { "epoch": 5.400249446192222, "grad_norm": 0.26643863320350647, "learning_rate": 9.184882194224114e-06, "loss": 1.2121, "step": 18131 }, { "epoch": 5.400547292391892, "grad_norm": 0.38250744342803955, "learning_rate": 9.18392084136362e-06, "loss": 1.1919, "step": 18132 }, { "epoch": 5.40084513859156, "grad_norm": 0.30113449692726135, "learning_rate": 9.182959496095828e-06, "loss": 1.2066, "step": 18133 }, { "epoch": 5.401142984791228, "grad_norm": 0.2831449806690216, "learning_rate": 9.181998158429687e-06, "loss": 1.218, "step": 18134 }, { "epoch": 5.4014408309908974, "grad_norm": 0.2621733248233795, "learning_rate": 9.18103682837414e-06, "loss": 1.2143, "step": 18135 }, { "epoch": 5.401738677190566, "grad_norm": 0.31273043155670166, "learning_rate": 9.180075505938128e-06, "loss": 1.2174, "step": 18136 }, { "epoch": 5.402036523390234, "grad_norm": 0.2662215828895569, "learning_rate": 9.179114191130601e-06, "loss": 1.209, "step": 18137 }, { "epoch": 5.402334369589903, "grad_norm": 0.27449601888656616, "learning_rate": 9.178152883960499e-06, "loss": 1.2049, "step": 18138 }, { "epoch": 5.402632215789572, "grad_norm": 0.27994704246520996, "learning_rate": 9.177191584436761e-06, "loss": 1.2015, "step": 18139 }, { "epoch": 5.40293006198924, "grad_norm": 0.2788822650909424, "learning_rate": 9.176230292568338e-06, "loss": 1.1974, "step": 18140 }, { "epoch": 5.403227908188909, "grad_norm": 0.2680191099643707, "learning_rate": 9.175269008364171e-06, "loss": 1.2183, "step": 18141 }, { "epoch": 5.4035257543885775, "grad_norm": 0.2693394720554352, "learning_rate": 9.174307731833209e-06, "loss": 1.2179, "step": 18142 }, { "epoch": 5.403823600588246, "grad_norm": 0.29945388436317444, "learning_rate": 9.173346462984389e-06, "loss": 1.2279, "step": 18143 }, { "epoch": 5.404121446787915, "grad_norm": 0.35322317481040955, "learning_rate": 9.172385201826655e-06, "loss": 1.2076, "step": 18144 }, { "epoch": 5.404419292987583, "grad_norm": 0.2653945982456207, "learning_rate": 9.171423948368953e-06, "loss": 1.2353, "step": 18145 }, { "epoch": 5.404717139187252, "grad_norm": 0.29993048310279846, "learning_rate": 9.170462702620226e-06, "loss": 1.2341, "step": 18146 }, { "epoch": 5.405014985386921, "grad_norm": 0.2617839574813843, "learning_rate": 9.169501464589415e-06, "loss": 1.1914, "step": 18147 }, { "epoch": 5.405312831586589, "grad_norm": 0.3259333670139313, "learning_rate": 9.168540234285467e-06, "loss": 1.2127, "step": 18148 }, { "epoch": 5.4056106777862585, "grad_norm": 0.27893179655075073, "learning_rate": 9.167579011717325e-06, "loss": 1.2069, "step": 18149 }, { "epoch": 5.405908523985927, "grad_norm": 0.4722149968147278, "learning_rate": 9.166617796893924e-06, "loss": 1.2231, "step": 18150 }, { "epoch": 5.406206370185595, "grad_norm": 0.38504910469055176, "learning_rate": 9.165656589824217e-06, "loss": 1.2233, "step": 18151 }, { "epoch": 5.406504216385264, "grad_norm": 0.32826289534568787, "learning_rate": 9.164695390517146e-06, "loss": 1.2198, "step": 18152 }, { "epoch": 5.406802062584933, "grad_norm": 0.5815978646278381, "learning_rate": 9.163734198981646e-06, "loss": 1.2187, "step": 18153 }, { "epoch": 5.407099908784601, "grad_norm": 0.3500232398509979, "learning_rate": 9.162773015226668e-06, "loss": 1.2191, "step": 18154 }, { "epoch": 5.40739775498427, "grad_norm": 0.35605713725090027, "learning_rate": 9.161811839261148e-06, "loss": 1.2168, "step": 18155 }, { "epoch": 5.4076956011839385, "grad_norm": 0.25141337513923645, "learning_rate": 9.160850671094039e-06, "loss": 1.1975, "step": 18156 }, { "epoch": 5.407993447383607, "grad_norm": 0.528760552406311, "learning_rate": 9.159889510734272e-06, "loss": 1.2074, "step": 18157 }, { "epoch": 5.408291293583276, "grad_norm": 0.33496665954589844, "learning_rate": 9.158928358190795e-06, "loss": 1.2165, "step": 18158 }, { "epoch": 5.408589139782944, "grad_norm": 0.41628533601760864, "learning_rate": 9.157967213472551e-06, "loss": 1.2201, "step": 18159 }, { "epoch": 5.408886985982614, "grad_norm": 0.2675477862358093, "learning_rate": 9.157006076588482e-06, "loss": 1.2163, "step": 18160 }, { "epoch": 5.409184832182282, "grad_norm": 0.5859951376914978, "learning_rate": 9.156044947547527e-06, "loss": 1.2208, "step": 18161 }, { "epoch": 5.40948267838195, "grad_norm": 0.29776814579963684, "learning_rate": 9.155083826358633e-06, "loss": 1.2189, "step": 18162 }, { "epoch": 5.4097805245816195, "grad_norm": 0.49345192313194275, "learning_rate": 9.15412271303074e-06, "loss": 1.2356, "step": 18163 }, { "epoch": 5.410078370781288, "grad_norm": 0.30862030386924744, "learning_rate": 9.153161607572787e-06, "loss": 1.2398, "step": 18164 }, { "epoch": 5.410376216980956, "grad_norm": 0.43222489953041077, "learning_rate": 9.152200509993723e-06, "loss": 1.2104, "step": 18165 }, { "epoch": 5.410674063180625, "grad_norm": 0.39378243684768677, "learning_rate": 9.151239420302486e-06, "loss": 1.2025, "step": 18166 }, { "epoch": 5.410971909380294, "grad_norm": 0.46720996499061584, "learning_rate": 9.150278338508014e-06, "loss": 1.2142, "step": 18167 }, { "epoch": 5.411269755579962, "grad_norm": 0.45590829849243164, "learning_rate": 9.149317264619258e-06, "loss": 1.2117, "step": 18168 }, { "epoch": 5.411567601779631, "grad_norm": 0.35495367646217346, "learning_rate": 9.14835619864515e-06, "loss": 1.2097, "step": 18169 }, { "epoch": 5.4118654479793, "grad_norm": 0.5380088090896606, "learning_rate": 9.147395140594636e-06, "loss": 1.2135, "step": 18170 }, { "epoch": 5.412163294178969, "grad_norm": 0.28789058327674866, "learning_rate": 9.146434090476662e-06, "loss": 1.2117, "step": 18171 }, { "epoch": 5.412461140378637, "grad_norm": 0.526710033416748, "learning_rate": 9.14547304830016e-06, "loss": 1.2172, "step": 18172 }, { "epoch": 5.4127589865783055, "grad_norm": 0.4094356596469879, "learning_rate": 9.14451201407408e-06, "loss": 1.2259, "step": 18173 }, { "epoch": 5.413056832777975, "grad_norm": 0.6341694593429565, "learning_rate": 9.143550987807362e-06, "loss": 1.2236, "step": 18174 }, { "epoch": 5.413354678977643, "grad_norm": 0.3391723036766052, "learning_rate": 9.142589969508939e-06, "loss": 1.2087, "step": 18175 }, { "epoch": 5.413652525177311, "grad_norm": 0.3829106092453003, "learning_rate": 9.14162895918776e-06, "loss": 1.2058, "step": 18176 }, { "epoch": 5.4139503713769805, "grad_norm": 0.4187778830528259, "learning_rate": 9.14066795685277e-06, "loss": 1.2121, "step": 18177 }, { "epoch": 5.414248217576649, "grad_norm": 0.3036881387233734, "learning_rate": 9.139706962512896e-06, "loss": 1.2073, "step": 18178 }, { "epoch": 5.414546063776317, "grad_norm": 0.5648795366287231, "learning_rate": 9.138745976177095e-06, "loss": 1.2092, "step": 18179 }, { "epoch": 5.414843909975986, "grad_norm": 0.3204457461833954, "learning_rate": 9.137784997854296e-06, "loss": 1.2077, "step": 18180 }, { "epoch": 5.415141756175655, "grad_norm": 0.34159207344055176, "learning_rate": 9.136824027553446e-06, "loss": 1.2049, "step": 18181 }, { "epoch": 5.415439602375323, "grad_norm": 0.3680896759033203, "learning_rate": 9.135863065283483e-06, "loss": 1.2051, "step": 18182 }, { "epoch": 5.415737448574992, "grad_norm": 0.2764698565006256, "learning_rate": 9.134902111053351e-06, "loss": 1.2079, "step": 18183 }, { "epoch": 5.416035294774661, "grad_norm": 0.3397405445575714, "learning_rate": 9.133941164871986e-06, "loss": 1.2207, "step": 18184 }, { "epoch": 5.416333140974329, "grad_norm": 0.4269631505012512, "learning_rate": 9.132980226748332e-06, "loss": 1.2093, "step": 18185 }, { "epoch": 5.416630987173998, "grad_norm": 0.26480230689048767, "learning_rate": 9.132019296691328e-06, "loss": 1.2033, "step": 18186 }, { "epoch": 5.4169288333736665, "grad_norm": 0.4031720459461212, "learning_rate": 9.131058374709913e-06, "loss": 1.2122, "step": 18187 }, { "epoch": 5.417226679573336, "grad_norm": 0.31415700912475586, "learning_rate": 9.130097460813033e-06, "loss": 1.2125, "step": 18188 }, { "epoch": 5.417524525773004, "grad_norm": 0.31522077322006226, "learning_rate": 9.129136555009618e-06, "loss": 1.226, "step": 18189 }, { "epoch": 5.417822371972672, "grad_norm": 0.38622716069221497, "learning_rate": 9.12817565730862e-06, "loss": 1.2163, "step": 18190 }, { "epoch": 5.418120218172342, "grad_norm": 0.3281724452972412, "learning_rate": 9.127214767718974e-06, "loss": 1.2082, "step": 18191 }, { "epoch": 5.41841806437201, "grad_norm": 0.4519134759902954, "learning_rate": 9.126253886249615e-06, "loss": 1.2136, "step": 18192 }, { "epoch": 5.418715910571678, "grad_norm": 0.2930494248867035, "learning_rate": 9.125293012909488e-06, "loss": 1.211, "step": 18193 }, { "epoch": 5.4190137567713474, "grad_norm": 0.3956511616706848, "learning_rate": 9.124332147707536e-06, "loss": 1.1966, "step": 18194 }, { "epoch": 5.419311602971016, "grad_norm": 0.2750784456729889, "learning_rate": 9.12337129065269e-06, "loss": 1.2092, "step": 18195 }, { "epoch": 5.419609449170684, "grad_norm": 0.28932952880859375, "learning_rate": 9.1224104417539e-06, "loss": 1.2123, "step": 18196 }, { "epoch": 5.419907295370353, "grad_norm": 0.3183478116989136, "learning_rate": 9.1214496010201e-06, "loss": 1.2372, "step": 18197 }, { "epoch": 5.420205141570022, "grad_norm": 0.334583044052124, "learning_rate": 9.120488768460225e-06, "loss": 1.2048, "step": 18198 }, { "epoch": 5.420502987769691, "grad_norm": 0.283008873462677, "learning_rate": 9.119527944083221e-06, "loss": 1.2254, "step": 18199 }, { "epoch": 5.420800833969359, "grad_norm": 0.2623262107372284, "learning_rate": 9.118567127898029e-06, "loss": 1.2186, "step": 18200 }, { "epoch": 5.4210986801690275, "grad_norm": 0.24411393702030182, "learning_rate": 9.11760631991358e-06, "loss": 1.1984, "step": 18201 }, { "epoch": 5.421396526368697, "grad_norm": 0.29732614755630493, "learning_rate": 9.116645520138825e-06, "loss": 1.2079, "step": 18202 }, { "epoch": 5.421694372568365, "grad_norm": 0.2932446300983429, "learning_rate": 9.11568472858269e-06, "loss": 1.2149, "step": 18203 }, { "epoch": 5.421992218768033, "grad_norm": 0.2652064561843872, "learning_rate": 9.114723945254124e-06, "loss": 1.2271, "step": 18204 }, { "epoch": 5.422290064967703, "grad_norm": 0.2633882761001587, "learning_rate": 9.113763170162063e-06, "loss": 1.2182, "step": 18205 }, { "epoch": 5.422587911167371, "grad_norm": 0.2902883291244507, "learning_rate": 9.11280240331544e-06, "loss": 1.2125, "step": 18206 }, { "epoch": 5.422885757367039, "grad_norm": 0.27402180433273315, "learning_rate": 9.111841644723206e-06, "loss": 1.2167, "step": 18207 }, { "epoch": 5.4231836035667085, "grad_norm": 0.26779574155807495, "learning_rate": 9.110880894394293e-06, "loss": 1.2106, "step": 18208 }, { "epoch": 5.423481449766377, "grad_norm": 0.31007951498031616, "learning_rate": 9.109920152337636e-06, "loss": 1.217, "step": 18209 }, { "epoch": 5.423779295966045, "grad_norm": 0.32808202505111694, "learning_rate": 9.108959418562181e-06, "loss": 1.2104, "step": 18210 }, { "epoch": 5.424077142165714, "grad_norm": 0.2879560589790344, "learning_rate": 9.10799869307686e-06, "loss": 1.2195, "step": 18211 }, { "epoch": 5.424374988365383, "grad_norm": 0.33658215403556824, "learning_rate": 9.107037975890615e-06, "loss": 1.2163, "step": 18212 }, { "epoch": 5.424672834565051, "grad_norm": 0.25510627031326294, "learning_rate": 9.106077267012386e-06, "loss": 1.2117, "step": 18213 }, { "epoch": 5.42497068076472, "grad_norm": 0.3268873989582062, "learning_rate": 9.10511656645111e-06, "loss": 1.2161, "step": 18214 }, { "epoch": 5.4252685269643885, "grad_norm": 0.2651354968547821, "learning_rate": 9.10415587421572e-06, "loss": 1.2273, "step": 18215 }, { "epoch": 5.425566373164058, "grad_norm": 0.2550216317176819, "learning_rate": 9.103195190315163e-06, "loss": 1.2082, "step": 18216 }, { "epoch": 5.425864219363726, "grad_norm": 0.3427906334400177, "learning_rate": 9.102234514758372e-06, "loss": 1.1958, "step": 18217 }, { "epoch": 5.426162065563394, "grad_norm": 0.30091381072998047, "learning_rate": 9.101273847554282e-06, "loss": 1.185, "step": 18218 }, { "epoch": 5.426459911763064, "grad_norm": 0.45391273498535156, "learning_rate": 9.100313188711841e-06, "loss": 1.2302, "step": 18219 }, { "epoch": 5.426757757962732, "grad_norm": 0.4105113446712494, "learning_rate": 9.09935253823998e-06, "loss": 1.2077, "step": 18220 }, { "epoch": 5.4270556041624, "grad_norm": 0.3463220000267029, "learning_rate": 9.098391896147632e-06, "loss": 1.2034, "step": 18221 }, { "epoch": 5.4273534503620695, "grad_norm": 0.28957387804985046, "learning_rate": 9.097431262443742e-06, "loss": 1.219, "step": 18222 }, { "epoch": 5.427651296561738, "grad_norm": 0.7690960764884949, "learning_rate": 9.096470637137244e-06, "loss": 1.1962, "step": 18223 }, { "epoch": 5.427949142761406, "grad_norm": 0.8276326060295105, "learning_rate": 9.095510020237081e-06, "loss": 1.2095, "step": 18224 }, { "epoch": 5.428246988961075, "grad_norm": 0.4270513355731964, "learning_rate": 9.094549411752189e-06, "loss": 1.2106, "step": 18225 }, { "epoch": 5.428544835160744, "grad_norm": 0.6832343935966492, "learning_rate": 9.093588811691496e-06, "loss": 1.2024, "step": 18226 }, { "epoch": 5.428842681360413, "grad_norm": 0.35265395045280457, "learning_rate": 9.092628220063952e-06, "loss": 1.2139, "step": 18227 }, { "epoch": 5.429140527560081, "grad_norm": 0.4333440959453583, "learning_rate": 9.091667636878485e-06, "loss": 1.2171, "step": 18228 }, { "epoch": 5.42943837375975, "grad_norm": 0.38746553659439087, "learning_rate": 9.090707062144036e-06, "loss": 1.2053, "step": 18229 }, { "epoch": 5.429736219959419, "grad_norm": 0.27784475684165955, "learning_rate": 9.089746495869546e-06, "loss": 1.2268, "step": 18230 }, { "epoch": 5.430034066159087, "grad_norm": 0.4725354313850403, "learning_rate": 9.088785938063946e-06, "loss": 1.2313, "step": 18231 }, { "epoch": 5.4303319123587555, "grad_norm": 0.3004723787307739, "learning_rate": 9.087825388736172e-06, "loss": 1.2157, "step": 18232 }, { "epoch": 5.430629758558425, "grad_norm": 0.4051141142845154, "learning_rate": 9.086864847895167e-06, "loss": 1.2036, "step": 18233 }, { "epoch": 5.430927604758093, "grad_norm": 0.2720029056072235, "learning_rate": 9.085904315549863e-06, "loss": 1.2042, "step": 18234 }, { "epoch": 5.431225450957761, "grad_norm": 0.3584952652454376, "learning_rate": 9.084943791709195e-06, "loss": 1.2117, "step": 18235 }, { "epoch": 5.4315232971574305, "grad_norm": 0.2827838063240051, "learning_rate": 9.083983276382107e-06, "loss": 1.2068, "step": 18236 }, { "epoch": 5.431821143357099, "grad_norm": 0.28493532538414, "learning_rate": 9.083022769577533e-06, "loss": 1.2201, "step": 18237 }, { "epoch": 5.432118989556768, "grad_norm": 0.31898900866508484, "learning_rate": 9.082062271304402e-06, "loss": 1.23, "step": 18238 }, { "epoch": 5.432416835756436, "grad_norm": 0.25817587971687317, "learning_rate": 9.081101781571657e-06, "loss": 1.2027, "step": 18239 }, { "epoch": 5.432714681956105, "grad_norm": 0.295402467250824, "learning_rate": 9.080141300388236e-06, "loss": 1.2, "step": 18240 }, { "epoch": 5.433012528155774, "grad_norm": 0.3077404797077179, "learning_rate": 9.07918082776307e-06, "loss": 1.2176, "step": 18241 }, { "epoch": 5.433310374355442, "grad_norm": 0.31870508193969727, "learning_rate": 9.0782203637051e-06, "loss": 1.2089, "step": 18242 }, { "epoch": 5.433608220555111, "grad_norm": 0.287725567817688, "learning_rate": 9.077259908223258e-06, "loss": 1.2217, "step": 18243 }, { "epoch": 5.43390606675478, "grad_norm": 0.6407656669616699, "learning_rate": 9.076299461326483e-06, "loss": 1.2058, "step": 18244 }, { "epoch": 5.434203912954448, "grad_norm": 0.4119032621383667, "learning_rate": 9.075339023023708e-06, "loss": 1.2338, "step": 18245 }, { "epoch": 5.4345017591541165, "grad_norm": 0.3540785312652588, "learning_rate": 9.074378593323871e-06, "loss": 1.2247, "step": 18246 }, { "epoch": 5.434799605353786, "grad_norm": 0.38739004731178284, "learning_rate": 9.073418172235906e-06, "loss": 1.2142, "step": 18247 }, { "epoch": 5.435097451553454, "grad_norm": 0.2594635486602783, "learning_rate": 9.072457759768752e-06, "loss": 1.2107, "step": 18248 }, { "epoch": 5.435395297753122, "grad_norm": 0.34582409262657166, "learning_rate": 9.071497355931338e-06, "loss": 1.2054, "step": 18249 }, { "epoch": 5.435693143952792, "grad_norm": 0.2676573097705841, "learning_rate": 9.070536960732608e-06, "loss": 1.2036, "step": 18250 }, { "epoch": 5.43599099015246, "grad_norm": 0.28851285576820374, "learning_rate": 9.069576574181492e-06, "loss": 1.2087, "step": 18251 }, { "epoch": 5.436288836352128, "grad_norm": 0.2775110900402069, "learning_rate": 9.068616196286923e-06, "loss": 1.2247, "step": 18252 }, { "epoch": 5.4365866825517974, "grad_norm": 0.2745726406574249, "learning_rate": 9.067655827057845e-06, "loss": 1.2229, "step": 18253 }, { "epoch": 5.436884528751466, "grad_norm": 0.28324687480926514, "learning_rate": 9.066695466503188e-06, "loss": 1.2208, "step": 18254 }, { "epoch": 5.437182374951135, "grad_norm": 0.3074977695941925, "learning_rate": 9.065735114631882e-06, "loss": 1.2083, "step": 18255 }, { "epoch": 5.437480221150803, "grad_norm": 0.28714606165885925, "learning_rate": 9.06477477145287e-06, "loss": 1.2214, "step": 18256 }, { "epoch": 5.437778067350472, "grad_norm": 0.3407638370990753, "learning_rate": 9.063814436975083e-06, "loss": 1.2245, "step": 18257 }, { "epoch": 5.438075913550141, "grad_norm": 0.2992120087146759, "learning_rate": 9.062854111207455e-06, "loss": 1.203, "step": 18258 }, { "epoch": 5.438373759749809, "grad_norm": 0.41462570428848267, "learning_rate": 9.061893794158928e-06, "loss": 1.2146, "step": 18259 }, { "epoch": 5.4386716059494775, "grad_norm": 0.26136377453804016, "learning_rate": 9.060933485838424e-06, "loss": 1.2159, "step": 18260 }, { "epoch": 5.438969452149147, "grad_norm": 0.3173859417438507, "learning_rate": 9.05997318625489e-06, "loss": 1.2015, "step": 18261 }, { "epoch": 5.439267298348815, "grad_norm": 0.2862769067287445, "learning_rate": 9.059012895417253e-06, "loss": 1.2199, "step": 18262 }, { "epoch": 5.439565144548483, "grad_norm": 0.4170054495334625, "learning_rate": 9.05805261333445e-06, "loss": 1.2215, "step": 18263 }, { "epoch": 5.439862990748153, "grad_norm": 0.2662082016468048, "learning_rate": 9.057092340015415e-06, "loss": 1.2128, "step": 18264 }, { "epoch": 5.440160836947821, "grad_norm": 0.2805964946746826, "learning_rate": 9.056132075469084e-06, "loss": 1.2208, "step": 18265 }, { "epoch": 5.44045868314749, "grad_norm": 0.26170092821121216, "learning_rate": 9.055171819704386e-06, "loss": 1.2281, "step": 18266 }, { "epoch": 5.4407565293471585, "grad_norm": 0.2577800452709198, "learning_rate": 9.054211572730262e-06, "loss": 1.2095, "step": 18267 }, { "epoch": 5.441054375546827, "grad_norm": 0.3083866238594055, "learning_rate": 9.053251334555642e-06, "loss": 1.2248, "step": 18268 }, { "epoch": 5.441352221746496, "grad_norm": 0.3356550335884094, "learning_rate": 9.05229110518946e-06, "loss": 1.2078, "step": 18269 }, { "epoch": 5.441650067946164, "grad_norm": 0.3031206429004669, "learning_rate": 9.051330884640649e-06, "loss": 1.2214, "step": 18270 }, { "epoch": 5.441947914145833, "grad_norm": 0.45580852031707764, "learning_rate": 9.050370672918142e-06, "loss": 1.2097, "step": 18271 }, { "epoch": 5.442245760345502, "grad_norm": 0.2769217789173126, "learning_rate": 9.049410470030882e-06, "loss": 1.2442, "step": 18272 }, { "epoch": 5.44254360654517, "grad_norm": 0.26227572560310364, "learning_rate": 9.048450275987793e-06, "loss": 1.2209, "step": 18273 }, { "epoch": 5.4428414527448385, "grad_norm": 0.3485429883003235, "learning_rate": 9.047490090797807e-06, "loss": 1.1998, "step": 18274 }, { "epoch": 5.443139298944508, "grad_norm": 0.27909791469573975, "learning_rate": 9.046529914469865e-06, "loss": 1.2104, "step": 18275 }, { "epoch": 5.443437145144176, "grad_norm": 0.37177976965904236, "learning_rate": 9.045569747012899e-06, "loss": 1.2253, "step": 18276 }, { "epoch": 5.443734991343844, "grad_norm": 0.3017202913761139, "learning_rate": 9.044609588435833e-06, "loss": 1.2138, "step": 18277 }, { "epoch": 5.444032837543514, "grad_norm": 0.5364689826965332, "learning_rate": 9.043649438747615e-06, "loss": 1.2166, "step": 18278 }, { "epoch": 5.444330683743182, "grad_norm": 0.6448546051979065, "learning_rate": 9.042689297957169e-06, "loss": 1.221, "step": 18279 }, { "epoch": 5.44462852994285, "grad_norm": 0.31614118814468384, "learning_rate": 9.041729166073425e-06, "loss": 1.2145, "step": 18280 }, { "epoch": 5.4449263761425195, "grad_norm": 0.7211610078811646, "learning_rate": 9.040769043105324e-06, "loss": 1.2261, "step": 18281 }, { "epoch": 5.445224222342188, "grad_norm": 0.2672927677631378, "learning_rate": 9.039808929061794e-06, "loss": 1.2147, "step": 18282 }, { "epoch": 5.445522068541857, "grad_norm": 0.4884949028491974, "learning_rate": 9.038848823951767e-06, "loss": 1.2154, "step": 18283 }, { "epoch": 5.445819914741525, "grad_norm": 0.5389834642410278, "learning_rate": 9.037888727784184e-06, "loss": 1.2121, "step": 18284 }, { "epoch": 5.446117760941194, "grad_norm": 0.5120760798454285, "learning_rate": 9.03692864056797e-06, "loss": 1.2138, "step": 18285 }, { "epoch": 5.446415607140863, "grad_norm": 0.43643325567245483, "learning_rate": 9.035968562312053e-06, "loss": 1.2054, "step": 18286 }, { "epoch": 5.446713453340531, "grad_norm": 0.3008381724357605, "learning_rate": 9.035008493025376e-06, "loss": 1.2004, "step": 18287 }, { "epoch": 5.4470112995402, "grad_norm": 0.3887217044830322, "learning_rate": 9.034048432716864e-06, "loss": 1.2344, "step": 18288 }, { "epoch": 5.447309145739869, "grad_norm": 0.3350813388824463, "learning_rate": 9.033088381395457e-06, "loss": 1.205, "step": 18289 }, { "epoch": 5.447606991939537, "grad_norm": 0.4774436950683594, "learning_rate": 9.032128339070082e-06, "loss": 1.2161, "step": 18290 }, { "epoch": 5.4479048381392055, "grad_norm": 0.278996080160141, "learning_rate": 9.031168305749665e-06, "loss": 1.2103, "step": 18291 }, { "epoch": 5.448202684338875, "grad_norm": 0.38473767042160034, "learning_rate": 9.03020828144315e-06, "loss": 1.2301, "step": 18292 }, { "epoch": 5.448500530538543, "grad_norm": 0.298493891954422, "learning_rate": 9.029248266159463e-06, "loss": 1.2248, "step": 18293 }, { "epoch": 5.448798376738212, "grad_norm": 0.3384168744087219, "learning_rate": 9.028288259907532e-06, "loss": 1.1965, "step": 18294 }, { "epoch": 5.4490962229378805, "grad_norm": 0.3702334761619568, "learning_rate": 9.027328262696299e-06, "loss": 1.2223, "step": 18295 }, { "epoch": 5.449394069137549, "grad_norm": 0.30994391441345215, "learning_rate": 9.026368274534687e-06, "loss": 1.2046, "step": 18296 }, { "epoch": 5.449691915337218, "grad_norm": 0.42713069915771484, "learning_rate": 9.02540829543163e-06, "loss": 1.2071, "step": 18297 }, { "epoch": 5.449989761536886, "grad_norm": 0.4066449999809265, "learning_rate": 9.024448325396061e-06, "loss": 1.218, "step": 18298 }, { "epoch": 5.450287607736555, "grad_norm": 0.26543983817100525, "learning_rate": 9.02348836443691e-06, "loss": 1.2051, "step": 18299 }, { "epoch": 5.450585453936224, "grad_norm": 0.2768924832344055, "learning_rate": 9.022528412563106e-06, "loss": 1.2095, "step": 18300 }, { "epoch": 5.450883300135892, "grad_norm": 0.3367660343647003, "learning_rate": 9.021568469783588e-06, "loss": 1.2271, "step": 18301 }, { "epoch": 5.451181146335561, "grad_norm": 0.48054754734039307, "learning_rate": 9.020608536107281e-06, "loss": 1.2254, "step": 18302 }, { "epoch": 5.45147899253523, "grad_norm": 0.283011257648468, "learning_rate": 9.019648611543114e-06, "loss": 1.2157, "step": 18303 }, { "epoch": 5.451776838734898, "grad_norm": 0.5190119743347168, "learning_rate": 9.018688696100024e-06, "loss": 1.2134, "step": 18304 }, { "epoch": 5.452074684934567, "grad_norm": 0.2879144847393036, "learning_rate": 9.017728789786939e-06, "loss": 1.2123, "step": 18305 }, { "epoch": 5.452372531134236, "grad_norm": 0.4425755739212036, "learning_rate": 9.016768892612789e-06, "loss": 1.2165, "step": 18306 }, { "epoch": 5.452670377333904, "grad_norm": 0.2664521634578705, "learning_rate": 9.01580900458651e-06, "loss": 1.2049, "step": 18307 }, { "epoch": 5.452968223533573, "grad_norm": 0.39505434036254883, "learning_rate": 9.014849125717023e-06, "loss": 1.2173, "step": 18308 }, { "epoch": 5.453266069733242, "grad_norm": 0.45414382219314575, "learning_rate": 9.013889256013267e-06, "loss": 1.2137, "step": 18309 }, { "epoch": 5.45356391593291, "grad_norm": 0.41616225242614746, "learning_rate": 9.01292939548417e-06, "loss": 1.2219, "step": 18310 }, { "epoch": 5.453861762132579, "grad_norm": 0.489369660615921, "learning_rate": 9.01196954413866e-06, "loss": 1.2301, "step": 18311 }, { "epoch": 5.4541596083322474, "grad_norm": 0.4263423681259155, "learning_rate": 9.011009701985672e-06, "loss": 1.204, "step": 18312 }, { "epoch": 5.454457454531916, "grad_norm": 0.5042636394500732, "learning_rate": 9.010049869034134e-06, "loss": 1.2235, "step": 18313 }, { "epoch": 5.454755300731585, "grad_norm": 0.25747379660606384, "learning_rate": 9.009090045292974e-06, "loss": 1.2085, "step": 18314 }, { "epoch": 5.455053146931253, "grad_norm": 0.5377143025398254, "learning_rate": 9.008130230771126e-06, "loss": 1.229, "step": 18315 }, { "epoch": 5.455350993130922, "grad_norm": 0.2980526089668274, "learning_rate": 9.007170425477516e-06, "loss": 1.2155, "step": 18316 }, { "epoch": 5.455648839330591, "grad_norm": 0.41524243354797363, "learning_rate": 9.006210629421073e-06, "loss": 1.2021, "step": 18317 }, { "epoch": 5.455946685530259, "grad_norm": 0.30433499813079834, "learning_rate": 9.005250842610737e-06, "loss": 1.2032, "step": 18318 }, { "epoch": 5.4562445317299275, "grad_norm": 0.33835574984550476, "learning_rate": 9.004291065055429e-06, "loss": 1.2287, "step": 18319 }, { "epoch": 5.456542377929597, "grad_norm": 0.28729456663131714, "learning_rate": 9.003331296764075e-06, "loss": 1.213, "step": 18320 }, { "epoch": 5.456840224129265, "grad_norm": 0.31967926025390625, "learning_rate": 9.002371537745615e-06, "loss": 1.2085, "step": 18321 }, { "epoch": 5.457138070328934, "grad_norm": 0.3398250937461853, "learning_rate": 9.001411788008969e-06, "loss": 1.2216, "step": 18322 }, { "epoch": 5.457435916528603, "grad_norm": 0.2628474831581116, "learning_rate": 9.000452047563073e-06, "loss": 1.2137, "step": 18323 }, { "epoch": 5.457733762728271, "grad_norm": 0.2607848644256592, "learning_rate": 8.999492316416854e-06, "loss": 1.2297, "step": 18324 }, { "epoch": 5.45803160892794, "grad_norm": 0.31650781631469727, "learning_rate": 8.998532594579239e-06, "loss": 1.2153, "step": 18325 }, { "epoch": 5.4583294551276085, "grad_norm": 0.27923887968063354, "learning_rate": 8.997572882059164e-06, "loss": 1.2249, "step": 18326 }, { "epoch": 5.458627301327277, "grad_norm": 0.28001120686531067, "learning_rate": 8.99661317886555e-06, "loss": 1.2096, "step": 18327 }, { "epoch": 5.458925147526946, "grad_norm": 0.320626437664032, "learning_rate": 8.99565348500733e-06, "loss": 1.2126, "step": 18328 }, { "epoch": 5.459222993726614, "grad_norm": 0.3825687766075134, "learning_rate": 8.994693800493431e-06, "loss": 1.2081, "step": 18329 }, { "epoch": 5.459520839926283, "grad_norm": 0.2936956286430359, "learning_rate": 8.993734125332787e-06, "loss": 1.2197, "step": 18330 }, { "epoch": 5.459818686125952, "grad_norm": 0.33461901545524597, "learning_rate": 8.992774459534317e-06, "loss": 1.2147, "step": 18331 }, { "epoch": 5.46011653232562, "grad_norm": 0.3067677319049835, "learning_rate": 8.99181480310696e-06, "loss": 1.2107, "step": 18332 }, { "epoch": 5.460414378525289, "grad_norm": 0.32353153824806213, "learning_rate": 8.990855156059636e-06, "loss": 1.209, "step": 18333 }, { "epoch": 5.460712224724958, "grad_norm": 0.3095123767852783, "learning_rate": 8.989895518401279e-06, "loss": 1.2131, "step": 18334 }, { "epoch": 5.461010070924626, "grad_norm": 0.33417394757270813, "learning_rate": 8.988935890140816e-06, "loss": 1.2177, "step": 18335 }, { "epoch": 5.461307917124295, "grad_norm": 0.2994416356086731, "learning_rate": 8.987976271287177e-06, "loss": 1.2155, "step": 18336 }, { "epoch": 5.461605763323964, "grad_norm": 0.28538838028907776, "learning_rate": 8.987016661849284e-06, "loss": 1.2049, "step": 18337 }, { "epoch": 5.461903609523632, "grad_norm": 0.46026667952537537, "learning_rate": 8.986057061836073e-06, "loss": 1.2048, "step": 18338 }, { "epoch": 5.462201455723301, "grad_norm": 0.4897211194038391, "learning_rate": 8.985097471256463e-06, "loss": 1.2067, "step": 18339 }, { "epoch": 5.4624993019229695, "grad_norm": 0.2775269150733948, "learning_rate": 8.984137890119391e-06, "loss": 1.2, "step": 18340 }, { "epoch": 5.462797148122638, "grad_norm": 0.2839283347129822, "learning_rate": 8.983178318433782e-06, "loss": 1.2031, "step": 18341 }, { "epoch": 5.463094994322307, "grad_norm": 0.2751613259315491, "learning_rate": 8.982218756208558e-06, "loss": 1.218, "step": 18342 }, { "epoch": 5.463392840521975, "grad_norm": 0.3411848545074463, "learning_rate": 8.981259203452656e-06, "loss": 1.2398, "step": 18343 }, { "epoch": 5.463690686721644, "grad_norm": 0.33500733971595764, "learning_rate": 8.980299660174999e-06, "loss": 1.2235, "step": 18344 }, { "epoch": 5.463988532921313, "grad_norm": 0.40132421255111694, "learning_rate": 8.97934012638451e-06, "loss": 1.2379, "step": 18345 }, { "epoch": 5.464286379120981, "grad_norm": 0.3655446469783783, "learning_rate": 8.978380602090123e-06, "loss": 1.2182, "step": 18346 }, { "epoch": 5.46458422532065, "grad_norm": 0.405704140663147, "learning_rate": 8.977421087300766e-06, "loss": 1.2228, "step": 18347 }, { "epoch": 5.464882071520319, "grad_norm": 0.2523886263370514, "learning_rate": 8.976461582025359e-06, "loss": 1.2207, "step": 18348 }, { "epoch": 5.465179917719987, "grad_norm": 0.2591855227947235, "learning_rate": 8.975502086272838e-06, "loss": 1.1991, "step": 18349 }, { "epoch": 5.465477763919656, "grad_norm": 0.27837905287742615, "learning_rate": 8.974542600052121e-06, "loss": 1.2202, "step": 18350 }, { "epoch": 5.465775610119325, "grad_norm": 0.2816462218761444, "learning_rate": 8.973583123372141e-06, "loss": 1.2033, "step": 18351 }, { "epoch": 5.466073456318993, "grad_norm": 0.25594595074653625, "learning_rate": 8.972623656241824e-06, "loss": 1.2113, "step": 18352 }, { "epoch": 5.466371302518662, "grad_norm": 0.33204081654548645, "learning_rate": 8.971664198670096e-06, "loss": 1.2096, "step": 18353 }, { "epoch": 5.4666691487183305, "grad_norm": 0.27806761860847473, "learning_rate": 8.970704750665881e-06, "loss": 1.213, "step": 18354 }, { "epoch": 5.466966994917999, "grad_norm": 0.4787549078464508, "learning_rate": 8.969745312238111e-06, "loss": 1.1913, "step": 18355 }, { "epoch": 5.467264841117668, "grad_norm": 0.5014739632606506, "learning_rate": 8.968785883395708e-06, "loss": 1.1924, "step": 18356 }, { "epoch": 5.467562687317336, "grad_norm": 0.3203149437904358, "learning_rate": 8.967826464147602e-06, "loss": 1.2279, "step": 18357 }, { "epoch": 5.467860533517005, "grad_norm": 0.3924318552017212, "learning_rate": 8.966867054502716e-06, "loss": 1.2199, "step": 18358 }, { "epoch": 5.468158379716674, "grad_norm": 0.43972548842430115, "learning_rate": 8.965907654469977e-06, "loss": 1.2139, "step": 18359 }, { "epoch": 5.468456225916342, "grad_norm": 0.40511569380760193, "learning_rate": 8.964948264058316e-06, "loss": 1.2193, "step": 18360 }, { "epoch": 5.4687540721160115, "grad_norm": 0.550166666507721, "learning_rate": 8.963988883276655e-06, "loss": 1.2036, "step": 18361 }, { "epoch": 5.46905191831568, "grad_norm": 0.5369710326194763, "learning_rate": 8.963029512133914e-06, "loss": 1.1987, "step": 18362 }, { "epoch": 5.469349764515348, "grad_norm": 0.39129918813705444, "learning_rate": 8.96207015063903e-06, "loss": 1.2158, "step": 18363 }, { "epoch": 5.469647610715017, "grad_norm": 0.5112589001655579, "learning_rate": 8.961110798800923e-06, "loss": 1.2255, "step": 18364 }, { "epoch": 5.469945456914686, "grad_norm": 0.31732383370399475, "learning_rate": 8.960151456628515e-06, "loss": 1.2118, "step": 18365 }, { "epoch": 5.470243303114354, "grad_norm": 0.38156500458717346, "learning_rate": 8.95919212413074e-06, "loss": 1.2154, "step": 18366 }, { "epoch": 5.470541149314023, "grad_norm": 0.3965311348438263, "learning_rate": 8.958232801316522e-06, "loss": 1.2045, "step": 18367 }, { "epoch": 5.470838995513692, "grad_norm": 0.3953753411769867, "learning_rate": 8.957273488194777e-06, "loss": 1.2268, "step": 18368 }, { "epoch": 5.47113684171336, "grad_norm": 0.3397037386894226, "learning_rate": 8.956314184774441e-06, "loss": 1.2149, "step": 18369 }, { "epoch": 5.471434687913029, "grad_norm": 0.28182947635650635, "learning_rate": 8.955354891064437e-06, "loss": 1.2082, "step": 18370 }, { "epoch": 5.471732534112697, "grad_norm": 0.42395129799842834, "learning_rate": 8.954395607073683e-06, "loss": 1.1916, "step": 18371 }, { "epoch": 5.472030380312367, "grad_norm": 0.30112504959106445, "learning_rate": 8.953436332811117e-06, "loss": 1.219, "step": 18372 }, { "epoch": 5.472328226512035, "grad_norm": 0.445017009973526, "learning_rate": 8.95247706828565e-06, "loss": 1.2093, "step": 18373 }, { "epoch": 5.472626072711703, "grad_norm": 0.3907952606678009, "learning_rate": 8.951517813506218e-06, "loss": 1.2053, "step": 18374 }, { "epoch": 5.4729239189113725, "grad_norm": 0.5662532448768616, "learning_rate": 8.95055856848174e-06, "loss": 1.2022, "step": 18375 }, { "epoch": 5.473221765111041, "grad_norm": 0.31141963601112366, "learning_rate": 8.949599333221139e-06, "loss": 1.2199, "step": 18376 }, { "epoch": 5.473519611310709, "grad_norm": 0.4439815580844879, "learning_rate": 8.948640107733347e-06, "loss": 1.2111, "step": 18377 }, { "epoch": 5.473817457510378, "grad_norm": 0.3216348886489868, "learning_rate": 8.947680892027284e-06, "loss": 1.2042, "step": 18378 }, { "epoch": 5.474115303710047, "grad_norm": 0.28621119260787964, "learning_rate": 8.94672168611187e-06, "loss": 1.2067, "step": 18379 }, { "epoch": 5.474413149909715, "grad_norm": 0.4639444649219513, "learning_rate": 8.94576248999604e-06, "loss": 1.207, "step": 18380 }, { "epoch": 5.474710996109384, "grad_norm": 0.3653220534324646, "learning_rate": 8.944803303688709e-06, "loss": 1.2242, "step": 18381 }, { "epoch": 5.475008842309053, "grad_norm": 0.305291086435318, "learning_rate": 8.943844127198802e-06, "loss": 1.2147, "step": 18382 }, { "epoch": 5.475306688508721, "grad_norm": 0.2845461666584015, "learning_rate": 8.94288496053525e-06, "loss": 1.2144, "step": 18383 }, { "epoch": 5.47560453470839, "grad_norm": 0.295358270406723, "learning_rate": 8.941925803706973e-06, "loss": 1.1963, "step": 18384 }, { "epoch": 5.4759023809080585, "grad_norm": 0.39815086126327515, "learning_rate": 8.940966656722887e-06, "loss": 1.2201, "step": 18385 }, { "epoch": 5.476200227107727, "grad_norm": 0.28686094284057617, "learning_rate": 8.940007519591928e-06, "loss": 1.2102, "step": 18386 }, { "epoch": 5.476498073307396, "grad_norm": 0.3080119490623474, "learning_rate": 8.939048392323015e-06, "loss": 1.2257, "step": 18387 }, { "epoch": 5.476795919507064, "grad_norm": 0.29281705617904663, "learning_rate": 8.93808927492507e-06, "loss": 1.207, "step": 18388 }, { "epoch": 5.477093765706734, "grad_norm": 0.3752022087574005, "learning_rate": 8.93713016740702e-06, "loss": 1.2202, "step": 18389 }, { "epoch": 5.477391611906402, "grad_norm": 0.25749897956848145, "learning_rate": 8.936171069777783e-06, "loss": 1.2357, "step": 18390 }, { "epoch": 5.47768945810607, "grad_norm": 0.34351646900177, "learning_rate": 8.93521198204629e-06, "loss": 1.2126, "step": 18391 }, { "epoch": 5.477987304305739, "grad_norm": 0.336144357919693, "learning_rate": 8.934252904221455e-06, "loss": 1.2029, "step": 18392 }, { "epoch": 5.478285150505408, "grad_norm": 0.7184677124023438, "learning_rate": 8.933293836312208e-06, "loss": 1.2172, "step": 18393 }, { "epoch": 5.478582996705076, "grad_norm": 0.4423912465572357, "learning_rate": 8.932334778327471e-06, "loss": 1.2211, "step": 18394 }, { "epoch": 5.478880842904745, "grad_norm": 0.49526455998420715, "learning_rate": 8.931375730276168e-06, "loss": 1.2241, "step": 18395 }, { "epoch": 5.479178689104414, "grad_norm": 0.43283307552337646, "learning_rate": 8.930416692167214e-06, "loss": 1.2188, "step": 18396 }, { "epoch": 5.479476535304082, "grad_norm": 0.4464777708053589, "learning_rate": 8.929457664009541e-06, "loss": 1.199, "step": 18397 }, { "epoch": 5.479774381503751, "grad_norm": 0.29845792055130005, "learning_rate": 8.928498645812068e-06, "loss": 1.2119, "step": 18398 }, { "epoch": 5.4800722277034195, "grad_norm": 0.5206387639045715, "learning_rate": 8.927539637583718e-06, "loss": 1.2363, "step": 18399 }, { "epoch": 5.480370073903089, "grad_norm": 0.2644738554954529, "learning_rate": 8.926580639333415e-06, "loss": 1.2211, "step": 18400 }, { "epoch": 5.480667920102757, "grad_norm": 0.6316187977790833, "learning_rate": 8.92562165107008e-06, "loss": 1.2272, "step": 18401 }, { "epoch": 5.480965766302425, "grad_norm": 0.2890859842300415, "learning_rate": 8.924662672802631e-06, "loss": 1.1993, "step": 18402 }, { "epoch": 5.481263612502095, "grad_norm": 0.4356417953968048, "learning_rate": 8.92370370454e-06, "loss": 1.2013, "step": 18403 }, { "epoch": 5.481561458701763, "grad_norm": 0.3311325013637543, "learning_rate": 8.922744746291098e-06, "loss": 1.2298, "step": 18404 }, { "epoch": 5.481859304901431, "grad_norm": 0.3121452033519745, "learning_rate": 8.921785798064855e-06, "loss": 1.2123, "step": 18405 }, { "epoch": 5.4821571511011005, "grad_norm": 0.4611448645591736, "learning_rate": 8.920826859870191e-06, "loss": 1.2307, "step": 18406 }, { "epoch": 5.482454997300769, "grad_norm": 0.3852958381175995, "learning_rate": 8.919867931716025e-06, "loss": 1.2162, "step": 18407 }, { "epoch": 5.482752843500437, "grad_norm": 0.3547942638397217, "learning_rate": 8.918909013611286e-06, "loss": 1.2042, "step": 18408 }, { "epoch": 5.483050689700106, "grad_norm": 0.28711792826652527, "learning_rate": 8.917950105564888e-06, "loss": 1.2028, "step": 18409 }, { "epoch": 5.483348535899775, "grad_norm": 0.4506579637527466, "learning_rate": 8.916991207585754e-06, "loss": 1.2153, "step": 18410 }, { "epoch": 5.483646382099443, "grad_norm": 0.2823413908481598, "learning_rate": 8.916032319682808e-06, "loss": 1.239, "step": 18411 }, { "epoch": 5.483944228299112, "grad_norm": 0.44218409061431885, "learning_rate": 8.915073441864972e-06, "loss": 1.2009, "step": 18412 }, { "epoch": 5.4842420744987805, "grad_norm": 0.28987812995910645, "learning_rate": 8.914114574141161e-06, "loss": 1.2083, "step": 18413 }, { "epoch": 5.484539920698449, "grad_norm": 0.43456321954727173, "learning_rate": 8.913155716520305e-06, "loss": 1.2262, "step": 18414 }, { "epoch": 5.484837766898118, "grad_norm": 0.3784896731376648, "learning_rate": 8.91219686901132e-06, "loss": 1.1998, "step": 18415 }, { "epoch": 5.485135613097786, "grad_norm": 0.41341546177864075, "learning_rate": 8.911238031623126e-06, "loss": 1.2083, "step": 18416 }, { "epoch": 5.485433459297456, "grad_norm": 0.4241754114627838, "learning_rate": 8.910279204364646e-06, "loss": 1.2373, "step": 18417 }, { "epoch": 5.485731305497124, "grad_norm": 0.35157403349876404, "learning_rate": 8.909320387244803e-06, "loss": 1.2006, "step": 18418 }, { "epoch": 5.486029151696792, "grad_norm": 0.5017350316047668, "learning_rate": 8.908361580272512e-06, "loss": 1.2114, "step": 18419 }, { "epoch": 5.4863269978964615, "grad_norm": 0.3144020140171051, "learning_rate": 8.907402783456698e-06, "loss": 1.225, "step": 18420 }, { "epoch": 5.48662484409613, "grad_norm": 0.5000543594360352, "learning_rate": 8.90644399680628e-06, "loss": 1.2153, "step": 18421 }, { "epoch": 5.486922690295798, "grad_norm": 0.25704795122146606, "learning_rate": 8.905485220330178e-06, "loss": 1.1973, "step": 18422 }, { "epoch": 5.487220536495467, "grad_norm": 0.7779128551483154, "learning_rate": 8.904526454037316e-06, "loss": 1.2219, "step": 18423 }, { "epoch": 5.487518382695136, "grad_norm": 0.35181763768196106, "learning_rate": 8.903567697936607e-06, "loss": 1.2091, "step": 18424 }, { "epoch": 5.487816228894804, "grad_norm": 0.6172589063644409, "learning_rate": 8.902608952036981e-06, "loss": 1.2107, "step": 18425 }, { "epoch": 5.488114075094473, "grad_norm": 0.3031884431838989, "learning_rate": 8.901650216347351e-06, "loss": 1.219, "step": 18426 }, { "epoch": 5.488411921294142, "grad_norm": 0.6517040729522705, "learning_rate": 8.900691490876636e-06, "loss": 1.2083, "step": 18427 }, { "epoch": 5.488709767493811, "grad_norm": 0.36943522095680237, "learning_rate": 8.89973277563376e-06, "loss": 1.2268, "step": 18428 }, { "epoch": 5.489007613693479, "grad_norm": 0.37507328391075134, "learning_rate": 8.898774070627643e-06, "loss": 1.2116, "step": 18429 }, { "epoch": 5.489305459893147, "grad_norm": 0.4282570481300354, "learning_rate": 8.897815375867198e-06, "loss": 1.2036, "step": 18430 }, { "epoch": 5.489603306092817, "grad_norm": 0.25638723373413086, "learning_rate": 8.896856691361355e-06, "loss": 1.2073, "step": 18431 }, { "epoch": 5.489901152292485, "grad_norm": 0.4131999909877777, "learning_rate": 8.895898017119027e-06, "loss": 1.2227, "step": 18432 }, { "epoch": 5.490198998492153, "grad_norm": 0.3734094202518463, "learning_rate": 8.894939353149132e-06, "loss": 1.2203, "step": 18433 }, { "epoch": 5.4904968446918225, "grad_norm": 0.5136287212371826, "learning_rate": 8.893980699460592e-06, "loss": 1.2178, "step": 18434 }, { "epoch": 5.490794690891491, "grad_norm": 0.4026089012622833, "learning_rate": 8.893022056062329e-06, "loss": 1.2056, "step": 18435 }, { "epoch": 5.491092537091159, "grad_norm": 0.38764455914497375, "learning_rate": 8.892063422963252e-06, "loss": 1.2145, "step": 18436 }, { "epoch": 5.491390383290828, "grad_norm": 0.4066968560218811, "learning_rate": 8.891104800172294e-06, "loss": 1.221, "step": 18437 }, { "epoch": 5.491688229490497, "grad_norm": 0.2829129993915558, "learning_rate": 8.890146187698363e-06, "loss": 1.2168, "step": 18438 }, { "epoch": 5.491986075690166, "grad_norm": 0.3591943085193634, "learning_rate": 8.889187585550386e-06, "loss": 1.2168, "step": 18439 }, { "epoch": 5.492283921889834, "grad_norm": 0.3042803704738617, "learning_rate": 8.888228993737274e-06, "loss": 1.2036, "step": 18440 }, { "epoch": 5.492581768089503, "grad_norm": 0.30777686834335327, "learning_rate": 8.887270412267947e-06, "loss": 1.1961, "step": 18441 }, { "epoch": 5.492879614289172, "grad_norm": 0.3260495364665985, "learning_rate": 8.88631184115133e-06, "loss": 1.1952, "step": 18442 }, { "epoch": 5.49317746048884, "grad_norm": 0.29427823424339294, "learning_rate": 8.885353280396336e-06, "loss": 1.2176, "step": 18443 }, { "epoch": 5.4934753066885085, "grad_norm": 0.3360000252723694, "learning_rate": 8.884394730011882e-06, "loss": 1.2074, "step": 18444 }, { "epoch": 5.493773152888178, "grad_norm": 0.27873557806015015, "learning_rate": 8.88343619000689e-06, "loss": 1.2113, "step": 18445 }, { "epoch": 5.494070999087846, "grad_norm": 0.3372797966003418, "learning_rate": 8.88247766039028e-06, "loss": 1.2249, "step": 18446 }, { "epoch": 5.494368845287514, "grad_norm": 0.29132211208343506, "learning_rate": 8.881519141170961e-06, "loss": 1.2078, "step": 18447 }, { "epoch": 5.494666691487184, "grad_norm": 0.2600608468055725, "learning_rate": 8.880560632357863e-06, "loss": 1.2101, "step": 18448 }, { "epoch": 5.494964537686852, "grad_norm": 0.3094232678413391, "learning_rate": 8.879602133959895e-06, "loss": 1.2166, "step": 18449 }, { "epoch": 5.49526238388652, "grad_norm": 0.25211331248283386, "learning_rate": 8.878643645985973e-06, "loss": 1.2241, "step": 18450 }, { "epoch": 5.495560230086189, "grad_norm": 0.2739332616329193, "learning_rate": 8.877685168445023e-06, "loss": 1.2284, "step": 18451 }, { "epoch": 5.495858076285858, "grad_norm": 0.31273096799850464, "learning_rate": 8.87672670134596e-06, "loss": 1.2279, "step": 18452 }, { "epoch": 5.496155922485526, "grad_norm": 0.27651849389076233, "learning_rate": 8.875768244697696e-06, "loss": 1.2305, "step": 18453 }, { "epoch": 5.496453768685195, "grad_norm": 0.41167640686035156, "learning_rate": 8.874809798509157e-06, "loss": 1.2204, "step": 18454 }, { "epoch": 5.496751614884864, "grad_norm": 0.32396993041038513, "learning_rate": 8.87385136278925e-06, "loss": 1.2188, "step": 18455 }, { "epoch": 5.497049461084533, "grad_norm": 0.34511902928352356, "learning_rate": 8.872892937546904e-06, "loss": 1.2022, "step": 18456 }, { "epoch": 5.497347307284201, "grad_norm": 0.37354591488838196, "learning_rate": 8.871934522791027e-06, "loss": 1.2257, "step": 18457 }, { "epoch": 5.4976451534838695, "grad_norm": 0.25477805733680725, "learning_rate": 8.870976118530536e-06, "loss": 1.2144, "step": 18458 }, { "epoch": 5.497942999683539, "grad_norm": 0.3068343698978424, "learning_rate": 8.870017724774356e-06, "loss": 1.2195, "step": 18459 }, { "epoch": 5.498240845883207, "grad_norm": 0.2564184069633484, "learning_rate": 8.869059341531398e-06, "loss": 1.2271, "step": 18460 }, { "epoch": 5.498538692082875, "grad_norm": 0.3234730660915375, "learning_rate": 8.868100968810575e-06, "loss": 1.226, "step": 18461 }, { "epoch": 5.498836538282545, "grad_norm": 0.2749391496181488, "learning_rate": 8.867142606620814e-06, "loss": 1.217, "step": 18462 }, { "epoch": 5.499134384482213, "grad_norm": 0.2855779528617859, "learning_rate": 8.866184254971021e-06, "loss": 1.2347, "step": 18463 }, { "epoch": 5.499432230681881, "grad_norm": 0.3118726909160614, "learning_rate": 8.865225913870114e-06, "loss": 1.2201, "step": 18464 }, { "epoch": 5.4997300768815505, "grad_norm": 0.3345693051815033, "learning_rate": 8.864267583327019e-06, "loss": 1.2166, "step": 18465 }, { "epoch": 5.500027923081219, "grad_norm": 0.6133779287338257, "learning_rate": 8.863309263350644e-06, "loss": 1.2284, "step": 18466 }, { "epoch": 5.500325769280888, "grad_norm": 0.3372969329357147, "learning_rate": 8.862350953949901e-06, "loss": 1.1898, "step": 18467 }, { "epoch": 5.500623615480556, "grad_norm": 0.5956374406814575, "learning_rate": 8.861392655133717e-06, "loss": 1.2131, "step": 18468 }, { "epoch": 5.500921461680225, "grad_norm": 0.45349788665771484, "learning_rate": 8.860434366910999e-06, "loss": 1.2157, "step": 18469 }, { "epoch": 5.501219307879894, "grad_norm": 0.2968503534793854, "learning_rate": 8.859476089290666e-06, "loss": 1.2075, "step": 18470 }, { "epoch": 5.501517154079562, "grad_norm": 0.2783360779285431, "learning_rate": 8.858517822281635e-06, "loss": 1.2184, "step": 18471 }, { "epoch": 5.5018150002792305, "grad_norm": 0.4133557677268982, "learning_rate": 8.857559565892818e-06, "loss": 1.1962, "step": 18472 }, { "epoch": 5.5021128464789, "grad_norm": 0.278022438287735, "learning_rate": 8.856601320133136e-06, "loss": 1.2136, "step": 18473 }, { "epoch": 5.502410692678568, "grad_norm": 0.4163760840892792, "learning_rate": 8.8556430850115e-06, "loss": 1.2145, "step": 18474 }, { "epoch": 5.502708538878236, "grad_norm": 0.3640947639942169, "learning_rate": 8.854684860536826e-06, "loss": 1.1836, "step": 18475 }, { "epoch": 5.503006385077906, "grad_norm": 0.3118014335632324, "learning_rate": 8.85372664671803e-06, "loss": 1.2163, "step": 18476 }, { "epoch": 5.503304231277574, "grad_norm": 0.3818114399909973, "learning_rate": 8.852768443564028e-06, "loss": 1.2028, "step": 18477 }, { "epoch": 5.503602077477243, "grad_norm": 0.25287437438964844, "learning_rate": 8.851810251083731e-06, "loss": 1.2116, "step": 18478 }, { "epoch": 5.5038999236769115, "grad_norm": 0.31228604912757874, "learning_rate": 8.85085206928606e-06, "loss": 1.2138, "step": 18479 }, { "epoch": 5.50419776987658, "grad_norm": 0.4218159317970276, "learning_rate": 8.849893898179926e-06, "loss": 1.2261, "step": 18480 }, { "epoch": 5.504495616076248, "grad_norm": 0.33540377020835876, "learning_rate": 8.84893573777424e-06, "loss": 1.2219, "step": 18481 }, { "epoch": 5.504793462275917, "grad_norm": 0.44699543714523315, "learning_rate": 8.847977588077927e-06, "loss": 1.2134, "step": 18482 }, { "epoch": 5.505091308475586, "grad_norm": 0.34513965249061584, "learning_rate": 8.847019449099895e-06, "loss": 1.2288, "step": 18483 }, { "epoch": 5.505389154675255, "grad_norm": 0.3132277727127075, "learning_rate": 8.846061320849055e-06, "loss": 1.2226, "step": 18484 }, { "epoch": 5.505687000874923, "grad_norm": 0.3509070575237274, "learning_rate": 8.845103203334329e-06, "loss": 1.2312, "step": 18485 }, { "epoch": 5.505984847074592, "grad_norm": 0.3406696617603302, "learning_rate": 8.844145096564624e-06, "loss": 1.2114, "step": 18486 }, { "epoch": 5.506282693274261, "grad_norm": 0.39734143018722534, "learning_rate": 8.843187000548857e-06, "loss": 1.189, "step": 18487 }, { "epoch": 5.506580539473929, "grad_norm": 0.25761228799819946, "learning_rate": 8.842228915295945e-06, "loss": 1.2072, "step": 18488 }, { "epoch": 5.506878385673597, "grad_norm": 0.3563632071018219, "learning_rate": 8.841270840814798e-06, "loss": 1.2216, "step": 18489 }, { "epoch": 5.507176231873267, "grad_norm": 0.3468446135520935, "learning_rate": 8.840312777114333e-06, "loss": 1.2229, "step": 18490 }, { "epoch": 5.507474078072935, "grad_norm": 0.3897448182106018, "learning_rate": 8.839354724203463e-06, "loss": 1.2155, "step": 18491 }, { "epoch": 5.507771924272603, "grad_norm": 0.30430126190185547, "learning_rate": 8.838396682091095e-06, "loss": 1.2164, "step": 18492 }, { "epoch": 5.5080697704722725, "grad_norm": 0.4468208849430084, "learning_rate": 8.837438650786152e-06, "loss": 1.2188, "step": 18493 }, { "epoch": 5.508367616671941, "grad_norm": 0.4163860082626343, "learning_rate": 8.836480630297544e-06, "loss": 1.1951, "step": 18494 }, { "epoch": 5.50866546287161, "grad_norm": 0.3813307285308838, "learning_rate": 8.83552262063418e-06, "loss": 1.2161, "step": 18495 }, { "epoch": 5.508963309071278, "grad_norm": 0.4980100989341736, "learning_rate": 8.834564621804982e-06, "loss": 1.214, "step": 18496 }, { "epoch": 5.509261155270947, "grad_norm": 0.2579140067100525, "learning_rate": 8.833606633818855e-06, "loss": 1.1984, "step": 18497 }, { "epoch": 5.509559001470616, "grad_norm": 0.32582640647888184, "learning_rate": 8.832648656684715e-06, "loss": 1.2078, "step": 18498 }, { "epoch": 5.509856847670284, "grad_norm": 0.3629567325115204, "learning_rate": 8.831690690411476e-06, "loss": 1.2263, "step": 18499 }, { "epoch": 5.510154693869953, "grad_norm": 0.3442101776599884, "learning_rate": 8.830732735008052e-06, "loss": 1.2194, "step": 18500 }, { "epoch": 5.510154693869953, "eval_loss": 1.3182575702667236, "eval_runtime": 24.6666, "eval_samples_per_second": 70.298, "eval_steps_per_second": 4.419, "step": 18500 }, { "epoch": 5.510452540069622, "grad_norm": 0.3001834452152252, "learning_rate": 8.829774790483349e-06, "loss": 1.2075, "step": 18501 }, { "epoch": 5.51075038626929, "grad_norm": 0.32820916175842285, "learning_rate": 8.828816856846288e-06, "loss": 1.2052, "step": 18502 }, { "epoch": 5.5110482324689585, "grad_norm": 0.3135179281234741, "learning_rate": 8.827858934105773e-06, "loss": 1.2233, "step": 18503 }, { "epoch": 5.511346078668628, "grad_norm": 0.2526983916759491, "learning_rate": 8.826901022270723e-06, "loss": 1.2112, "step": 18504 }, { "epoch": 5.511643924868296, "grad_norm": 0.33921104669570923, "learning_rate": 8.825943121350052e-06, "loss": 1.2052, "step": 18505 }, { "epoch": 5.511941771067965, "grad_norm": 0.29194679856300354, "learning_rate": 8.824985231352664e-06, "loss": 1.2115, "step": 18506 }, { "epoch": 5.512239617267634, "grad_norm": 0.30259567499160767, "learning_rate": 8.824027352287479e-06, "loss": 1.2089, "step": 18507 }, { "epoch": 5.512537463467302, "grad_norm": 0.26926037669181824, "learning_rate": 8.823069484163406e-06, "loss": 1.2225, "step": 18508 }, { "epoch": 5.512835309666971, "grad_norm": 0.27251744270324707, "learning_rate": 8.82211162698935e-06, "loss": 1.1978, "step": 18509 }, { "epoch": 5.513133155866639, "grad_norm": 0.2935759425163269, "learning_rate": 8.821153780774235e-06, "loss": 1.2113, "step": 18510 }, { "epoch": 5.513431002066308, "grad_norm": 0.3544779121875763, "learning_rate": 8.820195945526969e-06, "loss": 1.2084, "step": 18511 }, { "epoch": 5.513728848265977, "grad_norm": 0.2574149966239929, "learning_rate": 8.819238121256455e-06, "loss": 1.2179, "step": 18512 }, { "epoch": 5.514026694465645, "grad_norm": 0.4223068058490753, "learning_rate": 8.818280307971617e-06, "loss": 1.1989, "step": 18513 }, { "epoch": 5.514324540665314, "grad_norm": 0.2965776324272156, "learning_rate": 8.81732250568136e-06, "loss": 1.2182, "step": 18514 }, { "epoch": 5.514622386864983, "grad_norm": 0.3070499002933502, "learning_rate": 8.81636471439459e-06, "loss": 1.2067, "step": 18515 }, { "epoch": 5.514920233064651, "grad_norm": 0.2785414755344391, "learning_rate": 8.815406934120228e-06, "loss": 1.2132, "step": 18516 }, { "epoch": 5.5152180792643195, "grad_norm": 0.2849379777908325, "learning_rate": 8.814449164867183e-06, "loss": 1.2151, "step": 18517 }, { "epoch": 5.515515925463989, "grad_norm": 0.2550511658191681, "learning_rate": 8.813491406644358e-06, "loss": 1.1991, "step": 18518 }, { "epoch": 5.515813771663657, "grad_norm": 0.5555850863456726, "learning_rate": 8.812533659460675e-06, "loss": 1.231, "step": 18519 }, { "epoch": 5.516111617863325, "grad_norm": 0.4058053493499756, "learning_rate": 8.811575923325036e-06, "loss": 1.2164, "step": 18520 }, { "epoch": 5.516409464062995, "grad_norm": 0.38803279399871826, "learning_rate": 8.81061819824636e-06, "loss": 1.22, "step": 18521 }, { "epoch": 5.516707310262663, "grad_norm": 0.26936450600624084, "learning_rate": 8.809660484233547e-06, "loss": 1.2179, "step": 18522 }, { "epoch": 5.517005156462332, "grad_norm": 0.6515083909034729, "learning_rate": 8.808702781295514e-06, "loss": 1.2145, "step": 18523 }, { "epoch": 5.5173030026620005, "grad_norm": 0.3911673128604889, "learning_rate": 8.807745089441175e-06, "loss": 1.2138, "step": 18524 }, { "epoch": 5.517600848861669, "grad_norm": 0.5121654868125916, "learning_rate": 8.806787408679436e-06, "loss": 1.2131, "step": 18525 }, { "epoch": 5.517898695061338, "grad_norm": 0.26042410731315613, "learning_rate": 8.805829739019201e-06, "loss": 1.2077, "step": 18526 }, { "epoch": 5.518196541261006, "grad_norm": 0.7270402908325195, "learning_rate": 8.804872080469393e-06, "loss": 1.2068, "step": 18527 }, { "epoch": 5.518494387460675, "grad_norm": 0.2767256796360016, "learning_rate": 8.803914433038912e-06, "loss": 1.2086, "step": 18528 }, { "epoch": 5.518792233660344, "grad_norm": 0.5118314027786255, "learning_rate": 8.802956796736668e-06, "loss": 1.2073, "step": 18529 }, { "epoch": 5.519090079860012, "grad_norm": 0.2819366753101349, "learning_rate": 8.801999171571577e-06, "loss": 1.1999, "step": 18530 }, { "epoch": 5.5193879260596805, "grad_norm": 0.440983384847641, "learning_rate": 8.801041557552546e-06, "loss": 1.2039, "step": 18531 }, { "epoch": 5.51968577225935, "grad_norm": 0.3366145193576813, "learning_rate": 8.800083954688481e-06, "loss": 1.1966, "step": 18532 }, { "epoch": 5.519983618459018, "grad_norm": 0.27563244104385376, "learning_rate": 8.799126362988295e-06, "loss": 1.2102, "step": 18533 }, { "epoch": 5.520281464658687, "grad_norm": 0.370993047952652, "learning_rate": 8.798168782460899e-06, "loss": 1.1929, "step": 18534 }, { "epoch": 5.520579310858356, "grad_norm": 0.2635239362716675, "learning_rate": 8.797211213115195e-06, "loss": 1.2147, "step": 18535 }, { "epoch": 5.520877157058024, "grad_norm": 0.3107627034187317, "learning_rate": 8.7962536549601e-06, "loss": 1.2119, "step": 18536 }, { "epoch": 5.521175003257693, "grad_norm": 0.3534769117832184, "learning_rate": 8.795296108004517e-06, "loss": 1.2235, "step": 18537 }, { "epoch": 5.5214728494573615, "grad_norm": 0.25283482670783997, "learning_rate": 8.794338572257362e-06, "loss": 1.2023, "step": 18538 }, { "epoch": 5.52177069565703, "grad_norm": 0.3561493754386902, "learning_rate": 8.793381047727538e-06, "loss": 1.2204, "step": 18539 }, { "epoch": 5.522068541856699, "grad_norm": 0.26294341683387756, "learning_rate": 8.792423534423951e-06, "loss": 1.2209, "step": 18540 }, { "epoch": 5.522366388056367, "grad_norm": 0.4278562664985657, "learning_rate": 8.79146603235552e-06, "loss": 1.2298, "step": 18541 }, { "epoch": 5.522664234256036, "grad_norm": 0.2937805652618408, "learning_rate": 8.790508541531147e-06, "loss": 1.2144, "step": 18542 }, { "epoch": 5.522962080455705, "grad_norm": 0.34140509366989136, "learning_rate": 8.789551061959736e-06, "loss": 1.2189, "step": 18543 }, { "epoch": 5.523259926655373, "grad_norm": 0.27104368805885315, "learning_rate": 8.788593593650204e-06, "loss": 1.2167, "step": 18544 }, { "epoch": 5.5235577728550425, "grad_norm": 0.45311427116394043, "learning_rate": 8.787636136611455e-06, "loss": 1.2114, "step": 18545 }, { "epoch": 5.523855619054711, "grad_norm": 0.36299455165863037, "learning_rate": 8.786678690852393e-06, "loss": 1.2189, "step": 18546 }, { "epoch": 5.524153465254379, "grad_norm": 0.31058499217033386, "learning_rate": 8.785721256381935e-06, "loss": 1.2253, "step": 18547 }, { "epoch": 5.524451311454047, "grad_norm": 0.2723679840564728, "learning_rate": 8.784763833208983e-06, "loss": 1.2067, "step": 18548 }, { "epoch": 5.524749157653717, "grad_norm": 0.34415721893310547, "learning_rate": 8.783806421342444e-06, "loss": 1.2154, "step": 18549 }, { "epoch": 5.525047003853385, "grad_norm": 0.26098328828811646, "learning_rate": 8.782849020791229e-06, "loss": 1.2159, "step": 18550 }, { "epoch": 5.525344850053054, "grad_norm": 0.3889535367488861, "learning_rate": 8.781891631564244e-06, "loss": 1.2156, "step": 18551 }, { "epoch": 5.5256426962527225, "grad_norm": 0.30614978075027466, "learning_rate": 8.780934253670392e-06, "loss": 1.2065, "step": 18552 }, { "epoch": 5.525940542452391, "grad_norm": 0.2868693768978119, "learning_rate": 8.77997688711859e-06, "loss": 1.2106, "step": 18553 }, { "epoch": 5.52623838865206, "grad_norm": 0.2587234675884247, "learning_rate": 8.779019531917738e-06, "loss": 1.2135, "step": 18554 }, { "epoch": 5.526536234851728, "grad_norm": 0.2812231183052063, "learning_rate": 8.778062188076747e-06, "loss": 1.2034, "step": 18555 }, { "epoch": 5.526834081051397, "grad_norm": 0.27662885189056396, "learning_rate": 8.77710485560452e-06, "loss": 1.2187, "step": 18556 }, { "epoch": 5.527131927251066, "grad_norm": 0.2594603896141052, "learning_rate": 8.776147534509966e-06, "loss": 1.2186, "step": 18557 }, { "epoch": 5.527429773450734, "grad_norm": 0.27191632986068726, "learning_rate": 8.775190224801993e-06, "loss": 1.2423, "step": 18558 }, { "epoch": 5.527727619650403, "grad_norm": 0.32991641759872437, "learning_rate": 8.774232926489507e-06, "loss": 1.2202, "step": 18559 }, { "epoch": 5.528025465850072, "grad_norm": 0.330274373292923, "learning_rate": 8.773275639581412e-06, "loss": 1.2141, "step": 18560 }, { "epoch": 5.52832331204974, "grad_norm": 0.4951980710029602, "learning_rate": 8.77231836408662e-06, "loss": 1.2053, "step": 18561 }, { "epoch": 5.528621158249409, "grad_norm": 0.32925066351890564, "learning_rate": 8.77136110001403e-06, "loss": 1.2068, "step": 18562 }, { "epoch": 5.528919004449078, "grad_norm": 0.7124512195587158, "learning_rate": 8.770403847372555e-06, "loss": 1.2128, "step": 18563 }, { "epoch": 5.529216850648746, "grad_norm": 0.29837125539779663, "learning_rate": 8.769446606171097e-06, "loss": 1.217, "step": 18564 }, { "epoch": 5.529514696848415, "grad_norm": 0.7584240436553955, "learning_rate": 8.768489376418566e-06, "loss": 1.2025, "step": 18565 }, { "epoch": 5.529812543048084, "grad_norm": 0.3047201931476593, "learning_rate": 8.767532158123861e-06, "loss": 1.2153, "step": 18566 }, { "epoch": 5.530110389247752, "grad_norm": 0.34465089440345764, "learning_rate": 8.766574951295895e-06, "loss": 1.2253, "step": 18567 }, { "epoch": 5.530408235447421, "grad_norm": 0.3658747375011444, "learning_rate": 8.76561775594357e-06, "loss": 1.2177, "step": 18568 }, { "epoch": 5.530706081647089, "grad_norm": 0.39325231313705444, "learning_rate": 8.764660572075791e-06, "loss": 1.2274, "step": 18569 }, { "epoch": 5.531003927846758, "grad_norm": 0.32030919194221497, "learning_rate": 8.76370339970147e-06, "loss": 1.2102, "step": 18570 }, { "epoch": 5.531301774046427, "grad_norm": 0.3121950626373291, "learning_rate": 8.762746238829502e-06, "loss": 1.2034, "step": 18571 }, { "epoch": 5.531599620246095, "grad_norm": 0.2956571877002716, "learning_rate": 8.761789089468803e-06, "loss": 1.2112, "step": 18572 }, { "epoch": 5.5318974664457645, "grad_norm": 0.28109005093574524, "learning_rate": 8.760831951628274e-06, "loss": 1.2143, "step": 18573 }, { "epoch": 5.532195312645433, "grad_norm": 0.2740158438682556, "learning_rate": 8.759874825316813e-06, "loss": 1.2155, "step": 18574 }, { "epoch": 5.532493158845101, "grad_norm": 0.2645137310028076, "learning_rate": 8.758917710543334e-06, "loss": 1.2234, "step": 18575 }, { "epoch": 5.53279100504477, "grad_norm": 0.340607613325119, "learning_rate": 8.757960607316742e-06, "loss": 1.2356, "step": 18576 }, { "epoch": 5.533088851244439, "grad_norm": 0.26851311326026917, "learning_rate": 8.757003515645932e-06, "loss": 1.212, "step": 18577 }, { "epoch": 5.533386697444107, "grad_norm": 0.4204433560371399, "learning_rate": 8.756046435539823e-06, "loss": 1.217, "step": 18578 }, { "epoch": 5.533684543643776, "grad_norm": 0.2745577096939087, "learning_rate": 8.75508936700731e-06, "loss": 1.2187, "step": 18579 }, { "epoch": 5.533982389843445, "grad_norm": 0.40016692876815796, "learning_rate": 8.754132310057297e-06, "loss": 1.2246, "step": 18580 }, { "epoch": 5.534280236043113, "grad_norm": 0.3720231354236603, "learning_rate": 8.753175264698692e-06, "loss": 1.1945, "step": 18581 }, { "epoch": 5.534578082242782, "grad_norm": 0.24383926391601562, "learning_rate": 8.7522182309404e-06, "loss": 1.2113, "step": 18582 }, { "epoch": 5.5348759284424505, "grad_norm": 0.27411460876464844, "learning_rate": 8.751261208791321e-06, "loss": 1.2208, "step": 18583 }, { "epoch": 5.535173774642119, "grad_norm": 0.30837035179138184, "learning_rate": 8.750304198260363e-06, "loss": 1.2063, "step": 18584 }, { "epoch": 5.535471620841788, "grad_norm": 0.28115174174308777, "learning_rate": 8.749347199356427e-06, "loss": 1.2168, "step": 18585 }, { "epoch": 5.535769467041456, "grad_norm": 0.35087934136390686, "learning_rate": 8.748390212088417e-06, "loss": 1.2076, "step": 18586 }, { "epoch": 5.536067313241125, "grad_norm": 0.33117544651031494, "learning_rate": 8.74743323646524e-06, "loss": 1.2235, "step": 18587 }, { "epoch": 5.536365159440794, "grad_norm": 0.26276567578315735, "learning_rate": 8.746476272495794e-06, "loss": 1.2051, "step": 18588 }, { "epoch": 5.536663005640462, "grad_norm": 0.40405192971229553, "learning_rate": 8.74551932018899e-06, "loss": 1.2053, "step": 18589 }, { "epoch": 5.536960851840131, "grad_norm": 0.31916743516921997, "learning_rate": 8.744562379553728e-06, "loss": 1.2294, "step": 18590 }, { "epoch": 5.5372586980398, "grad_norm": 0.28235921263694763, "learning_rate": 8.743605450598904e-06, "loss": 1.1922, "step": 18591 }, { "epoch": 5.537556544239468, "grad_norm": 0.3274117410182953, "learning_rate": 8.74264853333343e-06, "loss": 1.2165, "step": 18592 }, { "epoch": 5.537854390439137, "grad_norm": 0.365242063999176, "learning_rate": 8.74169162776621e-06, "loss": 1.2052, "step": 18593 }, { "epoch": 5.538152236638806, "grad_norm": 0.25951096415519714, "learning_rate": 8.74073473390614e-06, "loss": 1.2147, "step": 18594 }, { "epoch": 5.538450082838474, "grad_norm": 0.5193450450897217, "learning_rate": 8.73977785176213e-06, "loss": 1.215, "step": 18595 }, { "epoch": 5.538747929038143, "grad_norm": 0.36995893716812134, "learning_rate": 8.738820981343079e-06, "loss": 1.2198, "step": 18596 }, { "epoch": 5.5390457752378115, "grad_norm": 0.31989729404449463, "learning_rate": 8.737864122657884e-06, "loss": 1.2118, "step": 18597 }, { "epoch": 5.53934362143748, "grad_norm": 0.4452241361141205, "learning_rate": 8.736907275715455e-06, "loss": 1.2305, "step": 18598 }, { "epoch": 5.539641467637149, "grad_norm": 0.29764696955680847, "learning_rate": 8.735950440524696e-06, "loss": 1.2093, "step": 18599 }, { "epoch": 5.539939313836817, "grad_norm": 0.27439337968826294, "learning_rate": 8.734993617094503e-06, "loss": 1.2046, "step": 18600 }, { "epoch": 5.540237160036487, "grad_norm": 0.40285763144493103, "learning_rate": 8.734036805433784e-06, "loss": 1.2097, "step": 18601 }, { "epoch": 5.540535006236155, "grad_norm": 0.30859941244125366, "learning_rate": 8.733080005551437e-06, "loss": 1.2013, "step": 18602 }, { "epoch": 5.540832852435823, "grad_norm": 0.41414734721183777, "learning_rate": 8.732123217456363e-06, "loss": 1.204, "step": 18603 }, { "epoch": 5.5411306986354925, "grad_norm": 0.3561864197254181, "learning_rate": 8.731166441157467e-06, "loss": 1.2186, "step": 18604 }, { "epoch": 5.541428544835161, "grad_norm": 0.31468212604522705, "learning_rate": 8.730209676663647e-06, "loss": 1.2236, "step": 18605 }, { "epoch": 5.541726391034829, "grad_norm": 0.2603742182254791, "learning_rate": 8.729252923983812e-06, "loss": 1.2058, "step": 18606 }, { "epoch": 5.542024237234498, "grad_norm": 0.42395567893981934, "learning_rate": 8.72829618312686e-06, "loss": 1.2131, "step": 18607 }, { "epoch": 5.542322083434167, "grad_norm": 0.2771393060684204, "learning_rate": 8.727339454101685e-06, "loss": 1.2138, "step": 18608 }, { "epoch": 5.542619929633835, "grad_norm": 0.3564935028553009, "learning_rate": 8.726382736917198e-06, "loss": 1.208, "step": 18609 }, { "epoch": 5.542917775833504, "grad_norm": 0.313340425491333, "learning_rate": 8.725426031582297e-06, "loss": 1.2148, "step": 18610 }, { "epoch": 5.5432156220331725, "grad_norm": 0.43277180194854736, "learning_rate": 8.724469338105879e-06, "loss": 1.2183, "step": 18611 }, { "epoch": 5.543513468232842, "grad_norm": 0.6412144899368286, "learning_rate": 8.723512656496855e-06, "loss": 1.2178, "step": 18612 }, { "epoch": 5.54381131443251, "grad_norm": 0.3044149577617645, "learning_rate": 8.722555986764117e-06, "loss": 1.202, "step": 18613 }, { "epoch": 5.544109160632178, "grad_norm": 0.5289282202720642, "learning_rate": 8.721599328916566e-06, "loss": 1.2052, "step": 18614 }, { "epoch": 5.544407006831847, "grad_norm": 0.34878963232040405, "learning_rate": 8.720642682963105e-06, "loss": 1.2101, "step": 18615 }, { "epoch": 5.544704853031516, "grad_norm": 0.35936352610588074, "learning_rate": 8.719686048912638e-06, "loss": 1.2199, "step": 18616 }, { "epoch": 5.545002699231184, "grad_norm": 0.2650383412837982, "learning_rate": 8.718729426774057e-06, "loss": 1.2374, "step": 18617 }, { "epoch": 5.5453005454308535, "grad_norm": 0.44902336597442627, "learning_rate": 8.717772816556273e-06, "loss": 1.2209, "step": 18618 }, { "epoch": 5.545598391630522, "grad_norm": 0.3704821765422821, "learning_rate": 8.716816218268174e-06, "loss": 1.2021, "step": 18619 }, { "epoch": 5.54589623783019, "grad_norm": 0.3121505677700043, "learning_rate": 8.715859631918671e-06, "loss": 1.2147, "step": 18620 }, { "epoch": 5.546194084029859, "grad_norm": 0.3140326738357544, "learning_rate": 8.714903057516658e-06, "loss": 1.2442, "step": 18621 }, { "epoch": 5.546491930229528, "grad_norm": 0.391445130109787, "learning_rate": 8.713946495071036e-06, "loss": 1.2062, "step": 18622 }, { "epoch": 5.546789776429196, "grad_norm": 0.30727216601371765, "learning_rate": 8.712989944590707e-06, "loss": 1.2142, "step": 18623 }, { "epoch": 5.547087622628865, "grad_norm": 0.6961880922317505, "learning_rate": 8.712033406084569e-06, "loss": 1.2115, "step": 18624 }, { "epoch": 5.547385468828534, "grad_norm": 0.2826353907585144, "learning_rate": 8.711076879561516e-06, "loss": 1.2188, "step": 18625 }, { "epoch": 5.547683315028202, "grad_norm": 0.5970001816749573, "learning_rate": 8.710120365030456e-06, "loss": 1.2031, "step": 18626 }, { "epoch": 5.547981161227871, "grad_norm": 0.3100568652153015, "learning_rate": 8.709163862500286e-06, "loss": 1.2106, "step": 18627 }, { "epoch": 5.548279007427539, "grad_norm": 0.3357331156730652, "learning_rate": 8.708207371979899e-06, "loss": 1.2049, "step": 18628 }, { "epoch": 5.548576853627209, "grad_norm": 0.5884115695953369, "learning_rate": 8.707250893478205e-06, "loss": 1.2124, "step": 18629 }, { "epoch": 5.548874699826877, "grad_norm": 0.2585620582103729, "learning_rate": 8.706294427004096e-06, "loss": 1.2203, "step": 18630 }, { "epoch": 5.549172546026545, "grad_norm": 0.585599958896637, "learning_rate": 8.705337972566468e-06, "loss": 1.2078, "step": 18631 }, { "epoch": 5.5494703922262145, "grad_norm": 0.3208726644515991, "learning_rate": 8.70438153017423e-06, "loss": 1.2138, "step": 18632 }, { "epoch": 5.549768238425883, "grad_norm": 0.35780036449432373, "learning_rate": 8.70342509983627e-06, "loss": 1.2029, "step": 18633 }, { "epoch": 5.550066084625551, "grad_norm": 0.39285334944725037, "learning_rate": 8.702468681561489e-06, "loss": 1.2305, "step": 18634 }, { "epoch": 5.55036393082522, "grad_norm": 0.3960075080394745, "learning_rate": 8.701512275358792e-06, "loss": 1.2247, "step": 18635 }, { "epoch": 5.550661777024889, "grad_norm": 0.28272077441215515, "learning_rate": 8.700555881237067e-06, "loss": 1.2208, "step": 18636 }, { "epoch": 5.550959623224557, "grad_norm": 0.35603034496307373, "learning_rate": 8.699599499205223e-06, "loss": 1.2145, "step": 18637 }, { "epoch": 5.551257469424226, "grad_norm": 0.3324495851993561, "learning_rate": 8.698643129272153e-06, "loss": 1.2128, "step": 18638 }, { "epoch": 5.551555315623895, "grad_norm": 0.4525752067565918, "learning_rate": 8.69768677144675e-06, "loss": 1.1984, "step": 18639 }, { "epoch": 5.551853161823564, "grad_norm": 0.31580206751823425, "learning_rate": 8.696730425737919e-06, "loss": 1.2108, "step": 18640 }, { "epoch": 5.552151008023232, "grad_norm": 0.4952149987220764, "learning_rate": 8.695774092154557e-06, "loss": 1.2176, "step": 18641 }, { "epoch": 5.5524488542229005, "grad_norm": 0.24655117094516754, "learning_rate": 8.694817770705556e-06, "loss": 1.1995, "step": 18642 }, { "epoch": 5.55274670042257, "grad_norm": 0.4545476734638214, "learning_rate": 8.693861461399822e-06, "loss": 1.2256, "step": 18643 }, { "epoch": 5.553044546622238, "grad_norm": 0.2850292921066284, "learning_rate": 8.692905164246246e-06, "loss": 1.2114, "step": 18644 }, { "epoch": 5.553342392821906, "grad_norm": 0.3026027977466583, "learning_rate": 8.691948879253725e-06, "loss": 1.2075, "step": 18645 }, { "epoch": 5.5536402390215756, "grad_norm": 0.2584564983844757, "learning_rate": 8.69099260643116e-06, "loss": 1.2205, "step": 18646 }, { "epoch": 5.553938085221244, "grad_norm": 0.26178839802742004, "learning_rate": 8.690036345787448e-06, "loss": 1.2336, "step": 18647 }, { "epoch": 5.554235931420912, "grad_norm": 0.31421634554862976, "learning_rate": 8.68908009733148e-06, "loss": 1.1965, "step": 18648 }, { "epoch": 5.554533777620581, "grad_norm": 0.33658066391944885, "learning_rate": 8.688123861072161e-06, "loss": 1.2293, "step": 18649 }, { "epoch": 5.55483162382025, "grad_norm": 0.279276967048645, "learning_rate": 8.687167637018382e-06, "loss": 1.2275, "step": 18650 }, { "epoch": 5.555129470019918, "grad_norm": 0.328046590089798, "learning_rate": 8.68621142517904e-06, "loss": 1.2176, "step": 18651 }, { "epoch": 5.555427316219587, "grad_norm": 0.41847747564315796, "learning_rate": 8.685255225563036e-06, "loss": 1.1987, "step": 18652 }, { "epoch": 5.555725162419256, "grad_norm": 0.2721949517726898, "learning_rate": 8.68429903817926e-06, "loss": 1.2135, "step": 18653 }, { "epoch": 5.556023008618924, "grad_norm": 0.3741403818130493, "learning_rate": 8.683342863036614e-06, "loss": 1.2309, "step": 18654 }, { "epoch": 5.556320854818593, "grad_norm": 0.26806801557540894, "learning_rate": 8.682386700143992e-06, "loss": 1.2134, "step": 18655 }, { "epoch": 5.5566187010182615, "grad_norm": 0.33318209648132324, "learning_rate": 8.681430549510285e-06, "loss": 1.206, "step": 18656 }, { "epoch": 5.556916547217931, "grad_norm": 0.24343553185462952, "learning_rate": 8.680474411144397e-06, "loss": 1.2038, "step": 18657 }, { "epoch": 5.557214393417599, "grad_norm": 0.31716734170913696, "learning_rate": 8.679518285055223e-06, "loss": 1.2201, "step": 18658 }, { "epoch": 5.557512239617267, "grad_norm": 0.2625264823436737, "learning_rate": 8.67856217125165e-06, "loss": 1.2048, "step": 18659 }, { "epoch": 5.557810085816937, "grad_norm": 0.2422669231891632, "learning_rate": 8.677606069742585e-06, "loss": 1.2226, "step": 18660 }, { "epoch": 5.558107932016605, "grad_norm": 0.260936439037323, "learning_rate": 8.676649980536917e-06, "loss": 1.2042, "step": 18661 }, { "epoch": 5.558405778216273, "grad_norm": 0.27901342511177063, "learning_rate": 8.67569390364354e-06, "loss": 1.2098, "step": 18662 }, { "epoch": 5.5587036244159425, "grad_norm": 0.2622801661491394, "learning_rate": 8.674737839071352e-06, "loss": 1.2123, "step": 18663 }, { "epoch": 5.559001470615611, "grad_norm": 0.28370293974876404, "learning_rate": 8.67378178682925e-06, "loss": 1.2006, "step": 18664 }, { "epoch": 5.559299316815279, "grad_norm": 0.29662513732910156, "learning_rate": 8.672825746926124e-06, "loss": 1.2139, "step": 18665 }, { "epoch": 5.559597163014948, "grad_norm": 0.2562781572341919, "learning_rate": 8.671869719370875e-06, "loss": 1.2201, "step": 18666 }, { "epoch": 5.559895009214617, "grad_norm": 0.24717770516872406, "learning_rate": 8.670913704172392e-06, "loss": 1.2094, "step": 18667 }, { "epoch": 5.560192855414286, "grad_norm": 0.33261075615882874, "learning_rate": 8.669957701339571e-06, "loss": 1.2179, "step": 18668 }, { "epoch": 5.560490701613954, "grad_norm": 0.4968486726284027, "learning_rate": 8.669001710881309e-06, "loss": 1.2071, "step": 18669 }, { "epoch": 5.5607885478136225, "grad_norm": 0.27463075518608093, "learning_rate": 8.668045732806496e-06, "loss": 1.2028, "step": 18670 }, { "epoch": 5.561086394013292, "grad_norm": 0.615999698638916, "learning_rate": 8.667089767124036e-06, "loss": 1.2257, "step": 18671 }, { "epoch": 5.56138424021296, "grad_norm": 0.36775702238082886, "learning_rate": 8.666133813842814e-06, "loss": 1.211, "step": 18672 }, { "epoch": 5.561682086412628, "grad_norm": 0.3622463345527649, "learning_rate": 8.665177872971722e-06, "loss": 1.1954, "step": 18673 }, { "epoch": 5.561979932612298, "grad_norm": 0.3058466613292694, "learning_rate": 8.664221944519665e-06, "loss": 1.2134, "step": 18674 }, { "epoch": 5.562277778811966, "grad_norm": 0.3505442142486572, "learning_rate": 8.663266028495526e-06, "loss": 1.1974, "step": 18675 }, { "epoch": 5.562575625011634, "grad_norm": 0.2794327437877655, "learning_rate": 8.662310124908202e-06, "loss": 1.2144, "step": 18676 }, { "epoch": 5.5628734712113035, "grad_norm": 0.32328954339027405, "learning_rate": 8.661354233766593e-06, "loss": 1.2119, "step": 18677 }, { "epoch": 5.563171317410972, "grad_norm": 0.2575834095478058, "learning_rate": 8.660398355079585e-06, "loss": 1.2076, "step": 18678 }, { "epoch": 5.563469163610641, "grad_norm": 0.30577370524406433, "learning_rate": 8.65944248885607e-06, "loss": 1.229, "step": 18679 }, { "epoch": 5.563767009810309, "grad_norm": 0.2884595990180969, "learning_rate": 8.658486635104948e-06, "loss": 1.2007, "step": 18680 }, { "epoch": 5.564064856009978, "grad_norm": 0.4116250276565552, "learning_rate": 8.657530793835111e-06, "loss": 1.2152, "step": 18681 }, { "epoch": 5.564362702209646, "grad_norm": 0.2661183476448059, "learning_rate": 8.656574965055445e-06, "loss": 1.2063, "step": 18682 }, { "epoch": 5.564660548409315, "grad_norm": 0.5799434185028076, "learning_rate": 8.655619148774853e-06, "loss": 1.2191, "step": 18683 }, { "epoch": 5.564958394608984, "grad_norm": 0.47470203042030334, "learning_rate": 8.654663345002222e-06, "loss": 1.214, "step": 18684 }, { "epoch": 5.565256240808653, "grad_norm": 0.43432360887527466, "learning_rate": 8.653707553746441e-06, "loss": 1.2132, "step": 18685 }, { "epoch": 5.565554087008321, "grad_norm": 0.6057737469673157, "learning_rate": 8.652751775016411e-06, "loss": 1.2212, "step": 18686 }, { "epoch": 5.565851933207989, "grad_norm": 0.2544526755809784, "learning_rate": 8.651796008821018e-06, "loss": 1.1973, "step": 18687 }, { "epoch": 5.566149779407659, "grad_norm": 0.4432823061943054, "learning_rate": 8.650840255169161e-06, "loss": 1.2099, "step": 18688 }, { "epoch": 5.566447625607327, "grad_norm": 0.2819180488586426, "learning_rate": 8.649884514069726e-06, "loss": 1.1989, "step": 18689 }, { "epoch": 5.566745471806995, "grad_norm": 0.3026280105113983, "learning_rate": 8.648928785531606e-06, "loss": 1.2189, "step": 18690 }, { "epoch": 5.5670433180066645, "grad_norm": 0.3398076593875885, "learning_rate": 8.647973069563696e-06, "loss": 1.2319, "step": 18691 }, { "epoch": 5.567341164206333, "grad_norm": 0.3294442892074585, "learning_rate": 8.647017366174884e-06, "loss": 1.2072, "step": 18692 }, { "epoch": 5.567639010406001, "grad_norm": 0.316977322101593, "learning_rate": 8.646061675374062e-06, "loss": 1.2243, "step": 18693 }, { "epoch": 5.56793685660567, "grad_norm": 0.28699055314064026, "learning_rate": 8.645105997170128e-06, "loss": 1.2149, "step": 18694 }, { "epoch": 5.568234702805339, "grad_norm": 0.3433881402015686, "learning_rate": 8.64415033157197e-06, "loss": 1.2225, "step": 18695 }, { "epoch": 5.568532549005008, "grad_norm": 0.2621270716190338, "learning_rate": 8.643194678588473e-06, "loss": 1.2206, "step": 18696 }, { "epoch": 5.568830395204676, "grad_norm": 0.28355029225349426, "learning_rate": 8.642239038228537e-06, "loss": 1.2218, "step": 18697 }, { "epoch": 5.569128241404345, "grad_norm": 0.41647031903266907, "learning_rate": 8.641283410501048e-06, "loss": 1.2096, "step": 18698 }, { "epoch": 5.569426087604014, "grad_norm": 0.3549540340900421, "learning_rate": 8.640327795414898e-06, "loss": 1.2188, "step": 18699 }, { "epoch": 5.569723933803682, "grad_norm": 0.31762734055519104, "learning_rate": 8.639372192978982e-06, "loss": 1.1985, "step": 18700 }, { "epoch": 5.5700217800033505, "grad_norm": 0.336806982755661, "learning_rate": 8.638416603202188e-06, "loss": 1.2094, "step": 18701 }, { "epoch": 5.57031962620302, "grad_norm": 0.27429044246673584, "learning_rate": 8.6374610260934e-06, "loss": 1.223, "step": 18702 }, { "epoch": 5.570617472402688, "grad_norm": 0.29460474848747253, "learning_rate": 8.636505461661521e-06, "loss": 1.2174, "step": 18703 }, { "epoch": 5.570915318602356, "grad_norm": 0.3771706819534302, "learning_rate": 8.635549909915433e-06, "loss": 1.2074, "step": 18704 }, { "epoch": 5.5712131648020256, "grad_norm": 0.26914507150650024, "learning_rate": 8.634594370864029e-06, "loss": 1.2124, "step": 18705 }, { "epoch": 5.571511011001694, "grad_norm": 0.31377163529396057, "learning_rate": 8.6336388445162e-06, "loss": 1.2175, "step": 18706 }, { "epoch": 5.571808857201363, "grad_norm": 0.3108786344528198, "learning_rate": 8.632683330880832e-06, "loss": 1.2009, "step": 18707 }, { "epoch": 5.572106703401031, "grad_norm": 0.29319754242897034, "learning_rate": 8.63172782996682e-06, "loss": 1.1998, "step": 18708 }, { "epoch": 5.5724045496007, "grad_norm": 0.30423447489738464, "learning_rate": 8.630772341783051e-06, "loss": 1.2221, "step": 18709 }, { "epoch": 5.572702395800369, "grad_norm": 0.36488884687423706, "learning_rate": 8.629816866338414e-06, "loss": 1.2176, "step": 18710 }, { "epoch": 5.573000242000037, "grad_norm": 0.27655303478240967, "learning_rate": 8.628861403641804e-06, "loss": 1.2165, "step": 18711 }, { "epoch": 5.573298088199706, "grad_norm": 0.31800028681755066, "learning_rate": 8.627905953702106e-06, "loss": 1.2167, "step": 18712 }, { "epoch": 5.573595934399375, "grad_norm": 0.282143771648407, "learning_rate": 8.626950516528208e-06, "loss": 1.2117, "step": 18713 }, { "epoch": 5.573893780599043, "grad_norm": 0.4625825583934784, "learning_rate": 8.625995092129004e-06, "loss": 1.2113, "step": 18714 }, { "epoch": 5.5741916267987115, "grad_norm": 0.2888241410255432, "learning_rate": 8.625039680513378e-06, "loss": 1.2345, "step": 18715 }, { "epoch": 5.574489472998381, "grad_norm": 0.4537830054759979, "learning_rate": 8.624084281690221e-06, "loss": 1.2212, "step": 18716 }, { "epoch": 5.574787319198049, "grad_norm": 0.28274741768836975, "learning_rate": 8.623128895668426e-06, "loss": 1.1989, "step": 18717 }, { "epoch": 5.575085165397717, "grad_norm": 0.48294222354888916, "learning_rate": 8.622173522456877e-06, "loss": 1.2164, "step": 18718 }, { "epoch": 5.575383011597387, "grad_norm": 0.2852092385292053, "learning_rate": 8.621218162064463e-06, "loss": 1.2211, "step": 18719 }, { "epoch": 5.575680857797055, "grad_norm": 0.27762648463249207, "learning_rate": 8.620262814500074e-06, "loss": 1.2014, "step": 18720 }, { "epoch": 5.575978703996723, "grad_norm": 0.3186171352863312, "learning_rate": 8.619307479772597e-06, "loss": 1.2106, "step": 18721 }, { "epoch": 5.5762765501963925, "grad_norm": 0.3749951124191284, "learning_rate": 8.618352157890921e-06, "loss": 1.2206, "step": 18722 }, { "epoch": 5.576574396396061, "grad_norm": 0.28278741240501404, "learning_rate": 8.617396848863937e-06, "loss": 1.2109, "step": 18723 }, { "epoch": 5.57687224259573, "grad_norm": 0.2718892991542816, "learning_rate": 8.616441552700528e-06, "loss": 1.2149, "step": 18724 }, { "epoch": 5.577170088795398, "grad_norm": 0.4734122157096863, "learning_rate": 8.615486269409587e-06, "loss": 1.221, "step": 18725 }, { "epoch": 5.577467934995067, "grad_norm": 0.3258187472820282, "learning_rate": 8.614530998999996e-06, "loss": 1.1993, "step": 18726 }, { "epoch": 5.577765781194736, "grad_norm": 0.3898892402648926, "learning_rate": 8.61357574148065e-06, "loss": 1.2075, "step": 18727 }, { "epoch": 5.578063627394404, "grad_norm": 0.32334861159324646, "learning_rate": 8.61262049686043e-06, "loss": 1.2034, "step": 18728 }, { "epoch": 5.5783614735940725, "grad_norm": 0.3380454182624817, "learning_rate": 8.61166526514823e-06, "loss": 1.2182, "step": 18729 }, { "epoch": 5.578659319793742, "grad_norm": 0.2916748821735382, "learning_rate": 8.610710046352929e-06, "loss": 1.2187, "step": 18730 }, { "epoch": 5.57895716599341, "grad_norm": 0.3558313846588135, "learning_rate": 8.609754840483422e-06, "loss": 1.208, "step": 18731 }, { "epoch": 5.579255012193078, "grad_norm": 0.32697343826293945, "learning_rate": 8.60879964754859e-06, "loss": 1.2068, "step": 18732 }, { "epoch": 5.579552858392748, "grad_norm": 0.26370006799697876, "learning_rate": 8.607844467557324e-06, "loss": 1.2128, "step": 18733 }, { "epoch": 5.579850704592416, "grad_norm": 0.3471877872943878, "learning_rate": 8.60688930051851e-06, "loss": 1.2152, "step": 18734 }, { "epoch": 5.580148550792085, "grad_norm": 0.2663270831108093, "learning_rate": 8.605934146441032e-06, "loss": 1.2091, "step": 18735 }, { "epoch": 5.5804463969917535, "grad_norm": 0.35175296664237976, "learning_rate": 8.604979005333784e-06, "loss": 1.2053, "step": 18736 }, { "epoch": 5.580744243191422, "grad_norm": 0.28595876693725586, "learning_rate": 8.604023877205649e-06, "loss": 1.2011, "step": 18737 }, { "epoch": 5.581042089391091, "grad_norm": 0.3111506998538971, "learning_rate": 8.603068762065507e-06, "loss": 1.2178, "step": 18738 }, { "epoch": 5.581339935590759, "grad_norm": 0.4185241758823395, "learning_rate": 8.60211365992225e-06, "loss": 1.2049, "step": 18739 }, { "epoch": 5.581637781790428, "grad_norm": 0.2716710567474365, "learning_rate": 8.601158570784768e-06, "loss": 1.2111, "step": 18740 }, { "epoch": 5.581935627990097, "grad_norm": 0.4536549746990204, "learning_rate": 8.600203494661936e-06, "loss": 1.1988, "step": 18741 }, { "epoch": 5.582233474189765, "grad_norm": 0.25362464785575867, "learning_rate": 8.599248431562652e-06, "loss": 1.2091, "step": 18742 }, { "epoch": 5.582531320389434, "grad_norm": 0.5159428119659424, "learning_rate": 8.598293381495797e-06, "loss": 1.224, "step": 18743 }, { "epoch": 5.582829166589103, "grad_norm": 0.2671842873096466, "learning_rate": 8.597338344470252e-06, "loss": 1.2188, "step": 18744 }, { "epoch": 5.583127012788771, "grad_norm": 0.4450357258319855, "learning_rate": 8.596383320494907e-06, "loss": 1.2102, "step": 18745 }, { "epoch": 5.58342485898844, "grad_norm": 0.2808002233505249, "learning_rate": 8.595428309578649e-06, "loss": 1.2119, "step": 18746 }, { "epoch": 5.583722705188109, "grad_norm": 0.3414168953895569, "learning_rate": 8.594473311730357e-06, "loss": 1.2076, "step": 18747 }, { "epoch": 5.584020551387777, "grad_norm": 0.2827218770980835, "learning_rate": 8.593518326958927e-06, "loss": 1.1986, "step": 18748 }, { "epoch": 5.584318397587445, "grad_norm": 0.45315414667129517, "learning_rate": 8.592563355273237e-06, "loss": 1.206, "step": 18749 }, { "epoch": 5.5846162437871145, "grad_norm": 0.4909801781177521, "learning_rate": 8.591608396682168e-06, "loss": 1.207, "step": 18750 }, { "epoch": 5.584914089986783, "grad_norm": 0.28537270426750183, "learning_rate": 8.59065345119461e-06, "loss": 1.1959, "step": 18751 }, { "epoch": 5.585211936186452, "grad_norm": 0.2702919542789459, "learning_rate": 8.589698518819445e-06, "loss": 1.2125, "step": 18752 }, { "epoch": 5.58550978238612, "grad_norm": 0.2869855463504791, "learning_rate": 8.588743599565565e-06, "loss": 1.2124, "step": 18753 }, { "epoch": 5.585807628585789, "grad_norm": 0.2739764451980591, "learning_rate": 8.58778869344185e-06, "loss": 1.2187, "step": 18754 }, { "epoch": 5.586105474785458, "grad_norm": 0.2951570153236389, "learning_rate": 8.586833800457178e-06, "loss": 1.1987, "step": 18755 }, { "epoch": 5.586403320985126, "grad_norm": 0.2770441770553589, "learning_rate": 8.585878920620442e-06, "loss": 1.2057, "step": 18756 }, { "epoch": 5.586701167184795, "grad_norm": 0.3250736594200134, "learning_rate": 8.58492405394052e-06, "loss": 1.2193, "step": 18757 }, { "epoch": 5.586999013384464, "grad_norm": 0.2726987600326538, "learning_rate": 8.5839692004263e-06, "loss": 1.2042, "step": 18758 }, { "epoch": 5.587296859584132, "grad_norm": 0.35027015209198, "learning_rate": 8.583014360086666e-06, "loss": 1.2219, "step": 18759 }, { "epoch": 5.5875947057838005, "grad_norm": 0.26397666335105896, "learning_rate": 8.5820595329305e-06, "loss": 1.2154, "step": 18760 }, { "epoch": 5.58789255198347, "grad_norm": 0.297736257314682, "learning_rate": 8.581104718966683e-06, "loss": 1.1915, "step": 18761 }, { "epoch": 5.588190398183138, "grad_norm": 0.2567819356918335, "learning_rate": 8.580149918204103e-06, "loss": 1.2153, "step": 18762 }, { "epoch": 5.588488244382807, "grad_norm": 0.25121933221817017, "learning_rate": 8.579195130651643e-06, "loss": 1.2009, "step": 18763 }, { "epoch": 5.5887860905824756, "grad_norm": 0.2932000160217285, "learning_rate": 8.578240356318182e-06, "loss": 1.2024, "step": 18764 }, { "epoch": 5.589083936782144, "grad_norm": 0.2614770531654358, "learning_rate": 8.577285595212609e-06, "loss": 1.2157, "step": 18765 }, { "epoch": 5.589381782981813, "grad_norm": 0.4117249548435211, "learning_rate": 8.576330847343805e-06, "loss": 1.2268, "step": 18766 }, { "epoch": 5.589679629181481, "grad_norm": 0.2886642813682556, "learning_rate": 8.575376112720646e-06, "loss": 1.2096, "step": 18767 }, { "epoch": 5.58997747538115, "grad_norm": 0.6023201942443848, "learning_rate": 8.574421391352025e-06, "loss": 1.2227, "step": 18768 }, { "epoch": 5.590275321580819, "grad_norm": 0.3868212401866913, "learning_rate": 8.573466683246818e-06, "loss": 1.197, "step": 18769 }, { "epoch": 5.590573167780487, "grad_norm": 0.555033266544342, "learning_rate": 8.572511988413911e-06, "loss": 1.2141, "step": 18770 }, { "epoch": 5.590871013980156, "grad_norm": 0.41188016533851624, "learning_rate": 8.571557306862187e-06, "loss": 1.1943, "step": 18771 }, { "epoch": 5.591168860179825, "grad_norm": 0.30140528082847595, "learning_rate": 8.570602638600522e-06, "loss": 1.2171, "step": 18772 }, { "epoch": 5.591466706379493, "grad_norm": 0.2842493951320648, "learning_rate": 8.569647983637807e-06, "loss": 1.2128, "step": 18773 }, { "epoch": 5.591764552579162, "grad_norm": 0.28710734844207764, "learning_rate": 8.568693341982916e-06, "loss": 1.2281, "step": 18774 }, { "epoch": 5.592062398778831, "grad_norm": 0.3069527745246887, "learning_rate": 8.567738713644733e-06, "loss": 1.2262, "step": 18775 }, { "epoch": 5.592360244978499, "grad_norm": 0.2823931872844696, "learning_rate": 8.566784098632145e-06, "loss": 1.2168, "step": 18776 }, { "epoch": 5.592658091178168, "grad_norm": 0.2592270076274872, "learning_rate": 8.565829496954031e-06, "loss": 1.2094, "step": 18777 }, { "epoch": 5.592955937377837, "grad_norm": 0.2832771837711334, "learning_rate": 8.564874908619267e-06, "loss": 1.2212, "step": 18778 }, { "epoch": 5.593253783577505, "grad_norm": 0.3229985535144806, "learning_rate": 8.563920333636741e-06, "loss": 1.2213, "step": 18779 }, { "epoch": 5.593551629777174, "grad_norm": 0.36390164494514465, "learning_rate": 8.56296577201533e-06, "loss": 1.2064, "step": 18780 }, { "epoch": 5.5938494759768425, "grad_norm": 0.3118399977684021, "learning_rate": 8.562011223763915e-06, "loss": 1.2143, "step": 18781 }, { "epoch": 5.594147322176511, "grad_norm": 0.3940849304199219, "learning_rate": 8.561056688891384e-06, "loss": 1.2118, "step": 18782 }, { "epoch": 5.59444516837618, "grad_norm": 0.32886838912963867, "learning_rate": 8.560102167406613e-06, "loss": 1.2108, "step": 18783 }, { "epoch": 5.594743014575848, "grad_norm": 0.4006761610507965, "learning_rate": 8.559147659318477e-06, "loss": 1.2062, "step": 18784 }, { "epoch": 5.595040860775517, "grad_norm": 0.36049771308898926, "learning_rate": 8.558193164635867e-06, "loss": 1.2015, "step": 18785 }, { "epoch": 5.595338706975186, "grad_norm": 0.5363227128982544, "learning_rate": 8.557238683367654e-06, "loss": 1.2056, "step": 18786 }, { "epoch": 5.595636553174854, "grad_norm": 0.2785434126853943, "learning_rate": 8.556284215522728e-06, "loss": 1.2278, "step": 18787 }, { "epoch": 5.5959343993745225, "grad_norm": 0.4219331443309784, "learning_rate": 8.555329761109964e-06, "loss": 1.2122, "step": 18788 }, { "epoch": 5.596232245574192, "grad_norm": 0.353397011756897, "learning_rate": 8.55437532013824e-06, "loss": 1.2174, "step": 18789 }, { "epoch": 5.59653009177386, "grad_norm": 0.37983760237693787, "learning_rate": 8.55342089261644e-06, "loss": 1.2183, "step": 18790 }, { "epoch": 5.596827937973529, "grad_norm": 0.47747302055358887, "learning_rate": 8.55246647855344e-06, "loss": 1.2159, "step": 18791 }, { "epoch": 5.597125784173198, "grad_norm": 0.3402019143104553, "learning_rate": 8.551512077958125e-06, "loss": 1.2093, "step": 18792 }, { "epoch": 5.597423630372866, "grad_norm": 0.43803659081459045, "learning_rate": 8.55055769083937e-06, "loss": 1.2114, "step": 18793 }, { "epoch": 5.597721476572535, "grad_norm": 0.2545213997364044, "learning_rate": 8.549603317206058e-06, "loss": 1.2043, "step": 18794 }, { "epoch": 5.5980193227722035, "grad_norm": 0.4699476361274719, "learning_rate": 8.548648957067065e-06, "loss": 1.2068, "step": 18795 }, { "epoch": 5.598317168971872, "grad_norm": 0.36989253759384155, "learning_rate": 8.547694610431275e-06, "loss": 1.1999, "step": 18796 }, { "epoch": 5.598615015171541, "grad_norm": 0.6188362836837769, "learning_rate": 8.54674027730756e-06, "loss": 1.2172, "step": 18797 }, { "epoch": 5.598912861371209, "grad_norm": 0.349924236536026, "learning_rate": 8.545785957704803e-06, "loss": 1.2266, "step": 18798 }, { "epoch": 5.599210707570878, "grad_norm": 0.2815835475921631, "learning_rate": 8.544831651631887e-06, "loss": 1.2155, "step": 18799 }, { "epoch": 5.599508553770547, "grad_norm": 0.6178797483444214, "learning_rate": 8.543877359097685e-06, "loss": 1.2115, "step": 18800 }, { "epoch": 5.599806399970215, "grad_norm": 0.30827033519744873, "learning_rate": 8.542923080111074e-06, "loss": 1.2162, "step": 18801 }, { "epoch": 5.6001042461698844, "grad_norm": 0.6834231615066528, "learning_rate": 8.54196881468094e-06, "loss": 1.2137, "step": 18802 }, { "epoch": 5.600402092369553, "grad_norm": 0.27194470167160034, "learning_rate": 8.541014562816155e-06, "loss": 1.2091, "step": 18803 }, { "epoch": 5.600699938569221, "grad_norm": 0.5771031379699707, "learning_rate": 8.540060324525599e-06, "loss": 1.2368, "step": 18804 }, { "epoch": 5.60099778476889, "grad_norm": 0.31596338748931885, "learning_rate": 8.539106099818153e-06, "loss": 1.2241, "step": 18805 }, { "epoch": 5.601295630968559, "grad_norm": 0.5005790591239929, "learning_rate": 8.538151888702689e-06, "loss": 1.2164, "step": 18806 }, { "epoch": 5.601593477168227, "grad_norm": 0.48484519124031067, "learning_rate": 8.53719769118809e-06, "loss": 1.2168, "step": 18807 }, { "epoch": 5.601891323367896, "grad_norm": 0.37810784578323364, "learning_rate": 8.536243507283237e-06, "loss": 1.2111, "step": 18808 }, { "epoch": 5.6021891695675645, "grad_norm": 0.4823022782802582, "learning_rate": 8.535289336996994e-06, "loss": 1.2149, "step": 18809 }, { "epoch": 5.602487015767233, "grad_norm": 0.2944742441177368, "learning_rate": 8.534335180338252e-06, "loss": 1.2198, "step": 18810 }, { "epoch": 5.602784861966902, "grad_norm": 0.5242235064506531, "learning_rate": 8.533381037315886e-06, "loss": 1.2257, "step": 18811 }, { "epoch": 5.60308270816657, "grad_norm": 0.3142121136188507, "learning_rate": 8.532426907938766e-06, "loss": 1.2181, "step": 18812 }, { "epoch": 5.60338055436624, "grad_norm": 0.6312505602836609, "learning_rate": 8.531472792215777e-06, "loss": 1.2207, "step": 18813 }, { "epoch": 5.603678400565908, "grad_norm": 0.2981034517288208, "learning_rate": 8.530518690155791e-06, "loss": 1.2019, "step": 18814 }, { "epoch": 5.603976246765576, "grad_norm": 0.47545936703681946, "learning_rate": 8.529564601767688e-06, "loss": 1.2126, "step": 18815 }, { "epoch": 5.604274092965245, "grad_norm": 0.33774271607398987, "learning_rate": 8.528610527060343e-06, "loss": 1.205, "step": 18816 }, { "epoch": 5.604571939164914, "grad_norm": 0.5593737959861755, "learning_rate": 8.527656466042635e-06, "loss": 1.2098, "step": 18817 }, { "epoch": 5.604869785364582, "grad_norm": 0.2931670844554901, "learning_rate": 8.526702418723434e-06, "loss": 1.2033, "step": 18818 }, { "epoch": 5.605167631564251, "grad_norm": 0.3417098820209503, "learning_rate": 8.525748385111627e-06, "loss": 1.2194, "step": 18819 }, { "epoch": 5.60546547776392, "grad_norm": 0.30952945351600647, "learning_rate": 8.524794365216079e-06, "loss": 1.2114, "step": 18820 }, { "epoch": 5.605763323963588, "grad_norm": 0.28921374678611755, "learning_rate": 8.523840359045671e-06, "loss": 1.2054, "step": 18821 }, { "epoch": 5.606061170163257, "grad_norm": 0.4081917703151703, "learning_rate": 8.522886366609284e-06, "loss": 1.2007, "step": 18822 }, { "epoch": 5.6063590163629256, "grad_norm": 0.27021026611328125, "learning_rate": 8.521932387915785e-06, "loss": 1.2161, "step": 18823 }, { "epoch": 5.606656862562594, "grad_norm": 0.3343936800956726, "learning_rate": 8.520978422974056e-06, "loss": 1.2213, "step": 18824 }, { "epoch": 5.606954708762263, "grad_norm": 0.38951677083969116, "learning_rate": 8.520024471792973e-06, "loss": 1.2167, "step": 18825 }, { "epoch": 5.607252554961931, "grad_norm": 0.28023195266723633, "learning_rate": 8.519070534381402e-06, "loss": 1.212, "step": 18826 }, { "epoch": 5.6075504011616, "grad_norm": 0.3627184331417084, "learning_rate": 8.518116610748229e-06, "loss": 1.2015, "step": 18827 }, { "epoch": 5.607848247361269, "grad_norm": 0.4622196555137634, "learning_rate": 8.517162700902327e-06, "loss": 1.2171, "step": 18828 }, { "epoch": 5.608146093560937, "grad_norm": 0.24727940559387207, "learning_rate": 8.516208804852563e-06, "loss": 1.2236, "step": 18829 }, { "epoch": 5.6084439397606065, "grad_norm": 0.49555546045303345, "learning_rate": 8.515254922607826e-06, "loss": 1.2069, "step": 18830 }, { "epoch": 5.608741785960275, "grad_norm": 0.33231067657470703, "learning_rate": 8.51430105417698e-06, "loss": 1.2274, "step": 18831 }, { "epoch": 5.609039632159943, "grad_norm": 0.3582731783390045, "learning_rate": 8.5133471995689e-06, "loss": 1.221, "step": 18832 }, { "epoch": 5.609337478359612, "grad_norm": 0.29329031705856323, "learning_rate": 8.512393358792464e-06, "loss": 1.2344, "step": 18833 }, { "epoch": 5.609635324559281, "grad_norm": 0.8112807273864746, "learning_rate": 8.511439531856546e-06, "loss": 1.2053, "step": 18834 }, { "epoch": 5.609933170758949, "grad_norm": 0.42413321137428284, "learning_rate": 8.510485718770023e-06, "loss": 1.2241, "step": 18835 }, { "epoch": 5.610231016958618, "grad_norm": 0.509304404258728, "learning_rate": 8.509531919541766e-06, "loss": 1.2085, "step": 18836 }, { "epoch": 5.610528863158287, "grad_norm": 0.2927802801132202, "learning_rate": 8.508578134180646e-06, "loss": 1.222, "step": 18837 }, { "epoch": 5.610826709357955, "grad_norm": 0.6966512799263, "learning_rate": 8.507624362695544e-06, "loss": 1.1927, "step": 18838 }, { "epoch": 5.611124555557624, "grad_norm": 0.2601867914199829, "learning_rate": 8.506670605095327e-06, "loss": 1.2122, "step": 18839 }, { "epoch": 5.6114224017572925, "grad_norm": 0.619441568851471, "learning_rate": 8.50571686138887e-06, "loss": 1.224, "step": 18840 }, { "epoch": 5.611720247956962, "grad_norm": 0.25965413451194763, "learning_rate": 8.504763131585051e-06, "loss": 1.2239, "step": 18841 }, { "epoch": 5.61201809415663, "grad_norm": 0.41319066286087036, "learning_rate": 8.503809415692743e-06, "loss": 1.2229, "step": 18842 }, { "epoch": 5.612315940356298, "grad_norm": 0.4615105390548706, "learning_rate": 8.50285571372081e-06, "loss": 1.236, "step": 18843 }, { "epoch": 5.6126137865559675, "grad_norm": 0.3761954605579376, "learning_rate": 8.501902025678139e-06, "loss": 1.2214, "step": 18844 }, { "epoch": 5.612911632755636, "grad_norm": 0.6200788617134094, "learning_rate": 8.500948351573591e-06, "loss": 1.1989, "step": 18845 }, { "epoch": 5.613209478955304, "grad_norm": 0.24761039018630981, "learning_rate": 8.499994691416043e-06, "loss": 1.2109, "step": 18846 }, { "epoch": 5.613507325154973, "grad_norm": 0.6030553579330444, "learning_rate": 8.499041045214373e-06, "loss": 1.2018, "step": 18847 }, { "epoch": 5.613805171354642, "grad_norm": 0.42672887444496155, "learning_rate": 8.498087412977448e-06, "loss": 1.2113, "step": 18848 }, { "epoch": 5.61410301755431, "grad_norm": 0.44532349705696106, "learning_rate": 8.497133794714138e-06, "loss": 1.2105, "step": 18849 }, { "epoch": 5.614400863753979, "grad_norm": 0.624701976776123, "learning_rate": 8.49618019043332e-06, "loss": 1.2215, "step": 18850 }, { "epoch": 5.614698709953648, "grad_norm": 0.2653880715370178, "learning_rate": 8.495226600143867e-06, "loss": 1.2222, "step": 18851 }, { "epoch": 5.614996556153316, "grad_norm": 0.5351306200027466, "learning_rate": 8.494273023854649e-06, "loss": 1.1984, "step": 18852 }, { "epoch": 5.615294402352985, "grad_norm": 0.38506999611854553, "learning_rate": 8.493319461574539e-06, "loss": 1.1987, "step": 18853 }, { "epoch": 5.6155922485526535, "grad_norm": 0.38100340962409973, "learning_rate": 8.492365913312404e-06, "loss": 1.204, "step": 18854 }, { "epoch": 5.615890094752322, "grad_norm": 0.44310852885246277, "learning_rate": 8.491412379077125e-06, "loss": 1.2164, "step": 18855 }, { "epoch": 5.616187940951991, "grad_norm": 0.32990390062332153, "learning_rate": 8.490458858877566e-06, "loss": 1.221, "step": 18856 }, { "epoch": 5.616485787151659, "grad_norm": 0.4279313385486603, "learning_rate": 8.489505352722598e-06, "loss": 1.2042, "step": 18857 }, { "epoch": 5.616783633351329, "grad_norm": 0.4555051922798157, "learning_rate": 8.4885518606211e-06, "loss": 1.2099, "step": 18858 }, { "epoch": 5.617081479550997, "grad_norm": 0.2626411020755768, "learning_rate": 8.487598382581939e-06, "loss": 1.218, "step": 18859 }, { "epoch": 5.617379325750665, "grad_norm": 0.534856915473938, "learning_rate": 8.48664491861398e-06, "loss": 1.2085, "step": 18860 }, { "epoch": 5.6176771719503344, "grad_norm": 0.3554114103317261, "learning_rate": 8.485691468726103e-06, "loss": 1.2312, "step": 18861 }, { "epoch": 5.617975018150003, "grad_norm": 0.2575479745864868, "learning_rate": 8.484738032927173e-06, "loss": 1.2201, "step": 18862 }, { "epoch": 5.618272864349671, "grad_norm": 0.34061622619628906, "learning_rate": 8.48378461122606e-06, "loss": 1.2132, "step": 18863 }, { "epoch": 5.61857071054934, "grad_norm": 0.37505868077278137, "learning_rate": 8.482831203631643e-06, "loss": 1.2189, "step": 18864 }, { "epoch": 5.618868556749009, "grad_norm": 0.25552624464035034, "learning_rate": 8.481877810152785e-06, "loss": 1.2091, "step": 18865 }, { "epoch": 5.619166402948677, "grad_norm": 0.35332217812538147, "learning_rate": 8.480924430798356e-06, "loss": 1.2025, "step": 18866 }, { "epoch": 5.619464249148346, "grad_norm": 0.27599403262138367, "learning_rate": 8.47997106557723e-06, "loss": 1.2023, "step": 18867 }, { "epoch": 5.6197620953480145, "grad_norm": 0.2798996567726135, "learning_rate": 8.479017714498272e-06, "loss": 1.205, "step": 18868 }, { "epoch": 5.620059941547684, "grad_norm": 0.2830234467983246, "learning_rate": 8.478064377570356e-06, "loss": 1.2093, "step": 18869 }, { "epoch": 5.620357787747352, "grad_norm": 0.24677562713623047, "learning_rate": 8.477111054802353e-06, "loss": 1.2056, "step": 18870 }, { "epoch": 5.62065563394702, "grad_norm": 0.2698245048522949, "learning_rate": 8.476157746203127e-06, "loss": 1.2119, "step": 18871 }, { "epoch": 5.62095348014669, "grad_norm": 0.318942666053772, "learning_rate": 8.475204451781552e-06, "loss": 1.21, "step": 18872 }, { "epoch": 5.621251326346358, "grad_norm": 0.3281908929347992, "learning_rate": 8.474251171546497e-06, "loss": 1.2293, "step": 18873 }, { "epoch": 5.621549172546026, "grad_norm": 0.27140331268310547, "learning_rate": 8.473297905506827e-06, "loss": 1.2012, "step": 18874 }, { "epoch": 5.6218470187456955, "grad_norm": 0.32080644369125366, "learning_rate": 8.472344653671418e-06, "loss": 1.221, "step": 18875 }, { "epoch": 5.622144864945364, "grad_norm": 0.42807522416114807, "learning_rate": 8.471391416049135e-06, "loss": 1.2233, "step": 18876 }, { "epoch": 5.622442711145032, "grad_norm": 0.33594784140586853, "learning_rate": 8.470438192648842e-06, "loss": 1.2104, "step": 18877 }, { "epoch": 5.622740557344701, "grad_norm": 0.3213953375816345, "learning_rate": 8.469484983479418e-06, "loss": 1.2106, "step": 18878 }, { "epoch": 5.62303840354437, "grad_norm": 0.4297904372215271, "learning_rate": 8.468531788549725e-06, "loss": 1.2107, "step": 18879 }, { "epoch": 5.623336249744039, "grad_norm": 0.28557974100112915, "learning_rate": 8.467578607868632e-06, "loss": 1.2018, "step": 18880 }, { "epoch": 5.623634095943707, "grad_norm": 0.2677142322063446, "learning_rate": 8.466625441445008e-06, "loss": 1.2246, "step": 18881 }, { "epoch": 5.6239319421433756, "grad_norm": 0.2982311546802521, "learning_rate": 8.465672289287723e-06, "loss": 1.1946, "step": 18882 }, { "epoch": 5.624229788343044, "grad_norm": 0.25173237919807434, "learning_rate": 8.464719151405637e-06, "loss": 1.2024, "step": 18883 }, { "epoch": 5.624527634542713, "grad_norm": 0.3390488922595978, "learning_rate": 8.463766027807631e-06, "loss": 1.2011, "step": 18884 }, { "epoch": 5.624825480742381, "grad_norm": 0.2797108292579651, "learning_rate": 8.462812918502561e-06, "loss": 1.2204, "step": 18885 }, { "epoch": 5.625123326942051, "grad_norm": 0.403302937746048, "learning_rate": 8.461859823499301e-06, "loss": 1.2188, "step": 18886 }, { "epoch": 5.625421173141719, "grad_norm": 0.2568717300891876, "learning_rate": 8.460906742806719e-06, "loss": 1.2233, "step": 18887 }, { "epoch": 5.625719019341387, "grad_norm": 0.27788710594177246, "learning_rate": 8.459953676433676e-06, "loss": 1.2051, "step": 18888 }, { "epoch": 5.6260168655410565, "grad_norm": 0.35678115487098694, "learning_rate": 8.459000624389048e-06, "loss": 1.2251, "step": 18889 }, { "epoch": 5.626314711740725, "grad_norm": 0.31350693106651306, "learning_rate": 8.458047586681697e-06, "loss": 1.2183, "step": 18890 }, { "epoch": 5.626612557940393, "grad_norm": 0.31936779618263245, "learning_rate": 8.457094563320486e-06, "loss": 1.2098, "step": 18891 }, { "epoch": 5.626910404140062, "grad_norm": 0.33972495794296265, "learning_rate": 8.456141554314289e-06, "loss": 1.2224, "step": 18892 }, { "epoch": 5.627208250339731, "grad_norm": 0.32694512605667114, "learning_rate": 8.455188559671972e-06, "loss": 1.2177, "step": 18893 }, { "epoch": 5.627506096539399, "grad_norm": 0.2845608592033386, "learning_rate": 8.454235579402395e-06, "loss": 1.2339, "step": 18894 }, { "epoch": 5.627803942739068, "grad_norm": 0.5377947092056274, "learning_rate": 8.453282613514432e-06, "loss": 1.2087, "step": 18895 }, { "epoch": 5.628101788938737, "grad_norm": 0.41585099697113037, "learning_rate": 8.452329662016946e-06, "loss": 1.194, "step": 18896 }, { "epoch": 5.628399635138406, "grad_norm": 0.4627498388290405, "learning_rate": 8.451376724918802e-06, "loss": 1.1897, "step": 18897 }, { "epoch": 5.628697481338074, "grad_norm": 0.320951372385025, "learning_rate": 8.450423802228868e-06, "loss": 1.2084, "step": 18898 }, { "epoch": 5.6289953275377425, "grad_norm": 0.5758249759674072, "learning_rate": 8.449470893956012e-06, "loss": 1.2189, "step": 18899 }, { "epoch": 5.629293173737412, "grad_norm": 0.29355067014694214, "learning_rate": 8.44851800010909e-06, "loss": 1.2192, "step": 18900 }, { "epoch": 5.62959101993708, "grad_norm": 0.44087085127830505, "learning_rate": 8.447565120696982e-06, "loss": 1.2107, "step": 18901 }, { "epoch": 5.629888866136748, "grad_norm": 0.2866421341896057, "learning_rate": 8.44661225572854e-06, "loss": 1.2108, "step": 18902 }, { "epoch": 5.6301867123364175, "grad_norm": 0.2748970091342926, "learning_rate": 8.445659405212641e-06, "loss": 1.1986, "step": 18903 }, { "epoch": 5.630484558536086, "grad_norm": 0.37564316391944885, "learning_rate": 8.444706569158141e-06, "loss": 1.2284, "step": 18904 }, { "epoch": 5.630782404735754, "grad_norm": 0.29964256286621094, "learning_rate": 8.443753747573907e-06, "loss": 1.219, "step": 18905 }, { "epoch": 5.631080250935423, "grad_norm": 0.3106033504009247, "learning_rate": 8.442800940468811e-06, "loss": 1.1973, "step": 18906 }, { "epoch": 5.631378097135092, "grad_norm": 0.473921537399292, "learning_rate": 8.44184814785171e-06, "loss": 1.2174, "step": 18907 }, { "epoch": 5.631675943334761, "grad_norm": 0.38011687994003296, "learning_rate": 8.44089536973147e-06, "loss": 1.2218, "step": 18908 }, { "epoch": 5.631973789534429, "grad_norm": 0.5878388285636902, "learning_rate": 8.439942606116958e-06, "loss": 1.2024, "step": 18909 }, { "epoch": 5.632271635734098, "grad_norm": 0.35579267144203186, "learning_rate": 8.438989857017036e-06, "loss": 1.2191, "step": 18910 }, { "epoch": 5.632569481933767, "grad_norm": 0.4992106854915619, "learning_rate": 8.438037122440568e-06, "loss": 1.216, "step": 18911 }, { "epoch": 5.632867328133435, "grad_norm": 0.32686930894851685, "learning_rate": 8.437084402396424e-06, "loss": 1.1955, "step": 18912 }, { "epoch": 5.6331651743331035, "grad_norm": 0.3842843770980835, "learning_rate": 8.436131696893462e-06, "loss": 1.205, "step": 18913 }, { "epoch": 5.633463020532773, "grad_norm": 0.40132343769073486, "learning_rate": 8.435179005940545e-06, "loss": 1.226, "step": 18914 }, { "epoch": 5.633760866732441, "grad_norm": 0.3229738473892212, "learning_rate": 8.43422632954654e-06, "loss": 1.2151, "step": 18915 }, { "epoch": 5.634058712932109, "grad_norm": 0.4361542761325836, "learning_rate": 8.433273667720311e-06, "loss": 1.1996, "step": 18916 }, { "epoch": 5.634356559131779, "grad_norm": 0.3054964244365692, "learning_rate": 8.432321020470716e-06, "loss": 1.2243, "step": 18917 }, { "epoch": 5.634654405331447, "grad_norm": 0.4058511555194855, "learning_rate": 8.431368387806628e-06, "loss": 1.2217, "step": 18918 }, { "epoch": 5.634952251531116, "grad_norm": 0.27990689873695374, "learning_rate": 8.430415769736899e-06, "loss": 1.2277, "step": 18919 }, { "epoch": 5.6352500977307844, "grad_norm": 0.372094988822937, "learning_rate": 8.429463166270402e-06, "loss": 1.2076, "step": 18920 }, { "epoch": 5.635547943930453, "grad_norm": 0.2758762538433075, "learning_rate": 8.428510577415994e-06, "loss": 1.2251, "step": 18921 }, { "epoch": 5.635845790130121, "grad_norm": 0.4484846591949463, "learning_rate": 8.427558003182537e-06, "loss": 1.2135, "step": 18922 }, { "epoch": 5.63614363632979, "grad_norm": 0.28194659948349, "learning_rate": 8.4266054435789e-06, "loss": 1.2089, "step": 18923 }, { "epoch": 5.636441482529459, "grad_norm": 0.3799077868461609, "learning_rate": 8.42565289861394e-06, "loss": 1.1986, "step": 18924 }, { "epoch": 5.636739328729128, "grad_norm": 0.297743558883667, "learning_rate": 8.42470036829652e-06, "loss": 1.2172, "step": 18925 }, { "epoch": 5.637037174928796, "grad_norm": 0.3965313136577606, "learning_rate": 8.423747852635505e-06, "loss": 1.2199, "step": 18926 }, { "epoch": 5.6373350211284645, "grad_norm": 0.3734728693962097, "learning_rate": 8.422795351639753e-06, "loss": 1.2079, "step": 18927 }, { "epoch": 5.637632867328134, "grad_norm": 0.43451786041259766, "learning_rate": 8.421842865318126e-06, "loss": 1.2205, "step": 18928 }, { "epoch": 5.637930713527802, "grad_norm": 0.45394381880760193, "learning_rate": 8.420890393679493e-06, "loss": 1.207, "step": 18929 }, { "epoch": 5.63822855972747, "grad_norm": 0.3637072443962097, "learning_rate": 8.41993793673271e-06, "loss": 1.205, "step": 18930 }, { "epoch": 5.63852640592714, "grad_norm": 0.5764725208282471, "learning_rate": 8.418985494486634e-06, "loss": 1.2403, "step": 18931 }, { "epoch": 5.638824252126808, "grad_norm": 0.2863178551197052, "learning_rate": 8.418033066950135e-06, "loss": 1.2124, "step": 18932 }, { "epoch": 5.639122098326476, "grad_norm": 0.5326183438301086, "learning_rate": 8.417080654132073e-06, "loss": 1.2185, "step": 18933 }, { "epoch": 5.6394199445261455, "grad_norm": 0.26585251092910767, "learning_rate": 8.416128256041303e-06, "loss": 1.213, "step": 18934 }, { "epoch": 5.639717790725814, "grad_norm": 0.528961181640625, "learning_rate": 8.415175872686692e-06, "loss": 1.2138, "step": 18935 }, { "epoch": 5.640015636925483, "grad_norm": 0.2646341919898987, "learning_rate": 8.414223504077097e-06, "loss": 1.2055, "step": 18936 }, { "epoch": 5.640313483125151, "grad_norm": 0.45840033888816833, "learning_rate": 8.413271150221385e-06, "loss": 1.2116, "step": 18937 }, { "epoch": 5.64061132932482, "grad_norm": 0.3243200182914734, "learning_rate": 8.412318811128408e-06, "loss": 1.208, "step": 18938 }, { "epoch": 5.640909175524489, "grad_norm": 0.3127284049987793, "learning_rate": 8.411366486807032e-06, "loss": 1.2072, "step": 18939 }, { "epoch": 5.641207021724157, "grad_norm": 0.44704189896583557, "learning_rate": 8.410414177266115e-06, "loss": 1.2271, "step": 18940 }, { "epoch": 5.6415048679238256, "grad_norm": 0.3079887330532074, "learning_rate": 8.409461882514522e-06, "loss": 1.2151, "step": 18941 }, { "epoch": 5.641802714123495, "grad_norm": 0.35030487179756165, "learning_rate": 8.408509602561104e-06, "loss": 1.2223, "step": 18942 }, { "epoch": 5.642100560323163, "grad_norm": 0.2610160708427429, "learning_rate": 8.407557337414729e-06, "loss": 1.2239, "step": 18943 }, { "epoch": 5.642398406522831, "grad_norm": 0.25979888439178467, "learning_rate": 8.406605087084252e-06, "loss": 1.2207, "step": 18944 }, { "epoch": 5.642696252722501, "grad_norm": 0.3337382376194, "learning_rate": 8.405652851578533e-06, "loss": 1.2228, "step": 18945 }, { "epoch": 5.642994098922169, "grad_norm": 0.34534624218940735, "learning_rate": 8.404700630906437e-06, "loss": 1.2325, "step": 18946 }, { "epoch": 5.643291945121838, "grad_norm": 0.25939908623695374, "learning_rate": 8.40374842507682e-06, "loss": 1.2174, "step": 18947 }, { "epoch": 5.6435897913215065, "grad_norm": 0.2573728859424591, "learning_rate": 8.402796234098535e-06, "loss": 1.2184, "step": 18948 }, { "epoch": 5.643887637521175, "grad_norm": 0.3470657467842102, "learning_rate": 8.401844057980452e-06, "loss": 1.2045, "step": 18949 }, { "epoch": 5.644185483720843, "grad_norm": 0.2562240958213806, "learning_rate": 8.40089189673142e-06, "loss": 1.2018, "step": 18950 }, { "epoch": 5.644483329920512, "grad_norm": 0.3188422620296478, "learning_rate": 8.399939750360302e-06, "loss": 1.2067, "step": 18951 }, { "epoch": 5.644781176120181, "grad_norm": 0.29142096638679504, "learning_rate": 8.398987618875963e-06, "loss": 1.2017, "step": 18952 }, { "epoch": 5.64507902231985, "grad_norm": 0.2580066919326782, "learning_rate": 8.398035502287247e-06, "loss": 1.2134, "step": 18953 }, { "epoch": 5.645376868519518, "grad_norm": 0.3329210877418518, "learning_rate": 8.397083400603028e-06, "loss": 1.2081, "step": 18954 }, { "epoch": 5.645674714719187, "grad_norm": 0.2585013806819916, "learning_rate": 8.396131313832157e-06, "loss": 1.2118, "step": 18955 }, { "epoch": 5.645972560918856, "grad_norm": 0.34125280380249023, "learning_rate": 8.395179241983486e-06, "loss": 1.2101, "step": 18956 }, { "epoch": 5.646270407118524, "grad_norm": 0.27053216099739075, "learning_rate": 8.394227185065883e-06, "loss": 1.198, "step": 18957 }, { "epoch": 5.6465682533181925, "grad_norm": 0.3379208743572235, "learning_rate": 8.393275143088205e-06, "loss": 1.2102, "step": 18958 }, { "epoch": 5.646866099517862, "grad_norm": 0.26291197538375854, "learning_rate": 8.3923231160593e-06, "loss": 1.2126, "step": 18959 }, { "epoch": 5.64716394571753, "grad_norm": 0.29343560338020325, "learning_rate": 8.391371103988037e-06, "loss": 1.2033, "step": 18960 }, { "epoch": 5.647461791917198, "grad_norm": 0.30597466230392456, "learning_rate": 8.390419106883268e-06, "loss": 1.219, "step": 18961 }, { "epoch": 5.6477596381168675, "grad_norm": 0.3227294385433197, "learning_rate": 8.38946712475385e-06, "loss": 1.2069, "step": 18962 }, { "epoch": 5.648057484316536, "grad_norm": 0.42713555693626404, "learning_rate": 8.38851515760864e-06, "loss": 1.195, "step": 18963 }, { "epoch": 5.648355330516205, "grad_norm": 0.6076002717018127, "learning_rate": 8.3875632054565e-06, "loss": 1.225, "step": 18964 }, { "epoch": 5.648653176715873, "grad_norm": 0.2540733218193054, "learning_rate": 8.386611268306275e-06, "loss": 1.2237, "step": 18965 }, { "epoch": 5.648951022915542, "grad_norm": 0.41091498732566833, "learning_rate": 8.385659346166837e-06, "loss": 1.233, "step": 18966 }, { "epoch": 5.649248869115211, "grad_norm": 0.28372156620025635, "learning_rate": 8.384707439047031e-06, "loss": 1.2191, "step": 18967 }, { "epoch": 5.649546715314879, "grad_norm": 0.3598971962928772, "learning_rate": 8.383755546955718e-06, "loss": 1.2235, "step": 18968 }, { "epoch": 5.649844561514548, "grad_norm": 0.2593168020248413, "learning_rate": 8.382803669901758e-06, "loss": 1.217, "step": 18969 }, { "epoch": 5.650142407714217, "grad_norm": 0.3058864176273346, "learning_rate": 8.381851807893997e-06, "loss": 1.2026, "step": 18970 }, { "epoch": 5.650440253913885, "grad_norm": 0.2763136625289917, "learning_rate": 8.380899960941303e-06, "loss": 1.2111, "step": 18971 }, { "epoch": 5.6507381001135535, "grad_norm": 0.26642781496047974, "learning_rate": 8.379948129052524e-06, "loss": 1.2084, "step": 18972 }, { "epoch": 5.651035946313223, "grad_norm": 0.36185839772224426, "learning_rate": 8.378996312236515e-06, "loss": 1.207, "step": 18973 }, { "epoch": 5.651333792512891, "grad_norm": 0.29904067516326904, "learning_rate": 8.378044510502135e-06, "loss": 1.2122, "step": 18974 }, { "epoch": 5.65163163871256, "grad_norm": 0.32451102137565613, "learning_rate": 8.377092723858242e-06, "loss": 1.2137, "step": 18975 }, { "epoch": 5.651929484912229, "grad_norm": 0.26365503668785095, "learning_rate": 8.376140952313683e-06, "loss": 1.2039, "step": 18976 }, { "epoch": 5.652227331111897, "grad_norm": 0.42056193947792053, "learning_rate": 8.375189195877322e-06, "loss": 1.2356, "step": 18977 }, { "epoch": 5.652525177311566, "grad_norm": 0.281960666179657, "learning_rate": 8.374237454558012e-06, "loss": 1.2018, "step": 18978 }, { "epoch": 5.6528230235112344, "grad_norm": 0.3317055404186249, "learning_rate": 8.3732857283646e-06, "loss": 1.2156, "step": 18979 }, { "epoch": 5.653120869710903, "grad_norm": 0.289728581905365, "learning_rate": 8.37233401730595e-06, "loss": 1.2128, "step": 18980 }, { "epoch": 5.653418715910572, "grad_norm": 0.3112536072731018, "learning_rate": 8.371382321390914e-06, "loss": 1.2087, "step": 18981 }, { "epoch": 5.65371656211024, "grad_norm": 0.26634669303894043, "learning_rate": 8.370430640628342e-06, "loss": 1.1908, "step": 18982 }, { "epoch": 5.654014408309909, "grad_norm": 0.26398199796676636, "learning_rate": 8.369478975027098e-06, "loss": 1.2136, "step": 18983 }, { "epoch": 5.654312254509578, "grad_norm": 0.28051403164863586, "learning_rate": 8.368527324596026e-06, "loss": 1.2097, "step": 18984 }, { "epoch": 5.654610100709246, "grad_norm": 0.2899719178676605, "learning_rate": 8.367575689343987e-06, "loss": 1.2244, "step": 18985 }, { "epoch": 5.654907946908915, "grad_norm": 0.3312186896800995, "learning_rate": 8.366624069279832e-06, "loss": 1.2207, "step": 18986 }, { "epoch": 5.655205793108584, "grad_norm": 0.26433223485946655, "learning_rate": 8.365672464412412e-06, "loss": 1.2051, "step": 18987 }, { "epoch": 5.655503639308252, "grad_norm": 0.28281447291374207, "learning_rate": 8.364720874750589e-06, "loss": 1.1937, "step": 18988 }, { "epoch": 5.65580148550792, "grad_norm": 0.2555307447910309, "learning_rate": 8.36376930030321e-06, "loss": 1.2254, "step": 18989 }, { "epoch": 5.65609933170759, "grad_norm": 0.30408424139022827, "learning_rate": 8.362817741079126e-06, "loss": 1.2226, "step": 18990 }, { "epoch": 5.656397177907258, "grad_norm": 0.27140143513679504, "learning_rate": 8.361866197087198e-06, "loss": 1.2, "step": 18991 }, { "epoch": 5.656695024106927, "grad_norm": 0.4277518689632416, "learning_rate": 8.360914668336273e-06, "loss": 1.2277, "step": 18992 }, { "epoch": 5.6569928703065955, "grad_norm": 0.33056220412254333, "learning_rate": 8.359963154835202e-06, "loss": 1.2102, "step": 18993 }, { "epoch": 5.657290716506264, "grad_norm": 0.4178558886051178, "learning_rate": 8.359011656592847e-06, "loss": 1.2317, "step": 18994 }, { "epoch": 5.657588562705933, "grad_norm": 0.33721283078193665, "learning_rate": 8.358060173618055e-06, "loss": 1.2214, "step": 18995 }, { "epoch": 5.657886408905601, "grad_norm": 0.3312799334526062, "learning_rate": 8.357108705919674e-06, "loss": 1.1996, "step": 18996 }, { "epoch": 5.65818425510527, "grad_norm": 0.2949334979057312, "learning_rate": 8.356157253506563e-06, "loss": 1.1997, "step": 18997 }, { "epoch": 5.658482101304939, "grad_norm": 0.2550343871116638, "learning_rate": 8.355205816387574e-06, "loss": 1.2174, "step": 18998 }, { "epoch": 5.658779947504607, "grad_norm": 0.29912087321281433, "learning_rate": 8.354254394571555e-06, "loss": 1.2096, "step": 18999 }, { "epoch": 5.6590777937042755, "grad_norm": 0.2648436725139618, "learning_rate": 8.353302988067364e-06, "loss": 1.2185, "step": 19000 }, { "epoch": 5.6590777937042755, "eval_loss": 1.319551944732666, "eval_runtime": 24.0597, "eval_samples_per_second": 72.071, "eval_steps_per_second": 4.53, "step": 19000 }, { "epoch": 5.659375639903945, "grad_norm": 0.3021543622016907, "learning_rate": 8.352351596883842e-06, "loss": 1.2241, "step": 19001 }, { "epoch": 5.659673486103613, "grad_norm": 0.25659576058387756, "learning_rate": 8.351400221029854e-06, "loss": 1.2227, "step": 19002 }, { "epoch": 5.659971332303282, "grad_norm": 0.3784889578819275, "learning_rate": 8.350448860514243e-06, "loss": 1.2191, "step": 19003 }, { "epoch": 5.660269178502951, "grad_norm": 0.3093247711658478, "learning_rate": 8.349497515345859e-06, "loss": 1.2171, "step": 19004 }, { "epoch": 5.660567024702619, "grad_norm": 0.29522302746772766, "learning_rate": 8.348546185533562e-06, "loss": 1.2271, "step": 19005 }, { "epoch": 5.660864870902288, "grad_norm": 0.2940315306186676, "learning_rate": 8.347594871086196e-06, "loss": 1.2193, "step": 19006 }, { "epoch": 5.6611627171019565, "grad_norm": 0.2734469175338745, "learning_rate": 8.34664357201261e-06, "loss": 1.2209, "step": 19007 }, { "epoch": 5.661460563301625, "grad_norm": 0.2689495086669922, "learning_rate": 8.345692288321664e-06, "loss": 1.2209, "step": 19008 }, { "epoch": 5.661758409501294, "grad_norm": 0.27630409598350525, "learning_rate": 8.344741020022199e-06, "loss": 1.2123, "step": 19009 }, { "epoch": 5.662056255700962, "grad_norm": 0.26288482546806335, "learning_rate": 8.343789767123067e-06, "loss": 1.2226, "step": 19010 }, { "epoch": 5.662354101900631, "grad_norm": 0.2920735478401184, "learning_rate": 8.342838529633127e-06, "loss": 1.1971, "step": 19011 }, { "epoch": 5.6626519481003, "grad_norm": 0.24532313644886017, "learning_rate": 8.341887307561222e-06, "loss": 1.2094, "step": 19012 }, { "epoch": 5.662949794299968, "grad_norm": 0.27133235335350037, "learning_rate": 8.3409361009162e-06, "loss": 1.2149, "step": 19013 }, { "epoch": 5.6632476404996375, "grad_norm": 0.30312278866767883, "learning_rate": 8.339984909706917e-06, "loss": 1.2144, "step": 19014 }, { "epoch": 5.663545486699306, "grad_norm": 0.35235607624053955, "learning_rate": 8.339033733942217e-06, "loss": 1.2117, "step": 19015 }, { "epoch": 5.663843332898974, "grad_norm": 0.3692496716976166, "learning_rate": 8.33808257363095e-06, "loss": 1.2185, "step": 19016 }, { "epoch": 5.6641411790986425, "grad_norm": 0.29738691449165344, "learning_rate": 8.337131428781974e-06, "loss": 1.2064, "step": 19017 }, { "epoch": 5.664439025298312, "grad_norm": 0.41260096430778503, "learning_rate": 8.336180299404126e-06, "loss": 1.2193, "step": 19018 }, { "epoch": 5.66473687149798, "grad_norm": 0.2588588297367096, "learning_rate": 8.335229185506267e-06, "loss": 1.211, "step": 19019 }, { "epoch": 5.665034717697649, "grad_norm": 0.28206971287727356, "learning_rate": 8.33427808709724e-06, "loss": 1.2098, "step": 19020 }, { "epoch": 5.6653325638973175, "grad_norm": 0.3220142126083374, "learning_rate": 8.333327004185889e-06, "loss": 1.2041, "step": 19021 }, { "epoch": 5.665630410096986, "grad_norm": 0.2897467315196991, "learning_rate": 8.332375936781072e-06, "loss": 1.2004, "step": 19022 }, { "epoch": 5.665928256296655, "grad_norm": 0.3630657196044922, "learning_rate": 8.331424884891636e-06, "loss": 1.2017, "step": 19023 }, { "epoch": 5.666226102496323, "grad_norm": 0.3009297251701355, "learning_rate": 8.330473848526421e-06, "loss": 1.2175, "step": 19024 }, { "epoch": 5.666523948695992, "grad_norm": 0.450808048248291, "learning_rate": 8.329522827694288e-06, "loss": 1.2163, "step": 19025 }, { "epoch": 5.666821794895661, "grad_norm": 0.292876273393631, "learning_rate": 8.328571822404074e-06, "loss": 1.2134, "step": 19026 }, { "epoch": 5.667119641095329, "grad_norm": 0.5328595638275146, "learning_rate": 8.327620832664632e-06, "loss": 1.2056, "step": 19027 }, { "epoch": 5.667417487294998, "grad_norm": 0.3636288642883301, "learning_rate": 8.326669858484814e-06, "loss": 1.1931, "step": 19028 }, { "epoch": 5.667715333494667, "grad_norm": 0.5162214636802673, "learning_rate": 8.325718899873461e-06, "loss": 1.211, "step": 19029 }, { "epoch": 5.668013179694335, "grad_norm": 0.36680299043655396, "learning_rate": 8.324767956839422e-06, "loss": 1.2063, "step": 19030 }, { "epoch": 5.668311025894004, "grad_norm": 0.461832731962204, "learning_rate": 8.323817029391548e-06, "loss": 1.208, "step": 19031 }, { "epoch": 5.668608872093673, "grad_norm": 0.2879669666290283, "learning_rate": 8.322866117538681e-06, "loss": 1.2145, "step": 19032 }, { "epoch": 5.668906718293341, "grad_norm": 0.36926543712615967, "learning_rate": 8.32191522128967e-06, "loss": 1.1999, "step": 19033 }, { "epoch": 5.66920456449301, "grad_norm": 0.36631879210472107, "learning_rate": 8.320964340653365e-06, "loss": 1.2087, "step": 19034 }, { "epoch": 5.669502410692679, "grad_norm": 0.4344761073589325, "learning_rate": 8.32001347563861e-06, "loss": 1.2041, "step": 19035 }, { "epoch": 5.669800256892347, "grad_norm": 0.39795640110969543, "learning_rate": 8.319062626254256e-06, "loss": 1.2098, "step": 19036 }, { "epoch": 5.670098103092016, "grad_norm": 0.36752912402153015, "learning_rate": 8.318111792509144e-06, "loss": 1.2046, "step": 19037 }, { "epoch": 5.6703959492916844, "grad_norm": 0.44404342770576477, "learning_rate": 8.31716097441212e-06, "loss": 1.1956, "step": 19038 }, { "epoch": 5.670693795491353, "grad_norm": 0.4044566750526428, "learning_rate": 8.316210171972035e-06, "loss": 1.2201, "step": 19039 }, { "epoch": 5.670991641691022, "grad_norm": 0.4066852033138275, "learning_rate": 8.315259385197736e-06, "loss": 1.2309, "step": 19040 }, { "epoch": 5.67128948789069, "grad_norm": 0.3474169969558716, "learning_rate": 8.31430861409806e-06, "loss": 1.2164, "step": 19041 }, { "epoch": 5.6715873340903595, "grad_norm": 0.3894362151622772, "learning_rate": 8.313357858681866e-06, "loss": 1.213, "step": 19042 }, { "epoch": 5.671885180290028, "grad_norm": 0.2640187442302704, "learning_rate": 8.312407118957987e-06, "loss": 1.2096, "step": 19043 }, { "epoch": 5.672183026489696, "grad_norm": 0.5282630920410156, "learning_rate": 8.311456394935276e-06, "loss": 1.2067, "step": 19044 }, { "epoch": 5.672480872689365, "grad_norm": 0.3081777095794678, "learning_rate": 8.310505686622578e-06, "loss": 1.1988, "step": 19045 }, { "epoch": 5.672778718889034, "grad_norm": 0.31176304817199707, "learning_rate": 8.309554994028737e-06, "loss": 1.2089, "step": 19046 }, { "epoch": 5.673076565088702, "grad_norm": 0.2757018506526947, "learning_rate": 8.308604317162595e-06, "loss": 1.2213, "step": 19047 }, { "epoch": 5.673374411288371, "grad_norm": 0.30400386452674866, "learning_rate": 8.307653656033004e-06, "loss": 1.2051, "step": 19048 }, { "epoch": 5.67367225748804, "grad_norm": 0.25698554515838623, "learning_rate": 8.306703010648803e-06, "loss": 1.2182, "step": 19049 }, { "epoch": 5.673970103687708, "grad_norm": 0.35520604252815247, "learning_rate": 8.305752381018838e-06, "loss": 1.2223, "step": 19050 }, { "epoch": 5.674267949887377, "grad_norm": 0.322625070810318, "learning_rate": 8.304801767151955e-06, "loss": 1.2248, "step": 19051 }, { "epoch": 5.6745657960870455, "grad_norm": 0.44615495204925537, "learning_rate": 8.303851169056996e-06, "loss": 1.2166, "step": 19052 }, { "epoch": 5.674863642286715, "grad_norm": 0.31453388929367065, "learning_rate": 8.30290058674281e-06, "loss": 1.2085, "step": 19053 }, { "epoch": 5.675161488486383, "grad_norm": 0.5074651837348938, "learning_rate": 8.301950020218239e-06, "loss": 1.2077, "step": 19054 }, { "epoch": 5.675459334686051, "grad_norm": 0.264446496963501, "learning_rate": 8.300999469492119e-06, "loss": 1.2259, "step": 19055 }, { "epoch": 5.67575718088572, "grad_norm": 0.503488302230835, "learning_rate": 8.300048934573306e-06, "loss": 1.2303, "step": 19056 }, { "epoch": 5.676055027085389, "grad_norm": 0.4028741419315338, "learning_rate": 8.29909841547064e-06, "loss": 1.2191, "step": 19057 }, { "epoch": 5.676352873285057, "grad_norm": 0.5179157853126526, "learning_rate": 8.298147912192957e-06, "loss": 1.2093, "step": 19058 }, { "epoch": 5.676650719484726, "grad_norm": 0.3718646168708801, "learning_rate": 8.29719742474911e-06, "loss": 1.2216, "step": 19059 }, { "epoch": 5.676948565684395, "grad_norm": 0.41000795364379883, "learning_rate": 8.29624695314794e-06, "loss": 1.2196, "step": 19060 }, { "epoch": 5.677246411884063, "grad_norm": 0.30932697653770447, "learning_rate": 8.295296497398285e-06, "loss": 1.2069, "step": 19061 }, { "epoch": 5.677544258083732, "grad_norm": 0.2911328077316284, "learning_rate": 8.294346057508992e-06, "loss": 1.2038, "step": 19062 }, { "epoch": 5.677842104283401, "grad_norm": 0.36987268924713135, "learning_rate": 8.293395633488905e-06, "loss": 1.2031, "step": 19063 }, { "epoch": 5.678139950483069, "grad_norm": 0.3083730638027191, "learning_rate": 8.292445225346863e-06, "loss": 1.21, "step": 19064 }, { "epoch": 5.678437796682738, "grad_norm": 0.2953846752643585, "learning_rate": 8.291494833091714e-06, "loss": 1.2102, "step": 19065 }, { "epoch": 5.6787356428824065, "grad_norm": 0.28472769260406494, "learning_rate": 8.29054445673229e-06, "loss": 1.1993, "step": 19066 }, { "epoch": 5.679033489082075, "grad_norm": 0.3901467025279999, "learning_rate": 8.289594096277447e-06, "loss": 1.2172, "step": 19067 }, { "epoch": 5.679331335281744, "grad_norm": 0.3293989300727844, "learning_rate": 8.288643751736016e-06, "loss": 1.2302, "step": 19068 }, { "epoch": 5.679629181481412, "grad_norm": 0.46018221974372864, "learning_rate": 8.287693423116841e-06, "loss": 1.1961, "step": 19069 }, { "epoch": 5.679927027681082, "grad_norm": 0.34103935956954956, "learning_rate": 8.28674311042877e-06, "loss": 1.2098, "step": 19070 }, { "epoch": 5.68022487388075, "grad_norm": 0.4321513772010803, "learning_rate": 8.285792813680641e-06, "loss": 1.193, "step": 19071 }, { "epoch": 5.680522720080418, "grad_norm": 0.3278217315673828, "learning_rate": 8.28484253288129e-06, "loss": 1.2055, "step": 19072 }, { "epoch": 5.6808205662800875, "grad_norm": 0.5738650560379028, "learning_rate": 8.283892268039568e-06, "loss": 1.2072, "step": 19073 }, { "epoch": 5.681118412479756, "grad_norm": 0.3378513753414154, "learning_rate": 8.282942019164308e-06, "loss": 1.2192, "step": 19074 }, { "epoch": 5.681416258679424, "grad_norm": 0.5449102520942688, "learning_rate": 8.281991786264352e-06, "loss": 1.2096, "step": 19075 }, { "epoch": 5.681714104879093, "grad_norm": 0.33484184741973877, "learning_rate": 8.281041569348547e-06, "loss": 1.2169, "step": 19076 }, { "epoch": 5.682011951078762, "grad_norm": 0.5389516353607178, "learning_rate": 8.280091368425731e-06, "loss": 1.2295, "step": 19077 }, { "epoch": 5.68230979727843, "grad_norm": 0.3976714611053467, "learning_rate": 8.279141183504737e-06, "loss": 1.2135, "step": 19078 }, { "epoch": 5.682607643478099, "grad_norm": 0.48223838210105896, "learning_rate": 8.278191014594415e-06, "loss": 1.2308, "step": 19079 }, { "epoch": 5.6829054896777675, "grad_norm": 0.41766807436943054, "learning_rate": 8.277240861703604e-06, "loss": 1.2225, "step": 19080 }, { "epoch": 5.683203335877437, "grad_norm": 0.3423973023891449, "learning_rate": 8.276290724841138e-06, "loss": 1.2155, "step": 19081 }, { "epoch": 5.683501182077105, "grad_norm": 0.7863313555717468, "learning_rate": 8.275340604015864e-06, "loss": 1.22, "step": 19082 }, { "epoch": 5.683799028276773, "grad_norm": 0.3401656746864319, "learning_rate": 8.274390499236616e-06, "loss": 1.2122, "step": 19083 }, { "epoch": 5.684096874476442, "grad_norm": 1.1781949996948242, "learning_rate": 8.273440410512238e-06, "loss": 1.211, "step": 19084 }, { "epoch": 5.684394720676111, "grad_norm": 0.3618912398815155, "learning_rate": 8.272490337851568e-06, "loss": 1.2213, "step": 19085 }, { "epoch": 5.684692566875779, "grad_norm": 0.677024245262146, "learning_rate": 8.271540281263446e-06, "loss": 1.219, "step": 19086 }, { "epoch": 5.6849904130754485, "grad_norm": 0.34416821599006653, "learning_rate": 8.270590240756709e-06, "loss": 1.2082, "step": 19087 }, { "epoch": 5.685288259275117, "grad_norm": 0.4384899437427521, "learning_rate": 8.2696402163402e-06, "loss": 1.2183, "step": 19088 }, { "epoch": 5.685586105474785, "grad_norm": 0.39428257942199707, "learning_rate": 8.268690208022752e-06, "loss": 1.2003, "step": 19089 }, { "epoch": 5.685883951674454, "grad_norm": 0.29664862155914307, "learning_rate": 8.267740215813212e-06, "loss": 1.2021, "step": 19090 }, { "epoch": 5.686181797874123, "grad_norm": 0.3758775293827057, "learning_rate": 8.26679023972041e-06, "loss": 1.2317, "step": 19091 }, { "epoch": 5.686479644073791, "grad_norm": 0.3397117853164673, "learning_rate": 8.265840279753187e-06, "loss": 1.2138, "step": 19092 }, { "epoch": 5.68677749027346, "grad_norm": 0.3522505760192871, "learning_rate": 8.264890335920387e-06, "loss": 1.2023, "step": 19093 }, { "epoch": 5.687075336473129, "grad_norm": 0.37819424271583557, "learning_rate": 8.263940408230844e-06, "loss": 1.2054, "step": 19094 }, { "epoch": 5.687373182672797, "grad_norm": 0.3502015769481659, "learning_rate": 8.262990496693391e-06, "loss": 1.2097, "step": 19095 }, { "epoch": 5.687671028872466, "grad_norm": 0.3551258444786072, "learning_rate": 8.262040601316876e-06, "loss": 1.1903, "step": 19096 }, { "epoch": 5.6879688750721344, "grad_norm": 0.28365689516067505, "learning_rate": 8.261090722110128e-06, "loss": 1.2216, "step": 19097 }, { "epoch": 5.688266721271804, "grad_norm": 0.29984399676322937, "learning_rate": 8.260140859081988e-06, "loss": 1.2217, "step": 19098 }, { "epoch": 5.688564567471472, "grad_norm": 0.41037264466285706, "learning_rate": 8.259191012241296e-06, "loss": 1.2084, "step": 19099 }, { "epoch": 5.68886241367114, "grad_norm": 0.2847413122653961, "learning_rate": 8.258241181596881e-06, "loss": 1.2201, "step": 19100 }, { "epoch": 5.6891602598708095, "grad_norm": 0.4303090572357178, "learning_rate": 8.257291367157593e-06, "loss": 1.2142, "step": 19101 }, { "epoch": 5.689458106070478, "grad_norm": 0.27760329842567444, "learning_rate": 8.25634156893226e-06, "loss": 1.2067, "step": 19102 }, { "epoch": 5.689755952270146, "grad_norm": 0.4031229019165039, "learning_rate": 8.255391786929718e-06, "loss": 1.2062, "step": 19103 }, { "epoch": 5.690053798469815, "grad_norm": 0.2993726432323456, "learning_rate": 8.254442021158807e-06, "loss": 1.2055, "step": 19104 }, { "epoch": 5.690351644669484, "grad_norm": 0.46615636348724365, "learning_rate": 8.253492271628365e-06, "loss": 1.2108, "step": 19105 }, { "epoch": 5.690649490869152, "grad_norm": 0.3425360918045044, "learning_rate": 8.252542538347222e-06, "loss": 1.2313, "step": 19106 }, { "epoch": 5.690947337068821, "grad_norm": 0.4575381278991699, "learning_rate": 8.251592821324222e-06, "loss": 1.2252, "step": 19107 }, { "epoch": 5.69124518326849, "grad_norm": 0.2913895547389984, "learning_rate": 8.250643120568197e-06, "loss": 1.2329, "step": 19108 }, { "epoch": 5.691543029468159, "grad_norm": 0.38088569045066833, "learning_rate": 8.249693436087982e-06, "loss": 1.2112, "step": 19109 }, { "epoch": 5.691840875667827, "grad_norm": 0.3749718964099884, "learning_rate": 8.248743767892413e-06, "loss": 1.2124, "step": 19110 }, { "epoch": 5.6921387218674955, "grad_norm": 0.37592044472694397, "learning_rate": 8.247794115990331e-06, "loss": 1.2101, "step": 19111 }, { "epoch": 5.692436568067165, "grad_norm": 0.31984758377075195, "learning_rate": 8.246844480390561e-06, "loss": 1.2035, "step": 19112 }, { "epoch": 5.692734414266833, "grad_norm": 0.2917548716068268, "learning_rate": 8.24589486110195e-06, "loss": 1.1921, "step": 19113 }, { "epoch": 5.693032260466501, "grad_norm": 0.3189534842967987, "learning_rate": 8.244945258133324e-06, "loss": 1.194, "step": 19114 }, { "epoch": 5.693330106666171, "grad_norm": 0.272670179605484, "learning_rate": 8.24399567149352e-06, "loss": 1.2146, "step": 19115 }, { "epoch": 5.693627952865839, "grad_norm": 0.25924283266067505, "learning_rate": 8.243046101191379e-06, "loss": 1.2105, "step": 19116 }, { "epoch": 5.693925799065507, "grad_norm": 0.2516448199748993, "learning_rate": 8.242096547235726e-06, "loss": 1.2196, "step": 19117 }, { "epoch": 5.694223645265176, "grad_norm": 0.29821452498435974, "learning_rate": 8.241147009635405e-06, "loss": 1.205, "step": 19118 }, { "epoch": 5.694521491464845, "grad_norm": 0.2622608542442322, "learning_rate": 8.240197488399246e-06, "loss": 1.2152, "step": 19119 }, { "epoch": 5.694819337664514, "grad_norm": 0.4225327968597412, "learning_rate": 8.23924798353608e-06, "loss": 1.2009, "step": 19120 }, { "epoch": 5.695117183864182, "grad_norm": 0.38297656178474426, "learning_rate": 8.238298495054745e-06, "loss": 1.2149, "step": 19121 }, { "epoch": 5.695415030063851, "grad_norm": 0.300771027803421, "learning_rate": 8.237349022964077e-06, "loss": 1.2134, "step": 19122 }, { "epoch": 5.695712876263519, "grad_norm": 0.3504418134689331, "learning_rate": 8.236399567272903e-06, "loss": 1.2099, "step": 19123 }, { "epoch": 5.696010722463188, "grad_norm": 0.24541020393371582, "learning_rate": 8.235450127990064e-06, "loss": 1.1963, "step": 19124 }, { "epoch": 5.6963085686628565, "grad_norm": 0.32109183073043823, "learning_rate": 8.23450070512439e-06, "loss": 1.2078, "step": 19125 }, { "epoch": 5.696606414862526, "grad_norm": 0.3012921214103699, "learning_rate": 8.233551298684711e-06, "loss": 1.2171, "step": 19126 }, { "epoch": 5.696904261062194, "grad_norm": 0.29415959119796753, "learning_rate": 8.232601908679865e-06, "loss": 1.231, "step": 19127 }, { "epoch": 5.697202107261862, "grad_norm": 0.33890417218208313, "learning_rate": 8.231652535118687e-06, "loss": 1.2107, "step": 19128 }, { "epoch": 5.697499953461532, "grad_norm": 0.30771562457084656, "learning_rate": 8.230703178010002e-06, "loss": 1.2321, "step": 19129 }, { "epoch": 5.6977977996612, "grad_norm": 0.310462087392807, "learning_rate": 8.22975383736265e-06, "loss": 1.2197, "step": 19130 }, { "epoch": 5.698095645860868, "grad_norm": 0.2630466818809509, "learning_rate": 8.22880451318546e-06, "loss": 1.1987, "step": 19131 }, { "epoch": 5.6983934920605375, "grad_norm": 0.3235372006893158, "learning_rate": 8.227855205487264e-06, "loss": 1.2148, "step": 19132 }, { "epoch": 5.698691338260206, "grad_norm": 0.26138269901275635, "learning_rate": 8.226905914276895e-06, "loss": 1.2171, "step": 19133 }, { "epoch": 5.698989184459874, "grad_norm": 0.3011850118637085, "learning_rate": 8.225956639563186e-06, "loss": 1.2199, "step": 19134 }, { "epoch": 5.699287030659543, "grad_norm": 0.3192721903324127, "learning_rate": 8.22500738135497e-06, "loss": 1.2137, "step": 19135 }, { "epoch": 5.699584876859212, "grad_norm": 0.46231967210769653, "learning_rate": 8.224058139661077e-06, "loss": 1.2038, "step": 19136 }, { "epoch": 5.699882723058881, "grad_norm": 0.28124892711639404, "learning_rate": 8.223108914490337e-06, "loss": 1.222, "step": 19137 }, { "epoch": 5.700180569258549, "grad_norm": 0.5450407862663269, "learning_rate": 8.222159705851583e-06, "loss": 1.2043, "step": 19138 }, { "epoch": 5.7004784154582175, "grad_norm": 0.26735734939575195, "learning_rate": 8.22121051375365e-06, "loss": 1.2262, "step": 19139 }, { "epoch": 5.700776261657887, "grad_norm": 0.6248392462730408, "learning_rate": 8.220261338205361e-06, "loss": 1.2291, "step": 19140 }, { "epoch": 5.701074107857555, "grad_norm": 0.3227476179599762, "learning_rate": 8.219312179215557e-06, "loss": 1.2025, "step": 19141 }, { "epoch": 5.701371954057223, "grad_norm": 0.4438363313674927, "learning_rate": 8.218363036793063e-06, "loss": 1.2093, "step": 19142 }, { "epoch": 5.701669800256893, "grad_norm": 0.5310785174369812, "learning_rate": 8.217413910946705e-06, "loss": 1.2251, "step": 19143 }, { "epoch": 5.701967646456561, "grad_norm": 0.42326900362968445, "learning_rate": 8.216464801685324e-06, "loss": 1.2351, "step": 19144 }, { "epoch": 5.702265492656229, "grad_norm": 0.6061856150627136, "learning_rate": 8.215515709017745e-06, "loss": 1.2225, "step": 19145 }, { "epoch": 5.7025633388558985, "grad_norm": 0.4341288208961487, "learning_rate": 8.214566632952795e-06, "loss": 1.2215, "step": 19146 }, { "epoch": 5.702861185055567, "grad_norm": 0.5088263750076294, "learning_rate": 8.21361757349931e-06, "loss": 1.1955, "step": 19147 }, { "epoch": 5.703159031255236, "grad_norm": 0.2771662473678589, "learning_rate": 8.212668530666122e-06, "loss": 1.2127, "step": 19148 }, { "epoch": 5.703456877454904, "grad_norm": 0.4809740483760834, "learning_rate": 8.211719504462047e-06, "loss": 1.2099, "step": 19149 }, { "epoch": 5.703754723654573, "grad_norm": 0.47004154324531555, "learning_rate": 8.21077049489593e-06, "loss": 1.2326, "step": 19150 }, { "epoch": 5.704052569854241, "grad_norm": 0.35316845774650574, "learning_rate": 8.209821501976591e-06, "loss": 1.236, "step": 19151 }, { "epoch": 5.70435041605391, "grad_norm": 0.5005456209182739, "learning_rate": 8.208872525712868e-06, "loss": 1.21, "step": 19152 }, { "epoch": 5.704648262253579, "grad_norm": 0.35879549384117126, "learning_rate": 8.207923566113584e-06, "loss": 1.2116, "step": 19153 }, { "epoch": 5.704946108453248, "grad_norm": 0.3176892399787903, "learning_rate": 8.206974623187564e-06, "loss": 1.2238, "step": 19154 }, { "epoch": 5.705243954652916, "grad_norm": 0.6765245199203491, "learning_rate": 8.206025696943648e-06, "loss": 1.2009, "step": 19155 }, { "epoch": 5.7055418008525844, "grad_norm": 0.29464155435562134, "learning_rate": 8.205076787390655e-06, "loss": 1.2245, "step": 19156 }, { "epoch": 5.705839647052254, "grad_norm": 0.49003055691719055, "learning_rate": 8.204127894537417e-06, "loss": 1.2129, "step": 19157 }, { "epoch": 5.706137493251922, "grad_norm": 0.4680720567703247, "learning_rate": 8.203179018392764e-06, "loss": 1.1992, "step": 19158 }, { "epoch": 5.70643533945159, "grad_norm": 0.3750953674316406, "learning_rate": 8.202230158965524e-06, "loss": 1.2268, "step": 19159 }, { "epoch": 5.7067331856512595, "grad_norm": 0.38773584365844727, "learning_rate": 8.201281316264518e-06, "loss": 1.213, "step": 19160 }, { "epoch": 5.707031031850928, "grad_norm": 0.3487052619457245, "learning_rate": 8.200332490298587e-06, "loss": 1.2156, "step": 19161 }, { "epoch": 5.707328878050596, "grad_norm": 0.32750964164733887, "learning_rate": 8.199383681076547e-06, "loss": 1.228, "step": 19162 }, { "epoch": 5.707626724250265, "grad_norm": 0.45693573355674744, "learning_rate": 8.198434888607229e-06, "loss": 1.2036, "step": 19163 }, { "epoch": 5.707924570449934, "grad_norm": 0.26374465227127075, "learning_rate": 8.197486112899465e-06, "loss": 1.2312, "step": 19164 }, { "epoch": 5.708222416649603, "grad_norm": 0.30079081654548645, "learning_rate": 8.19653735396208e-06, "loss": 1.2031, "step": 19165 }, { "epoch": 5.708520262849271, "grad_norm": 0.34954196214675903, "learning_rate": 8.195588611803894e-06, "loss": 1.2035, "step": 19166 }, { "epoch": 5.70881810904894, "grad_norm": 0.3157309889793396, "learning_rate": 8.194639886433745e-06, "loss": 1.2155, "step": 19167 }, { "epoch": 5.709115955248609, "grad_norm": 0.2708793580532074, "learning_rate": 8.19369117786045e-06, "loss": 1.2207, "step": 19168 }, { "epoch": 5.709413801448277, "grad_norm": 0.2903445065021515, "learning_rate": 8.192742486092844e-06, "loss": 1.2214, "step": 19169 }, { "epoch": 5.7097116476479455, "grad_norm": 0.4261883795261383, "learning_rate": 8.191793811139753e-06, "loss": 1.2022, "step": 19170 }, { "epoch": 5.710009493847615, "grad_norm": 0.2846320569515228, "learning_rate": 8.190845153009994e-06, "loss": 1.2159, "step": 19171 }, { "epoch": 5.710307340047283, "grad_norm": 0.447803258895874, "learning_rate": 8.189896511712404e-06, "loss": 1.2066, "step": 19172 }, { "epoch": 5.710605186246951, "grad_norm": 0.5086649656295776, "learning_rate": 8.188947887255803e-06, "loss": 1.2152, "step": 19173 }, { "epoch": 5.710903032446621, "grad_norm": 0.27532675862312317, "learning_rate": 8.187999279649016e-06, "loss": 1.213, "step": 19174 }, { "epoch": 5.711200878646289, "grad_norm": 0.40517324209213257, "learning_rate": 8.187050688900874e-06, "loss": 1.2157, "step": 19175 }, { "epoch": 5.711498724845958, "grad_norm": 0.31303679943084717, "learning_rate": 8.186102115020203e-06, "loss": 1.2161, "step": 19176 }, { "epoch": 5.711796571045626, "grad_norm": 0.2991512715816498, "learning_rate": 8.18515355801582e-06, "loss": 1.1982, "step": 19177 }, { "epoch": 5.712094417245295, "grad_norm": 0.32588645815849304, "learning_rate": 8.184205017896558e-06, "loss": 1.2086, "step": 19178 }, { "epoch": 5.712392263444964, "grad_norm": 0.27404963970184326, "learning_rate": 8.183256494671239e-06, "loss": 1.237, "step": 19179 }, { "epoch": 5.712690109644632, "grad_norm": 0.2768762409687042, "learning_rate": 8.182307988348686e-06, "loss": 1.2134, "step": 19180 }, { "epoch": 5.712987955844301, "grad_norm": 0.28320375084877014, "learning_rate": 8.181359498937731e-06, "loss": 1.2211, "step": 19181 }, { "epoch": 5.71328580204397, "grad_norm": 0.2991725206375122, "learning_rate": 8.180411026447188e-06, "loss": 1.2194, "step": 19182 }, { "epoch": 5.713583648243638, "grad_norm": 0.2722277343273163, "learning_rate": 8.179462570885893e-06, "loss": 1.2049, "step": 19183 }, { "epoch": 5.7138814944433065, "grad_norm": 0.33952224254608154, "learning_rate": 8.178514132262665e-06, "loss": 1.188, "step": 19184 }, { "epoch": 5.714179340642976, "grad_norm": 0.4549599885940552, "learning_rate": 8.177565710586324e-06, "loss": 1.1926, "step": 19185 }, { "epoch": 5.714477186842644, "grad_norm": 0.38919007778167725, "learning_rate": 8.1766173058657e-06, "loss": 1.2194, "step": 19186 }, { "epoch": 5.714775033042313, "grad_norm": 0.4535031318664551, "learning_rate": 8.175668918109614e-06, "loss": 1.1982, "step": 19187 }, { "epoch": 5.715072879241982, "grad_norm": 0.6928510665893555, "learning_rate": 8.17472054732689e-06, "loss": 1.207, "step": 19188 }, { "epoch": 5.71537072544165, "grad_norm": 0.3158282935619354, "learning_rate": 8.173772193526355e-06, "loss": 1.2142, "step": 19189 }, { "epoch": 5.715668571641318, "grad_norm": 0.471537709236145, "learning_rate": 8.172823856716826e-06, "loss": 1.2081, "step": 19190 }, { "epoch": 5.7159664178409875, "grad_norm": 0.43880006670951843, "learning_rate": 8.171875536907131e-06, "loss": 1.2195, "step": 19191 }, { "epoch": 5.716264264040656, "grad_norm": 0.4804430902004242, "learning_rate": 8.17092723410609e-06, "loss": 1.2173, "step": 19192 }, { "epoch": 5.716562110240325, "grad_norm": 0.45681121945381165, "learning_rate": 8.169978948322532e-06, "loss": 1.2119, "step": 19193 }, { "epoch": 5.716859956439993, "grad_norm": 0.4069162607192993, "learning_rate": 8.16903067956527e-06, "loss": 1.222, "step": 19194 }, { "epoch": 5.717157802639662, "grad_norm": 0.49170956015586853, "learning_rate": 8.168082427843136e-06, "loss": 1.2319, "step": 19195 }, { "epoch": 5.717455648839331, "grad_norm": 0.3608390688896179, "learning_rate": 8.167134193164945e-06, "loss": 1.2169, "step": 19196 }, { "epoch": 5.717753495038999, "grad_norm": 0.3721214830875397, "learning_rate": 8.166185975539525e-06, "loss": 1.2056, "step": 19197 }, { "epoch": 5.7180513412386675, "grad_norm": 0.26780083775520325, "learning_rate": 8.165237774975694e-06, "loss": 1.2252, "step": 19198 }, { "epoch": 5.718349187438337, "grad_norm": 0.3509320914745331, "learning_rate": 8.164289591482273e-06, "loss": 1.1987, "step": 19199 }, { "epoch": 5.718647033638005, "grad_norm": 0.3072911500930786, "learning_rate": 8.163341425068091e-06, "loss": 1.2136, "step": 19200 }, { "epoch": 5.718944879837673, "grad_norm": 0.34929409623146057, "learning_rate": 8.162393275741965e-06, "loss": 1.2269, "step": 19201 }, { "epoch": 5.719242726037343, "grad_norm": 0.2987555265426636, "learning_rate": 8.161445143512714e-06, "loss": 1.2274, "step": 19202 }, { "epoch": 5.719540572237011, "grad_norm": 0.26666340231895447, "learning_rate": 8.160497028389162e-06, "loss": 1.2093, "step": 19203 }, { "epoch": 5.71983841843668, "grad_norm": 0.3248349130153656, "learning_rate": 8.159548930380132e-06, "loss": 1.1982, "step": 19204 }, { "epoch": 5.7201362646363485, "grad_norm": 0.2517450451850891, "learning_rate": 8.158600849494439e-06, "loss": 1.1918, "step": 19205 }, { "epoch": 5.720434110836017, "grad_norm": 0.3940218687057495, "learning_rate": 8.157652785740913e-06, "loss": 1.2134, "step": 19206 }, { "epoch": 5.720731957035686, "grad_norm": 0.3380928337574005, "learning_rate": 8.156704739128368e-06, "loss": 1.2055, "step": 19207 }, { "epoch": 5.721029803235354, "grad_norm": 0.7472136616706848, "learning_rate": 8.155756709665623e-06, "loss": 1.2026, "step": 19208 }, { "epoch": 5.721327649435023, "grad_norm": 0.29066941142082214, "learning_rate": 8.154808697361503e-06, "loss": 1.2207, "step": 19209 }, { "epoch": 5.721625495634692, "grad_norm": 0.472308486700058, "learning_rate": 8.153860702224827e-06, "loss": 1.2335, "step": 19210 }, { "epoch": 5.72192334183436, "grad_norm": 0.2740098536014557, "learning_rate": 8.152912724264413e-06, "loss": 1.2163, "step": 19211 }, { "epoch": 5.722221188034029, "grad_norm": 0.3556009531021118, "learning_rate": 8.151964763489084e-06, "loss": 1.198, "step": 19212 }, { "epoch": 5.722519034233698, "grad_norm": 0.3266046643257141, "learning_rate": 8.15101681990766e-06, "loss": 1.2253, "step": 19213 }, { "epoch": 5.722816880433366, "grad_norm": 0.3187634348869324, "learning_rate": 8.150068893528953e-06, "loss": 1.2037, "step": 19214 }, { "epoch": 5.723114726633035, "grad_norm": 0.2581399977207184, "learning_rate": 8.149120984361792e-06, "loss": 1.2068, "step": 19215 }, { "epoch": 5.723412572832704, "grad_norm": 0.2611939013004303, "learning_rate": 8.14817309241499e-06, "loss": 1.2066, "step": 19216 }, { "epoch": 5.723710419032372, "grad_norm": 0.2910304367542267, "learning_rate": 8.147225217697372e-06, "loss": 1.2156, "step": 19217 }, { "epoch": 5.72400826523204, "grad_norm": 0.3033101260662079, "learning_rate": 8.146277360217753e-06, "loss": 1.2008, "step": 19218 }, { "epoch": 5.7243061114317095, "grad_norm": 0.2592804431915283, "learning_rate": 8.145329519984948e-06, "loss": 1.2114, "step": 19219 }, { "epoch": 5.724603957631378, "grad_norm": 0.337984174489975, "learning_rate": 8.144381697007784e-06, "loss": 1.2291, "step": 19220 }, { "epoch": 5.724901803831047, "grad_norm": 0.3439250588417053, "learning_rate": 8.143433891295073e-06, "loss": 1.2158, "step": 19221 }, { "epoch": 5.725199650030715, "grad_norm": 0.316287636756897, "learning_rate": 8.142486102855633e-06, "loss": 1.2024, "step": 19222 }, { "epoch": 5.725497496230384, "grad_norm": 0.32141873240470886, "learning_rate": 8.141538331698289e-06, "loss": 1.2128, "step": 19223 }, { "epoch": 5.725795342430053, "grad_norm": 0.29094937443733215, "learning_rate": 8.140590577831855e-06, "loss": 1.2066, "step": 19224 }, { "epoch": 5.726093188629721, "grad_norm": 0.2842795252799988, "learning_rate": 8.139642841265143e-06, "loss": 1.2027, "step": 19225 }, { "epoch": 5.72639103482939, "grad_norm": 0.25643765926361084, "learning_rate": 8.138695122006978e-06, "loss": 1.2219, "step": 19226 }, { "epoch": 5.726688881029059, "grad_norm": 0.3652380704879761, "learning_rate": 8.137747420066178e-06, "loss": 1.1971, "step": 19227 }, { "epoch": 5.726986727228727, "grad_norm": 0.39302146434783936, "learning_rate": 8.136799735451553e-06, "loss": 1.2139, "step": 19228 }, { "epoch": 5.7272845734283955, "grad_norm": 0.2970762252807617, "learning_rate": 8.135852068171929e-06, "loss": 1.1927, "step": 19229 }, { "epoch": 5.727582419628065, "grad_norm": 0.26464593410491943, "learning_rate": 8.13490441823612e-06, "loss": 1.2139, "step": 19230 }, { "epoch": 5.727880265827733, "grad_norm": 0.3146645724773407, "learning_rate": 8.133956785652935e-06, "loss": 1.2171, "step": 19231 }, { "epoch": 5.728178112027402, "grad_norm": 0.2746916711330414, "learning_rate": 8.1330091704312e-06, "loss": 1.2115, "step": 19232 }, { "epoch": 5.728475958227071, "grad_norm": 0.3675459027290344, "learning_rate": 8.132061572579728e-06, "loss": 1.2101, "step": 19233 }, { "epoch": 5.728773804426739, "grad_norm": 0.4824284017086029, "learning_rate": 8.13111399210734e-06, "loss": 1.2117, "step": 19234 }, { "epoch": 5.729071650626408, "grad_norm": 0.2716885209083557, "learning_rate": 8.130166429022848e-06, "loss": 1.2091, "step": 19235 }, { "epoch": 5.729369496826076, "grad_norm": 0.4524732828140259, "learning_rate": 8.129218883335063e-06, "loss": 1.2208, "step": 19236 }, { "epoch": 5.729667343025745, "grad_norm": 0.25957584381103516, "learning_rate": 8.128271355052812e-06, "loss": 1.2136, "step": 19237 }, { "epoch": 5.729965189225414, "grad_norm": 0.4286913275718689, "learning_rate": 8.127323844184901e-06, "loss": 1.2279, "step": 19238 }, { "epoch": 5.730263035425082, "grad_norm": 0.3456757664680481, "learning_rate": 8.126376350740148e-06, "loss": 1.2104, "step": 19239 }, { "epoch": 5.730560881624751, "grad_norm": 0.34896788001060486, "learning_rate": 8.125428874727374e-06, "loss": 1.2083, "step": 19240 }, { "epoch": 5.73085872782442, "grad_norm": 0.31779733300209045, "learning_rate": 8.12448141615539e-06, "loss": 1.207, "step": 19241 }, { "epoch": 5.731156574024088, "grad_norm": 0.2977970242500305, "learning_rate": 8.123533975033007e-06, "loss": 1.2074, "step": 19242 }, { "epoch": 5.731454420223757, "grad_norm": 0.27172422409057617, "learning_rate": 8.122586551369049e-06, "loss": 1.209, "step": 19243 }, { "epoch": 5.731752266423426, "grad_norm": 0.24826082587242126, "learning_rate": 8.121639145172322e-06, "loss": 1.2047, "step": 19244 }, { "epoch": 5.732050112623094, "grad_norm": 0.34129610657691956, "learning_rate": 8.120691756451642e-06, "loss": 1.2173, "step": 19245 }, { "epoch": 5.732347958822763, "grad_norm": 0.2712298631668091, "learning_rate": 8.119744385215829e-06, "loss": 1.2168, "step": 19246 }, { "epoch": 5.732645805022432, "grad_norm": 0.34561648964881897, "learning_rate": 8.118797031473696e-06, "loss": 1.2094, "step": 19247 }, { "epoch": 5.7329436512221, "grad_norm": 0.2662794888019562, "learning_rate": 8.11784969523405e-06, "loss": 1.1895, "step": 19248 }, { "epoch": 5.733241497421769, "grad_norm": 0.4298684000968933, "learning_rate": 8.11690237650571e-06, "loss": 1.2255, "step": 19249 }, { "epoch": 5.7335393436214375, "grad_norm": 0.41783541440963745, "learning_rate": 8.11595507529749e-06, "loss": 1.2065, "step": 19250 }, { "epoch": 5.733837189821106, "grad_norm": 0.26658281683921814, "learning_rate": 8.115007791618204e-06, "loss": 1.222, "step": 19251 }, { "epoch": 5.734135036020775, "grad_norm": 0.33240213990211487, "learning_rate": 8.114060525476667e-06, "loss": 1.219, "step": 19252 }, { "epoch": 5.734432882220443, "grad_norm": 0.3223637640476227, "learning_rate": 8.113113276881685e-06, "loss": 1.2047, "step": 19253 }, { "epoch": 5.7347307284201126, "grad_norm": 0.2844304144382477, "learning_rate": 8.112166045842079e-06, "loss": 1.2111, "step": 19254 }, { "epoch": 5.735028574619781, "grad_norm": 0.3888123631477356, "learning_rate": 8.111218832366657e-06, "loss": 1.211, "step": 19255 }, { "epoch": 5.735326420819449, "grad_norm": 0.24945160746574402, "learning_rate": 8.110271636464236e-06, "loss": 1.2113, "step": 19256 }, { "epoch": 5.7356242670191175, "grad_norm": 0.4705652594566345, "learning_rate": 8.109324458143622e-06, "loss": 1.2278, "step": 19257 }, { "epoch": 5.735922113218787, "grad_norm": 0.25226935744285583, "learning_rate": 8.108377297413636e-06, "loss": 1.2071, "step": 19258 }, { "epoch": 5.736219959418455, "grad_norm": 0.40976861119270325, "learning_rate": 8.10743015428308e-06, "loss": 1.2139, "step": 19259 }, { "epoch": 5.736517805618124, "grad_norm": 0.3066800832748413, "learning_rate": 8.106483028760778e-06, "loss": 1.2137, "step": 19260 }, { "epoch": 5.736815651817793, "grad_norm": 0.31210604310035706, "learning_rate": 8.105535920855534e-06, "loss": 1.218, "step": 19261 }, { "epoch": 5.737113498017461, "grad_norm": 0.2538163661956787, "learning_rate": 8.104588830576158e-06, "loss": 1.217, "step": 19262 }, { "epoch": 5.73741134421713, "grad_norm": 0.3894919455051422, "learning_rate": 8.103641757931472e-06, "loss": 1.2154, "step": 19263 }, { "epoch": 5.7377091904167985, "grad_norm": 0.27999603748321533, "learning_rate": 8.102694702930278e-06, "loss": 1.2243, "step": 19264 }, { "epoch": 5.738007036616467, "grad_norm": 0.28781571984291077, "learning_rate": 8.101747665581386e-06, "loss": 1.2122, "step": 19265 }, { "epoch": 5.738304882816136, "grad_norm": 0.5580193400382996, "learning_rate": 8.100800645893616e-06, "loss": 1.2131, "step": 19266 }, { "epoch": 5.738602729015804, "grad_norm": 0.29654157161712646, "learning_rate": 8.099853643875771e-06, "loss": 1.2186, "step": 19267 }, { "epoch": 5.738900575215473, "grad_norm": 0.5443137884140015, "learning_rate": 8.098906659536664e-06, "loss": 1.2124, "step": 19268 }, { "epoch": 5.739198421415142, "grad_norm": 0.2780085802078247, "learning_rate": 8.097959692885111e-06, "loss": 1.2158, "step": 19269 }, { "epoch": 5.73949626761481, "grad_norm": 0.39644768834114075, "learning_rate": 8.097012743929913e-06, "loss": 1.2163, "step": 19270 }, { "epoch": 5.7397941138144795, "grad_norm": 0.4487845301628113, "learning_rate": 8.09606581267989e-06, "loss": 1.1921, "step": 19271 }, { "epoch": 5.740091960014148, "grad_norm": 0.5084857940673828, "learning_rate": 8.095118899143848e-06, "loss": 1.2046, "step": 19272 }, { "epoch": 5.740389806213816, "grad_norm": 0.36046403646469116, "learning_rate": 8.09417200333059e-06, "loss": 1.224, "step": 19273 }, { "epoch": 5.740687652413485, "grad_norm": 0.27846071124076843, "learning_rate": 8.093225125248936e-06, "loss": 1.2185, "step": 19274 }, { "epoch": 5.740985498613154, "grad_norm": 0.4942079186439514, "learning_rate": 8.092278264907694e-06, "loss": 1.2153, "step": 19275 }, { "epoch": 5.741283344812822, "grad_norm": 0.2698473632335663, "learning_rate": 8.091331422315667e-06, "loss": 1.2119, "step": 19276 }, { "epoch": 5.741581191012491, "grad_norm": 0.4069843292236328, "learning_rate": 8.090384597481672e-06, "loss": 1.2127, "step": 19277 }, { "epoch": 5.7418790372121595, "grad_norm": 0.26671722531318665, "learning_rate": 8.089437790414513e-06, "loss": 1.2021, "step": 19278 }, { "epoch": 5.742176883411828, "grad_norm": 0.2694877088069916, "learning_rate": 8.088491001123e-06, "loss": 1.2091, "step": 19279 }, { "epoch": 5.742474729611497, "grad_norm": 0.35932037234306335, "learning_rate": 8.087544229615945e-06, "loss": 1.2067, "step": 19280 }, { "epoch": 5.742772575811165, "grad_norm": 0.3522062599658966, "learning_rate": 8.086597475902155e-06, "loss": 1.2123, "step": 19281 }, { "epoch": 5.743070422010835, "grad_norm": 0.4137420356273651, "learning_rate": 8.085650739990432e-06, "loss": 1.203, "step": 19282 }, { "epoch": 5.743368268210503, "grad_norm": 0.3549848198890686, "learning_rate": 8.084704021889596e-06, "loss": 1.2117, "step": 19283 }, { "epoch": 5.743666114410171, "grad_norm": 0.4859996736049652, "learning_rate": 8.083757321608443e-06, "loss": 1.2066, "step": 19284 }, { "epoch": 5.74396396060984, "grad_norm": 0.2760433554649353, "learning_rate": 8.08281063915579e-06, "loss": 1.2112, "step": 19285 }, { "epoch": 5.744261806809509, "grad_norm": 0.6055020689964294, "learning_rate": 8.081863974540447e-06, "loss": 1.2284, "step": 19286 }, { "epoch": 5.744559653009177, "grad_norm": 0.28894391655921936, "learning_rate": 8.080917327771209e-06, "loss": 1.2234, "step": 19287 }, { "epoch": 5.744857499208846, "grad_norm": 0.45702478289604187, "learning_rate": 8.079970698856897e-06, "loss": 1.217, "step": 19288 }, { "epoch": 5.745155345408515, "grad_norm": 0.25937530398368835, "learning_rate": 8.079024087806313e-06, "loss": 1.2195, "step": 19289 }, { "epoch": 5.745453191608183, "grad_norm": 0.35895881056785583, "learning_rate": 8.078077494628258e-06, "loss": 1.2191, "step": 19290 }, { "epoch": 5.745751037807852, "grad_norm": 0.28511467576026917, "learning_rate": 8.077130919331547e-06, "loss": 1.2254, "step": 19291 }, { "epoch": 5.746048884007521, "grad_norm": 0.3195737302303314, "learning_rate": 8.076184361924986e-06, "loss": 1.2095, "step": 19292 }, { "epoch": 5.746346730207189, "grad_norm": 0.30785128474235535, "learning_rate": 8.075237822417378e-06, "loss": 1.2191, "step": 19293 }, { "epoch": 5.746644576406858, "grad_norm": 0.3183385729789734, "learning_rate": 8.074291300817534e-06, "loss": 1.2129, "step": 19294 }, { "epoch": 5.746942422606526, "grad_norm": 0.5007601976394653, "learning_rate": 8.073344797134258e-06, "loss": 1.2093, "step": 19295 }, { "epoch": 5.747240268806195, "grad_norm": 0.39805346727371216, "learning_rate": 8.072398311376352e-06, "loss": 1.2024, "step": 19296 }, { "epoch": 5.747538115005864, "grad_norm": 0.43453508615493774, "learning_rate": 8.07145184355263e-06, "loss": 1.217, "step": 19297 }, { "epoch": 5.747835961205532, "grad_norm": 0.3058512210845947, "learning_rate": 8.07050539367189e-06, "loss": 1.2059, "step": 19298 }, { "epoch": 5.7481338074052015, "grad_norm": 0.5790659785270691, "learning_rate": 8.069558961742945e-06, "loss": 1.2206, "step": 19299 }, { "epoch": 5.74843165360487, "grad_norm": 0.3180426359176636, "learning_rate": 8.068612547774598e-06, "loss": 1.2102, "step": 19300 }, { "epoch": 5.748729499804538, "grad_norm": 0.40445476770401, "learning_rate": 8.067666151775649e-06, "loss": 1.2152, "step": 19301 }, { "epoch": 5.749027346004207, "grad_norm": 0.26040560007095337, "learning_rate": 8.066719773754911e-06, "loss": 1.2103, "step": 19302 }, { "epoch": 5.749325192203876, "grad_norm": 0.3587285280227661, "learning_rate": 8.065773413721186e-06, "loss": 1.2262, "step": 19303 }, { "epoch": 5.749623038403544, "grad_norm": 0.27444958686828613, "learning_rate": 8.064827071683275e-06, "loss": 1.229, "step": 19304 }, { "epoch": 5.749920884603213, "grad_norm": 0.28268489241600037, "learning_rate": 8.06388074764999e-06, "loss": 1.2281, "step": 19305 }, { "epoch": 5.750218730802882, "grad_norm": 0.28504547476768494, "learning_rate": 8.062934441630131e-06, "loss": 1.2039, "step": 19306 }, { "epoch": 5.75051657700255, "grad_norm": 0.42083004117012024, "learning_rate": 8.061988153632499e-06, "loss": 1.1994, "step": 19307 }, { "epoch": 5.750814423202219, "grad_norm": 0.26682978868484497, "learning_rate": 8.061041883665906e-06, "loss": 1.2367, "step": 19308 }, { "epoch": 5.7511122694018875, "grad_norm": 0.46401992440223694, "learning_rate": 8.060095631739148e-06, "loss": 1.2209, "step": 19309 }, { "epoch": 5.751410115601557, "grad_norm": 0.2903793454170227, "learning_rate": 8.059149397861034e-06, "loss": 1.2298, "step": 19310 }, { "epoch": 5.751707961801225, "grad_norm": 0.4303167164325714, "learning_rate": 8.058203182040369e-06, "loss": 1.2184, "step": 19311 }, { "epoch": 5.752005808000893, "grad_norm": 0.2538061738014221, "learning_rate": 8.057256984285954e-06, "loss": 1.204, "step": 19312 }, { "epoch": 5.7523036542005626, "grad_norm": 0.31230324506759644, "learning_rate": 8.056310804606587e-06, "loss": 1.2003, "step": 19313 }, { "epoch": 5.752601500400231, "grad_norm": 0.2916463315486908, "learning_rate": 8.05536464301108e-06, "loss": 1.2084, "step": 19314 }, { "epoch": 5.752899346599899, "grad_norm": 0.2824610769748688, "learning_rate": 8.054418499508232e-06, "loss": 1.2125, "step": 19315 }, { "epoch": 5.753197192799568, "grad_norm": 0.3339831829071045, "learning_rate": 8.053472374106845e-06, "loss": 1.215, "step": 19316 }, { "epoch": 5.753495038999237, "grad_norm": 0.2544691264629364, "learning_rate": 8.052526266815726e-06, "loss": 1.1975, "step": 19317 }, { "epoch": 5.753792885198905, "grad_norm": 0.2540312707424164, "learning_rate": 8.05158017764367e-06, "loss": 1.2128, "step": 19318 }, { "epoch": 5.754090731398574, "grad_norm": 0.2933233380317688, "learning_rate": 8.050634106599488e-06, "loss": 1.2117, "step": 19319 }, { "epoch": 5.754388577598243, "grad_norm": 0.40847817063331604, "learning_rate": 8.049688053691976e-06, "loss": 1.1959, "step": 19320 }, { "epoch": 5.754686423797912, "grad_norm": 0.2613731026649475, "learning_rate": 8.048742018929934e-06, "loss": 1.2155, "step": 19321 }, { "epoch": 5.75498426999758, "grad_norm": 0.3122713267803192, "learning_rate": 8.047796002322171e-06, "loss": 1.2108, "step": 19322 }, { "epoch": 5.7552821161972485, "grad_norm": 0.2656697630882263, "learning_rate": 8.046850003877487e-06, "loss": 1.2146, "step": 19323 }, { "epoch": 5.755579962396917, "grad_norm": 0.29155322909355164, "learning_rate": 8.045904023604677e-06, "loss": 1.2178, "step": 19324 }, { "epoch": 5.755877808596586, "grad_norm": 0.3093149662017822, "learning_rate": 8.04495806151255e-06, "loss": 1.2181, "step": 19325 }, { "epoch": 5.756175654796254, "grad_norm": 0.28253135085105896, "learning_rate": 8.044012117609902e-06, "loss": 1.2167, "step": 19326 }, { "epoch": 5.756473500995924, "grad_norm": 0.29040855169296265, "learning_rate": 8.043066191905536e-06, "loss": 1.2022, "step": 19327 }, { "epoch": 5.756771347195592, "grad_norm": 0.27360713481903076, "learning_rate": 8.042120284408255e-06, "loss": 1.2145, "step": 19328 }, { "epoch": 5.75706919339526, "grad_norm": 0.3463855981826782, "learning_rate": 8.041174395126857e-06, "loss": 1.2137, "step": 19329 }, { "epoch": 5.7573670395949295, "grad_norm": 0.2982986569404602, "learning_rate": 8.04022852407014e-06, "loss": 1.2015, "step": 19330 }, { "epoch": 5.757664885794598, "grad_norm": 0.3009822964668274, "learning_rate": 8.039282671246909e-06, "loss": 1.2093, "step": 19331 }, { "epoch": 5.757962731994266, "grad_norm": 0.2775547504425049, "learning_rate": 8.03833683666596e-06, "loss": 1.2097, "step": 19332 }, { "epoch": 5.758260578193935, "grad_norm": 0.2594717741012573, "learning_rate": 8.037391020336095e-06, "loss": 1.2067, "step": 19333 }, { "epoch": 5.758558424393604, "grad_norm": 0.4200427532196045, "learning_rate": 8.036445222266119e-06, "loss": 1.2209, "step": 19334 }, { "epoch": 5.758856270593272, "grad_norm": 0.32719770073890686, "learning_rate": 8.03549944246482e-06, "loss": 1.2004, "step": 19335 }, { "epoch": 5.759154116792941, "grad_norm": 0.36505675315856934, "learning_rate": 8.034553680941007e-06, "loss": 1.2191, "step": 19336 }, { "epoch": 5.7594519629926095, "grad_norm": 0.42838454246520996, "learning_rate": 8.033607937703475e-06, "loss": 1.2267, "step": 19337 }, { "epoch": 5.759749809192279, "grad_norm": 0.2745087146759033, "learning_rate": 8.032662212761025e-06, "loss": 1.2143, "step": 19338 }, { "epoch": 5.760047655391947, "grad_norm": 0.4445969760417938, "learning_rate": 8.031716506122454e-06, "loss": 1.2037, "step": 19339 }, { "epoch": 5.760345501591615, "grad_norm": 0.2586653530597687, "learning_rate": 8.030770817796566e-06, "loss": 1.2191, "step": 19340 }, { "epoch": 5.760643347791285, "grad_norm": 0.40743568539619446, "learning_rate": 8.029825147792149e-06, "loss": 1.2105, "step": 19341 }, { "epoch": 5.760941193990953, "grad_norm": 0.25817403197288513, "learning_rate": 8.028879496118013e-06, "loss": 1.2091, "step": 19342 }, { "epoch": 5.761239040190621, "grad_norm": 0.3616870045661926, "learning_rate": 8.027933862782949e-06, "loss": 1.2226, "step": 19343 }, { "epoch": 5.7615368863902905, "grad_norm": 0.35949820280075073, "learning_rate": 8.026988247795753e-06, "loss": 1.2155, "step": 19344 }, { "epoch": 5.761834732589959, "grad_norm": 0.5290505290031433, "learning_rate": 8.026042651165234e-06, "loss": 1.2185, "step": 19345 }, { "epoch": 5.762132578789627, "grad_norm": 0.3248412013053894, "learning_rate": 8.025097072900181e-06, "loss": 1.223, "step": 19346 }, { "epoch": 5.762430424989296, "grad_norm": 0.4513121247291565, "learning_rate": 8.02415151300939e-06, "loss": 1.2118, "step": 19347 }, { "epoch": 5.762728271188965, "grad_norm": 0.34418195486068726, "learning_rate": 8.023205971501664e-06, "loss": 1.2233, "step": 19348 }, { "epoch": 5.763026117388634, "grad_norm": 0.3057760000228882, "learning_rate": 8.022260448385795e-06, "loss": 1.2173, "step": 19349 }, { "epoch": 5.763323963588302, "grad_norm": 0.3657165765762329, "learning_rate": 8.021314943670584e-06, "loss": 1.2229, "step": 19350 }, { "epoch": 5.763621809787971, "grad_norm": 0.3583250045776367, "learning_rate": 8.020369457364832e-06, "loss": 1.2071, "step": 19351 }, { "epoch": 5.763919655987639, "grad_norm": 0.4996359944343567, "learning_rate": 8.019423989477324e-06, "loss": 1.2232, "step": 19352 }, { "epoch": 5.764217502187308, "grad_norm": 0.30307087302207947, "learning_rate": 8.018478540016866e-06, "loss": 1.2144, "step": 19353 }, { "epoch": 5.764515348386976, "grad_norm": 0.2732860743999481, "learning_rate": 8.017533108992252e-06, "loss": 1.22, "step": 19354 }, { "epoch": 5.764813194586646, "grad_norm": 0.3784959614276886, "learning_rate": 8.016587696412275e-06, "loss": 1.22, "step": 19355 }, { "epoch": 5.765111040786314, "grad_norm": 0.2700169086456299, "learning_rate": 8.015642302285733e-06, "loss": 1.208, "step": 19356 }, { "epoch": 5.765408886985982, "grad_norm": 0.3230387568473816, "learning_rate": 8.014696926621425e-06, "loss": 1.2161, "step": 19357 }, { "epoch": 5.7657067331856515, "grad_norm": 0.2916318476200104, "learning_rate": 8.013751569428141e-06, "loss": 1.2202, "step": 19358 }, { "epoch": 5.76600457938532, "grad_norm": 0.2668834626674652, "learning_rate": 8.012806230714684e-06, "loss": 1.2045, "step": 19359 }, { "epoch": 5.766302425584988, "grad_norm": 0.2626957595348358, "learning_rate": 8.011860910489841e-06, "loss": 1.2112, "step": 19360 }, { "epoch": 5.766600271784657, "grad_norm": 0.25540390610694885, "learning_rate": 8.010915608762412e-06, "loss": 1.2169, "step": 19361 }, { "epoch": 5.766898117984326, "grad_norm": 0.27726197242736816, "learning_rate": 8.009970325541192e-06, "loss": 1.2048, "step": 19362 }, { "epoch": 5.767195964183994, "grad_norm": 0.3548414707183838, "learning_rate": 8.009025060834974e-06, "loss": 1.2143, "step": 19363 }, { "epoch": 5.767493810383663, "grad_norm": 0.3036036789417267, "learning_rate": 8.008079814652551e-06, "loss": 1.2103, "step": 19364 }, { "epoch": 5.767791656583332, "grad_norm": 0.3073996305465698, "learning_rate": 8.007134587002724e-06, "loss": 1.2019, "step": 19365 }, { "epoch": 5.768089502783001, "grad_norm": 0.3572111427783966, "learning_rate": 8.00618937789428e-06, "loss": 1.2206, "step": 19366 }, { "epoch": 5.768387348982669, "grad_norm": 0.2723371386528015, "learning_rate": 8.005244187336018e-06, "loss": 1.2139, "step": 19367 }, { "epoch": 5.7686851951823375, "grad_norm": 0.3631337881088257, "learning_rate": 8.004299015336729e-06, "loss": 1.2168, "step": 19368 }, { "epoch": 5.768983041382007, "grad_norm": 0.3015212416648865, "learning_rate": 8.003353861905206e-06, "loss": 1.1829, "step": 19369 }, { "epoch": 5.769280887581675, "grad_norm": 0.3681679666042328, "learning_rate": 8.002408727050248e-06, "loss": 1.2057, "step": 19370 }, { "epoch": 5.769578733781343, "grad_norm": 0.40260106325149536, "learning_rate": 8.001463610780647e-06, "loss": 1.2184, "step": 19371 }, { "epoch": 5.7698765799810126, "grad_norm": 0.4095645844936371, "learning_rate": 8.000518513105188e-06, "loss": 1.2184, "step": 19372 }, { "epoch": 5.770174426180681, "grad_norm": 0.35802316665649414, "learning_rate": 7.999573434032672e-06, "loss": 1.2027, "step": 19373 }, { "epoch": 5.770472272380349, "grad_norm": 0.3471449613571167, "learning_rate": 7.998628373571894e-06, "loss": 1.2115, "step": 19374 }, { "epoch": 5.770770118580018, "grad_norm": 0.39103150367736816, "learning_rate": 7.997683331731638e-06, "loss": 1.2126, "step": 19375 }, { "epoch": 5.771067964779687, "grad_norm": 0.2856650948524475, "learning_rate": 7.996738308520706e-06, "loss": 1.202, "step": 19376 }, { "epoch": 5.771365810979356, "grad_norm": 0.38072243332862854, "learning_rate": 7.995793303947885e-06, "loss": 1.2012, "step": 19377 }, { "epoch": 5.771663657179024, "grad_norm": 0.28907880187034607, "learning_rate": 7.994848318021966e-06, "loss": 1.2067, "step": 19378 }, { "epoch": 5.771961503378693, "grad_norm": 0.493081659078598, "learning_rate": 7.993903350751742e-06, "loss": 1.2209, "step": 19379 }, { "epoch": 5.772259349578362, "grad_norm": 0.26373782753944397, "learning_rate": 7.99295840214601e-06, "loss": 1.2122, "step": 19380 }, { "epoch": 5.77255719577803, "grad_norm": 0.4298916459083557, "learning_rate": 7.992013472213553e-06, "loss": 1.2122, "step": 19381 }, { "epoch": 5.7728550419776985, "grad_norm": 0.3300621211528778, "learning_rate": 7.991068560963173e-06, "loss": 1.2254, "step": 19382 }, { "epoch": 5.773152888177368, "grad_norm": 0.2928406894207001, "learning_rate": 7.990123668403649e-06, "loss": 1.2318, "step": 19383 }, { "epoch": 5.773450734377036, "grad_norm": 0.3084755539894104, "learning_rate": 7.989178794543784e-06, "loss": 1.2125, "step": 19384 }, { "epoch": 5.773748580576704, "grad_norm": 0.31379589438438416, "learning_rate": 7.988233939392362e-06, "loss": 1.2073, "step": 19385 }, { "epoch": 5.774046426776374, "grad_norm": 0.2632942199707031, "learning_rate": 7.987289102958172e-06, "loss": 1.224, "step": 19386 }, { "epoch": 5.774344272976042, "grad_norm": 0.44776707887649536, "learning_rate": 7.986344285250014e-06, "loss": 1.2063, "step": 19387 }, { "epoch": 5.774642119175711, "grad_norm": 0.25434747338294983, "learning_rate": 7.985399486276672e-06, "loss": 1.2062, "step": 19388 }, { "epoch": 5.7749399653753795, "grad_norm": 0.43667274713516235, "learning_rate": 7.984454706046931e-06, "loss": 1.2085, "step": 19389 }, { "epoch": 5.775237811575048, "grad_norm": 0.3168714642524719, "learning_rate": 7.983509944569594e-06, "loss": 1.201, "step": 19390 }, { "epoch": 5.775535657774716, "grad_norm": 0.4105963706970215, "learning_rate": 7.98256520185344e-06, "loss": 1.2145, "step": 19391 }, { "epoch": 5.775833503974385, "grad_norm": 0.38376665115356445, "learning_rate": 7.98162047790726e-06, "loss": 1.2121, "step": 19392 }, { "epoch": 5.776131350174054, "grad_norm": 0.28114792704582214, "learning_rate": 7.980675772739855e-06, "loss": 1.2052, "step": 19393 }, { "epoch": 5.776429196373723, "grad_norm": 0.26987892389297485, "learning_rate": 7.979731086360002e-06, "loss": 1.2218, "step": 19394 }, { "epoch": 5.776727042573391, "grad_norm": 0.30014467239379883, "learning_rate": 7.978786418776491e-06, "loss": 1.2092, "step": 19395 }, { "epoch": 5.7770248887730595, "grad_norm": 0.3099175691604614, "learning_rate": 7.977841769998115e-06, "loss": 1.2346, "step": 19396 }, { "epoch": 5.777322734972729, "grad_norm": 0.32902398705482483, "learning_rate": 7.976897140033665e-06, "loss": 1.2061, "step": 19397 }, { "epoch": 5.777620581172397, "grad_norm": 0.3419867753982544, "learning_rate": 7.975952528891924e-06, "loss": 1.233, "step": 19398 }, { "epoch": 5.777918427372065, "grad_norm": 0.34635451436042786, "learning_rate": 7.975007936581685e-06, "loss": 1.2125, "step": 19399 }, { "epoch": 5.778216273571735, "grad_norm": 0.31191107630729675, "learning_rate": 7.974063363111733e-06, "loss": 1.2266, "step": 19400 }, { "epoch": 5.778514119771403, "grad_norm": 0.3498908281326294, "learning_rate": 7.97311880849086e-06, "loss": 1.2317, "step": 19401 }, { "epoch": 5.778811965971071, "grad_norm": 0.3947415351867676, "learning_rate": 7.97217427272785e-06, "loss": 1.2084, "step": 19402 }, { "epoch": 5.7791098121707405, "grad_norm": 0.28250670433044434, "learning_rate": 7.971229755831494e-06, "loss": 1.2108, "step": 19403 }, { "epoch": 5.779407658370409, "grad_norm": 0.2685490846633911, "learning_rate": 7.970285257810577e-06, "loss": 1.2109, "step": 19404 }, { "epoch": 5.779705504570078, "grad_norm": 0.31267213821411133, "learning_rate": 7.969340778673891e-06, "loss": 1.226, "step": 19405 }, { "epoch": 5.780003350769746, "grad_norm": 0.3549420237541199, "learning_rate": 7.968396318430216e-06, "loss": 1.2155, "step": 19406 }, { "epoch": 5.780301196969415, "grad_norm": 0.2676822245121002, "learning_rate": 7.967451877088348e-06, "loss": 1.2141, "step": 19407 }, { "epoch": 5.780599043169084, "grad_norm": 0.4018521010875702, "learning_rate": 7.966507454657067e-06, "loss": 1.2148, "step": 19408 }, { "epoch": 5.780896889368752, "grad_norm": 0.349960058927536, "learning_rate": 7.96556305114516e-06, "loss": 1.225, "step": 19409 }, { "epoch": 5.781194735568421, "grad_norm": 0.4603356122970581, "learning_rate": 7.964618666561418e-06, "loss": 1.2232, "step": 19410 }, { "epoch": 5.78149258176809, "grad_norm": 0.2915216088294983, "learning_rate": 7.96367430091463e-06, "loss": 1.2269, "step": 19411 }, { "epoch": 5.781790427967758, "grad_norm": 0.3176279366016388, "learning_rate": 7.96272995421357e-06, "loss": 1.221, "step": 19412 }, { "epoch": 5.782088274167426, "grad_norm": 0.2789140045642853, "learning_rate": 7.961785626467036e-06, "loss": 1.2267, "step": 19413 }, { "epoch": 5.782386120367096, "grad_norm": 0.2658219039440155, "learning_rate": 7.960841317683805e-06, "loss": 1.1986, "step": 19414 }, { "epoch": 5.782683966566764, "grad_norm": 0.3000280559062958, "learning_rate": 7.95989702787267e-06, "loss": 1.222, "step": 19415 }, { "epoch": 5.782981812766433, "grad_norm": 0.27580147981643677, "learning_rate": 7.958952757042415e-06, "loss": 1.2208, "step": 19416 }, { "epoch": 5.7832796589661015, "grad_norm": 0.2718835175037384, "learning_rate": 7.958008505201821e-06, "loss": 1.2155, "step": 19417 }, { "epoch": 5.78357750516577, "grad_norm": 0.25872963666915894, "learning_rate": 7.957064272359682e-06, "loss": 1.2101, "step": 19418 }, { "epoch": 5.783875351365438, "grad_norm": 0.3069838583469391, "learning_rate": 7.956120058524774e-06, "loss": 1.2184, "step": 19419 }, { "epoch": 5.784173197565107, "grad_norm": 0.4329480528831482, "learning_rate": 7.955175863705884e-06, "loss": 1.2139, "step": 19420 }, { "epoch": 5.784471043764776, "grad_norm": 0.3012837767601013, "learning_rate": 7.9542316879118e-06, "loss": 1.1893, "step": 19421 }, { "epoch": 5.784768889964445, "grad_norm": 0.4695422351360321, "learning_rate": 7.953287531151305e-06, "loss": 1.2073, "step": 19422 }, { "epoch": 5.785066736164113, "grad_norm": 0.3115541934967041, "learning_rate": 7.95234339343318e-06, "loss": 1.2165, "step": 19423 }, { "epoch": 5.785364582363782, "grad_norm": 0.49520808458328247, "learning_rate": 7.951399274766214e-06, "loss": 1.2114, "step": 19424 }, { "epoch": 5.785662428563451, "grad_norm": 0.2969418466091156, "learning_rate": 7.950455175159188e-06, "loss": 1.2283, "step": 19425 }, { "epoch": 5.785960274763119, "grad_norm": 0.3932274580001831, "learning_rate": 7.949511094620888e-06, "loss": 1.2163, "step": 19426 }, { "epoch": 5.7862581209627875, "grad_norm": 0.27351394295692444, "learning_rate": 7.948567033160095e-06, "loss": 1.2033, "step": 19427 }, { "epoch": 5.786555967162457, "grad_norm": 0.33573392033576965, "learning_rate": 7.947622990785596e-06, "loss": 1.2052, "step": 19428 }, { "epoch": 5.786853813362125, "grad_norm": 0.32856041193008423, "learning_rate": 7.946678967506167e-06, "loss": 1.2176, "step": 19429 }, { "epoch": 5.787151659561793, "grad_norm": 0.4364908039569855, "learning_rate": 7.945734963330602e-06, "loss": 1.2045, "step": 19430 }, { "epoch": 5.7874495057614626, "grad_norm": 0.2859799265861511, "learning_rate": 7.944790978267673e-06, "loss": 1.2285, "step": 19431 }, { "epoch": 5.787747351961131, "grad_norm": 0.268753319978714, "learning_rate": 7.94384701232617e-06, "loss": 1.2036, "step": 19432 }, { "epoch": 5.7880451981608, "grad_norm": 0.2933937609195709, "learning_rate": 7.942903065514876e-06, "loss": 1.215, "step": 19433 }, { "epoch": 5.788343044360468, "grad_norm": 0.3423647880554199, "learning_rate": 7.941959137842565e-06, "loss": 1.2122, "step": 19434 }, { "epoch": 5.788640890560137, "grad_norm": 0.3489307761192322, "learning_rate": 7.94101522931803e-06, "loss": 1.2048, "step": 19435 }, { "epoch": 5.788938736759806, "grad_norm": 0.3236292600631714, "learning_rate": 7.940071339950047e-06, "loss": 1.1957, "step": 19436 }, { "epoch": 5.789236582959474, "grad_norm": 0.3881048262119293, "learning_rate": 7.939127469747397e-06, "loss": 1.2432, "step": 19437 }, { "epoch": 5.789534429159143, "grad_norm": 0.29771867394447327, "learning_rate": 7.938183618718862e-06, "loss": 1.2061, "step": 19438 }, { "epoch": 5.789832275358812, "grad_norm": 0.34279343485832214, "learning_rate": 7.93723978687323e-06, "loss": 1.2221, "step": 19439 }, { "epoch": 5.79013012155848, "grad_norm": 0.27765029668807983, "learning_rate": 7.93629597421927e-06, "loss": 1.2094, "step": 19440 }, { "epoch": 5.7904279677581485, "grad_norm": 0.2648593485355377, "learning_rate": 7.935352180765776e-06, "loss": 1.2163, "step": 19441 }, { "epoch": 5.790725813957818, "grad_norm": 0.28902289271354675, "learning_rate": 7.934408406521523e-06, "loss": 1.2161, "step": 19442 }, { "epoch": 5.791023660157486, "grad_norm": 0.3197891414165497, "learning_rate": 7.933464651495289e-06, "loss": 1.1946, "step": 19443 }, { "epoch": 5.791321506357155, "grad_norm": 0.26033326983451843, "learning_rate": 7.932520915695857e-06, "loss": 1.2228, "step": 19444 }, { "epoch": 5.791619352556824, "grad_norm": 0.2686390280723572, "learning_rate": 7.931577199132012e-06, "loss": 1.2038, "step": 19445 }, { "epoch": 5.791917198756492, "grad_norm": 0.26463714241981506, "learning_rate": 7.930633501812525e-06, "loss": 1.2083, "step": 19446 }, { "epoch": 5.792215044956161, "grad_norm": 0.28965914249420166, "learning_rate": 7.929689823746187e-06, "loss": 1.2215, "step": 19447 }, { "epoch": 5.7925128911558295, "grad_norm": 0.34461528062820435, "learning_rate": 7.928746164941767e-06, "loss": 1.2092, "step": 19448 }, { "epoch": 5.792810737355498, "grad_norm": 0.3160446286201477, "learning_rate": 7.927802525408053e-06, "loss": 1.2237, "step": 19449 }, { "epoch": 5.793108583555167, "grad_norm": 0.4149305820465088, "learning_rate": 7.926858905153821e-06, "loss": 1.2123, "step": 19450 }, { "epoch": 5.793406429754835, "grad_norm": 0.311864972114563, "learning_rate": 7.925915304187847e-06, "loss": 1.2251, "step": 19451 }, { "epoch": 5.793704275954504, "grad_norm": 0.3452128469944, "learning_rate": 7.924971722518918e-06, "loss": 1.2047, "step": 19452 }, { "epoch": 5.794002122154173, "grad_norm": 0.33326712250709534, "learning_rate": 7.92402816015581e-06, "loss": 1.2124, "step": 19453 }, { "epoch": 5.794299968353841, "grad_norm": 0.32764118909835815, "learning_rate": 7.923084617107295e-06, "loss": 1.1997, "step": 19454 }, { "epoch": 5.79459781455351, "grad_norm": 0.3002004027366638, "learning_rate": 7.92214109338216e-06, "loss": 1.2065, "step": 19455 }, { "epoch": 5.794895660753179, "grad_norm": 0.3065284192562103, "learning_rate": 7.921197588989183e-06, "loss": 1.2075, "step": 19456 }, { "epoch": 5.795193506952847, "grad_norm": 0.2906924784183502, "learning_rate": 7.920254103937135e-06, "loss": 1.2079, "step": 19457 }, { "epoch": 5.795491353152515, "grad_norm": 0.27946940064430237, "learning_rate": 7.919310638234804e-06, "loss": 1.2122, "step": 19458 }, { "epoch": 5.795789199352185, "grad_norm": 0.307682067155838, "learning_rate": 7.918367191890962e-06, "loss": 1.2315, "step": 19459 }, { "epoch": 5.796087045551853, "grad_norm": 0.29961058497428894, "learning_rate": 7.917423764914385e-06, "loss": 1.2075, "step": 19460 }, { "epoch": 5.796384891751522, "grad_norm": 0.31368833780288696, "learning_rate": 7.916480357313852e-06, "loss": 1.2183, "step": 19461 }, { "epoch": 5.7966827379511905, "grad_norm": 0.27859219908714294, "learning_rate": 7.915536969098146e-06, "loss": 1.2106, "step": 19462 }, { "epoch": 5.796980584150859, "grad_norm": 0.3880733251571655, "learning_rate": 7.914593600276035e-06, "loss": 1.2315, "step": 19463 }, { "epoch": 5.797278430350528, "grad_norm": 0.26957470178604126, "learning_rate": 7.913650250856304e-06, "loss": 1.2171, "step": 19464 }, { "epoch": 5.797576276550196, "grad_norm": 0.5684119462966919, "learning_rate": 7.912706920847724e-06, "loss": 1.2159, "step": 19465 }, { "epoch": 5.797874122749865, "grad_norm": 0.2869011461734772, "learning_rate": 7.911763610259076e-06, "loss": 1.2193, "step": 19466 }, { "epoch": 5.798171968949534, "grad_norm": 0.40862399339675903, "learning_rate": 7.910820319099134e-06, "loss": 1.2169, "step": 19467 }, { "epoch": 5.798469815149202, "grad_norm": 0.26902005076408386, "learning_rate": 7.909877047376671e-06, "loss": 1.2057, "step": 19468 }, { "epoch": 5.798767661348871, "grad_norm": 0.5130660533905029, "learning_rate": 7.908933795100473e-06, "loss": 1.2088, "step": 19469 }, { "epoch": 5.79906550754854, "grad_norm": 0.2949145436286926, "learning_rate": 7.907990562279309e-06, "loss": 1.2215, "step": 19470 }, { "epoch": 5.799363353748208, "grad_norm": 0.4076009690761566, "learning_rate": 7.90704734892195e-06, "loss": 1.2172, "step": 19471 }, { "epoch": 5.799661199947877, "grad_norm": 0.37305235862731934, "learning_rate": 7.906104155037183e-06, "loss": 1.2247, "step": 19472 }, { "epoch": 5.799959046147546, "grad_norm": 0.5357394218444824, "learning_rate": 7.905160980633774e-06, "loss": 1.2077, "step": 19473 }, { "epoch": 5.800256892347214, "grad_norm": 0.3946038484573364, "learning_rate": 7.904217825720502e-06, "loss": 1.2238, "step": 19474 }, { "epoch": 5.800554738546883, "grad_norm": 0.4440886378288269, "learning_rate": 7.903274690306143e-06, "loss": 1.2033, "step": 19475 }, { "epoch": 5.8008525847465515, "grad_norm": 0.4744853079319, "learning_rate": 7.902331574399471e-06, "loss": 1.2139, "step": 19476 }, { "epoch": 5.80115043094622, "grad_norm": 0.30114537477493286, "learning_rate": 7.901388478009258e-06, "loss": 1.2301, "step": 19477 }, { "epoch": 5.801448277145889, "grad_norm": 0.5893943905830383, "learning_rate": 7.900445401144282e-06, "loss": 1.1888, "step": 19478 }, { "epoch": 5.801746123345557, "grad_norm": 0.2603563964366913, "learning_rate": 7.899502343813314e-06, "loss": 1.2091, "step": 19479 }, { "epoch": 5.802043969545226, "grad_norm": 0.5266931056976318, "learning_rate": 7.898559306025129e-06, "loss": 1.2035, "step": 19480 }, { "epoch": 5.802341815744895, "grad_norm": 0.3926750421524048, "learning_rate": 7.897616287788506e-06, "loss": 1.1993, "step": 19481 }, { "epoch": 5.802639661944563, "grad_norm": 0.5756117105484009, "learning_rate": 7.896673289112209e-06, "loss": 1.1926, "step": 19482 }, { "epoch": 5.8029375081442325, "grad_norm": 0.4146365821361542, "learning_rate": 7.895730310005023e-06, "loss": 1.215, "step": 19483 }, { "epoch": 5.803235354343901, "grad_norm": 0.37578409910202026, "learning_rate": 7.894787350475712e-06, "loss": 1.2171, "step": 19484 }, { "epoch": 5.803533200543569, "grad_norm": 0.5479852557182312, "learning_rate": 7.893844410533052e-06, "loss": 1.2107, "step": 19485 }, { "epoch": 5.8038310467432375, "grad_norm": 0.2593201994895935, "learning_rate": 7.892901490185818e-06, "loss": 1.2214, "step": 19486 }, { "epoch": 5.804128892942907, "grad_norm": 0.6040177345275879, "learning_rate": 7.891958589442783e-06, "loss": 1.218, "step": 19487 }, { "epoch": 5.804426739142575, "grad_norm": 0.26518529653549194, "learning_rate": 7.891015708312714e-06, "loss": 1.2083, "step": 19488 }, { "epoch": 5.804724585342244, "grad_norm": 0.42248404026031494, "learning_rate": 7.89007284680439e-06, "loss": 1.2098, "step": 19489 }, { "epoch": 5.8050224315419126, "grad_norm": 0.3890303075313568, "learning_rate": 7.889130004926582e-06, "loss": 1.2243, "step": 19490 }, { "epoch": 5.805320277741581, "grad_norm": 0.3540903329849243, "learning_rate": 7.888187182688057e-06, "loss": 1.2221, "step": 19491 }, { "epoch": 5.80561812394125, "grad_norm": 0.5370272397994995, "learning_rate": 7.887244380097595e-06, "loss": 1.2219, "step": 19492 }, { "epoch": 5.805915970140918, "grad_norm": 0.3935164213180542, "learning_rate": 7.886301597163964e-06, "loss": 1.2015, "step": 19493 }, { "epoch": 5.806213816340587, "grad_norm": 0.4174191355705261, "learning_rate": 7.885358833895931e-06, "loss": 1.2106, "step": 19494 }, { "epoch": 5.806511662540256, "grad_norm": 0.322135865688324, "learning_rate": 7.884416090302275e-06, "loss": 1.2237, "step": 19495 }, { "epoch": 5.806809508739924, "grad_norm": 0.38452598452568054, "learning_rate": 7.883473366391761e-06, "loss": 1.1945, "step": 19496 }, { "epoch": 5.807107354939593, "grad_norm": 0.41428133845329285, "learning_rate": 7.882530662173163e-06, "loss": 1.2004, "step": 19497 }, { "epoch": 5.807405201139262, "grad_norm": 0.2633223235607147, "learning_rate": 7.881587977655254e-06, "loss": 1.2087, "step": 19498 }, { "epoch": 5.80770304733893, "grad_norm": 0.4220786392688751, "learning_rate": 7.880645312846796e-06, "loss": 1.2245, "step": 19499 }, { "epoch": 5.808000893538599, "grad_norm": 0.3942488729953766, "learning_rate": 7.879702667756573e-06, "loss": 1.2119, "step": 19500 }, { "epoch": 5.808000893538599, "eval_loss": 1.3142019510269165, "eval_runtime": 24.4626, "eval_samples_per_second": 70.884, "eval_steps_per_second": 4.456, "step": 19500 }, { "epoch": 5.808298739738268, "grad_norm": 0.32796359062194824, "learning_rate": 7.878760042393346e-06, "loss": 1.2214, "step": 19501 }, { "epoch": 5.808596585937936, "grad_norm": 0.4206756353378296, "learning_rate": 7.877817436765882e-06, "loss": 1.2121, "step": 19502 }, { "epoch": 5.808894432137605, "grad_norm": 0.30258476734161377, "learning_rate": 7.876874850882959e-06, "loss": 1.2147, "step": 19503 }, { "epoch": 5.809192278337274, "grad_norm": 0.3711523413658142, "learning_rate": 7.875932284753345e-06, "loss": 1.2151, "step": 19504 }, { "epoch": 5.809490124536942, "grad_norm": 0.3838064670562744, "learning_rate": 7.874989738385803e-06, "loss": 1.2135, "step": 19505 }, { "epoch": 5.809787970736611, "grad_norm": 0.34310483932495117, "learning_rate": 7.874047211789113e-06, "loss": 1.1999, "step": 19506 }, { "epoch": 5.8100858169362795, "grad_norm": 0.2482302486896515, "learning_rate": 7.873104704972036e-06, "loss": 1.2241, "step": 19507 }, { "epoch": 5.810383663135948, "grad_norm": 0.3790917992591858, "learning_rate": 7.872162217943343e-06, "loss": 1.224, "step": 19508 }, { "epoch": 5.810681509335617, "grad_norm": 0.34777647256851196, "learning_rate": 7.871219750711803e-06, "loss": 1.2231, "step": 19509 }, { "epoch": 5.810979355535285, "grad_norm": 0.3179490864276886, "learning_rate": 7.870277303286186e-06, "loss": 1.2188, "step": 19510 }, { "epoch": 5.8112772017349545, "grad_norm": 0.2821601331233978, "learning_rate": 7.869334875675257e-06, "loss": 1.209, "step": 19511 }, { "epoch": 5.811575047934623, "grad_norm": 0.2729527950286865, "learning_rate": 7.86839246788779e-06, "loss": 1.2022, "step": 19512 }, { "epoch": 5.811872894134291, "grad_norm": 0.29866573214530945, "learning_rate": 7.867450079932548e-06, "loss": 1.225, "step": 19513 }, { "epoch": 5.81217074033396, "grad_norm": 0.29793581366539, "learning_rate": 7.866507711818299e-06, "loss": 1.2237, "step": 19514 }, { "epoch": 5.812468586533629, "grad_norm": 0.26419177651405334, "learning_rate": 7.865565363553813e-06, "loss": 1.2181, "step": 19515 }, { "epoch": 5.812766432733297, "grad_norm": 0.5411391854286194, "learning_rate": 7.864623035147853e-06, "loss": 1.2193, "step": 19516 }, { "epoch": 5.813064278932966, "grad_norm": 0.32239797711372375, "learning_rate": 7.863680726609197e-06, "loss": 1.2162, "step": 19517 }, { "epoch": 5.813362125132635, "grad_norm": 0.37849095463752747, "learning_rate": 7.862738437946602e-06, "loss": 1.2094, "step": 19518 }, { "epoch": 5.813659971332303, "grad_norm": 0.27907320857048035, "learning_rate": 7.861796169168836e-06, "loss": 1.225, "step": 19519 }, { "epoch": 5.813957817531972, "grad_norm": 0.38774049282073975, "learning_rate": 7.860853920284668e-06, "loss": 1.2262, "step": 19520 }, { "epoch": 5.8142556637316405, "grad_norm": 0.27382105588912964, "learning_rate": 7.859911691302868e-06, "loss": 1.2235, "step": 19521 }, { "epoch": 5.81455350993131, "grad_norm": 0.4559738337993622, "learning_rate": 7.858969482232193e-06, "loss": 1.2217, "step": 19522 }, { "epoch": 5.814851356130978, "grad_norm": 0.363189160823822, "learning_rate": 7.85802729308142e-06, "loss": 1.1912, "step": 19523 }, { "epoch": 5.815149202330646, "grad_norm": 0.6832521557807922, "learning_rate": 7.857085123859309e-06, "loss": 1.2023, "step": 19524 }, { "epoch": 5.815447048530315, "grad_norm": 0.42572712898254395, "learning_rate": 7.856142974574624e-06, "loss": 1.2209, "step": 19525 }, { "epoch": 5.815744894729984, "grad_norm": 0.6698419451713562, "learning_rate": 7.855200845236135e-06, "loss": 1.2232, "step": 19526 }, { "epoch": 5.816042740929652, "grad_norm": 0.48179465532302856, "learning_rate": 7.854258735852608e-06, "loss": 1.2204, "step": 19527 }, { "epoch": 5.8163405871293214, "grad_norm": 0.5433464646339417, "learning_rate": 7.853316646432801e-06, "loss": 1.2085, "step": 19528 }, { "epoch": 5.81663843332899, "grad_norm": 0.41237905621528625, "learning_rate": 7.85237457698549e-06, "loss": 1.215, "step": 19529 }, { "epoch": 5.816936279528658, "grad_norm": 0.30683696269989014, "learning_rate": 7.851432527519428e-06, "loss": 1.2046, "step": 19530 }, { "epoch": 5.817234125728327, "grad_norm": 0.5190821886062622, "learning_rate": 7.850490498043389e-06, "loss": 1.1982, "step": 19531 }, { "epoch": 5.817531971927996, "grad_norm": 0.29013538360595703, "learning_rate": 7.849548488566134e-06, "loss": 1.2062, "step": 19532 }, { "epoch": 5.817829818127664, "grad_norm": 0.47048938274383545, "learning_rate": 7.848606499096427e-06, "loss": 1.2164, "step": 19533 }, { "epoch": 5.818127664327333, "grad_norm": 0.39608705043792725, "learning_rate": 7.847664529643035e-06, "loss": 1.2188, "step": 19534 }, { "epoch": 5.8184255105270015, "grad_norm": 0.48446598649024963, "learning_rate": 7.84672258021472e-06, "loss": 1.2065, "step": 19535 }, { "epoch": 5.81872335672667, "grad_norm": 0.4329598546028137, "learning_rate": 7.845780650820242e-06, "loss": 1.1901, "step": 19536 }, { "epoch": 5.819021202926339, "grad_norm": 0.3019412159919739, "learning_rate": 7.844838741468372e-06, "loss": 1.2078, "step": 19537 }, { "epoch": 5.819319049126007, "grad_norm": 0.40126192569732666, "learning_rate": 7.843896852167868e-06, "loss": 1.217, "step": 19538 }, { "epoch": 5.819616895325677, "grad_norm": 0.4770059287548065, "learning_rate": 7.842954982927493e-06, "loss": 1.2181, "step": 19539 }, { "epoch": 5.819914741525345, "grad_norm": 0.29250651597976685, "learning_rate": 7.842013133756014e-06, "loss": 1.1887, "step": 19540 }, { "epoch": 5.820212587725013, "grad_norm": 0.7883771657943726, "learning_rate": 7.841071304662194e-06, "loss": 1.21, "step": 19541 }, { "epoch": 5.8205104339246825, "grad_norm": 0.258577436208725, "learning_rate": 7.840129495654789e-06, "loss": 1.2065, "step": 19542 }, { "epoch": 5.820808280124351, "grad_norm": 0.4956992566585541, "learning_rate": 7.839187706742569e-06, "loss": 1.2114, "step": 19543 }, { "epoch": 5.821106126324019, "grad_norm": 0.4127381443977356, "learning_rate": 7.838245937934292e-06, "loss": 1.2121, "step": 19544 }, { "epoch": 5.821403972523688, "grad_norm": 0.33163201808929443, "learning_rate": 7.837304189238719e-06, "loss": 1.2069, "step": 19545 }, { "epoch": 5.821701818723357, "grad_norm": 0.3702181875705719, "learning_rate": 7.836362460664619e-06, "loss": 1.2206, "step": 19546 }, { "epoch": 5.821999664923025, "grad_norm": 0.4005001485347748, "learning_rate": 7.835420752220744e-06, "loss": 1.2168, "step": 19547 }, { "epoch": 5.822297511122694, "grad_norm": 0.31537196040153503, "learning_rate": 7.834479063915865e-06, "loss": 1.2047, "step": 19548 }, { "epoch": 5.8225953573223626, "grad_norm": 0.3628023862838745, "learning_rate": 7.833537395758736e-06, "loss": 1.199, "step": 19549 }, { "epoch": 5.822893203522032, "grad_norm": 0.33250901103019714, "learning_rate": 7.832595747758121e-06, "loss": 1.2091, "step": 19550 }, { "epoch": 5.8231910497217, "grad_norm": 0.361563116312027, "learning_rate": 7.831654119922783e-06, "loss": 1.2159, "step": 19551 }, { "epoch": 5.823488895921368, "grad_norm": 0.3284603953361511, "learning_rate": 7.83071251226148e-06, "loss": 1.2137, "step": 19552 }, { "epoch": 5.823786742121037, "grad_norm": 0.28391581773757935, "learning_rate": 7.829770924782971e-06, "loss": 1.208, "step": 19553 }, { "epoch": 5.824084588320706, "grad_norm": 0.4764794707298279, "learning_rate": 7.828829357496023e-06, "loss": 1.2304, "step": 19554 }, { "epoch": 5.824382434520374, "grad_norm": 0.25552383065223694, "learning_rate": 7.827887810409388e-06, "loss": 1.2166, "step": 19555 }, { "epoch": 5.8246802807200435, "grad_norm": 0.4202573597431183, "learning_rate": 7.826946283531831e-06, "loss": 1.1973, "step": 19556 }, { "epoch": 5.824978126919712, "grad_norm": 0.31306782364845276, "learning_rate": 7.826004776872114e-06, "loss": 1.2175, "step": 19557 }, { "epoch": 5.82527597311938, "grad_norm": 0.3671114146709442, "learning_rate": 7.825063290438993e-06, "loss": 1.2223, "step": 19558 }, { "epoch": 5.825573819319049, "grad_norm": 0.4436177611351013, "learning_rate": 7.824121824241225e-06, "loss": 1.2199, "step": 19559 }, { "epoch": 5.825871665518718, "grad_norm": 0.6168495416641235, "learning_rate": 7.823180378287576e-06, "loss": 1.2144, "step": 19560 }, { "epoch": 5.826169511718386, "grad_norm": 0.2953750193119049, "learning_rate": 7.822238952586798e-06, "loss": 1.2196, "step": 19561 }, { "epoch": 5.826467357918055, "grad_norm": 0.4011472463607788, "learning_rate": 7.821297547147652e-06, "loss": 1.2033, "step": 19562 }, { "epoch": 5.826765204117724, "grad_norm": 0.3375413417816162, "learning_rate": 7.820356161978904e-06, "loss": 1.2148, "step": 19563 }, { "epoch": 5.827063050317392, "grad_norm": 0.30218881368637085, "learning_rate": 7.819414797089301e-06, "loss": 1.2278, "step": 19564 }, { "epoch": 5.827360896517061, "grad_norm": 0.6413726806640625, "learning_rate": 7.818473452487612e-06, "loss": 1.2009, "step": 19565 }, { "epoch": 5.8276587427167295, "grad_norm": 0.31588879227638245, "learning_rate": 7.817532128182587e-06, "loss": 1.2118, "step": 19566 }, { "epoch": 5.827956588916399, "grad_norm": 0.4257681369781494, "learning_rate": 7.816590824182988e-06, "loss": 1.2133, "step": 19567 }, { "epoch": 5.828254435116067, "grad_norm": 0.35621654987335205, "learning_rate": 7.815649540497572e-06, "loss": 1.1961, "step": 19568 }, { "epoch": 5.828552281315735, "grad_norm": 0.408843070268631, "learning_rate": 7.8147082771351e-06, "loss": 1.2071, "step": 19569 }, { "epoch": 5.8288501275154045, "grad_norm": 0.4065593481063843, "learning_rate": 7.813767034104319e-06, "loss": 1.2207, "step": 19570 }, { "epoch": 5.829147973715073, "grad_norm": 0.27984321117401123, "learning_rate": 7.812825811413998e-06, "loss": 1.1934, "step": 19571 }, { "epoch": 5.829445819914741, "grad_norm": 0.37439045310020447, "learning_rate": 7.811884609072888e-06, "loss": 1.2224, "step": 19572 }, { "epoch": 5.82974366611441, "grad_norm": 0.2947523593902588, "learning_rate": 7.810943427089746e-06, "loss": 1.1902, "step": 19573 }, { "epoch": 5.830041512314079, "grad_norm": 0.3579213619232178, "learning_rate": 7.810002265473331e-06, "loss": 1.2041, "step": 19574 }, { "epoch": 5.830339358513747, "grad_norm": 0.3616520166397095, "learning_rate": 7.809061124232399e-06, "loss": 1.2165, "step": 19575 }, { "epoch": 5.830637204713416, "grad_norm": 0.28629428148269653, "learning_rate": 7.808120003375703e-06, "loss": 1.1932, "step": 19576 }, { "epoch": 5.830935050913085, "grad_norm": 0.42465123534202576, "learning_rate": 7.807178902912002e-06, "loss": 1.211, "step": 19577 }, { "epoch": 5.831232897112754, "grad_norm": 0.3263692557811737, "learning_rate": 7.806237822850053e-06, "loss": 1.2122, "step": 19578 }, { "epoch": 5.831530743312422, "grad_norm": 0.6494307518005371, "learning_rate": 7.805296763198605e-06, "loss": 1.2217, "step": 19579 }, { "epoch": 5.8318285895120905, "grad_norm": 0.28989386558532715, "learning_rate": 7.804355723966425e-06, "loss": 1.2212, "step": 19580 }, { "epoch": 5.83212643571176, "grad_norm": 0.4110584259033203, "learning_rate": 7.803414705162256e-06, "loss": 1.2119, "step": 19581 }, { "epoch": 5.832424281911428, "grad_norm": 0.38376542925834656, "learning_rate": 7.802473706794865e-06, "loss": 1.2061, "step": 19582 }, { "epoch": 5.832722128111096, "grad_norm": 0.3380451798439026, "learning_rate": 7.801532728873e-06, "loss": 1.2063, "step": 19583 }, { "epoch": 5.833019974310766, "grad_norm": 0.3984583914279938, "learning_rate": 7.800591771405412e-06, "loss": 1.1853, "step": 19584 }, { "epoch": 5.833317820510434, "grad_norm": 0.2838040888309479, "learning_rate": 7.799650834400863e-06, "loss": 1.2087, "step": 19585 }, { "epoch": 5.833615666710102, "grad_norm": 0.38319075107574463, "learning_rate": 7.798709917868107e-06, "loss": 1.2206, "step": 19586 }, { "epoch": 5.8339135129097714, "grad_norm": 0.2788871228694916, "learning_rate": 7.797769021815891e-06, "loss": 1.2148, "step": 19587 }, { "epoch": 5.83421135910944, "grad_norm": 0.3699237108230591, "learning_rate": 7.796828146252979e-06, "loss": 1.221, "step": 19588 }, { "epoch": 5.834509205309109, "grad_norm": 0.29569047689437866, "learning_rate": 7.795887291188118e-06, "loss": 1.1963, "step": 19589 }, { "epoch": 5.834807051508777, "grad_norm": 0.3124069273471832, "learning_rate": 7.79494645663006e-06, "loss": 1.2016, "step": 19590 }, { "epoch": 5.835104897708446, "grad_norm": 0.2546287775039673, "learning_rate": 7.794005642587563e-06, "loss": 1.1898, "step": 19591 }, { "epoch": 5.835402743908114, "grad_norm": 0.40595853328704834, "learning_rate": 7.79306484906938e-06, "loss": 1.2096, "step": 19592 }, { "epoch": 5.835700590107783, "grad_norm": 0.2685578167438507, "learning_rate": 7.79212407608426e-06, "loss": 1.2236, "step": 19593 }, { "epoch": 5.8359984363074515, "grad_norm": 0.37070587277412415, "learning_rate": 7.791183323640963e-06, "loss": 1.2133, "step": 19594 }, { "epoch": 5.836296282507121, "grad_norm": 0.2616555690765381, "learning_rate": 7.790242591748235e-06, "loss": 1.2164, "step": 19595 }, { "epoch": 5.836594128706789, "grad_norm": 0.38597285747528076, "learning_rate": 7.789301880414832e-06, "loss": 1.2073, "step": 19596 }, { "epoch": 5.836891974906457, "grad_norm": 0.2547573745250702, "learning_rate": 7.788361189649503e-06, "loss": 1.2062, "step": 19597 }, { "epoch": 5.837189821106127, "grad_norm": 0.3700876533985138, "learning_rate": 7.787420519461e-06, "loss": 1.2249, "step": 19598 }, { "epoch": 5.837487667305795, "grad_norm": 0.2842254340648651, "learning_rate": 7.786479869858083e-06, "loss": 1.2332, "step": 19599 }, { "epoch": 5.837785513505463, "grad_norm": 0.2934574782848358, "learning_rate": 7.785539240849497e-06, "loss": 1.2046, "step": 19600 }, { "epoch": 5.8380833597051325, "grad_norm": 0.30779892206192017, "learning_rate": 7.784598632443989e-06, "loss": 1.221, "step": 19601 }, { "epoch": 5.838381205904801, "grad_norm": 0.27394330501556396, "learning_rate": 7.78365804465032e-06, "loss": 1.2167, "step": 19602 }, { "epoch": 5.838679052104469, "grad_norm": 0.3246789574623108, "learning_rate": 7.782717477477237e-06, "loss": 1.2125, "step": 19603 }, { "epoch": 5.838976898304138, "grad_norm": 0.33262526988983154, "learning_rate": 7.781776930933487e-06, "loss": 1.2117, "step": 19604 }, { "epoch": 5.839274744503807, "grad_norm": 0.2643575370311737, "learning_rate": 7.780836405027827e-06, "loss": 1.2057, "step": 19605 }, { "epoch": 5.839572590703476, "grad_norm": 0.26078900694847107, "learning_rate": 7.779895899769006e-06, "loss": 1.2248, "step": 19606 }, { "epoch": 5.839870436903144, "grad_norm": 0.26408693194389343, "learning_rate": 7.77895541516577e-06, "loss": 1.2108, "step": 19607 }, { "epoch": 5.8401682831028126, "grad_norm": 0.2535662353038788, "learning_rate": 7.778014951226874e-06, "loss": 1.2346, "step": 19608 }, { "epoch": 5.840466129302482, "grad_norm": 0.2951176166534424, "learning_rate": 7.777074507961067e-06, "loss": 1.2101, "step": 19609 }, { "epoch": 5.84076397550215, "grad_norm": 0.26551011204719543, "learning_rate": 7.776134085377096e-06, "loss": 1.2071, "step": 19610 }, { "epoch": 5.841061821701818, "grad_norm": 0.348847359418869, "learning_rate": 7.775193683483715e-06, "loss": 1.2096, "step": 19611 }, { "epoch": 5.841359667901488, "grad_norm": 0.27657821774482727, "learning_rate": 7.774253302289671e-06, "loss": 1.2165, "step": 19612 }, { "epoch": 5.841657514101156, "grad_norm": 0.2629542648792267, "learning_rate": 7.77331294180371e-06, "loss": 1.2102, "step": 19613 }, { "epoch": 5.841955360300824, "grad_norm": 0.331338107585907, "learning_rate": 7.772372602034585e-06, "loss": 1.2122, "step": 19614 }, { "epoch": 5.8422532065004935, "grad_norm": 0.3670068681240082, "learning_rate": 7.771432282991045e-06, "loss": 1.2259, "step": 19615 }, { "epoch": 5.842551052700162, "grad_norm": 0.2694351077079773, "learning_rate": 7.770491984681839e-06, "loss": 1.215, "step": 19616 }, { "epoch": 5.842848898899831, "grad_norm": 0.2863113582134247, "learning_rate": 7.769551707115715e-06, "loss": 1.204, "step": 19617 }, { "epoch": 5.843146745099499, "grad_norm": 0.3022710382938385, "learning_rate": 7.768611450301418e-06, "loss": 1.2147, "step": 19618 }, { "epoch": 5.843444591299168, "grad_norm": 0.3116869032382965, "learning_rate": 7.767671214247701e-06, "loss": 1.2067, "step": 19619 }, { "epoch": 5.843742437498836, "grad_norm": 0.262445867061615, "learning_rate": 7.766730998963307e-06, "loss": 1.205, "step": 19620 }, { "epoch": 5.844040283698505, "grad_norm": 0.2657754421234131, "learning_rate": 7.765790804456982e-06, "loss": 1.2149, "step": 19621 }, { "epoch": 5.844338129898174, "grad_norm": 0.38192668557167053, "learning_rate": 7.764850630737484e-06, "loss": 1.2054, "step": 19622 }, { "epoch": 5.844635976097843, "grad_norm": 0.3063715994358063, "learning_rate": 7.763910477813555e-06, "loss": 1.2116, "step": 19623 }, { "epoch": 5.844933822297511, "grad_norm": 0.38201138377189636, "learning_rate": 7.762970345693934e-06, "loss": 1.2045, "step": 19624 }, { "epoch": 5.8452316684971795, "grad_norm": 0.272349089384079, "learning_rate": 7.76203023438738e-06, "loss": 1.2091, "step": 19625 }, { "epoch": 5.845529514696849, "grad_norm": 0.6438816785812378, "learning_rate": 7.761090143902631e-06, "loss": 1.2123, "step": 19626 }, { "epoch": 5.845827360896517, "grad_norm": 0.3567163944244385, "learning_rate": 7.760150074248435e-06, "loss": 1.2221, "step": 19627 }, { "epoch": 5.846125207096185, "grad_norm": 0.5482204556465149, "learning_rate": 7.759210025433544e-06, "loss": 1.2137, "step": 19628 }, { "epoch": 5.8464230532958545, "grad_norm": 0.3015652000904083, "learning_rate": 7.7582699974667e-06, "loss": 1.2034, "step": 19629 }, { "epoch": 5.846720899495523, "grad_norm": 0.3942241668701172, "learning_rate": 7.757329990356647e-06, "loss": 1.2083, "step": 19630 }, { "epoch": 5.847018745695191, "grad_norm": 0.34201639890670776, "learning_rate": 7.756390004112133e-06, "loss": 1.2009, "step": 19631 }, { "epoch": 5.84731659189486, "grad_norm": 0.42357155680656433, "learning_rate": 7.755450038741903e-06, "loss": 1.2032, "step": 19632 }, { "epoch": 5.847614438094529, "grad_norm": 0.4093886911869049, "learning_rate": 7.754510094254703e-06, "loss": 1.206, "step": 19633 }, { "epoch": 5.847912284294198, "grad_norm": 0.39763838052749634, "learning_rate": 7.753570170659281e-06, "loss": 1.237, "step": 19634 }, { "epoch": 5.848210130493866, "grad_norm": 0.3132558763027191, "learning_rate": 7.752630267964373e-06, "loss": 1.212, "step": 19635 }, { "epoch": 5.848507976693535, "grad_norm": 0.2729364335536957, "learning_rate": 7.751690386178734e-06, "loss": 1.2014, "step": 19636 }, { "epoch": 5.848805822893204, "grad_norm": 0.32816725969314575, "learning_rate": 7.750750525311103e-06, "loss": 1.2119, "step": 19637 }, { "epoch": 5.849103669092872, "grad_norm": 0.25775423645973206, "learning_rate": 7.749810685370223e-06, "loss": 1.2138, "step": 19638 }, { "epoch": 5.8494015152925405, "grad_norm": 0.3045441508293152, "learning_rate": 7.748870866364844e-06, "loss": 1.2212, "step": 19639 }, { "epoch": 5.84969936149221, "grad_norm": 0.2741446793079376, "learning_rate": 7.747931068303707e-06, "loss": 1.2104, "step": 19640 }, { "epoch": 5.849997207691878, "grad_norm": 0.2722054123878479, "learning_rate": 7.74699129119555e-06, "loss": 1.2135, "step": 19641 }, { "epoch": 5.850295053891546, "grad_norm": 0.26878291368484497, "learning_rate": 7.746051535049127e-06, "loss": 1.2002, "step": 19642 }, { "epoch": 5.850592900091216, "grad_norm": 0.3480394780635834, "learning_rate": 7.745111799873173e-06, "loss": 1.2237, "step": 19643 }, { "epoch": 5.850890746290884, "grad_norm": 0.4168858528137207, "learning_rate": 7.744172085676433e-06, "loss": 1.1948, "step": 19644 }, { "epoch": 5.851188592490553, "grad_norm": 0.26998400688171387, "learning_rate": 7.743232392467657e-06, "loss": 1.2154, "step": 19645 }, { "epoch": 5.8514864386902214, "grad_norm": 0.2897645831108093, "learning_rate": 7.742292720255576e-06, "loss": 1.2206, "step": 19646 }, { "epoch": 5.85178428488989, "grad_norm": 0.29250776767730713, "learning_rate": 7.741353069048945e-06, "loss": 1.2305, "step": 19647 }, { "epoch": 5.852082131089559, "grad_norm": 0.2947573661804199, "learning_rate": 7.740413438856498e-06, "loss": 1.2079, "step": 19648 }, { "epoch": 5.852379977289227, "grad_norm": 0.2997695207595825, "learning_rate": 7.739473829686977e-06, "loss": 1.2069, "step": 19649 }, { "epoch": 5.852677823488896, "grad_norm": 0.2877860367298126, "learning_rate": 7.738534241549128e-06, "loss": 1.2138, "step": 19650 }, { "epoch": 5.852975669688565, "grad_norm": 0.3143649995326996, "learning_rate": 7.737594674451692e-06, "loss": 1.2203, "step": 19651 }, { "epoch": 5.853273515888233, "grad_norm": 0.2758687138557434, "learning_rate": 7.736655128403407e-06, "loss": 1.2242, "step": 19652 }, { "epoch": 5.8535713620879015, "grad_norm": 0.2680954039096832, "learning_rate": 7.735715603413021e-06, "loss": 1.1969, "step": 19653 }, { "epoch": 5.853869208287571, "grad_norm": 0.28473687171936035, "learning_rate": 7.734776099489268e-06, "loss": 1.2199, "step": 19654 }, { "epoch": 5.854167054487239, "grad_norm": 0.30702653527259827, "learning_rate": 7.733836616640896e-06, "loss": 1.2171, "step": 19655 }, { "epoch": 5.854464900686908, "grad_norm": 0.3919333219528198, "learning_rate": 7.732897154876639e-06, "loss": 1.2042, "step": 19656 }, { "epoch": 5.854762746886577, "grad_norm": 0.29652485251426697, "learning_rate": 7.731957714205244e-06, "loss": 1.2252, "step": 19657 }, { "epoch": 5.855060593086245, "grad_norm": 0.36527273058891296, "learning_rate": 7.731018294635446e-06, "loss": 1.2118, "step": 19658 }, { "epoch": 5.855358439285913, "grad_norm": 0.27338355779647827, "learning_rate": 7.73007889617599e-06, "loss": 1.1971, "step": 19659 }, { "epoch": 5.8556562854855825, "grad_norm": 0.26514914631843567, "learning_rate": 7.729139518835612e-06, "loss": 1.2009, "step": 19660 }, { "epoch": 5.855954131685251, "grad_norm": 0.2890794575214386, "learning_rate": 7.728200162623052e-06, "loss": 1.2105, "step": 19661 }, { "epoch": 5.85625197788492, "grad_norm": 0.2984280586242676, "learning_rate": 7.727260827547055e-06, "loss": 1.2434, "step": 19662 }, { "epoch": 5.856549824084588, "grad_norm": 0.2639959454536438, "learning_rate": 7.726321513616352e-06, "loss": 1.2058, "step": 19663 }, { "epoch": 5.856847670284257, "grad_norm": 0.2727898061275482, "learning_rate": 7.725382220839693e-06, "loss": 1.2095, "step": 19664 }, { "epoch": 5.857145516483926, "grad_norm": 0.31113842129707336, "learning_rate": 7.72444294922581e-06, "loss": 1.2144, "step": 19665 }, { "epoch": 5.857443362683594, "grad_norm": 0.3023146986961365, "learning_rate": 7.723503698783438e-06, "loss": 1.2103, "step": 19666 }, { "epoch": 5.8577412088832626, "grad_norm": 0.2729348838329315, "learning_rate": 7.722564469521324e-06, "loss": 1.2076, "step": 19667 }, { "epoch": 5.858039055082932, "grad_norm": 0.29502829909324646, "learning_rate": 7.721625261448206e-06, "loss": 1.1948, "step": 19668 }, { "epoch": 5.8583369012826, "grad_norm": 0.5774202346801758, "learning_rate": 7.720686074572813e-06, "loss": 1.2138, "step": 19669 }, { "epoch": 5.858634747482268, "grad_norm": 0.3133317530155182, "learning_rate": 7.719746908903895e-06, "loss": 1.2005, "step": 19670 }, { "epoch": 5.858932593681938, "grad_norm": 0.4840753376483917, "learning_rate": 7.718807764450185e-06, "loss": 1.2186, "step": 19671 }, { "epoch": 5.859230439881606, "grad_norm": 0.30928316712379456, "learning_rate": 7.717868641220414e-06, "loss": 1.2075, "step": 19672 }, { "epoch": 5.859528286081275, "grad_norm": 0.5464794039726257, "learning_rate": 7.716929539223329e-06, "loss": 1.2111, "step": 19673 }, { "epoch": 5.8598261322809435, "grad_norm": 0.4610711336135864, "learning_rate": 7.715990458467667e-06, "loss": 1.2153, "step": 19674 }, { "epoch": 5.860123978480612, "grad_norm": 0.4352073073387146, "learning_rate": 7.715051398962155e-06, "loss": 1.2133, "step": 19675 }, { "epoch": 5.860421824680281, "grad_norm": 0.37265628576278687, "learning_rate": 7.714112360715542e-06, "loss": 1.2327, "step": 19676 }, { "epoch": 5.860719670879949, "grad_norm": 0.3591066598892212, "learning_rate": 7.713173343736557e-06, "loss": 1.2159, "step": 19677 }, { "epoch": 5.861017517079618, "grad_norm": 0.29050585627555847, "learning_rate": 7.71223434803394e-06, "loss": 1.1929, "step": 19678 }, { "epoch": 5.861315363279287, "grad_norm": 0.37478896975517273, "learning_rate": 7.711295373616426e-06, "loss": 1.2006, "step": 19679 }, { "epoch": 5.861613209478955, "grad_norm": 0.338222473859787, "learning_rate": 7.71035642049275e-06, "loss": 1.2158, "step": 19680 }, { "epoch": 5.861911055678624, "grad_norm": 0.27898043394088745, "learning_rate": 7.709417488671653e-06, "loss": 1.2014, "step": 19681 }, { "epoch": 5.862208901878293, "grad_norm": 0.2537229359149933, "learning_rate": 7.708478578161866e-06, "loss": 1.2084, "step": 19682 }, { "epoch": 5.862506748077961, "grad_norm": 0.28727245330810547, "learning_rate": 7.707539688972124e-06, "loss": 1.2165, "step": 19683 }, { "epoch": 5.86280459427763, "grad_norm": 0.26053178310394287, "learning_rate": 7.706600821111166e-06, "loss": 1.2284, "step": 19684 }, { "epoch": 5.863102440477299, "grad_norm": 0.28430110216140747, "learning_rate": 7.705661974587723e-06, "loss": 1.2194, "step": 19685 }, { "epoch": 5.863400286676967, "grad_norm": 0.26683878898620605, "learning_rate": 7.70472314941053e-06, "loss": 1.2252, "step": 19686 }, { "epoch": 5.863698132876636, "grad_norm": 0.33253082633018494, "learning_rate": 7.70378434558833e-06, "loss": 1.2143, "step": 19687 }, { "epoch": 5.8639959790763045, "grad_norm": 0.2606544494628906, "learning_rate": 7.702845563129848e-06, "loss": 1.2307, "step": 19688 }, { "epoch": 5.864293825275973, "grad_norm": 0.2861187160015106, "learning_rate": 7.701906802043819e-06, "loss": 1.2088, "step": 19689 }, { "epoch": 5.864591671475642, "grad_norm": 0.2976442277431488, "learning_rate": 7.700968062338981e-06, "loss": 1.2102, "step": 19690 }, { "epoch": 5.86488951767531, "grad_norm": 0.2559099793434143, "learning_rate": 7.70002934402407e-06, "loss": 1.1972, "step": 19691 }, { "epoch": 5.865187363874979, "grad_norm": 0.4708477258682251, "learning_rate": 7.699090647107811e-06, "loss": 1.2245, "step": 19692 }, { "epoch": 5.865485210074648, "grad_norm": 0.3552057445049286, "learning_rate": 7.698151971598947e-06, "loss": 1.2135, "step": 19693 }, { "epoch": 5.865783056274316, "grad_norm": 0.4166930615901947, "learning_rate": 7.697213317506208e-06, "loss": 1.1927, "step": 19694 }, { "epoch": 5.866080902473985, "grad_norm": 0.2781657874584198, "learning_rate": 7.69627468483832e-06, "loss": 1.1973, "step": 19695 }, { "epoch": 5.866378748673654, "grad_norm": 0.6836292743682861, "learning_rate": 7.695336073604027e-06, "loss": 1.2058, "step": 19696 }, { "epoch": 5.866676594873322, "grad_norm": 0.4408610761165619, "learning_rate": 7.694397483812052e-06, "loss": 1.1983, "step": 19697 }, { "epoch": 5.8669744410729905, "grad_norm": 0.5116508603096008, "learning_rate": 7.69345891547114e-06, "loss": 1.2179, "step": 19698 }, { "epoch": 5.86727228727266, "grad_norm": 0.2547590732574463, "learning_rate": 7.692520368590013e-06, "loss": 1.2052, "step": 19699 }, { "epoch": 5.867570133472328, "grad_norm": 0.7557486295700073, "learning_rate": 7.691581843177403e-06, "loss": 1.2109, "step": 19700 }, { "epoch": 5.867867979671997, "grad_norm": 0.29969343543052673, "learning_rate": 7.690643339242048e-06, "loss": 1.2232, "step": 19701 }, { "epoch": 5.868165825871666, "grad_norm": 0.40185675024986267, "learning_rate": 7.689704856792674e-06, "loss": 1.2019, "step": 19702 }, { "epoch": 5.868463672071334, "grad_norm": 0.31418320536613464, "learning_rate": 7.688766395838016e-06, "loss": 1.2, "step": 19703 }, { "epoch": 5.868761518271003, "grad_norm": 0.3252184987068176, "learning_rate": 7.687827956386807e-06, "loss": 1.2026, "step": 19704 }, { "epoch": 5.8690593644706714, "grad_norm": 0.4650043249130249, "learning_rate": 7.686889538447775e-06, "loss": 1.2149, "step": 19705 }, { "epoch": 5.86935721067034, "grad_norm": 0.26629945635795593, "learning_rate": 7.685951142029646e-06, "loss": 1.2091, "step": 19706 }, { "epoch": 5.869655056870009, "grad_norm": 0.5208491086959839, "learning_rate": 7.685012767141162e-06, "loss": 1.2003, "step": 19707 }, { "epoch": 5.869952903069677, "grad_norm": 0.33644038438796997, "learning_rate": 7.684074413791047e-06, "loss": 1.2144, "step": 19708 }, { "epoch": 5.870250749269346, "grad_norm": 0.4062998592853546, "learning_rate": 7.683136081988027e-06, "loss": 1.2199, "step": 19709 }, { "epoch": 5.870548595469015, "grad_norm": 0.4381130039691925, "learning_rate": 7.682197771740843e-06, "loss": 1.2065, "step": 19710 }, { "epoch": 5.870846441668683, "grad_norm": 0.25779780745506287, "learning_rate": 7.68125948305822e-06, "loss": 1.2159, "step": 19711 }, { "epoch": 5.871144287868352, "grad_norm": 0.4768553376197815, "learning_rate": 7.680321215948882e-06, "loss": 1.2089, "step": 19712 }, { "epoch": 5.871442134068021, "grad_norm": 0.302226185798645, "learning_rate": 7.679382970421565e-06, "loss": 1.2161, "step": 19713 }, { "epoch": 5.871739980267689, "grad_norm": 0.3917502164840698, "learning_rate": 7.678444746484996e-06, "loss": 1.2202, "step": 19714 }, { "epoch": 5.872037826467358, "grad_norm": 0.37382709980010986, "learning_rate": 7.677506544147906e-06, "loss": 1.2126, "step": 19715 }, { "epoch": 5.872335672667027, "grad_norm": 0.28242412209510803, "learning_rate": 7.676568363419026e-06, "loss": 1.2133, "step": 19716 }, { "epoch": 5.872633518866695, "grad_norm": 0.334397554397583, "learning_rate": 7.675630204307075e-06, "loss": 1.2013, "step": 19717 }, { "epoch": 5.872931365066364, "grad_norm": 0.27919960021972656, "learning_rate": 7.674692066820794e-06, "loss": 1.2248, "step": 19718 }, { "epoch": 5.8732292112660325, "grad_norm": 0.310494601726532, "learning_rate": 7.673753950968902e-06, "loss": 1.2175, "step": 19719 }, { "epoch": 5.873527057465701, "grad_norm": 0.30901026725769043, "learning_rate": 7.672815856760131e-06, "loss": 1.2174, "step": 19720 }, { "epoch": 5.87382490366537, "grad_norm": 0.2734905779361725, "learning_rate": 7.671877784203208e-06, "loss": 1.2095, "step": 19721 }, { "epoch": 5.874122749865038, "grad_norm": 0.2951793074607849, "learning_rate": 7.670939733306863e-06, "loss": 1.2031, "step": 19722 }, { "epoch": 5.874420596064708, "grad_norm": 0.2693183720111847, "learning_rate": 7.670001704079817e-06, "loss": 1.2172, "step": 19723 }, { "epoch": 5.874718442264376, "grad_norm": 0.28688564896583557, "learning_rate": 7.669063696530808e-06, "loss": 1.2089, "step": 19724 }, { "epoch": 5.875016288464044, "grad_norm": 0.2883380055427551, "learning_rate": 7.668125710668554e-06, "loss": 1.2284, "step": 19725 }, { "epoch": 5.8753141346637126, "grad_norm": 0.35950028896331787, "learning_rate": 7.667187746501783e-06, "loss": 1.1866, "step": 19726 }, { "epoch": 5.875611980863382, "grad_norm": 0.2990448474884033, "learning_rate": 7.666249804039227e-06, "loss": 1.2333, "step": 19727 }, { "epoch": 5.87590982706305, "grad_norm": 0.40367069840431213, "learning_rate": 7.665311883289611e-06, "loss": 1.2208, "step": 19728 }, { "epoch": 5.876207673262719, "grad_norm": 0.2846141755580902, "learning_rate": 7.664373984261653e-06, "loss": 1.2303, "step": 19729 }, { "epoch": 5.876505519462388, "grad_norm": 0.4116760790348053, "learning_rate": 7.66343610696409e-06, "loss": 1.2218, "step": 19730 }, { "epoch": 5.876803365662056, "grad_norm": 0.27822595834732056, "learning_rate": 7.66249825140564e-06, "loss": 1.2121, "step": 19731 }, { "epoch": 5.877101211861725, "grad_norm": 0.3134772777557373, "learning_rate": 7.661560417595034e-06, "loss": 1.2094, "step": 19732 }, { "epoch": 5.8773990580613935, "grad_norm": 0.2531176805496216, "learning_rate": 7.660622605540999e-06, "loss": 1.2039, "step": 19733 }, { "epoch": 5.877696904261062, "grad_norm": 0.30348435044288635, "learning_rate": 7.65968481525225e-06, "loss": 1.2014, "step": 19734 }, { "epoch": 5.877994750460731, "grad_norm": 0.26167163252830505, "learning_rate": 7.658747046737525e-06, "loss": 1.2089, "step": 19735 }, { "epoch": 5.878292596660399, "grad_norm": 0.2847365438938141, "learning_rate": 7.657809300005544e-06, "loss": 1.2203, "step": 19736 }, { "epoch": 5.878590442860068, "grad_norm": 0.3568847179412842, "learning_rate": 7.656871575065025e-06, "loss": 1.2316, "step": 19737 }, { "epoch": 5.878888289059737, "grad_norm": 0.34484249353408813, "learning_rate": 7.6559338719247e-06, "loss": 1.2234, "step": 19738 }, { "epoch": 5.879186135259405, "grad_norm": 0.40856823325157166, "learning_rate": 7.654996190593295e-06, "loss": 1.1945, "step": 19739 }, { "epoch": 5.8794839814590745, "grad_norm": 0.3517288863658905, "learning_rate": 7.654058531079524e-06, "loss": 1.2309, "step": 19740 }, { "epoch": 5.879781827658743, "grad_norm": 0.40801236033439636, "learning_rate": 7.653120893392124e-06, "loss": 1.2108, "step": 19741 }, { "epoch": 5.880079673858411, "grad_norm": 0.2814236581325531, "learning_rate": 7.65218327753981e-06, "loss": 1.2134, "step": 19742 }, { "epoch": 5.88037752005808, "grad_norm": 0.41062241792678833, "learning_rate": 7.651245683531304e-06, "loss": 1.207, "step": 19743 }, { "epoch": 5.880675366257749, "grad_norm": 0.27050167322158813, "learning_rate": 7.650308111375334e-06, "loss": 1.2226, "step": 19744 }, { "epoch": 5.880973212457417, "grad_norm": 0.31314024329185486, "learning_rate": 7.649370561080628e-06, "loss": 1.2091, "step": 19745 }, { "epoch": 5.881271058657086, "grad_norm": 0.2666119337081909, "learning_rate": 7.648433032655893e-06, "loss": 1.1988, "step": 19746 }, { "epoch": 5.8815689048567545, "grad_norm": 0.3014333248138428, "learning_rate": 7.647495526109869e-06, "loss": 1.2095, "step": 19747 }, { "epoch": 5.881866751056423, "grad_norm": 0.2822328805923462, "learning_rate": 7.646558041451264e-06, "loss": 1.1923, "step": 19748 }, { "epoch": 5.882164597256092, "grad_norm": 0.32284829020500183, "learning_rate": 7.64562057868881e-06, "loss": 1.2149, "step": 19749 }, { "epoch": 5.88246244345576, "grad_norm": 0.27972474694252014, "learning_rate": 7.644683137831229e-06, "loss": 1.2031, "step": 19750 }, { "epoch": 5.88276028965543, "grad_norm": 0.28591328859329224, "learning_rate": 7.643745718887236e-06, "loss": 1.2073, "step": 19751 }, { "epoch": 5.883058135855098, "grad_norm": 0.2891368567943573, "learning_rate": 7.64280832186556e-06, "loss": 1.195, "step": 19752 }, { "epoch": 5.883355982054766, "grad_norm": 0.2828793227672577, "learning_rate": 7.641870946774919e-06, "loss": 1.2132, "step": 19753 }, { "epoch": 5.8836538282544355, "grad_norm": 0.3005000948905945, "learning_rate": 7.640933593624029e-06, "loss": 1.2077, "step": 19754 }, { "epoch": 5.883951674454104, "grad_norm": 0.26643726229667664, "learning_rate": 7.639996262421619e-06, "loss": 1.2157, "step": 19755 }, { "epoch": 5.884249520653772, "grad_norm": 0.3420386016368866, "learning_rate": 7.63905895317641e-06, "loss": 1.2136, "step": 19756 }, { "epoch": 5.884547366853441, "grad_norm": 0.29570236802101135, "learning_rate": 7.638121665897115e-06, "loss": 1.2052, "step": 19757 }, { "epoch": 5.88484521305311, "grad_norm": 0.33349350094795227, "learning_rate": 7.637184400592462e-06, "loss": 1.206, "step": 19758 }, { "epoch": 5.885143059252778, "grad_norm": 0.36793774366378784, "learning_rate": 7.63624715727117e-06, "loss": 1.2179, "step": 19759 }, { "epoch": 5.885440905452447, "grad_norm": 0.2907385528087616, "learning_rate": 7.635309935941953e-06, "loss": 1.2136, "step": 19760 }, { "epoch": 5.885738751652116, "grad_norm": 0.28459247946739197, "learning_rate": 7.634372736613535e-06, "loss": 1.1965, "step": 19761 }, { "epoch": 5.886036597851784, "grad_norm": 0.31736648082733154, "learning_rate": 7.633435559294635e-06, "loss": 1.2093, "step": 19762 }, { "epoch": 5.886334444051453, "grad_norm": 0.280977725982666, "learning_rate": 7.632498403993978e-06, "loss": 1.2093, "step": 19763 }, { "epoch": 5.8866322902511214, "grad_norm": 0.27929916977882385, "learning_rate": 7.631561270720278e-06, "loss": 1.2057, "step": 19764 }, { "epoch": 5.88693013645079, "grad_norm": 0.2878957986831665, "learning_rate": 7.63062415948225e-06, "loss": 1.1996, "step": 19765 }, { "epoch": 5.887227982650459, "grad_norm": 0.2803519666194916, "learning_rate": 7.629687070288619e-06, "loss": 1.2138, "step": 19766 }, { "epoch": 5.887525828850127, "grad_norm": 0.3282645046710968, "learning_rate": 7.628750003148102e-06, "loss": 1.2081, "step": 19767 }, { "epoch": 5.8878236750497965, "grad_norm": 0.2690000534057617, "learning_rate": 7.627812958069414e-06, "loss": 1.2241, "step": 19768 }, { "epoch": 5.888121521249465, "grad_norm": 0.30320388078689575, "learning_rate": 7.62687593506128e-06, "loss": 1.2101, "step": 19769 }, { "epoch": 5.888419367449133, "grad_norm": 0.33145949244499207, "learning_rate": 7.625938934132414e-06, "loss": 1.2162, "step": 19770 }, { "epoch": 5.888717213648802, "grad_norm": 0.3910709619522095, "learning_rate": 7.62500195529153e-06, "loss": 1.2086, "step": 19771 }, { "epoch": 5.889015059848471, "grad_norm": 0.28167808055877686, "learning_rate": 7.624064998547352e-06, "loss": 1.2151, "step": 19772 }, { "epoch": 5.889312906048139, "grad_norm": 0.2926369309425354, "learning_rate": 7.623128063908595e-06, "loss": 1.2043, "step": 19773 }, { "epoch": 5.889610752247808, "grad_norm": 0.2957414388656616, "learning_rate": 7.622191151383972e-06, "loss": 1.2069, "step": 19774 }, { "epoch": 5.889908598447477, "grad_norm": 0.27410802245140076, "learning_rate": 7.621254260982208e-06, "loss": 1.1913, "step": 19775 }, { "epoch": 5.890206444647145, "grad_norm": 0.28888213634490967, "learning_rate": 7.620317392712017e-06, "loss": 1.2156, "step": 19776 }, { "epoch": 5.890504290846814, "grad_norm": 0.28191816806793213, "learning_rate": 7.619380546582108e-06, "loss": 1.2265, "step": 19777 }, { "epoch": 5.8908021370464825, "grad_norm": 0.2632920444011688, "learning_rate": 7.618443722601205e-06, "loss": 1.2269, "step": 19778 }, { "epoch": 5.891099983246152, "grad_norm": 0.29163745045661926, "learning_rate": 7.6175069207780235e-06, "loss": 1.2136, "step": 19779 }, { "epoch": 5.89139782944582, "grad_norm": 0.29716843366622925, "learning_rate": 7.616570141121277e-06, "loss": 1.2261, "step": 19780 }, { "epoch": 5.891695675645488, "grad_norm": 0.3333028256893158, "learning_rate": 7.615633383639687e-06, "loss": 1.2004, "step": 19781 }, { "epoch": 5.891993521845158, "grad_norm": 0.25077876448631287, "learning_rate": 7.614696648341958e-06, "loss": 1.2159, "step": 19782 }, { "epoch": 5.892291368044826, "grad_norm": 0.45058029890060425, "learning_rate": 7.613759935236817e-06, "loss": 1.2052, "step": 19783 }, { "epoch": 5.892589214244494, "grad_norm": 0.2832079529762268, "learning_rate": 7.612823244332971e-06, "loss": 1.2084, "step": 19784 }, { "epoch": 5.892887060444163, "grad_norm": 0.4495527744293213, "learning_rate": 7.611886575639135e-06, "loss": 1.2083, "step": 19785 }, { "epoch": 5.893184906643832, "grad_norm": 0.2885785400867462, "learning_rate": 7.610949929164031e-06, "loss": 1.2109, "step": 19786 }, { "epoch": 5.8934827528435, "grad_norm": 0.3337485194206238, "learning_rate": 7.610013304916369e-06, "loss": 1.2041, "step": 19787 }, { "epoch": 5.893780599043169, "grad_norm": 0.29731377959251404, "learning_rate": 7.609076702904858e-06, "loss": 1.2176, "step": 19788 }, { "epoch": 5.894078445242838, "grad_norm": 0.345464289188385, "learning_rate": 7.608140123138222e-06, "loss": 1.2265, "step": 19789 }, { "epoch": 5.894376291442507, "grad_norm": 0.2939906418323517, "learning_rate": 7.607203565625168e-06, "loss": 1.2016, "step": 19790 }, { "epoch": 5.894674137642175, "grad_norm": 0.489046186208725, "learning_rate": 7.606267030374408e-06, "loss": 1.2291, "step": 19791 }, { "epoch": 5.8949719838418435, "grad_norm": 0.362615704536438, "learning_rate": 7.605330517394664e-06, "loss": 1.211, "step": 19792 }, { "epoch": 5.895269830041512, "grad_norm": 0.2884233891963959, "learning_rate": 7.604394026694644e-06, "loss": 1.199, "step": 19793 }, { "epoch": 5.895567676241181, "grad_norm": 0.4300801157951355, "learning_rate": 7.603457558283056e-06, "loss": 1.211, "step": 19794 }, { "epoch": 5.895865522440849, "grad_norm": 0.2703288793563843, "learning_rate": 7.602521112168624e-06, "loss": 1.2148, "step": 19795 }, { "epoch": 5.896163368640519, "grad_norm": 0.42262008786201477, "learning_rate": 7.601584688360049e-06, "loss": 1.2094, "step": 19796 }, { "epoch": 5.896461214840187, "grad_norm": 0.26051944494247437, "learning_rate": 7.60064828686605e-06, "loss": 1.1958, "step": 19797 }, { "epoch": 5.896759061039855, "grad_norm": 0.38392624258995056, "learning_rate": 7.599711907695341e-06, "loss": 1.2148, "step": 19798 }, { "epoch": 5.8970569072395245, "grad_norm": 0.2545011341571808, "learning_rate": 7.598775550856626e-06, "loss": 1.2116, "step": 19799 }, { "epoch": 5.897354753439193, "grad_norm": 0.2836942970752716, "learning_rate": 7.597839216358626e-06, "loss": 1.2115, "step": 19800 }, { "epoch": 5.897652599638861, "grad_norm": 0.345489501953125, "learning_rate": 7.5969029042100485e-06, "loss": 1.2126, "step": 19801 }, { "epoch": 5.89795044583853, "grad_norm": 0.29392674565315247, "learning_rate": 7.595966614419602e-06, "loss": 1.2323, "step": 19802 }, { "epoch": 5.898248292038199, "grad_norm": 0.5595733523368835, "learning_rate": 7.5950303469960005e-06, "loss": 1.2012, "step": 19803 }, { "epoch": 5.898546138237867, "grad_norm": 0.4478977918624878, "learning_rate": 7.594094101947957e-06, "loss": 1.1948, "step": 19804 }, { "epoch": 5.898843984437536, "grad_norm": 0.4497094452381134, "learning_rate": 7.593157879284177e-06, "loss": 1.2032, "step": 19805 }, { "epoch": 5.8991418306372045, "grad_norm": 0.2579902112483978, "learning_rate": 7.592221679013377e-06, "loss": 1.2131, "step": 19806 }, { "epoch": 5.899439676836874, "grad_norm": 0.7427989840507507, "learning_rate": 7.591285501144261e-06, "loss": 1.2173, "step": 19807 }, { "epoch": 5.899737523036542, "grad_norm": 0.25360074639320374, "learning_rate": 7.590349345685542e-06, "loss": 1.2181, "step": 19808 }, { "epoch": 5.90003536923621, "grad_norm": 0.41214168071746826, "learning_rate": 7.589413212645933e-06, "loss": 1.2305, "step": 19809 }, { "epoch": 5.90033321543588, "grad_norm": 0.4207543432712555, "learning_rate": 7.588477102034142e-06, "loss": 1.2187, "step": 19810 }, { "epoch": 5.900631061635548, "grad_norm": 0.2610822916030884, "learning_rate": 7.587541013858871e-06, "loss": 1.2179, "step": 19811 }, { "epoch": 5.900928907835216, "grad_norm": 0.5251509547233582, "learning_rate": 7.586604948128841e-06, "loss": 1.2047, "step": 19812 }, { "epoch": 5.9012267540348855, "grad_norm": 0.2954830229282379, "learning_rate": 7.585668904852752e-06, "loss": 1.2049, "step": 19813 }, { "epoch": 5.901524600234554, "grad_norm": 0.3921951651573181, "learning_rate": 7.584732884039317e-06, "loss": 1.202, "step": 19814 }, { "epoch": 5.901822446434222, "grad_norm": 0.3233911693096161, "learning_rate": 7.583796885697248e-06, "loss": 1.199, "step": 19815 }, { "epoch": 5.902120292633891, "grad_norm": 0.3623232841491699, "learning_rate": 7.582860909835243e-06, "loss": 1.2218, "step": 19816 }, { "epoch": 5.90241813883356, "grad_norm": 0.345813512802124, "learning_rate": 7.581924956462022e-06, "loss": 1.1904, "step": 19817 }, { "epoch": 5.902715985033229, "grad_norm": 0.3087981939315796, "learning_rate": 7.5809890255862885e-06, "loss": 1.2139, "step": 19818 }, { "epoch": 5.903013831232897, "grad_norm": 0.34175044298171997, "learning_rate": 7.580053117216744e-06, "loss": 1.2182, "step": 19819 }, { "epoch": 5.903311677432566, "grad_norm": 0.38446667790412903, "learning_rate": 7.579117231362104e-06, "loss": 1.23, "step": 19820 }, { "epoch": 5.903609523632235, "grad_norm": 0.3683859705924988, "learning_rate": 7.578181368031076e-06, "loss": 1.2236, "step": 19821 }, { "epoch": 5.903907369831903, "grad_norm": 0.2987349331378937, "learning_rate": 7.5772455272323595e-06, "loss": 1.1938, "step": 19822 }, { "epoch": 5.9042052160315714, "grad_norm": 0.28819775581359863, "learning_rate": 7.576309708974672e-06, "loss": 1.2069, "step": 19823 }, { "epoch": 5.904503062231241, "grad_norm": 0.3232839107513428, "learning_rate": 7.57537391326671e-06, "loss": 1.2079, "step": 19824 }, { "epoch": 5.904800908430909, "grad_norm": 0.29022216796875, "learning_rate": 7.5744381401171876e-06, "loss": 1.211, "step": 19825 }, { "epoch": 5.905098754630577, "grad_norm": 0.3549312949180603, "learning_rate": 7.573502389534807e-06, "loss": 1.1946, "step": 19826 }, { "epoch": 5.9053966008302465, "grad_norm": 0.24365216493606567, "learning_rate": 7.572566661528278e-06, "loss": 1.2172, "step": 19827 }, { "epoch": 5.905694447029915, "grad_norm": 0.423112154006958, "learning_rate": 7.5716309561063e-06, "loss": 1.2049, "step": 19828 }, { "epoch": 5.905992293229583, "grad_norm": 0.28737735748291016, "learning_rate": 7.5706952732775866e-06, "loss": 1.2189, "step": 19829 }, { "epoch": 5.906290139429252, "grad_norm": 0.39810413122177124, "learning_rate": 7.569759613050835e-06, "loss": 1.2068, "step": 19830 }, { "epoch": 5.906587985628921, "grad_norm": 0.39153480529785156, "learning_rate": 7.5688239754347605e-06, "loss": 1.2071, "step": 19831 }, { "epoch": 5.906885831828589, "grad_norm": 0.3402028977870941, "learning_rate": 7.56788836043806e-06, "loss": 1.2197, "step": 19832 }, { "epoch": 5.907183678028258, "grad_norm": 0.45036885142326355, "learning_rate": 7.566952768069438e-06, "loss": 1.2309, "step": 19833 }, { "epoch": 5.907481524227927, "grad_norm": 0.2677386999130249, "learning_rate": 7.566017198337607e-06, "loss": 1.2181, "step": 19834 }, { "epoch": 5.907779370427596, "grad_norm": 0.7536735534667969, "learning_rate": 7.565081651251267e-06, "loss": 1.2099, "step": 19835 }, { "epoch": 5.908077216627264, "grad_norm": 0.3469991981983185, "learning_rate": 7.564146126819117e-06, "loss": 1.2354, "step": 19836 }, { "epoch": 5.9083750628269325, "grad_norm": 0.5384515523910522, "learning_rate": 7.563210625049868e-06, "loss": 1.1995, "step": 19837 }, { "epoch": 5.908672909026602, "grad_norm": 0.257614403963089, "learning_rate": 7.562275145952225e-06, "loss": 1.2103, "step": 19838 }, { "epoch": 5.90897075522627, "grad_norm": 0.5288355350494385, "learning_rate": 7.561339689534881e-06, "loss": 1.2221, "step": 19839 }, { "epoch": 5.909268601425938, "grad_norm": 0.31223511695861816, "learning_rate": 7.560404255806554e-06, "loss": 1.211, "step": 19840 }, { "epoch": 5.909566447625608, "grad_norm": 0.42010438442230225, "learning_rate": 7.5594688447759375e-06, "loss": 1.2126, "step": 19841 }, { "epoch": 5.909864293825276, "grad_norm": 0.4113349914550781, "learning_rate": 7.558533456451733e-06, "loss": 1.2088, "step": 19842 }, { "epoch": 5.910162140024944, "grad_norm": 0.29698100686073303, "learning_rate": 7.5575980908426495e-06, "loss": 1.2351, "step": 19843 }, { "epoch": 5.910459986224613, "grad_norm": 0.5019407868385315, "learning_rate": 7.556662747957388e-06, "loss": 1.2101, "step": 19844 }, { "epoch": 5.910757832424282, "grad_norm": 0.2548619508743286, "learning_rate": 7.5557274278046466e-06, "loss": 1.2037, "step": 19845 }, { "epoch": 5.911055678623951, "grad_norm": 0.3295741379261017, "learning_rate": 7.554792130393135e-06, "loss": 1.2158, "step": 19846 }, { "epoch": 5.911353524823619, "grad_norm": 0.33598634600639343, "learning_rate": 7.5538568557315464e-06, "loss": 1.2051, "step": 19847 }, { "epoch": 5.911651371023288, "grad_norm": 0.3634411692619324, "learning_rate": 7.55292160382859e-06, "loss": 1.2389, "step": 19848 }, { "epoch": 5.911949217222957, "grad_norm": 0.3242103159427643, "learning_rate": 7.551986374692963e-06, "loss": 1.1995, "step": 19849 }, { "epoch": 5.912247063422625, "grad_norm": 0.3475632071495056, "learning_rate": 7.551051168333365e-06, "loss": 1.2008, "step": 19850 }, { "epoch": 5.9125449096222935, "grad_norm": 0.28974246978759766, "learning_rate": 7.550115984758504e-06, "loss": 1.2093, "step": 19851 }, { "epoch": 5.912842755821963, "grad_norm": 0.27499011158943176, "learning_rate": 7.549180823977077e-06, "loss": 1.2199, "step": 19852 }, { "epoch": 5.913140602021631, "grad_norm": 0.3993379473686218, "learning_rate": 7.548245685997779e-06, "loss": 1.2215, "step": 19853 }, { "epoch": 5.913438448221299, "grad_norm": 0.36304888129234314, "learning_rate": 7.54731057082932e-06, "loss": 1.2005, "step": 19854 }, { "epoch": 5.913736294420969, "grad_norm": 0.5126381516456604, "learning_rate": 7.546375478480394e-06, "loss": 1.2229, "step": 19855 }, { "epoch": 5.914034140620637, "grad_norm": 0.34435397386550903, "learning_rate": 7.5454404089597026e-06, "loss": 1.2157, "step": 19856 }, { "epoch": 5.914331986820306, "grad_norm": 0.45478591322898865, "learning_rate": 7.544505362275948e-06, "loss": 1.1986, "step": 19857 }, { "epoch": 5.9146298330199745, "grad_norm": 0.3183295428752899, "learning_rate": 7.543570338437828e-06, "loss": 1.2115, "step": 19858 }, { "epoch": 5.914927679219643, "grad_norm": 0.25597986578941345, "learning_rate": 7.542635337454037e-06, "loss": 1.2072, "step": 19859 }, { "epoch": 5.915225525419311, "grad_norm": 0.4365426003932953, "learning_rate": 7.5417003593332815e-06, "loss": 1.1927, "step": 19860 }, { "epoch": 5.91552337161898, "grad_norm": 0.3271276652812958, "learning_rate": 7.540765404084258e-06, "loss": 1.2189, "step": 19861 }, { "epoch": 5.915821217818649, "grad_norm": 0.44658908247947693, "learning_rate": 7.539830471715663e-06, "loss": 1.2112, "step": 19862 }, { "epoch": 5.916119064018318, "grad_norm": 0.3291972577571869, "learning_rate": 7.538895562236199e-06, "loss": 1.2147, "step": 19863 }, { "epoch": 5.916416910217986, "grad_norm": 0.3678570091724396, "learning_rate": 7.5379606756545585e-06, "loss": 1.2228, "step": 19864 }, { "epoch": 5.9167147564176545, "grad_norm": 0.2758341431617737, "learning_rate": 7.5370258119794485e-06, "loss": 1.2055, "step": 19865 }, { "epoch": 5.917012602617324, "grad_norm": 0.2678714394569397, "learning_rate": 7.536090971219557e-06, "loss": 1.2098, "step": 19866 }, { "epoch": 5.917310448816992, "grad_norm": 0.27749374508857727, "learning_rate": 7.535156153383588e-06, "loss": 1.1947, "step": 19867 }, { "epoch": 5.91760829501666, "grad_norm": 0.25734415650367737, "learning_rate": 7.534221358480237e-06, "loss": 1.2171, "step": 19868 }, { "epoch": 5.91790614121633, "grad_norm": 0.29759058356285095, "learning_rate": 7.5332865865182035e-06, "loss": 1.2056, "step": 19869 }, { "epoch": 5.918203987415998, "grad_norm": 0.30260103940963745, "learning_rate": 7.532351837506177e-06, "loss": 1.216, "step": 19870 }, { "epoch": 5.918501833615666, "grad_norm": 0.2465955913066864, "learning_rate": 7.531417111452866e-06, "loss": 1.2082, "step": 19871 }, { "epoch": 5.9187996798153355, "grad_norm": 0.28616493940353394, "learning_rate": 7.530482408366957e-06, "loss": 1.2153, "step": 19872 }, { "epoch": 5.919097526015004, "grad_norm": 0.29654261469841003, "learning_rate": 7.529547728257147e-06, "loss": 1.2132, "step": 19873 }, { "epoch": 5.919395372214673, "grad_norm": 0.28355711698532104, "learning_rate": 7.528613071132141e-06, "loss": 1.2141, "step": 19874 }, { "epoch": 5.919693218414341, "grad_norm": 0.4401582181453705, "learning_rate": 7.52767843700063e-06, "loss": 1.2302, "step": 19875 }, { "epoch": 5.91999106461401, "grad_norm": 0.404633104801178, "learning_rate": 7.526743825871303e-06, "loss": 1.2256, "step": 19876 }, { "epoch": 5.920288910813679, "grad_norm": 0.2894860804080963, "learning_rate": 7.525809237752866e-06, "loss": 1.2043, "step": 19877 }, { "epoch": 5.920586757013347, "grad_norm": 0.2649744749069214, "learning_rate": 7.524874672654005e-06, "loss": 1.2171, "step": 19878 }, { "epoch": 5.920884603213016, "grad_norm": 0.3483593761920929, "learning_rate": 7.5239401305834225e-06, "loss": 1.2182, "step": 19879 }, { "epoch": 5.921182449412685, "grad_norm": 0.2558915913105011, "learning_rate": 7.523005611549813e-06, "loss": 1.2175, "step": 19880 }, { "epoch": 5.921480295612353, "grad_norm": 0.3790781795978546, "learning_rate": 7.522071115561864e-06, "loss": 1.217, "step": 19881 }, { "epoch": 5.9217781418120214, "grad_norm": 0.26363861560821533, "learning_rate": 7.5211366426282795e-06, "loss": 1.2173, "step": 19882 }, { "epoch": 5.922075988011691, "grad_norm": 0.2687417268753052, "learning_rate": 7.520202192757747e-06, "loss": 1.2006, "step": 19883 }, { "epoch": 5.922373834211359, "grad_norm": 0.3512256443500519, "learning_rate": 7.519267765958963e-06, "loss": 1.2271, "step": 19884 }, { "epoch": 5.922671680411028, "grad_norm": 0.28896406292915344, "learning_rate": 7.518333362240619e-06, "loss": 1.2374, "step": 19885 }, { "epoch": 5.9229695266106965, "grad_norm": 0.29075995087623596, "learning_rate": 7.5173989816114145e-06, "loss": 1.2098, "step": 19886 }, { "epoch": 5.923267372810365, "grad_norm": 0.2722116708755493, "learning_rate": 7.516464624080033e-06, "loss": 1.2059, "step": 19887 }, { "epoch": 5.923565219010034, "grad_norm": 0.33740612864494324, "learning_rate": 7.515530289655179e-06, "loss": 1.2112, "step": 19888 }, { "epoch": 5.923863065209702, "grad_norm": 0.29584792256355286, "learning_rate": 7.5145959783455365e-06, "loss": 1.2078, "step": 19889 }, { "epoch": 5.924160911409371, "grad_norm": 0.3502522110939026, "learning_rate": 7.513661690159802e-06, "loss": 1.1741, "step": 19890 }, { "epoch": 5.92445875760904, "grad_norm": 0.2546685039997101, "learning_rate": 7.512727425106668e-06, "loss": 1.216, "step": 19891 }, { "epoch": 5.924756603808708, "grad_norm": 0.45806533098220825, "learning_rate": 7.511793183194829e-06, "loss": 1.2215, "step": 19892 }, { "epoch": 5.925054450008377, "grad_norm": 0.28287121653556824, "learning_rate": 7.510858964432969e-06, "loss": 1.1988, "step": 19893 }, { "epoch": 5.925352296208046, "grad_norm": 0.5181441307067871, "learning_rate": 7.509924768829789e-06, "loss": 1.2286, "step": 19894 }, { "epoch": 5.925650142407714, "grad_norm": 0.2704135775566101, "learning_rate": 7.5089905963939734e-06, "loss": 1.2006, "step": 19895 }, { "epoch": 5.9259479886073825, "grad_norm": 0.36298900842666626, "learning_rate": 7.50805644713422e-06, "loss": 1.2174, "step": 19896 }, { "epoch": 5.926245834807052, "grad_norm": 0.27507761120796204, "learning_rate": 7.507122321059219e-06, "loss": 1.2071, "step": 19897 }, { "epoch": 5.92654368100672, "grad_norm": 0.2856441140174866, "learning_rate": 7.506188218177655e-06, "loss": 1.1948, "step": 19898 }, { "epoch": 5.926841527206388, "grad_norm": 0.2776634693145752, "learning_rate": 7.505254138498228e-06, "loss": 1.2182, "step": 19899 }, { "epoch": 5.927139373406058, "grad_norm": 0.2945333421230316, "learning_rate": 7.504320082029623e-06, "loss": 1.2086, "step": 19900 }, { "epoch": 5.927437219605726, "grad_norm": 0.26102709770202637, "learning_rate": 7.5033860487805275e-06, "loss": 1.2187, "step": 19901 }, { "epoch": 5.927735065805395, "grad_norm": 0.3198640048503876, "learning_rate": 7.502452038759637e-06, "loss": 1.2311, "step": 19902 }, { "epoch": 5.928032912005063, "grad_norm": 0.2610388994216919, "learning_rate": 7.501518051975644e-06, "loss": 1.2136, "step": 19903 }, { "epoch": 5.928330758204732, "grad_norm": 0.3690877854824066, "learning_rate": 7.500584088437228e-06, "loss": 1.2181, "step": 19904 }, { "epoch": 5.928628604404401, "grad_norm": 0.2494429498910904, "learning_rate": 7.499650148153091e-06, "loss": 1.1962, "step": 19905 }, { "epoch": 5.928926450604069, "grad_norm": 0.3058006167411804, "learning_rate": 7.498716231131913e-06, "loss": 1.207, "step": 19906 }, { "epoch": 5.929224296803738, "grad_norm": 0.2546834945678711, "learning_rate": 7.497782337382383e-06, "loss": 1.2112, "step": 19907 }, { "epoch": 5.929522143003407, "grad_norm": 0.3980565369129181, "learning_rate": 7.496848466913195e-06, "loss": 1.2146, "step": 19908 }, { "epoch": 5.929819989203075, "grad_norm": 0.2976199686527252, "learning_rate": 7.495914619733037e-06, "loss": 1.2112, "step": 19909 }, { "epoch": 5.9301178354027435, "grad_norm": 0.4909135401248932, "learning_rate": 7.4949807958505915e-06, "loss": 1.2305, "step": 19910 }, { "epoch": 5.930415681602413, "grad_norm": 0.4183238744735718, "learning_rate": 7.494046995274556e-06, "loss": 1.2058, "step": 19911 }, { "epoch": 5.930713527802081, "grad_norm": 0.3827158510684967, "learning_rate": 7.493113218013608e-06, "loss": 1.2073, "step": 19912 }, { "epoch": 5.93101137400175, "grad_norm": 0.2779160439968109, "learning_rate": 7.492179464076446e-06, "loss": 1.1992, "step": 19913 }, { "epoch": 5.931309220201419, "grad_norm": 0.4791818857192993, "learning_rate": 7.491245733471748e-06, "loss": 1.2239, "step": 19914 }, { "epoch": 5.931607066401087, "grad_norm": 0.2737295627593994, "learning_rate": 7.490312026208205e-06, "loss": 1.2189, "step": 19915 }, { "epoch": 5.931904912600756, "grad_norm": 0.41740521788597107, "learning_rate": 7.4893783422945085e-06, "loss": 1.2203, "step": 19916 }, { "epoch": 5.9322027588004245, "grad_norm": 0.2548010051250458, "learning_rate": 7.488444681739342e-06, "loss": 1.2161, "step": 19917 }, { "epoch": 5.932500605000093, "grad_norm": 0.3685665428638458, "learning_rate": 7.487511044551388e-06, "loss": 1.2135, "step": 19918 }, { "epoch": 5.932798451199762, "grad_norm": 0.33215752243995667, "learning_rate": 7.486577430739337e-06, "loss": 1.2023, "step": 19919 }, { "epoch": 5.93309629739943, "grad_norm": 0.319754034280777, "learning_rate": 7.485643840311878e-06, "loss": 1.2233, "step": 19920 }, { "epoch": 5.933394143599099, "grad_norm": 0.3260960876941681, "learning_rate": 7.484710273277689e-06, "loss": 1.1984, "step": 19921 }, { "epoch": 5.933691989798768, "grad_norm": 0.2729150354862213, "learning_rate": 7.4837767296454645e-06, "loss": 1.2185, "step": 19922 }, { "epoch": 5.933989835998436, "grad_norm": 0.29566365480422974, "learning_rate": 7.482843209423887e-06, "loss": 1.2045, "step": 19923 }, { "epoch": 5.934287682198105, "grad_norm": 0.30384641885757446, "learning_rate": 7.4819097126216374e-06, "loss": 1.2194, "step": 19924 }, { "epoch": 5.934585528397774, "grad_norm": 0.28885459899902344, "learning_rate": 7.480976239247406e-06, "loss": 1.2152, "step": 19925 }, { "epoch": 5.934883374597442, "grad_norm": 0.31526216864585876, "learning_rate": 7.480042789309878e-06, "loss": 1.2103, "step": 19926 }, { "epoch": 5.93518122079711, "grad_norm": 0.3096555173397064, "learning_rate": 7.479109362817733e-06, "loss": 1.2158, "step": 19927 }, { "epoch": 5.93547906699678, "grad_norm": 0.278586745262146, "learning_rate": 7.478175959779663e-06, "loss": 1.1954, "step": 19928 }, { "epoch": 5.935776913196448, "grad_norm": 0.29671400785446167, "learning_rate": 7.477242580204343e-06, "loss": 1.2012, "step": 19929 }, { "epoch": 5.936074759396117, "grad_norm": 0.29190900921821594, "learning_rate": 7.476309224100468e-06, "loss": 1.2245, "step": 19930 }, { "epoch": 5.9363726055957855, "grad_norm": 0.3125079870223999, "learning_rate": 7.475375891476713e-06, "loss": 1.2131, "step": 19931 }, { "epoch": 5.936670451795454, "grad_norm": 0.2816937267780304, "learning_rate": 7.474442582341762e-06, "loss": 1.2136, "step": 19932 }, { "epoch": 5.936968297995123, "grad_norm": 0.2571212351322174, "learning_rate": 7.4735092967043066e-06, "loss": 1.2086, "step": 19933 }, { "epoch": 5.937266144194791, "grad_norm": 0.47276771068573, "learning_rate": 7.472576034573023e-06, "loss": 1.2021, "step": 19934 }, { "epoch": 5.93756399039446, "grad_norm": 0.36985161900520325, "learning_rate": 7.4716427959565925e-06, "loss": 1.2058, "step": 19935 }, { "epoch": 5.937861836594129, "grad_norm": 0.31467387080192566, "learning_rate": 7.470709580863704e-06, "loss": 1.2008, "step": 19936 }, { "epoch": 5.938159682793797, "grad_norm": 0.28341153264045715, "learning_rate": 7.469776389303036e-06, "loss": 1.2057, "step": 19937 }, { "epoch": 5.938457528993466, "grad_norm": 0.43463417887687683, "learning_rate": 7.468843221283269e-06, "loss": 1.2076, "step": 19938 }, { "epoch": 5.938755375193135, "grad_norm": 0.27061885595321655, "learning_rate": 7.467910076813092e-06, "loss": 1.2038, "step": 19939 }, { "epoch": 5.939053221392803, "grad_norm": 0.33500590920448303, "learning_rate": 7.466976955901184e-06, "loss": 1.206, "step": 19940 }, { "epoch": 5.939351067592472, "grad_norm": 0.31054073572158813, "learning_rate": 7.46604385855622e-06, "loss": 1.203, "step": 19941 }, { "epoch": 5.939648913792141, "grad_norm": 0.2667149007320404, "learning_rate": 7.465110784786891e-06, "loss": 1.2102, "step": 19942 }, { "epoch": 5.939946759991809, "grad_norm": 0.2642519772052765, "learning_rate": 7.4641777346018715e-06, "loss": 1.2058, "step": 19943 }, { "epoch": 5.940244606191478, "grad_norm": 0.36663520336151123, "learning_rate": 7.463244708009845e-06, "loss": 1.2281, "step": 19944 }, { "epoch": 5.9405424523911465, "grad_norm": 0.270404577255249, "learning_rate": 7.462311705019494e-06, "loss": 1.209, "step": 19945 }, { "epoch": 5.940840298590815, "grad_norm": 0.27171462774276733, "learning_rate": 7.461378725639495e-06, "loss": 1.2081, "step": 19946 }, { "epoch": 5.941138144790484, "grad_norm": 0.34753552079200745, "learning_rate": 7.4604457698785325e-06, "loss": 1.2037, "step": 19947 }, { "epoch": 5.941435990990152, "grad_norm": 0.3226587176322937, "learning_rate": 7.459512837745284e-06, "loss": 1.2135, "step": 19948 }, { "epoch": 5.941733837189821, "grad_norm": 0.3982117772102356, "learning_rate": 7.458579929248431e-06, "loss": 1.2064, "step": 19949 }, { "epoch": 5.94203168338949, "grad_norm": 0.5194597840309143, "learning_rate": 7.457647044396651e-06, "loss": 1.2028, "step": 19950 }, { "epoch": 5.942329529589158, "grad_norm": 0.27021604776382446, "learning_rate": 7.456714183198628e-06, "loss": 1.2288, "step": 19951 }, { "epoch": 5.9426273757888275, "grad_norm": 0.400470107793808, "learning_rate": 7.455781345663032e-06, "loss": 1.2179, "step": 19952 }, { "epoch": 5.942925221988496, "grad_norm": 0.32334771752357483, "learning_rate": 7.454848531798553e-06, "loss": 1.1957, "step": 19953 }, { "epoch": 5.943223068188164, "grad_norm": 0.48630544543266296, "learning_rate": 7.453915741613864e-06, "loss": 1.2048, "step": 19954 }, { "epoch": 5.943520914387833, "grad_norm": 0.3211387097835541, "learning_rate": 7.452982975117642e-06, "loss": 1.2081, "step": 19955 }, { "epoch": 5.943818760587502, "grad_norm": 0.42263099551200867, "learning_rate": 7.452050232318571e-06, "loss": 1.1995, "step": 19956 }, { "epoch": 5.94411660678717, "grad_norm": 0.4230688214302063, "learning_rate": 7.451117513225325e-06, "loss": 1.2047, "step": 19957 }, { "epoch": 5.944414452986839, "grad_norm": 0.304991751909256, "learning_rate": 7.4501848178465795e-06, "loss": 1.1918, "step": 19958 }, { "epoch": 5.944712299186508, "grad_norm": 0.49376922845840454, "learning_rate": 7.44925214619102e-06, "loss": 1.2028, "step": 19959 }, { "epoch": 5.945010145386176, "grad_norm": 0.2916540205478668, "learning_rate": 7.448319498267318e-06, "loss": 1.1948, "step": 19960 }, { "epoch": 5.945307991585845, "grad_norm": 0.2906514108181, "learning_rate": 7.447386874084148e-06, "loss": 1.2075, "step": 19961 }, { "epoch": 5.945605837785513, "grad_norm": 0.28675130009651184, "learning_rate": 7.446454273650198e-06, "loss": 1.2037, "step": 19962 }, { "epoch": 5.945903683985182, "grad_norm": 0.3520452678203583, "learning_rate": 7.445521696974132e-06, "loss": 1.216, "step": 19963 }, { "epoch": 5.946201530184851, "grad_norm": 0.2449936717748642, "learning_rate": 7.4445891440646365e-06, "loss": 1.2141, "step": 19964 }, { "epoch": 5.946499376384519, "grad_norm": 0.26454365253448486, "learning_rate": 7.443656614930386e-06, "loss": 1.2042, "step": 19965 }, { "epoch": 5.946797222584188, "grad_norm": 0.3150053322315216, "learning_rate": 7.442724109580048e-06, "loss": 1.2032, "step": 19966 }, { "epoch": 5.947095068783857, "grad_norm": 0.2718449532985687, "learning_rate": 7.441791628022308e-06, "loss": 1.1929, "step": 19967 }, { "epoch": 5.947392914983525, "grad_norm": 0.2793262302875519, "learning_rate": 7.440859170265841e-06, "loss": 1.2268, "step": 19968 }, { "epoch": 5.947690761183194, "grad_norm": 0.2563835680484772, "learning_rate": 7.439926736319316e-06, "loss": 1.2244, "step": 19969 }, { "epoch": 5.947988607382863, "grad_norm": 0.3162914216518402, "learning_rate": 7.438994326191417e-06, "loss": 1.2076, "step": 19970 }, { "epoch": 5.948286453582531, "grad_norm": 0.4131770730018616, "learning_rate": 7.43806193989081e-06, "loss": 1.2097, "step": 19971 }, { "epoch": 5.9485842997822, "grad_norm": 0.2641558349132538, "learning_rate": 7.437129577426177e-06, "loss": 1.2121, "step": 19972 }, { "epoch": 5.948882145981869, "grad_norm": 0.40105366706848145, "learning_rate": 7.436197238806189e-06, "loss": 1.2045, "step": 19973 }, { "epoch": 5.949179992181537, "grad_norm": 0.312832236289978, "learning_rate": 7.435264924039523e-06, "loss": 1.2291, "step": 19974 }, { "epoch": 5.949477838381206, "grad_norm": 0.27298468351364136, "learning_rate": 7.434332633134849e-06, "loss": 1.211, "step": 19975 }, { "epoch": 5.9497756845808745, "grad_norm": 0.28135013580322266, "learning_rate": 7.433400366100845e-06, "loss": 1.2132, "step": 19976 }, { "epoch": 5.950073530780543, "grad_norm": 0.2630771994590759, "learning_rate": 7.432468122946183e-06, "loss": 1.2169, "step": 19977 }, { "epoch": 5.950371376980212, "grad_norm": 0.4849890470504761, "learning_rate": 7.431535903679534e-06, "loss": 1.2166, "step": 19978 }, { "epoch": 5.95066922317988, "grad_norm": 0.2566927373409271, "learning_rate": 7.430603708309576e-06, "loss": 1.2213, "step": 19979 }, { "epoch": 5.95096706937955, "grad_norm": 0.3909648060798645, "learning_rate": 7.4296715368449775e-06, "loss": 1.2254, "step": 19980 }, { "epoch": 5.951264915579218, "grad_norm": 0.26811352372169495, "learning_rate": 7.428739389294417e-06, "loss": 1.1927, "step": 19981 }, { "epoch": 5.951562761778886, "grad_norm": 0.317093163728714, "learning_rate": 7.427807265666565e-06, "loss": 1.2083, "step": 19982 }, { "epoch": 5.951860607978555, "grad_norm": 0.2598755657672882, "learning_rate": 7.426875165970087e-06, "loss": 1.2277, "step": 19983 }, { "epoch": 5.952158454178224, "grad_norm": 0.24861247837543488, "learning_rate": 7.425943090213663e-06, "loss": 1.2053, "step": 19984 }, { "epoch": 5.952456300377892, "grad_norm": 0.2631685733795166, "learning_rate": 7.425011038405965e-06, "loss": 1.2027, "step": 19985 }, { "epoch": 5.952754146577561, "grad_norm": 0.27041855454444885, "learning_rate": 7.424079010555657e-06, "loss": 1.2099, "step": 19986 }, { "epoch": 5.95305199277723, "grad_norm": 0.25152724981307983, "learning_rate": 7.423147006671421e-06, "loss": 1.1967, "step": 19987 }, { "epoch": 5.953349838976898, "grad_norm": 0.29649677872657776, "learning_rate": 7.422215026761923e-06, "loss": 1.206, "step": 19988 }, { "epoch": 5.953647685176567, "grad_norm": 0.2471916675567627, "learning_rate": 7.421283070835831e-06, "loss": 1.2142, "step": 19989 }, { "epoch": 5.9539455313762355, "grad_norm": 0.25260427594184875, "learning_rate": 7.4203511389018204e-06, "loss": 1.1993, "step": 19990 }, { "epoch": 5.954243377575905, "grad_norm": 0.42111966013908386, "learning_rate": 7.419419230968561e-06, "loss": 1.2141, "step": 19991 }, { "epoch": 5.954541223775573, "grad_norm": 0.44608354568481445, "learning_rate": 7.418487347044719e-06, "loss": 1.2041, "step": 19992 }, { "epoch": 5.954839069975241, "grad_norm": 0.35318055748939514, "learning_rate": 7.4175554871389735e-06, "loss": 1.2191, "step": 19993 }, { "epoch": 5.95513691617491, "grad_norm": 0.3805428445339203, "learning_rate": 7.416623651259983e-06, "loss": 1.1964, "step": 19994 }, { "epoch": 5.955434762374579, "grad_norm": 0.41821813583374023, "learning_rate": 7.415691839416428e-06, "loss": 1.2229, "step": 19995 }, { "epoch": 5.955732608574247, "grad_norm": 0.3168116509914398, "learning_rate": 7.414760051616972e-06, "loss": 1.2023, "step": 19996 }, { "epoch": 5.9560304547739165, "grad_norm": 0.3952611982822418, "learning_rate": 7.413828287870281e-06, "loss": 1.2217, "step": 19997 }, { "epoch": 5.956328300973585, "grad_norm": 0.3050205409526825, "learning_rate": 7.412896548185033e-06, "loss": 1.1859, "step": 19998 }, { "epoch": 5.956626147173253, "grad_norm": 0.3769543170928955, "learning_rate": 7.411964832569894e-06, "loss": 1.2154, "step": 19999 }, { "epoch": 5.956923993372922, "grad_norm": 0.45296552777290344, "learning_rate": 7.411033141033525e-06, "loss": 1.2171, "step": 20000 }, { "epoch": 5.956923993372922, "eval_loss": 1.3239740133285522, "eval_runtime": 23.6264, "eval_samples_per_second": 73.392, "eval_steps_per_second": 4.613, "step": 20000 }, { "epoch": 5.957221839572591, "grad_norm": 0.354196697473526, "learning_rate": 7.410101473584605e-06, "loss": 1.2021, "step": 20001 }, { "epoch": 5.957519685772259, "grad_norm": 0.4095867872238159, "learning_rate": 7.409169830231795e-06, "loss": 1.205, "step": 20002 }, { "epoch": 5.957817531971928, "grad_norm": 0.2558134198188782, "learning_rate": 7.408238210983763e-06, "loss": 1.2169, "step": 20003 }, { "epoch": 5.9581153781715965, "grad_norm": 0.31956395506858826, "learning_rate": 7.407306615849182e-06, "loss": 1.2084, "step": 20004 }, { "epoch": 5.958413224371265, "grad_norm": 0.41249266266822815, "learning_rate": 7.406375044836717e-06, "loss": 1.2099, "step": 20005 }, { "epoch": 5.958711070570934, "grad_norm": 0.3225463926792145, "learning_rate": 7.4054434979550305e-06, "loss": 1.2103, "step": 20006 }, { "epoch": 5.959008916770602, "grad_norm": 0.3751731812953949, "learning_rate": 7.404511975212796e-06, "loss": 1.2268, "step": 20007 }, { "epoch": 5.959306762970272, "grad_norm": 0.28736716508865356, "learning_rate": 7.403580476618678e-06, "loss": 1.1916, "step": 20008 }, { "epoch": 5.95960460916994, "grad_norm": 0.44467705488204956, "learning_rate": 7.402649002181339e-06, "loss": 1.2004, "step": 20009 }, { "epoch": 5.959902455369608, "grad_norm": 0.31485164165496826, "learning_rate": 7.4017175519094555e-06, "loss": 1.2189, "step": 20010 }, { "epoch": 5.9602003015692775, "grad_norm": 0.37998661398887634, "learning_rate": 7.400786125811681e-06, "loss": 1.2049, "step": 20011 }, { "epoch": 5.960498147768946, "grad_norm": 0.363171249628067, "learning_rate": 7.399854723896691e-06, "loss": 1.1948, "step": 20012 }, { "epoch": 5.960795993968614, "grad_norm": 0.268888384103775, "learning_rate": 7.398923346173148e-06, "loss": 1.2134, "step": 20013 }, { "epoch": 5.961093840168283, "grad_norm": 0.3031296133995056, "learning_rate": 7.397991992649713e-06, "loss": 1.2232, "step": 20014 }, { "epoch": 5.961391686367952, "grad_norm": 0.2481214851140976, "learning_rate": 7.397060663335061e-06, "loss": 1.2176, "step": 20015 }, { "epoch": 5.96168953256762, "grad_norm": 0.30856403708457947, "learning_rate": 7.396129358237851e-06, "loss": 1.219, "step": 20016 }, { "epoch": 5.961987378767289, "grad_norm": 0.3058025538921356, "learning_rate": 7.395198077366743e-06, "loss": 1.2061, "step": 20017 }, { "epoch": 5.962285224966958, "grad_norm": 0.3234337568283081, "learning_rate": 7.394266820730412e-06, "loss": 1.2101, "step": 20018 }, { "epoch": 5.962583071166627, "grad_norm": 0.25915324687957764, "learning_rate": 7.393335588337514e-06, "loss": 1.2086, "step": 20019 }, { "epoch": 5.962880917366295, "grad_norm": 0.3813258707523346, "learning_rate": 7.392404380196714e-06, "loss": 1.1911, "step": 20020 }, { "epoch": 5.963178763565963, "grad_norm": 0.326995313167572, "learning_rate": 7.391473196316683e-06, "loss": 1.2214, "step": 20021 }, { "epoch": 5.963476609765633, "grad_norm": 0.370278924703598, "learning_rate": 7.3905420367060784e-06, "loss": 1.2223, "step": 20022 }, { "epoch": 5.963774455965301, "grad_norm": 0.4342767894268036, "learning_rate": 7.389610901373561e-06, "loss": 1.2211, "step": 20023 }, { "epoch": 5.964072302164969, "grad_norm": 0.32407841086387634, "learning_rate": 7.388679790327802e-06, "loss": 1.2157, "step": 20024 }, { "epoch": 5.9643701483646385, "grad_norm": 0.4559040367603302, "learning_rate": 7.387748703577458e-06, "loss": 1.218, "step": 20025 }, { "epoch": 5.964667994564307, "grad_norm": 0.2781195640563965, "learning_rate": 7.386817641131191e-06, "loss": 1.2271, "step": 20026 }, { "epoch": 5.964965840763975, "grad_norm": 0.38680049777030945, "learning_rate": 7.385886602997672e-06, "loss": 1.1975, "step": 20027 }, { "epoch": 5.965263686963644, "grad_norm": 0.3065037131309509, "learning_rate": 7.384955589185552e-06, "loss": 1.2029, "step": 20028 }, { "epoch": 5.965561533163313, "grad_norm": 0.482476145029068, "learning_rate": 7.384024599703502e-06, "loss": 1.2281, "step": 20029 }, { "epoch": 5.965859379362981, "grad_norm": 0.3051374554634094, "learning_rate": 7.3830936345601785e-06, "loss": 1.2111, "step": 20030 }, { "epoch": 5.96615722556265, "grad_norm": 0.4405565857887268, "learning_rate": 7.382162693764245e-06, "loss": 1.2121, "step": 20031 }, { "epoch": 5.966455071762319, "grad_norm": 0.3059985339641571, "learning_rate": 7.381231777324365e-06, "loss": 1.2155, "step": 20032 }, { "epoch": 5.966752917961987, "grad_norm": 0.33760663866996765, "learning_rate": 7.380300885249197e-06, "loss": 1.208, "step": 20033 }, { "epoch": 5.967050764161656, "grad_norm": 0.28422605991363525, "learning_rate": 7.379370017547399e-06, "loss": 1.2118, "step": 20034 }, { "epoch": 5.9673486103613245, "grad_norm": 0.262935608625412, "learning_rate": 7.37843917422764e-06, "loss": 1.2182, "step": 20035 }, { "epoch": 5.967646456560994, "grad_norm": 0.30111443996429443, "learning_rate": 7.377508355298572e-06, "loss": 1.2026, "step": 20036 }, { "epoch": 5.967944302760662, "grad_norm": 0.3197158873081207, "learning_rate": 7.376577560768861e-06, "loss": 1.2114, "step": 20037 }, { "epoch": 5.96824214896033, "grad_norm": 0.2820962965488434, "learning_rate": 7.375646790647162e-06, "loss": 1.2015, "step": 20038 }, { "epoch": 5.96853999516, "grad_norm": 0.3418351709842682, "learning_rate": 7.37471604494214e-06, "loss": 1.2303, "step": 20039 }, { "epoch": 5.968837841359668, "grad_norm": 0.2564753592014313, "learning_rate": 7.37378532366245e-06, "loss": 1.2147, "step": 20040 }, { "epoch": 5.969135687559336, "grad_norm": 0.2663310170173645, "learning_rate": 7.3728546268167554e-06, "loss": 1.2172, "step": 20041 }, { "epoch": 5.969433533759005, "grad_norm": 0.44179773330688477, "learning_rate": 7.371923954413712e-06, "loss": 1.2004, "step": 20042 }, { "epoch": 5.969731379958674, "grad_norm": 0.3131563663482666, "learning_rate": 7.370993306461978e-06, "loss": 1.2097, "step": 20043 }, { "epoch": 5.970029226158342, "grad_norm": 0.409889817237854, "learning_rate": 7.3700626829702185e-06, "loss": 1.2083, "step": 20044 }, { "epoch": 5.970327072358011, "grad_norm": 0.2620203495025635, "learning_rate": 7.369132083947083e-06, "loss": 1.2062, "step": 20045 }, { "epoch": 5.97062491855768, "grad_norm": 0.34795284271240234, "learning_rate": 7.368201509401238e-06, "loss": 1.2023, "step": 20046 }, { "epoch": 5.970922764757349, "grad_norm": 0.45827654004096985, "learning_rate": 7.367270959341337e-06, "loss": 1.2206, "step": 20047 }, { "epoch": 5.971220610957017, "grad_norm": 0.6986866593360901, "learning_rate": 7.366340433776034e-06, "loss": 1.2059, "step": 20048 }, { "epoch": 5.9715184571566855, "grad_norm": 0.28741317987442017, "learning_rate": 7.3654099327139936e-06, "loss": 1.2188, "step": 20049 }, { "epoch": 5.971816303356355, "grad_norm": 0.40813660621643066, "learning_rate": 7.364479456163873e-06, "loss": 1.2007, "step": 20050 }, { "epoch": 5.972114149556023, "grad_norm": 0.4476781189441681, "learning_rate": 7.363549004134322e-06, "loss": 1.2138, "step": 20051 }, { "epoch": 5.972411995755691, "grad_norm": 0.27198249101638794, "learning_rate": 7.362618576634005e-06, "loss": 1.2125, "step": 20052 }, { "epoch": 5.972709841955361, "grad_norm": 0.5791362524032593, "learning_rate": 7.361688173671576e-06, "loss": 1.2073, "step": 20053 }, { "epoch": 5.973007688155029, "grad_norm": 0.30177468061447144, "learning_rate": 7.3607577952556864e-06, "loss": 1.2159, "step": 20054 }, { "epoch": 5.973305534354697, "grad_norm": 0.35324525833129883, "learning_rate": 7.3598274413949996e-06, "loss": 1.2069, "step": 20055 }, { "epoch": 5.9736033805543665, "grad_norm": 0.4216848313808441, "learning_rate": 7.358897112098171e-06, "loss": 1.2163, "step": 20056 }, { "epoch": 5.973901226754035, "grad_norm": 0.2792426347732544, "learning_rate": 7.357966807373848e-06, "loss": 1.2211, "step": 20057 }, { "epoch": 5.974199072953704, "grad_norm": 0.367888867855072, "learning_rate": 7.357036527230699e-06, "loss": 1.2098, "step": 20058 }, { "epoch": 5.974496919153372, "grad_norm": 0.2614375054836273, "learning_rate": 7.356106271677368e-06, "loss": 1.1958, "step": 20059 }, { "epoch": 5.974794765353041, "grad_norm": 0.2798803150653839, "learning_rate": 7.355176040722514e-06, "loss": 1.2257, "step": 20060 }, { "epoch": 5.975092611552709, "grad_norm": 0.2662627696990967, "learning_rate": 7.354245834374793e-06, "loss": 1.2105, "step": 20061 }, { "epoch": 5.975390457752378, "grad_norm": 0.2591420114040375, "learning_rate": 7.3533156526428565e-06, "loss": 1.222, "step": 20062 }, { "epoch": 5.9756883039520465, "grad_norm": 0.3019726574420929, "learning_rate": 7.3523854955353635e-06, "loss": 1.2012, "step": 20063 }, { "epoch": 5.975986150151716, "grad_norm": 0.2563430368900299, "learning_rate": 7.351455363060967e-06, "loss": 1.217, "step": 20064 }, { "epoch": 5.976283996351384, "grad_norm": 0.2991671562194824, "learning_rate": 7.350525255228314e-06, "loss": 1.21, "step": 20065 }, { "epoch": 5.976581842551052, "grad_norm": 0.28286856412887573, "learning_rate": 7.3495951720460665e-06, "loss": 1.1913, "step": 20066 }, { "epoch": 5.976879688750722, "grad_norm": 0.25297608971595764, "learning_rate": 7.348665113522877e-06, "loss": 1.2024, "step": 20067 }, { "epoch": 5.97717753495039, "grad_norm": 0.42464059591293335, "learning_rate": 7.347735079667391e-06, "loss": 1.2023, "step": 20068 }, { "epoch": 5.977475381150058, "grad_norm": 0.32270076870918274, "learning_rate": 7.346805070488271e-06, "loss": 1.2263, "step": 20069 }, { "epoch": 5.9777732273497275, "grad_norm": 0.30218422412872314, "learning_rate": 7.3458750859941655e-06, "loss": 1.222, "step": 20070 }, { "epoch": 5.978071073549396, "grad_norm": 0.2741331160068512, "learning_rate": 7.344945126193723e-06, "loss": 1.2006, "step": 20071 }, { "epoch": 5.978368919749064, "grad_norm": 0.4426378309726715, "learning_rate": 7.344015191095603e-06, "loss": 1.1973, "step": 20072 }, { "epoch": 5.978666765948733, "grad_norm": 0.260190486907959, "learning_rate": 7.343085280708455e-06, "loss": 1.2037, "step": 20073 }, { "epoch": 5.978964612148402, "grad_norm": 0.37847578525543213, "learning_rate": 7.342155395040926e-06, "loss": 1.2049, "step": 20074 }, { "epoch": 5.979262458348071, "grad_norm": 0.26997873187065125, "learning_rate": 7.341225534101676e-06, "loss": 1.2282, "step": 20075 }, { "epoch": 5.979560304547739, "grad_norm": 0.4619077444076538, "learning_rate": 7.3402956978993514e-06, "loss": 1.2095, "step": 20076 }, { "epoch": 5.979858150747408, "grad_norm": 0.28734198212623596, "learning_rate": 7.3393658864426e-06, "loss": 1.2076, "step": 20077 }, { "epoch": 5.980155996947077, "grad_norm": 0.28254473209381104, "learning_rate": 7.338436099740079e-06, "loss": 1.2234, "step": 20078 }, { "epoch": 5.980453843146745, "grad_norm": 0.27819713950157166, "learning_rate": 7.337506337800433e-06, "loss": 1.2155, "step": 20079 }, { "epoch": 5.980751689346413, "grad_norm": 0.2774719297885895, "learning_rate": 7.33657660063232e-06, "loss": 1.212, "step": 20080 }, { "epoch": 5.981049535546083, "grad_norm": 0.29275059700012207, "learning_rate": 7.335646888244386e-06, "loss": 1.1926, "step": 20081 }, { "epoch": 5.981347381745751, "grad_norm": 0.26836857199668884, "learning_rate": 7.334717200645278e-06, "loss": 1.2126, "step": 20082 }, { "epoch": 5.981645227945419, "grad_norm": 0.2950221300125122, "learning_rate": 7.333787537843652e-06, "loss": 1.2217, "step": 20083 }, { "epoch": 5.9819430741450885, "grad_norm": 0.2860493063926697, "learning_rate": 7.3328578998481514e-06, "loss": 1.2034, "step": 20084 }, { "epoch": 5.982240920344757, "grad_norm": 0.2634318470954895, "learning_rate": 7.331928286667427e-06, "loss": 1.2034, "step": 20085 }, { "epoch": 5.982538766544426, "grad_norm": 0.2731645405292511, "learning_rate": 7.330998698310133e-06, "loss": 1.1952, "step": 20086 }, { "epoch": 5.982836612744094, "grad_norm": 0.29142895340919495, "learning_rate": 7.330069134784914e-06, "loss": 1.218, "step": 20087 }, { "epoch": 5.983134458943763, "grad_norm": 0.275729775428772, "learning_rate": 7.329139596100414e-06, "loss": 1.2227, "step": 20088 }, { "epoch": 5.983432305143432, "grad_norm": 0.38682979345321655, "learning_rate": 7.328210082265289e-06, "loss": 1.2105, "step": 20089 }, { "epoch": 5.9837301513431, "grad_norm": 0.3280767798423767, "learning_rate": 7.327280593288186e-06, "loss": 1.2086, "step": 20090 }, { "epoch": 5.984027997542769, "grad_norm": 0.3915572762489319, "learning_rate": 7.326351129177746e-06, "loss": 1.2036, "step": 20091 }, { "epoch": 5.984325843742438, "grad_norm": 0.25731709599494934, "learning_rate": 7.325421689942627e-06, "loss": 1.2025, "step": 20092 }, { "epoch": 5.984623689942106, "grad_norm": 0.7086104154586792, "learning_rate": 7.32449227559147e-06, "loss": 1.1935, "step": 20093 }, { "epoch": 5.9849215361417745, "grad_norm": 0.3194904923439026, "learning_rate": 7.323562886132919e-06, "loss": 1.2096, "step": 20094 }, { "epoch": 5.985219382341444, "grad_norm": 0.4881739318370819, "learning_rate": 7.3226335215756274e-06, "loss": 1.2136, "step": 20095 }, { "epoch": 5.985517228541112, "grad_norm": 0.2634449601173401, "learning_rate": 7.32170418192824e-06, "loss": 1.2184, "step": 20096 }, { "epoch": 5.985815074740781, "grad_norm": 0.4980546832084656, "learning_rate": 7.3207748671994016e-06, "loss": 1.2074, "step": 20097 }, { "epoch": 5.9861129209404496, "grad_norm": 0.2874886691570282, "learning_rate": 7.319845577397763e-06, "loss": 1.2205, "step": 20098 }, { "epoch": 5.986410767140118, "grad_norm": 0.342190682888031, "learning_rate": 7.3189163125319615e-06, "loss": 1.2136, "step": 20099 }, { "epoch": 5.986708613339786, "grad_norm": 0.4686349630355835, "learning_rate": 7.317987072610653e-06, "loss": 1.2031, "step": 20100 }, { "epoch": 5.987006459539455, "grad_norm": 0.29796168208122253, "learning_rate": 7.317057857642476e-06, "loss": 1.2036, "step": 20101 }, { "epoch": 5.987304305739124, "grad_norm": 0.5794475674629211, "learning_rate": 7.316128667636077e-06, "loss": 1.2193, "step": 20102 }, { "epoch": 5.987602151938793, "grad_norm": 0.36775219440460205, "learning_rate": 7.315199502600106e-06, "loss": 1.2163, "step": 20103 }, { "epoch": 5.987899998138461, "grad_norm": 0.380149245262146, "learning_rate": 7.314270362543203e-06, "loss": 1.2146, "step": 20104 }, { "epoch": 5.98819784433813, "grad_norm": 0.34766989946365356, "learning_rate": 7.313341247474009e-06, "loss": 1.2176, "step": 20105 }, { "epoch": 5.988495690537799, "grad_norm": 0.7177759408950806, "learning_rate": 7.312412157401179e-06, "loss": 1.2095, "step": 20106 }, { "epoch": 5.988793536737467, "grad_norm": 0.274689257144928, "learning_rate": 7.311483092333347e-06, "loss": 1.2226, "step": 20107 }, { "epoch": 5.9890913829371355, "grad_norm": 0.4082862138748169, "learning_rate": 7.310554052279161e-06, "loss": 1.2127, "step": 20108 }, { "epoch": 5.989389229136805, "grad_norm": 0.3884943425655365, "learning_rate": 7.309625037247268e-06, "loss": 1.2079, "step": 20109 }, { "epoch": 5.989687075336473, "grad_norm": 0.5350891351699829, "learning_rate": 7.308696047246304e-06, "loss": 1.2075, "step": 20110 }, { "epoch": 5.989984921536141, "grad_norm": 0.4405856430530548, "learning_rate": 7.30776708228492e-06, "loss": 1.2106, "step": 20111 }, { "epoch": 5.990282767735811, "grad_norm": 0.4748976528644562, "learning_rate": 7.306838142371756e-06, "loss": 1.2141, "step": 20112 }, { "epoch": 5.990580613935479, "grad_norm": 0.4101448655128479, "learning_rate": 7.3059092275154495e-06, "loss": 1.1996, "step": 20113 }, { "epoch": 5.990878460135148, "grad_norm": 0.31333962082862854, "learning_rate": 7.30498033772465e-06, "loss": 1.2052, "step": 20114 }, { "epoch": 5.9911763063348165, "grad_norm": 0.3447631895542145, "learning_rate": 7.304051473008e-06, "loss": 1.2148, "step": 20115 }, { "epoch": 5.991474152534485, "grad_norm": 0.37121906876564026, "learning_rate": 7.3031226333741334e-06, "loss": 1.2133, "step": 20116 }, { "epoch": 5.991771998734154, "grad_norm": 0.3757255971431732, "learning_rate": 7.302193818831703e-06, "loss": 1.2047, "step": 20117 }, { "epoch": 5.992069844933822, "grad_norm": 0.3267693817615509, "learning_rate": 7.301265029389342e-06, "loss": 1.1951, "step": 20118 }, { "epoch": 5.992367691133491, "grad_norm": 0.287718802690506, "learning_rate": 7.300336265055697e-06, "loss": 1.2135, "step": 20119 }, { "epoch": 5.99266553733316, "grad_norm": 0.4105055034160614, "learning_rate": 7.299407525839404e-06, "loss": 1.2068, "step": 20120 }, { "epoch": 5.992963383532828, "grad_norm": 0.2679644227027893, "learning_rate": 7.29847881174911e-06, "loss": 1.2249, "step": 20121 }, { "epoch": 5.9932612297324965, "grad_norm": 0.2722683846950531, "learning_rate": 7.297550122793447e-06, "loss": 1.1984, "step": 20122 }, { "epoch": 5.993559075932166, "grad_norm": 0.37727558612823486, "learning_rate": 7.296621458981066e-06, "loss": 1.2037, "step": 20123 }, { "epoch": 5.993856922131834, "grad_norm": 0.2851454019546509, "learning_rate": 7.295692820320599e-06, "loss": 1.222, "step": 20124 }, { "epoch": 5.994154768331503, "grad_norm": 0.346204936504364, "learning_rate": 7.294764206820688e-06, "loss": 1.2093, "step": 20125 }, { "epoch": 5.994452614531172, "grad_norm": 0.29175764322280884, "learning_rate": 7.293835618489977e-06, "loss": 1.2222, "step": 20126 }, { "epoch": 5.99475046073084, "grad_norm": 0.2784067690372467, "learning_rate": 7.292907055337099e-06, "loss": 1.1932, "step": 20127 }, { "epoch": 5.995048306930508, "grad_norm": 0.2747276723384857, "learning_rate": 7.2919785173706994e-06, "loss": 1.209, "step": 20128 }, { "epoch": 5.9953461531301775, "grad_norm": 0.47183743119239807, "learning_rate": 7.291050004599414e-06, "loss": 1.2141, "step": 20129 }, { "epoch": 5.995643999329846, "grad_norm": 0.38707008957862854, "learning_rate": 7.290121517031879e-06, "loss": 1.2008, "step": 20130 }, { "epoch": 5.995941845529515, "grad_norm": 0.26040273904800415, "learning_rate": 7.289193054676735e-06, "loss": 1.2055, "step": 20131 }, { "epoch": 5.996239691729183, "grad_norm": 0.2804533839225769, "learning_rate": 7.2882646175426254e-06, "loss": 1.2178, "step": 20132 }, { "epoch": 5.996537537928852, "grad_norm": 0.2642832100391388, "learning_rate": 7.287336205638178e-06, "loss": 1.2115, "step": 20133 }, { "epoch": 5.996835384128521, "grad_norm": 0.26809191703796387, "learning_rate": 7.286407818972042e-06, "loss": 1.2166, "step": 20134 }, { "epoch": 5.997133230328189, "grad_norm": 0.31540343165397644, "learning_rate": 7.2854794575528485e-06, "loss": 1.2273, "step": 20135 }, { "epoch": 5.997431076527858, "grad_norm": 0.31573593616485596, "learning_rate": 7.284551121389232e-06, "loss": 1.2178, "step": 20136 }, { "epoch": 5.997728922727527, "grad_norm": 0.2828293442726135, "learning_rate": 7.283622810489836e-06, "loss": 1.2003, "step": 20137 }, { "epoch": 5.998026768927195, "grad_norm": 0.3221212923526764, "learning_rate": 7.282694524863297e-06, "loss": 1.2245, "step": 20138 }, { "epoch": 5.998324615126863, "grad_norm": 0.32015347480773926, "learning_rate": 7.281766264518244e-06, "loss": 1.2138, "step": 20139 }, { "epoch": 5.998622461326533, "grad_norm": 0.28515100479125977, "learning_rate": 7.280838029463324e-06, "loss": 1.2293, "step": 20140 }, { "epoch": 5.998920307526201, "grad_norm": 0.2944874167442322, "learning_rate": 7.279909819707166e-06, "loss": 1.1981, "step": 20141 }, { "epoch": 5.99921815372587, "grad_norm": 0.26799100637435913, "learning_rate": 7.278981635258408e-06, "loss": 1.2172, "step": 20142 }, { "epoch": 5.9995159999255385, "grad_norm": 0.41549232602119446, "learning_rate": 7.278053476125686e-06, "loss": 1.1884, "step": 20143 }, { "epoch": 5.999813846125207, "grad_norm": 0.26328161358833313, "learning_rate": 7.277125342317632e-06, "loss": 1.2016, "step": 20144 }, { "epoch": 6.000111692324876, "grad_norm": 0.43518418073654175, "learning_rate": 7.27619723384289e-06, "loss": 1.2256, "step": 20145 }, { "epoch": 6.000409538524544, "grad_norm": 0.32351791858673096, "learning_rate": 7.2752691507100895e-06, "loss": 1.2045, "step": 20146 }, { "epoch": 6.000707384724213, "grad_norm": 0.4505084156990051, "learning_rate": 7.274341092927861e-06, "loss": 1.2168, "step": 20147 }, { "epoch": 6.001005230923882, "grad_norm": 0.302665114402771, "learning_rate": 7.273413060504846e-06, "loss": 1.2008, "step": 20148 }, { "epoch": 6.00130307712355, "grad_norm": 0.43544793128967285, "learning_rate": 7.272485053449676e-06, "loss": 1.2138, "step": 20149 }, { "epoch": 6.001600923323219, "grad_norm": 0.3466516435146332, "learning_rate": 7.271557071770983e-06, "loss": 1.213, "step": 20150 }, { "epoch": 6.001898769522888, "grad_norm": 0.40594616532325745, "learning_rate": 7.270629115477406e-06, "loss": 1.2088, "step": 20151 }, { "epoch": 6.002196615722556, "grad_norm": 0.3146449029445648, "learning_rate": 7.269701184577577e-06, "loss": 1.2139, "step": 20152 }, { "epoch": 6.002494461922225, "grad_norm": 0.29757311940193176, "learning_rate": 7.2687732790801225e-06, "loss": 1.192, "step": 20153 }, { "epoch": 6.002792308121894, "grad_norm": 0.2778853476047516, "learning_rate": 7.267845398993685e-06, "loss": 1.2113, "step": 20154 }, { "epoch": 6.003090154321562, "grad_norm": 0.33108317852020264, "learning_rate": 7.266917544326894e-06, "loss": 1.2155, "step": 20155 }, { "epoch": 6.003388000521231, "grad_norm": 0.26440855860710144, "learning_rate": 7.265989715088377e-06, "loss": 1.2172, "step": 20156 }, { "epoch": 6.0036858467208996, "grad_norm": 0.3025246560573578, "learning_rate": 7.265061911286777e-06, "loss": 1.2307, "step": 20157 }, { "epoch": 6.003983692920568, "grad_norm": 0.2890378534793854, "learning_rate": 7.264134132930719e-06, "loss": 1.1996, "step": 20158 }, { "epoch": 6.004281539120237, "grad_norm": 0.32154908776283264, "learning_rate": 7.263206380028833e-06, "loss": 1.2276, "step": 20159 }, { "epoch": 6.004579385319905, "grad_norm": 0.2817894220352173, "learning_rate": 7.262278652589756e-06, "loss": 1.1953, "step": 20160 }, { "epoch": 6.004877231519574, "grad_norm": 0.3327573835849762, "learning_rate": 7.261350950622115e-06, "loss": 1.2209, "step": 20161 }, { "epoch": 6.005175077719243, "grad_norm": 0.2718318700790405, "learning_rate": 7.260423274134547e-06, "loss": 1.1992, "step": 20162 }, { "epoch": 6.005472923918911, "grad_norm": 0.41871002316474915, "learning_rate": 7.25949562313568e-06, "loss": 1.2104, "step": 20163 }, { "epoch": 6.00577077011858, "grad_norm": 0.5034345984458923, "learning_rate": 7.258567997634141e-06, "loss": 1.2115, "step": 20164 }, { "epoch": 6.006068616318249, "grad_norm": 0.25733083486557007, "learning_rate": 7.257640397638567e-06, "loss": 1.2109, "step": 20165 }, { "epoch": 6.006366462517917, "grad_norm": 0.33275723457336426, "learning_rate": 7.256712823157584e-06, "loss": 1.2166, "step": 20166 }, { "epoch": 6.006664308717586, "grad_norm": 0.4351447820663452, "learning_rate": 7.25578527419982e-06, "loss": 1.2281, "step": 20167 }, { "epoch": 6.006962154917255, "grad_norm": 0.31585893034935, "learning_rate": 7.254857750773913e-06, "loss": 1.1977, "step": 20168 }, { "epoch": 6.007260001116923, "grad_norm": 0.3916688561439514, "learning_rate": 7.253930252888487e-06, "loss": 1.203, "step": 20169 }, { "epoch": 6.007557847316592, "grad_norm": 0.3016495108604431, "learning_rate": 7.253002780552167e-06, "loss": 1.1975, "step": 20170 }, { "epoch": 6.007855693516261, "grad_norm": 0.3804759383201599, "learning_rate": 7.252075333773594e-06, "loss": 1.205, "step": 20171 }, { "epoch": 6.008153539715929, "grad_norm": 0.2956993281841278, "learning_rate": 7.251147912561385e-06, "loss": 1.1952, "step": 20172 }, { "epoch": 6.008451385915598, "grad_norm": 0.40233540534973145, "learning_rate": 7.250220516924174e-06, "loss": 1.2127, "step": 20173 }, { "epoch": 6.0087492321152665, "grad_norm": 0.30694687366485596, "learning_rate": 7.24929314687059e-06, "loss": 1.2116, "step": 20174 }, { "epoch": 6.009047078314935, "grad_norm": 0.3779052495956421, "learning_rate": 7.248365802409262e-06, "loss": 1.1989, "step": 20175 }, { "epoch": 6.009344924514604, "grad_norm": 0.4814597964286804, "learning_rate": 7.247438483548811e-06, "loss": 1.2011, "step": 20176 }, { "epoch": 6.009642770714272, "grad_norm": 0.43889567255973816, "learning_rate": 7.246511190297871e-06, "loss": 1.2116, "step": 20177 }, { "epoch": 6.009940616913941, "grad_norm": 0.5229358673095703, "learning_rate": 7.2455839226650696e-06, "loss": 1.2141, "step": 20178 }, { "epoch": 6.01023846311361, "grad_norm": 0.32213860750198364, "learning_rate": 7.244656680659032e-06, "loss": 1.195, "step": 20179 }, { "epoch": 6.010536309313278, "grad_norm": 0.9367316961288452, "learning_rate": 7.243729464288387e-06, "loss": 1.2101, "step": 20180 }, { "epoch": 6.010834155512947, "grad_norm": 0.5796099901199341, "learning_rate": 7.242802273561755e-06, "loss": 1.204, "step": 20181 }, { "epoch": 6.011132001712616, "grad_norm": 0.596804141998291, "learning_rate": 7.241875108487773e-06, "loss": 1.2033, "step": 20182 }, { "epoch": 6.011429847912284, "grad_norm": 0.43341487646102905, "learning_rate": 7.240947969075059e-06, "loss": 1.2018, "step": 20183 }, { "epoch": 6.011727694111953, "grad_norm": 0.49243173003196716, "learning_rate": 7.240020855332241e-06, "loss": 1.2303, "step": 20184 }, { "epoch": 6.012025540311622, "grad_norm": 0.30709803104400635, "learning_rate": 7.239093767267946e-06, "loss": 1.2088, "step": 20185 }, { "epoch": 6.01232338651129, "grad_norm": 0.3742426037788391, "learning_rate": 7.238166704890801e-06, "loss": 1.1972, "step": 20186 }, { "epoch": 6.012621232710959, "grad_norm": 0.3649977743625641, "learning_rate": 7.237239668209425e-06, "loss": 1.2157, "step": 20187 }, { "epoch": 6.0129190789106275, "grad_norm": 0.26593589782714844, "learning_rate": 7.2363126572324505e-06, "loss": 1.2058, "step": 20188 }, { "epoch": 6.013216925110296, "grad_norm": 0.5346266031265259, "learning_rate": 7.235385671968498e-06, "loss": 1.2205, "step": 20189 }, { "epoch": 6.013514771309965, "grad_norm": 0.3156408965587616, "learning_rate": 7.234458712426189e-06, "loss": 1.2021, "step": 20190 }, { "epoch": 6.013812617509633, "grad_norm": 0.45501449704170227, "learning_rate": 7.2335317786141575e-06, "loss": 1.1988, "step": 20191 }, { "epoch": 6.014110463709302, "grad_norm": 0.35815557837486267, "learning_rate": 7.232604870541022e-06, "loss": 1.1927, "step": 20192 }, { "epoch": 6.014408309908971, "grad_norm": 0.30453506112098694, "learning_rate": 7.231677988215402e-06, "loss": 1.2149, "step": 20193 }, { "epoch": 6.014706156108639, "grad_norm": 0.4766293466091156, "learning_rate": 7.230751131645929e-06, "loss": 1.2094, "step": 20194 }, { "epoch": 6.0150040023083085, "grad_norm": 0.29360222816467285, "learning_rate": 7.229824300841219e-06, "loss": 1.2275, "step": 20195 }, { "epoch": 6.015301848507977, "grad_norm": 0.3880390226840973, "learning_rate": 7.228897495809901e-06, "loss": 1.2104, "step": 20196 }, { "epoch": 6.015599694707645, "grad_norm": 0.45414289832115173, "learning_rate": 7.227970716560597e-06, "loss": 1.2102, "step": 20197 }, { "epoch": 6.015897540907314, "grad_norm": 0.35279580950737, "learning_rate": 7.227043963101925e-06, "loss": 1.2236, "step": 20198 }, { "epoch": 6.016195387106983, "grad_norm": 0.48844024538993835, "learning_rate": 7.226117235442515e-06, "loss": 1.2091, "step": 20199 }, { "epoch": 6.016493233306651, "grad_norm": 0.2899201512336731, "learning_rate": 7.225190533590982e-06, "loss": 1.2038, "step": 20200 }, { "epoch": 6.01679107950632, "grad_norm": 0.3832801282405853, "learning_rate": 7.224263857555952e-06, "loss": 1.2166, "step": 20201 }, { "epoch": 6.0170889257059885, "grad_norm": 0.33199718594551086, "learning_rate": 7.223337207346045e-06, "loss": 1.2086, "step": 20202 }, { "epoch": 6.017386771905657, "grad_norm": 0.4868021607398987, "learning_rate": 7.222410582969885e-06, "loss": 1.2091, "step": 20203 }, { "epoch": 6.017684618105326, "grad_norm": 0.31164202094078064, "learning_rate": 7.221483984436089e-06, "loss": 1.2012, "step": 20204 }, { "epoch": 6.017982464304994, "grad_norm": 0.33542874455451965, "learning_rate": 7.220557411753281e-06, "loss": 1.1989, "step": 20205 }, { "epoch": 6.018280310504663, "grad_norm": 0.3539286255836487, "learning_rate": 7.219630864930081e-06, "loss": 1.2085, "step": 20206 }, { "epoch": 6.018578156704332, "grad_norm": 0.2705722749233246, "learning_rate": 7.21870434397511e-06, "loss": 1.2092, "step": 20207 }, { "epoch": 6.018876002904, "grad_norm": 0.3076128363609314, "learning_rate": 7.217777848896985e-06, "loss": 1.1942, "step": 20208 }, { "epoch": 6.0191738491036695, "grad_norm": 0.2987601161003113, "learning_rate": 7.216851379704333e-06, "loss": 1.2177, "step": 20209 }, { "epoch": 6.019471695303338, "grad_norm": 0.2667655050754547, "learning_rate": 7.215924936405764e-06, "loss": 1.2322, "step": 20210 }, { "epoch": 6.019769541503006, "grad_norm": 0.27794232964515686, "learning_rate": 7.214998519009908e-06, "loss": 1.2056, "step": 20211 }, { "epoch": 6.020067387702675, "grad_norm": 0.30565345287323, "learning_rate": 7.214072127525373e-06, "loss": 1.2197, "step": 20212 }, { "epoch": 6.020365233902344, "grad_norm": 0.32783243060112, "learning_rate": 7.213145761960789e-06, "loss": 1.209, "step": 20213 }, { "epoch": 6.020663080102012, "grad_norm": 0.2744307816028595, "learning_rate": 7.212219422324771e-06, "loss": 1.2017, "step": 20214 }, { "epoch": 6.020960926301681, "grad_norm": 0.2722924053668976, "learning_rate": 7.211293108625932e-06, "loss": 1.2091, "step": 20215 }, { "epoch": 6.0212587725013496, "grad_norm": 0.31360188126564026, "learning_rate": 7.2103668208729e-06, "loss": 1.2202, "step": 20216 }, { "epoch": 6.021556618701018, "grad_norm": 0.37984272837638855, "learning_rate": 7.209440559074287e-06, "loss": 1.1931, "step": 20217 }, { "epoch": 6.021854464900687, "grad_norm": 0.29541170597076416, "learning_rate": 7.2085143232387075e-06, "loss": 1.2097, "step": 20218 }, { "epoch": 6.022152311100355, "grad_norm": 0.45290622115135193, "learning_rate": 7.207588113374786e-06, "loss": 1.2064, "step": 20219 }, { "epoch": 6.022450157300025, "grad_norm": 0.32291334867477417, "learning_rate": 7.20666192949114e-06, "loss": 1.1922, "step": 20220 }, { "epoch": 6.022748003499693, "grad_norm": 0.46814969182014465, "learning_rate": 7.205735771596379e-06, "loss": 1.2011, "step": 20221 }, { "epoch": 6.023045849699361, "grad_norm": 0.2647963762283325, "learning_rate": 7.204809639699128e-06, "loss": 1.2162, "step": 20222 }, { "epoch": 6.0233436958990305, "grad_norm": 0.2693672776222229, "learning_rate": 7.2038835338080026e-06, "loss": 1.2214, "step": 20223 }, { "epoch": 6.023641542098699, "grad_norm": 0.3999970555305481, "learning_rate": 7.202957453931611e-06, "loss": 1.2086, "step": 20224 }, { "epoch": 6.023939388298367, "grad_norm": 0.41179734468460083, "learning_rate": 7.202031400078578e-06, "loss": 1.1991, "step": 20225 }, { "epoch": 6.024237234498036, "grad_norm": 0.26541635394096375, "learning_rate": 7.201105372257513e-06, "loss": 1.2184, "step": 20226 }, { "epoch": 6.024535080697705, "grad_norm": 0.2938902676105499, "learning_rate": 7.200179370477042e-06, "loss": 1.2234, "step": 20227 }, { "epoch": 6.024832926897373, "grad_norm": 0.27487459778785706, "learning_rate": 7.199253394745772e-06, "loss": 1.2094, "step": 20228 }, { "epoch": 6.025130773097042, "grad_norm": 0.3359871506690979, "learning_rate": 7.198327445072318e-06, "loss": 1.2182, "step": 20229 }, { "epoch": 6.025428619296711, "grad_norm": 0.3092283606529236, "learning_rate": 7.1974015214652996e-06, "loss": 1.2182, "step": 20230 }, { "epoch": 6.025726465496379, "grad_norm": 0.28621119260787964, "learning_rate": 7.196475623933326e-06, "loss": 1.2198, "step": 20231 }, { "epoch": 6.026024311696048, "grad_norm": 0.7144689559936523, "learning_rate": 7.195549752485013e-06, "loss": 1.2126, "step": 20232 }, { "epoch": 6.0263221578957165, "grad_norm": 0.41839155554771423, "learning_rate": 7.19462390712898e-06, "loss": 1.2127, "step": 20233 }, { "epoch": 6.026620004095386, "grad_norm": 0.5097765922546387, "learning_rate": 7.193698087873838e-06, "loss": 1.2038, "step": 20234 }, { "epoch": 6.026917850295054, "grad_norm": 0.3441077470779419, "learning_rate": 7.1927722947281956e-06, "loss": 1.2117, "step": 20235 }, { "epoch": 6.027215696494722, "grad_norm": 0.4326217770576477, "learning_rate": 7.191846527700672e-06, "loss": 1.2149, "step": 20236 }, { "epoch": 6.0275135426943915, "grad_norm": 0.260591596364975, "learning_rate": 7.190920786799882e-06, "loss": 1.2043, "step": 20237 }, { "epoch": 6.02781138889406, "grad_norm": 0.3409421741962433, "learning_rate": 7.18999507203443e-06, "loss": 1.2193, "step": 20238 }, { "epoch": 6.028109235093728, "grad_norm": 0.35025498270988464, "learning_rate": 7.189069383412941e-06, "loss": 1.2049, "step": 20239 }, { "epoch": 6.028407081293397, "grad_norm": 0.2884001135826111, "learning_rate": 7.188143720944018e-06, "loss": 1.2022, "step": 20240 }, { "epoch": 6.028704927493066, "grad_norm": 0.2892345190048218, "learning_rate": 7.187218084636274e-06, "loss": 1.2, "step": 20241 }, { "epoch": 6.029002773692734, "grad_norm": 0.3615533411502838, "learning_rate": 7.1862924744983244e-06, "loss": 1.2287, "step": 20242 }, { "epoch": 6.029300619892403, "grad_norm": 0.27343472838401794, "learning_rate": 7.18536689053878e-06, "loss": 1.2045, "step": 20243 }, { "epoch": 6.029598466092072, "grad_norm": 0.2636178135871887, "learning_rate": 7.184441332766252e-06, "loss": 1.1812, "step": 20244 }, { "epoch": 6.02989631229174, "grad_norm": 0.3013664782047272, "learning_rate": 7.183515801189353e-06, "loss": 1.222, "step": 20245 }, { "epoch": 6.030194158491409, "grad_norm": 0.2612849175930023, "learning_rate": 7.182590295816689e-06, "loss": 1.2062, "step": 20246 }, { "epoch": 6.0304920046910775, "grad_norm": 0.3013571798801422, "learning_rate": 7.18166481665688e-06, "loss": 1.2186, "step": 20247 }, { "epoch": 6.030789850890747, "grad_norm": 0.27074506878852844, "learning_rate": 7.1807393637185265e-06, "loss": 1.1998, "step": 20248 }, { "epoch": 6.031087697090415, "grad_norm": 0.4318067133426666, "learning_rate": 7.179813937010243e-06, "loss": 1.2042, "step": 20249 }, { "epoch": 6.031385543290083, "grad_norm": 0.31180885434150696, "learning_rate": 7.178888536540642e-06, "loss": 1.1954, "step": 20250 }, { "epoch": 6.031683389489753, "grad_norm": 0.3415888845920563, "learning_rate": 7.177963162318333e-06, "loss": 1.1973, "step": 20251 }, { "epoch": 6.031981235689421, "grad_norm": 0.3148844540119171, "learning_rate": 7.1770378143519195e-06, "loss": 1.2013, "step": 20252 }, { "epoch": 6.032279081889089, "grad_norm": 0.45775938034057617, "learning_rate": 7.1761124926500205e-06, "loss": 1.1995, "step": 20253 }, { "epoch": 6.0325769280887585, "grad_norm": 0.3277266323566437, "learning_rate": 7.175187197221236e-06, "loss": 1.2031, "step": 20254 }, { "epoch": 6.032874774288427, "grad_norm": 0.4142463505268097, "learning_rate": 7.1742619280741756e-06, "loss": 1.2072, "step": 20255 }, { "epoch": 6.033172620488095, "grad_norm": 0.36327052116394043, "learning_rate": 7.173336685217457e-06, "loss": 1.2076, "step": 20256 }, { "epoch": 6.033470466687764, "grad_norm": 0.2709522247314453, "learning_rate": 7.172411468659681e-06, "loss": 1.1982, "step": 20257 }, { "epoch": 6.033768312887433, "grad_norm": 0.45761629939079285, "learning_rate": 7.171486278409454e-06, "loss": 1.1959, "step": 20258 }, { "epoch": 6.034066159087101, "grad_norm": 0.2904505133628845, "learning_rate": 7.17056111447539e-06, "loss": 1.2153, "step": 20259 }, { "epoch": 6.03436400528677, "grad_norm": 0.5569671988487244, "learning_rate": 7.16963597686609e-06, "loss": 1.2045, "step": 20260 }, { "epoch": 6.0346618514864385, "grad_norm": 0.3583231270313263, "learning_rate": 7.168710865590168e-06, "loss": 1.2145, "step": 20261 }, { "epoch": 6.034959697686108, "grad_norm": 0.5513421297073364, "learning_rate": 7.167785780656229e-06, "loss": 1.2144, "step": 20262 }, { "epoch": 6.035257543885776, "grad_norm": 0.4723437428474426, "learning_rate": 7.166860722072876e-06, "loss": 1.2052, "step": 20263 }, { "epoch": 6.035555390085444, "grad_norm": 0.3246673047542572, "learning_rate": 7.165935689848722e-06, "loss": 1.2164, "step": 20264 }, { "epoch": 6.035853236285114, "grad_norm": 0.49947765469551086, "learning_rate": 7.1650106839923685e-06, "loss": 1.1882, "step": 20265 }, { "epoch": 6.036151082484782, "grad_norm": 0.2703419029712677, "learning_rate": 7.1640857045124214e-06, "loss": 1.2139, "step": 20266 }, { "epoch": 6.03644892868445, "grad_norm": 0.49116799235343933, "learning_rate": 7.16316075141749e-06, "loss": 1.2194, "step": 20267 }, { "epoch": 6.0367467748841195, "grad_norm": 0.31984731554985046, "learning_rate": 7.162235824716179e-06, "loss": 1.2122, "step": 20268 }, { "epoch": 6.037044621083788, "grad_norm": 0.42808830738067627, "learning_rate": 7.161310924417091e-06, "loss": 1.2005, "step": 20269 }, { "epoch": 6.037342467283456, "grad_norm": 0.35784581303596497, "learning_rate": 7.160386050528836e-06, "loss": 1.2119, "step": 20270 }, { "epoch": 6.037640313483125, "grad_norm": 0.2613573372364044, "learning_rate": 7.159461203060014e-06, "loss": 1.2133, "step": 20271 }, { "epoch": 6.037938159682794, "grad_norm": 0.5055890679359436, "learning_rate": 7.1585363820192295e-06, "loss": 1.2098, "step": 20272 }, { "epoch": 6.038236005882462, "grad_norm": 0.28298699855804443, "learning_rate": 7.157611587415094e-06, "loss": 1.1979, "step": 20273 }, { "epoch": 6.038533852082131, "grad_norm": 0.3474966287612915, "learning_rate": 7.156686819256208e-06, "loss": 1.1902, "step": 20274 }, { "epoch": 6.0388316982817996, "grad_norm": 0.36944159865379333, "learning_rate": 7.155762077551169e-06, "loss": 1.2224, "step": 20275 }, { "epoch": 6.039129544481469, "grad_norm": 0.2859373390674591, "learning_rate": 7.154837362308591e-06, "loss": 1.2096, "step": 20276 }, { "epoch": 6.039427390681137, "grad_norm": 0.4036981463432312, "learning_rate": 7.153912673537068e-06, "loss": 1.1936, "step": 20277 }, { "epoch": 6.039725236880805, "grad_norm": 0.26265665888786316, "learning_rate": 7.15298801124521e-06, "loss": 1.2139, "step": 20278 }, { "epoch": 6.040023083080475, "grad_norm": 0.2937547266483307, "learning_rate": 7.152063375441619e-06, "loss": 1.2144, "step": 20279 }, { "epoch": 6.040320929280143, "grad_norm": 0.3020278513431549, "learning_rate": 7.151138766134893e-06, "loss": 1.2127, "step": 20280 }, { "epoch": 6.040618775479811, "grad_norm": 0.29109111428260803, "learning_rate": 7.150214183333641e-06, "loss": 1.2049, "step": 20281 }, { "epoch": 6.0409166216794805, "grad_norm": 0.3542512059211731, "learning_rate": 7.149289627046463e-06, "loss": 1.2086, "step": 20282 }, { "epoch": 6.041214467879149, "grad_norm": 0.2816535532474518, "learning_rate": 7.148365097281956e-06, "loss": 1.2058, "step": 20283 }, { "epoch": 6.041512314078817, "grad_norm": 0.35247406363487244, "learning_rate": 7.147440594048728e-06, "loss": 1.2184, "step": 20284 }, { "epoch": 6.041810160278486, "grad_norm": 0.31707391142845154, "learning_rate": 7.1465161173553795e-06, "loss": 1.2087, "step": 20285 }, { "epoch": 6.042108006478155, "grad_norm": 0.36135584115982056, "learning_rate": 7.145591667210507e-06, "loss": 1.2004, "step": 20286 }, { "epoch": 6.042405852677824, "grad_norm": 0.2838849723339081, "learning_rate": 7.1446672436227185e-06, "loss": 1.2278, "step": 20287 }, { "epoch": 6.042703698877492, "grad_norm": 0.31617432832717896, "learning_rate": 7.1437428466006096e-06, "loss": 1.2062, "step": 20288 }, { "epoch": 6.043001545077161, "grad_norm": 0.24839657545089722, "learning_rate": 7.142818476152782e-06, "loss": 1.1983, "step": 20289 }, { "epoch": 6.04329939127683, "grad_norm": 0.2619687616825104, "learning_rate": 7.141894132287836e-06, "loss": 1.2022, "step": 20290 }, { "epoch": 6.043597237476498, "grad_norm": 0.27380067110061646, "learning_rate": 7.1409698150143735e-06, "loss": 1.1957, "step": 20291 }, { "epoch": 6.0438950836761665, "grad_norm": 0.24459058046340942, "learning_rate": 7.140045524340989e-06, "loss": 1.1973, "step": 20292 }, { "epoch": 6.044192929875836, "grad_norm": 0.3787658214569092, "learning_rate": 7.139121260276289e-06, "loss": 1.1999, "step": 20293 }, { "epoch": 6.044490776075504, "grad_norm": 0.2831189036369324, "learning_rate": 7.138197022828867e-06, "loss": 1.2056, "step": 20294 }, { "epoch": 6.044788622275172, "grad_norm": 0.37250658869743347, "learning_rate": 7.137272812007328e-06, "loss": 1.2121, "step": 20295 }, { "epoch": 6.0450864684748415, "grad_norm": 0.2655556797981262, "learning_rate": 7.136348627820264e-06, "loss": 1.1904, "step": 20296 }, { "epoch": 6.04538431467451, "grad_norm": 0.3377954959869385, "learning_rate": 7.1354244702762746e-06, "loss": 1.2136, "step": 20297 }, { "epoch": 6.045682160874178, "grad_norm": 0.25669988989830017, "learning_rate": 7.134500339383964e-06, "loss": 1.224, "step": 20298 }, { "epoch": 6.045980007073847, "grad_norm": 0.289655864238739, "learning_rate": 7.1335762351519264e-06, "loss": 1.2171, "step": 20299 }, { "epoch": 6.046277853273516, "grad_norm": 0.37964531779289246, "learning_rate": 7.132652157588755e-06, "loss": 1.2089, "step": 20300 }, { "epoch": 6.046575699473185, "grad_norm": 0.3691795766353607, "learning_rate": 7.131728106703055e-06, "loss": 1.1989, "step": 20301 }, { "epoch": 6.046873545672853, "grad_norm": 0.2852831184864044, "learning_rate": 7.130804082503422e-06, "loss": 1.2121, "step": 20302 }, { "epoch": 6.047171391872522, "grad_norm": 0.25786951184272766, "learning_rate": 7.129880084998446e-06, "loss": 1.2259, "step": 20303 }, { "epoch": 6.047469238072191, "grad_norm": 0.2877507209777832, "learning_rate": 7.128956114196734e-06, "loss": 1.2264, "step": 20304 }, { "epoch": 6.047767084271859, "grad_norm": 0.26803675293922424, "learning_rate": 7.1280321701068776e-06, "loss": 1.2099, "step": 20305 }, { "epoch": 6.0480649304715275, "grad_norm": 0.294336199760437, "learning_rate": 7.127108252737469e-06, "loss": 1.2089, "step": 20306 }, { "epoch": 6.048362776671197, "grad_norm": 0.41252401471138, "learning_rate": 7.126184362097111e-06, "loss": 1.2175, "step": 20307 }, { "epoch": 6.048660622870865, "grad_norm": 0.2663376033306122, "learning_rate": 7.125260498194397e-06, "loss": 1.2103, "step": 20308 }, { "epoch": 6.048958469070533, "grad_norm": 0.444261372089386, "learning_rate": 7.12433666103792e-06, "loss": 1.2047, "step": 20309 }, { "epoch": 6.049256315270203, "grad_norm": 0.3058907091617584, "learning_rate": 7.123412850636282e-06, "loss": 1.2129, "step": 20310 }, { "epoch": 6.049554161469871, "grad_norm": 0.48705607652664185, "learning_rate": 7.122489066998067e-06, "loss": 1.2101, "step": 20311 }, { "epoch": 6.049852007669539, "grad_norm": 0.38099536299705505, "learning_rate": 7.121565310131882e-06, "loss": 1.2085, "step": 20312 }, { "epoch": 6.0501498538692084, "grad_norm": 0.3769145905971527, "learning_rate": 7.120641580046314e-06, "loss": 1.2155, "step": 20313 }, { "epoch": 6.050447700068877, "grad_norm": 0.46924084424972534, "learning_rate": 7.119717876749956e-06, "loss": 1.2256, "step": 20314 }, { "epoch": 6.050745546268546, "grad_norm": 0.28301212191581726, "learning_rate": 7.11879420025141e-06, "loss": 1.1889, "step": 20315 }, { "epoch": 6.051043392468214, "grad_norm": 0.3728954792022705, "learning_rate": 7.117870550559265e-06, "loss": 1.2152, "step": 20316 }, { "epoch": 6.051341238667883, "grad_norm": 0.2818300127983093, "learning_rate": 7.116946927682109e-06, "loss": 1.2075, "step": 20317 }, { "epoch": 6.051639084867552, "grad_norm": 0.29196274280548096, "learning_rate": 7.116023331628547e-06, "loss": 1.2033, "step": 20318 }, { "epoch": 6.05193693106722, "grad_norm": 0.26948052644729614, "learning_rate": 7.115099762407162e-06, "loss": 1.2144, "step": 20319 }, { "epoch": 6.0522347772668885, "grad_norm": 0.3444169759750366, "learning_rate": 7.114176220026549e-06, "loss": 1.2132, "step": 20320 }, { "epoch": 6.052532623466558, "grad_norm": 0.275411456823349, "learning_rate": 7.113252704495306e-06, "loss": 1.2135, "step": 20321 }, { "epoch": 6.052830469666226, "grad_norm": 0.29189392924308777, "learning_rate": 7.11232921582202e-06, "loss": 1.2255, "step": 20322 }, { "epoch": 6.053128315865894, "grad_norm": 0.37070831656455994, "learning_rate": 7.111405754015283e-06, "loss": 1.2175, "step": 20323 }, { "epoch": 6.053426162065564, "grad_norm": 0.4555090367794037, "learning_rate": 7.110482319083689e-06, "loss": 1.2174, "step": 20324 }, { "epoch": 6.053724008265232, "grad_norm": 0.3041447401046753, "learning_rate": 7.109558911035828e-06, "loss": 1.23, "step": 20325 }, { "epoch": 6.0540218544649, "grad_norm": 0.446586936712265, "learning_rate": 7.108635529880292e-06, "loss": 1.2094, "step": 20326 }, { "epoch": 6.0543197006645695, "grad_norm": 0.4064498841762543, "learning_rate": 7.107712175625674e-06, "loss": 1.1954, "step": 20327 }, { "epoch": 6.054617546864238, "grad_norm": 0.44321534037590027, "learning_rate": 7.106788848280559e-06, "loss": 1.2123, "step": 20328 }, { "epoch": 6.054915393063907, "grad_norm": 0.35596713423728943, "learning_rate": 7.1058655478535454e-06, "loss": 1.1995, "step": 20329 }, { "epoch": 6.055213239263575, "grad_norm": 0.31254100799560547, "learning_rate": 7.104942274353218e-06, "loss": 1.2104, "step": 20330 }, { "epoch": 6.055511085463244, "grad_norm": 0.39108315110206604, "learning_rate": 7.104019027788165e-06, "loss": 1.1914, "step": 20331 }, { "epoch": 6.055808931662913, "grad_norm": 0.30670154094696045, "learning_rate": 7.103095808166984e-06, "loss": 1.2292, "step": 20332 }, { "epoch": 6.056106777862581, "grad_norm": 0.26854562759399414, "learning_rate": 7.102172615498261e-06, "loss": 1.2124, "step": 20333 }, { "epoch": 6.0564046240622496, "grad_norm": 0.3315681219100952, "learning_rate": 7.101249449790578e-06, "loss": 1.1954, "step": 20334 }, { "epoch": 6.056702470261919, "grad_norm": 0.36479222774505615, "learning_rate": 7.100326311052535e-06, "loss": 1.2224, "step": 20335 }, { "epoch": 6.057000316461587, "grad_norm": 0.32198062539100647, "learning_rate": 7.099403199292715e-06, "loss": 1.2185, "step": 20336 }, { "epoch": 6.057298162661255, "grad_norm": 0.2780730724334717, "learning_rate": 7.098480114519703e-06, "loss": 1.2169, "step": 20337 }, { "epoch": 6.057596008860925, "grad_norm": 0.36179402470588684, "learning_rate": 7.0975570567421e-06, "loss": 1.2094, "step": 20338 }, { "epoch": 6.057893855060593, "grad_norm": 0.2977309226989746, "learning_rate": 7.096634025968482e-06, "loss": 1.2253, "step": 20339 }, { "epoch": 6.058191701260261, "grad_norm": 0.2938660681247711, "learning_rate": 7.095711022207438e-06, "loss": 1.2012, "step": 20340 }, { "epoch": 6.0584895474599305, "grad_norm": 0.48642587661743164, "learning_rate": 7.094788045467562e-06, "loss": 1.2172, "step": 20341 }, { "epoch": 6.058787393659599, "grad_norm": 0.3143618106842041, "learning_rate": 7.093865095757433e-06, "loss": 1.2014, "step": 20342 }, { "epoch": 6.059085239859268, "grad_norm": 0.31239813566207886, "learning_rate": 7.092942173085644e-06, "loss": 1.1942, "step": 20343 }, { "epoch": 6.059383086058936, "grad_norm": 0.25440940260887146, "learning_rate": 7.092019277460782e-06, "loss": 1.2137, "step": 20344 }, { "epoch": 6.059680932258605, "grad_norm": 0.3276274502277374, "learning_rate": 7.091096408891429e-06, "loss": 1.2169, "step": 20345 }, { "epoch": 6.059978778458274, "grad_norm": 0.268621027469635, "learning_rate": 7.090173567386177e-06, "loss": 1.2079, "step": 20346 }, { "epoch": 6.060276624657942, "grad_norm": 0.27536237239837646, "learning_rate": 7.089250752953607e-06, "loss": 1.232, "step": 20347 }, { "epoch": 6.060574470857611, "grad_norm": 0.37485766410827637, "learning_rate": 7.0883279656023066e-06, "loss": 1.2133, "step": 20348 }, { "epoch": 6.06087231705728, "grad_norm": 0.25602367520332336, "learning_rate": 7.0874052053408616e-06, "loss": 1.1843, "step": 20349 }, { "epoch": 6.061170163256948, "grad_norm": 0.40816983580589294, "learning_rate": 7.086482472177859e-06, "loss": 1.2167, "step": 20350 }, { "epoch": 6.0614680094566165, "grad_norm": 0.28543126583099365, "learning_rate": 7.085559766121877e-06, "loss": 1.2146, "step": 20351 }, { "epoch": 6.061765855656286, "grad_norm": 0.38882529735565186, "learning_rate": 7.084637087181511e-06, "loss": 1.2167, "step": 20352 }, { "epoch": 6.062063701855954, "grad_norm": 0.3593398928642273, "learning_rate": 7.083714435365337e-06, "loss": 1.211, "step": 20353 }, { "epoch": 6.062361548055623, "grad_norm": 0.3053756356239319, "learning_rate": 7.082791810681942e-06, "loss": 1.2014, "step": 20354 }, { "epoch": 6.0626593942552915, "grad_norm": 0.5559249520301819, "learning_rate": 7.081869213139908e-06, "loss": 1.2064, "step": 20355 }, { "epoch": 6.06295724045496, "grad_norm": 0.36054715514183044, "learning_rate": 7.080946642747825e-06, "loss": 1.2183, "step": 20356 }, { "epoch": 6.063255086654629, "grad_norm": 0.4525575637817383, "learning_rate": 7.080024099514267e-06, "loss": 1.2028, "step": 20357 }, { "epoch": 6.063552932854297, "grad_norm": 0.2647133469581604, "learning_rate": 7.079101583447826e-06, "loss": 1.2196, "step": 20358 }, { "epoch": 6.063850779053966, "grad_norm": 0.5323600172996521, "learning_rate": 7.078179094557078e-06, "loss": 1.2211, "step": 20359 }, { "epoch": 6.064148625253635, "grad_norm": 0.34941011667251587, "learning_rate": 7.077256632850611e-06, "loss": 1.2192, "step": 20360 }, { "epoch": 6.064446471453303, "grad_norm": 0.5039055943489075, "learning_rate": 7.076334198337007e-06, "loss": 1.2113, "step": 20361 }, { "epoch": 6.064744317652972, "grad_norm": 0.3495168089866638, "learning_rate": 7.0754117910248445e-06, "loss": 1.2015, "step": 20362 }, { "epoch": 6.065042163852641, "grad_norm": 0.2851726710796356, "learning_rate": 7.0744894109227095e-06, "loss": 1.2005, "step": 20363 }, { "epoch": 6.065340010052309, "grad_norm": 0.47293710708618164, "learning_rate": 7.073567058039184e-06, "loss": 1.1926, "step": 20364 }, { "epoch": 6.0656378562519775, "grad_norm": 0.2833225429058075, "learning_rate": 7.072644732382842e-06, "loss": 1.2157, "step": 20365 }, { "epoch": 6.065935702451647, "grad_norm": 0.36753979325294495, "learning_rate": 7.071722433962272e-06, "loss": 1.2129, "step": 20366 }, { "epoch": 6.066233548651315, "grad_norm": 0.45584869384765625, "learning_rate": 7.0708001627860556e-06, "loss": 1.215, "step": 20367 }, { "epoch": 6.066531394850984, "grad_norm": 0.2690370976924896, "learning_rate": 7.069877918862766e-06, "loss": 1.1947, "step": 20368 }, { "epoch": 6.066829241050653, "grad_norm": 0.5671363472938538, "learning_rate": 7.068955702200995e-06, "loss": 1.2114, "step": 20369 }, { "epoch": 6.067127087250321, "grad_norm": 0.2720038890838623, "learning_rate": 7.068033512809316e-06, "loss": 1.2191, "step": 20370 }, { "epoch": 6.06742493344999, "grad_norm": 0.5115370154380798, "learning_rate": 7.067111350696303e-06, "loss": 1.2105, "step": 20371 }, { "epoch": 6.0677227796496584, "grad_norm": 0.31888440251350403, "learning_rate": 7.0661892158705455e-06, "loss": 1.2116, "step": 20372 }, { "epoch": 6.068020625849327, "grad_norm": 0.36731308698654175, "learning_rate": 7.065267108340622e-06, "loss": 1.217, "step": 20373 }, { "epoch": 6.068318472048996, "grad_norm": 0.6055471301078796, "learning_rate": 7.064345028115105e-06, "loss": 1.2088, "step": 20374 }, { "epoch": 6.068616318248664, "grad_norm": 0.28859513998031616, "learning_rate": 7.063422975202581e-06, "loss": 1.2283, "step": 20375 }, { "epoch": 6.068914164448333, "grad_norm": 0.6174281239509583, "learning_rate": 7.062500949611622e-06, "loss": 1.2135, "step": 20376 }, { "epoch": 6.069212010648002, "grad_norm": 0.33535486459732056, "learning_rate": 7.061578951350814e-06, "loss": 1.2116, "step": 20377 }, { "epoch": 6.06950985684767, "grad_norm": 0.5999237895011902, "learning_rate": 7.060656980428728e-06, "loss": 1.1964, "step": 20378 }, { "epoch": 6.0698077030473385, "grad_norm": 0.3785956799983978, "learning_rate": 7.059735036853942e-06, "loss": 1.2025, "step": 20379 }, { "epoch": 6.070105549247008, "grad_norm": 0.37391769886016846, "learning_rate": 7.058813120635042e-06, "loss": 1.2212, "step": 20380 }, { "epoch": 6.070403395446676, "grad_norm": 0.4204009473323822, "learning_rate": 7.057891231780598e-06, "loss": 1.2008, "step": 20381 }, { "epoch": 6.070701241646345, "grad_norm": 0.2564176619052887, "learning_rate": 7.056969370299187e-06, "loss": 1.224, "step": 20382 }, { "epoch": 6.070999087846014, "grad_norm": 0.35885563492774963, "learning_rate": 7.0560475361993885e-06, "loss": 1.2079, "step": 20383 }, { "epoch": 6.071296934045682, "grad_norm": 0.35639089345932007, "learning_rate": 7.055125729489782e-06, "loss": 1.1945, "step": 20384 }, { "epoch": 6.071594780245351, "grad_norm": 0.25001823902130127, "learning_rate": 7.054203950178935e-06, "loss": 1.2165, "step": 20385 }, { "epoch": 6.0718926264450195, "grad_norm": 0.41291582584381104, "learning_rate": 7.053282198275433e-06, "loss": 1.2268, "step": 20386 }, { "epoch": 6.072190472644688, "grad_norm": 0.26988789439201355, "learning_rate": 7.05236047378785e-06, "loss": 1.218, "step": 20387 }, { "epoch": 6.072488318844357, "grad_norm": 0.28253674507141113, "learning_rate": 7.051438776724753e-06, "loss": 1.2005, "step": 20388 }, { "epoch": 6.072786165044025, "grad_norm": 0.33627164363861084, "learning_rate": 7.050517107094725e-06, "loss": 1.2046, "step": 20389 }, { "epoch": 6.073084011243694, "grad_norm": 0.3070752024650574, "learning_rate": 7.049595464906345e-06, "loss": 1.1961, "step": 20390 }, { "epoch": 6.073381857443363, "grad_norm": 0.3663565516471863, "learning_rate": 7.048673850168177e-06, "loss": 1.2106, "step": 20391 }, { "epoch": 6.073679703643031, "grad_norm": 0.31832611560821533, "learning_rate": 7.047752262888805e-06, "loss": 1.2044, "step": 20392 }, { "epoch": 6.0739775498426996, "grad_norm": 0.27179139852523804, "learning_rate": 7.046830703076797e-06, "loss": 1.1984, "step": 20393 }, { "epoch": 6.074275396042369, "grad_norm": 0.2872277796268463, "learning_rate": 7.0459091707407335e-06, "loss": 1.2197, "step": 20394 }, { "epoch": 6.074573242242037, "grad_norm": 0.3771313726902008, "learning_rate": 7.044987665889182e-06, "loss": 1.1935, "step": 20395 }, { "epoch": 6.074871088441706, "grad_norm": 0.275558739900589, "learning_rate": 7.0440661885307184e-06, "loss": 1.2022, "step": 20396 }, { "epoch": 6.075168934641375, "grad_norm": 0.43085530400276184, "learning_rate": 7.0431447386739196e-06, "loss": 1.2004, "step": 20397 }, { "epoch": 6.075466780841043, "grad_norm": 0.3149186074733734, "learning_rate": 7.042223316327355e-06, "loss": 1.2117, "step": 20398 }, { "epoch": 6.075764627040712, "grad_norm": 0.3658163547515869, "learning_rate": 7.041301921499592e-06, "loss": 1.2057, "step": 20399 }, { "epoch": 6.0760624732403805, "grad_norm": 0.38250505924224854, "learning_rate": 7.040380554199215e-06, "loss": 1.1872, "step": 20400 }, { "epoch": 6.076360319440049, "grad_norm": 0.26535093784332275, "learning_rate": 7.039459214434789e-06, "loss": 1.2137, "step": 20401 }, { "epoch": 6.076658165639718, "grad_norm": 0.43134433031082153, "learning_rate": 7.038537902214884e-06, "loss": 1.2043, "step": 20402 }, { "epoch": 6.076956011839386, "grad_norm": 0.25832346081733704, "learning_rate": 7.03761661754808e-06, "loss": 1.194, "step": 20403 }, { "epoch": 6.077253858039055, "grad_norm": 0.37801945209503174, "learning_rate": 7.036695360442942e-06, "loss": 1.1986, "step": 20404 }, { "epoch": 6.077551704238724, "grad_norm": 0.2915472388267517, "learning_rate": 7.035774130908041e-06, "loss": 1.2123, "step": 20405 }, { "epoch": 6.077849550438392, "grad_norm": 0.30703651905059814, "learning_rate": 7.034852928951953e-06, "loss": 1.2081, "step": 20406 }, { "epoch": 6.078147396638061, "grad_norm": 0.47424939274787903, "learning_rate": 7.0339317545832435e-06, "loss": 1.2096, "step": 20407 }, { "epoch": 6.07844524283773, "grad_norm": 0.41901594400405884, "learning_rate": 7.033010607810482e-06, "loss": 1.2081, "step": 20408 }, { "epoch": 6.078743089037398, "grad_norm": 0.27409788966178894, "learning_rate": 7.0320894886422474e-06, "loss": 1.1911, "step": 20409 }, { "epoch": 6.079040935237067, "grad_norm": 0.29361626505851746, "learning_rate": 7.0311683970871005e-06, "loss": 1.1985, "step": 20410 }, { "epoch": 6.079338781436736, "grad_norm": 0.2925852835178375, "learning_rate": 7.030247333153617e-06, "loss": 1.2284, "step": 20411 }, { "epoch": 6.079636627636404, "grad_norm": 0.26330602169036865, "learning_rate": 7.029326296850363e-06, "loss": 1.2141, "step": 20412 }, { "epoch": 6.079934473836073, "grad_norm": 0.27293527126312256, "learning_rate": 7.0284052881859085e-06, "loss": 1.2333, "step": 20413 }, { "epoch": 6.0802323200357415, "grad_norm": 0.2640949487686157, "learning_rate": 7.027484307168824e-06, "loss": 1.2126, "step": 20414 }, { "epoch": 6.08053016623541, "grad_norm": 0.26671817898750305, "learning_rate": 7.026563353807678e-06, "loss": 1.1931, "step": 20415 }, { "epoch": 6.080828012435079, "grad_norm": 0.2652629613876343, "learning_rate": 7.025642428111034e-06, "loss": 1.1916, "step": 20416 }, { "epoch": 6.081125858634747, "grad_norm": 0.3108772337436676, "learning_rate": 7.024721530087469e-06, "loss": 1.2156, "step": 20417 }, { "epoch": 6.081423704834416, "grad_norm": 0.2715778648853302, "learning_rate": 7.023800659745541e-06, "loss": 1.216, "step": 20418 }, { "epoch": 6.081721551034085, "grad_norm": 0.26113471388816833, "learning_rate": 7.0228798170938225e-06, "loss": 1.1955, "step": 20419 }, { "epoch": 6.082019397233753, "grad_norm": 0.32121750712394714, "learning_rate": 7.021959002140885e-06, "loss": 1.2216, "step": 20420 }, { "epoch": 6.0823172434334225, "grad_norm": 0.30098626017570496, "learning_rate": 7.021038214895293e-06, "loss": 1.2005, "step": 20421 }, { "epoch": 6.082615089633091, "grad_norm": 0.27612149715423584, "learning_rate": 7.020117455365606e-06, "loss": 1.2129, "step": 20422 }, { "epoch": 6.082912935832759, "grad_norm": 0.2973646819591522, "learning_rate": 7.019196723560402e-06, "loss": 1.2121, "step": 20423 }, { "epoch": 6.083210782032428, "grad_norm": 0.26949751377105713, "learning_rate": 7.01827601948824e-06, "loss": 1.2028, "step": 20424 }, { "epoch": 6.083508628232097, "grad_norm": 0.4442526400089264, "learning_rate": 7.017355343157686e-06, "loss": 1.2217, "step": 20425 }, { "epoch": 6.083806474431765, "grad_norm": 0.2635462284088135, "learning_rate": 7.016434694577312e-06, "loss": 1.2113, "step": 20426 }, { "epoch": 6.084104320631434, "grad_norm": 0.34377890825271606, "learning_rate": 7.015514073755676e-06, "loss": 1.2154, "step": 20427 }, { "epoch": 6.084402166831103, "grad_norm": 0.30079883337020874, "learning_rate": 7.014593480701351e-06, "loss": 1.2039, "step": 20428 }, { "epoch": 6.084700013030771, "grad_norm": 0.3327597975730896, "learning_rate": 7.013672915422898e-06, "loss": 1.2197, "step": 20429 }, { "epoch": 6.08499785923044, "grad_norm": 0.2588975727558136, "learning_rate": 7.012752377928878e-06, "loss": 1.204, "step": 20430 }, { "epoch": 6.0852957054301084, "grad_norm": 0.35409826040267944, "learning_rate": 7.0118318682278615e-06, "loss": 1.2015, "step": 20431 }, { "epoch": 6.085593551629777, "grad_norm": 0.4417869448661804, "learning_rate": 7.010911386328414e-06, "loss": 1.2115, "step": 20432 }, { "epoch": 6.085891397829446, "grad_norm": 0.2775195837020874, "learning_rate": 7.0099909322390915e-06, "loss": 1.2191, "step": 20433 }, { "epoch": 6.086189244029114, "grad_norm": 0.3130930960178375, "learning_rate": 7.009070505968467e-06, "loss": 1.2218, "step": 20434 }, { "epoch": 6.0864870902287835, "grad_norm": 0.25491204857826233, "learning_rate": 7.008150107525098e-06, "loss": 1.2002, "step": 20435 }, { "epoch": 6.086784936428452, "grad_norm": 0.25688236951828003, "learning_rate": 7.007229736917549e-06, "loss": 1.2051, "step": 20436 }, { "epoch": 6.08708278262812, "grad_norm": 0.28710290789604187, "learning_rate": 7.006309394154383e-06, "loss": 1.2025, "step": 20437 }, { "epoch": 6.087380628827789, "grad_norm": 0.2777743637561798, "learning_rate": 7.005389079244167e-06, "loss": 1.2218, "step": 20438 }, { "epoch": 6.087678475027458, "grad_norm": 0.4726444482803345, "learning_rate": 7.004468792195454e-06, "loss": 1.2149, "step": 20439 }, { "epoch": 6.087976321227126, "grad_norm": 0.29624661803245544, "learning_rate": 7.0035485330168175e-06, "loss": 1.2336, "step": 20440 }, { "epoch": 6.088274167426795, "grad_norm": 0.5459936857223511, "learning_rate": 7.00262830171681e-06, "loss": 1.209, "step": 20441 }, { "epoch": 6.088572013626464, "grad_norm": 0.2904707193374634, "learning_rate": 7.001708098303999e-06, "loss": 1.2282, "step": 20442 }, { "epoch": 6.088869859826132, "grad_norm": 0.3669629991054535, "learning_rate": 7.000787922786944e-06, "loss": 1.2053, "step": 20443 }, { "epoch": 6.089167706025801, "grad_norm": 0.4363369941711426, "learning_rate": 6.999867775174205e-06, "loss": 1.2076, "step": 20444 }, { "epoch": 6.0894655522254695, "grad_norm": 0.2777957320213318, "learning_rate": 6.998947655474348e-06, "loss": 1.2063, "step": 20445 }, { "epoch": 6.089763398425138, "grad_norm": 0.3113929033279419, "learning_rate": 6.998027563695931e-06, "loss": 1.2066, "step": 20446 }, { "epoch": 6.090061244624807, "grad_norm": 0.3149489164352417, "learning_rate": 6.997107499847509e-06, "loss": 1.1957, "step": 20447 }, { "epoch": 6.090359090824475, "grad_norm": 0.27551475167274475, "learning_rate": 6.996187463937647e-06, "loss": 1.2202, "step": 20448 }, { "epoch": 6.090656937024145, "grad_norm": 0.330971896648407, "learning_rate": 6.995267455974909e-06, "loss": 1.2157, "step": 20449 }, { "epoch": 6.090954783223813, "grad_norm": 0.28980836272239685, "learning_rate": 6.994347475967844e-06, "loss": 1.2022, "step": 20450 }, { "epoch": 6.091252629423481, "grad_norm": 0.401186466217041, "learning_rate": 6.993427523925024e-06, "loss": 1.1996, "step": 20451 }, { "epoch": 6.09155047562315, "grad_norm": 0.2572411596775055, "learning_rate": 6.992507599855001e-06, "loss": 1.2126, "step": 20452 }, { "epoch": 6.091848321822819, "grad_norm": 0.4092616140842438, "learning_rate": 6.991587703766329e-06, "loss": 1.2134, "step": 20453 }, { "epoch": 6.092146168022487, "grad_norm": 0.3409324288368225, "learning_rate": 6.990667835667575e-06, "loss": 1.1989, "step": 20454 }, { "epoch": 6.092444014222156, "grad_norm": 0.3348819613456726, "learning_rate": 6.989747995567298e-06, "loss": 1.2099, "step": 20455 }, { "epoch": 6.092741860421825, "grad_norm": 0.3193581700325012, "learning_rate": 6.988828183474046e-06, "loss": 1.2028, "step": 20456 }, { "epoch": 6.093039706621493, "grad_norm": 0.38945794105529785, "learning_rate": 6.98790839939639e-06, "loss": 1.2102, "step": 20457 }, { "epoch": 6.093337552821162, "grad_norm": 0.2539597451686859, "learning_rate": 6.986988643342876e-06, "loss": 1.1944, "step": 20458 }, { "epoch": 6.0936353990208305, "grad_norm": 0.4171285927295685, "learning_rate": 6.986068915322071e-06, "loss": 1.2103, "step": 20459 }, { "epoch": 6.093933245220499, "grad_norm": 0.28877657651901245, "learning_rate": 6.985149215342524e-06, "loss": 1.204, "step": 20460 }, { "epoch": 6.094231091420168, "grad_norm": 0.3398493230342865, "learning_rate": 6.984229543412794e-06, "loss": 1.1847, "step": 20461 }, { "epoch": 6.094528937619836, "grad_norm": 0.2786409854888916, "learning_rate": 6.983309899541443e-06, "loss": 1.218, "step": 20462 }, { "epoch": 6.094826783819506, "grad_norm": 0.6020640134811401, "learning_rate": 6.982390283737022e-06, "loss": 1.2103, "step": 20463 }, { "epoch": 6.095124630019174, "grad_norm": 0.3121355473995209, "learning_rate": 6.9814706960080845e-06, "loss": 1.2303, "step": 20464 }, { "epoch": 6.095422476218842, "grad_norm": 0.5204094648361206, "learning_rate": 6.980551136363194e-06, "loss": 1.2075, "step": 20465 }, { "epoch": 6.0957203224185115, "grad_norm": 0.27732616662979126, "learning_rate": 6.979631604810899e-06, "loss": 1.2063, "step": 20466 }, { "epoch": 6.09601816861818, "grad_norm": 0.4900452494621277, "learning_rate": 6.978712101359756e-06, "loss": 1.2041, "step": 20467 }, { "epoch": 6.096316014817848, "grad_norm": 0.28665292263031006, "learning_rate": 6.977792626018325e-06, "loss": 1.2165, "step": 20468 }, { "epoch": 6.096613861017517, "grad_norm": 0.45302802324295044, "learning_rate": 6.976873178795157e-06, "loss": 1.211, "step": 20469 }, { "epoch": 6.096911707217186, "grad_norm": 0.26088371872901917, "learning_rate": 6.9759537596988025e-06, "loss": 1.2011, "step": 20470 }, { "epoch": 6.097209553416854, "grad_norm": 0.36315658688545227, "learning_rate": 6.975034368737821e-06, "loss": 1.2057, "step": 20471 }, { "epoch": 6.097507399616523, "grad_norm": 0.24717873334884644, "learning_rate": 6.974115005920767e-06, "loss": 1.2098, "step": 20472 }, { "epoch": 6.0978052458161915, "grad_norm": 0.26071131229400635, "learning_rate": 6.97319567125619e-06, "loss": 1.2086, "step": 20473 }, { "epoch": 6.09810309201586, "grad_norm": 0.27413856983184814, "learning_rate": 6.9722763647526484e-06, "loss": 1.2052, "step": 20474 }, { "epoch": 6.098400938215529, "grad_norm": 0.33768430352211, "learning_rate": 6.971357086418688e-06, "loss": 1.1973, "step": 20475 }, { "epoch": 6.098698784415197, "grad_norm": 0.3707341253757477, "learning_rate": 6.97043783626287e-06, "loss": 1.2011, "step": 20476 }, { "epoch": 6.098996630614867, "grad_norm": 0.2891037166118622, "learning_rate": 6.9695186142937425e-06, "loss": 1.2058, "step": 20477 }, { "epoch": 6.099294476814535, "grad_norm": 0.277849018573761, "learning_rate": 6.968599420519855e-06, "loss": 1.2151, "step": 20478 }, { "epoch": 6.099592323014203, "grad_norm": 0.37085142731666565, "learning_rate": 6.967680254949767e-06, "loss": 1.1967, "step": 20479 }, { "epoch": 6.0998901692138725, "grad_norm": 0.2706451416015625, "learning_rate": 6.966761117592027e-06, "loss": 1.2112, "step": 20480 }, { "epoch": 6.100188015413541, "grad_norm": 0.40303710103034973, "learning_rate": 6.965842008455181e-06, "loss": 1.2084, "step": 20481 }, { "epoch": 6.100485861613209, "grad_norm": 0.3119714558124542, "learning_rate": 6.9649229275477905e-06, "loss": 1.204, "step": 20482 }, { "epoch": 6.100783707812878, "grad_norm": 0.36651402711868286, "learning_rate": 6.964003874878398e-06, "loss": 1.2106, "step": 20483 }, { "epoch": 6.101081554012547, "grad_norm": 0.4344756603240967, "learning_rate": 6.963084850455556e-06, "loss": 1.2067, "step": 20484 }, { "epoch": 6.101379400212215, "grad_norm": 0.27097341418266296, "learning_rate": 6.96216585428782e-06, "loss": 1.2034, "step": 20485 }, { "epoch": 6.101677246411884, "grad_norm": 0.6429668664932251, "learning_rate": 6.961246886383737e-06, "loss": 1.2117, "step": 20486 }, { "epoch": 6.101975092611553, "grad_norm": 0.29374146461486816, "learning_rate": 6.960327946751852e-06, "loss": 1.1867, "step": 20487 }, { "epoch": 6.102272938811222, "grad_norm": 0.47270968556404114, "learning_rate": 6.959409035400725e-06, "loss": 1.2342, "step": 20488 }, { "epoch": 6.10257078501089, "grad_norm": 0.2763921022415161, "learning_rate": 6.958490152338897e-06, "loss": 1.2199, "step": 20489 }, { "epoch": 6.1028686312105584, "grad_norm": 0.3743135333061218, "learning_rate": 6.9575712975749165e-06, "loss": 1.2095, "step": 20490 }, { "epoch": 6.103166477410228, "grad_norm": 0.28013962507247925, "learning_rate": 6.956652471117342e-06, "loss": 1.2001, "step": 20491 }, { "epoch": 6.103464323609896, "grad_norm": 0.26570209860801697, "learning_rate": 6.955733672974712e-06, "loss": 1.204, "step": 20492 }, { "epoch": 6.103762169809564, "grad_norm": 0.39894533157348633, "learning_rate": 6.95481490315558e-06, "loss": 1.2119, "step": 20493 }, { "epoch": 6.1040600160092335, "grad_norm": 0.29802659153938293, "learning_rate": 6.953896161668494e-06, "loss": 1.2141, "step": 20494 }, { "epoch": 6.104357862208902, "grad_norm": 0.27953803539276123, "learning_rate": 6.952977448522001e-06, "loss": 1.2163, "step": 20495 }, { "epoch": 6.10465570840857, "grad_norm": 0.35821402072906494, "learning_rate": 6.952058763724646e-06, "loss": 1.1876, "step": 20496 }, { "epoch": 6.104953554608239, "grad_norm": 0.2715325653553009, "learning_rate": 6.951140107284983e-06, "loss": 1.2025, "step": 20497 }, { "epoch": 6.105251400807908, "grad_norm": 0.4369836151599884, "learning_rate": 6.9502214792115495e-06, "loss": 1.2112, "step": 20498 }, { "epoch": 6.105549247007576, "grad_norm": 0.3524416983127594, "learning_rate": 6.949302879512903e-06, "loss": 1.2078, "step": 20499 }, { "epoch": 6.105847093207245, "grad_norm": 0.850420355796814, "learning_rate": 6.948384308197582e-06, "loss": 1.21, "step": 20500 }, { "epoch": 6.105847093207245, "eval_loss": 1.3234530687332153, "eval_runtime": 23.9568, "eval_samples_per_second": 72.38, "eval_steps_per_second": 4.55, "step": 20500 }, { "epoch": 6.106144939406914, "grad_norm": 0.3084619641304016, "learning_rate": 6.947465765274135e-06, "loss": 1.2032, "step": 20501 }, { "epoch": 6.106442785606583, "grad_norm": 0.4244225323200226, "learning_rate": 6.94654725075111e-06, "loss": 1.1966, "step": 20502 }, { "epoch": 6.106740631806251, "grad_norm": 0.3955024182796478, "learning_rate": 6.945628764637053e-06, "loss": 1.2156, "step": 20503 }, { "epoch": 6.1070384780059195, "grad_norm": 0.26083919405937195, "learning_rate": 6.944710306940503e-06, "loss": 1.2078, "step": 20504 }, { "epoch": 6.107336324205589, "grad_norm": 0.6535545587539673, "learning_rate": 6.9437918776700145e-06, "loss": 1.2169, "step": 20505 }, { "epoch": 6.107634170405257, "grad_norm": 0.2741965353488922, "learning_rate": 6.942873476834127e-06, "loss": 1.2111, "step": 20506 }, { "epoch": 6.107932016604925, "grad_norm": 0.37694886326789856, "learning_rate": 6.9419551044413825e-06, "loss": 1.2263, "step": 20507 }, { "epoch": 6.108229862804595, "grad_norm": 0.36882150173187256, "learning_rate": 6.941036760500333e-06, "loss": 1.1988, "step": 20508 }, { "epoch": 6.108527709004263, "grad_norm": 0.3224135637283325, "learning_rate": 6.940118445019516e-06, "loss": 1.1923, "step": 20509 }, { "epoch": 6.108825555203931, "grad_norm": 0.46624529361724854, "learning_rate": 6.939200158007482e-06, "loss": 1.2059, "step": 20510 }, { "epoch": 6.1091234014036, "grad_norm": 0.30174317955970764, "learning_rate": 6.9382818994727695e-06, "loss": 1.1994, "step": 20511 }, { "epoch": 6.109421247603269, "grad_norm": 0.28202491998672485, "learning_rate": 6.93736366942392e-06, "loss": 1.2058, "step": 20512 }, { "epoch": 6.109719093802937, "grad_norm": 0.3995566666126251, "learning_rate": 6.936445467869481e-06, "loss": 1.2134, "step": 20513 }, { "epoch": 6.110016940002606, "grad_norm": 0.3717820346355438, "learning_rate": 6.935527294817998e-06, "loss": 1.2148, "step": 20514 }, { "epoch": 6.110314786202275, "grad_norm": 0.44415083527565, "learning_rate": 6.934609150278005e-06, "loss": 1.1948, "step": 20515 }, { "epoch": 6.110612632401944, "grad_norm": 0.453581839799881, "learning_rate": 6.9336910342580525e-06, "loss": 1.1998, "step": 20516 }, { "epoch": 6.110910478601612, "grad_norm": 0.2977471947669983, "learning_rate": 6.932772946766679e-06, "loss": 1.2202, "step": 20517 }, { "epoch": 6.1112083248012805, "grad_norm": 0.479152649641037, "learning_rate": 6.931854887812424e-06, "loss": 1.2124, "step": 20518 }, { "epoch": 6.11150617100095, "grad_norm": 0.3845822513103485, "learning_rate": 6.930936857403832e-06, "loss": 1.2205, "step": 20519 }, { "epoch": 6.111804017200618, "grad_norm": 0.483164519071579, "learning_rate": 6.930018855549445e-06, "loss": 1.2135, "step": 20520 }, { "epoch": 6.112101863400286, "grad_norm": 0.3645203411579132, "learning_rate": 6.929100882257798e-06, "loss": 1.2102, "step": 20521 }, { "epoch": 6.112399709599956, "grad_norm": 0.3148750364780426, "learning_rate": 6.928182937537442e-06, "loss": 1.2293, "step": 20522 }, { "epoch": 6.112697555799624, "grad_norm": 0.3886222541332245, "learning_rate": 6.927265021396909e-06, "loss": 1.213, "step": 20523 }, { "epoch": 6.112995401999292, "grad_norm": 0.287148118019104, "learning_rate": 6.9263471338447416e-06, "loss": 1.2011, "step": 20524 }, { "epoch": 6.1132932481989615, "grad_norm": 0.26246532797813416, "learning_rate": 6.925429274889481e-06, "loss": 1.1939, "step": 20525 }, { "epoch": 6.11359109439863, "grad_norm": 0.47627633810043335, "learning_rate": 6.924511444539663e-06, "loss": 1.1948, "step": 20526 }, { "epoch": 6.113888940598298, "grad_norm": 0.35837310552597046, "learning_rate": 6.923593642803834e-06, "loss": 1.2005, "step": 20527 }, { "epoch": 6.114186786797967, "grad_norm": 0.42851340770721436, "learning_rate": 6.9226758696905295e-06, "loss": 1.2183, "step": 20528 }, { "epoch": 6.114484632997636, "grad_norm": 0.5166046023368835, "learning_rate": 6.921758125208284e-06, "loss": 1.2096, "step": 20529 }, { "epoch": 6.114782479197305, "grad_norm": 0.3526485562324524, "learning_rate": 6.9208404093656415e-06, "loss": 1.2049, "step": 20530 }, { "epoch": 6.115080325396973, "grad_norm": 0.6175981760025024, "learning_rate": 6.919922722171139e-06, "loss": 1.2096, "step": 20531 }, { "epoch": 6.1153781715966415, "grad_norm": 0.2990829050540924, "learning_rate": 6.9190050636333126e-06, "loss": 1.2116, "step": 20532 }, { "epoch": 6.115676017796311, "grad_norm": 0.8120248913764954, "learning_rate": 6.918087433760705e-06, "loss": 1.2086, "step": 20533 }, { "epoch": 6.115973863995979, "grad_norm": 0.2923959195613861, "learning_rate": 6.917169832561852e-06, "loss": 1.2189, "step": 20534 }, { "epoch": 6.116271710195647, "grad_norm": 0.5978054404258728, "learning_rate": 6.9162522600452834e-06, "loss": 1.196, "step": 20535 }, { "epoch": 6.116569556395317, "grad_norm": 0.3772621154785156, "learning_rate": 6.915334716219544e-06, "loss": 1.2195, "step": 20536 }, { "epoch": 6.116867402594985, "grad_norm": 0.3038269579410553, "learning_rate": 6.914417201093172e-06, "loss": 1.222, "step": 20537 }, { "epoch": 6.117165248794653, "grad_norm": 0.555637001991272, "learning_rate": 6.913499714674696e-06, "loss": 1.2295, "step": 20538 }, { "epoch": 6.1174630949943225, "grad_norm": 0.31136155128479004, "learning_rate": 6.912582256972661e-06, "loss": 1.2051, "step": 20539 }, { "epoch": 6.117760941193991, "grad_norm": 0.5317781567573547, "learning_rate": 6.911664827995599e-06, "loss": 1.2104, "step": 20540 }, { "epoch": 6.118058787393659, "grad_norm": 0.48772385716438293, "learning_rate": 6.910747427752041e-06, "loss": 1.2161, "step": 20541 }, { "epoch": 6.118356633593328, "grad_norm": 0.375875324010849, "learning_rate": 6.909830056250527e-06, "loss": 1.2091, "step": 20542 }, { "epoch": 6.118654479792997, "grad_norm": 0.5589781403541565, "learning_rate": 6.908912713499592e-06, "loss": 1.2111, "step": 20543 }, { "epoch": 6.118952325992666, "grad_norm": 0.28380241990089417, "learning_rate": 6.907995399507773e-06, "loss": 1.2244, "step": 20544 }, { "epoch": 6.119250172192334, "grad_norm": 0.6299400925636292, "learning_rate": 6.907078114283603e-06, "loss": 1.1996, "step": 20545 }, { "epoch": 6.119548018392003, "grad_norm": 0.28528720140457153, "learning_rate": 6.906160857835611e-06, "loss": 1.2237, "step": 20546 }, { "epoch": 6.119845864591672, "grad_norm": 0.3071301579475403, "learning_rate": 6.90524363017234e-06, "loss": 1.2006, "step": 20547 }, { "epoch": 6.12014371079134, "grad_norm": 0.40062451362609863, "learning_rate": 6.904326431302317e-06, "loss": 1.2103, "step": 20548 }, { "epoch": 6.1204415569910084, "grad_norm": 0.324465274810791, "learning_rate": 6.903409261234078e-06, "loss": 1.2091, "step": 20549 }, { "epoch": 6.120739403190678, "grad_norm": 0.3610185384750366, "learning_rate": 6.9024921199761575e-06, "loss": 1.2172, "step": 20550 }, { "epoch": 6.121037249390346, "grad_norm": 0.40855681896209717, "learning_rate": 6.901575007537088e-06, "loss": 1.2067, "step": 20551 }, { "epoch": 6.121335095590014, "grad_norm": 0.2959303557872772, "learning_rate": 6.900657923925398e-06, "loss": 1.2246, "step": 20552 }, { "epoch": 6.1216329417896835, "grad_norm": 0.28657275438308716, "learning_rate": 6.899740869149623e-06, "loss": 1.225, "step": 20553 }, { "epoch": 6.121930787989352, "grad_norm": 0.4831201434135437, "learning_rate": 6.8988238432183e-06, "loss": 1.2139, "step": 20554 }, { "epoch": 6.122228634189021, "grad_norm": 0.33637285232543945, "learning_rate": 6.89790684613995e-06, "loss": 1.2026, "step": 20555 }, { "epoch": 6.122526480388689, "grad_norm": 0.4028896689414978, "learning_rate": 6.896989877923117e-06, "loss": 1.2057, "step": 20556 }, { "epoch": 6.122824326588358, "grad_norm": 0.3351418375968933, "learning_rate": 6.896072938576321e-06, "loss": 1.2118, "step": 20557 }, { "epoch": 6.123122172788027, "grad_norm": 0.4150068163871765, "learning_rate": 6.895156028108103e-06, "loss": 1.2179, "step": 20558 }, { "epoch": 6.123420018987695, "grad_norm": 0.32761695981025696, "learning_rate": 6.894239146526986e-06, "loss": 1.2099, "step": 20559 }, { "epoch": 6.123717865187364, "grad_norm": 0.3515329957008362, "learning_rate": 6.893322293841505e-06, "loss": 1.2013, "step": 20560 }, { "epoch": 6.124015711387033, "grad_norm": 0.3351740837097168, "learning_rate": 6.89240547006019e-06, "loss": 1.2028, "step": 20561 }, { "epoch": 6.124313557586701, "grad_norm": 0.28326913714408875, "learning_rate": 6.8914886751915714e-06, "loss": 1.2216, "step": 20562 }, { "epoch": 6.1246114037863695, "grad_norm": 0.29817336797714233, "learning_rate": 6.890571909244173e-06, "loss": 1.2059, "step": 20563 }, { "epoch": 6.124909249986039, "grad_norm": 0.30810728669166565, "learning_rate": 6.889655172226533e-06, "loss": 1.2098, "step": 20564 }, { "epoch": 6.125207096185707, "grad_norm": 0.3400174677371979, "learning_rate": 6.888738464147174e-06, "loss": 1.2043, "step": 20565 }, { "epoch": 6.125504942385375, "grad_norm": 0.41957220435142517, "learning_rate": 6.887821785014627e-06, "loss": 1.2104, "step": 20566 }, { "epoch": 6.125802788585045, "grad_norm": 0.3656337559223175, "learning_rate": 6.886905134837424e-06, "loss": 1.2127, "step": 20567 }, { "epoch": 6.126100634784713, "grad_norm": 0.3066710829734802, "learning_rate": 6.885988513624091e-06, "loss": 1.203, "step": 20568 }, { "epoch": 6.126398480984382, "grad_norm": 0.2674976587295532, "learning_rate": 6.885071921383151e-06, "loss": 1.2173, "step": 20569 }, { "epoch": 6.12669632718405, "grad_norm": 0.2625259459018707, "learning_rate": 6.884155358123141e-06, "loss": 1.219, "step": 20570 }, { "epoch": 6.126994173383719, "grad_norm": 0.46240806579589844, "learning_rate": 6.883238823852583e-06, "loss": 1.2165, "step": 20571 }, { "epoch": 6.127292019583388, "grad_norm": 0.26764822006225586, "learning_rate": 6.8823223185800024e-06, "loss": 1.2049, "step": 20572 }, { "epoch": 6.127589865783056, "grad_norm": 0.5365241765975952, "learning_rate": 6.881405842313934e-06, "loss": 1.2085, "step": 20573 }, { "epoch": 6.127887711982725, "grad_norm": 0.3085898756980896, "learning_rate": 6.880489395062896e-06, "loss": 1.2161, "step": 20574 }, { "epoch": 6.128185558182394, "grad_norm": 0.6672071218490601, "learning_rate": 6.879572976835422e-06, "loss": 1.2084, "step": 20575 }, { "epoch": 6.128483404382062, "grad_norm": 0.2635343372821808, "learning_rate": 6.878656587640036e-06, "loss": 1.1941, "step": 20576 }, { "epoch": 6.1287812505817305, "grad_norm": 0.3361892104148865, "learning_rate": 6.877740227485259e-06, "loss": 1.2068, "step": 20577 }, { "epoch": 6.1290790967814, "grad_norm": 0.42709881067276, "learning_rate": 6.876823896379623e-06, "loss": 1.2112, "step": 20578 }, { "epoch": 6.129376942981068, "grad_norm": 0.2991598844528198, "learning_rate": 6.875907594331652e-06, "loss": 1.2122, "step": 20579 }, { "epoch": 6.129674789180736, "grad_norm": 0.47507810592651367, "learning_rate": 6.874991321349867e-06, "loss": 1.2162, "step": 20580 }, { "epoch": 6.129972635380406, "grad_norm": 0.2950620651245117, "learning_rate": 6.874075077442801e-06, "loss": 1.1809, "step": 20581 }, { "epoch": 6.130270481580074, "grad_norm": 0.3419857323169708, "learning_rate": 6.873158862618972e-06, "loss": 1.2024, "step": 20582 }, { "epoch": 6.130568327779743, "grad_norm": 0.2745928466320038, "learning_rate": 6.872242676886905e-06, "loss": 1.2026, "step": 20583 }, { "epoch": 6.1308661739794115, "grad_norm": 0.2627103328704834, "learning_rate": 6.871326520255126e-06, "loss": 1.1981, "step": 20584 }, { "epoch": 6.13116402017908, "grad_norm": 0.27292487025260925, "learning_rate": 6.870410392732161e-06, "loss": 1.204, "step": 20585 }, { "epoch": 6.131461866378749, "grad_norm": 0.3463563919067383, "learning_rate": 6.869494294326524e-06, "loss": 1.2092, "step": 20586 }, { "epoch": 6.131759712578417, "grad_norm": 0.265866756439209, "learning_rate": 6.868578225046751e-06, "loss": 1.2171, "step": 20587 }, { "epoch": 6.132057558778086, "grad_norm": 0.34672361612319946, "learning_rate": 6.867662184901358e-06, "loss": 1.1981, "step": 20588 }, { "epoch": 6.132355404977755, "grad_norm": 0.358896404504776, "learning_rate": 6.866746173898865e-06, "loss": 1.2177, "step": 20589 }, { "epoch": 6.132653251177423, "grad_norm": 0.3443050682544708, "learning_rate": 6.865830192047803e-06, "loss": 1.2018, "step": 20590 }, { "epoch": 6.1329510973770915, "grad_norm": 0.3751683533191681, "learning_rate": 6.8649142393566845e-06, "loss": 1.2089, "step": 20591 }, { "epoch": 6.133248943576761, "grad_norm": 0.3259476125240326, "learning_rate": 6.863998315834042e-06, "loss": 1.2082, "step": 20592 }, { "epoch": 6.133546789776429, "grad_norm": 0.35596439242362976, "learning_rate": 6.86308242148839e-06, "loss": 1.2041, "step": 20593 }, { "epoch": 6.133844635976097, "grad_norm": 0.2538476288318634, "learning_rate": 6.862166556328247e-06, "loss": 1.2044, "step": 20594 }, { "epoch": 6.134142482175767, "grad_norm": 0.5332903861999512, "learning_rate": 6.861250720362141e-06, "loss": 1.2126, "step": 20595 }, { "epoch": 6.134440328375435, "grad_norm": 0.41133761405944824, "learning_rate": 6.8603349135985915e-06, "loss": 1.2158, "step": 20596 }, { "epoch": 6.134738174575104, "grad_norm": 0.37557145953178406, "learning_rate": 6.859419136046115e-06, "loss": 1.2136, "step": 20597 }, { "epoch": 6.1350360207747725, "grad_norm": 0.29362809658050537, "learning_rate": 6.858503387713239e-06, "loss": 1.2349, "step": 20598 }, { "epoch": 6.135333866974441, "grad_norm": 0.5136938095092773, "learning_rate": 6.857587668608479e-06, "loss": 1.2078, "step": 20599 }, { "epoch": 6.13563171317411, "grad_norm": 0.3157677948474884, "learning_rate": 6.856671978740349e-06, "loss": 1.2042, "step": 20600 }, { "epoch": 6.135929559373778, "grad_norm": 0.4371340870857239, "learning_rate": 6.8557563181173765e-06, "loss": 1.2106, "step": 20601 }, { "epoch": 6.136227405573447, "grad_norm": 0.2592601776123047, "learning_rate": 6.854840686748082e-06, "loss": 1.1945, "step": 20602 }, { "epoch": 6.136525251773116, "grad_norm": 0.5364843606948853, "learning_rate": 6.853925084640977e-06, "loss": 1.1995, "step": 20603 }, { "epoch": 6.136823097972784, "grad_norm": 0.2640324831008911, "learning_rate": 6.853009511804587e-06, "loss": 1.2213, "step": 20604 }, { "epoch": 6.137120944172453, "grad_norm": 0.39366430044174194, "learning_rate": 6.852093968247426e-06, "loss": 1.212, "step": 20605 }, { "epoch": 6.137418790372122, "grad_norm": 0.36515429615974426, "learning_rate": 6.851178453978013e-06, "loss": 1.1881, "step": 20606 }, { "epoch": 6.13771663657179, "grad_norm": 0.3126497268676758, "learning_rate": 6.850262969004866e-06, "loss": 1.2115, "step": 20607 }, { "epoch": 6.1380144827714584, "grad_norm": 0.3741849660873413, "learning_rate": 6.849347513336501e-06, "loss": 1.2258, "step": 20608 }, { "epoch": 6.138312328971128, "grad_norm": 0.2869343161582947, "learning_rate": 6.848432086981444e-06, "loss": 1.204, "step": 20609 }, { "epoch": 6.138610175170796, "grad_norm": 0.3380416929721832, "learning_rate": 6.847516689948203e-06, "loss": 1.2106, "step": 20610 }, { "epoch": 6.138908021370465, "grad_norm": 0.32794860005378723, "learning_rate": 6.846601322245293e-06, "loss": 1.214, "step": 20611 }, { "epoch": 6.1392058675701335, "grad_norm": 0.26346683502197266, "learning_rate": 6.8456859838812396e-06, "loss": 1.1959, "step": 20612 }, { "epoch": 6.139503713769802, "grad_norm": 0.31447237730026245, "learning_rate": 6.8447706748645515e-06, "loss": 1.2156, "step": 20613 }, { "epoch": 6.139801559969471, "grad_norm": 0.26520559191703796, "learning_rate": 6.843855395203744e-06, "loss": 1.2159, "step": 20614 }, { "epoch": 6.140099406169139, "grad_norm": 0.25677022337913513, "learning_rate": 6.842940144907341e-06, "loss": 1.1995, "step": 20615 }, { "epoch": 6.140397252368808, "grad_norm": 0.2796150743961334, "learning_rate": 6.842024923983852e-06, "loss": 1.2141, "step": 20616 }, { "epoch": 6.140695098568477, "grad_norm": 0.27832838892936707, "learning_rate": 6.841109732441788e-06, "loss": 1.1975, "step": 20617 }, { "epoch": 6.140992944768145, "grad_norm": 0.31540924310684204, "learning_rate": 6.840194570289673e-06, "loss": 1.2007, "step": 20618 }, { "epoch": 6.141290790967814, "grad_norm": 0.3285495638847351, "learning_rate": 6.8392794375360174e-06, "loss": 1.2299, "step": 20619 }, { "epoch": 6.141588637167483, "grad_norm": 0.3270295262336731, "learning_rate": 6.838364334189331e-06, "loss": 1.2279, "step": 20620 }, { "epoch": 6.141886483367151, "grad_norm": 0.3397824764251709, "learning_rate": 6.837449260258138e-06, "loss": 1.2167, "step": 20621 }, { "epoch": 6.14218432956682, "grad_norm": 0.3192901015281677, "learning_rate": 6.836534215750945e-06, "loss": 1.1943, "step": 20622 }, { "epoch": 6.142482175766489, "grad_norm": 0.2750777006149292, "learning_rate": 6.835619200676262e-06, "loss": 1.2089, "step": 20623 }, { "epoch": 6.142780021966157, "grad_norm": 0.3177313506603241, "learning_rate": 6.834704215042609e-06, "loss": 1.206, "step": 20624 }, { "epoch": 6.143077868165826, "grad_norm": 0.2615659534931183, "learning_rate": 6.833789258858496e-06, "loss": 1.2098, "step": 20625 }, { "epoch": 6.143375714365495, "grad_norm": 0.33818912506103516, "learning_rate": 6.8328743321324394e-06, "loss": 1.2055, "step": 20626 }, { "epoch": 6.143673560565163, "grad_norm": 0.256344735622406, "learning_rate": 6.8319594348729505e-06, "loss": 1.2114, "step": 20627 }, { "epoch": 6.143971406764832, "grad_norm": 0.2722576856613159, "learning_rate": 6.831044567088533e-06, "loss": 1.1923, "step": 20628 }, { "epoch": 6.1442692529645, "grad_norm": 0.2654699683189392, "learning_rate": 6.830129728787711e-06, "loss": 1.2197, "step": 20629 }, { "epoch": 6.144567099164169, "grad_norm": 0.2782585620880127, "learning_rate": 6.829214919978987e-06, "loss": 1.1987, "step": 20630 }, { "epoch": 6.144864945363838, "grad_norm": 0.3233979344367981, "learning_rate": 6.828300140670873e-06, "loss": 1.204, "step": 20631 }, { "epoch": 6.145162791563506, "grad_norm": 0.2728368937969208, "learning_rate": 6.827385390871888e-06, "loss": 1.1921, "step": 20632 }, { "epoch": 6.145460637763175, "grad_norm": 0.2728160619735718, "learning_rate": 6.826470670590537e-06, "loss": 1.2112, "step": 20633 }, { "epoch": 6.145758483962844, "grad_norm": 0.3183210790157318, "learning_rate": 6.825555979835328e-06, "loss": 1.21, "step": 20634 }, { "epoch": 6.146056330162512, "grad_norm": 0.28341755270957947, "learning_rate": 6.824641318614776e-06, "loss": 1.2078, "step": 20635 }, { "epoch": 6.146354176362181, "grad_norm": 0.2619353234767914, "learning_rate": 6.823726686937388e-06, "loss": 1.222, "step": 20636 }, { "epoch": 6.14665202256185, "grad_norm": 0.28459495306015015, "learning_rate": 6.822812084811672e-06, "loss": 1.2158, "step": 20637 }, { "epoch": 6.146949868761518, "grad_norm": 0.35960063338279724, "learning_rate": 6.821897512246143e-06, "loss": 1.2057, "step": 20638 }, { "epoch": 6.147247714961187, "grad_norm": 0.2918333411216736, "learning_rate": 6.820982969249308e-06, "loss": 1.211, "step": 20639 }, { "epoch": 6.147545561160856, "grad_norm": 0.27319374680519104, "learning_rate": 6.820068455829669e-06, "loss": 1.2031, "step": 20640 }, { "epoch": 6.147843407360524, "grad_norm": 0.2744782269001007, "learning_rate": 6.819153971995745e-06, "loss": 1.2069, "step": 20641 }, { "epoch": 6.148141253560193, "grad_norm": 0.3396849036216736, "learning_rate": 6.8182395177560365e-06, "loss": 1.2267, "step": 20642 }, { "epoch": 6.1484390997598615, "grad_norm": 0.24592465162277222, "learning_rate": 6.817325093119056e-06, "loss": 1.2042, "step": 20643 }, { "epoch": 6.14873694595953, "grad_norm": 0.36841118335723877, "learning_rate": 6.816410698093312e-06, "loss": 1.2175, "step": 20644 }, { "epoch": 6.149034792159199, "grad_norm": 0.2556819021701813, "learning_rate": 6.815496332687304e-06, "loss": 1.213, "step": 20645 }, { "epoch": 6.149332638358867, "grad_norm": 0.3732451796531677, "learning_rate": 6.81458199690955e-06, "loss": 1.2075, "step": 20646 }, { "epoch": 6.149630484558536, "grad_norm": 0.30606967210769653, "learning_rate": 6.8136676907685486e-06, "loss": 1.2126, "step": 20647 }, { "epoch": 6.149928330758205, "grad_norm": 0.3908883035182953, "learning_rate": 6.812753414272808e-06, "loss": 1.2158, "step": 20648 }, { "epoch": 6.150226176957873, "grad_norm": 0.38985610008239746, "learning_rate": 6.811839167430841e-06, "loss": 1.2004, "step": 20649 }, { "epoch": 6.150524023157542, "grad_norm": 0.42454275488853455, "learning_rate": 6.8109249502511465e-06, "loss": 1.219, "step": 20650 }, { "epoch": 6.150821869357211, "grad_norm": 0.30039963126182556, "learning_rate": 6.810010762742229e-06, "loss": 1.2115, "step": 20651 }, { "epoch": 6.151119715556879, "grad_norm": 0.30101659893989563, "learning_rate": 6.8090966049126015e-06, "loss": 1.1975, "step": 20652 }, { "epoch": 6.151417561756548, "grad_norm": 0.3211324214935303, "learning_rate": 6.8081824767707626e-06, "loss": 1.2071, "step": 20653 }, { "epoch": 6.151715407956217, "grad_norm": 0.30729320645332336, "learning_rate": 6.807268378325218e-06, "loss": 1.2117, "step": 20654 }, { "epoch": 6.152013254155885, "grad_norm": 0.3042641282081604, "learning_rate": 6.806354309584477e-06, "loss": 1.216, "step": 20655 }, { "epoch": 6.152311100355554, "grad_norm": 0.24978935718536377, "learning_rate": 6.805440270557042e-06, "loss": 1.2189, "step": 20656 }, { "epoch": 6.1526089465552225, "grad_norm": 0.31557783484458923, "learning_rate": 6.804526261251413e-06, "loss": 1.2128, "step": 20657 }, { "epoch": 6.152906792754891, "grad_norm": 0.2657829523086548, "learning_rate": 6.8036122816760995e-06, "loss": 1.2245, "step": 20658 }, { "epoch": 6.15320463895456, "grad_norm": 0.2974882125854492, "learning_rate": 6.802698331839599e-06, "loss": 1.1951, "step": 20659 }, { "epoch": 6.153502485154228, "grad_norm": 0.28717219829559326, "learning_rate": 6.8017844117504205e-06, "loss": 1.2357, "step": 20660 }, { "epoch": 6.153800331353897, "grad_norm": 0.33680206537246704, "learning_rate": 6.800870521417067e-06, "loss": 1.2107, "step": 20661 }, { "epoch": 6.154098177553566, "grad_norm": 0.2575487494468689, "learning_rate": 6.799956660848034e-06, "loss": 1.213, "step": 20662 }, { "epoch": 6.154396023753234, "grad_norm": 0.43166589736938477, "learning_rate": 6.799042830051834e-06, "loss": 1.1901, "step": 20663 }, { "epoch": 6.1546938699529035, "grad_norm": 0.2783490717411041, "learning_rate": 6.7981290290369615e-06, "loss": 1.2104, "step": 20664 }, { "epoch": 6.154991716152572, "grad_norm": 0.5312780737876892, "learning_rate": 6.797215257811921e-06, "loss": 1.2145, "step": 20665 }, { "epoch": 6.15528956235224, "grad_norm": 0.25795263051986694, "learning_rate": 6.796301516385214e-06, "loss": 1.1946, "step": 20666 }, { "epoch": 6.155587408551909, "grad_norm": 0.28236448764801025, "learning_rate": 6.7953878047653455e-06, "loss": 1.2198, "step": 20667 }, { "epoch": 6.155885254751578, "grad_norm": 0.36620813608169556, "learning_rate": 6.794474122960808e-06, "loss": 1.1893, "step": 20668 }, { "epoch": 6.156183100951246, "grad_norm": 0.2770450711250305, "learning_rate": 6.793560470980111e-06, "loss": 1.2108, "step": 20669 }, { "epoch": 6.156480947150915, "grad_norm": 0.3194955587387085, "learning_rate": 6.79264684883175e-06, "loss": 1.2147, "step": 20670 }, { "epoch": 6.1567787933505835, "grad_norm": 0.24974024295806885, "learning_rate": 6.791733256524225e-06, "loss": 1.2192, "step": 20671 }, { "epoch": 6.157076639550252, "grad_norm": 0.28228843212127686, "learning_rate": 6.79081969406604e-06, "loss": 1.2072, "step": 20672 }, { "epoch": 6.157374485749921, "grad_norm": 0.26873913407325745, "learning_rate": 6.789906161465689e-06, "loss": 1.2217, "step": 20673 }, { "epoch": 6.157672331949589, "grad_norm": 0.26520776748657227, "learning_rate": 6.788992658731679e-06, "loss": 1.2107, "step": 20674 }, { "epoch": 6.157970178149258, "grad_norm": 0.32572123408317566, "learning_rate": 6.7880791858725046e-06, "loss": 1.1901, "step": 20675 }, { "epoch": 6.158268024348927, "grad_norm": 0.26272010803222656, "learning_rate": 6.787165742896659e-06, "loss": 1.2036, "step": 20676 }, { "epoch": 6.158565870548595, "grad_norm": 0.34784695506095886, "learning_rate": 6.786252329812652e-06, "loss": 1.2168, "step": 20677 }, { "epoch": 6.1588637167482645, "grad_norm": 0.26512646675109863, "learning_rate": 6.785338946628977e-06, "loss": 1.2073, "step": 20678 }, { "epoch": 6.159161562947933, "grad_norm": 0.33080342411994934, "learning_rate": 6.784425593354127e-06, "loss": 1.206, "step": 20679 }, { "epoch": 6.159459409147601, "grad_norm": 0.2957414388656616, "learning_rate": 6.783512269996609e-06, "loss": 1.2076, "step": 20680 }, { "epoch": 6.15975725534727, "grad_norm": 0.2581091523170471, "learning_rate": 6.782598976564914e-06, "loss": 1.2217, "step": 20681 }, { "epoch": 6.160055101546939, "grad_norm": 0.27238890528678894, "learning_rate": 6.781685713067538e-06, "loss": 1.2195, "step": 20682 }, { "epoch": 6.160352947746607, "grad_norm": 0.41715824604034424, "learning_rate": 6.780772479512983e-06, "loss": 1.2128, "step": 20683 }, { "epoch": 6.160650793946276, "grad_norm": 0.27436164021492004, "learning_rate": 6.7798592759097446e-06, "loss": 1.2188, "step": 20684 }, { "epoch": 6.160948640145945, "grad_norm": 0.5577759146690369, "learning_rate": 6.778946102266314e-06, "loss": 1.2183, "step": 20685 }, { "epoch": 6.161246486345613, "grad_norm": 0.2873111069202423, "learning_rate": 6.7780329585911965e-06, "loss": 1.2105, "step": 20686 }, { "epoch": 6.161544332545282, "grad_norm": 0.36976149678230286, "learning_rate": 6.777119844892881e-06, "loss": 1.205, "step": 20687 }, { "epoch": 6.16184217874495, "grad_norm": 0.2958317697048187, "learning_rate": 6.77620676117986e-06, "loss": 1.1901, "step": 20688 }, { "epoch": 6.16214002494462, "grad_norm": 0.3038024604320526, "learning_rate": 6.775293707460637e-06, "loss": 1.2088, "step": 20689 }, { "epoch": 6.162437871144288, "grad_norm": 0.3150775730609894, "learning_rate": 6.7743806837437e-06, "loss": 1.2174, "step": 20690 }, { "epoch": 6.162735717343956, "grad_norm": 0.4162593483924866, "learning_rate": 6.7734676900375514e-06, "loss": 1.2216, "step": 20691 }, { "epoch": 6.1630335635436255, "grad_norm": 0.27144870162010193, "learning_rate": 6.772554726350682e-06, "loss": 1.2097, "step": 20692 }, { "epoch": 6.163331409743294, "grad_norm": 0.32726457715034485, "learning_rate": 6.771641792691579e-06, "loss": 1.2005, "step": 20693 }, { "epoch": 6.163629255942962, "grad_norm": 0.2827392518520355, "learning_rate": 6.770728889068747e-06, "loss": 1.2072, "step": 20694 }, { "epoch": 6.163927102142631, "grad_norm": 0.33994659781455994, "learning_rate": 6.769816015490674e-06, "loss": 1.2264, "step": 20695 }, { "epoch": 6.1642249483423, "grad_norm": 0.2557581663131714, "learning_rate": 6.768903171965849e-06, "loss": 1.2027, "step": 20696 }, { "epoch": 6.164522794541968, "grad_norm": 0.2889869213104248, "learning_rate": 6.767990358502776e-06, "loss": 1.1965, "step": 20697 }, { "epoch": 6.164820640741637, "grad_norm": 0.44061148166656494, "learning_rate": 6.767077575109942e-06, "loss": 1.2048, "step": 20698 }, { "epoch": 6.165118486941306, "grad_norm": 0.33885693550109863, "learning_rate": 6.766164821795836e-06, "loss": 1.2063, "step": 20699 }, { "epoch": 6.165416333140974, "grad_norm": 0.2890945076942444, "learning_rate": 6.765252098568953e-06, "loss": 1.207, "step": 20700 }, { "epoch": 6.165714179340643, "grad_norm": 0.296283096075058, "learning_rate": 6.764339405437788e-06, "loss": 1.2044, "step": 20701 }, { "epoch": 6.1660120255403115, "grad_norm": 0.3410748243331909, "learning_rate": 6.7634267424108255e-06, "loss": 1.1972, "step": 20702 }, { "epoch": 6.166309871739981, "grad_norm": 0.3115783929824829, "learning_rate": 6.762514109496565e-06, "loss": 1.2059, "step": 20703 }, { "epoch": 6.166607717939649, "grad_norm": 0.2976948618888855, "learning_rate": 6.761601506703495e-06, "loss": 1.2224, "step": 20704 }, { "epoch": 6.166905564139317, "grad_norm": 0.35263922810554504, "learning_rate": 6.760688934040098e-06, "loss": 1.2024, "step": 20705 }, { "epoch": 6.167203410338987, "grad_norm": 0.3121910095214844, "learning_rate": 6.759776391514876e-06, "loss": 1.1983, "step": 20706 }, { "epoch": 6.167501256538655, "grad_norm": 0.3365488052368164, "learning_rate": 6.758863879136314e-06, "loss": 1.2094, "step": 20707 }, { "epoch": 6.167799102738323, "grad_norm": 0.2783413827419281, "learning_rate": 6.7579513969129006e-06, "loss": 1.209, "step": 20708 }, { "epoch": 6.168096948937992, "grad_norm": 0.3695756196975708, "learning_rate": 6.757038944853131e-06, "loss": 1.1872, "step": 20709 }, { "epoch": 6.168394795137661, "grad_norm": 0.33527496457099915, "learning_rate": 6.756126522965487e-06, "loss": 1.2287, "step": 20710 }, { "epoch": 6.168692641337329, "grad_norm": 0.36470115184783936, "learning_rate": 6.755214131258465e-06, "loss": 1.1932, "step": 20711 }, { "epoch": 6.168990487536998, "grad_norm": 0.4481731057167053, "learning_rate": 6.754301769740548e-06, "loss": 1.2142, "step": 20712 }, { "epoch": 6.169288333736667, "grad_norm": 0.33711037039756775, "learning_rate": 6.753389438420224e-06, "loss": 1.2162, "step": 20713 }, { "epoch": 6.169586179936335, "grad_norm": 0.44622817635536194, "learning_rate": 6.752477137305989e-06, "loss": 1.2085, "step": 20714 }, { "epoch": 6.169884026136004, "grad_norm": 0.274313360452652, "learning_rate": 6.751564866406326e-06, "loss": 1.2047, "step": 20715 }, { "epoch": 6.1701818723356725, "grad_norm": 0.4805906116962433, "learning_rate": 6.750652625729718e-06, "loss": 1.2282, "step": 20716 }, { "epoch": 6.170479718535342, "grad_norm": 0.2658785283565521, "learning_rate": 6.74974041528466e-06, "loss": 1.2098, "step": 20717 }, { "epoch": 6.17077756473501, "grad_norm": 0.39714986085891724, "learning_rate": 6.748828235079636e-06, "loss": 1.2139, "step": 20718 }, { "epoch": 6.171075410934678, "grad_norm": 0.30077528953552246, "learning_rate": 6.74791608512313e-06, "loss": 1.2116, "step": 20719 }, { "epoch": 6.171373257134348, "grad_norm": 0.26328808069229126, "learning_rate": 6.747003965423635e-06, "loss": 1.2043, "step": 20720 }, { "epoch": 6.171671103334016, "grad_norm": 0.3597334325313568, "learning_rate": 6.7460918759896334e-06, "loss": 1.183, "step": 20721 }, { "epoch": 6.171968949533684, "grad_norm": 0.25994551181793213, "learning_rate": 6.745179816829608e-06, "loss": 1.2058, "step": 20722 }, { "epoch": 6.1722667957333535, "grad_norm": 0.2922576665878296, "learning_rate": 6.7442677879520524e-06, "loss": 1.2153, "step": 20723 }, { "epoch": 6.172564641933022, "grad_norm": 0.27364155650138855, "learning_rate": 6.743355789365442e-06, "loss": 1.2084, "step": 20724 }, { "epoch": 6.17286248813269, "grad_norm": 0.26237180829048157, "learning_rate": 6.7424438210782706e-06, "loss": 1.2111, "step": 20725 }, { "epoch": 6.173160334332359, "grad_norm": 0.2967134714126587, "learning_rate": 6.741531883099022e-06, "loss": 1.1999, "step": 20726 }, { "epoch": 6.173458180532028, "grad_norm": 0.29727038741111755, "learning_rate": 6.740619975436174e-06, "loss": 1.2067, "step": 20727 }, { "epoch": 6.173756026731697, "grad_norm": 0.39990097284317017, "learning_rate": 6.739708098098221e-06, "loss": 1.1846, "step": 20728 }, { "epoch": 6.174053872931365, "grad_norm": 0.44286075234413147, "learning_rate": 6.738796251093638e-06, "loss": 1.2186, "step": 20729 }, { "epoch": 6.1743517191310335, "grad_norm": 0.48298612236976624, "learning_rate": 6.737884434430912e-06, "loss": 1.1968, "step": 20730 }, { "epoch": 6.174649565330703, "grad_norm": 0.36918771266937256, "learning_rate": 6.736972648118529e-06, "loss": 1.1972, "step": 20731 }, { "epoch": 6.174947411530371, "grad_norm": 0.2885284125804901, "learning_rate": 6.73606089216497e-06, "loss": 1.2213, "step": 20732 }, { "epoch": 6.175245257730039, "grad_norm": 0.43381965160369873, "learning_rate": 6.735149166578714e-06, "loss": 1.2092, "step": 20733 }, { "epoch": 6.175543103929709, "grad_norm": 0.2930415868759155, "learning_rate": 6.73423747136825e-06, "loss": 1.2052, "step": 20734 }, { "epoch": 6.175840950129377, "grad_norm": 0.3443886339664459, "learning_rate": 6.733325806542057e-06, "loss": 1.2113, "step": 20735 }, { "epoch": 6.176138796329045, "grad_norm": 0.2660033404827118, "learning_rate": 6.732414172108615e-06, "loss": 1.2069, "step": 20736 }, { "epoch": 6.1764366425287145, "grad_norm": 0.35309046506881714, "learning_rate": 6.731502568076413e-06, "loss": 1.2118, "step": 20737 }, { "epoch": 6.176734488728383, "grad_norm": 0.25800788402557373, "learning_rate": 6.730590994453928e-06, "loss": 1.1957, "step": 20738 }, { "epoch": 6.177032334928051, "grad_norm": 0.3657929301261902, "learning_rate": 6.729679451249638e-06, "loss": 1.2179, "step": 20739 }, { "epoch": 6.17733018112772, "grad_norm": 0.28903043270111084, "learning_rate": 6.728767938472028e-06, "loss": 1.1813, "step": 20740 }, { "epoch": 6.177628027327389, "grad_norm": 0.29322391748428345, "learning_rate": 6.727856456129576e-06, "loss": 1.215, "step": 20741 }, { "epoch": 6.177925873527057, "grad_norm": 0.2867197096347809, "learning_rate": 6.726945004230765e-06, "loss": 1.2243, "step": 20742 }, { "epoch": 6.178223719726726, "grad_norm": 0.3531189262866974, "learning_rate": 6.726033582784075e-06, "loss": 1.2009, "step": 20743 }, { "epoch": 6.178521565926395, "grad_norm": 0.2790989279747009, "learning_rate": 6.725122191797981e-06, "loss": 1.2087, "step": 20744 }, { "epoch": 6.178819412126064, "grad_norm": 0.27653998136520386, "learning_rate": 6.724210831280971e-06, "loss": 1.2423, "step": 20745 }, { "epoch": 6.179117258325732, "grad_norm": 0.37640488147735596, "learning_rate": 6.723299501241518e-06, "loss": 1.2186, "step": 20746 }, { "epoch": 6.1794151045254, "grad_norm": 0.2798153758049011, "learning_rate": 6.722388201688099e-06, "loss": 1.2068, "step": 20747 }, { "epoch": 6.17971295072507, "grad_norm": 0.2653747498989105, "learning_rate": 6.721476932629199e-06, "loss": 1.2119, "step": 20748 }, { "epoch": 6.180010796924738, "grad_norm": 0.33432847261428833, "learning_rate": 6.720565694073294e-06, "loss": 1.2048, "step": 20749 }, { "epoch": 6.180308643124406, "grad_norm": 0.26739978790283203, "learning_rate": 6.7196544860288555e-06, "loss": 1.2073, "step": 20750 }, { "epoch": 6.1806064893240755, "grad_norm": 0.3965112566947937, "learning_rate": 6.718743308504374e-06, "loss": 1.2218, "step": 20751 }, { "epoch": 6.180904335523744, "grad_norm": 0.25039783120155334, "learning_rate": 6.717832161508315e-06, "loss": 1.2079, "step": 20752 }, { "epoch": 6.181202181723412, "grad_norm": 0.4873012602329254, "learning_rate": 6.716921045049163e-06, "loss": 1.2127, "step": 20753 }, { "epoch": 6.181500027923081, "grad_norm": 0.28977301716804504, "learning_rate": 6.716009959135392e-06, "loss": 1.2049, "step": 20754 }, { "epoch": 6.18179787412275, "grad_norm": 0.2853916585445404, "learning_rate": 6.715098903775481e-06, "loss": 1.2306, "step": 20755 }, { "epoch": 6.182095720322419, "grad_norm": 0.27583566308021545, "learning_rate": 6.7141878789779e-06, "loss": 1.2248, "step": 20756 }, { "epoch": 6.182393566522087, "grad_norm": 0.3771075904369354, "learning_rate": 6.7132768847511354e-06, "loss": 1.1998, "step": 20757 }, { "epoch": 6.182691412721756, "grad_norm": 0.2786421477794647, "learning_rate": 6.712365921103652e-06, "loss": 1.1987, "step": 20758 }, { "epoch": 6.182989258921425, "grad_norm": 0.36081111431121826, "learning_rate": 6.711454988043933e-06, "loss": 1.2124, "step": 20759 }, { "epoch": 6.183287105121093, "grad_norm": 0.2855590283870697, "learning_rate": 6.710544085580452e-06, "loss": 1.2283, "step": 20760 }, { "epoch": 6.1835849513207615, "grad_norm": 0.26320427656173706, "learning_rate": 6.70963321372168e-06, "loss": 1.2275, "step": 20761 }, { "epoch": 6.183882797520431, "grad_norm": 0.29471349716186523, "learning_rate": 6.7087223724760995e-06, "loss": 1.212, "step": 20762 }, { "epoch": 6.184180643720099, "grad_norm": 0.33346307277679443, "learning_rate": 6.70781156185218e-06, "loss": 1.2134, "step": 20763 }, { "epoch": 6.184478489919767, "grad_norm": 0.31599733233451843, "learning_rate": 6.706900781858389e-06, "loss": 1.2096, "step": 20764 }, { "epoch": 6.184776336119437, "grad_norm": 0.2737686038017273, "learning_rate": 6.705990032503211e-06, "loss": 1.1987, "step": 20765 }, { "epoch": 6.185074182319105, "grad_norm": 0.3117977976799011, "learning_rate": 6.7050793137951185e-06, "loss": 1.2282, "step": 20766 }, { "epoch": 6.185372028518773, "grad_norm": 0.29927635192871094, "learning_rate": 6.704168625742576e-06, "loss": 1.2034, "step": 20767 }, { "epoch": 6.185669874718442, "grad_norm": 0.28418171405792236, "learning_rate": 6.703257968354066e-06, "loss": 1.2164, "step": 20768 }, { "epoch": 6.185967720918111, "grad_norm": 0.28196296095848083, "learning_rate": 6.702347341638059e-06, "loss": 1.2081, "step": 20769 }, { "epoch": 6.18626556711778, "grad_norm": 0.3697756230831146, "learning_rate": 6.70143674560302e-06, "loss": 1.207, "step": 20770 }, { "epoch": 6.186563413317448, "grad_norm": 0.2549345791339874, "learning_rate": 6.7005261802574296e-06, "loss": 1.2085, "step": 20771 }, { "epoch": 6.186861259517117, "grad_norm": 0.327949196100235, "learning_rate": 6.699615645609758e-06, "loss": 1.2056, "step": 20772 }, { "epoch": 6.187159105716786, "grad_norm": 0.2568923532962799, "learning_rate": 6.698705141668473e-06, "loss": 1.2151, "step": 20773 }, { "epoch": 6.187456951916454, "grad_norm": 0.3360218405723572, "learning_rate": 6.697794668442051e-06, "loss": 1.1999, "step": 20774 }, { "epoch": 6.1877547981161225, "grad_norm": 0.2906973958015442, "learning_rate": 6.6968842259389556e-06, "loss": 1.2088, "step": 20775 }, { "epoch": 6.188052644315792, "grad_norm": 0.45060616731643677, "learning_rate": 6.695973814167667e-06, "loss": 1.233, "step": 20776 }, { "epoch": 6.18835049051546, "grad_norm": 0.266433447599411, "learning_rate": 6.695063433136648e-06, "loss": 1.2036, "step": 20777 }, { "epoch": 6.188648336715128, "grad_norm": 0.2839943468570709, "learning_rate": 6.694153082854369e-06, "loss": 1.2134, "step": 20778 }, { "epoch": 6.188946182914798, "grad_norm": 0.4276580214500427, "learning_rate": 6.693242763329308e-06, "loss": 1.2076, "step": 20779 }, { "epoch": 6.189244029114466, "grad_norm": 0.36513426899909973, "learning_rate": 6.692332474569927e-06, "loss": 1.1947, "step": 20780 }, { "epoch": 6.189541875314134, "grad_norm": 0.5294678807258606, "learning_rate": 6.691422216584692e-06, "loss": 1.2127, "step": 20781 }, { "epoch": 6.1898397215138035, "grad_norm": 0.5186043977737427, "learning_rate": 6.690511989382082e-06, "loss": 1.2116, "step": 20782 }, { "epoch": 6.190137567713472, "grad_norm": 0.4168766140937805, "learning_rate": 6.689601792970558e-06, "loss": 1.2145, "step": 20783 }, { "epoch": 6.190435413913141, "grad_norm": 0.34949246048927307, "learning_rate": 6.688691627358587e-06, "loss": 1.2037, "step": 20784 }, { "epoch": 6.190733260112809, "grad_norm": 0.4944717586040497, "learning_rate": 6.687781492554648e-06, "loss": 1.2222, "step": 20785 }, { "epoch": 6.191031106312478, "grad_norm": 0.29145970940589905, "learning_rate": 6.6868713885672e-06, "loss": 1.2034, "step": 20786 }, { "epoch": 6.191328952512147, "grad_norm": 0.6446349620819092, "learning_rate": 6.685961315404708e-06, "loss": 1.2172, "step": 20787 }, { "epoch": 6.191626798711815, "grad_norm": 0.2638005018234253, "learning_rate": 6.6850512730756455e-06, "loss": 1.2138, "step": 20788 }, { "epoch": 6.1919246449114835, "grad_norm": 0.34286072850227356, "learning_rate": 6.684141261588477e-06, "loss": 1.2174, "step": 20789 }, { "epoch": 6.192222491111153, "grad_norm": 0.5390883684158325, "learning_rate": 6.68323128095167e-06, "loss": 1.1962, "step": 20790 }, { "epoch": 6.192520337310821, "grad_norm": 0.33346831798553467, "learning_rate": 6.682321331173691e-06, "loss": 1.2121, "step": 20791 }, { "epoch": 6.192818183510489, "grad_norm": 0.6277686357498169, "learning_rate": 6.6814114122630025e-06, "loss": 1.2138, "step": 20792 }, { "epoch": 6.193116029710159, "grad_norm": 0.31872302293777466, "learning_rate": 6.680501524228077e-06, "loss": 1.2071, "step": 20793 }, { "epoch": 6.193413875909827, "grad_norm": 0.4405876398086548, "learning_rate": 6.679591667077374e-06, "loss": 1.2147, "step": 20794 }, { "epoch": 6.193711722109496, "grad_norm": 0.29229310154914856, "learning_rate": 6.678681840819357e-06, "loss": 1.2191, "step": 20795 }, { "epoch": 6.1940095683091645, "grad_norm": 0.3992125391960144, "learning_rate": 6.6777720454625e-06, "loss": 1.219, "step": 20796 }, { "epoch": 6.194307414508833, "grad_norm": 0.4436438977718353, "learning_rate": 6.676862281015264e-06, "loss": 1.2215, "step": 20797 }, { "epoch": 6.194605260708502, "grad_norm": 0.28378355503082275, "learning_rate": 6.6759525474861055e-06, "loss": 1.2069, "step": 20798 }, { "epoch": 6.19490310690817, "grad_norm": 0.44195660948753357, "learning_rate": 6.675042844883499e-06, "loss": 1.2166, "step": 20799 }, { "epoch": 6.195200953107839, "grad_norm": 0.335060179233551, "learning_rate": 6.674133173215902e-06, "loss": 1.2144, "step": 20800 }, { "epoch": 6.195498799307508, "grad_norm": 0.4112803041934967, "learning_rate": 6.673223532491778e-06, "loss": 1.2142, "step": 20801 }, { "epoch": 6.195796645507176, "grad_norm": 0.37522804737091064, "learning_rate": 6.672313922719597e-06, "loss": 1.1878, "step": 20802 }, { "epoch": 6.196094491706845, "grad_norm": 0.2578239142894745, "learning_rate": 6.671404343907817e-06, "loss": 1.2158, "step": 20803 }, { "epoch": 6.196392337906514, "grad_norm": 0.5630843043327332, "learning_rate": 6.670494796064895e-06, "loss": 1.2154, "step": 20804 }, { "epoch": 6.196690184106182, "grad_norm": 0.2554631233215332, "learning_rate": 6.6695852791993044e-06, "loss": 1.2053, "step": 20805 }, { "epoch": 6.19698803030585, "grad_norm": 0.3429364264011383, "learning_rate": 6.6686757933194965e-06, "loss": 1.2028, "step": 20806 }, { "epoch": 6.19728587650552, "grad_norm": 0.47954127192497253, "learning_rate": 6.6677663384339405e-06, "loss": 1.2335, "step": 20807 }, { "epoch": 6.197583722705188, "grad_norm": 0.26083940267562866, "learning_rate": 6.6668569145511e-06, "loss": 1.2032, "step": 20808 }, { "epoch": 6.197881568904856, "grad_norm": 0.4446563720703125, "learning_rate": 6.665947521679425e-06, "loss": 1.2124, "step": 20809 }, { "epoch": 6.1981794151045255, "grad_norm": 0.41570764780044556, "learning_rate": 6.665038159827391e-06, "loss": 1.2264, "step": 20810 }, { "epoch": 6.198477261304194, "grad_norm": 0.3224899172782898, "learning_rate": 6.664128829003445e-06, "loss": 1.2037, "step": 20811 }, { "epoch": 6.198775107503863, "grad_norm": 0.4793171286582947, "learning_rate": 6.663219529216055e-06, "loss": 1.1908, "step": 20812 }, { "epoch": 6.199072953703531, "grad_norm": 0.32333850860595703, "learning_rate": 6.662310260473679e-06, "loss": 1.2095, "step": 20813 }, { "epoch": 6.1993707999032, "grad_norm": 0.5674697160720825, "learning_rate": 6.661401022784779e-06, "loss": 1.1854, "step": 20814 }, { "epoch": 6.199668646102869, "grad_norm": 0.3417503237724304, "learning_rate": 6.660491816157808e-06, "loss": 1.1886, "step": 20815 }, { "epoch": 6.199966492302537, "grad_norm": 0.36498722434043884, "learning_rate": 6.6595826406012344e-06, "loss": 1.2062, "step": 20816 }, { "epoch": 6.200264338502206, "grad_norm": 0.3774951100349426, "learning_rate": 6.658673496123509e-06, "loss": 1.2164, "step": 20817 }, { "epoch": 6.200562184701875, "grad_norm": 0.26584306359291077, "learning_rate": 6.657764382733094e-06, "loss": 1.2104, "step": 20818 }, { "epoch": 6.200860030901543, "grad_norm": 0.31986191868782043, "learning_rate": 6.656855300438447e-06, "loss": 1.212, "step": 20819 }, { "epoch": 6.2011578771012115, "grad_norm": 0.39728841185569763, "learning_rate": 6.6559462492480295e-06, "loss": 1.2232, "step": 20820 }, { "epoch": 6.201455723300881, "grad_norm": 0.2903613746166229, "learning_rate": 6.655037229170291e-06, "loss": 1.2213, "step": 20821 }, { "epoch": 6.201753569500549, "grad_norm": 0.36729517579078674, "learning_rate": 6.654128240213697e-06, "loss": 1.2195, "step": 20822 }, { "epoch": 6.202051415700218, "grad_norm": 0.29853418469429016, "learning_rate": 6.6532192823867006e-06, "loss": 1.1987, "step": 20823 }, { "epoch": 6.202349261899887, "grad_norm": 0.3352545201778412, "learning_rate": 6.652310355697759e-06, "loss": 1.206, "step": 20824 }, { "epoch": 6.202647108099555, "grad_norm": 0.39215201139450073, "learning_rate": 6.651401460155331e-06, "loss": 1.2086, "step": 20825 }, { "epoch": 6.202944954299224, "grad_norm": 0.24427476525306702, "learning_rate": 6.650492595767868e-06, "loss": 1.2051, "step": 20826 }, { "epoch": 6.203242800498892, "grad_norm": 0.3072798550128937, "learning_rate": 6.649583762543833e-06, "loss": 1.2118, "step": 20827 }, { "epoch": 6.203540646698561, "grad_norm": 0.37357524037361145, "learning_rate": 6.648674960491677e-06, "loss": 1.211, "step": 20828 }, { "epoch": 6.20383849289823, "grad_norm": 0.40315720438957214, "learning_rate": 6.647766189619853e-06, "loss": 1.2026, "step": 20829 }, { "epoch": 6.204136339097898, "grad_norm": 0.2604370415210724, "learning_rate": 6.646857449936821e-06, "loss": 1.2015, "step": 20830 }, { "epoch": 6.204434185297567, "grad_norm": 0.2628321945667267, "learning_rate": 6.645948741451037e-06, "loss": 1.2076, "step": 20831 }, { "epoch": 6.204732031497236, "grad_norm": 0.3860684633255005, "learning_rate": 6.645040064170948e-06, "loss": 1.1966, "step": 20832 }, { "epoch": 6.205029877696904, "grad_norm": 0.31014302372932434, "learning_rate": 6.644131418105017e-06, "loss": 1.2074, "step": 20833 }, { "epoch": 6.2053277238965725, "grad_norm": 0.314895898103714, "learning_rate": 6.643222803261693e-06, "loss": 1.198, "step": 20834 }, { "epoch": 6.205625570096242, "grad_norm": 0.34027209877967834, "learning_rate": 6.642314219649426e-06, "loss": 1.222, "step": 20835 }, { "epoch": 6.20592341629591, "grad_norm": 0.29028600454330444, "learning_rate": 6.6414056672766765e-06, "loss": 1.1941, "step": 20836 }, { "epoch": 6.206221262495579, "grad_norm": 0.302869588136673, "learning_rate": 6.640497146151898e-06, "loss": 1.2186, "step": 20837 }, { "epoch": 6.206519108695248, "grad_norm": 0.34623581171035767, "learning_rate": 6.639588656283535e-06, "loss": 1.2018, "step": 20838 }, { "epoch": 6.206816954894916, "grad_norm": 0.27471330761909485, "learning_rate": 6.638680197680049e-06, "loss": 1.1964, "step": 20839 }, { "epoch": 6.207114801094585, "grad_norm": 0.36948472261428833, "learning_rate": 6.637771770349883e-06, "loss": 1.1875, "step": 20840 }, { "epoch": 6.2074126472942535, "grad_norm": 0.3009609282016754, "learning_rate": 6.636863374301501e-06, "loss": 1.2173, "step": 20841 }, { "epoch": 6.207710493493922, "grad_norm": 0.3019651174545288, "learning_rate": 6.635955009543345e-06, "loss": 1.1965, "step": 20842 }, { "epoch": 6.208008339693591, "grad_norm": 0.3131799101829529, "learning_rate": 6.635046676083865e-06, "loss": 1.2021, "step": 20843 }, { "epoch": 6.208306185893259, "grad_norm": 0.42689722776412964, "learning_rate": 6.634138373931523e-06, "loss": 1.2077, "step": 20844 }, { "epoch": 6.208604032092928, "grad_norm": 0.37339693307876587, "learning_rate": 6.633230103094761e-06, "loss": 1.2147, "step": 20845 }, { "epoch": 6.208901878292597, "grad_norm": 0.4136165678501129, "learning_rate": 6.632321863582027e-06, "loss": 1.2157, "step": 20846 }, { "epoch": 6.209199724492265, "grad_norm": 0.32157769799232483, "learning_rate": 6.631413655401778e-06, "loss": 1.2094, "step": 20847 }, { "epoch": 6.2094975706919335, "grad_norm": 0.3838278353214264, "learning_rate": 6.630505478562464e-06, "loss": 1.2036, "step": 20848 }, { "epoch": 6.209795416891603, "grad_norm": 0.28281065821647644, "learning_rate": 6.629597333072527e-06, "loss": 1.2064, "step": 20849 }, { "epoch": 6.210093263091271, "grad_norm": 0.3274846076965332, "learning_rate": 6.628689218940426e-06, "loss": 1.2194, "step": 20850 }, { "epoch": 6.21039110929094, "grad_norm": 0.3020499646663666, "learning_rate": 6.6277811361746045e-06, "loss": 1.1887, "step": 20851 }, { "epoch": 6.210688955490609, "grad_norm": 0.3482782244682312, "learning_rate": 6.626873084783508e-06, "loss": 1.1968, "step": 20852 }, { "epoch": 6.210986801690277, "grad_norm": 0.3389839231967926, "learning_rate": 6.625965064775592e-06, "loss": 1.2093, "step": 20853 }, { "epoch": 6.211284647889946, "grad_norm": 0.26609256863594055, "learning_rate": 6.625057076159302e-06, "loss": 1.2055, "step": 20854 }, { "epoch": 6.2115824940896145, "grad_norm": 0.3010624647140503, "learning_rate": 6.624149118943082e-06, "loss": 1.2023, "step": 20855 }, { "epoch": 6.211880340289283, "grad_norm": 0.33901360630989075, "learning_rate": 6.623241193135386e-06, "loss": 1.2017, "step": 20856 }, { "epoch": 6.212178186488952, "grad_norm": 0.2817125618457794, "learning_rate": 6.622333298744654e-06, "loss": 1.1984, "step": 20857 }, { "epoch": 6.21247603268862, "grad_norm": 0.5507752895355225, "learning_rate": 6.6214254357793405e-06, "loss": 1.2059, "step": 20858 }, { "epoch": 6.212773878888289, "grad_norm": 0.31725063920021057, "learning_rate": 6.620517604247887e-06, "loss": 1.2163, "step": 20859 }, { "epoch": 6.213071725087958, "grad_norm": 0.438908189535141, "learning_rate": 6.61960980415874e-06, "loss": 1.204, "step": 20860 }, { "epoch": 6.213369571287626, "grad_norm": 0.28866636753082275, "learning_rate": 6.61870203552035e-06, "loss": 1.1937, "step": 20861 }, { "epoch": 6.2136674174872955, "grad_norm": 0.35296639800071716, "learning_rate": 6.61779429834116e-06, "loss": 1.2051, "step": 20862 }, { "epoch": 6.213965263686964, "grad_norm": 0.35828477144241333, "learning_rate": 6.616886592629612e-06, "loss": 1.2092, "step": 20863 }, { "epoch": 6.214263109886632, "grad_norm": 0.2809810936450958, "learning_rate": 6.615978918394158e-06, "loss": 1.2016, "step": 20864 }, { "epoch": 6.214560956086301, "grad_norm": 0.5067834258079529, "learning_rate": 6.6150712756432365e-06, "loss": 1.2066, "step": 20865 }, { "epoch": 6.21485880228597, "grad_norm": 0.3311060667037964, "learning_rate": 6.6141636643852946e-06, "loss": 1.2053, "step": 20866 }, { "epoch": 6.215156648485638, "grad_norm": 0.4814979135990143, "learning_rate": 6.61325608462878e-06, "loss": 1.2064, "step": 20867 }, { "epoch": 6.215454494685307, "grad_norm": 0.3755452334880829, "learning_rate": 6.612348536382134e-06, "loss": 1.2187, "step": 20868 }, { "epoch": 6.2157523408849755, "grad_norm": 0.35468029975891113, "learning_rate": 6.611441019653795e-06, "loss": 1.2054, "step": 20869 }, { "epoch": 6.216050187084644, "grad_norm": 0.32278144359588623, "learning_rate": 6.610533534452215e-06, "loss": 1.1867, "step": 20870 }, { "epoch": 6.216348033284313, "grad_norm": 0.2916470170021057, "learning_rate": 6.609626080785834e-06, "loss": 1.2176, "step": 20871 }, { "epoch": 6.216645879483981, "grad_norm": 0.28612467646598816, "learning_rate": 6.60871865866309e-06, "loss": 1.2096, "step": 20872 }, { "epoch": 6.21694372568365, "grad_norm": 0.362280011177063, "learning_rate": 6.6078112680924345e-06, "loss": 1.2175, "step": 20873 }, { "epoch": 6.217241571883319, "grad_norm": 0.2870917022228241, "learning_rate": 6.606903909082303e-06, "loss": 1.2104, "step": 20874 }, { "epoch": 6.217539418082987, "grad_norm": 0.3332632780075073, "learning_rate": 6.6059965816411406e-06, "loss": 1.1964, "step": 20875 }, { "epoch": 6.217837264282656, "grad_norm": 0.286295622587204, "learning_rate": 6.605089285777389e-06, "loss": 1.2181, "step": 20876 }, { "epoch": 6.218135110482325, "grad_norm": 0.27101877331733704, "learning_rate": 6.604182021499485e-06, "loss": 1.2033, "step": 20877 }, { "epoch": 6.218432956681993, "grad_norm": 0.31318768858909607, "learning_rate": 6.603274788815877e-06, "loss": 1.2013, "step": 20878 }, { "epoch": 6.218730802881662, "grad_norm": 0.3703415095806122, "learning_rate": 6.602367587735001e-06, "loss": 1.205, "step": 20879 }, { "epoch": 6.219028649081331, "grad_norm": 0.332447350025177, "learning_rate": 6.601460418265297e-06, "loss": 1.2155, "step": 20880 }, { "epoch": 6.219326495280999, "grad_norm": 0.31926429271698, "learning_rate": 6.6005532804152095e-06, "loss": 1.2102, "step": 20881 }, { "epoch": 6.219624341480668, "grad_norm": 0.3711044490337372, "learning_rate": 6.599646174193174e-06, "loss": 1.2148, "step": 20882 }, { "epoch": 6.219922187680337, "grad_norm": 0.3418666422367096, "learning_rate": 6.59873909960763e-06, "loss": 1.2011, "step": 20883 }, { "epoch": 6.220220033880005, "grad_norm": 0.5140612721443176, "learning_rate": 6.5978320566670215e-06, "loss": 1.2048, "step": 20884 }, { "epoch": 6.220517880079674, "grad_norm": 0.2515263855457306, "learning_rate": 6.5969250453797854e-06, "loss": 1.1976, "step": 20885 }, { "epoch": 6.220815726279342, "grad_norm": 0.4952830970287323, "learning_rate": 6.596018065754355e-06, "loss": 1.2112, "step": 20886 }, { "epoch": 6.221113572479011, "grad_norm": 0.3173068165779114, "learning_rate": 6.5951111177991774e-06, "loss": 1.2106, "step": 20887 }, { "epoch": 6.22141141867868, "grad_norm": 0.28146082162857056, "learning_rate": 6.594204201522685e-06, "loss": 1.2105, "step": 20888 }, { "epoch": 6.221709264878348, "grad_norm": 0.44297167658805847, "learning_rate": 6.593297316933316e-06, "loss": 1.2002, "step": 20889 }, { "epoch": 6.2220071110780175, "grad_norm": 0.2924647927284241, "learning_rate": 6.592390464039513e-06, "loss": 1.2033, "step": 20890 }, { "epoch": 6.222304957277686, "grad_norm": 0.31262731552124023, "learning_rate": 6.591483642849705e-06, "loss": 1.2185, "step": 20891 }, { "epoch": 6.222602803477354, "grad_norm": 0.27535152435302734, "learning_rate": 6.590576853372337e-06, "loss": 1.2154, "step": 20892 }, { "epoch": 6.222900649677023, "grad_norm": 0.2639579772949219, "learning_rate": 6.589670095615843e-06, "loss": 1.1925, "step": 20893 }, { "epoch": 6.223198495876692, "grad_norm": 0.31021997332572937, "learning_rate": 6.588763369588655e-06, "loss": 1.2049, "step": 20894 }, { "epoch": 6.22349634207636, "grad_norm": 0.2987977862358093, "learning_rate": 6.587856675299213e-06, "loss": 1.2202, "step": 20895 }, { "epoch": 6.223794188276029, "grad_norm": 0.2946363687515259, "learning_rate": 6.586950012755955e-06, "loss": 1.2014, "step": 20896 }, { "epoch": 6.224092034475698, "grad_norm": 0.28557130694389343, "learning_rate": 6.586043381967311e-06, "loss": 1.2024, "step": 20897 }, { "epoch": 6.224389880675366, "grad_norm": 0.28428199887275696, "learning_rate": 6.5851367829417216e-06, "loss": 1.2079, "step": 20898 }, { "epoch": 6.224687726875035, "grad_norm": 0.2734763026237488, "learning_rate": 6.584230215687618e-06, "loss": 1.2088, "step": 20899 }, { "epoch": 6.2249855730747035, "grad_norm": 0.2596544921398163, "learning_rate": 6.583323680213436e-06, "loss": 1.2098, "step": 20900 }, { "epoch": 6.225283419274372, "grad_norm": 0.30075424909591675, "learning_rate": 6.582417176527609e-06, "loss": 1.1872, "step": 20901 }, { "epoch": 6.225581265474041, "grad_norm": 0.2578868269920349, "learning_rate": 6.581510704638574e-06, "loss": 1.2195, "step": 20902 }, { "epoch": 6.225879111673709, "grad_norm": 0.38626062870025635, "learning_rate": 6.5806042645547595e-06, "loss": 1.2001, "step": 20903 }, { "epoch": 6.2261769578733785, "grad_norm": 0.32577237486839294, "learning_rate": 6.5796978562846045e-06, "loss": 1.2029, "step": 20904 }, { "epoch": 6.226474804073047, "grad_norm": 0.27227717638015747, "learning_rate": 6.578791479836537e-06, "loss": 1.2117, "step": 20905 }, { "epoch": 6.226772650272715, "grad_norm": 0.2720992863178253, "learning_rate": 6.577885135218993e-06, "loss": 1.2015, "step": 20906 }, { "epoch": 6.227070496472384, "grad_norm": 0.2652950584888458, "learning_rate": 6.5769788224404075e-06, "loss": 1.2018, "step": 20907 }, { "epoch": 6.227368342672053, "grad_norm": 0.29024332761764526, "learning_rate": 6.576072541509204e-06, "loss": 1.1911, "step": 20908 }, { "epoch": 6.227666188871721, "grad_norm": 0.2996639013290405, "learning_rate": 6.575166292433825e-06, "loss": 1.2186, "step": 20909 }, { "epoch": 6.22796403507139, "grad_norm": 0.250465452671051, "learning_rate": 6.5742600752226985e-06, "loss": 1.212, "step": 20910 }, { "epoch": 6.228261881271059, "grad_norm": 0.3431493043899536, "learning_rate": 6.573353889884249e-06, "loss": 1.2106, "step": 20911 }, { "epoch": 6.228559727470727, "grad_norm": 0.28145286440849304, "learning_rate": 6.572447736426917e-06, "loss": 1.2023, "step": 20912 }, { "epoch": 6.228857573670396, "grad_norm": 0.2633451521396637, "learning_rate": 6.5715416148591295e-06, "loss": 1.2007, "step": 20913 }, { "epoch": 6.2291554198700645, "grad_norm": 0.37861987948417664, "learning_rate": 6.570635525189313e-06, "loss": 1.2104, "step": 20914 }, { "epoch": 6.229453266069733, "grad_norm": 0.26000311970710754, "learning_rate": 6.5697294674259045e-06, "loss": 1.2368, "step": 20915 }, { "epoch": 6.229751112269402, "grad_norm": 0.43028923869132996, "learning_rate": 6.568823441577332e-06, "loss": 1.2118, "step": 20916 }, { "epoch": 6.23004895846907, "grad_norm": 0.25162479281425476, "learning_rate": 6.56791744765202e-06, "loss": 1.1923, "step": 20917 }, { "epoch": 6.23034680466874, "grad_norm": 0.43839678168296814, "learning_rate": 6.567011485658403e-06, "loss": 1.2002, "step": 20918 }, { "epoch": 6.230644650868408, "grad_norm": 0.28519049286842346, "learning_rate": 6.566105555604912e-06, "loss": 1.1909, "step": 20919 }, { "epoch": 6.230942497068076, "grad_norm": 0.4051321744918823, "learning_rate": 6.5651996574999665e-06, "loss": 1.1975, "step": 20920 }, { "epoch": 6.2312403432677455, "grad_norm": 0.27308428287506104, "learning_rate": 6.564293791352006e-06, "loss": 1.2234, "step": 20921 }, { "epoch": 6.231538189467414, "grad_norm": 0.3390137851238251, "learning_rate": 6.563387957169447e-06, "loss": 1.2159, "step": 20922 }, { "epoch": 6.231836035667082, "grad_norm": 0.32019802927970886, "learning_rate": 6.56248215496073e-06, "loss": 1.2053, "step": 20923 }, { "epoch": 6.232133881866751, "grad_norm": 0.27889537811279297, "learning_rate": 6.5615763847342716e-06, "loss": 1.1981, "step": 20924 }, { "epoch": 6.23243172806642, "grad_norm": 0.46111857891082764, "learning_rate": 6.560670646498504e-06, "loss": 1.2231, "step": 20925 }, { "epoch": 6.232729574266088, "grad_norm": 0.27917757630348206, "learning_rate": 6.559764940261855e-06, "loss": 1.2033, "step": 20926 }, { "epoch": 6.233027420465757, "grad_norm": 0.4410606026649475, "learning_rate": 6.558859266032751e-06, "loss": 1.2231, "step": 20927 }, { "epoch": 6.2333252666654255, "grad_norm": 0.32204869389533997, "learning_rate": 6.557953623819612e-06, "loss": 1.2021, "step": 20928 }, { "epoch": 6.233623112865095, "grad_norm": 0.33063867688179016, "learning_rate": 6.557048013630873e-06, "loss": 1.214, "step": 20929 }, { "epoch": 6.233920959064763, "grad_norm": 0.42502111196517944, "learning_rate": 6.556142435474954e-06, "loss": 1.2174, "step": 20930 }, { "epoch": 6.234218805264431, "grad_norm": 0.25938206911087036, "learning_rate": 6.555236889360279e-06, "loss": 1.2172, "step": 20931 }, { "epoch": 6.234516651464101, "grad_norm": 0.42170843482017517, "learning_rate": 6.554331375295281e-06, "loss": 1.2116, "step": 20932 }, { "epoch": 6.234814497663769, "grad_norm": 0.3091077208518982, "learning_rate": 6.553425893288379e-06, "loss": 1.2093, "step": 20933 }, { "epoch": 6.235112343863437, "grad_norm": 0.2852857708930969, "learning_rate": 6.552520443347995e-06, "loss": 1.2077, "step": 20934 }, { "epoch": 6.2354101900631065, "grad_norm": 0.46792224049568176, "learning_rate": 6.551615025482559e-06, "loss": 1.1973, "step": 20935 }, { "epoch": 6.235708036262775, "grad_norm": 0.2661101818084717, "learning_rate": 6.5507096397004934e-06, "loss": 1.2205, "step": 20936 }, { "epoch": 6.236005882462443, "grad_norm": 0.3596484661102295, "learning_rate": 6.549804286010217e-06, "loss": 1.2045, "step": 20937 }, { "epoch": 6.236303728662112, "grad_norm": 0.36223822832107544, "learning_rate": 6.548898964420161e-06, "loss": 1.2203, "step": 20938 }, { "epoch": 6.236601574861781, "grad_norm": 0.2536521553993225, "learning_rate": 6.547993674938741e-06, "loss": 1.2012, "step": 20939 }, { "epoch": 6.236899421061449, "grad_norm": 0.4647725820541382, "learning_rate": 6.547088417574385e-06, "loss": 1.2219, "step": 20940 }, { "epoch": 6.237197267261118, "grad_norm": 0.32785943150520325, "learning_rate": 6.546183192335513e-06, "loss": 1.2062, "step": 20941 }, { "epoch": 6.2374951134607866, "grad_norm": 0.33601462841033936, "learning_rate": 6.545277999230546e-06, "loss": 1.2073, "step": 20942 }, { "epoch": 6.237792959660455, "grad_norm": 0.3900188207626343, "learning_rate": 6.544372838267912e-06, "loss": 1.2045, "step": 20943 }, { "epoch": 6.238090805860124, "grad_norm": 0.29292941093444824, "learning_rate": 6.543467709456026e-06, "loss": 1.2137, "step": 20944 }, { "epoch": 6.238388652059792, "grad_norm": 0.27695026993751526, "learning_rate": 6.542562612803308e-06, "loss": 1.1888, "step": 20945 }, { "epoch": 6.238686498259462, "grad_norm": 0.4734032154083252, "learning_rate": 6.5416575483181855e-06, "loss": 1.2062, "step": 20946 }, { "epoch": 6.23898434445913, "grad_norm": 0.2380705028772354, "learning_rate": 6.5407525160090745e-06, "loss": 1.2037, "step": 20947 }, { "epoch": 6.239282190658798, "grad_norm": 0.3729903995990753, "learning_rate": 6.539847515884394e-06, "loss": 1.2075, "step": 20948 }, { "epoch": 6.2395800368584675, "grad_norm": 0.3799933195114136, "learning_rate": 6.53894254795257e-06, "loss": 1.2212, "step": 20949 }, { "epoch": 6.239877883058136, "grad_norm": 0.37151652574539185, "learning_rate": 6.538037612222019e-06, "loss": 1.1912, "step": 20950 }, { "epoch": 6.240175729257804, "grad_norm": 0.32603856921195984, "learning_rate": 6.537132708701157e-06, "loss": 1.213, "step": 20951 }, { "epoch": 6.240473575457473, "grad_norm": 0.2977096736431122, "learning_rate": 6.536227837398409e-06, "loss": 1.2204, "step": 20952 }, { "epoch": 6.240771421657142, "grad_norm": 0.2917332351207733, "learning_rate": 6.535322998322189e-06, "loss": 1.1913, "step": 20953 }, { "epoch": 6.24106926785681, "grad_norm": 0.2621777057647705, "learning_rate": 6.534418191480916e-06, "loss": 1.1929, "step": 20954 }, { "epoch": 6.241367114056479, "grad_norm": 0.3363119065761566, "learning_rate": 6.533513416883014e-06, "loss": 1.2104, "step": 20955 }, { "epoch": 6.241664960256148, "grad_norm": 0.27684512734413147, "learning_rate": 6.5326086745368925e-06, "loss": 1.2169, "step": 20956 }, { "epoch": 6.241962806455817, "grad_norm": 0.34454455971717834, "learning_rate": 6.531703964450977e-06, "loss": 1.201, "step": 20957 }, { "epoch": 6.242260652655485, "grad_norm": 0.29072514176368713, "learning_rate": 6.530799286633679e-06, "loss": 1.1902, "step": 20958 }, { "epoch": 6.2425584988551535, "grad_norm": 0.2903517186641693, "learning_rate": 6.529894641093417e-06, "loss": 1.2001, "step": 20959 }, { "epoch": 6.242856345054823, "grad_norm": 0.3266454339027405, "learning_rate": 6.52899002783861e-06, "loss": 1.2088, "step": 20960 }, { "epoch": 6.243154191254491, "grad_norm": 0.25888121128082275, "learning_rate": 6.5280854468776745e-06, "loss": 1.2099, "step": 20961 }, { "epoch": 6.243452037454159, "grad_norm": 0.35930338501930237, "learning_rate": 6.52718089821902e-06, "loss": 1.2153, "step": 20962 }, { "epoch": 6.2437498836538285, "grad_norm": 0.24772289395332336, "learning_rate": 6.52627638187107e-06, "loss": 1.1967, "step": 20963 }, { "epoch": 6.244047729853497, "grad_norm": 0.2683066129684448, "learning_rate": 6.525371897842239e-06, "loss": 1.1901, "step": 20964 }, { "epoch": 6.244345576053165, "grad_norm": 0.31842073798179626, "learning_rate": 6.524467446140935e-06, "loss": 1.2013, "step": 20965 }, { "epoch": 6.244643422252834, "grad_norm": 0.25482967495918274, "learning_rate": 6.523563026775583e-06, "loss": 1.1959, "step": 20966 }, { "epoch": 6.244941268452503, "grad_norm": 0.41225674748420715, "learning_rate": 6.5226586397545935e-06, "loss": 1.2045, "step": 20967 }, { "epoch": 6.245239114652171, "grad_norm": 0.26212966442108154, "learning_rate": 6.521754285086377e-06, "loss": 1.2079, "step": 20968 }, { "epoch": 6.24553696085184, "grad_norm": 0.30973610281944275, "learning_rate": 6.520849962779353e-06, "loss": 1.2186, "step": 20969 }, { "epoch": 6.245834807051509, "grad_norm": 0.4781585931777954, "learning_rate": 6.519945672841932e-06, "loss": 1.21, "step": 20970 }, { "epoch": 6.246132653251178, "grad_norm": 0.3428117334842682, "learning_rate": 6.519041415282525e-06, "loss": 1.2135, "step": 20971 }, { "epoch": 6.246430499450846, "grad_norm": 0.38383954763412476, "learning_rate": 6.518137190109554e-06, "loss": 1.2013, "step": 20972 }, { "epoch": 6.2467283456505145, "grad_norm": 0.2761189043521881, "learning_rate": 6.517232997331422e-06, "loss": 1.2122, "step": 20973 }, { "epoch": 6.247026191850184, "grad_norm": 0.34829890727996826, "learning_rate": 6.516328836956551e-06, "loss": 1.2013, "step": 20974 }, { "epoch": 6.247324038049852, "grad_norm": 0.5543403625488281, "learning_rate": 6.515424708993345e-06, "loss": 1.2089, "step": 20975 }, { "epoch": 6.24762188424952, "grad_norm": 0.4369713366031647, "learning_rate": 6.514520613450217e-06, "loss": 1.1956, "step": 20976 }, { "epoch": 6.24791973044919, "grad_norm": 0.4368632435798645, "learning_rate": 6.513616550335582e-06, "loss": 1.1939, "step": 20977 }, { "epoch": 6.248217576648858, "grad_norm": 0.296641081571579, "learning_rate": 6.512712519657852e-06, "loss": 1.2088, "step": 20978 }, { "epoch": 6.248515422848526, "grad_norm": 0.5300294160842896, "learning_rate": 6.5118085214254316e-06, "loss": 1.2153, "step": 20979 }, { "epoch": 6.2488132690481955, "grad_norm": 0.5507962107658386, "learning_rate": 6.51090455564674e-06, "loss": 1.2076, "step": 20980 }, { "epoch": 6.249111115247864, "grad_norm": 0.45971810817718506, "learning_rate": 6.510000622330181e-06, "loss": 1.2169, "step": 20981 }, { "epoch": 6.249408961447532, "grad_norm": 0.6372692584991455, "learning_rate": 6.509096721484167e-06, "loss": 1.1966, "step": 20982 }, { "epoch": 6.249706807647201, "grad_norm": 0.3024199903011322, "learning_rate": 6.508192853117108e-06, "loss": 1.2039, "step": 20983 }, { "epoch": 6.25000465384687, "grad_norm": 0.5083332061767578, "learning_rate": 6.507289017237415e-06, "loss": 1.2128, "step": 20984 }, { "epoch": 6.250302500046539, "grad_norm": 0.2918650507926941, "learning_rate": 6.50638521385349e-06, "loss": 1.2117, "step": 20985 }, { "epoch": 6.250600346246207, "grad_norm": 0.31115829944610596, "learning_rate": 6.505481442973753e-06, "loss": 1.215, "step": 20986 }, { "epoch": 6.2508981924458755, "grad_norm": 0.35847991704940796, "learning_rate": 6.504577704606605e-06, "loss": 1.2101, "step": 20987 }, { "epoch": 6.251196038645545, "grad_norm": 0.3025646507740021, "learning_rate": 6.503673998760456e-06, "loss": 1.2197, "step": 20988 }, { "epoch": 6.251493884845213, "grad_norm": 0.3011787533760071, "learning_rate": 6.502770325443713e-06, "loss": 1.2, "step": 20989 }, { "epoch": 6.251791731044881, "grad_norm": 0.31051358580589294, "learning_rate": 6.501866684664784e-06, "loss": 1.2001, "step": 20990 }, { "epoch": 6.252089577244551, "grad_norm": 0.2914946675300598, "learning_rate": 6.5009630764320795e-06, "loss": 1.2113, "step": 20991 }, { "epoch": 6.252387423444219, "grad_norm": 0.26340430974960327, "learning_rate": 6.500059500754006e-06, "loss": 1.2135, "step": 20992 }, { "epoch": 6.252685269643887, "grad_norm": 0.3000192642211914, "learning_rate": 6.499155957638963e-06, "loss": 1.2107, "step": 20993 }, { "epoch": 6.2529831158435565, "grad_norm": 0.2933465838432312, "learning_rate": 6.498252447095366e-06, "loss": 1.2105, "step": 20994 }, { "epoch": 6.253280962043225, "grad_norm": 0.3004761338233948, "learning_rate": 6.4973489691316184e-06, "loss": 1.2072, "step": 20995 }, { "epoch": 6.253578808242894, "grad_norm": 0.3727788031101227, "learning_rate": 6.496445523756122e-06, "loss": 1.217, "step": 20996 }, { "epoch": 6.253876654442562, "grad_norm": 0.29946908354759216, "learning_rate": 6.495542110977288e-06, "loss": 1.2101, "step": 20997 }, { "epoch": 6.254174500642231, "grad_norm": 0.2809601426124573, "learning_rate": 6.494638730803522e-06, "loss": 1.2028, "step": 20998 }, { "epoch": 6.2544723468419, "grad_norm": 0.3155931234359741, "learning_rate": 6.493735383243221e-06, "loss": 1.2228, "step": 20999 }, { "epoch": 6.254770193041568, "grad_norm": 0.2671355605125427, "learning_rate": 6.492832068304796e-06, "loss": 1.19, "step": 21000 }, { "epoch": 6.254770193041568, "eval_loss": 1.3171136379241943, "eval_runtime": 25.842, "eval_samples_per_second": 67.1, "eval_steps_per_second": 4.218, "step": 21000 }, { "epoch": 6.254770193041568, "step": 21000, "total_flos": 2.2989868385333726e+20, "train_loss": 1.263435139360882, "train_runtime": 585194.7226, "train_samples_per_second": 29.375, "train_steps_per_second": 0.057 } ], "logging_steps": 1, "max_steps": 33570, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2989868385333726e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }