diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20990764063811923, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00020990764063811922, + "grad_norm": 0.8211116790771484, + "learning_rate": 2.09643605870021e-07, + "loss": 0.9775, + "step": 1 + }, + { + "epoch": 0.00041981528127623844, + "grad_norm": 0.7017123103141785, + "learning_rate": 4.19287211740042e-07, + "loss": 0.868, + "step": 2 + }, + { + "epoch": 0.0006297229219143577, + "grad_norm": 0.6900752782821655, + "learning_rate": 6.28930817610063e-07, + "loss": 0.8862, + "step": 3 + }, + { + "epoch": 0.0008396305625524769, + "grad_norm": 0.7597951292991638, + "learning_rate": 8.38574423480084e-07, + "loss": 0.923, + "step": 4 + }, + { + "epoch": 0.0010495382031905961, + "grad_norm": 0.8734421730041504, + "learning_rate": 1.048218029350105e-06, + "loss": 1.0745, + "step": 5 + }, + { + "epoch": 0.0012594458438287153, + "grad_norm": 0.8313158750534058, + "learning_rate": 1.257861635220126e-06, + "loss": 0.9817, + "step": 6 + }, + { + "epoch": 0.0014693534844668346, + "grad_norm": 0.656913697719574, + "learning_rate": 1.467505241090147e-06, + "loss": 0.8544, + "step": 7 + }, + { + "epoch": 0.0016792611251049538, + "grad_norm": 0.7147523164749146, + "learning_rate": 1.677148846960168e-06, + "loss": 0.8955, + "step": 8 + }, + { + "epoch": 0.001889168765743073, + "grad_norm": 0.753795325756073, + "learning_rate": 1.8867924528301887e-06, + "loss": 0.881, + "step": 9 + }, + { + "epoch": 0.0020990764063811922, + "grad_norm": 0.7141973972320557, + "learning_rate": 2.09643605870021e-06, + "loss": 0.9229, + "step": 10 + }, + { + "epoch": 0.0023089840470193117, + "grad_norm": 0.8047632575035095, + "learning_rate": 2.306079664570231e-06, + "loss": 0.9438, + "step": 11 + }, + { + "epoch": 0.0025188916876574307, + "grad_norm": 0.8492773175239563, + "learning_rate": 2.515723270440252e-06, + "loss": 0.9977, + "step": 12 + }, + { + "epoch": 0.00272879932829555, + "grad_norm": 0.744103729724884, + "learning_rate": 2.7253668763102727e-06, + "loss": 0.9049, + "step": 13 + }, + { + "epoch": 0.002938706968933669, + "grad_norm": 0.7073290944099426, + "learning_rate": 2.935010482180294e-06, + "loss": 0.8824, + "step": 14 + }, + { + "epoch": 0.0031486146095717885, + "grad_norm": 0.6808626651763916, + "learning_rate": 3.1446540880503146e-06, + "loss": 0.8854, + "step": 15 + }, + { + "epoch": 0.0033585222502099076, + "grad_norm": 0.6526097655296326, + "learning_rate": 3.354297693920336e-06, + "loss": 0.8219, + "step": 16 + }, + { + "epoch": 0.003568429890848027, + "grad_norm": 0.8246333003044128, + "learning_rate": 3.563941299790356e-06, + "loss": 0.9948, + "step": 17 + }, + { + "epoch": 0.003778337531486146, + "grad_norm": 0.8494473099708557, + "learning_rate": 3.7735849056603773e-06, + "loss": 1.0187, + "step": 18 + }, + { + "epoch": 0.003988245172124265, + "grad_norm": 0.9259452223777771, + "learning_rate": 3.9832285115303985e-06, + "loss": 1.0864, + "step": 19 + }, + { + "epoch": 0.0041981528127623844, + "grad_norm": 0.8235483765602112, + "learning_rate": 4.19287211740042e-06, + "loss": 1.0255, + "step": 20 + }, + { + "epoch": 0.004408060453400504, + "grad_norm": 0.8101536631584167, + "learning_rate": 4.40251572327044e-06, + "loss": 0.9892, + "step": 21 + }, + { + "epoch": 0.004617968094038623, + "grad_norm": 0.7406070232391357, + "learning_rate": 4.612159329140462e-06, + "loss": 0.9266, + "step": 22 + }, + { + "epoch": 0.004827875734676742, + "grad_norm": 0.7151230573654175, + "learning_rate": 4.821802935010482e-06, + "loss": 0.902, + "step": 23 + }, + { + "epoch": 0.005037783375314861, + "grad_norm": 0.6921148896217346, + "learning_rate": 5.031446540880504e-06, + "loss": 0.8621, + "step": 24 + }, + { + "epoch": 0.005247691015952981, + "grad_norm": 0.6169761419296265, + "learning_rate": 5.241090146750524e-06, + "loss": 0.8296, + "step": 25 + }, + { + "epoch": 0.0054575986565911, + "grad_norm": 0.679093599319458, + "learning_rate": 5.4507337526205454e-06, + "loss": 0.8593, + "step": 26 + }, + { + "epoch": 0.005667506297229219, + "grad_norm": 0.8058671951293945, + "learning_rate": 5.660377358490566e-06, + "loss": 0.9504, + "step": 27 + }, + { + "epoch": 0.005877413937867338, + "grad_norm": 0.8705667853355408, + "learning_rate": 5.870020964360588e-06, + "loss": 1.0245, + "step": 28 + }, + { + "epoch": 0.006087321578505458, + "grad_norm": 0.7334048748016357, + "learning_rate": 6.079664570230608e-06, + "loss": 0.9172, + "step": 29 + }, + { + "epoch": 0.006297229219143577, + "grad_norm": 0.7490406632423401, + "learning_rate": 6.289308176100629e-06, + "loss": 0.9329, + "step": 30 + }, + { + "epoch": 0.006507136859781696, + "grad_norm": 0.8739404678344727, + "learning_rate": 6.49895178197065e-06, + "loss": 0.9584, + "step": 31 + }, + { + "epoch": 0.006717044500419815, + "grad_norm": 0.7686614990234375, + "learning_rate": 6.708595387840672e-06, + "loss": 0.9493, + "step": 32 + }, + { + "epoch": 0.0069269521410579345, + "grad_norm": 0.9184572100639343, + "learning_rate": 6.918238993710692e-06, + "loss": 1.0678, + "step": 33 + }, + { + "epoch": 0.007136859781696054, + "grad_norm": 0.8303453326225281, + "learning_rate": 7.127882599580712e-06, + "loss": 0.9834, + "step": 34 + }, + { + "epoch": 0.0073467674223341725, + "grad_norm": 0.6670796871185303, + "learning_rate": 7.337526205450735e-06, + "loss": 0.8434, + "step": 35 + }, + { + "epoch": 0.007556675062972292, + "grad_norm": 0.6518784761428833, + "learning_rate": 7.547169811320755e-06, + "loss": 0.8445, + "step": 36 + }, + { + "epoch": 0.007766582703610411, + "grad_norm": 0.8396414518356323, + "learning_rate": 7.756813417190776e-06, + "loss": 0.9229, + "step": 37 + }, + { + "epoch": 0.00797649034424853, + "grad_norm": 0.9019722938537598, + "learning_rate": 7.966457023060797e-06, + "loss": 0.9545, + "step": 38 + }, + { + "epoch": 0.00818639798488665, + "grad_norm": 0.6634522676467896, + "learning_rate": 8.176100628930818e-06, + "loss": 0.8279, + "step": 39 + }, + { + "epoch": 0.008396305625524769, + "grad_norm": 0.86155104637146, + "learning_rate": 8.38574423480084e-06, + "loss": 0.9326, + "step": 40 + }, + { + "epoch": 0.008606213266162888, + "grad_norm": 0.6448208689689636, + "learning_rate": 8.59538784067086e-06, + "loss": 0.8261, + "step": 41 + }, + { + "epoch": 0.008816120906801008, + "grad_norm": 0.7442598938941956, + "learning_rate": 8.80503144654088e-06, + "loss": 0.9352, + "step": 42 + }, + { + "epoch": 0.009026028547439127, + "grad_norm": 0.8854546546936035, + "learning_rate": 9.014675052410902e-06, + "loss": 1.017, + "step": 43 + }, + { + "epoch": 0.009235936188077247, + "grad_norm": 0.66485196352005, + "learning_rate": 9.224318658280923e-06, + "loss": 0.8096, + "step": 44 + }, + { + "epoch": 0.009445843828715366, + "grad_norm": 0.5998132824897766, + "learning_rate": 9.433962264150944e-06, + "loss": 0.7932, + "step": 45 + }, + { + "epoch": 0.009655751469353484, + "grad_norm": 0.6402536034584045, + "learning_rate": 9.643605870020965e-06, + "loss": 0.8146, + "step": 46 + }, + { + "epoch": 0.009865659109991603, + "grad_norm": 0.6665769219398499, + "learning_rate": 9.853249475890985e-06, + "loss": 0.8435, + "step": 47 + }, + { + "epoch": 0.010075566750629723, + "grad_norm": 0.6947203874588013, + "learning_rate": 1.0062893081761008e-05, + "loss": 0.8712, + "step": 48 + }, + { + "epoch": 0.010285474391267842, + "grad_norm": 0.6707759499549866, + "learning_rate": 1.0272536687631027e-05, + "loss": 0.8382, + "step": 49 + }, + { + "epoch": 0.010495382031905962, + "grad_norm": 0.6716253161430359, + "learning_rate": 1.0482180293501048e-05, + "loss": 0.8887, + "step": 50 + }, + { + "epoch": 0.010705289672544081, + "grad_norm": 0.6954374313354492, + "learning_rate": 1.069182389937107e-05, + "loss": 0.895, + "step": 51 + }, + { + "epoch": 0.0109151973131822, + "grad_norm": 0.5633372068405151, + "learning_rate": 1.0901467505241091e-05, + "loss": 0.7971, + "step": 52 + }, + { + "epoch": 0.01112510495382032, + "grad_norm": 0.5513401031494141, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.7848, + "step": 53 + }, + { + "epoch": 0.011335012594458438, + "grad_norm": 0.6735419034957886, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.8558, + "step": 54 + }, + { + "epoch": 0.011544920235096557, + "grad_norm": 0.5273690223693848, + "learning_rate": 1.1530398322851153e-05, + "loss": 0.7531, + "step": 55 + }, + { + "epoch": 0.011754827875734676, + "grad_norm": 0.612610936164856, + "learning_rate": 1.1740041928721176e-05, + "loss": 0.8009, + "step": 56 + }, + { + "epoch": 0.011964735516372796, + "grad_norm": 0.5250133275985718, + "learning_rate": 1.1949685534591196e-05, + "loss": 0.7411, + "step": 57 + }, + { + "epoch": 0.012174643157010915, + "grad_norm": 0.6210602521896362, + "learning_rate": 1.2159329140461215e-05, + "loss": 0.814, + "step": 58 + }, + { + "epoch": 0.012384550797649035, + "grad_norm": 0.5767892003059387, + "learning_rate": 1.2368972746331238e-05, + "loss": 0.7512, + "step": 59 + }, + { + "epoch": 0.012594458438287154, + "grad_norm": 0.5112836360931396, + "learning_rate": 1.2578616352201259e-05, + "loss": 0.7405, + "step": 60 + }, + { + "epoch": 0.012804366078925274, + "grad_norm": 0.5214811563491821, + "learning_rate": 1.2788259958071281e-05, + "loss": 0.7152, + "step": 61 + }, + { + "epoch": 0.013014273719563391, + "grad_norm": 0.5820367932319641, + "learning_rate": 1.29979035639413e-05, + "loss": 0.7359, + "step": 62 + }, + { + "epoch": 0.01322418136020151, + "grad_norm": 0.5728296637535095, + "learning_rate": 1.320754716981132e-05, + "loss": 0.7096, + "step": 63 + }, + { + "epoch": 0.01343408900083963, + "grad_norm": 0.514997124671936, + "learning_rate": 1.3417190775681343e-05, + "loss": 0.6945, + "step": 64 + }, + { + "epoch": 0.01364399664147775, + "grad_norm": 0.5707572102546692, + "learning_rate": 1.3626834381551362e-05, + "loss": 0.7065, + "step": 65 + }, + { + "epoch": 0.013853904282115869, + "grad_norm": 0.5674712657928467, + "learning_rate": 1.3836477987421385e-05, + "loss": 0.6759, + "step": 66 + }, + { + "epoch": 0.014063811922753989, + "grad_norm": 0.5445975661277771, + "learning_rate": 1.4046121593291406e-05, + "loss": 0.6487, + "step": 67 + }, + { + "epoch": 0.014273719563392108, + "grad_norm": 0.5629355311393738, + "learning_rate": 1.4255765199161425e-05, + "loss": 0.6635, + "step": 68 + }, + { + "epoch": 0.014483627204030227, + "grad_norm": 0.47151610255241394, + "learning_rate": 1.4465408805031447e-05, + "loss": 0.5976, + "step": 69 + }, + { + "epoch": 0.014693534844668345, + "grad_norm": 0.44633767008781433, + "learning_rate": 1.467505241090147e-05, + "loss": 0.5738, + "step": 70 + }, + { + "epoch": 0.014903442485306465, + "grad_norm": 0.48507657647132874, + "learning_rate": 1.488469601677149e-05, + "loss": 0.5859, + "step": 71 + }, + { + "epoch": 0.015113350125944584, + "grad_norm": 0.4147733151912689, + "learning_rate": 1.509433962264151e-05, + "loss": 0.5386, + "step": 72 + }, + { + "epoch": 0.015323257766582703, + "grad_norm": 0.635608434677124, + "learning_rate": 1.530398322851153e-05, + "loss": 0.6529, + "step": 73 + }, + { + "epoch": 0.015533165407220823, + "grad_norm": 0.5556919574737549, + "learning_rate": 1.5513626834381552e-05, + "loss": 0.5935, + "step": 74 + }, + { + "epoch": 0.015743073047858942, + "grad_norm": 0.5627433657646179, + "learning_rate": 1.572327044025157e-05, + "loss": 0.5677, + "step": 75 + }, + { + "epoch": 0.01595298068849706, + "grad_norm": 0.5727344155311584, + "learning_rate": 1.5932914046121594e-05, + "loss": 0.5701, + "step": 76 + }, + { + "epoch": 0.01616288832913518, + "grad_norm": 0.5192092657089233, + "learning_rate": 1.6142557651991616e-05, + "loss": 0.537, + "step": 77 + }, + { + "epoch": 0.0163727959697733, + "grad_norm": 0.6583610773086548, + "learning_rate": 1.6352201257861635e-05, + "loss": 0.5718, + "step": 78 + }, + { + "epoch": 0.01658270361041142, + "grad_norm": 0.4762994050979614, + "learning_rate": 1.6561844863731658e-05, + "loss": 0.5016, + "step": 79 + }, + { + "epoch": 0.016792611251049538, + "grad_norm": 0.7903013825416565, + "learning_rate": 1.677148846960168e-05, + "loss": 0.6123, + "step": 80 + }, + { + "epoch": 0.01700251889168766, + "grad_norm": 0.6027877330780029, + "learning_rate": 1.69811320754717e-05, + "loss": 0.5085, + "step": 81 + }, + { + "epoch": 0.017212426532325777, + "grad_norm": 0.6400225162506104, + "learning_rate": 1.719077568134172e-05, + "loss": 0.5325, + "step": 82 + }, + { + "epoch": 0.017422334172963894, + "grad_norm": 0.5193424224853516, + "learning_rate": 1.740041928721174e-05, + "loss": 0.476, + "step": 83 + }, + { + "epoch": 0.017632241813602016, + "grad_norm": 0.5318325757980347, + "learning_rate": 1.761006289308176e-05, + "loss": 0.4574, + "step": 84 + }, + { + "epoch": 0.017842149454240133, + "grad_norm": 0.5530166029930115, + "learning_rate": 1.7819706498951782e-05, + "loss": 0.4331, + "step": 85 + }, + { + "epoch": 0.018052057094878254, + "grad_norm": 0.5483909845352173, + "learning_rate": 1.8029350104821805e-05, + "loss": 0.4254, + "step": 86 + }, + { + "epoch": 0.018261964735516372, + "grad_norm": 0.4599871039390564, + "learning_rate": 1.8238993710691824e-05, + "loss": 0.3828, + "step": 87 + }, + { + "epoch": 0.018471872376154493, + "grad_norm": 0.47489672899246216, + "learning_rate": 1.8448637316561846e-05, + "loss": 0.3989, + "step": 88 + }, + { + "epoch": 0.01868178001679261, + "grad_norm": 0.6532279253005981, + "learning_rate": 1.865828092243187e-05, + "loss": 0.3968, + "step": 89 + }, + { + "epoch": 0.018891687657430732, + "grad_norm": 0.707245945930481, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.4099, + "step": 90 + }, + { + "epoch": 0.01910159529806885, + "grad_norm": 0.4290582835674286, + "learning_rate": 1.9077568134171907e-05, + "loss": 0.3546, + "step": 91 + }, + { + "epoch": 0.019311502938706968, + "grad_norm": 0.4370197057723999, + "learning_rate": 1.928721174004193e-05, + "loss": 0.3519, + "step": 92 + }, + { + "epoch": 0.01952141057934509, + "grad_norm": 0.4862878918647766, + "learning_rate": 1.9496855345911952e-05, + "loss": 0.3494, + "step": 93 + }, + { + "epoch": 0.019731318219983206, + "grad_norm": 0.514576256275177, + "learning_rate": 1.970649895178197e-05, + "loss": 0.326, + "step": 94 + }, + { + "epoch": 0.019941225860621328, + "grad_norm": 0.3785364031791687, + "learning_rate": 1.9916142557651993e-05, + "loss": 0.3196, + "step": 95 + }, + { + "epoch": 0.020151133501259445, + "grad_norm": 0.4459572732448578, + "learning_rate": 2.0125786163522016e-05, + "loss": 0.3245, + "step": 96 + }, + { + "epoch": 0.020361041141897566, + "grad_norm": 0.3634876310825348, + "learning_rate": 2.0335429769392035e-05, + "loss": 0.2999, + "step": 97 + }, + { + "epoch": 0.020570948782535684, + "grad_norm": 0.39789989590644836, + "learning_rate": 2.0545073375262054e-05, + "loss": 0.2908, + "step": 98 + }, + { + "epoch": 0.020780856423173802, + "grad_norm": 0.3628767430782318, + "learning_rate": 2.0754716981132076e-05, + "loss": 0.278, + "step": 99 + }, + { + "epoch": 0.020990764063811923, + "grad_norm": 0.3945654332637787, + "learning_rate": 2.0964360587002095e-05, + "loss": 0.2902, + "step": 100 + }, + { + "epoch": 0.02120067170445004, + "grad_norm": 0.2995467185974121, + "learning_rate": 2.1174004192872118e-05, + "loss": 0.2748, + "step": 101 + }, + { + "epoch": 0.021410579345088162, + "grad_norm": 0.2776371240615845, + "learning_rate": 2.138364779874214e-05, + "loss": 0.2772, + "step": 102 + }, + { + "epoch": 0.02162048698572628, + "grad_norm": 0.292316734790802, + "learning_rate": 2.159329140461216e-05, + "loss": 0.2732, + "step": 103 + }, + { + "epoch": 0.0218303946263644, + "grad_norm": 0.26235565543174744, + "learning_rate": 2.1802935010482182e-05, + "loss": 0.2592, + "step": 104 + }, + { + "epoch": 0.02204030226700252, + "grad_norm": 0.2782291769981384, + "learning_rate": 2.2012578616352204e-05, + "loss": 0.2586, + "step": 105 + }, + { + "epoch": 0.02225020990764064, + "grad_norm": 0.24781855940818787, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.022460117548278757, + "grad_norm": 0.3645837604999542, + "learning_rate": 2.2431865828092242e-05, + "loss": 0.247, + "step": 107 + }, + { + "epoch": 0.022670025188916875, + "grad_norm": 0.4096992313861847, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.2648, + "step": 108 + }, + { + "epoch": 0.022879932829554996, + "grad_norm": 0.36801740527153015, + "learning_rate": 2.2851153039832284e-05, + "loss": 0.2677, + "step": 109 + }, + { + "epoch": 0.023089840470193114, + "grad_norm": 0.221563920378685, + "learning_rate": 2.3060796645702306e-05, + "loss": 0.2544, + "step": 110 + }, + { + "epoch": 0.023299748110831235, + "grad_norm": 0.17734010517597198, + "learning_rate": 2.327044025157233e-05, + "loss": 0.2458, + "step": 111 + }, + { + "epoch": 0.023509655751469353, + "grad_norm": 0.23815220594406128, + "learning_rate": 2.348008385744235e-05, + "loss": 0.2249, + "step": 112 + }, + { + "epoch": 0.023719563392107474, + "grad_norm": 0.24534103274345398, + "learning_rate": 2.368972746331237e-05, + "loss": 0.2283, + "step": 113 + }, + { + "epoch": 0.02392947103274559, + "grad_norm": 0.17044654488563538, + "learning_rate": 2.3899371069182393e-05, + "loss": 0.2543, + "step": 114 + }, + { + "epoch": 0.02413937867338371, + "grad_norm": 0.22411420941352844, + "learning_rate": 2.4109014675052412e-05, + "loss": 0.2537, + "step": 115 + }, + { + "epoch": 0.02434928631402183, + "grad_norm": 0.17107880115509033, + "learning_rate": 2.431865828092243e-05, + "loss": 0.2477, + "step": 116 + }, + { + "epoch": 0.02455919395465995, + "grad_norm": 0.19663883745670319, + "learning_rate": 2.4528301886792453e-05, + "loss": 0.2503, + "step": 117 + }, + { + "epoch": 0.02476910159529807, + "grad_norm": 0.2755718231201172, + "learning_rate": 2.4737945492662476e-05, + "loss": 0.201, + "step": 118 + }, + { + "epoch": 0.024979009235936187, + "grad_norm": 0.16535572707653046, + "learning_rate": 2.4947589098532495e-05, + "loss": 0.2364, + "step": 119 + }, + { + "epoch": 0.02518891687657431, + "grad_norm": 0.15798090398311615, + "learning_rate": 2.5157232704402517e-05, + "loss": 0.214, + "step": 120 + }, + { + "epoch": 0.025398824517212426, + "grad_norm": 0.1900860220193863, + "learning_rate": 2.5366876310272536e-05, + "loss": 0.2182, + "step": 121 + }, + { + "epoch": 0.025608732157850547, + "grad_norm": 0.18855144083499908, + "learning_rate": 2.5576519916142562e-05, + "loss": 0.2123, + "step": 122 + }, + { + "epoch": 0.025818639798488665, + "grad_norm": 0.1780049353837967, + "learning_rate": 2.578616352201258e-05, + "loss": 0.2197, + "step": 123 + }, + { + "epoch": 0.026028547439126783, + "grad_norm": 0.27040061354637146, + "learning_rate": 2.59958071278826e-05, + "loss": 0.2395, + "step": 124 + }, + { + "epoch": 0.026238455079764904, + "grad_norm": 0.1983417570590973, + "learning_rate": 2.6205450733752623e-05, + "loss": 0.2197, + "step": 125 + }, + { + "epoch": 0.02644836272040302, + "grad_norm": 0.1723383665084839, + "learning_rate": 2.641509433962264e-05, + "loss": 0.2183, + "step": 126 + }, + { + "epoch": 0.026658270361041143, + "grad_norm": 0.24477605521678925, + "learning_rate": 2.662473794549266e-05, + "loss": 0.2293, + "step": 127 + }, + { + "epoch": 0.02686817800167926, + "grad_norm": 0.15110079944133759, + "learning_rate": 2.6834381551362687e-05, + "loss": 0.2245, + "step": 128 + }, + { + "epoch": 0.02707808564231738, + "grad_norm": 0.22366492450237274, + "learning_rate": 2.7044025157232706e-05, + "loss": 0.2031, + "step": 129 + }, + { + "epoch": 0.0272879932829555, + "grad_norm": 0.32891571521759033, + "learning_rate": 2.7253668763102725e-05, + "loss": 0.1821, + "step": 130 + }, + { + "epoch": 0.02749790092359362, + "grad_norm": 0.20027081668376923, + "learning_rate": 2.746331236897275e-05, + "loss": 0.223, + "step": 131 + }, + { + "epoch": 0.027707808564231738, + "grad_norm": 0.2269366830587387, + "learning_rate": 2.767295597484277e-05, + "loss": 0.2246, + "step": 132 + }, + { + "epoch": 0.027917716204869856, + "grad_norm": 0.1355280727148056, + "learning_rate": 2.788259958071279e-05, + "loss": 0.2183, + "step": 133 + }, + { + "epoch": 0.028127623845507977, + "grad_norm": 0.29291626811027527, + "learning_rate": 2.809224318658281e-05, + "loss": 0.2049, + "step": 134 + }, + { + "epoch": 0.028337531486146095, + "grad_norm": 0.1778186410665512, + "learning_rate": 2.830188679245283e-05, + "loss": 0.2095, + "step": 135 + }, + { + "epoch": 0.028547439126784216, + "grad_norm": 0.23263931274414062, + "learning_rate": 2.851153039832285e-05, + "loss": 0.2372, + "step": 136 + }, + { + "epoch": 0.028757346767422334, + "grad_norm": 0.2121749222278595, + "learning_rate": 2.8721174004192875e-05, + "loss": 0.2107, + "step": 137 + }, + { + "epoch": 0.028967254408060455, + "grad_norm": 0.19954991340637207, + "learning_rate": 2.8930817610062894e-05, + "loss": 0.2181, + "step": 138 + }, + { + "epoch": 0.029177162048698572, + "grad_norm": 0.15431921184062958, + "learning_rate": 2.9140461215932913e-05, + "loss": 0.2198, + "step": 139 + }, + { + "epoch": 0.02938706968933669, + "grad_norm": 0.17603729665279388, + "learning_rate": 2.935010482180294e-05, + "loss": 0.2339, + "step": 140 + }, + { + "epoch": 0.02959697732997481, + "grad_norm": 0.1604471355676651, + "learning_rate": 2.9559748427672958e-05, + "loss": 0.2064, + "step": 141 + }, + { + "epoch": 0.02980688497061293, + "grad_norm": 0.17169184982776642, + "learning_rate": 2.976939203354298e-05, + "loss": 0.1987, + "step": 142 + }, + { + "epoch": 0.03001679261125105, + "grad_norm": 0.1285230815410614, + "learning_rate": 2.9979035639413e-05, + "loss": 0.2342, + "step": 143 + }, + { + "epoch": 0.030226700251889168, + "grad_norm": 0.1755395084619522, + "learning_rate": 3.018867924528302e-05, + "loss": 0.222, + "step": 144 + }, + { + "epoch": 0.03043660789252729, + "grad_norm": 0.15474987030029297, + "learning_rate": 3.0398322851153044e-05, + "loss": 0.2159, + "step": 145 + }, + { + "epoch": 0.030646515533165407, + "grad_norm": 0.12986472249031067, + "learning_rate": 3.060796645702306e-05, + "loss": 0.2124, + "step": 146 + }, + { + "epoch": 0.030856423173803528, + "grad_norm": 0.1458188146352768, + "learning_rate": 3.081761006289308e-05, + "loss": 0.214, + "step": 147 + }, + { + "epoch": 0.031066330814441646, + "grad_norm": 0.1323792040348053, + "learning_rate": 3.1027253668763105e-05, + "loss": 0.2153, + "step": 148 + }, + { + "epoch": 0.03127623845507976, + "grad_norm": 0.16542711853981018, + "learning_rate": 3.1236897274633124e-05, + "loss": 0.2154, + "step": 149 + }, + { + "epoch": 0.031486146095717885, + "grad_norm": 0.17730407416820526, + "learning_rate": 3.144654088050314e-05, + "loss": 0.2202, + "step": 150 + }, + { + "epoch": 0.031696053736356006, + "grad_norm": 0.15039502084255219, + "learning_rate": 3.165618448637317e-05, + "loss": 0.2056, + "step": 151 + }, + { + "epoch": 0.03190596137699412, + "grad_norm": 0.20309150218963623, + "learning_rate": 3.186582809224319e-05, + "loss": 0.2175, + "step": 152 + }, + { + "epoch": 0.03211586901763224, + "grad_norm": 0.16652604937553406, + "learning_rate": 3.207547169811321e-05, + "loss": 0.2076, + "step": 153 + }, + { + "epoch": 0.03232577665827036, + "grad_norm": 0.14530467987060547, + "learning_rate": 3.228511530398323e-05, + "loss": 0.2167, + "step": 154 + }, + { + "epoch": 0.032535684298908484, + "grad_norm": 0.13003528118133545, + "learning_rate": 3.249475890985325e-05, + "loss": 0.2089, + "step": 155 + }, + { + "epoch": 0.0327455919395466, + "grad_norm": 0.16985855996608734, + "learning_rate": 3.270440251572327e-05, + "loss": 0.1994, + "step": 156 + }, + { + "epoch": 0.03295549958018472, + "grad_norm": 0.18479777872562408, + "learning_rate": 3.29140461215933e-05, + "loss": 0.2127, + "step": 157 + }, + { + "epoch": 0.03316540722082284, + "grad_norm": 0.1541491150856018, + "learning_rate": 3.3123689727463316e-05, + "loss": 0.2208, + "step": 158 + }, + { + "epoch": 0.033375314861460954, + "grad_norm": 0.13511165976524353, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.218, + "step": 159 + }, + { + "epoch": 0.033585222502099076, + "grad_norm": 0.1392865628004074, + "learning_rate": 3.354297693920336e-05, + "loss": 0.226, + "step": 160 + }, + { + "epoch": 0.0337951301427372, + "grad_norm": 0.1614847183227539, + "learning_rate": 3.375262054507338e-05, + "loss": 0.2072, + "step": 161 + }, + { + "epoch": 0.03400503778337532, + "grad_norm": 0.12186679244041443, + "learning_rate": 3.39622641509434e-05, + "loss": 0.2193, + "step": 162 + }, + { + "epoch": 0.03421494542401343, + "grad_norm": 0.1777278333902359, + "learning_rate": 3.417190775681342e-05, + "loss": 0.202, + "step": 163 + }, + { + "epoch": 0.03442485306465155, + "grad_norm": 0.13323499262332916, + "learning_rate": 3.438155136268344e-05, + "loss": 0.1983, + "step": 164 + }, + { + "epoch": 0.034634760705289674, + "grad_norm": 0.22301942110061646, + "learning_rate": 3.4591194968553456e-05, + "loss": 0.2252, + "step": 165 + }, + { + "epoch": 0.03484466834592779, + "grad_norm": 0.1538372039794922, + "learning_rate": 3.480083857442348e-05, + "loss": 0.2071, + "step": 166 + }, + { + "epoch": 0.03505457598656591, + "grad_norm": 0.128251314163208, + "learning_rate": 3.50104821802935e-05, + "loss": 0.214, + "step": 167 + }, + { + "epoch": 0.03526448362720403, + "grad_norm": 0.26556313037872314, + "learning_rate": 3.522012578616352e-05, + "loss": 0.1863, + "step": 168 + }, + { + "epoch": 0.03547439126784215, + "grad_norm": 0.14938302338123322, + "learning_rate": 3.5429769392033546e-05, + "loss": 0.2152, + "step": 169 + }, + { + "epoch": 0.035684298908480266, + "grad_norm": 0.19645771384239197, + "learning_rate": 3.5639412997903565e-05, + "loss": 0.1977, + "step": 170 + }, + { + "epoch": 0.03589420654911839, + "grad_norm": 0.1725340038537979, + "learning_rate": 3.5849056603773584e-05, + "loss": 0.1981, + "step": 171 + }, + { + "epoch": 0.03610411418975651, + "grad_norm": 0.16048069298267365, + "learning_rate": 3.605870020964361e-05, + "loss": 0.1983, + "step": 172 + }, + { + "epoch": 0.03631402183039462, + "grad_norm": 0.21855585277080536, + "learning_rate": 3.626834381551363e-05, + "loss": 0.2321, + "step": 173 + }, + { + "epoch": 0.036523929471032744, + "grad_norm": 0.1136787012219429, + "learning_rate": 3.647798742138365e-05, + "loss": 0.1973, + "step": 174 + }, + { + "epoch": 0.036733837111670865, + "grad_norm": 0.15145623683929443, + "learning_rate": 3.6687631027253674e-05, + "loss": 0.1947, + "step": 175 + }, + { + "epoch": 0.036943744752308987, + "grad_norm": 0.21631890535354614, + "learning_rate": 3.689727463312369e-05, + "loss": 0.2151, + "step": 176 + }, + { + "epoch": 0.0371536523929471, + "grad_norm": 0.2623152434825897, + "learning_rate": 3.710691823899371e-05, + "loss": 0.1814, + "step": 177 + }, + { + "epoch": 0.03736356003358522, + "grad_norm": 0.1753605455160141, + "learning_rate": 3.731656184486374e-05, + "loss": 0.203, + "step": 178 + }, + { + "epoch": 0.03757346767422334, + "grad_norm": 0.10878176242113113, + "learning_rate": 3.752620545073376e-05, + "loss": 0.2052, + "step": 179 + }, + { + "epoch": 0.037783375314861464, + "grad_norm": 0.13699688017368317, + "learning_rate": 3.7735849056603776e-05, + "loss": 0.2004, + "step": 180 + }, + { + "epoch": 0.03799328295549958, + "grad_norm": 0.14288806915283203, + "learning_rate": 3.7945492662473795e-05, + "loss": 0.1745, + "step": 181 + }, + { + "epoch": 0.0382031905961377, + "grad_norm": 0.12457548081874847, + "learning_rate": 3.8155136268343814e-05, + "loss": 0.194, + "step": 182 + }, + { + "epoch": 0.03841309823677582, + "grad_norm": 0.167145237326622, + "learning_rate": 3.836477987421384e-05, + "loss": 0.2027, + "step": 183 + }, + { + "epoch": 0.038623005877413935, + "grad_norm": 0.12857979536056519, + "learning_rate": 3.857442348008386e-05, + "loss": 0.2126, + "step": 184 + }, + { + "epoch": 0.038832913518052056, + "grad_norm": 0.16190126538276672, + "learning_rate": 3.878406708595388e-05, + "loss": 0.2037, + "step": 185 + }, + { + "epoch": 0.03904282115869018, + "grad_norm": 0.168744757771492, + "learning_rate": 3.8993710691823904e-05, + "loss": 0.2108, + "step": 186 + }, + { + "epoch": 0.0392527287993283, + "grad_norm": 0.1676539033651352, + "learning_rate": 3.920335429769392e-05, + "loss": 0.216, + "step": 187 + }, + { + "epoch": 0.03946263643996641, + "grad_norm": 0.13556820154190063, + "learning_rate": 3.941299790356394e-05, + "loss": 0.2085, + "step": 188 + }, + { + "epoch": 0.039672544080604534, + "grad_norm": 0.1797979772090912, + "learning_rate": 3.962264150943397e-05, + "loss": 0.1933, + "step": 189 + }, + { + "epoch": 0.039882451721242655, + "grad_norm": 0.20826327800750732, + "learning_rate": 3.983228511530399e-05, + "loss": 0.2214, + "step": 190 + }, + { + "epoch": 0.04009235936188077, + "grad_norm": 0.19972363114356995, + "learning_rate": 4.0041928721174006e-05, + "loss": 0.1941, + "step": 191 + }, + { + "epoch": 0.04030226700251889, + "grad_norm": 0.149556502699852, + "learning_rate": 4.025157232704403e-05, + "loss": 0.1956, + "step": 192 + }, + { + "epoch": 0.04051217464315701, + "grad_norm": 0.22496013343334198, + "learning_rate": 4.046121593291405e-05, + "loss": 0.1973, + "step": 193 + }, + { + "epoch": 0.04072208228379513, + "grad_norm": 0.16132576763629913, + "learning_rate": 4.067085953878407e-05, + "loss": 0.203, + "step": 194 + }, + { + "epoch": 0.04093198992443325, + "grad_norm": 0.17156128585338593, + "learning_rate": 4.088050314465409e-05, + "loss": 0.1942, + "step": 195 + }, + { + "epoch": 0.04114189756507137, + "grad_norm": 0.14846180379390717, + "learning_rate": 4.109014675052411e-05, + "loss": 0.2005, + "step": 196 + }, + { + "epoch": 0.04135180520570949, + "grad_norm": 0.20252752304077148, + "learning_rate": 4.129979035639413e-05, + "loss": 0.186, + "step": 197 + }, + { + "epoch": 0.041561712846347604, + "grad_norm": 0.16286462545394897, + "learning_rate": 4.150943396226415e-05, + "loss": 0.1997, + "step": 198 + }, + { + "epoch": 0.041771620486985725, + "grad_norm": 0.13008786737918854, + "learning_rate": 4.171907756813417e-05, + "loss": 0.2052, + "step": 199 + }, + { + "epoch": 0.041981528127623846, + "grad_norm": 0.13853180408477783, + "learning_rate": 4.192872117400419e-05, + "loss": 0.1907, + "step": 200 + }, + { + "epoch": 0.04219143576826197, + "grad_norm": 0.20382314920425415, + "learning_rate": 4.213836477987422e-05, + "loss": 0.1931, + "step": 201 + }, + { + "epoch": 0.04240134340890008, + "grad_norm": 0.23206844925880432, + "learning_rate": 4.2348008385744236e-05, + "loss": 0.1849, + "step": 202 + }, + { + "epoch": 0.0426112510495382, + "grad_norm": 0.2456827312707901, + "learning_rate": 4.2557651991614255e-05, + "loss": 0.2088, + "step": 203 + }, + { + "epoch": 0.042821158690176324, + "grad_norm": 0.15247821807861328, + "learning_rate": 4.276729559748428e-05, + "loss": 0.1944, + "step": 204 + }, + { + "epoch": 0.043031066330814445, + "grad_norm": 0.174981027841568, + "learning_rate": 4.29769392033543e-05, + "loss": 0.2161, + "step": 205 + }, + { + "epoch": 0.04324097397145256, + "grad_norm": 0.20193031430244446, + "learning_rate": 4.318658280922432e-05, + "loss": 0.2084, + "step": 206 + }, + { + "epoch": 0.04345088161209068, + "grad_norm": 0.20125791430473328, + "learning_rate": 4.3396226415094345e-05, + "loss": 0.189, + "step": 207 + }, + { + "epoch": 0.0436607892527288, + "grad_norm": 0.16958673298358917, + "learning_rate": 4.3605870020964364e-05, + "loss": 0.1952, + "step": 208 + }, + { + "epoch": 0.043870696893366916, + "grad_norm": 0.20714177191257477, + "learning_rate": 4.381551362683438e-05, + "loss": 0.1832, + "step": 209 + }, + { + "epoch": 0.04408060453400504, + "grad_norm": 0.14229562878608704, + "learning_rate": 4.402515723270441e-05, + "loss": 0.1806, + "step": 210 + }, + { + "epoch": 0.04429051217464316, + "grad_norm": 0.1985626220703125, + "learning_rate": 4.423480083857443e-05, + "loss": 0.1973, + "step": 211 + }, + { + "epoch": 0.04450041981528128, + "grad_norm": 0.13714846968650818, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.2018, + "step": 212 + }, + { + "epoch": 0.044710327455919394, + "grad_norm": 0.25591611862182617, + "learning_rate": 4.4654088050314466e-05, + "loss": 0.2001, + "step": 213 + }, + { + "epoch": 0.044920235096557515, + "grad_norm": 0.29017260670661926, + "learning_rate": 4.4863731656184485e-05, + "loss": 0.2278, + "step": 214 + }, + { + "epoch": 0.045130142737195636, + "grad_norm": 0.12594066560268402, + "learning_rate": 4.5073375262054504e-05, + "loss": 0.2023, + "step": 215 + }, + { + "epoch": 0.04534005037783375, + "grad_norm": 0.1602821797132492, + "learning_rate": 4.528301886792453e-05, + "loss": 0.2101, + "step": 216 + }, + { + "epoch": 0.04554995801847187, + "grad_norm": 0.36664336919784546, + "learning_rate": 4.549266247379455e-05, + "loss": 0.1665, + "step": 217 + }, + { + "epoch": 0.04575986565910999, + "grad_norm": 0.20665952563285828, + "learning_rate": 4.570230607966457e-05, + "loss": 0.1965, + "step": 218 + }, + { + "epoch": 0.045969773299748114, + "grad_norm": 0.15664128959178925, + "learning_rate": 4.5911949685534594e-05, + "loss": 0.2104, + "step": 219 + }, + { + "epoch": 0.04617968094038623, + "grad_norm": 0.14733830094337463, + "learning_rate": 4.612159329140461e-05, + "loss": 0.1955, + "step": 220 + }, + { + "epoch": 0.04638958858102435, + "grad_norm": 0.19135522842407227, + "learning_rate": 4.633123689727464e-05, + "loss": 0.1947, + "step": 221 + }, + { + "epoch": 0.04659949622166247, + "grad_norm": 0.24508413672447205, + "learning_rate": 4.654088050314466e-05, + "loss": 0.2005, + "step": 222 + }, + { + "epoch": 0.046809403862300585, + "grad_norm": 0.16794531047344208, + "learning_rate": 4.6750524109014677e-05, + "loss": 0.1958, + "step": 223 + }, + { + "epoch": 0.047019311502938706, + "grad_norm": 0.1870536506175995, + "learning_rate": 4.69601677148847e-05, + "loss": 0.1868, + "step": 224 + }, + { + "epoch": 0.04722921914357683, + "grad_norm": 0.16214439272880554, + "learning_rate": 4.716981132075472e-05, + "loss": 0.2003, + "step": 225 + }, + { + "epoch": 0.04743912678421495, + "grad_norm": 0.24978841841220856, + "learning_rate": 4.737945492662474e-05, + "loss": 0.1909, + "step": 226 + }, + { + "epoch": 0.04764903442485306, + "grad_norm": 0.163265198469162, + "learning_rate": 4.7589098532494766e-05, + "loss": 0.2056, + "step": 227 + }, + { + "epoch": 0.04785894206549118, + "grad_norm": 0.24885287880897522, + "learning_rate": 4.7798742138364785e-05, + "loss": 0.2035, + "step": 228 + }, + { + "epoch": 0.048068849706129305, + "grad_norm": 0.1393681764602661, + "learning_rate": 4.8008385744234804e-05, + "loss": 0.1976, + "step": 229 + }, + { + "epoch": 0.04827875734676742, + "grad_norm": 0.17042241990566254, + "learning_rate": 4.8218029350104823e-05, + "loss": 0.1824, + "step": 230 + }, + { + "epoch": 0.04848866498740554, + "grad_norm": 0.1625502109527588, + "learning_rate": 4.842767295597484e-05, + "loss": 0.1904, + "step": 231 + }, + { + "epoch": 0.04869857262804366, + "grad_norm": 0.14994169771671295, + "learning_rate": 4.863731656184486e-05, + "loss": 0.1926, + "step": 232 + }, + { + "epoch": 0.04890848026868178, + "grad_norm": 0.15602821111679077, + "learning_rate": 4.884696016771489e-05, + "loss": 0.1828, + "step": 233 + }, + { + "epoch": 0.0491183879093199, + "grad_norm": 0.13405688107013702, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.1989, + "step": 234 + }, + { + "epoch": 0.04932829554995802, + "grad_norm": 0.18559689819812775, + "learning_rate": 4.9266247379454926e-05, + "loss": 0.2131, + "step": 235 + }, + { + "epoch": 0.04953820319059614, + "grad_norm": 0.1557319462299347, + "learning_rate": 4.947589098532495e-05, + "loss": 0.19, + "step": 236 + }, + { + "epoch": 0.04974811083123426, + "grad_norm": 0.2162303477525711, + "learning_rate": 4.968553459119497e-05, + "loss": 0.196, + "step": 237 + }, + { + "epoch": 0.049958018471872374, + "grad_norm": 0.17403477430343628, + "learning_rate": 4.989517819706499e-05, + "loss": 0.1997, + "step": 238 + }, + { + "epoch": 0.050167926112510496, + "grad_norm": 0.11738390475511551, + "learning_rate": 5.010482180293501e-05, + "loss": 0.1955, + "step": 239 + }, + { + "epoch": 0.05037783375314862, + "grad_norm": 0.15942999720573425, + "learning_rate": 5.0314465408805034e-05, + "loss": 0.1899, + "step": 240 + }, + { + "epoch": 0.05058774139378673, + "grad_norm": 0.14695511758327484, + "learning_rate": 5.052410901467506e-05, + "loss": 0.2032, + "step": 241 + }, + { + "epoch": 0.05079764903442485, + "grad_norm": 0.16291062533855438, + "learning_rate": 5.073375262054507e-05, + "loss": 0.1929, + "step": 242 + }, + { + "epoch": 0.05100755667506297, + "grad_norm": 0.18514905869960785, + "learning_rate": 5.09433962264151e-05, + "loss": 0.1899, + "step": 243 + }, + { + "epoch": 0.051217464315701094, + "grad_norm": 0.2196233868598938, + "learning_rate": 5.1153039832285124e-05, + "loss": 0.1741, + "step": 244 + }, + { + "epoch": 0.05142737195633921, + "grad_norm": 0.19183433055877686, + "learning_rate": 5.1362683438155136e-05, + "loss": 0.1985, + "step": 245 + }, + { + "epoch": 0.05163727959697733, + "grad_norm": 0.1604142189025879, + "learning_rate": 5.157232704402516e-05, + "loss": 0.1865, + "step": 246 + }, + { + "epoch": 0.05184718723761545, + "grad_norm": 0.18311725556850433, + "learning_rate": 5.178197064989518e-05, + "loss": 0.2005, + "step": 247 + }, + { + "epoch": 0.052057094878253565, + "grad_norm": 0.21732251346111298, + "learning_rate": 5.19916142557652e-05, + "loss": 0.2032, + "step": 248 + }, + { + "epoch": 0.052267002518891686, + "grad_norm": 0.2600694000720978, + "learning_rate": 5.220125786163522e-05, + "loss": 0.1838, + "step": 249 + }, + { + "epoch": 0.05247691015952981, + "grad_norm": 0.18634290993213654, + "learning_rate": 5.2410901467505245e-05, + "loss": 0.1809, + "step": 250 + }, + { + "epoch": 0.05268681780016793, + "grad_norm": 0.14735780656337738, + "learning_rate": 5.262054507337526e-05, + "loss": 0.1981, + "step": 251 + }, + { + "epoch": 0.05289672544080604, + "grad_norm": 0.16118381917476654, + "learning_rate": 5.283018867924528e-05, + "loss": 0.1836, + "step": 252 + }, + { + "epoch": 0.053106633081444164, + "grad_norm": 0.17707999050617218, + "learning_rate": 5.303983228511531e-05, + "loss": 0.2044, + "step": 253 + }, + { + "epoch": 0.053316540722082285, + "grad_norm": 0.25897523760795593, + "learning_rate": 5.324947589098532e-05, + "loss": 0.1753, + "step": 254 + }, + { + "epoch": 0.0535264483627204, + "grad_norm": 0.1371389776468277, + "learning_rate": 5.345911949685535e-05, + "loss": 0.1839, + "step": 255 + }, + { + "epoch": 0.05373635600335852, + "grad_norm": 0.21849682927131653, + "learning_rate": 5.366876310272537e-05, + "loss": 0.1938, + "step": 256 + }, + { + "epoch": 0.05394626364399664, + "grad_norm": 0.16861748695373535, + "learning_rate": 5.3878406708595385e-05, + "loss": 0.1751, + "step": 257 + }, + { + "epoch": 0.05415617128463476, + "grad_norm": 0.19400931894779205, + "learning_rate": 5.408805031446541e-05, + "loss": 0.2117, + "step": 258 + }, + { + "epoch": 0.05436607892527288, + "grad_norm": 0.18074113130569458, + "learning_rate": 5.429769392033544e-05, + "loss": 0.1949, + "step": 259 + }, + { + "epoch": 0.054575986565911, + "grad_norm": 0.17707990109920502, + "learning_rate": 5.450733752620545e-05, + "loss": 0.199, + "step": 260 + }, + { + "epoch": 0.05478589420654912, + "grad_norm": 0.16568966209888458, + "learning_rate": 5.4716981132075475e-05, + "loss": 0.2027, + "step": 261 + }, + { + "epoch": 0.05499580184718724, + "grad_norm": 0.24486149847507477, + "learning_rate": 5.49266247379455e-05, + "loss": 0.2042, + "step": 262 + }, + { + "epoch": 0.055205709487825355, + "grad_norm": 0.20431551337242126, + "learning_rate": 5.513626834381551e-05, + "loss": 0.2013, + "step": 263 + }, + { + "epoch": 0.055415617128463476, + "grad_norm": 0.21347559988498688, + "learning_rate": 5.534591194968554e-05, + "loss": 0.1687, + "step": 264 + }, + { + "epoch": 0.0556255247691016, + "grad_norm": 0.22354485094547272, + "learning_rate": 5.555555555555556e-05, + "loss": 0.1819, + "step": 265 + }, + { + "epoch": 0.05583543240973971, + "grad_norm": 0.2891826033592224, + "learning_rate": 5.576519916142558e-05, + "loss": 0.1927, + "step": 266 + }, + { + "epoch": 0.05604534005037783, + "grad_norm": 0.2465565800666809, + "learning_rate": 5.5974842767295596e-05, + "loss": 0.1852, + "step": 267 + }, + { + "epoch": 0.056255247691015954, + "grad_norm": 0.1743604689836502, + "learning_rate": 5.618448637316562e-05, + "loss": 0.1722, + "step": 268 + }, + { + "epoch": 0.056465155331654075, + "grad_norm": 0.2461417019367218, + "learning_rate": 5.6394129979035634e-05, + "loss": 0.1816, + "step": 269 + }, + { + "epoch": 0.05667506297229219, + "grad_norm": 0.21032604575157166, + "learning_rate": 5.660377358490566e-05, + "loss": 0.182, + "step": 270 + }, + { + "epoch": 0.05688497061293031, + "grad_norm": 0.2683754861354828, + "learning_rate": 5.6813417190775686e-05, + "loss": 0.1718, + "step": 271 + }, + { + "epoch": 0.05709487825356843, + "grad_norm": 0.18455228209495544, + "learning_rate": 5.70230607966457e-05, + "loss": 0.1954, + "step": 272 + }, + { + "epoch": 0.057304785894206546, + "grad_norm": 0.22255247831344604, + "learning_rate": 5.7232704402515724e-05, + "loss": 0.1922, + "step": 273 + }, + { + "epoch": 0.05751469353484467, + "grad_norm": 0.22789119184017181, + "learning_rate": 5.744234800838575e-05, + "loss": 0.1882, + "step": 274 + }, + { + "epoch": 0.05772460117548279, + "grad_norm": 0.20874802768230438, + "learning_rate": 5.765199161425576e-05, + "loss": 0.1859, + "step": 275 + }, + { + "epoch": 0.05793450881612091, + "grad_norm": 0.17554089426994324, + "learning_rate": 5.786163522012579e-05, + "loss": 0.1793, + "step": 276 + }, + { + "epoch": 0.058144416456759024, + "grad_norm": 0.2011173665523529, + "learning_rate": 5.8071278825995814e-05, + "loss": 0.2009, + "step": 277 + }, + { + "epoch": 0.058354324097397145, + "grad_norm": 0.28261420130729675, + "learning_rate": 5.8280922431865826e-05, + "loss": 0.2002, + "step": 278 + }, + { + "epoch": 0.058564231738035266, + "grad_norm": 0.2356766015291214, + "learning_rate": 5.849056603773585e-05, + "loss": 0.2095, + "step": 279 + }, + { + "epoch": 0.05877413937867338, + "grad_norm": 0.15072722733020782, + "learning_rate": 5.870020964360588e-05, + "loss": 0.1781, + "step": 280 + }, + { + "epoch": 0.0589840470193115, + "grad_norm": 0.1446981132030487, + "learning_rate": 5.89098532494759e-05, + "loss": 0.1813, + "step": 281 + }, + { + "epoch": 0.05919395465994962, + "grad_norm": 0.16516901552677155, + "learning_rate": 5.9119496855345916e-05, + "loss": 0.1946, + "step": 282 + }, + { + "epoch": 0.059403862300587744, + "grad_norm": 0.16732774674892426, + "learning_rate": 5.9329140461215935e-05, + "loss": 0.1813, + "step": 283 + }, + { + "epoch": 0.05961376994122586, + "grad_norm": 0.2000836730003357, + "learning_rate": 5.953878406708596e-05, + "loss": 0.1918, + "step": 284 + }, + { + "epoch": 0.05982367758186398, + "grad_norm": 0.15576116740703583, + "learning_rate": 5.974842767295597e-05, + "loss": 0.1814, + "step": 285 + }, + { + "epoch": 0.0600335852225021, + "grad_norm": 0.18421867489814758, + "learning_rate": 5.9958071278826e-05, + "loss": 0.1745, + "step": 286 + }, + { + "epoch": 0.06024349286314022, + "grad_norm": 0.2663988471031189, + "learning_rate": 6.0167714884696025e-05, + "loss": 0.1889, + "step": 287 + }, + { + "epoch": 0.060453400503778336, + "grad_norm": 0.18096649646759033, + "learning_rate": 6.037735849056604e-05, + "loss": 0.191, + "step": 288 + }, + { + "epoch": 0.06066330814441646, + "grad_norm": 0.24025796353816986, + "learning_rate": 6.058700209643606e-05, + "loss": 0.1907, + "step": 289 + }, + { + "epoch": 0.06087321578505458, + "grad_norm": 0.1682557910680771, + "learning_rate": 6.079664570230609e-05, + "loss": 0.1747, + "step": 290 + }, + { + "epoch": 0.06108312342569269, + "grad_norm": 0.2801767587661743, + "learning_rate": 6.10062893081761e-05, + "loss": 0.1813, + "step": 291 + }, + { + "epoch": 0.061293031066330814, + "grad_norm": 0.2288123071193695, + "learning_rate": 6.121593291404612e-05, + "loss": 0.1681, + "step": 292 + }, + { + "epoch": 0.061502938706968935, + "grad_norm": 0.14664186537265778, + "learning_rate": 6.142557651991615e-05, + "loss": 0.1723, + "step": 293 + }, + { + "epoch": 0.061712846347607056, + "grad_norm": 0.15858514606952667, + "learning_rate": 6.163522012578616e-05, + "loss": 0.198, + "step": 294 + }, + { + "epoch": 0.06192275398824517, + "grad_norm": 0.24689258635044098, + "learning_rate": 6.184486373165618e-05, + "loss": 0.1865, + "step": 295 + }, + { + "epoch": 0.06213266162888329, + "grad_norm": 0.19141900539398193, + "learning_rate": 6.205450733752621e-05, + "loss": 0.2103, + "step": 296 + }, + { + "epoch": 0.06234256926952141, + "grad_norm": 0.22445173561573029, + "learning_rate": 6.226415094339622e-05, + "loss": 0.1778, + "step": 297 + }, + { + "epoch": 0.06255247691015953, + "grad_norm": 0.1805533766746521, + "learning_rate": 6.247379454926625e-05, + "loss": 0.1752, + "step": 298 + }, + { + "epoch": 0.06276238455079765, + "grad_norm": 0.17849349975585938, + "learning_rate": 6.268343815513627e-05, + "loss": 0.1735, + "step": 299 + }, + { + "epoch": 0.06297229219143577, + "grad_norm": 0.2601464092731476, + "learning_rate": 6.289308176100629e-05, + "loss": 0.1868, + "step": 300 + }, + { + "epoch": 0.06318219983207389, + "grad_norm": 0.2865089178085327, + "learning_rate": 6.310272536687631e-05, + "loss": 0.2058, + "step": 301 + }, + { + "epoch": 0.06339210747271201, + "grad_norm": 0.1764407902956009, + "learning_rate": 6.331236897274634e-05, + "loss": 0.1828, + "step": 302 + }, + { + "epoch": 0.06360201511335013, + "grad_norm": 0.136027991771698, + "learning_rate": 6.352201257861635e-05, + "loss": 0.1841, + "step": 303 + }, + { + "epoch": 0.06381192275398824, + "grad_norm": 0.26960527896881104, + "learning_rate": 6.373165618448638e-05, + "loss": 0.1806, + "step": 304 + }, + { + "epoch": 0.06402183039462636, + "grad_norm": 0.2371356189250946, + "learning_rate": 6.39412997903564e-05, + "loss": 0.1755, + "step": 305 + }, + { + "epoch": 0.06423173803526448, + "grad_norm": 0.16067345440387726, + "learning_rate": 6.415094339622641e-05, + "loss": 0.1899, + "step": 306 + }, + { + "epoch": 0.0644416456759026, + "grad_norm": 0.1733190417289734, + "learning_rate": 6.436058700209644e-05, + "loss": 0.1829, + "step": 307 + }, + { + "epoch": 0.06465155331654072, + "grad_norm": 0.19170600175857544, + "learning_rate": 6.457023060796647e-05, + "loss": 0.1777, + "step": 308 + }, + { + "epoch": 0.06486146095717885, + "grad_norm": 0.17290905117988586, + "learning_rate": 6.477987421383648e-05, + "loss": 0.187, + "step": 309 + }, + { + "epoch": 0.06507136859781697, + "grad_norm": 0.3149113059043884, + "learning_rate": 6.49895178197065e-05, + "loss": 0.2037, + "step": 310 + }, + { + "epoch": 0.06528127623845507, + "grad_norm": 0.15404744446277618, + "learning_rate": 6.519916142557653e-05, + "loss": 0.1909, + "step": 311 + }, + { + "epoch": 0.0654911838790932, + "grad_norm": 0.157347172498703, + "learning_rate": 6.540880503144654e-05, + "loss": 0.191, + "step": 312 + }, + { + "epoch": 0.06570109151973132, + "grad_norm": 0.23342733085155487, + "learning_rate": 6.561844863731657e-05, + "loss": 0.1867, + "step": 313 + }, + { + "epoch": 0.06591099916036944, + "grad_norm": 0.2597595155239105, + "learning_rate": 6.58280922431866e-05, + "loss": 0.1949, + "step": 314 + }, + { + "epoch": 0.06612090680100756, + "grad_norm": 0.23665842413902283, + "learning_rate": 6.60377358490566e-05, + "loss": 0.1898, + "step": 315 + }, + { + "epoch": 0.06633081444164568, + "grad_norm": 0.18460237979888916, + "learning_rate": 6.624737945492663e-05, + "loss": 0.1824, + "step": 316 + }, + { + "epoch": 0.0665407220822838, + "grad_norm": 0.1677280068397522, + "learning_rate": 6.645702306079666e-05, + "loss": 0.1871, + "step": 317 + }, + { + "epoch": 0.06675062972292191, + "grad_norm": 0.1769377440214157, + "learning_rate": 6.666666666666667e-05, + "loss": 0.1779, + "step": 318 + }, + { + "epoch": 0.06696053736356003, + "grad_norm": 0.1881011724472046, + "learning_rate": 6.68763102725367e-05, + "loss": 0.1711, + "step": 319 + }, + { + "epoch": 0.06717044500419815, + "grad_norm": 0.19110549986362457, + "learning_rate": 6.708595387840672e-05, + "loss": 0.1823, + "step": 320 + }, + { + "epoch": 0.06738035264483627, + "grad_norm": 0.26796162128448486, + "learning_rate": 6.729559748427673e-05, + "loss": 0.1863, + "step": 321 + }, + { + "epoch": 0.0675902602854744, + "grad_norm": 0.17290090024471283, + "learning_rate": 6.750524109014676e-05, + "loss": 0.1816, + "step": 322 + }, + { + "epoch": 0.06780016792611251, + "grad_norm": 0.2324109524488449, + "learning_rate": 6.771488469601677e-05, + "loss": 0.1944, + "step": 323 + }, + { + "epoch": 0.06801007556675064, + "grad_norm": 0.24944299459457397, + "learning_rate": 6.79245283018868e-05, + "loss": 0.1965, + "step": 324 + }, + { + "epoch": 0.06821998320738874, + "grad_norm": 0.2102229744195938, + "learning_rate": 6.813417190775681e-05, + "loss": 0.221, + "step": 325 + }, + { + "epoch": 0.06842989084802686, + "grad_norm": 0.22497773170471191, + "learning_rate": 6.834381551362684e-05, + "loss": 0.1818, + "step": 326 + }, + { + "epoch": 0.06863979848866499, + "grad_norm": 0.19047041237354279, + "learning_rate": 6.855345911949685e-05, + "loss": 0.1817, + "step": 327 + }, + { + "epoch": 0.0688497061293031, + "grad_norm": 0.19890040159225464, + "learning_rate": 6.876310272536687e-05, + "loss": 0.2054, + "step": 328 + }, + { + "epoch": 0.06905961376994123, + "grad_norm": 0.18274420499801636, + "learning_rate": 6.89727463312369e-05, + "loss": 0.1732, + "step": 329 + }, + { + "epoch": 0.06926952141057935, + "grad_norm": 0.20556879043579102, + "learning_rate": 6.918238993710691e-05, + "loss": 0.1773, + "step": 330 + }, + { + "epoch": 0.06947942905121747, + "grad_norm": 0.19632075726985931, + "learning_rate": 6.939203354297694e-05, + "loss": 0.1841, + "step": 331 + }, + { + "epoch": 0.06968933669185558, + "grad_norm": 0.25757917761802673, + "learning_rate": 6.960167714884696e-05, + "loss": 0.1861, + "step": 332 + }, + { + "epoch": 0.0698992443324937, + "grad_norm": 0.1654757410287857, + "learning_rate": 6.981132075471698e-05, + "loss": 0.1797, + "step": 333 + }, + { + "epoch": 0.07010915197313182, + "grad_norm": 0.1424175500869751, + "learning_rate": 7.0020964360587e-05, + "loss": 0.1659, + "step": 334 + }, + { + "epoch": 0.07031905961376994, + "grad_norm": 0.21559248864650726, + "learning_rate": 7.023060796645703e-05, + "loss": 0.175, + "step": 335 + }, + { + "epoch": 0.07052896725440806, + "grad_norm": 0.3100188076496124, + "learning_rate": 7.044025157232704e-05, + "loss": 0.1644, + "step": 336 + }, + { + "epoch": 0.07073887489504618, + "grad_norm": 0.2530849277973175, + "learning_rate": 7.064989517819707e-05, + "loss": 0.1697, + "step": 337 + }, + { + "epoch": 0.0709487825356843, + "grad_norm": 0.2111438810825348, + "learning_rate": 7.085953878406709e-05, + "loss": 0.202, + "step": 338 + }, + { + "epoch": 0.07115869017632241, + "grad_norm": 0.21798165142536163, + "learning_rate": 7.10691823899371e-05, + "loss": 0.1832, + "step": 339 + }, + { + "epoch": 0.07136859781696053, + "grad_norm": 0.24779516458511353, + "learning_rate": 7.127882599580713e-05, + "loss": 0.1934, + "step": 340 + }, + { + "epoch": 0.07157850545759865, + "grad_norm": 0.21718356013298035, + "learning_rate": 7.148846960167716e-05, + "loss": 0.1951, + "step": 341 + }, + { + "epoch": 0.07178841309823678, + "grad_norm": 0.22320568561553955, + "learning_rate": 7.169811320754717e-05, + "loss": 0.1619, + "step": 342 + }, + { + "epoch": 0.0719983207388749, + "grad_norm": 0.19393590092658997, + "learning_rate": 7.19077568134172e-05, + "loss": 0.201, + "step": 343 + }, + { + "epoch": 0.07220822837951302, + "grad_norm": 0.1626208871603012, + "learning_rate": 7.211740041928722e-05, + "loss": 0.1848, + "step": 344 + }, + { + "epoch": 0.07241813602015114, + "grad_norm": 0.2256711721420288, + "learning_rate": 7.232704402515723e-05, + "loss": 0.192, + "step": 345 + }, + { + "epoch": 0.07262804366078925, + "grad_norm": 0.2225414514541626, + "learning_rate": 7.253668763102726e-05, + "loss": 0.1907, + "step": 346 + }, + { + "epoch": 0.07283795130142737, + "grad_norm": 0.1786690205335617, + "learning_rate": 7.274633123689728e-05, + "loss": 0.1756, + "step": 347 + }, + { + "epoch": 0.07304785894206549, + "grad_norm": 0.2155577391386032, + "learning_rate": 7.29559748427673e-05, + "loss": 0.1739, + "step": 348 + }, + { + "epoch": 0.07325776658270361, + "grad_norm": 0.32533329725265503, + "learning_rate": 7.316561844863732e-05, + "loss": 0.1988, + "step": 349 + }, + { + "epoch": 0.07346767422334173, + "grad_norm": 0.1870083510875702, + "learning_rate": 7.337526205450735e-05, + "loss": 0.1615, + "step": 350 + }, + { + "epoch": 0.07367758186397985, + "grad_norm": 0.2160840779542923, + "learning_rate": 7.358490566037736e-05, + "loss": 0.1901, + "step": 351 + }, + { + "epoch": 0.07388748950461797, + "grad_norm": 0.19049416482448578, + "learning_rate": 7.379454926624739e-05, + "loss": 0.1763, + "step": 352 + }, + { + "epoch": 0.0740973971452561, + "grad_norm": 0.15733250975608826, + "learning_rate": 7.400419287211741e-05, + "loss": 0.1812, + "step": 353 + }, + { + "epoch": 0.0743073047858942, + "grad_norm": 0.22470858693122864, + "learning_rate": 7.421383647798742e-05, + "loss": 0.194, + "step": 354 + }, + { + "epoch": 0.07451721242653232, + "grad_norm": 0.3271860182285309, + "learning_rate": 7.442348008385745e-05, + "loss": 0.1708, + "step": 355 + }, + { + "epoch": 0.07472712006717044, + "grad_norm": 0.17839424312114716, + "learning_rate": 7.463312368972748e-05, + "loss": 0.182, + "step": 356 + }, + { + "epoch": 0.07493702770780857, + "grad_norm": 0.1907908171415329, + "learning_rate": 7.484276729559749e-05, + "loss": 0.163, + "step": 357 + }, + { + "epoch": 0.07514693534844669, + "grad_norm": 0.20342503488063812, + "learning_rate": 7.505241090146751e-05, + "loss": 0.2029, + "step": 358 + }, + { + "epoch": 0.07535684298908481, + "grad_norm": 0.21872438490390778, + "learning_rate": 7.526205450733753e-05, + "loss": 0.192, + "step": 359 + }, + { + "epoch": 0.07556675062972293, + "grad_norm": 0.22313977777957916, + "learning_rate": 7.547169811320755e-05, + "loss": 0.1823, + "step": 360 + }, + { + "epoch": 0.07577665827036104, + "grad_norm": 0.1931924819946289, + "learning_rate": 7.568134171907756e-05, + "loss": 0.1992, + "step": 361 + }, + { + "epoch": 0.07598656591099916, + "grad_norm": 0.2859954535961151, + "learning_rate": 7.589098532494759e-05, + "loss": 0.1684, + "step": 362 + }, + { + "epoch": 0.07619647355163728, + "grad_norm": 0.18601499497890472, + "learning_rate": 7.610062893081762e-05, + "loss": 0.1871, + "step": 363 + }, + { + "epoch": 0.0764063811922754, + "grad_norm": 0.26345667243003845, + "learning_rate": 7.631027253668763e-05, + "loss": 0.1883, + "step": 364 + }, + { + "epoch": 0.07661628883291352, + "grad_norm": 0.24455974996089935, + "learning_rate": 7.651991614255765e-05, + "loss": 0.1991, + "step": 365 + }, + { + "epoch": 0.07682619647355164, + "grad_norm": 0.1787412464618683, + "learning_rate": 7.672955974842768e-05, + "loss": 0.1895, + "step": 366 + }, + { + "epoch": 0.07703610411418976, + "grad_norm": 0.2711624205112457, + "learning_rate": 7.693920335429769e-05, + "loss": 0.1889, + "step": 367 + }, + { + "epoch": 0.07724601175482787, + "grad_norm": 0.2764052152633667, + "learning_rate": 7.714884696016772e-05, + "loss": 0.1895, + "step": 368 + }, + { + "epoch": 0.07745591939546599, + "grad_norm": 0.15490169823169708, + "learning_rate": 7.735849056603774e-05, + "loss": 0.1812, + "step": 369 + }, + { + "epoch": 0.07766582703610411, + "grad_norm": 0.18659183382987976, + "learning_rate": 7.756813417190776e-05, + "loss": 0.1871, + "step": 370 + }, + { + "epoch": 0.07787573467674223, + "grad_norm": 0.19188903272151947, + "learning_rate": 7.777777777777778e-05, + "loss": 0.1716, + "step": 371 + }, + { + "epoch": 0.07808564231738035, + "grad_norm": 0.22174161672592163, + "learning_rate": 7.798742138364781e-05, + "loss": 0.1795, + "step": 372 + }, + { + "epoch": 0.07829554995801848, + "grad_norm": 0.1882723569869995, + "learning_rate": 7.819706498951782e-05, + "loss": 0.1874, + "step": 373 + }, + { + "epoch": 0.0785054575986566, + "grad_norm": 0.1762145459651947, + "learning_rate": 7.840670859538785e-05, + "loss": 0.1852, + "step": 374 + }, + { + "epoch": 0.0787153652392947, + "grad_norm": 0.17931701242923737, + "learning_rate": 7.861635220125787e-05, + "loss": 0.1817, + "step": 375 + }, + { + "epoch": 0.07892527287993283, + "grad_norm": 0.1833990216255188, + "learning_rate": 7.882599580712788e-05, + "loss": 0.1852, + "step": 376 + }, + { + "epoch": 0.07913518052057095, + "grad_norm": 0.2758026421070099, + "learning_rate": 7.903563941299791e-05, + "loss": 0.1868, + "step": 377 + }, + { + "epoch": 0.07934508816120907, + "grad_norm": 0.24222204089164734, + "learning_rate": 7.924528301886794e-05, + "loss": 0.186, + "step": 378 + }, + { + "epoch": 0.07955499580184719, + "grad_norm": 0.17609156668186188, + "learning_rate": 7.945492662473795e-05, + "loss": 0.1807, + "step": 379 + }, + { + "epoch": 0.07976490344248531, + "grad_norm": 0.23695167899131775, + "learning_rate": 7.966457023060797e-05, + "loss": 0.1799, + "step": 380 + }, + { + "epoch": 0.07997481108312343, + "grad_norm": 0.25356245040893555, + "learning_rate": 7.9874213836478e-05, + "loss": 0.1899, + "step": 381 + }, + { + "epoch": 0.08018471872376154, + "grad_norm": 0.23144365847110748, + "learning_rate": 8.008385744234801e-05, + "loss": 0.192, + "step": 382 + }, + { + "epoch": 0.08039462636439966, + "grad_norm": 0.1521812379360199, + "learning_rate": 8.029350104821804e-05, + "loss": 0.1864, + "step": 383 + }, + { + "epoch": 0.08060453400503778, + "grad_norm": 0.16725748777389526, + "learning_rate": 8.050314465408806e-05, + "loss": 0.202, + "step": 384 + }, + { + "epoch": 0.0808144416456759, + "grad_norm": 0.21173058450222015, + "learning_rate": 8.071278825995808e-05, + "loss": 0.1751, + "step": 385 + }, + { + "epoch": 0.08102434928631402, + "grad_norm": 0.15676653385162354, + "learning_rate": 8.09224318658281e-05, + "loss": 0.1946, + "step": 386 + }, + { + "epoch": 0.08123425692695214, + "grad_norm": 0.21838362514972687, + "learning_rate": 8.113207547169813e-05, + "loss": 0.1889, + "step": 387 + }, + { + "epoch": 0.08144416456759027, + "grad_norm": 0.19586238265037537, + "learning_rate": 8.134171907756814e-05, + "loss": 0.1884, + "step": 388 + }, + { + "epoch": 0.08165407220822837, + "grad_norm": 0.21012739837169647, + "learning_rate": 8.155136268343817e-05, + "loss": 0.1822, + "step": 389 + }, + { + "epoch": 0.0818639798488665, + "grad_norm": 0.2092917114496231, + "learning_rate": 8.176100628930818e-05, + "loss": 0.1783, + "step": 390 + }, + { + "epoch": 0.08207388748950462, + "grad_norm": 0.3745954930782318, + "learning_rate": 8.19706498951782e-05, + "loss": 0.1976, + "step": 391 + }, + { + "epoch": 0.08228379513014274, + "grad_norm": 0.2579379081726074, + "learning_rate": 8.218029350104822e-05, + "loss": 0.197, + "step": 392 + }, + { + "epoch": 0.08249370277078086, + "grad_norm": 0.18806852400302887, + "learning_rate": 8.238993710691824e-05, + "loss": 0.1873, + "step": 393 + }, + { + "epoch": 0.08270361041141898, + "grad_norm": 0.24592849612236023, + "learning_rate": 8.259958071278825e-05, + "loss": 0.2083, + "step": 394 + }, + { + "epoch": 0.0829135180520571, + "grad_norm": 0.2678208649158478, + "learning_rate": 8.280922431865828e-05, + "loss": 0.1878, + "step": 395 + }, + { + "epoch": 0.08312342569269521, + "grad_norm": 0.2023075520992279, + "learning_rate": 8.30188679245283e-05, + "loss": 0.1861, + "step": 396 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.2390558272600174, + "learning_rate": 8.322851153039832e-05, + "loss": 0.1802, + "step": 397 + }, + { + "epoch": 0.08354324097397145, + "grad_norm": 0.175222247838974, + "learning_rate": 8.343815513626834e-05, + "loss": 0.1674, + "step": 398 + }, + { + "epoch": 0.08375314861460957, + "grad_norm": 0.2340380698442459, + "learning_rate": 8.364779874213837e-05, + "loss": 0.17, + "step": 399 + }, + { + "epoch": 0.08396305625524769, + "grad_norm": 0.20814655721187592, + "learning_rate": 8.385744234800838e-05, + "loss": 0.1513, + "step": 400 + }, + { + "epoch": 0.08417296389588581, + "grad_norm": 0.34867948293685913, + "learning_rate": 8.406708595387841e-05, + "loss": 0.1644, + "step": 401 + }, + { + "epoch": 0.08438287153652393, + "grad_norm": 0.21573619544506073, + "learning_rate": 8.427672955974843e-05, + "loss": 0.1785, + "step": 402 + }, + { + "epoch": 0.08459277917716204, + "grad_norm": 0.21437713503837585, + "learning_rate": 8.448637316561845e-05, + "loss": 0.1523, + "step": 403 + }, + { + "epoch": 0.08480268681780016, + "grad_norm": 0.2250152826309204, + "learning_rate": 8.469601677148847e-05, + "loss": 0.1728, + "step": 404 + }, + { + "epoch": 0.08501259445843828, + "grad_norm": 0.2514733672142029, + "learning_rate": 8.49056603773585e-05, + "loss": 0.1755, + "step": 405 + }, + { + "epoch": 0.0852225020990764, + "grad_norm": 0.16003377735614777, + "learning_rate": 8.511530398322851e-05, + "loss": 0.1935, + "step": 406 + }, + { + "epoch": 0.08543240973971453, + "grad_norm": 0.1792365163564682, + "learning_rate": 8.532494758909854e-05, + "loss": 0.1991, + "step": 407 + }, + { + "epoch": 0.08564231738035265, + "grad_norm": 0.16854703426361084, + "learning_rate": 8.553459119496856e-05, + "loss": 0.1904, + "step": 408 + }, + { + "epoch": 0.08585222502099077, + "grad_norm": 0.19401603937149048, + "learning_rate": 8.574423480083857e-05, + "loss": 0.1681, + "step": 409 + }, + { + "epoch": 0.08606213266162889, + "grad_norm": 0.15639828145503998, + "learning_rate": 8.59538784067086e-05, + "loss": 0.1628, + "step": 410 + }, + { + "epoch": 0.086272040302267, + "grad_norm": 0.15357258915901184, + "learning_rate": 8.616352201257863e-05, + "loss": 0.1905, + "step": 411 + }, + { + "epoch": 0.08648194794290512, + "grad_norm": 0.256944477558136, + "learning_rate": 8.637316561844864e-05, + "loss": 0.2015, + "step": 412 + }, + { + "epoch": 0.08669185558354324, + "grad_norm": 0.13482192158699036, + "learning_rate": 8.658280922431866e-05, + "loss": 0.1718, + "step": 413 + }, + { + "epoch": 0.08690176322418136, + "grad_norm": 0.16525831818580627, + "learning_rate": 8.679245283018869e-05, + "loss": 0.1778, + "step": 414 + }, + { + "epoch": 0.08711167086481948, + "grad_norm": 0.2145531326532364, + "learning_rate": 8.70020964360587e-05, + "loss": 0.1714, + "step": 415 + }, + { + "epoch": 0.0873215785054576, + "grad_norm": 0.20481255650520325, + "learning_rate": 8.721174004192873e-05, + "loss": 0.1832, + "step": 416 + }, + { + "epoch": 0.08753148614609572, + "grad_norm": 0.14865756034851074, + "learning_rate": 8.742138364779875e-05, + "loss": 0.1659, + "step": 417 + }, + { + "epoch": 0.08774139378673383, + "grad_norm": 0.14713706076145172, + "learning_rate": 8.763102725366877e-05, + "loss": 0.1674, + "step": 418 + }, + { + "epoch": 0.08795130142737195, + "grad_norm": 0.1728464812040329, + "learning_rate": 8.784067085953879e-05, + "loss": 0.1727, + "step": 419 + }, + { + "epoch": 0.08816120906801007, + "grad_norm": 0.2646033465862274, + "learning_rate": 8.805031446540882e-05, + "loss": 0.1973, + "step": 420 + }, + { + "epoch": 0.0883711167086482, + "grad_norm": 0.2262433022260666, + "learning_rate": 8.825995807127883e-05, + "loss": 0.1817, + "step": 421 + }, + { + "epoch": 0.08858102434928632, + "grad_norm": 0.16398945450782776, + "learning_rate": 8.846960167714886e-05, + "loss": 0.1628, + "step": 422 + }, + { + "epoch": 0.08879093198992444, + "grad_norm": 0.15976634621620178, + "learning_rate": 8.867924528301888e-05, + "loss": 0.1678, + "step": 423 + }, + { + "epoch": 0.08900083963056256, + "grad_norm": 0.19314904510974884, + "learning_rate": 8.888888888888889e-05, + "loss": 0.1753, + "step": 424 + }, + { + "epoch": 0.08921074727120067, + "grad_norm": 0.21701818704605103, + "learning_rate": 8.909853249475892e-05, + "loss": 0.19, + "step": 425 + }, + { + "epoch": 0.08942065491183879, + "grad_norm": 0.22768795490264893, + "learning_rate": 8.930817610062893e-05, + "loss": 0.1613, + "step": 426 + }, + { + "epoch": 0.08963056255247691, + "grad_norm": 0.13172288239002228, + "learning_rate": 8.951781970649896e-05, + "loss": 0.1749, + "step": 427 + }, + { + "epoch": 0.08984047019311503, + "grad_norm": 0.2015480250120163, + "learning_rate": 8.972746331236897e-05, + "loss": 0.1857, + "step": 428 + }, + { + "epoch": 0.09005037783375315, + "grad_norm": 0.20239531993865967, + "learning_rate": 8.9937106918239e-05, + "loss": 0.1804, + "step": 429 + }, + { + "epoch": 0.09026028547439127, + "grad_norm": 0.17528317868709564, + "learning_rate": 9.014675052410901e-05, + "loss": 0.1729, + "step": 430 + }, + { + "epoch": 0.0904701931150294, + "grad_norm": 0.16663801670074463, + "learning_rate": 9.035639412997903e-05, + "loss": 0.1896, + "step": 431 + }, + { + "epoch": 0.0906801007556675, + "grad_norm": 0.18777558207511902, + "learning_rate": 9.056603773584906e-05, + "loss": 0.1801, + "step": 432 + }, + { + "epoch": 0.09089000839630562, + "grad_norm": 0.1440989226102829, + "learning_rate": 9.077568134171907e-05, + "loss": 0.179, + "step": 433 + }, + { + "epoch": 0.09109991603694374, + "grad_norm": 0.1786854863166809, + "learning_rate": 9.09853249475891e-05, + "loss": 0.1799, + "step": 434 + }, + { + "epoch": 0.09130982367758186, + "grad_norm": 0.20794442296028137, + "learning_rate": 9.119496855345912e-05, + "loss": 0.1731, + "step": 435 + }, + { + "epoch": 0.09151973131821999, + "grad_norm": 0.16377133131027222, + "learning_rate": 9.140461215932914e-05, + "loss": 0.1815, + "step": 436 + }, + { + "epoch": 0.0917296389588581, + "grad_norm": 0.174666628241539, + "learning_rate": 9.161425576519916e-05, + "loss": 0.1798, + "step": 437 + }, + { + "epoch": 0.09193954659949623, + "grad_norm": 0.2127188891172409, + "learning_rate": 9.182389937106919e-05, + "loss": 0.1805, + "step": 438 + }, + { + "epoch": 0.09214945424013433, + "grad_norm": 0.1936446726322174, + "learning_rate": 9.203354297693921e-05, + "loss": 0.1723, + "step": 439 + }, + { + "epoch": 0.09235936188077246, + "grad_norm": 0.18736332654953003, + "learning_rate": 9.224318658280923e-05, + "loss": 0.1849, + "step": 440 + }, + { + "epoch": 0.09256926952141058, + "grad_norm": 0.16817238926887512, + "learning_rate": 9.245283018867925e-05, + "loss": 0.1746, + "step": 441 + }, + { + "epoch": 0.0927791771620487, + "grad_norm": 0.18249107897281647, + "learning_rate": 9.266247379454928e-05, + "loss": 0.1628, + "step": 442 + }, + { + "epoch": 0.09298908480268682, + "grad_norm": 0.1728898137807846, + "learning_rate": 9.287211740041929e-05, + "loss": 0.203, + "step": 443 + }, + { + "epoch": 0.09319899244332494, + "grad_norm": 0.16144797205924988, + "learning_rate": 9.308176100628931e-05, + "loss": 0.1843, + "step": 444 + }, + { + "epoch": 0.09340890008396306, + "grad_norm": 0.19680747389793396, + "learning_rate": 9.329140461215934e-05, + "loss": 0.168, + "step": 445 + }, + { + "epoch": 0.09361880772460117, + "grad_norm": 0.16198395192623138, + "learning_rate": 9.350104821802935e-05, + "loss": 0.1598, + "step": 446 + }, + { + "epoch": 0.09382871536523929, + "grad_norm": 0.17398878931999207, + "learning_rate": 9.371069182389938e-05, + "loss": 0.17, + "step": 447 + }, + { + "epoch": 0.09403862300587741, + "grad_norm": 0.18602675199508667, + "learning_rate": 9.39203354297694e-05, + "loss": 0.1716, + "step": 448 + }, + { + "epoch": 0.09424853064651553, + "grad_norm": 0.18403322994709015, + "learning_rate": 9.412997903563942e-05, + "loss": 0.1948, + "step": 449 + }, + { + "epoch": 0.09445843828715365, + "grad_norm": 0.18783587217330933, + "learning_rate": 9.433962264150944e-05, + "loss": 0.1812, + "step": 450 + }, + { + "epoch": 0.09466834592779177, + "grad_norm": 0.20252300798892975, + "learning_rate": 9.454926624737947e-05, + "loss": 0.1666, + "step": 451 + }, + { + "epoch": 0.0948782535684299, + "grad_norm": 0.20129899680614471, + "learning_rate": 9.475890985324948e-05, + "loss": 0.1695, + "step": 452 + }, + { + "epoch": 0.095088161209068, + "grad_norm": 0.17035968601703644, + "learning_rate": 9.496855345911951e-05, + "loss": 0.1678, + "step": 453 + }, + { + "epoch": 0.09529806884970612, + "grad_norm": 0.20403030514717102, + "learning_rate": 9.517819706498953e-05, + "loss": 0.1843, + "step": 454 + }, + { + "epoch": 0.09550797649034425, + "grad_norm": 0.17489562928676605, + "learning_rate": 9.538784067085954e-05, + "loss": 0.1657, + "step": 455 + }, + { + "epoch": 0.09571788413098237, + "grad_norm": 0.19699983298778534, + "learning_rate": 9.559748427672957e-05, + "loss": 0.1634, + "step": 456 + }, + { + "epoch": 0.09592779177162049, + "grad_norm": 0.17113354802131653, + "learning_rate": 9.58071278825996e-05, + "loss": 0.1879, + "step": 457 + }, + { + "epoch": 0.09613769941225861, + "grad_norm": 0.2290397435426712, + "learning_rate": 9.601677148846961e-05, + "loss": 0.1784, + "step": 458 + }, + { + "epoch": 0.09634760705289673, + "grad_norm": 0.2173147350549698, + "learning_rate": 9.622641509433963e-05, + "loss": 0.1673, + "step": 459 + }, + { + "epoch": 0.09655751469353484, + "grad_norm": 0.18280835449695587, + "learning_rate": 9.643605870020965e-05, + "loss": 0.1806, + "step": 460 + }, + { + "epoch": 0.09676742233417296, + "grad_norm": 0.14638672769069672, + "learning_rate": 9.664570230607967e-05, + "loss": 0.1833, + "step": 461 + }, + { + "epoch": 0.09697732997481108, + "grad_norm": 0.16228064894676208, + "learning_rate": 9.685534591194969e-05, + "loss": 0.1762, + "step": 462 + }, + { + "epoch": 0.0971872376154492, + "grad_norm": 0.1836690902709961, + "learning_rate": 9.706498951781971e-05, + "loss": 0.1782, + "step": 463 + }, + { + "epoch": 0.09739714525608732, + "grad_norm": 0.19470515847206116, + "learning_rate": 9.727463312368972e-05, + "loss": 0.185, + "step": 464 + }, + { + "epoch": 0.09760705289672544, + "grad_norm": 0.1833791732788086, + "learning_rate": 9.748427672955975e-05, + "loss": 0.1753, + "step": 465 + }, + { + "epoch": 0.09781696053736356, + "grad_norm": 0.22608265280723572, + "learning_rate": 9.769392033542977e-05, + "loss": 0.1792, + "step": 466 + }, + { + "epoch": 0.09802686817800169, + "grad_norm": 0.16552825272083282, + "learning_rate": 9.790356394129979e-05, + "loss": 0.1808, + "step": 467 + }, + { + "epoch": 0.0982367758186398, + "grad_norm": 0.2294851690530777, + "learning_rate": 9.811320754716981e-05, + "loss": 0.1935, + "step": 468 + }, + { + "epoch": 0.09844668345927791, + "grad_norm": 0.26589101552963257, + "learning_rate": 9.832285115303984e-05, + "loss": 0.1676, + "step": 469 + }, + { + "epoch": 0.09865659109991604, + "grad_norm": 0.22315791249275208, + "learning_rate": 9.853249475890985e-05, + "loss": 0.1806, + "step": 470 + }, + { + "epoch": 0.09886649874055416, + "grad_norm": 0.16855137050151825, + "learning_rate": 9.874213836477988e-05, + "loss": 0.1816, + "step": 471 + }, + { + "epoch": 0.09907640638119228, + "grad_norm": 0.19197392463684082, + "learning_rate": 9.89517819706499e-05, + "loss": 0.1923, + "step": 472 + }, + { + "epoch": 0.0992863140218304, + "grad_norm": 0.18722014129161835, + "learning_rate": 9.916142557651992e-05, + "loss": 0.1823, + "step": 473 + }, + { + "epoch": 0.09949622166246852, + "grad_norm": 0.15668706595897675, + "learning_rate": 9.937106918238994e-05, + "loss": 0.1896, + "step": 474 + }, + { + "epoch": 0.09970612930310663, + "grad_norm": 0.17297013103961945, + "learning_rate": 9.958071278825997e-05, + "loss": 0.1907, + "step": 475 + }, + { + "epoch": 0.09991603694374475, + "grad_norm": 0.23546694219112396, + "learning_rate": 9.979035639412998e-05, + "loss": 0.1647, + "step": 476 + }, + { + "epoch": 0.10012594458438287, + "grad_norm": 0.1627054661512375, + "learning_rate": 0.0001, + "loss": 0.1686, + "step": 477 + }, + { + "epoch": 0.10033585222502099, + "grad_norm": 0.21043647825717926, + "learning_rate": 9.999998657442895e-05, + "loss": 0.1865, + "step": 478 + }, + { + "epoch": 0.10054575986565911, + "grad_norm": 0.19615764915943146, + "learning_rate": 9.999994629772298e-05, + "loss": 0.1683, + "step": 479 + }, + { + "epoch": 0.10075566750629723, + "grad_norm": 0.2884671092033386, + "learning_rate": 9.999987916990372e-05, + "loss": 0.1858, + "step": 480 + }, + { + "epoch": 0.10096557514693535, + "grad_norm": 0.2381323128938675, + "learning_rate": 9.999978519100723e-05, + "loss": 0.1879, + "step": 481 + }, + { + "epoch": 0.10117548278757346, + "grad_norm": 0.19187557697296143, + "learning_rate": 9.999966436108398e-05, + "loss": 0.1808, + "step": 482 + }, + { + "epoch": 0.10138539042821158, + "grad_norm": 0.19443491101264954, + "learning_rate": 9.999951668019887e-05, + "loss": 0.1659, + "step": 483 + }, + { + "epoch": 0.1015952980688497, + "grad_norm": 0.20151716470718384, + "learning_rate": 9.999934214843116e-05, + "loss": 0.1679, + "step": 484 + }, + { + "epoch": 0.10180520570948783, + "grad_norm": 0.2896507978439331, + "learning_rate": 9.999914076587464e-05, + "loss": 0.1734, + "step": 485 + }, + { + "epoch": 0.10201511335012595, + "grad_norm": 0.21598441898822784, + "learning_rate": 9.999891253263741e-05, + "loss": 0.1779, + "step": 486 + }, + { + "epoch": 0.10222502099076407, + "grad_norm": 0.196011021733284, + "learning_rate": 9.999865744884207e-05, + "loss": 0.1815, + "step": 487 + }, + { + "epoch": 0.10243492863140219, + "grad_norm": 0.20962318778038025, + "learning_rate": 9.999837551462558e-05, + "loss": 0.1727, + "step": 488 + }, + { + "epoch": 0.1026448362720403, + "grad_norm": 0.18339572846889496, + "learning_rate": 9.999806673013935e-05, + "loss": 0.1689, + "step": 489 + }, + { + "epoch": 0.10285474391267842, + "grad_norm": 0.20994813740253448, + "learning_rate": 9.999773109554922e-05, + "loss": 0.1768, + "step": 490 + }, + { + "epoch": 0.10306465155331654, + "grad_norm": 0.163935124874115, + "learning_rate": 9.999736861103541e-05, + "loss": 0.1777, + "step": 491 + }, + { + "epoch": 0.10327455919395466, + "grad_norm": 0.1465967446565628, + "learning_rate": 9.99969792767926e-05, + "loss": 0.1846, + "step": 492 + }, + { + "epoch": 0.10348446683459278, + "grad_norm": 0.21443922817707062, + "learning_rate": 9.999656309302987e-05, + "loss": 0.1807, + "step": 493 + }, + { + "epoch": 0.1036943744752309, + "grad_norm": 0.18504248559474945, + "learning_rate": 9.999612005997071e-05, + "loss": 0.1548, + "step": 494 + }, + { + "epoch": 0.10390428211586902, + "grad_norm": 0.15490441024303436, + "learning_rate": 9.999565017785305e-05, + "loss": 0.1696, + "step": 495 + }, + { + "epoch": 0.10411418975650713, + "grad_norm": 0.1881389319896698, + "learning_rate": 9.999515344692923e-05, + "loss": 0.1852, + "step": 496 + }, + { + "epoch": 0.10432409739714525, + "grad_norm": 0.16337451338768005, + "learning_rate": 9.999462986746598e-05, + "loss": 0.1834, + "step": 497 + }, + { + "epoch": 0.10453400503778337, + "grad_norm": 0.16641898453235626, + "learning_rate": 9.99940794397445e-05, + "loss": 0.187, + "step": 498 + }, + { + "epoch": 0.1047439126784215, + "grad_norm": 0.15948446094989777, + "learning_rate": 9.999350216406038e-05, + "loss": 0.1835, + "step": 499 + }, + { + "epoch": 0.10495382031905962, + "grad_norm": 0.1550200879573822, + "learning_rate": 9.999289804072363e-05, + "loss": 0.1706, + "step": 500 + }, + { + "epoch": 0.10516372795969774, + "grad_norm": 0.16597698628902435, + "learning_rate": 9.999226707005867e-05, + "loss": 0.1811, + "step": 501 + }, + { + "epoch": 0.10537363560033586, + "grad_norm": 0.17551501095294952, + "learning_rate": 9.999160925240434e-05, + "loss": 0.1677, + "step": 502 + }, + { + "epoch": 0.10558354324097396, + "grad_norm": 0.15515847504138947, + "learning_rate": 9.999092458811393e-05, + "loss": 0.1789, + "step": 503 + }, + { + "epoch": 0.10579345088161209, + "grad_norm": 0.18121638894081116, + "learning_rate": 9.99902130775551e-05, + "loss": 0.1684, + "step": 504 + }, + { + "epoch": 0.10600335852225021, + "grad_norm": 0.1853945255279541, + "learning_rate": 9.998947472110994e-05, + "loss": 0.1907, + "step": 505 + }, + { + "epoch": 0.10621326616288833, + "grad_norm": 0.21303139626979828, + "learning_rate": 9.998870951917496e-05, + "loss": 0.1712, + "step": 506 + }, + { + "epoch": 0.10642317380352645, + "grad_norm": 0.16773764789104462, + "learning_rate": 9.998791747216113e-05, + "loss": 0.1756, + "step": 507 + }, + { + "epoch": 0.10663308144416457, + "grad_norm": 0.18033501505851746, + "learning_rate": 9.998709858049376e-05, + "loss": 0.1654, + "step": 508 + }, + { + "epoch": 0.10684298908480269, + "grad_norm": 0.14199328422546387, + "learning_rate": 9.998625284461263e-05, + "loss": 0.1587, + "step": 509 + }, + { + "epoch": 0.1070528967254408, + "grad_norm": 0.19968685507774353, + "learning_rate": 9.998538026497192e-05, + "loss": 0.1796, + "step": 510 + }, + { + "epoch": 0.10726280436607892, + "grad_norm": 0.17311611771583557, + "learning_rate": 9.998448084204021e-05, + "loss": 0.1864, + "step": 511 + }, + { + "epoch": 0.10747271200671704, + "grad_norm": 0.20124119520187378, + "learning_rate": 9.998355457630053e-05, + "loss": 0.1829, + "step": 512 + }, + { + "epoch": 0.10768261964735516, + "grad_norm": 0.12473297864198685, + "learning_rate": 9.998260146825029e-05, + "loss": 0.175, + "step": 513 + }, + { + "epoch": 0.10789252728799328, + "grad_norm": 0.1696644425392151, + "learning_rate": 9.998162151840135e-05, + "loss": 0.1762, + "step": 514 + }, + { + "epoch": 0.1081024349286314, + "grad_norm": 0.1781477928161621, + "learning_rate": 9.998061472727996e-05, + "loss": 0.1679, + "step": 515 + }, + { + "epoch": 0.10831234256926953, + "grad_norm": 0.19112960994243622, + "learning_rate": 9.997958109542675e-05, + "loss": 0.1553, + "step": 516 + }, + { + "epoch": 0.10852225020990765, + "grad_norm": 0.1417030394077301, + "learning_rate": 9.997852062339685e-05, + "loss": 0.1737, + "step": 517 + }, + { + "epoch": 0.10873215785054575, + "grad_norm": 0.15080858767032623, + "learning_rate": 9.997743331175976e-05, + "loss": 0.1595, + "step": 518 + }, + { + "epoch": 0.10894206549118388, + "grad_norm": 0.2046668380498886, + "learning_rate": 9.997631916109937e-05, + "loss": 0.1839, + "step": 519 + }, + { + "epoch": 0.109151973131822, + "grad_norm": 0.19941595196723938, + "learning_rate": 9.997517817201401e-05, + "loss": 0.1718, + "step": 520 + }, + { + "epoch": 0.10936188077246012, + "grad_norm": 0.15989692509174347, + "learning_rate": 9.997401034511642e-05, + "loss": 0.1613, + "step": 521 + }, + { + "epoch": 0.10957178841309824, + "grad_norm": 0.1697997897863388, + "learning_rate": 9.997281568103374e-05, + "loss": 0.1603, + "step": 522 + }, + { + "epoch": 0.10978169605373636, + "grad_norm": 0.1840822696685791, + "learning_rate": 9.997159418040754e-05, + "loss": 0.1735, + "step": 523 + }, + { + "epoch": 0.10999160369437448, + "grad_norm": 0.20991730690002441, + "learning_rate": 9.99703458438938e-05, + "loss": 0.154, + "step": 524 + }, + { + "epoch": 0.11020151133501259, + "grad_norm": 0.16802968084812164, + "learning_rate": 9.99690706721629e-05, + "loss": 0.1761, + "step": 525 + }, + { + "epoch": 0.11041141897565071, + "grad_norm": 0.18329255282878876, + "learning_rate": 9.996776866589962e-05, + "loss": 0.1609, + "step": 526 + }, + { + "epoch": 0.11062132661628883, + "grad_norm": 0.18645748496055603, + "learning_rate": 9.996643982580318e-05, + "loss": 0.1793, + "step": 527 + }, + { + "epoch": 0.11083123425692695, + "grad_norm": 0.1966720074415207, + "learning_rate": 9.996508415258722e-05, + "loss": 0.1714, + "step": 528 + }, + { + "epoch": 0.11104114189756507, + "grad_norm": 0.18155452609062195, + "learning_rate": 9.996370164697974e-05, + "loss": 0.1673, + "step": 529 + }, + { + "epoch": 0.1112510495382032, + "grad_norm": 0.2004195600748062, + "learning_rate": 9.996229230972317e-05, + "loss": 0.1865, + "step": 530 + }, + { + "epoch": 0.11146095717884132, + "grad_norm": 0.15521694719791412, + "learning_rate": 9.996085614157438e-05, + "loss": 0.1757, + "step": 531 + }, + { + "epoch": 0.11167086481947942, + "grad_norm": 0.1686578243970871, + "learning_rate": 9.995939314330462e-05, + "loss": 0.1768, + "step": 532 + }, + { + "epoch": 0.11188077246011754, + "grad_norm": 0.20034368336200714, + "learning_rate": 9.995790331569954e-05, + "loss": 0.1823, + "step": 533 + }, + { + "epoch": 0.11209068010075567, + "grad_norm": 0.1494702696800232, + "learning_rate": 9.995638665955922e-05, + "loss": 0.175, + "step": 534 + }, + { + "epoch": 0.11230058774139379, + "grad_norm": 0.16365233063697815, + "learning_rate": 9.995484317569814e-05, + "loss": 0.1716, + "step": 535 + }, + { + "epoch": 0.11251049538203191, + "grad_norm": 0.19227434694766998, + "learning_rate": 9.995327286494521e-05, + "loss": 0.1605, + "step": 536 + }, + { + "epoch": 0.11272040302267003, + "grad_norm": 0.21946166455745697, + "learning_rate": 9.995167572814365e-05, + "loss": 0.182, + "step": 537 + }, + { + "epoch": 0.11293031066330815, + "grad_norm": 0.2211793065071106, + "learning_rate": 9.995005176615124e-05, + "loss": 0.1783, + "step": 538 + }, + { + "epoch": 0.11314021830394626, + "grad_norm": 0.2154102325439453, + "learning_rate": 9.994840097984006e-05, + "loss": 0.1888, + "step": 539 + }, + { + "epoch": 0.11335012594458438, + "grad_norm": 0.20600587129592896, + "learning_rate": 9.994672337009658e-05, + "loss": 0.1871, + "step": 540 + }, + { + "epoch": 0.1135600335852225, + "grad_norm": 0.22028079628944397, + "learning_rate": 9.994501893782176e-05, + "loss": 0.1855, + "step": 541 + }, + { + "epoch": 0.11376994122586062, + "grad_norm": 0.23957398533821106, + "learning_rate": 9.99432876839309e-05, + "loss": 0.1616, + "step": 542 + }, + { + "epoch": 0.11397984886649874, + "grad_norm": 0.14516577124595642, + "learning_rate": 9.994152960935375e-05, + "loss": 0.1864, + "step": 543 + }, + { + "epoch": 0.11418975650713686, + "grad_norm": 0.14327426254749298, + "learning_rate": 9.99397447150344e-05, + "loss": 0.1596, + "step": 544 + }, + { + "epoch": 0.11439966414777498, + "grad_norm": 0.15387804806232452, + "learning_rate": 9.99379330019314e-05, + "loss": 0.1638, + "step": 545 + }, + { + "epoch": 0.11460957178841309, + "grad_norm": 0.14998720586299896, + "learning_rate": 9.993609447101767e-05, + "loss": 0.1456, + "step": 546 + }, + { + "epoch": 0.11481947942905121, + "grad_norm": 0.21649526059627533, + "learning_rate": 9.993422912328054e-05, + "loss": 0.1656, + "step": 547 + }, + { + "epoch": 0.11502938706968933, + "grad_norm": 0.19370022416114807, + "learning_rate": 9.993233695972175e-05, + "loss": 0.173, + "step": 548 + }, + { + "epoch": 0.11523929471032746, + "grad_norm": 0.20101432502269745, + "learning_rate": 9.993041798135745e-05, + "loss": 0.1914, + "step": 549 + }, + { + "epoch": 0.11544920235096558, + "grad_norm": 0.18047718703746796, + "learning_rate": 9.992847218921816e-05, + "loss": 0.2084, + "step": 550 + }, + { + "epoch": 0.1156591099916037, + "grad_norm": 0.18755191564559937, + "learning_rate": 9.99264995843488e-05, + "loss": 0.1646, + "step": 551 + }, + { + "epoch": 0.11586901763224182, + "grad_norm": 0.2605830430984497, + "learning_rate": 9.992450016780876e-05, + "loss": 0.1767, + "step": 552 + }, + { + "epoch": 0.11607892527287993, + "grad_norm": 0.1701487898826599, + "learning_rate": 9.99224739406717e-05, + "loss": 0.173, + "step": 553 + }, + { + "epoch": 0.11628883291351805, + "grad_norm": 0.19497598707675934, + "learning_rate": 9.99204209040258e-05, + "loss": 0.1857, + "step": 554 + }, + { + "epoch": 0.11649874055415617, + "grad_norm": 0.16379636526107788, + "learning_rate": 9.991834105897356e-05, + "loss": 0.154, + "step": 555 + }, + { + "epoch": 0.11670864819479429, + "grad_norm": 0.18630164861679077, + "learning_rate": 9.991623440663192e-05, + "loss": 0.1871, + "step": 556 + }, + { + "epoch": 0.11691855583543241, + "grad_norm": 0.219542995095253, + "learning_rate": 9.991410094813221e-05, + "loss": 0.1877, + "step": 557 + }, + { + "epoch": 0.11712846347607053, + "grad_norm": 0.1855912059545517, + "learning_rate": 9.991194068462011e-05, + "loss": 0.1771, + "step": 558 + }, + { + "epoch": 0.11733837111670865, + "grad_norm": 0.15919265151023865, + "learning_rate": 9.990975361725577e-05, + "loss": 0.1684, + "step": 559 + }, + { + "epoch": 0.11754827875734676, + "grad_norm": 0.14570386707782745, + "learning_rate": 9.990753974721366e-05, + "loss": 0.1656, + "step": 560 + }, + { + "epoch": 0.11775818639798488, + "grad_norm": 0.14153516292572021, + "learning_rate": 9.990529907568272e-05, + "loss": 0.1642, + "step": 561 + }, + { + "epoch": 0.117968094038623, + "grad_norm": 0.14015786349773407, + "learning_rate": 9.99030316038662e-05, + "loss": 0.1704, + "step": 562 + }, + { + "epoch": 0.11817800167926112, + "grad_norm": 0.2615254819393158, + "learning_rate": 9.99007373329818e-05, + "loss": 0.1601, + "step": 563 + }, + { + "epoch": 0.11838790931989925, + "grad_norm": 0.2077956199645996, + "learning_rate": 9.989841626426162e-05, + "loss": 0.1715, + "step": 564 + }, + { + "epoch": 0.11859781696053737, + "grad_norm": 0.1743435561656952, + "learning_rate": 9.989606839895208e-05, + "loss": 0.1725, + "step": 565 + }, + { + "epoch": 0.11880772460117549, + "grad_norm": 0.14333437383174896, + "learning_rate": 9.989369373831407e-05, + "loss": 0.1731, + "step": 566 + }, + { + "epoch": 0.1190176322418136, + "grad_norm": 0.16472546756267548, + "learning_rate": 9.989129228362284e-05, + "loss": 0.1641, + "step": 567 + }, + { + "epoch": 0.11922753988245172, + "grad_norm": 0.19595298171043396, + "learning_rate": 9.988886403616802e-05, + "loss": 0.1789, + "step": 568 + }, + { + "epoch": 0.11943744752308984, + "grad_norm": 0.15337203443050385, + "learning_rate": 9.988640899725361e-05, + "loss": 0.166, + "step": 569 + }, + { + "epoch": 0.11964735516372796, + "grad_norm": 0.13052469491958618, + "learning_rate": 9.988392716819806e-05, + "loss": 0.1762, + "step": 570 + }, + { + "epoch": 0.11985726280436608, + "grad_norm": 0.17160564661026, + "learning_rate": 9.988141855033415e-05, + "loss": 0.1742, + "step": 571 + }, + { + "epoch": 0.1200671704450042, + "grad_norm": 0.15600250661373138, + "learning_rate": 9.987888314500906e-05, + "loss": 0.1819, + "step": 572 + }, + { + "epoch": 0.12027707808564232, + "grad_norm": 0.17436926066875458, + "learning_rate": 9.987632095358437e-05, + "loss": 0.154, + "step": 573 + }, + { + "epoch": 0.12048698572628044, + "grad_norm": 0.1583249568939209, + "learning_rate": 9.987373197743603e-05, + "loss": 0.1812, + "step": 574 + }, + { + "epoch": 0.12069689336691855, + "grad_norm": 0.14877773821353912, + "learning_rate": 9.987111621795437e-05, + "loss": 0.1591, + "step": 575 + }, + { + "epoch": 0.12090680100755667, + "grad_norm": 0.13348308205604553, + "learning_rate": 9.986847367654414e-05, + "loss": 0.164, + "step": 576 + }, + { + "epoch": 0.12111670864819479, + "grad_norm": 0.16104738414287567, + "learning_rate": 9.986580435462443e-05, + "loss": 0.1568, + "step": 577 + }, + { + "epoch": 0.12132661628883291, + "grad_norm": 0.1393367350101471, + "learning_rate": 9.98631082536287e-05, + "loss": 0.1504, + "step": 578 + }, + { + "epoch": 0.12153652392947104, + "grad_norm": 0.17166094481945038, + "learning_rate": 9.986038537500488e-05, + "loss": 0.1746, + "step": 579 + }, + { + "epoch": 0.12174643157010916, + "grad_norm": 0.19518902897834778, + "learning_rate": 9.985763572021516e-05, + "loss": 0.1961, + "step": 580 + }, + { + "epoch": 0.12195633921074728, + "grad_norm": 0.17867891490459442, + "learning_rate": 9.985485929073619e-05, + "loss": 0.1836, + "step": 581 + }, + { + "epoch": 0.12216624685138538, + "grad_norm": 0.2270008772611618, + "learning_rate": 9.9852056088059e-05, + "loss": 0.1847, + "step": 582 + }, + { + "epoch": 0.1223761544920235, + "grad_norm": 0.16426697373390198, + "learning_rate": 9.984922611368892e-05, + "loss": 0.1587, + "step": 583 + }, + { + "epoch": 0.12258606213266163, + "grad_norm": 0.16188107430934906, + "learning_rate": 9.984636936914575e-05, + "loss": 0.1783, + "step": 584 + }, + { + "epoch": 0.12279596977329975, + "grad_norm": 0.16157647967338562, + "learning_rate": 9.984348585596361e-05, + "loss": 0.1612, + "step": 585 + }, + { + "epoch": 0.12300587741393787, + "grad_norm": 0.17590083181858063, + "learning_rate": 9.984057557569104e-05, + "loss": 0.1852, + "step": 586 + }, + { + "epoch": 0.12321578505457599, + "grad_norm": 0.13868169486522675, + "learning_rate": 9.983763852989088e-05, + "loss": 0.1915, + "step": 587 + }, + { + "epoch": 0.12342569269521411, + "grad_norm": 0.15027277171611786, + "learning_rate": 9.983467472014043e-05, + "loss": 0.1924, + "step": 588 + }, + { + "epoch": 0.12363560033585222, + "grad_norm": 0.16492144763469696, + "learning_rate": 9.983168414803132e-05, + "loss": 0.1768, + "step": 589 + }, + { + "epoch": 0.12384550797649034, + "grad_norm": 0.16169002652168274, + "learning_rate": 9.982866681516954e-05, + "loss": 0.1704, + "step": 590 + }, + { + "epoch": 0.12405541561712846, + "grad_norm": 0.16292813420295715, + "learning_rate": 9.982562272317546e-05, + "loss": 0.1687, + "step": 591 + }, + { + "epoch": 0.12426532325776658, + "grad_norm": 0.16432535648345947, + "learning_rate": 9.982255187368386e-05, + "loss": 0.1702, + "step": 592 + }, + { + "epoch": 0.1244752308984047, + "grad_norm": 0.13967812061309814, + "learning_rate": 9.981945426834382e-05, + "loss": 0.1858, + "step": 593 + }, + { + "epoch": 0.12468513853904283, + "grad_norm": 0.16663575172424316, + "learning_rate": 9.981632990881885e-05, + "loss": 0.158, + "step": 594 + }, + { + "epoch": 0.12489504617968095, + "grad_norm": 0.17746829986572266, + "learning_rate": 9.981317879678679e-05, + "loss": 0.1679, + "step": 595 + }, + { + "epoch": 0.12510495382031905, + "grad_norm": 0.15006081759929657, + "learning_rate": 9.981000093393986e-05, + "loss": 0.1659, + "step": 596 + }, + { + "epoch": 0.1253148614609572, + "grad_norm": 0.15521980822086334, + "learning_rate": 9.980679632198466e-05, + "loss": 0.1587, + "step": 597 + }, + { + "epoch": 0.1255247691015953, + "grad_norm": 0.17882917821407318, + "learning_rate": 9.980356496264212e-05, + "loss": 0.1571, + "step": 598 + }, + { + "epoch": 0.12573467674223343, + "grad_norm": 0.17295823991298676, + "learning_rate": 9.980030685764754e-05, + "loss": 0.1936, + "step": 599 + }, + { + "epoch": 0.12594458438287154, + "grad_norm": 0.1699322760105133, + "learning_rate": 9.979702200875065e-05, + "loss": 0.1555, + "step": 600 + }, + { + "epoch": 0.12615449202350965, + "grad_norm": 0.15549197793006897, + "learning_rate": 9.979371041771543e-05, + "loss": 0.1569, + "step": 601 + }, + { + "epoch": 0.12636439966414778, + "grad_norm": 0.16207100450992584, + "learning_rate": 9.979037208632034e-05, + "loss": 0.1609, + "step": 602 + }, + { + "epoch": 0.1265743073047859, + "grad_norm": 0.156686931848526, + "learning_rate": 9.978700701635807e-05, + "loss": 0.165, + "step": 603 + }, + { + "epoch": 0.12678421494542402, + "grad_norm": 0.16895289719104767, + "learning_rate": 9.97836152096358e-05, + "loss": 0.1623, + "step": 604 + }, + { + "epoch": 0.12699412258606213, + "grad_norm": 0.14077980816364288, + "learning_rate": 9.978019666797498e-05, + "loss": 0.1573, + "step": 605 + }, + { + "epoch": 0.12720403022670027, + "grad_norm": 0.14448733627796173, + "learning_rate": 9.977675139321146e-05, + "loss": 0.1671, + "step": 606 + }, + { + "epoch": 0.12741393786733837, + "grad_norm": 0.1879337728023529, + "learning_rate": 9.977327938719541e-05, + "loss": 0.1644, + "step": 607 + }, + { + "epoch": 0.12762384550797648, + "grad_norm": 0.22603319585323334, + "learning_rate": 9.976978065179138e-05, + "loss": 0.1817, + "step": 608 + }, + { + "epoch": 0.12783375314861462, + "grad_norm": 0.19939488172531128, + "learning_rate": 9.976625518887828e-05, + "loss": 0.1925, + "step": 609 + }, + { + "epoch": 0.12804366078925272, + "grad_norm": 0.17629611492156982, + "learning_rate": 9.976270300034936e-05, + "loss": 0.1603, + "step": 610 + }, + { + "epoch": 0.12825356842989086, + "grad_norm": 0.17778918147087097, + "learning_rate": 9.975912408811223e-05, + "loss": 0.1849, + "step": 611 + }, + { + "epoch": 0.12846347607052896, + "grad_norm": 0.2002590447664261, + "learning_rate": 9.975551845408886e-05, + "loss": 0.166, + "step": 612 + }, + { + "epoch": 0.1286733837111671, + "grad_norm": 0.20324808359146118, + "learning_rate": 9.975188610021553e-05, + "loss": 0.1673, + "step": 613 + }, + { + "epoch": 0.1288832913518052, + "grad_norm": 0.20010024309158325, + "learning_rate": 9.974822702844291e-05, + "loss": 0.1529, + "step": 614 + }, + { + "epoch": 0.12909319899244331, + "grad_norm": 0.18944194912910461, + "learning_rate": 9.974454124073603e-05, + "loss": 0.1652, + "step": 615 + }, + { + "epoch": 0.12930310663308145, + "grad_norm": 0.16956883668899536, + "learning_rate": 9.974082873907418e-05, + "loss": 0.1812, + "step": 616 + }, + { + "epoch": 0.12951301427371956, + "grad_norm": 0.21157757937908173, + "learning_rate": 9.973708952545111e-05, + "loss": 0.1862, + "step": 617 + }, + { + "epoch": 0.1297229219143577, + "grad_norm": 0.20554983615875244, + "learning_rate": 9.973332360187486e-05, + "loss": 0.1705, + "step": 618 + }, + { + "epoch": 0.1299328295549958, + "grad_norm": 0.21964187920093536, + "learning_rate": 9.97295309703678e-05, + "loss": 0.1807, + "step": 619 + }, + { + "epoch": 0.13014273719563393, + "grad_norm": 0.15438182651996613, + "learning_rate": 9.972571163296666e-05, + "loss": 0.1951, + "step": 620 + }, + { + "epoch": 0.13035264483627204, + "grad_norm": 0.17805755138397217, + "learning_rate": 9.972186559172253e-05, + "loss": 0.1543, + "step": 621 + }, + { + "epoch": 0.13056255247691015, + "grad_norm": 0.1914507895708084, + "learning_rate": 9.97179928487008e-05, + "loss": 0.1815, + "step": 622 + }, + { + "epoch": 0.13077246011754828, + "grad_norm": 0.18952740728855133, + "learning_rate": 9.971409340598123e-05, + "loss": 0.1601, + "step": 623 + }, + { + "epoch": 0.1309823677581864, + "grad_norm": 0.1550108641386032, + "learning_rate": 9.971016726565791e-05, + "loss": 0.1726, + "step": 624 + }, + { + "epoch": 0.13119227539882453, + "grad_norm": 0.15205375850200653, + "learning_rate": 9.970621442983929e-05, + "loss": 0.1536, + "step": 625 + }, + { + "epoch": 0.13140218303946263, + "grad_norm": 0.15574775636196136, + "learning_rate": 9.970223490064809e-05, + "loss": 0.167, + "step": 626 + }, + { + "epoch": 0.13161209068010077, + "grad_norm": 0.18050767481327057, + "learning_rate": 9.969822868022143e-05, + "loss": 0.1752, + "step": 627 + }, + { + "epoch": 0.13182199832073888, + "grad_norm": 0.1507614701986313, + "learning_rate": 9.969419577071076e-05, + "loss": 0.1712, + "step": 628 + }, + { + "epoch": 0.13203190596137698, + "grad_norm": 0.20433206856250763, + "learning_rate": 9.96901361742818e-05, + "loss": 0.1782, + "step": 629 + }, + { + "epoch": 0.13224181360201512, + "grad_norm": 0.14705337584018707, + "learning_rate": 9.968604989311467e-05, + "loss": 0.1554, + "step": 630 + }, + { + "epoch": 0.13245172124265323, + "grad_norm": 0.1573822945356369, + "learning_rate": 9.968193692940382e-05, + "loss": 0.1762, + "step": 631 + }, + { + "epoch": 0.13266162888329136, + "grad_norm": 0.17816203832626343, + "learning_rate": 9.967779728535797e-05, + "loss": 0.1783, + "step": 632 + }, + { + "epoch": 0.13287153652392947, + "grad_norm": 0.1705409288406372, + "learning_rate": 9.967363096320022e-05, + "loss": 0.1734, + "step": 633 + }, + { + "epoch": 0.1330814441645676, + "grad_norm": 0.14574241638183594, + "learning_rate": 9.966943796516798e-05, + "loss": 0.1599, + "step": 634 + }, + { + "epoch": 0.1332913518052057, + "grad_norm": 0.1341760754585266, + "learning_rate": 9.966521829351297e-05, + "loss": 0.1833, + "step": 635 + }, + { + "epoch": 0.13350125944584382, + "grad_norm": 0.1680106520652771, + "learning_rate": 9.966097195050128e-05, + "loss": 0.1701, + "step": 636 + }, + { + "epoch": 0.13371116708648195, + "grad_norm": 0.1687798947095871, + "learning_rate": 9.965669893841326e-05, + "loss": 0.1669, + "step": 637 + }, + { + "epoch": 0.13392107472712006, + "grad_norm": 0.1792893260717392, + "learning_rate": 9.965239925954364e-05, + "loss": 0.1705, + "step": 638 + }, + { + "epoch": 0.1341309823677582, + "grad_norm": 0.15732963383197784, + "learning_rate": 9.964807291620144e-05, + "loss": 0.176, + "step": 639 + }, + { + "epoch": 0.1343408900083963, + "grad_norm": 0.14176061749458313, + "learning_rate": 9.964371991070999e-05, + "loss": 0.1755, + "step": 640 + }, + { + "epoch": 0.13455079764903444, + "grad_norm": 0.1526860147714615, + "learning_rate": 9.963934024540698e-05, + "loss": 0.1589, + "step": 641 + }, + { + "epoch": 0.13476070528967254, + "grad_norm": 0.15608391165733337, + "learning_rate": 9.963493392264435e-05, + "loss": 0.1652, + "step": 642 + }, + { + "epoch": 0.13497061293031065, + "grad_norm": 0.14473848044872284, + "learning_rate": 9.963050094478845e-05, + "loss": 0.1742, + "step": 643 + }, + { + "epoch": 0.1351805205709488, + "grad_norm": 0.1382717341184616, + "learning_rate": 9.962604131421984e-05, + "loss": 0.1713, + "step": 644 + }, + { + "epoch": 0.1353904282115869, + "grad_norm": 0.14345118403434753, + "learning_rate": 9.962155503333348e-05, + "loss": 0.1648, + "step": 645 + }, + { + "epoch": 0.13560033585222503, + "grad_norm": 0.1398243010044098, + "learning_rate": 9.96170421045386e-05, + "loss": 0.1889, + "step": 646 + }, + { + "epoch": 0.13581024349286314, + "grad_norm": 0.14956693351268768, + "learning_rate": 9.96125025302587e-05, + "loss": 0.1744, + "step": 647 + }, + { + "epoch": 0.13602015113350127, + "grad_norm": 0.15064063668251038, + "learning_rate": 9.96079363129317e-05, + "loss": 0.1625, + "step": 648 + }, + { + "epoch": 0.13623005877413938, + "grad_norm": 0.15413698554039001, + "learning_rate": 9.960334345500974e-05, + "loss": 0.1745, + "step": 649 + }, + { + "epoch": 0.13643996641477749, + "grad_norm": 0.18636789917945862, + "learning_rate": 9.959872395895929e-05, + "loss": 0.1639, + "step": 650 + }, + { + "epoch": 0.13664987405541562, + "grad_norm": 0.2444021999835968, + "learning_rate": 9.959407782726108e-05, + "loss": 0.1826, + "step": 651 + }, + { + "epoch": 0.13685978169605373, + "grad_norm": 0.16845685243606567, + "learning_rate": 9.958940506241026e-05, + "loss": 0.1793, + "step": 652 + }, + { + "epoch": 0.13706968933669186, + "grad_norm": 0.16233941912651062, + "learning_rate": 9.958470566691618e-05, + "loss": 0.18, + "step": 653 + }, + { + "epoch": 0.13727959697732997, + "grad_norm": 0.18665330111980438, + "learning_rate": 9.95799796433025e-05, + "loss": 0.1662, + "step": 654 + }, + { + "epoch": 0.1374895046179681, + "grad_norm": 0.18732531368732452, + "learning_rate": 9.957522699410723e-05, + "loss": 0.1646, + "step": 655 + }, + { + "epoch": 0.1376994122586062, + "grad_norm": 0.12580807507038116, + "learning_rate": 9.957044772188266e-05, + "loss": 0.1616, + "step": 656 + }, + { + "epoch": 0.13790931989924432, + "grad_norm": 0.17410576343536377, + "learning_rate": 9.956564182919535e-05, + "loss": 0.1628, + "step": 657 + }, + { + "epoch": 0.13811922753988246, + "grad_norm": 0.1753510683774948, + "learning_rate": 9.95608093186262e-05, + "loss": 0.1867, + "step": 658 + }, + { + "epoch": 0.13832913518052056, + "grad_norm": 0.17664988338947296, + "learning_rate": 9.955595019277032e-05, + "loss": 0.1819, + "step": 659 + }, + { + "epoch": 0.1385390428211587, + "grad_norm": 0.1787986010313034, + "learning_rate": 9.955106445423722e-05, + "loss": 0.1668, + "step": 660 + }, + { + "epoch": 0.1387489504617968, + "grad_norm": 0.14035004377365112, + "learning_rate": 9.954615210565065e-05, + "loss": 0.1651, + "step": 661 + }, + { + "epoch": 0.13895885810243494, + "grad_norm": 0.20008955895900726, + "learning_rate": 9.954121314964864e-05, + "loss": 0.179, + "step": 662 + }, + { + "epoch": 0.13916876574307305, + "grad_norm": 0.22378800809383392, + "learning_rate": 9.953624758888352e-05, + "loss": 0.1729, + "step": 663 + }, + { + "epoch": 0.13937867338371115, + "grad_norm": 0.18687045574188232, + "learning_rate": 9.953125542602193e-05, + "loss": 0.1756, + "step": 664 + }, + { + "epoch": 0.1395885810243493, + "grad_norm": 0.1506877988576889, + "learning_rate": 9.952623666374475e-05, + "loss": 0.1616, + "step": 665 + }, + { + "epoch": 0.1397984886649874, + "grad_norm": 0.20918136835098267, + "learning_rate": 9.95211913047472e-05, + "loss": 0.1888, + "step": 666 + }, + { + "epoch": 0.14000839630562553, + "grad_norm": 0.17521612346172333, + "learning_rate": 9.951611935173872e-05, + "loss": 0.1886, + "step": 667 + }, + { + "epoch": 0.14021830394626364, + "grad_norm": 0.14788690209388733, + "learning_rate": 9.951102080744308e-05, + "loss": 0.162, + "step": 668 + }, + { + "epoch": 0.14042821158690177, + "grad_norm": 0.18560314178466797, + "learning_rate": 9.950589567459832e-05, + "loss": 0.1573, + "step": 669 + }, + { + "epoch": 0.14063811922753988, + "grad_norm": 0.16913674771785736, + "learning_rate": 9.950074395595675e-05, + "loss": 0.1713, + "step": 670 + }, + { + "epoch": 0.140848026868178, + "grad_norm": 0.13768184185028076, + "learning_rate": 9.949556565428496e-05, + "loss": 0.1733, + "step": 671 + }, + { + "epoch": 0.14105793450881612, + "grad_norm": 0.19472239911556244, + "learning_rate": 9.949036077236382e-05, + "loss": 0.1638, + "step": 672 + }, + { + "epoch": 0.14126784214945423, + "grad_norm": 0.17684867978096008, + "learning_rate": 9.948512931298846e-05, + "loss": 0.1686, + "step": 673 + }, + { + "epoch": 0.14147774979009237, + "grad_norm": 0.20061515271663666, + "learning_rate": 9.94798712789683e-05, + "loss": 0.1921, + "step": 674 + }, + { + "epoch": 0.14168765743073047, + "grad_norm": 0.17213481664657593, + "learning_rate": 9.9474586673127e-05, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 0.1418975650713686, + "grad_norm": 0.16144217550754547, + "learning_rate": 9.946927549830258e-05, + "loss": 0.1594, + "step": 676 + }, + { + "epoch": 0.14210747271200672, + "grad_norm": 0.16045495867729187, + "learning_rate": 9.946393775734719e-05, + "loss": 0.1585, + "step": 677 + }, + { + "epoch": 0.14231738035264482, + "grad_norm": 0.168419748544693, + "learning_rate": 9.945857345312735e-05, + "loss": 0.1618, + "step": 678 + }, + { + "epoch": 0.14252728799328296, + "grad_norm": 0.16631141304969788, + "learning_rate": 9.945318258852383e-05, + "loss": 0.1648, + "step": 679 + }, + { + "epoch": 0.14273719563392107, + "grad_norm": 0.17133933305740356, + "learning_rate": 9.944776516643161e-05, + "loss": 0.1902, + "step": 680 + }, + { + "epoch": 0.1429471032745592, + "grad_norm": 0.144994854927063, + "learning_rate": 9.944232118976e-05, + "loss": 0.1645, + "step": 681 + }, + { + "epoch": 0.1431570109151973, + "grad_norm": 0.13521502912044525, + "learning_rate": 9.943685066143252e-05, + "loss": 0.1679, + "step": 682 + }, + { + "epoch": 0.14336691855583544, + "grad_norm": 0.1505574733018875, + "learning_rate": 9.943135358438698e-05, + "loss": 0.1497, + "step": 683 + }, + { + "epoch": 0.14357682619647355, + "grad_norm": 0.1701841652393341, + "learning_rate": 9.942582996157544e-05, + "loss": 0.141, + "step": 684 + }, + { + "epoch": 0.14378673383711166, + "grad_norm": 0.16892337799072266, + "learning_rate": 9.94202797959642e-05, + "loss": 0.1845, + "step": 685 + }, + { + "epoch": 0.1439966414777498, + "grad_norm": 0.1322741061449051, + "learning_rate": 9.941470309053384e-05, + "loss": 0.1635, + "step": 686 + }, + { + "epoch": 0.1442065491183879, + "grad_norm": 0.18180270493030548, + "learning_rate": 9.940909984827915e-05, + "loss": 0.1521, + "step": 687 + }, + { + "epoch": 0.14441645675902604, + "grad_norm": 0.17136745154857635, + "learning_rate": 9.940347007220924e-05, + "loss": 0.1694, + "step": 688 + }, + { + "epoch": 0.14462636439966414, + "grad_norm": 0.16122983396053314, + "learning_rate": 9.93978137653474e-05, + "loss": 0.1726, + "step": 689 + }, + { + "epoch": 0.14483627204030228, + "grad_norm": 0.15024663507938385, + "learning_rate": 9.939213093073118e-05, + "loss": 0.1703, + "step": 690 + }, + { + "epoch": 0.14504617968094038, + "grad_norm": 0.14193399250507355, + "learning_rate": 9.938642157141245e-05, + "loss": 0.1837, + "step": 691 + }, + { + "epoch": 0.1452560873215785, + "grad_norm": 0.15650463104248047, + "learning_rate": 9.938068569045721e-05, + "loss": 0.1665, + "step": 692 + }, + { + "epoch": 0.14546599496221663, + "grad_norm": 0.14254000782966614, + "learning_rate": 9.937492329094577e-05, + "loss": 0.175, + "step": 693 + }, + { + "epoch": 0.14567590260285473, + "grad_norm": 0.17051447927951813, + "learning_rate": 9.93691343759727e-05, + "loss": 0.1664, + "step": 694 + }, + { + "epoch": 0.14588581024349287, + "grad_norm": 0.16692955791950226, + "learning_rate": 9.936331894864677e-05, + "loss": 0.1682, + "step": 695 + }, + { + "epoch": 0.14609571788413098, + "grad_norm": 0.20158237218856812, + "learning_rate": 9.935747701209096e-05, + "loss": 0.1574, + "step": 696 + }, + { + "epoch": 0.1463056255247691, + "grad_norm": 0.1351911425590515, + "learning_rate": 9.935160856944257e-05, + "loss": 0.1618, + "step": 697 + }, + { + "epoch": 0.14651553316540722, + "grad_norm": 0.1611570417881012, + "learning_rate": 9.934571362385305e-05, + "loss": 0.164, + "step": 698 + }, + { + "epoch": 0.14672544080604533, + "grad_norm": 0.22555968165397644, + "learning_rate": 9.933979217848815e-05, + "loss": 0.2044, + "step": 699 + }, + { + "epoch": 0.14693534844668346, + "grad_norm": 0.17471735179424286, + "learning_rate": 9.93338442365278e-05, + "loss": 0.1809, + "step": 700 + }, + { + "epoch": 0.14714525608732157, + "grad_norm": 0.12285126745700836, + "learning_rate": 9.93278698011662e-05, + "loss": 0.1644, + "step": 701 + }, + { + "epoch": 0.1473551637279597, + "grad_norm": 0.1364145576953888, + "learning_rate": 9.932186887561175e-05, + "loss": 0.1655, + "step": 702 + }, + { + "epoch": 0.1475650713685978, + "grad_norm": 0.14189580082893372, + "learning_rate": 9.931584146308708e-05, + "loss": 0.1729, + "step": 703 + }, + { + "epoch": 0.14777497900923595, + "grad_norm": 0.2254076600074768, + "learning_rate": 9.930978756682905e-05, + "loss": 0.1731, + "step": 704 + }, + { + "epoch": 0.14798488664987405, + "grad_norm": 0.15398503839969635, + "learning_rate": 9.930370719008875e-05, + "loss": 0.1813, + "step": 705 + }, + { + "epoch": 0.1481947942905122, + "grad_norm": 0.15267014503479004, + "learning_rate": 9.929760033613146e-05, + "loss": 0.1681, + "step": 706 + }, + { + "epoch": 0.1484047019311503, + "grad_norm": 0.1611442267894745, + "learning_rate": 9.929146700823671e-05, + "loss": 0.1925, + "step": 707 + }, + { + "epoch": 0.1486146095717884, + "grad_norm": 0.15641498565673828, + "learning_rate": 9.928530720969827e-05, + "loss": 0.1733, + "step": 708 + }, + { + "epoch": 0.14882451721242654, + "grad_norm": 0.12724818289279938, + "learning_rate": 9.927912094382403e-05, + "loss": 0.1735, + "step": 709 + }, + { + "epoch": 0.14903442485306465, + "grad_norm": 0.17570020258426666, + "learning_rate": 9.92729082139362e-05, + "loss": 0.1732, + "step": 710 + }, + { + "epoch": 0.14924433249370278, + "grad_norm": 0.19477427005767822, + "learning_rate": 9.926666902337115e-05, + "loss": 0.1551, + "step": 711 + }, + { + "epoch": 0.1494542401343409, + "grad_norm": 0.16798420250415802, + "learning_rate": 9.926040337547946e-05, + "loss": 0.1906, + "step": 712 + }, + { + "epoch": 0.14966414777497902, + "grad_norm": 0.17551501095294952, + "learning_rate": 9.925411127362594e-05, + "loss": 0.1747, + "step": 713 + }, + { + "epoch": 0.14987405541561713, + "grad_norm": 0.1842852234840393, + "learning_rate": 9.924779272118957e-05, + "loss": 0.1563, + "step": 714 + }, + { + "epoch": 0.15008396305625524, + "grad_norm": 0.1805478185415268, + "learning_rate": 9.924144772156358e-05, + "loss": 0.165, + "step": 715 + }, + { + "epoch": 0.15029387069689337, + "grad_norm": 0.21542084217071533, + "learning_rate": 9.923507627815536e-05, + "loss": 0.1469, + "step": 716 + }, + { + "epoch": 0.15050377833753148, + "grad_norm": 0.15649262070655823, + "learning_rate": 9.922867839438654e-05, + "loss": 0.1706, + "step": 717 + }, + { + "epoch": 0.15071368597816961, + "grad_norm": 0.14313052594661713, + "learning_rate": 9.92222540736929e-05, + "loss": 0.1737, + "step": 718 + }, + { + "epoch": 0.15092359361880772, + "grad_norm": 0.13190749287605286, + "learning_rate": 9.92158033195245e-05, + "loss": 0.1742, + "step": 719 + }, + { + "epoch": 0.15113350125944586, + "grad_norm": 0.18187177181243896, + "learning_rate": 9.920932613534549e-05, + "loss": 0.1748, + "step": 720 + }, + { + "epoch": 0.15134340890008396, + "grad_norm": 0.14302362501621246, + "learning_rate": 9.920282252463429e-05, + "loss": 0.1711, + "step": 721 + }, + { + "epoch": 0.15155331654072207, + "grad_norm": 0.16898708045482635, + "learning_rate": 9.919629249088347e-05, + "loss": 0.1616, + "step": 722 + }, + { + "epoch": 0.1517632241813602, + "grad_norm": 0.15528154373168945, + "learning_rate": 9.918973603759984e-05, + "loss": 0.1652, + "step": 723 + }, + { + "epoch": 0.15197313182199831, + "grad_norm": 0.1718195527791977, + "learning_rate": 9.918315316830434e-05, + "loss": 0.1758, + "step": 724 + }, + { + "epoch": 0.15218303946263645, + "grad_norm": 0.15404529869556427, + "learning_rate": 9.917654388653211e-05, + "loss": 0.162, + "step": 725 + }, + { + "epoch": 0.15239294710327456, + "grad_norm": 0.19148094952106476, + "learning_rate": 9.916990819583252e-05, + "loss": 0.1887, + "step": 726 + }, + { + "epoch": 0.1526028547439127, + "grad_norm": 0.21298371255397797, + "learning_rate": 9.916324609976906e-05, + "loss": 0.1712, + "step": 727 + }, + { + "epoch": 0.1528127623845508, + "grad_norm": 0.2041487991809845, + "learning_rate": 9.915655760191944e-05, + "loss": 0.167, + "step": 728 + }, + { + "epoch": 0.1530226700251889, + "grad_norm": 0.17242367565631866, + "learning_rate": 9.914984270587552e-05, + "loss": 0.163, + "step": 729 + }, + { + "epoch": 0.15323257766582704, + "grad_norm": 0.1646365076303482, + "learning_rate": 9.914310141524339e-05, + "loss": 0.1638, + "step": 730 + }, + { + "epoch": 0.15344248530646515, + "grad_norm": 0.1947726458311081, + "learning_rate": 9.913633373364324e-05, + "loss": 0.1828, + "step": 731 + }, + { + "epoch": 0.15365239294710328, + "grad_norm": 0.17962804436683655, + "learning_rate": 9.912953966470948e-05, + "loss": 0.1699, + "step": 732 + }, + { + "epoch": 0.1538623005877414, + "grad_norm": 0.17121249437332153, + "learning_rate": 9.912271921209068e-05, + "loss": 0.1605, + "step": 733 + }, + { + "epoch": 0.15407220822837953, + "grad_norm": 0.17192313075065613, + "learning_rate": 9.911587237944959e-05, + "loss": 0.1746, + "step": 734 + }, + { + "epoch": 0.15428211586901763, + "grad_norm": 0.1262722760438919, + "learning_rate": 9.910899917046311e-05, + "loss": 0.1677, + "step": 735 + }, + { + "epoch": 0.15449202350965574, + "grad_norm": 0.15364903211593628, + "learning_rate": 9.910209958882231e-05, + "loss": 0.1792, + "step": 736 + }, + { + "epoch": 0.15470193115029388, + "grad_norm": 0.17038469016551971, + "learning_rate": 9.909517363823241e-05, + "loss": 0.1588, + "step": 737 + }, + { + "epoch": 0.15491183879093198, + "grad_norm": 0.14676110446453094, + "learning_rate": 9.908822132241281e-05, + "loss": 0.1639, + "step": 738 + }, + { + "epoch": 0.15512174643157012, + "grad_norm": 0.15300148725509644, + "learning_rate": 9.908124264509707e-05, + "loss": 0.1752, + "step": 739 + }, + { + "epoch": 0.15533165407220823, + "grad_norm": 0.1372271180152893, + "learning_rate": 9.90742376100329e-05, + "loss": 0.1728, + "step": 740 + }, + { + "epoch": 0.15554156171284636, + "grad_norm": 0.13885709643363953, + "learning_rate": 9.906720622098215e-05, + "loss": 0.1625, + "step": 741 + }, + { + "epoch": 0.15575146935348447, + "grad_norm": 0.12582066655158997, + "learning_rate": 9.906014848172086e-05, + "loss": 0.17, + "step": 742 + }, + { + "epoch": 0.15596137699412257, + "grad_norm": 0.13245654106140137, + "learning_rate": 9.905306439603918e-05, + "loss": 0.1711, + "step": 743 + }, + { + "epoch": 0.1561712846347607, + "grad_norm": 0.12488420307636261, + "learning_rate": 9.904595396774142e-05, + "loss": 0.1725, + "step": 744 + }, + { + "epoch": 0.15638119227539882, + "grad_norm": 0.1577821522951126, + "learning_rate": 9.903881720064606e-05, + "loss": 0.1707, + "step": 745 + }, + { + "epoch": 0.15659109991603695, + "grad_norm": 0.16098785400390625, + "learning_rate": 9.903165409858567e-05, + "loss": 0.1608, + "step": 746 + }, + { + "epoch": 0.15680100755667506, + "grad_norm": 0.14498507976531982, + "learning_rate": 9.902446466540707e-05, + "loss": 0.1776, + "step": 747 + }, + { + "epoch": 0.1570109151973132, + "grad_norm": 0.15581083297729492, + "learning_rate": 9.901724890497109e-05, + "loss": 0.1746, + "step": 748 + }, + { + "epoch": 0.1572208228379513, + "grad_norm": 0.16081508994102478, + "learning_rate": 9.901000682115276e-05, + "loss": 0.1869, + "step": 749 + }, + { + "epoch": 0.1574307304785894, + "grad_norm": 0.16249604523181915, + "learning_rate": 9.900273841784126e-05, + "loss": 0.1629, + "step": 750 + }, + { + "epoch": 0.15764063811922754, + "grad_norm": 0.1613830178976059, + "learning_rate": 9.899544369893992e-05, + "loss": 0.1588, + "step": 751 + }, + { + "epoch": 0.15785054575986565, + "grad_norm": 0.15741245448589325, + "learning_rate": 9.898812266836613e-05, + "loss": 0.1708, + "step": 752 + }, + { + "epoch": 0.1580604534005038, + "grad_norm": 0.12306373566389084, + "learning_rate": 9.898077533005144e-05, + "loss": 0.1751, + "step": 753 + }, + { + "epoch": 0.1582703610411419, + "grad_norm": 0.15071968734264374, + "learning_rate": 9.897340168794155e-05, + "loss": 0.1686, + "step": 754 + }, + { + "epoch": 0.15848026868178003, + "grad_norm": 0.14102348685264587, + "learning_rate": 9.896600174599632e-05, + "loss": 0.1701, + "step": 755 + }, + { + "epoch": 0.15869017632241814, + "grad_norm": 0.16288180649280548, + "learning_rate": 9.895857550818963e-05, + "loss": 0.1652, + "step": 756 + }, + { + "epoch": 0.15890008396305624, + "grad_norm": 0.17342409491539001, + "learning_rate": 9.895112297850956e-05, + "loss": 0.1702, + "step": 757 + }, + { + "epoch": 0.15910999160369438, + "grad_norm": 0.13485445082187653, + "learning_rate": 9.894364416095829e-05, + "loss": 0.1705, + "step": 758 + }, + { + "epoch": 0.15931989924433249, + "grad_norm": 0.17127734422683716, + "learning_rate": 9.893613905955211e-05, + "loss": 0.1831, + "step": 759 + }, + { + "epoch": 0.15952980688497062, + "grad_norm": 0.13848379254341125, + "learning_rate": 9.892860767832144e-05, + "loss": 0.161, + "step": 760 + }, + { + "epoch": 0.15973971452560873, + "grad_norm": 0.12154096364974976, + "learning_rate": 9.892105002131081e-05, + "loss": 0.1637, + "step": 761 + }, + { + "epoch": 0.15994962216624686, + "grad_norm": 0.15433917939662933, + "learning_rate": 9.891346609257882e-05, + "loss": 0.1765, + "step": 762 + }, + { + "epoch": 0.16015952980688497, + "grad_norm": 0.16946150362491608, + "learning_rate": 9.890585589619825e-05, + "loss": 0.1766, + "step": 763 + }, + { + "epoch": 0.16036943744752308, + "grad_norm": 0.14503143727779388, + "learning_rate": 9.889821943625594e-05, + "loss": 0.1458, + "step": 764 + }, + { + "epoch": 0.1605793450881612, + "grad_norm": 0.16491486132144928, + "learning_rate": 9.889055671685283e-05, + "loss": 0.1861, + "step": 765 + }, + { + "epoch": 0.16078925272879932, + "grad_norm": 0.15087951719760895, + "learning_rate": 9.888286774210398e-05, + "loss": 0.1745, + "step": 766 + }, + { + "epoch": 0.16099916036943746, + "grad_norm": 0.19266510009765625, + "learning_rate": 9.887515251613857e-05, + "loss": 0.1711, + "step": 767 + }, + { + "epoch": 0.16120906801007556, + "grad_norm": 0.19419905543327332, + "learning_rate": 9.886741104309981e-05, + "loss": 0.1705, + "step": 768 + }, + { + "epoch": 0.1614189756507137, + "grad_norm": 0.13408306241035461, + "learning_rate": 9.885964332714508e-05, + "loss": 0.1629, + "step": 769 + }, + { + "epoch": 0.1616288832913518, + "grad_norm": 0.17757900059223175, + "learning_rate": 9.885184937244581e-05, + "loss": 0.1643, + "step": 770 + }, + { + "epoch": 0.1618387909319899, + "grad_norm": 0.16376204788684845, + "learning_rate": 9.884402918318754e-05, + "loss": 0.1869, + "step": 771 + }, + { + "epoch": 0.16204869857262805, + "grad_norm": 0.1362319141626358, + "learning_rate": 9.883618276356988e-05, + "loss": 0.1736, + "step": 772 + }, + { + "epoch": 0.16225860621326615, + "grad_norm": 0.15015824139118195, + "learning_rate": 9.882831011780653e-05, + "loss": 0.1598, + "step": 773 + }, + { + "epoch": 0.1624685138539043, + "grad_norm": 0.1815597116947174, + "learning_rate": 9.882041125012528e-05, + "loss": 0.1597, + "step": 774 + }, + { + "epoch": 0.1626784214945424, + "grad_norm": 0.16493958234786987, + "learning_rate": 9.881248616476803e-05, + "loss": 0.181, + "step": 775 + }, + { + "epoch": 0.16288832913518053, + "grad_norm": 0.1213487982749939, + "learning_rate": 9.880453486599072e-05, + "loss": 0.1644, + "step": 776 + }, + { + "epoch": 0.16309823677581864, + "grad_norm": 0.1440252661705017, + "learning_rate": 9.879655735806337e-05, + "loss": 0.1657, + "step": 777 + }, + { + "epoch": 0.16330814441645675, + "grad_norm": 0.1406947672367096, + "learning_rate": 9.878855364527007e-05, + "loss": 0.172, + "step": 778 + }, + { + "epoch": 0.16351805205709488, + "grad_norm": 0.17073537409305573, + "learning_rate": 9.878052373190902e-05, + "loss": 0.1852, + "step": 779 + }, + { + "epoch": 0.163727959697733, + "grad_norm": 0.15924111008644104, + "learning_rate": 9.877246762229247e-05, + "loss": 0.18, + "step": 780 + }, + { + "epoch": 0.16393786733837112, + "grad_norm": 0.14216449856758118, + "learning_rate": 9.876438532074672e-05, + "loss": 0.1575, + "step": 781 + }, + { + "epoch": 0.16414777497900923, + "grad_norm": 0.12731285393238068, + "learning_rate": 9.875627683161217e-05, + "loss": 0.161, + "step": 782 + }, + { + "epoch": 0.16435768261964737, + "grad_norm": 0.16211947798728943, + "learning_rate": 9.874814215924324e-05, + "loss": 0.1592, + "step": 783 + }, + { + "epoch": 0.16456759026028547, + "grad_norm": 0.1463523507118225, + "learning_rate": 9.873998130800844e-05, + "loss": 0.1773, + "step": 784 + }, + { + "epoch": 0.16477749790092358, + "grad_norm": 0.14601489901542664, + "learning_rate": 9.873179428229033e-05, + "loss": 0.1685, + "step": 785 + }, + { + "epoch": 0.16498740554156172, + "grad_norm": 0.1590069830417633, + "learning_rate": 9.872358108648557e-05, + "loss": 0.1709, + "step": 786 + }, + { + "epoch": 0.16519731318219982, + "grad_norm": 0.14027854800224304, + "learning_rate": 9.871534172500479e-05, + "loss": 0.1842, + "step": 787 + }, + { + "epoch": 0.16540722082283796, + "grad_norm": 0.12902334332466125, + "learning_rate": 9.870707620227271e-05, + "loss": 0.1484, + "step": 788 + }, + { + "epoch": 0.16561712846347607, + "grad_norm": 0.13427670300006866, + "learning_rate": 9.869878452272812e-05, + "loss": 0.1599, + "step": 789 + }, + { + "epoch": 0.1658270361041142, + "grad_norm": 0.17016825079917908, + "learning_rate": 9.869046669082386e-05, + "loss": 0.1506, + "step": 790 + }, + { + "epoch": 0.1660369437447523, + "grad_norm": 0.13757233321666718, + "learning_rate": 9.868212271102678e-05, + "loss": 0.1574, + "step": 791 + }, + { + "epoch": 0.16624685138539042, + "grad_norm": 0.19485127925872803, + "learning_rate": 9.867375258781778e-05, + "loss": 0.1686, + "step": 792 + }, + { + "epoch": 0.16645675902602855, + "grad_norm": 0.16131381690502167, + "learning_rate": 9.866535632569182e-05, + "loss": 0.1693, + "step": 793 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.16035085916519165, + "learning_rate": 9.865693392915787e-05, + "loss": 0.1589, + "step": 794 + }, + { + "epoch": 0.1668765743073048, + "grad_norm": 0.16287830471992493, + "learning_rate": 9.864848540273897e-05, + "loss": 0.1749, + "step": 795 + }, + { + "epoch": 0.1670864819479429, + "grad_norm": 0.16353754699230194, + "learning_rate": 9.864001075097214e-05, + "loss": 0.1846, + "step": 796 + }, + { + "epoch": 0.16729638958858103, + "grad_norm": 0.19826947152614594, + "learning_rate": 9.863150997840849e-05, + "loss": 0.1624, + "step": 797 + }, + { + "epoch": 0.16750629722921914, + "grad_norm": 0.14296779036521912, + "learning_rate": 9.862298308961313e-05, + "loss": 0.1823, + "step": 798 + }, + { + "epoch": 0.16771620486985725, + "grad_norm": 0.1371513456106186, + "learning_rate": 9.861443008916517e-05, + "loss": 0.1742, + "step": 799 + }, + { + "epoch": 0.16792611251049538, + "grad_norm": 0.17043526470661163, + "learning_rate": 9.86058509816578e-05, + "loss": 0.1661, + "step": 800 + }, + { + "epoch": 0.1681360201511335, + "grad_norm": 0.17045053839683533, + "learning_rate": 9.859724577169815e-05, + "loss": 0.1773, + "step": 801 + }, + { + "epoch": 0.16834592779177163, + "grad_norm": 0.1564718633890152, + "learning_rate": 9.858861446390748e-05, + "loss": 0.1636, + "step": 802 + }, + { + "epoch": 0.16855583543240973, + "grad_norm": 0.13335298001766205, + "learning_rate": 9.857995706292092e-05, + "loss": 0.18, + "step": 803 + }, + { + "epoch": 0.16876574307304787, + "grad_norm": 0.14552432298660278, + "learning_rate": 9.857127357338775e-05, + "loss": 0.1599, + "step": 804 + }, + { + "epoch": 0.16897565071368598, + "grad_norm": 0.1437654346227646, + "learning_rate": 9.856256399997119e-05, + "loss": 0.178, + "step": 805 + }, + { + "epoch": 0.16918555835432408, + "grad_norm": 0.143344447016716, + "learning_rate": 9.855382834734848e-05, + "loss": 0.159, + "step": 806 + }, + { + "epoch": 0.16939546599496222, + "grad_norm": 0.15504872798919678, + "learning_rate": 9.854506662021085e-05, + "loss": 0.1944, + "step": 807 + }, + { + "epoch": 0.16960537363560033, + "grad_norm": 0.12894919514656067, + "learning_rate": 9.853627882326357e-05, + "loss": 0.1677, + "step": 808 + }, + { + "epoch": 0.16981528127623846, + "grad_norm": 0.1339186728000641, + "learning_rate": 9.852746496122587e-05, + "loss": 0.1666, + "step": 809 + }, + { + "epoch": 0.17002518891687657, + "grad_norm": 0.11661262065172195, + "learning_rate": 9.8518625038831e-05, + "loss": 0.1611, + "step": 810 + }, + { + "epoch": 0.1702350965575147, + "grad_norm": 0.14091713726520538, + "learning_rate": 9.85097590608262e-05, + "loss": 0.1483, + "step": 811 + }, + { + "epoch": 0.1704450041981528, + "grad_norm": 0.13658110797405243, + "learning_rate": 9.85008670319727e-05, + "loss": 0.1667, + "step": 812 + }, + { + "epoch": 0.17065491183879095, + "grad_norm": 0.13288968801498413, + "learning_rate": 9.849194895704575e-05, + "loss": 0.1685, + "step": 813 + }, + { + "epoch": 0.17086481947942905, + "grad_norm": 0.1870601624250412, + "learning_rate": 9.84830048408345e-05, + "loss": 0.1794, + "step": 814 + }, + { + "epoch": 0.17107472712006716, + "grad_norm": 0.15742088854312897, + "learning_rate": 9.84740346881422e-05, + "loss": 0.1546, + "step": 815 + }, + { + "epoch": 0.1712846347607053, + "grad_norm": 0.1625792235136032, + "learning_rate": 9.846503850378602e-05, + "loss": 0.1506, + "step": 816 + }, + { + "epoch": 0.1714945424013434, + "grad_norm": 0.1315995752811432, + "learning_rate": 9.845601629259708e-05, + "loss": 0.1611, + "step": 817 + }, + { + "epoch": 0.17170445004198154, + "grad_norm": 0.16079425811767578, + "learning_rate": 9.844696805942055e-05, + "loss": 0.1588, + "step": 818 + }, + { + "epoch": 0.17191435768261965, + "grad_norm": 0.16755861043930054, + "learning_rate": 9.843789380911554e-05, + "loss": 0.1629, + "step": 819 + }, + { + "epoch": 0.17212426532325778, + "grad_norm": 0.12923209369182587, + "learning_rate": 9.84287935465551e-05, + "loss": 0.1675, + "step": 820 + }, + { + "epoch": 0.1723341729638959, + "grad_norm": 0.14494512975215912, + "learning_rate": 9.84196672766263e-05, + "loss": 0.1674, + "step": 821 + }, + { + "epoch": 0.172544080604534, + "grad_norm": 0.13111597299575806, + "learning_rate": 9.841051500423014e-05, + "loss": 0.1835, + "step": 822 + }, + { + "epoch": 0.17275398824517213, + "grad_norm": 0.1468263864517212, + "learning_rate": 9.840133673428162e-05, + "loss": 0.1635, + "step": 823 + }, + { + "epoch": 0.17296389588581024, + "grad_norm": 0.14243565499782562, + "learning_rate": 9.839213247170967e-05, + "loss": 0.1617, + "step": 824 + }, + { + "epoch": 0.17317380352644837, + "grad_norm": 0.14563241600990295, + "learning_rate": 9.838290222145718e-05, + "loss": 0.1614, + "step": 825 + }, + { + "epoch": 0.17338371116708648, + "grad_norm": 0.15182636678218842, + "learning_rate": 9.837364598848102e-05, + "loss": 0.1712, + "step": 826 + }, + { + "epoch": 0.17359361880772461, + "grad_norm": 0.15270951390266418, + "learning_rate": 9.8364363777752e-05, + "loss": 0.1721, + "step": 827 + }, + { + "epoch": 0.17380352644836272, + "grad_norm": 0.14734511077404022, + "learning_rate": 9.835505559425487e-05, + "loss": 0.1751, + "step": 828 + }, + { + "epoch": 0.17401343408900083, + "grad_norm": 0.13699984550476074, + "learning_rate": 9.834572144298834e-05, + "loss": 0.1895, + "step": 829 + }, + { + "epoch": 0.17422334172963896, + "grad_norm": 0.1422858089208603, + "learning_rate": 9.833636132896505e-05, + "loss": 0.1757, + "step": 830 + }, + { + "epoch": 0.17443324937027707, + "grad_norm": 0.1300913542509079, + "learning_rate": 9.832697525721161e-05, + "loss": 0.1774, + "step": 831 + }, + { + "epoch": 0.1746431570109152, + "grad_norm": 0.16247855126857758, + "learning_rate": 9.831756323276856e-05, + "loss": 0.1499, + "step": 832 + }, + { + "epoch": 0.1748530646515533, + "grad_norm": 0.13687850534915924, + "learning_rate": 9.830812526069036e-05, + "loss": 0.1739, + "step": 833 + }, + { + "epoch": 0.17506297229219145, + "grad_norm": 0.16242316365242004, + "learning_rate": 9.829866134604543e-05, + "loss": 0.1703, + "step": 834 + }, + { + "epoch": 0.17527287993282956, + "grad_norm": 0.1983039677143097, + "learning_rate": 9.82891714939161e-05, + "loss": 0.1726, + "step": 835 + }, + { + "epoch": 0.17548278757346766, + "grad_norm": 0.17525269091129303, + "learning_rate": 9.827965570939861e-05, + "loss": 0.1839, + "step": 836 + }, + { + "epoch": 0.1756926952141058, + "grad_norm": 0.1517772525548935, + "learning_rate": 9.827011399760319e-05, + "loss": 0.17, + "step": 837 + }, + { + "epoch": 0.1759026028547439, + "grad_norm": 0.17186671495437622, + "learning_rate": 9.826054636365396e-05, + "loss": 0.1526, + "step": 838 + }, + { + "epoch": 0.17611251049538204, + "grad_norm": 0.15066854655742645, + "learning_rate": 9.825095281268894e-05, + "loss": 0.1442, + "step": 839 + }, + { + "epoch": 0.17632241813602015, + "grad_norm": 0.14937272667884827, + "learning_rate": 9.82413333498601e-05, + "loss": 0.1641, + "step": 840 + }, + { + "epoch": 0.17653232577665828, + "grad_norm": 0.1843566745519638, + "learning_rate": 9.823168798033328e-05, + "loss": 0.1674, + "step": 841 + }, + { + "epoch": 0.1767422334172964, + "grad_norm": 0.15555419027805328, + "learning_rate": 9.82220167092883e-05, + "loss": 0.1551, + "step": 842 + }, + { + "epoch": 0.1769521410579345, + "grad_norm": 0.1585703343153, + "learning_rate": 9.821231954191885e-05, + "loss": 0.1578, + "step": 843 + }, + { + "epoch": 0.17716204869857263, + "grad_norm": 0.1662077009677887, + "learning_rate": 9.82025964834325e-05, + "loss": 0.1778, + "step": 844 + }, + { + "epoch": 0.17737195633921074, + "grad_norm": 0.1385970562696457, + "learning_rate": 9.819284753905078e-05, + "loss": 0.1653, + "step": 845 + }, + { + "epoch": 0.17758186397984888, + "grad_norm": 0.14429986476898193, + "learning_rate": 9.81830727140091e-05, + "loss": 0.1801, + "step": 846 + }, + { + "epoch": 0.17779177162048698, + "grad_norm": 0.16140583157539368, + "learning_rate": 9.817327201355675e-05, + "loss": 0.1808, + "step": 847 + }, + { + "epoch": 0.17800167926112512, + "grad_norm": 0.13548845052719116, + "learning_rate": 9.816344544295692e-05, + "loss": 0.1567, + "step": 848 + }, + { + "epoch": 0.17821158690176322, + "grad_norm": 0.14740432798862457, + "learning_rate": 9.815359300748674e-05, + "loss": 0.1737, + "step": 849 + }, + { + "epoch": 0.17842149454240133, + "grad_norm": 0.15687352418899536, + "learning_rate": 9.814371471243715e-05, + "loss": 0.1775, + "step": 850 + }, + { + "epoch": 0.17863140218303947, + "grad_norm": 0.14518170058727264, + "learning_rate": 9.813381056311307e-05, + "loss": 0.1536, + "step": 851 + }, + { + "epoch": 0.17884130982367757, + "grad_norm": 0.1569397896528244, + "learning_rate": 9.812388056483319e-05, + "loss": 0.1667, + "step": 852 + }, + { + "epoch": 0.1790512174643157, + "grad_norm": 0.13371489942073822, + "learning_rate": 9.81139247229302e-05, + "loss": 0.175, + "step": 853 + }, + { + "epoch": 0.17926112510495382, + "grad_norm": 0.15128138661384583, + "learning_rate": 9.810394304275058e-05, + "loss": 0.1601, + "step": 854 + }, + { + "epoch": 0.17947103274559195, + "grad_norm": 0.12941020727157593, + "learning_rate": 9.809393552965476e-05, + "loss": 0.1579, + "step": 855 + }, + { + "epoch": 0.17968094038623006, + "grad_norm": 0.1644691675901413, + "learning_rate": 9.808390218901696e-05, + "loss": 0.1737, + "step": 856 + }, + { + "epoch": 0.17989084802686817, + "grad_norm": 0.16271840035915375, + "learning_rate": 9.807384302622533e-05, + "loss": 0.1846, + "step": 857 + }, + { + "epoch": 0.1801007556675063, + "grad_norm": 0.15787339210510254, + "learning_rate": 9.806375804668189e-05, + "loss": 0.1733, + "step": 858 + }, + { + "epoch": 0.1803106633081444, + "grad_norm": 0.16740775108337402, + "learning_rate": 9.805364725580248e-05, + "loss": 0.1625, + "step": 859 + }, + { + "epoch": 0.18052057094878254, + "grad_norm": 0.11970685422420502, + "learning_rate": 9.804351065901682e-05, + "loss": 0.1554, + "step": 860 + }, + { + "epoch": 0.18073047858942065, + "grad_norm": 0.21359467506408691, + "learning_rate": 9.803334826176852e-05, + "loss": 0.1658, + "step": 861 + }, + { + "epoch": 0.1809403862300588, + "grad_norm": 0.1699121594429016, + "learning_rate": 9.8023160069515e-05, + "loss": 0.1608, + "step": 862 + }, + { + "epoch": 0.1811502938706969, + "grad_norm": 0.15073060989379883, + "learning_rate": 9.801294608772755e-05, + "loss": 0.156, + "step": 863 + }, + { + "epoch": 0.181360201511335, + "grad_norm": 0.17342859506607056, + "learning_rate": 9.800270632189133e-05, + "loss": 0.1829, + "step": 864 + }, + { + "epoch": 0.18157010915197314, + "grad_norm": 0.20463357865810394, + "learning_rate": 9.799244077750531e-05, + "loss": 0.1839, + "step": 865 + }, + { + "epoch": 0.18178001679261124, + "grad_norm": 0.17843082547187805, + "learning_rate": 9.798214946008234e-05, + "loss": 0.1707, + "step": 866 + }, + { + "epoch": 0.18198992443324938, + "grad_norm": 0.14360542595386505, + "learning_rate": 9.797183237514907e-05, + "loss": 0.1687, + "step": 867 + }, + { + "epoch": 0.18219983207388749, + "grad_norm": 0.15701772272586823, + "learning_rate": 9.796148952824603e-05, + "loss": 0.1584, + "step": 868 + }, + { + "epoch": 0.18240973971452562, + "grad_norm": 0.12531954050064087, + "learning_rate": 9.795112092492755e-05, + "loss": 0.1467, + "step": 869 + }, + { + "epoch": 0.18261964735516373, + "grad_norm": 0.16143764555454254, + "learning_rate": 9.794072657076182e-05, + "loss": 0.1687, + "step": 870 + }, + { + "epoch": 0.18282955499580184, + "grad_norm": 0.13665997982025146, + "learning_rate": 9.793030647133084e-05, + "loss": 0.1644, + "step": 871 + }, + { + "epoch": 0.18303946263643997, + "grad_norm": 0.15797339379787445, + "learning_rate": 9.791986063223045e-05, + "loss": 0.1737, + "step": 872 + }, + { + "epoch": 0.18324937027707808, + "grad_norm": 0.13726350665092468, + "learning_rate": 9.790938905907027e-05, + "loss": 0.1842, + "step": 873 + }, + { + "epoch": 0.1834592779177162, + "grad_norm": 0.13706736266613007, + "learning_rate": 9.78988917574738e-05, + "loss": 0.1699, + "step": 874 + }, + { + "epoch": 0.18366918555835432, + "grad_norm": 0.13394543528556824, + "learning_rate": 9.788836873307835e-05, + "loss": 0.1757, + "step": 875 + }, + { + "epoch": 0.18387909319899245, + "grad_norm": 0.13909773528575897, + "learning_rate": 9.7877819991535e-05, + "loss": 0.1761, + "step": 876 + }, + { + "epoch": 0.18408900083963056, + "grad_norm": 0.1416359543800354, + "learning_rate": 9.786724553850865e-05, + "loss": 0.1657, + "step": 877 + }, + { + "epoch": 0.18429890848026867, + "grad_norm": 0.13720481097698212, + "learning_rate": 9.785664537967806e-05, + "loss": 0.173, + "step": 878 + }, + { + "epoch": 0.1845088161209068, + "grad_norm": 0.14136558771133423, + "learning_rate": 9.784601952073573e-05, + "loss": 0.1625, + "step": 879 + }, + { + "epoch": 0.1847187237615449, + "grad_norm": 0.1424713134765625, + "learning_rate": 9.783536796738802e-05, + "loss": 0.1676, + "step": 880 + }, + { + "epoch": 0.18492863140218305, + "grad_norm": 0.163283571600914, + "learning_rate": 9.782469072535502e-05, + "loss": 0.1574, + "step": 881 + }, + { + "epoch": 0.18513853904282115, + "grad_norm": 0.16645042598247528, + "learning_rate": 9.781398780037067e-05, + "loss": 0.1576, + "step": 882 + }, + { + "epoch": 0.1853484466834593, + "grad_norm": 0.17722882330417633, + "learning_rate": 9.780325919818268e-05, + "loss": 0.1815, + "step": 883 + }, + { + "epoch": 0.1855583543240974, + "grad_norm": 0.1435573697090149, + "learning_rate": 9.779250492455257e-05, + "loss": 0.168, + "step": 884 + }, + { + "epoch": 0.1857682619647355, + "grad_norm": 0.13388660550117493, + "learning_rate": 9.778172498525559e-05, + "loss": 0.1493, + "step": 885 + }, + { + "epoch": 0.18597816960537364, + "grad_norm": 0.15198923647403717, + "learning_rate": 9.777091938608088e-05, + "loss": 0.1759, + "step": 886 + }, + { + "epoch": 0.18618807724601175, + "grad_norm": 0.14781691133975983, + "learning_rate": 9.776008813283125e-05, + "loss": 0.1521, + "step": 887 + }, + { + "epoch": 0.18639798488664988, + "grad_norm": 0.18348653614521027, + "learning_rate": 9.774923123132332e-05, + "loss": 0.1522, + "step": 888 + }, + { + "epoch": 0.186607892527288, + "grad_norm": 0.15346892178058624, + "learning_rate": 9.773834868738752e-05, + "loss": 0.1593, + "step": 889 + }, + { + "epoch": 0.18681780016792612, + "grad_norm": 0.1563442200422287, + "learning_rate": 9.7727440506868e-05, + "loss": 0.1569, + "step": 890 + }, + { + "epoch": 0.18702770780856423, + "grad_norm": 0.12280000746250153, + "learning_rate": 9.771650669562274e-05, + "loss": 0.1593, + "step": 891 + }, + { + "epoch": 0.18723761544920234, + "grad_norm": 0.1622755527496338, + "learning_rate": 9.770554725952341e-05, + "loss": 0.1736, + "step": 892 + }, + { + "epoch": 0.18744752308984047, + "grad_norm": 0.19185318052768707, + "learning_rate": 9.769456220445549e-05, + "loss": 0.165, + "step": 893 + }, + { + "epoch": 0.18765743073047858, + "grad_norm": 0.16040024161338806, + "learning_rate": 9.768355153631822e-05, + "loss": 0.1837, + "step": 894 + }, + { + "epoch": 0.18786733837111672, + "grad_norm": 0.11211330443620682, + "learning_rate": 9.767251526102456e-05, + "loss": 0.152, + "step": 895 + }, + { + "epoch": 0.18807724601175482, + "grad_norm": 0.16628898680210114, + "learning_rate": 9.766145338450125e-05, + "loss": 0.1729, + "step": 896 + }, + { + "epoch": 0.18828715365239296, + "grad_norm": 0.14260315895080566, + "learning_rate": 9.765036591268877e-05, + "loss": 0.165, + "step": 897 + }, + { + "epoch": 0.18849706129303107, + "grad_norm": 0.12734055519104004, + "learning_rate": 9.763925285154135e-05, + "loss": 0.1714, + "step": 898 + }, + { + "epoch": 0.18870696893366917, + "grad_norm": 0.1465056985616684, + "learning_rate": 9.762811420702693e-05, + "loss": 0.1805, + "step": 899 + }, + { + "epoch": 0.1889168765743073, + "grad_norm": 0.12369433790445328, + "learning_rate": 9.761694998512727e-05, + "loss": 0.1737, + "step": 900 + }, + { + "epoch": 0.18912678421494541, + "grad_norm": 0.13893358409404755, + "learning_rate": 9.760576019183775e-05, + "loss": 0.1502, + "step": 901 + }, + { + "epoch": 0.18933669185558355, + "grad_norm": 0.13517262041568756, + "learning_rate": 9.759454483316761e-05, + "loss": 0.1648, + "step": 902 + }, + { + "epoch": 0.18954659949622166, + "grad_norm": 0.13170479238033295, + "learning_rate": 9.75833039151397e-05, + "loss": 0.159, + "step": 903 + }, + { + "epoch": 0.1897565071368598, + "grad_norm": 0.13293495774269104, + "learning_rate": 9.757203744379067e-05, + "loss": 0.1667, + "step": 904 + }, + { + "epoch": 0.1899664147774979, + "grad_norm": 0.12381456047296524, + "learning_rate": 9.756074542517088e-05, + "loss": 0.1684, + "step": 905 + }, + { + "epoch": 0.190176322418136, + "grad_norm": 0.12754730880260468, + "learning_rate": 9.75494278653444e-05, + "loss": 0.1559, + "step": 906 + }, + { + "epoch": 0.19038623005877414, + "grad_norm": 0.17824846506118774, + "learning_rate": 9.753808477038899e-05, + "loss": 0.1533, + "step": 907 + }, + { + "epoch": 0.19059613769941225, + "grad_norm": 0.13958828151226044, + "learning_rate": 9.752671614639619e-05, + "loss": 0.1494, + "step": 908 + }, + { + "epoch": 0.19080604534005038, + "grad_norm": 0.14593012630939484, + "learning_rate": 9.75153219994712e-05, + "loss": 0.1598, + "step": 909 + }, + { + "epoch": 0.1910159529806885, + "grad_norm": 0.19262051582336426, + "learning_rate": 9.750390233573293e-05, + "loss": 0.1638, + "step": 910 + }, + { + "epoch": 0.19122586062132663, + "grad_norm": 0.15087257325649261, + "learning_rate": 9.7492457161314e-05, + "loss": 0.1644, + "step": 911 + }, + { + "epoch": 0.19143576826196473, + "grad_norm": 0.17646470665931702, + "learning_rate": 9.748098648236072e-05, + "loss": 0.1644, + "step": 912 + }, + { + "epoch": 0.19164567590260284, + "grad_norm": 0.15336105227470398, + "learning_rate": 9.746949030503312e-05, + "loss": 0.1717, + "step": 913 + }, + { + "epoch": 0.19185558354324098, + "grad_norm": 0.15364870429039001, + "learning_rate": 9.745796863550492e-05, + "loss": 0.1721, + "step": 914 + }, + { + "epoch": 0.19206549118387908, + "grad_norm": 0.1769437938928604, + "learning_rate": 9.74464214799635e-05, + "loss": 0.1485, + "step": 915 + }, + { + "epoch": 0.19227539882451722, + "grad_norm": 0.18178406357765198, + "learning_rate": 9.743484884460993e-05, + "loss": 0.1523, + "step": 916 + }, + { + "epoch": 0.19248530646515533, + "grad_norm": 0.16841888427734375, + "learning_rate": 9.742325073565905e-05, + "loss": 0.148, + "step": 917 + }, + { + "epoch": 0.19269521410579346, + "grad_norm": 0.13603579998016357, + "learning_rate": 9.741162715933924e-05, + "loss": 0.161, + "step": 918 + }, + { + "epoch": 0.19290512174643157, + "grad_norm": 0.1348285973072052, + "learning_rate": 9.739997812189265e-05, + "loss": 0.145, + "step": 919 + }, + { + "epoch": 0.19311502938706968, + "grad_norm": 0.13922441005706787, + "learning_rate": 9.738830362957508e-05, + "loss": 0.1607, + "step": 920 + }, + { + "epoch": 0.1933249370277078, + "grad_norm": 0.12017328292131424, + "learning_rate": 9.7376603688656e-05, + "loss": 0.152, + "step": 921 + }, + { + "epoch": 0.19353484466834592, + "grad_norm": 0.14594700932502747, + "learning_rate": 9.736487830541853e-05, + "loss": 0.1638, + "step": 922 + }, + { + "epoch": 0.19374475230898405, + "grad_norm": 0.14390654861927032, + "learning_rate": 9.73531274861595e-05, + "loss": 0.1608, + "step": 923 + }, + { + "epoch": 0.19395465994962216, + "grad_norm": 0.12394702434539795, + "learning_rate": 9.734135123718933e-05, + "loss": 0.1612, + "step": 924 + }, + { + "epoch": 0.1941645675902603, + "grad_norm": 0.16754676401615143, + "learning_rate": 9.732954956483218e-05, + "loss": 0.1791, + "step": 925 + }, + { + "epoch": 0.1943744752308984, + "grad_norm": 0.2816343605518341, + "learning_rate": 9.731772247542576e-05, + "loss": 0.1489, + "step": 926 + }, + { + "epoch": 0.19458438287153654, + "grad_norm": 0.15909412503242493, + "learning_rate": 9.730586997532155e-05, + "loss": 0.1531, + "step": 927 + }, + { + "epoch": 0.19479429051217464, + "grad_norm": 0.16626602411270142, + "learning_rate": 9.729399207088457e-05, + "loss": 0.1729, + "step": 928 + }, + { + "epoch": 0.19500419815281275, + "grad_norm": 0.14581038057804108, + "learning_rate": 9.728208876849354e-05, + "loss": 0.1616, + "step": 929 + }, + { + "epoch": 0.1952141057934509, + "grad_norm": 0.14985312521457672, + "learning_rate": 9.727016007454079e-05, + "loss": 0.1583, + "step": 930 + }, + { + "epoch": 0.195424013434089, + "grad_norm": 0.13958559930324554, + "learning_rate": 9.725820599543234e-05, + "loss": 0.1646, + "step": 931 + }, + { + "epoch": 0.19563392107472713, + "grad_norm": 0.1607862412929535, + "learning_rate": 9.724622653758777e-05, + "loss": 0.1549, + "step": 932 + }, + { + "epoch": 0.19584382871536524, + "grad_norm": 0.17007960379123688, + "learning_rate": 9.723422170744031e-05, + "loss": 0.1718, + "step": 933 + }, + { + "epoch": 0.19605373635600337, + "grad_norm": 0.1419927030801773, + "learning_rate": 9.722219151143688e-05, + "loss": 0.1689, + "step": 934 + }, + { + "epoch": 0.19626364399664148, + "grad_norm": 0.1631292998790741, + "learning_rate": 9.721013595603793e-05, + "loss": 0.1611, + "step": 935 + }, + { + "epoch": 0.1964735516372796, + "grad_norm": 0.19870012998580933, + "learning_rate": 9.719805504771758e-05, + "loss": 0.1836, + "step": 936 + }, + { + "epoch": 0.19668345927791772, + "grad_norm": 0.14150285720825195, + "learning_rate": 9.718594879296355e-05, + "loss": 0.1718, + "step": 937 + }, + { + "epoch": 0.19689336691855583, + "grad_norm": 0.1416793018579483, + "learning_rate": 9.717381719827716e-05, + "loss": 0.1511, + "step": 938 + }, + { + "epoch": 0.19710327455919396, + "grad_norm": 0.14615508913993835, + "learning_rate": 9.716166027017339e-05, + "loss": 0.1599, + "step": 939 + }, + { + "epoch": 0.19731318219983207, + "grad_norm": 0.13773533701896667, + "learning_rate": 9.714947801518076e-05, + "loss": 0.1765, + "step": 940 + }, + { + "epoch": 0.1975230898404702, + "grad_norm": 0.11306725442409515, + "learning_rate": 9.713727043984143e-05, + "loss": 0.1678, + "step": 941 + }, + { + "epoch": 0.1977329974811083, + "grad_norm": 0.14150364696979523, + "learning_rate": 9.712503755071115e-05, + "loss": 0.1535, + "step": 942 + }, + { + "epoch": 0.19794290512174642, + "grad_norm": 0.15026667714118958, + "learning_rate": 9.711277935435925e-05, + "loss": 0.155, + "step": 943 + }, + { + "epoch": 0.19815281276238456, + "grad_norm": 0.1540324091911316, + "learning_rate": 9.710049585736866e-05, + "loss": 0.1866, + "step": 944 + }, + { + "epoch": 0.19836272040302266, + "grad_norm": 0.12859384715557098, + "learning_rate": 9.708818706633591e-05, + "loss": 0.1512, + "step": 945 + }, + { + "epoch": 0.1985726280436608, + "grad_norm": 0.14280495047569275, + "learning_rate": 9.707585298787109e-05, + "loss": 0.1558, + "step": 946 + }, + { + "epoch": 0.1987825356842989, + "grad_norm": 0.14329122006893158, + "learning_rate": 9.70634936285979e-05, + "loss": 0.153, + "step": 947 + }, + { + "epoch": 0.19899244332493704, + "grad_norm": 0.18129687011241913, + "learning_rate": 9.705110899515359e-05, + "loss": 0.1592, + "step": 948 + }, + { + "epoch": 0.19920235096557515, + "grad_norm": 0.13191339373588562, + "learning_rate": 9.7038699094189e-05, + "loss": 0.1565, + "step": 949 + }, + { + "epoch": 0.19941225860621326, + "grad_norm": 0.1381015032529831, + "learning_rate": 9.702626393236849e-05, + "loss": 0.1704, + "step": 950 + }, + { + "epoch": 0.1996221662468514, + "grad_norm": 0.12249460071325302, + "learning_rate": 9.701380351637007e-05, + "loss": 0.1534, + "step": 951 + }, + { + "epoch": 0.1998320738874895, + "grad_norm": 0.1820680946111679, + "learning_rate": 9.700131785288525e-05, + "loss": 0.1805, + "step": 952 + }, + { + "epoch": 0.20004198152812763, + "grad_norm": 0.13920508325099945, + "learning_rate": 9.698880694861913e-05, + "loss": 0.1652, + "step": 953 + }, + { + "epoch": 0.20025188916876574, + "grad_norm": 0.14306975901126862, + "learning_rate": 9.697627081029033e-05, + "loss": 0.1397, + "step": 954 + }, + { + "epoch": 0.20046179680940387, + "grad_norm": 0.13642224669456482, + "learning_rate": 9.696370944463104e-05, + "loss": 0.174, + "step": 955 + }, + { + "epoch": 0.20067170445004198, + "grad_norm": 0.12083397060632706, + "learning_rate": 9.695112285838704e-05, + "loss": 0.1807, + "step": 956 + }, + { + "epoch": 0.2008816120906801, + "grad_norm": 0.17191384732723236, + "learning_rate": 9.693851105831757e-05, + "loss": 0.1656, + "step": 957 + }, + { + "epoch": 0.20109151973131822, + "grad_norm": 0.1415136307477951, + "learning_rate": 9.692587405119549e-05, + "loss": 0.1726, + "step": 958 + }, + { + "epoch": 0.20130142737195633, + "grad_norm": 0.15210871398448944, + "learning_rate": 9.691321184380713e-05, + "loss": 0.1614, + "step": 959 + }, + { + "epoch": 0.20151133501259447, + "grad_norm": 0.14295780658721924, + "learning_rate": 9.690052444295239e-05, + "loss": 0.1884, + "step": 960 + }, + { + "epoch": 0.20172124265323257, + "grad_norm": 0.13754574954509735, + "learning_rate": 9.688781185544471e-05, + "loss": 0.1719, + "step": 961 + }, + { + "epoch": 0.2019311502938707, + "grad_norm": 0.1425933688879013, + "learning_rate": 9.687507408811104e-05, + "loss": 0.1564, + "step": 962 + }, + { + "epoch": 0.20214105793450882, + "grad_norm": 0.1202697604894638, + "learning_rate": 9.686231114779184e-05, + "loss": 0.1584, + "step": 963 + }, + { + "epoch": 0.20235096557514692, + "grad_norm": 0.12710309028625488, + "learning_rate": 9.684952304134111e-05, + "loss": 0.168, + "step": 964 + }, + { + "epoch": 0.20256087321578506, + "grad_norm": 0.14089150726795197, + "learning_rate": 9.683670977562633e-05, + "loss": 0.1852, + "step": 965 + }, + { + "epoch": 0.20277078085642317, + "grad_norm": 0.13602103292942047, + "learning_rate": 9.682387135752856e-05, + "loss": 0.1673, + "step": 966 + }, + { + "epoch": 0.2029806884970613, + "grad_norm": 0.13352209329605103, + "learning_rate": 9.68110077939423e-05, + "loss": 0.1571, + "step": 967 + }, + { + "epoch": 0.2031905961376994, + "grad_norm": 0.13534000515937805, + "learning_rate": 9.679811909177556e-05, + "loss": 0.1706, + "step": 968 + }, + { + "epoch": 0.20340050377833754, + "grad_norm": 0.12475798279047012, + "learning_rate": 9.67852052579499e-05, + "loss": 0.1532, + "step": 969 + }, + { + "epoch": 0.20361041141897565, + "grad_norm": 0.1589164137840271, + "learning_rate": 9.677226629940032e-05, + "loss": 0.1597, + "step": 970 + }, + { + "epoch": 0.20382031905961376, + "grad_norm": 0.14976951479911804, + "learning_rate": 9.675930222307537e-05, + "loss": 0.1731, + "step": 971 + }, + { + "epoch": 0.2040302267002519, + "grad_norm": 0.12664476037025452, + "learning_rate": 9.6746313035937e-05, + "loss": 0.1363, + "step": 972 + }, + { + "epoch": 0.20424013434089, + "grad_norm": 0.12275015562772751, + "learning_rate": 9.673329874496075e-05, + "loss": 0.1574, + "step": 973 + }, + { + "epoch": 0.20445004198152814, + "grad_norm": 0.13222196698188782, + "learning_rate": 9.672025935713556e-05, + "loss": 0.1535, + "step": 974 + }, + { + "epoch": 0.20465994962216624, + "grad_norm": 0.15173639357089996, + "learning_rate": 9.670719487946389e-05, + "loss": 0.1792, + "step": 975 + }, + { + "epoch": 0.20486985726280438, + "grad_norm": 0.17748403549194336, + "learning_rate": 9.669410531896167e-05, + "loss": 0.1891, + "step": 976 + }, + { + "epoch": 0.20507976490344249, + "grad_norm": 0.14893555641174316, + "learning_rate": 9.66809906826583e-05, + "loss": 0.1438, + "step": 977 + }, + { + "epoch": 0.2052896725440806, + "grad_norm": 0.16388140618801117, + "learning_rate": 9.66678509775966e-05, + "loss": 0.1641, + "step": 978 + }, + { + "epoch": 0.20549958018471873, + "grad_norm": 0.17024089395999908, + "learning_rate": 9.665468621083293e-05, + "loss": 0.1839, + "step": 979 + }, + { + "epoch": 0.20570948782535683, + "grad_norm": 0.14790277183055878, + "learning_rate": 9.664149638943707e-05, + "loss": 0.1701, + "step": 980 + }, + { + "epoch": 0.20591939546599497, + "grad_norm": 0.14114373922348022, + "learning_rate": 9.662828152049223e-05, + "loss": 0.1699, + "step": 981 + }, + { + "epoch": 0.20612930310663308, + "grad_norm": 0.12556609511375427, + "learning_rate": 9.661504161109513e-05, + "loss": 0.1607, + "step": 982 + }, + { + "epoch": 0.2063392107472712, + "grad_norm": 0.11868204176425934, + "learning_rate": 9.660177666835585e-05, + "loss": 0.1487, + "step": 983 + }, + { + "epoch": 0.20654911838790932, + "grad_norm": 0.12117776274681091, + "learning_rate": 9.658848669939805e-05, + "loss": 0.1668, + "step": 984 + }, + { + "epoch": 0.20675902602854743, + "grad_norm": 0.126139298081398, + "learning_rate": 9.657517171135866e-05, + "loss": 0.1535, + "step": 985 + }, + { + "epoch": 0.20696893366918556, + "grad_norm": 0.15042434632778168, + "learning_rate": 9.656183171138818e-05, + "loss": 0.168, + "step": 986 + }, + { + "epoch": 0.20717884130982367, + "grad_norm": 0.1438342183828354, + "learning_rate": 9.65484667066505e-05, + "loss": 0.1628, + "step": 987 + }, + { + "epoch": 0.2073887489504618, + "grad_norm": 0.1280035823583603, + "learning_rate": 9.65350767043229e-05, + "loss": 0.1655, + "step": 988 + }, + { + "epoch": 0.2075986565910999, + "grad_norm": 0.13997893035411835, + "learning_rate": 9.652166171159614e-05, + "loss": 0.1766, + "step": 989 + }, + { + "epoch": 0.20780856423173805, + "grad_norm": 0.1653011292219162, + "learning_rate": 9.650822173567438e-05, + "loss": 0.1744, + "step": 990 + }, + { + "epoch": 0.20801847187237615, + "grad_norm": 0.1532358080148697, + "learning_rate": 9.64947567837752e-05, + "loss": 0.1529, + "step": 991 + }, + { + "epoch": 0.20822837951301426, + "grad_norm": 0.13623711466789246, + "learning_rate": 9.648126686312955e-05, + "loss": 0.1632, + "step": 992 + }, + { + "epoch": 0.2084382871536524, + "grad_norm": 0.13220278918743134, + "learning_rate": 9.646775198098186e-05, + "loss": 0.1462, + "step": 993 + }, + { + "epoch": 0.2086481947942905, + "grad_norm": 0.16712597012519836, + "learning_rate": 9.645421214458992e-05, + "loss": 0.1658, + "step": 994 + }, + { + "epoch": 0.20885810243492864, + "grad_norm": 0.16033975780010223, + "learning_rate": 9.644064736122493e-05, + "loss": 0.1523, + "step": 995 + }, + { + "epoch": 0.20906801007556675, + "grad_norm": 0.13283276557922363, + "learning_rate": 9.64270576381715e-05, + "loss": 0.1738, + "step": 996 + }, + { + "epoch": 0.20927791771620488, + "grad_norm": 0.2072938233613968, + "learning_rate": 9.64134429827276e-05, + "loss": 0.1589, + "step": 997 + }, + { + "epoch": 0.209487825356843, + "grad_norm": 0.15322737395763397, + "learning_rate": 9.639980340220462e-05, + "loss": 0.1772, + "step": 998 + }, + { + "epoch": 0.2096977329974811, + "grad_norm": 0.21009685099124908, + "learning_rate": 9.638613890392734e-05, + "loss": 0.1619, + "step": 999 + }, + { + "epoch": 0.20990764063811923, + "grad_norm": 0.1521667093038559, + "learning_rate": 9.63724494952339e-05, + "loss": 0.1878, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 4764, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.14184099585065e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}