diff --git "a/checkpoint-2812/trainer_state.json" "b/checkpoint-2812/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2812/trainer_state.json" @@ -0,0 +1,19757 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 703, + "global_step": 2812, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00035561877667140827, + "grad_norm": 8.660747528076172, + "learning_rate": 3.0000000000000004e-08, + "loss": 6.5549, + "step": 1 + }, + { + "epoch": 0.00035561877667140827, + "eval_loss": 7.5396270751953125, + "eval_runtime": 304.0215, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 1 + }, + { + "epoch": 0.0007112375533428165, + "grad_norm": 22.982288360595703, + "learning_rate": 6.000000000000001e-08, + "loss": 10.6678, + "step": 2 + }, + { + "epoch": 0.0010668563300142249, + "grad_norm": 7.824126243591309, + "learning_rate": 9e-08, + "loss": 7.0961, + "step": 3 + }, + { + "epoch": 0.001422475106685633, + "grad_norm": 11.892224311828613, + "learning_rate": 1.2000000000000002e-07, + "loss": 7.1413, + "step": 4 + }, + { + "epoch": 0.0017780938833570413, + "grad_norm": 4.799657344818115, + "learning_rate": 1.5000000000000002e-07, + "loss": 6.9134, + "step": 5 + }, + { + "epoch": 0.0021337126600284497, + "grad_norm": 11.623159408569336, + "learning_rate": 1.8e-07, + "loss": 10.4101, + "step": 6 + }, + { + "epoch": 0.0024893314366998577, + "grad_norm": 5.316324234008789, + "learning_rate": 2.1000000000000003e-07, + "loss": 4.7275, + "step": 7 + }, + { + "epoch": 0.002844950213371266, + "grad_norm": 8.917740821838379, + "learning_rate": 2.4000000000000003e-07, + "loss": 8.7934, + "step": 8 + }, + { + "epoch": 0.003200568990042674, + "grad_norm": 3.3449316024780273, + "learning_rate": 2.7e-07, + "loss": 3.995, + "step": 9 + }, + { + "epoch": 0.0035561877667140826, + "grad_norm": 5.076784133911133, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.033, + "step": 10 + }, + { + "epoch": 0.0039118065433854906, + "grad_norm": 4.978409767150879, + "learning_rate": 3.3e-07, + "loss": 6.1327, + "step": 11 + }, + { + "epoch": 0.004267425320056899, + "grad_norm": 8.947249412536621, + "learning_rate": 3.6e-07, + "loss": 10.0783, + "step": 12 + }, + { + "epoch": 0.004623044096728307, + "grad_norm": 5.048568248748779, + "learning_rate": 3.9e-07, + "loss": 7.6933, + "step": 13 + }, + { + "epoch": 0.004978662873399715, + "grad_norm": 6.6549859046936035, + "learning_rate": 4.2000000000000006e-07, + "loss": 5.4006, + "step": 14 + }, + { + "epoch": 0.005334281650071123, + "grad_norm": 2.798969030380249, + "learning_rate": 4.5e-07, + "loss": 5.688, + "step": 15 + }, + { + "epoch": 0.005689900426742532, + "grad_norm": 7.968538761138916, + "learning_rate": 4.800000000000001e-07, + "loss": 7.4622, + "step": 16 + }, + { + "epoch": 0.00604551920341394, + "grad_norm": 4.862197399139404, + "learning_rate": 5.100000000000001e-07, + "loss": 6.2464, + "step": 17 + }, + { + "epoch": 0.006401137980085348, + "grad_norm": 4.285610675811768, + "learning_rate": 5.4e-07, + "loss": 6.6423, + "step": 18 + }, + { + "epoch": 0.006756756756756757, + "grad_norm": 7.234747409820557, + "learning_rate": 5.7e-07, + "loss": 9.2264, + "step": 19 + }, + { + "epoch": 0.007112375533428165, + "grad_norm": 8.596134185791016, + "learning_rate": 6.000000000000001e-07, + "loss": 9.9718, + "step": 20 + }, + { + "epoch": 0.007467994310099573, + "grad_norm": 6.671982765197754, + "learning_rate": 6.3e-07, + "loss": 6.1073, + "step": 21 + }, + { + "epoch": 0.007823613086770981, + "grad_norm": 5.703658103942871, + "learning_rate": 6.6e-07, + "loss": 6.4463, + "step": 22 + }, + { + "epoch": 0.008179231863442389, + "grad_norm": 7.160182952880859, + "learning_rate": 6.900000000000001e-07, + "loss": 6.5243, + "step": 23 + }, + { + "epoch": 0.008534850640113799, + "grad_norm": 35.218658447265625, + "learning_rate": 7.2e-07, + "loss": 6.427, + "step": 24 + }, + { + "epoch": 0.008890469416785207, + "grad_norm": 6.055460453033447, + "learning_rate": 7.5e-07, + "loss": 6.1666, + "step": 25 + }, + { + "epoch": 0.009246088193456615, + "grad_norm": 4.726566314697266, + "learning_rate": 7.8e-07, + "loss": 7.7077, + "step": 26 + }, + { + "epoch": 0.009601706970128023, + "grad_norm": 7.525938510894775, + "learning_rate": 8.100000000000001e-07, + "loss": 6.5426, + "step": 27 + }, + { + "epoch": 0.00995732574679943, + "grad_norm": 6.565018177032471, + "learning_rate": 8.400000000000001e-07, + "loss": 8.228, + "step": 28 + }, + { + "epoch": 0.010312944523470839, + "grad_norm": 6.851963043212891, + "learning_rate": 8.699999999999999e-07, + "loss": 5.2098, + "step": 29 + }, + { + "epoch": 0.010668563300142247, + "grad_norm": 12.753103256225586, + "learning_rate": 9e-07, + "loss": 11.8284, + "step": 30 + }, + { + "epoch": 0.011024182076813657, + "grad_norm": 4.209855079650879, + "learning_rate": 9.3e-07, + "loss": 10.0105, + "step": 31 + }, + { + "epoch": 0.011379800853485065, + "grad_norm": 5.523573398590088, + "learning_rate": 9.600000000000001e-07, + "loss": 6.9525, + "step": 32 + }, + { + "epoch": 0.011735419630156473, + "grad_norm": 4.3014750480651855, + "learning_rate": 9.9e-07, + "loss": 5.5591, + "step": 33 + }, + { + "epoch": 0.01209103840682788, + "grad_norm": 5.838426113128662, + "learning_rate": 1.0200000000000002e-06, + "loss": 6.8275, + "step": 34 + }, + { + "epoch": 0.012446657183499289, + "grad_norm": 6.497621059417725, + "learning_rate": 1.05e-06, + "loss": 8.9188, + "step": 35 + }, + { + "epoch": 0.012802275960170697, + "grad_norm": 9.64803409576416, + "learning_rate": 1.08e-06, + "loss": 10.0789, + "step": 36 + }, + { + "epoch": 0.013157894736842105, + "grad_norm": 3.929687023162842, + "learning_rate": 1.11e-06, + "loss": 7.2339, + "step": 37 + }, + { + "epoch": 0.013513513513513514, + "grad_norm": 5.083961486816406, + "learning_rate": 1.14e-06, + "loss": 5.7898, + "step": 38 + }, + { + "epoch": 0.013869132290184922, + "grad_norm": 8.719144821166992, + "learning_rate": 1.17e-06, + "loss": 7.5372, + "step": 39 + }, + { + "epoch": 0.01422475106685633, + "grad_norm": 1.972847819328308, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.7912, + "step": 40 + }, + { + "epoch": 0.014580369843527738, + "grad_norm": 5.771431922912598, + "learning_rate": 1.2299999999999999e-06, + "loss": 8.1872, + "step": 41 + }, + { + "epoch": 0.014935988620199146, + "grad_norm": 7.699543476104736, + "learning_rate": 1.26e-06, + "loss": 5.3109, + "step": 42 + }, + { + "epoch": 0.015291607396870554, + "grad_norm": 4.988993167877197, + "learning_rate": 1.29e-06, + "loss": 7.0093, + "step": 43 + }, + { + "epoch": 0.015647226173541962, + "grad_norm": 6.067366600036621, + "learning_rate": 1.32e-06, + "loss": 6.588, + "step": 44 + }, + { + "epoch": 0.016002844950213372, + "grad_norm": 4.55286979675293, + "learning_rate": 1.35e-06, + "loss": 7.312, + "step": 45 + }, + { + "epoch": 0.016358463726884778, + "grad_norm": 4.87521505355835, + "learning_rate": 1.3800000000000001e-06, + "loss": 7.4906, + "step": 46 + }, + { + "epoch": 0.016714082503556188, + "grad_norm": 4.940184116363525, + "learning_rate": 1.41e-06, + "loss": 7.7316, + "step": 47 + }, + { + "epoch": 0.017069701280227598, + "grad_norm": 10.389467239379883, + "learning_rate": 1.44e-06, + "loss": 8.5561, + "step": 48 + }, + { + "epoch": 0.017425320056899004, + "grad_norm": 2.6143903732299805, + "learning_rate": 1.4700000000000001e-06, + "loss": 3.9455, + "step": 49 + }, + { + "epoch": 0.017780938833570414, + "grad_norm": 6.3977484703063965, + "learning_rate": 1.5e-06, + "loss": 7.1512, + "step": 50 + }, + { + "epoch": 0.01813655761024182, + "grad_norm": 7.077576160430908, + "learning_rate": 1.53e-06, + "loss": 6.2702, + "step": 51 + }, + { + "epoch": 0.01849217638691323, + "grad_norm": 10.010270118713379, + "learning_rate": 1.56e-06, + "loss": 9.1809, + "step": 52 + }, + { + "epoch": 0.018847795163584636, + "grad_norm": 4.484495162963867, + "learning_rate": 1.59e-06, + "loss": 7.3234, + "step": 53 + }, + { + "epoch": 0.019203413940256046, + "grad_norm": 3.2852706909179688, + "learning_rate": 1.6200000000000002e-06, + "loss": 6.1882, + "step": 54 + }, + { + "epoch": 0.019559032716927455, + "grad_norm": 8.268205642700195, + "learning_rate": 1.65e-06, + "loss": 7.1931, + "step": 55 + }, + { + "epoch": 0.01991465149359886, + "grad_norm": 6.677961349487305, + "learning_rate": 1.6800000000000002e-06, + "loss": 7.5425, + "step": 56 + }, + { + "epoch": 0.02027027027027027, + "grad_norm": 4.325429916381836, + "learning_rate": 1.71e-06, + "loss": 6.1305, + "step": 57 + }, + { + "epoch": 0.020625889046941678, + "grad_norm": 2.902691125869751, + "learning_rate": 1.7399999999999999e-06, + "loss": 5.9186, + "step": 58 + }, + { + "epoch": 0.020981507823613087, + "grad_norm": 9.4213285446167, + "learning_rate": 1.77e-06, + "loss": 6.8325, + "step": 59 + }, + { + "epoch": 0.021337126600284494, + "grad_norm": 5.033779144287109, + "learning_rate": 1.8e-06, + "loss": 5.9094, + "step": 60 + }, + { + "epoch": 0.021692745376955903, + "grad_norm": 2.1998589038848877, + "learning_rate": 1.83e-06, + "loss": 6.4489, + "step": 61 + }, + { + "epoch": 0.022048364153627313, + "grad_norm": 3.9690961837768555, + "learning_rate": 1.86e-06, + "loss": 6.9195, + "step": 62 + }, + { + "epoch": 0.02240398293029872, + "grad_norm": 5.059171676635742, + "learning_rate": 1.8900000000000001e-06, + "loss": 7.8081, + "step": 63 + }, + { + "epoch": 0.02275960170697013, + "grad_norm": 14.581876754760742, + "learning_rate": 1.9200000000000003e-06, + "loss": 7.2562, + "step": 64 + }, + { + "epoch": 0.023115220483641535, + "grad_norm": 4.498461723327637, + "learning_rate": 1.95e-06, + "loss": 6.4174, + "step": 65 + }, + { + "epoch": 0.023470839260312945, + "grad_norm": 5.469581604003906, + "learning_rate": 1.98e-06, + "loss": 6.0312, + "step": 66 + }, + { + "epoch": 0.02382645803698435, + "grad_norm": 8.71123218536377, + "learning_rate": 2.0100000000000002e-06, + "loss": 9.5572, + "step": 67 + }, + { + "epoch": 0.02418207681365576, + "grad_norm": 4.512469291687012, + "learning_rate": 2.0400000000000004e-06, + "loss": 6.7774, + "step": 68 + }, + { + "epoch": 0.02453769559032717, + "grad_norm": 2.387735605239868, + "learning_rate": 2.07e-06, + "loss": 5.8388, + "step": 69 + }, + { + "epoch": 0.024893314366998577, + "grad_norm": 6.300774097442627, + "learning_rate": 2.1e-06, + "loss": 7.1, + "step": 70 + }, + { + "epoch": 0.025248933143669987, + "grad_norm": 3.5996577739715576, + "learning_rate": 2.13e-06, + "loss": 5.3378, + "step": 71 + }, + { + "epoch": 0.025604551920341393, + "grad_norm": 10.486239433288574, + "learning_rate": 2.16e-06, + "loss": 10.9857, + "step": 72 + }, + { + "epoch": 0.025960170697012803, + "grad_norm": 4.102842330932617, + "learning_rate": 2.19e-06, + "loss": 6.0661, + "step": 73 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 6.174686431884766, + "learning_rate": 2.22e-06, + "loss": 5.9175, + "step": 74 + }, + { + "epoch": 0.02667140825035562, + "grad_norm": 3.774085283279419, + "learning_rate": 2.25e-06, + "loss": 5.9658, + "step": 75 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 3.2386677265167236, + "learning_rate": 2.28e-06, + "loss": 6.6799, + "step": 76 + }, + { + "epoch": 0.027382645803698435, + "grad_norm": 4.4015679359436035, + "learning_rate": 2.31e-06, + "loss": 6.1616, + "step": 77 + }, + { + "epoch": 0.027738264580369845, + "grad_norm": 3.153981924057007, + "learning_rate": 2.34e-06, + "loss": 6.4919, + "step": 78 + }, + { + "epoch": 0.02809388335704125, + "grad_norm": 3.6496145725250244, + "learning_rate": 2.37e-06, + "loss": 6.8055, + "step": 79 + }, + { + "epoch": 0.02844950213371266, + "grad_norm": 4.773905277252197, + "learning_rate": 2.4000000000000003e-06, + "loss": 6.4825, + "step": 80 + }, + { + "epoch": 0.028805120910384067, + "grad_norm": 3.7311317920684814, + "learning_rate": 2.43e-06, + "loss": 6.6372, + "step": 81 + }, + { + "epoch": 0.029160739687055477, + "grad_norm": 3.2733025550842285, + "learning_rate": 2.4599999999999997e-06, + "loss": 6.7441, + "step": 82 + }, + { + "epoch": 0.029516358463726886, + "grad_norm": 4.0636210441589355, + "learning_rate": 2.49e-06, + "loss": 7.6863, + "step": 83 + }, + { + "epoch": 0.029871977240398292, + "grad_norm": 3.718838691711426, + "learning_rate": 2.52e-06, + "loss": 5.906, + "step": 84 + }, + { + "epoch": 0.030227596017069702, + "grad_norm": 5.566934108734131, + "learning_rate": 2.55e-06, + "loss": 7.2042, + "step": 85 + }, + { + "epoch": 0.03058321479374111, + "grad_norm": 7.467172622680664, + "learning_rate": 2.58e-06, + "loss": 6.4337, + "step": 86 + }, + { + "epoch": 0.030938833570412518, + "grad_norm": 3.3917996883392334, + "learning_rate": 2.61e-06, + "loss": 5.7079, + "step": 87 + }, + { + "epoch": 0.031294452347083924, + "grad_norm": 9.125791549682617, + "learning_rate": 2.64e-06, + "loss": 6.3614, + "step": 88 + }, + { + "epoch": 0.031650071123755334, + "grad_norm": 5.505105972290039, + "learning_rate": 2.6700000000000003e-06, + "loss": 6.6182, + "step": 89 + }, + { + "epoch": 0.032005689900426744, + "grad_norm": 4.746493339538574, + "learning_rate": 2.7e-06, + "loss": 8.5985, + "step": 90 + }, + { + "epoch": 0.032361308677098154, + "grad_norm": 7.806989669799805, + "learning_rate": 2.73e-06, + "loss": 5.7867, + "step": 91 + }, + { + "epoch": 0.032716927453769556, + "grad_norm": 4.613385200500488, + "learning_rate": 2.7600000000000003e-06, + "loss": 6.5734, + "step": 92 + }, + { + "epoch": 0.033072546230440966, + "grad_norm": 4.240105152130127, + "learning_rate": 2.7900000000000004e-06, + "loss": 6.0371, + "step": 93 + }, + { + "epoch": 0.033428165007112376, + "grad_norm": 6.871604919433594, + "learning_rate": 2.82e-06, + "loss": 5.7564, + "step": 94 + }, + { + "epoch": 0.033783783783783786, + "grad_norm": 14.289658546447754, + "learning_rate": 2.85e-06, + "loss": 13.1999, + "step": 95 + }, + { + "epoch": 0.034139402560455195, + "grad_norm": 4.22871732711792, + "learning_rate": 2.88e-06, + "loss": 6.7137, + "step": 96 + }, + { + "epoch": 0.0344950213371266, + "grad_norm": 16.260059356689453, + "learning_rate": 2.91e-06, + "loss": 13.8001, + "step": 97 + }, + { + "epoch": 0.03485064011379801, + "grad_norm": 6.04908561706543, + "learning_rate": 2.9400000000000002e-06, + "loss": 6.3432, + "step": 98 + }, + { + "epoch": 0.03520625889046942, + "grad_norm": 12.723590850830078, + "learning_rate": 2.97e-06, + "loss": 6.7425, + "step": 99 + }, + { + "epoch": 0.03556187766714083, + "grad_norm": 6.646047115325928, + "learning_rate": 3e-06, + "loss": 6.4217, + "step": 100 + }, + { + "epoch": 0.03591749644381223, + "grad_norm": 10.887319564819336, + "learning_rate": 2.9999989935745976e-06, + "loss": 11.443, + "step": 101 + }, + { + "epoch": 0.03627311522048364, + "grad_norm": 3.166604995727539, + "learning_rate": 2.9999959742997417e-06, + "loss": 6.8872, + "step": 102 + }, + { + "epoch": 0.03662873399715505, + "grad_norm": 3.8414604663848877, + "learning_rate": 2.9999909421794838e-06, + "loss": 6.3886, + "step": 103 + }, + { + "epoch": 0.03698435277382646, + "grad_norm": 2.5283727645874023, + "learning_rate": 2.9999838972205763e-06, + "loss": 6.2808, + "step": 104 + }, + { + "epoch": 0.03733997155049787, + "grad_norm": 4.218923568725586, + "learning_rate": 2.999974839432472e-06, + "loss": 6.0351, + "step": 105 + }, + { + "epoch": 0.03769559032716927, + "grad_norm": 3.8209598064422607, + "learning_rate": 2.999963768827327e-06, + "loss": 7.4774, + "step": 106 + }, + { + "epoch": 0.03805120910384068, + "grad_norm": 2.251556873321533, + "learning_rate": 2.999950685419996e-06, + "loss": 5.543, + "step": 107 + }, + { + "epoch": 0.03840682788051209, + "grad_norm": 2.416313409805298, + "learning_rate": 2.999935589228036e-06, + "loss": 7.1778, + "step": 108 + }, + { + "epoch": 0.0387624466571835, + "grad_norm": 3.284146785736084, + "learning_rate": 2.999918480271705e-06, + "loss": 5.3883, + "step": 109 + }, + { + "epoch": 0.03911806543385491, + "grad_norm": 4.181211948394775, + "learning_rate": 2.9998993585739604e-06, + "loss": 5.0727, + "step": 110 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 2.2105231285095215, + "learning_rate": 2.9998782241604624e-06, + "loss": 5.8657, + "step": 111 + }, + { + "epoch": 0.03982930298719772, + "grad_norm": 6.132669448852539, + "learning_rate": 2.9998550770595717e-06, + "loss": 8.2886, + "step": 112 + }, + { + "epoch": 0.04018492176386913, + "grad_norm": 4.764754772186279, + "learning_rate": 2.999829917302348e-06, + "loss": 6.4023, + "step": 113 + }, + { + "epoch": 0.04054054054054054, + "grad_norm": 3.3721530437469482, + "learning_rate": 2.9998027449225547e-06, + "loss": 5.063, + "step": 114 + }, + { + "epoch": 0.040896159317211946, + "grad_norm": 2.039125919342041, + "learning_rate": 2.999773559956654e-06, + "loss": 3.4608, + "step": 115 + }, + { + "epoch": 0.041251778093883355, + "grad_norm": 7.183447360992432, + "learning_rate": 2.9997423624438084e-06, + "loss": 7.1676, + "step": 116 + }, + { + "epoch": 0.041607396870554765, + "grad_norm": 6.0739006996154785, + "learning_rate": 2.9997091524258827e-06, + "loss": 6.3158, + "step": 117 + }, + { + "epoch": 0.041963015647226175, + "grad_norm": 4.024500846862793, + "learning_rate": 2.9996739299474407e-06, + "loss": 4.7093, + "step": 118 + }, + { + "epoch": 0.042318634423897585, + "grad_norm": 4.118293285369873, + "learning_rate": 2.9996366950557486e-06, + "loss": 7.1239, + "step": 119 + }, + { + "epoch": 0.04267425320056899, + "grad_norm": 3.2010750770568848, + "learning_rate": 2.9995974478007708e-06, + "loss": 5.6147, + "step": 120 + }, + { + "epoch": 0.0430298719772404, + "grad_norm": 4.027970790863037, + "learning_rate": 2.999556188235174e-06, + "loss": 6.7292, + "step": 121 + }, + { + "epoch": 0.04338549075391181, + "grad_norm": 7.28054666519165, + "learning_rate": 2.999512916414324e-06, + "loss": 5.9732, + "step": 122 + }, + { + "epoch": 0.043741109530583216, + "grad_norm": 7.718742370605469, + "learning_rate": 2.9994676323962875e-06, + "loss": 6.3455, + "step": 123 + }, + { + "epoch": 0.044096728307254626, + "grad_norm": 2.9844284057617188, + "learning_rate": 2.9994203362418314e-06, + "loss": 6.5699, + "step": 124 + }, + { + "epoch": 0.04445234708392603, + "grad_norm": 2.741267204284668, + "learning_rate": 2.9993710280144216e-06, + "loss": 4.828, + "step": 125 + }, + { + "epoch": 0.04480796586059744, + "grad_norm": 8.467273712158203, + "learning_rate": 2.999319707780225e-06, + "loss": 8.1521, + "step": 126 + }, + { + "epoch": 0.04516358463726885, + "grad_norm": 5.6009135246276855, + "learning_rate": 2.9992663756081094e-06, + "loss": 5.2539, + "step": 127 + }, + { + "epoch": 0.04551920341394026, + "grad_norm": 7.367085933685303, + "learning_rate": 2.99921103156964e-06, + "loss": 8.1136, + "step": 128 + }, + { + "epoch": 0.04587482219061166, + "grad_norm": 10.453232765197754, + "learning_rate": 2.9991536757390835e-06, + "loss": 10.991, + "step": 129 + }, + { + "epoch": 0.04623044096728307, + "grad_norm": 3.1980788707733154, + "learning_rate": 2.9990943081934055e-06, + "loss": 6.0795, + "step": 130 + }, + { + "epoch": 0.04658605974395448, + "grad_norm": 6.553122520446777, + "learning_rate": 2.9990329290122717e-06, + "loss": 5.7746, + "step": 131 + }, + { + "epoch": 0.04694167852062589, + "grad_norm": 4.530668258666992, + "learning_rate": 2.998969538278047e-06, + "loss": 7.5428, + "step": 132 + }, + { + "epoch": 0.0472972972972973, + "grad_norm": 5.560062408447266, + "learning_rate": 2.998904136075794e-06, + "loss": 6.4568, + "step": 133 + }, + { + "epoch": 0.0476529160739687, + "grad_norm": 1.133360743522644, + "learning_rate": 2.9988367224932777e-06, + "loss": 4.0467, + "step": 134 + }, + { + "epoch": 0.04800853485064011, + "grad_norm": 2.4051198959350586, + "learning_rate": 2.99876729762096e-06, + "loss": 5.6657, + "step": 135 + }, + { + "epoch": 0.04836415362731152, + "grad_norm": 6.46595573425293, + "learning_rate": 2.998695861552002e-06, + "loss": 5.5864, + "step": 136 + }, + { + "epoch": 0.04871977240398293, + "grad_norm": 2.5633316040039062, + "learning_rate": 2.9986224143822636e-06, + "loss": 5.5263, + "step": 137 + }, + { + "epoch": 0.04907539118065434, + "grad_norm": 7.001709938049316, + "learning_rate": 2.9985469562103037e-06, + "loss": 8.284, + "step": 138 + }, + { + "epoch": 0.049431009957325744, + "grad_norm": 3.4980552196502686, + "learning_rate": 2.9984694871373796e-06, + "loss": 7.3199, + "step": 139 + }, + { + "epoch": 0.049786628733997154, + "grad_norm": 6.3614678382873535, + "learning_rate": 2.9983900072674475e-06, + "loss": 7.5146, + "step": 140 + }, + { + "epoch": 0.050142247510668564, + "grad_norm": 3.4414496421813965, + "learning_rate": 2.9983085167071613e-06, + "loss": 6.96, + "step": 141 + }, + { + "epoch": 0.050497866287339974, + "grad_norm": 2.911268949508667, + "learning_rate": 2.9982250155658732e-06, + "loss": 5.9427, + "step": 142 + }, + { + "epoch": 0.050853485064011376, + "grad_norm": 6.797943592071533, + "learning_rate": 2.9981395039556327e-06, + "loss": 8.2226, + "step": 143 + }, + { + "epoch": 0.051209103840682786, + "grad_norm": 5.3644795417785645, + "learning_rate": 2.998051981991189e-06, + "loss": 6.7129, + "step": 144 + }, + { + "epoch": 0.051564722617354196, + "grad_norm": 4.060850620269775, + "learning_rate": 2.997962449789987e-06, + "loss": 6.0539, + "step": 145 + }, + { + "epoch": 0.051920341394025606, + "grad_norm": 6.512187957763672, + "learning_rate": 2.997870907472171e-06, + "loss": 6.9594, + "step": 146 + }, + { + "epoch": 0.052275960170697015, + "grad_norm": 10.240334510803223, + "learning_rate": 2.9977773551605805e-06, + "loss": 11.9491, + "step": 147 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 2.7813124656677246, + "learning_rate": 2.997681792980754e-06, + "loss": 5.6671, + "step": 148 + }, + { + "epoch": 0.05298719772403983, + "grad_norm": 3.4392282962799072, + "learning_rate": 2.997584221060927e-06, + "loss": 5.5545, + "step": 149 + }, + { + "epoch": 0.05334281650071124, + "grad_norm": 6.306955337524414, + "learning_rate": 2.9974846395320303e-06, + "loss": 6.0252, + "step": 150 + }, + { + "epoch": 0.05369843527738265, + "grad_norm": 5.097898483276367, + "learning_rate": 2.9973830485276924e-06, + "loss": 5.3958, + "step": 151 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 12.706900596618652, + "learning_rate": 2.99727944818424e-06, + "loss": 8.8456, + "step": 152 + }, + { + "epoch": 0.05440967283072546, + "grad_norm": 3.6673364639282227, + "learning_rate": 2.9971738386406924e-06, + "loss": 5.9606, + "step": 153 + }, + { + "epoch": 0.05476529160739687, + "grad_norm": 2.0597755908966064, + "learning_rate": 2.9970662200387674e-06, + "loss": 4.7988, + "step": 154 + }, + { + "epoch": 0.05512091038406828, + "grad_norm": 2.6201746463775635, + "learning_rate": 2.99695659252288e-06, + "loss": 5.3841, + "step": 155 + }, + { + "epoch": 0.05547652916073969, + "grad_norm": 2.7254724502563477, + "learning_rate": 2.996844956240138e-06, + "loss": 5.6765, + "step": 156 + }, + { + "epoch": 0.0558321479374111, + "grad_norm": 2.194261074066162, + "learning_rate": 2.9967313113403465e-06, + "loss": 5.0128, + "step": 157 + }, + { + "epoch": 0.0561877667140825, + "grad_norm": 1.9812778234481812, + "learning_rate": 2.9966156579760058e-06, + "loss": 4.5003, + "step": 158 + }, + { + "epoch": 0.05654338549075391, + "grad_norm": 3.2229928970336914, + "learning_rate": 2.9964979963023115e-06, + "loss": 5.6106, + "step": 159 + }, + { + "epoch": 0.05689900426742532, + "grad_norm": 3.062922954559326, + "learning_rate": 2.996378326477153e-06, + "loss": 5.4193, + "step": 160 + }, + { + "epoch": 0.05725462304409673, + "grad_norm": 9.3699312210083, + "learning_rate": 2.996256648661116e-06, + "loss": 6.0364, + "step": 161 + }, + { + "epoch": 0.057610241820768134, + "grad_norm": 15.000651359558105, + "learning_rate": 2.99613296301748e-06, + "loss": 6.4831, + "step": 162 + }, + { + "epoch": 0.05796586059743954, + "grad_norm": 4.741034507751465, + "learning_rate": 2.9960072697122185e-06, + "loss": 7.7408, + "step": 163 + }, + { + "epoch": 0.05832147937411095, + "grad_norm": 2.6567530632019043, + "learning_rate": 2.9958795689139994e-06, + "loss": 5.0865, + "step": 164 + }, + { + "epoch": 0.05867709815078236, + "grad_norm": 2.6318979263305664, + "learning_rate": 2.9957498607941853e-06, + "loss": 4.8473, + "step": 165 + }, + { + "epoch": 0.05903271692745377, + "grad_norm": 5.565056324005127, + "learning_rate": 2.99561814552683e-06, + "loss": 5.9555, + "step": 166 + }, + { + "epoch": 0.059388335704125175, + "grad_norm": 2.2367746829986572, + "learning_rate": 2.9954844232886844e-06, + "loss": 5.1553, + "step": 167 + }, + { + "epoch": 0.059743954480796585, + "grad_norm": 2.4266884326934814, + "learning_rate": 2.995348694259189e-06, + "loss": 5.2593, + "step": 168 + }, + { + "epoch": 0.060099573257467995, + "grad_norm": 8.791595458984375, + "learning_rate": 2.995210958620478e-06, + "loss": 7.8254, + "step": 169 + }, + { + "epoch": 0.060455192034139404, + "grad_norm": 3.410501003265381, + "learning_rate": 2.995071216557381e-06, + "loss": 5.7089, + "step": 170 + }, + { + "epoch": 0.060810810810810814, + "grad_norm": 2.9625403881073, + "learning_rate": 2.9949294682574164e-06, + "loss": 5.2211, + "step": 171 + }, + { + "epoch": 0.06116642958748222, + "grad_norm": 1.4427578449249268, + "learning_rate": 2.994785713910796e-06, + "loss": 5.4011, + "step": 172 + }, + { + "epoch": 0.06152204836415363, + "grad_norm": 4.734636306762695, + "learning_rate": 2.9946399537104257e-06, + "loss": 4.9466, + "step": 173 + }, + { + "epoch": 0.061877667140825036, + "grad_norm": 1.9430538415908813, + "learning_rate": 2.9944921878518996e-06, + "loss": 4.9066, + "step": 174 + }, + { + "epoch": 0.062233285917496446, + "grad_norm": 4.134811878204346, + "learning_rate": 2.994342416533506e-06, + "loss": 7.1995, + "step": 175 + }, + { + "epoch": 0.06258890469416785, + "grad_norm": 2.5047380924224854, + "learning_rate": 2.9941906399562215e-06, + "loss": 6.6017, + "step": 176 + }, + { + "epoch": 0.06294452347083926, + "grad_norm": 3.1444289684295654, + "learning_rate": 2.994036858323716e-06, + "loss": 5.5821, + "step": 177 + }, + { + "epoch": 0.06330014224751067, + "grad_norm": 2.5180697441101074, + "learning_rate": 2.9938810718423496e-06, + "loss": 5.4745, + "step": 178 + }, + { + "epoch": 0.06365576102418208, + "grad_norm": 8.794586181640625, + "learning_rate": 2.9937232807211715e-06, + "loss": 8.743, + "step": 179 + }, + { + "epoch": 0.06401137980085349, + "grad_norm": 3.083610773086548, + "learning_rate": 2.9935634851719223e-06, + "loss": 6.5057, + "step": 180 + }, + { + "epoch": 0.0643669985775249, + "grad_norm": 3.776583433151245, + "learning_rate": 2.993401685409031e-06, + "loss": 5.7991, + "step": 181 + }, + { + "epoch": 0.06472261735419631, + "grad_norm": 2.824300527572632, + "learning_rate": 2.993237881649618e-06, + "loss": 6.3957, + "step": 182 + }, + { + "epoch": 0.0650782361308677, + "grad_norm": 6.611816883087158, + "learning_rate": 2.9930720741134905e-06, + "loss": 9.5339, + "step": 183 + }, + { + "epoch": 0.06543385490753911, + "grad_norm": 2.592463731765747, + "learning_rate": 2.992904263023146e-06, + "loss": 5.22, + "step": 184 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 2.091866970062256, + "learning_rate": 2.9927344486037708e-06, + "loss": 4.1429, + "step": 185 + }, + { + "epoch": 0.06614509246088193, + "grad_norm": 4.215582370758057, + "learning_rate": 2.9925626310832384e-06, + "loss": 5.6512, + "step": 186 + }, + { + "epoch": 0.06650071123755334, + "grad_norm": 5.5438361167907715, + "learning_rate": 2.9923888106921113e-06, + "loss": 6.3809, + "step": 187 + }, + { + "epoch": 0.06685633001422475, + "grad_norm": 3.3643035888671875, + "learning_rate": 2.9922129876636386e-06, + "loss": 5.4967, + "step": 188 + }, + { + "epoch": 0.06721194879089616, + "grad_norm": 5.390458106994629, + "learning_rate": 2.9920351622337576e-06, + "loss": 6.6857, + "step": 189 + }, + { + "epoch": 0.06756756756756757, + "grad_norm": 2.6646652221679688, + "learning_rate": 2.991855334641092e-06, + "loss": 4.7276, + "step": 190 + }, + { + "epoch": 0.06792318634423898, + "grad_norm": 2.704223155975342, + "learning_rate": 2.9916735051269533e-06, + "loss": 6.5566, + "step": 191 + }, + { + "epoch": 0.06827880512091039, + "grad_norm": 2.414024591445923, + "learning_rate": 2.991489673935339e-06, + "loss": 5.4565, + "step": 192 + }, + { + "epoch": 0.06863442389758179, + "grad_norm": 4.7646164894104, + "learning_rate": 2.9913038413129303e-06, + "loss": 5.006, + "step": 193 + }, + { + "epoch": 0.0689900426742532, + "grad_norm": 3.833077907562256, + "learning_rate": 2.991116007509098e-06, + "loss": 5.4, + "step": 194 + }, + { + "epoch": 0.0693456614509246, + "grad_norm": 3.204576253890991, + "learning_rate": 2.990926172775895e-06, + "loss": 6.9483, + "step": 195 + }, + { + "epoch": 0.06970128022759602, + "grad_norm": 3.457059144973755, + "learning_rate": 2.990734337368062e-06, + "loss": 5.1697, + "step": 196 + }, + { + "epoch": 0.07005689900426743, + "grad_norm": 5.136595249176025, + "learning_rate": 2.9905405015430217e-06, + "loss": 5.8255, + "step": 197 + }, + { + "epoch": 0.07041251778093884, + "grad_norm": 2.2137231826782227, + "learning_rate": 2.9903446655608837e-06, + "loss": 4.891, + "step": 198 + }, + { + "epoch": 0.07076813655761025, + "grad_norm": 3.2659993171691895, + "learning_rate": 2.9901468296844394e-06, + "loss": 6.4004, + "step": 199 + }, + { + "epoch": 0.07112375533428165, + "grad_norm": 1.9002463817596436, + "learning_rate": 2.9899469941791652e-06, + "loss": 5.1626, + "step": 200 + }, + { + "epoch": 0.07147937411095306, + "grad_norm": 4.523158073425293, + "learning_rate": 2.9897451593132213e-06, + "loss": 7.3497, + "step": 201 + }, + { + "epoch": 0.07183499288762446, + "grad_norm": 3.5060434341430664, + "learning_rate": 2.9895413253574485e-06, + "loss": 5.2532, + "step": 202 + }, + { + "epoch": 0.07219061166429587, + "grad_norm": 1.6068642139434814, + "learning_rate": 2.989335492585373e-06, + "loss": 5.5655, + "step": 203 + }, + { + "epoch": 0.07254623044096728, + "grad_norm": 2.014885663986206, + "learning_rate": 2.9891276612732013e-06, + "loss": 4.9464, + "step": 204 + }, + { + "epoch": 0.07290184921763869, + "grad_norm": 6.208018779754639, + "learning_rate": 2.9889178316998223e-06, + "loss": 8.3161, + "step": 205 + }, + { + "epoch": 0.0732574679943101, + "grad_norm": 3.4419260025024414, + "learning_rate": 2.9887060041468065e-06, + "loss": 4.518, + "step": 206 + }, + { + "epoch": 0.07361308677098151, + "grad_norm": 2.168409585952759, + "learning_rate": 2.9884921788984056e-06, + "loss": 4.7217, + "step": 207 + }, + { + "epoch": 0.07396870554765292, + "grad_norm": 1.9316198825836182, + "learning_rate": 2.988276356241552e-06, + "loss": 5.2847, + "step": 208 + }, + { + "epoch": 0.07432432432432433, + "grad_norm": 2.8415982723236084, + "learning_rate": 2.9880585364658577e-06, + "loss": 4.6753, + "step": 209 + }, + { + "epoch": 0.07467994310099574, + "grad_norm": 3.1524605751037598, + "learning_rate": 2.9878387198636153e-06, + "loss": 6.2646, + "step": 210 + }, + { + "epoch": 0.07503556187766713, + "grad_norm": 1.126608967781067, + "learning_rate": 2.987616906729797e-06, + "loss": 4.634, + "step": 211 + }, + { + "epoch": 0.07539118065433854, + "grad_norm": 4.84623908996582, + "learning_rate": 2.9873930973620535e-06, + "loss": 4.7413, + "step": 212 + }, + { + "epoch": 0.07574679943100995, + "grad_norm": 3.0465972423553467, + "learning_rate": 2.9871672920607156e-06, + "loss": 6.2626, + "step": 213 + }, + { + "epoch": 0.07610241820768136, + "grad_norm": 2.2682013511657715, + "learning_rate": 2.986939491128791e-06, + "loss": 6.2415, + "step": 214 + }, + { + "epoch": 0.07645803698435277, + "grad_norm": 1.712035894393921, + "learning_rate": 2.9867096948719657e-06, + "loss": 5.13, + "step": 215 + }, + { + "epoch": 0.07681365576102418, + "grad_norm": 1.3823540210723877, + "learning_rate": 2.986477903598604e-06, + "loss": 4.3312, + "step": 216 + }, + { + "epoch": 0.07716927453769559, + "grad_norm": 3.0332672595977783, + "learning_rate": 2.986244117619746e-06, + "loss": 6.8772, + "step": 217 + }, + { + "epoch": 0.077524893314367, + "grad_norm": 9.452963829040527, + "learning_rate": 2.9860083372491098e-06, + "loss": 8.6648, + "step": 218 + }, + { + "epoch": 0.07788051209103841, + "grad_norm": 2.12556529045105, + "learning_rate": 2.985770562803089e-06, + "loss": 6.6623, + "step": 219 + }, + { + "epoch": 0.07823613086770982, + "grad_norm": 3.7493932247161865, + "learning_rate": 2.985530794600753e-06, + "loss": 7.562, + "step": 220 + }, + { + "epoch": 0.07859174964438122, + "grad_norm": 4.024392127990723, + "learning_rate": 2.9852890329638477e-06, + "loss": 6.7411, + "step": 221 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 2.5206401348114014, + "learning_rate": 2.9850452782167925e-06, + "loss": 6.8618, + "step": 222 + }, + { + "epoch": 0.07930298719772404, + "grad_norm": 2.1051340103149414, + "learning_rate": 2.984799530686682e-06, + "loss": 5.5005, + "step": 223 + }, + { + "epoch": 0.07965860597439545, + "grad_norm": 1.9196279048919678, + "learning_rate": 2.984551790703285e-06, + "loss": 6.0345, + "step": 224 + }, + { + "epoch": 0.08001422475106686, + "grad_norm": 1.928398847579956, + "learning_rate": 2.9843020585990446e-06, + "loss": 5.014, + "step": 225 + }, + { + "epoch": 0.08036984352773827, + "grad_norm": 1.9115183353424072, + "learning_rate": 2.9840503347090754e-06, + "loss": 5.2889, + "step": 226 + }, + { + "epoch": 0.08072546230440968, + "grad_norm": 1.7273354530334473, + "learning_rate": 2.983796619371166e-06, + "loss": 5.5686, + "step": 227 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 2.3344831466674805, + "learning_rate": 2.983540912925778e-06, + "loss": 5.2228, + "step": 228 + }, + { + "epoch": 0.0814366998577525, + "grad_norm": 2.12615966796875, + "learning_rate": 2.9832832157160428e-06, + "loss": 5.7262, + "step": 229 + }, + { + "epoch": 0.08179231863442389, + "grad_norm": 2.4110610485076904, + "learning_rate": 2.9830235280877656e-06, + "loss": 5.6971, + "step": 230 + }, + { + "epoch": 0.0821479374110953, + "grad_norm": 2.5197033882141113, + "learning_rate": 2.982761850389421e-06, + "loss": 5.6357, + "step": 231 + }, + { + "epoch": 0.08250355618776671, + "grad_norm": 1.745025634765625, + "learning_rate": 2.982498182972154e-06, + "loss": 5.0989, + "step": 232 + }, + { + "epoch": 0.08285917496443812, + "grad_norm": 2.024353265762329, + "learning_rate": 2.9822325261897803e-06, + "loss": 5.8642, + "step": 233 + }, + { + "epoch": 0.08321479374110953, + "grad_norm": 2.333660840988159, + "learning_rate": 2.981964880398785e-06, + "loss": 5.6501, + "step": 234 + }, + { + "epoch": 0.08357041251778094, + "grad_norm": 3.800001621246338, + "learning_rate": 2.981695245958322e-06, + "loss": 6.7876, + "step": 235 + }, + { + "epoch": 0.08392603129445235, + "grad_norm": 1.5085493326187134, + "learning_rate": 2.9814236232302136e-06, + "loss": 4.7036, + "step": 236 + }, + { + "epoch": 0.08428165007112376, + "grad_norm": 5.537125110626221, + "learning_rate": 2.981150012578951e-06, + "loss": 8.0673, + "step": 237 + }, + { + "epoch": 0.08463726884779517, + "grad_norm": 1.9578464031219482, + "learning_rate": 2.9808744143716927e-06, + "loss": 4.8236, + "step": 238 + }, + { + "epoch": 0.08499288762446658, + "grad_norm": 5.71116304397583, + "learning_rate": 2.9805968289782636e-06, + "loss": 6.9271, + "step": 239 + }, + { + "epoch": 0.08534850640113797, + "grad_norm": 3.904921293258667, + "learning_rate": 2.9803172567711557e-06, + "loss": 4.555, + "step": 240 + }, + { + "epoch": 0.08570412517780938, + "grad_norm": 1.7809964418411255, + "learning_rate": 2.980035698125527e-06, + "loss": 5.2432, + "step": 241 + }, + { + "epoch": 0.0860597439544808, + "grad_norm": 2.4856197834014893, + "learning_rate": 2.9797521534192015e-06, + "loss": 5.2169, + "step": 242 + }, + { + "epoch": 0.0864153627311522, + "grad_norm": 2.8742263317108154, + "learning_rate": 2.9794666230326677e-06, + "loss": 5.9025, + "step": 243 + }, + { + "epoch": 0.08677098150782361, + "grad_norm": 1.1844121217727661, + "learning_rate": 2.9791791073490796e-06, + "loss": 4.7077, + "step": 244 + }, + { + "epoch": 0.08712660028449502, + "grad_norm": 1.4924622774124146, + "learning_rate": 2.978889606754254e-06, + "loss": 4.4481, + "step": 245 + }, + { + "epoch": 0.08748221906116643, + "grad_norm": 1.4234836101531982, + "learning_rate": 2.9785981216366715e-06, + "loss": 4.657, + "step": 246 + }, + { + "epoch": 0.08783783783783784, + "grad_norm": 2.859877824783325, + "learning_rate": 2.978304652387477e-06, + "loss": 4.6056, + "step": 247 + }, + { + "epoch": 0.08819345661450925, + "grad_norm": 1.855923056602478, + "learning_rate": 2.9780091994004773e-06, + "loss": 4.847, + "step": 248 + }, + { + "epoch": 0.08854907539118065, + "grad_norm": 1.6514908075332642, + "learning_rate": 2.9777117630721404e-06, + "loss": 5.0991, + "step": 249 + }, + { + "epoch": 0.08890469416785206, + "grad_norm": 1.6970970630645752, + "learning_rate": 2.9774123438015956e-06, + "loss": 5.7027, + "step": 250 + }, + { + "epoch": 0.08926031294452347, + "grad_norm": 2.703773021697998, + "learning_rate": 2.9771109419906347e-06, + "loss": 5.6141, + "step": 251 + }, + { + "epoch": 0.08961593172119488, + "grad_norm": 7.41414737701416, + "learning_rate": 2.9768075580437087e-06, + "loss": 8.2536, + "step": 252 + }, + { + "epoch": 0.08997155049786629, + "grad_norm": 3.178450584411621, + "learning_rate": 2.9765021923679288e-06, + "loss": 5.5571, + "step": 253 + }, + { + "epoch": 0.0903271692745377, + "grad_norm": 2.870748996734619, + "learning_rate": 2.9761948453730653e-06, + "loss": 5.8588, + "step": 254 + }, + { + "epoch": 0.0906827880512091, + "grad_norm": 1.2676215171813965, + "learning_rate": 2.975885517471547e-06, + "loss": 4.4839, + "step": 255 + }, + { + "epoch": 0.09103840682788052, + "grad_norm": 1.412637710571289, + "learning_rate": 2.975574209078462e-06, + "loss": 4.5932, + "step": 256 + }, + { + "epoch": 0.09139402560455193, + "grad_norm": 2.3941407203674316, + "learning_rate": 2.975260920611554e-06, + "loss": 6.441, + "step": 257 + }, + { + "epoch": 0.09174964438122332, + "grad_norm": 1.3517104387283325, + "learning_rate": 2.9749456524912254e-06, + "loss": 5.2988, + "step": 258 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 5.502532958984375, + "learning_rate": 2.9746284051405354e-06, + "loss": 7.5008, + "step": 259 + }, + { + "epoch": 0.09246088193456614, + "grad_norm": 1.5129213333129883, + "learning_rate": 2.9743091789851977e-06, + "loss": 4.9853, + "step": 260 + }, + { + "epoch": 0.09281650071123755, + "grad_norm": 2.2975544929504395, + "learning_rate": 2.9739879744535824e-06, + "loss": 4.392, + "step": 261 + }, + { + "epoch": 0.09317211948790896, + "grad_norm": 1.3772670030593872, + "learning_rate": 2.973664791976713e-06, + "loss": 5.2519, + "step": 262 + }, + { + "epoch": 0.09352773826458037, + "grad_norm": 1.3389573097229004, + "learning_rate": 2.9733396319882696e-06, + "loss": 4.8188, + "step": 263 + }, + { + "epoch": 0.09388335704125178, + "grad_norm": 2.167651891708374, + "learning_rate": 2.973012494924584e-06, + "loss": 4.5069, + "step": 264 + }, + { + "epoch": 0.09423897581792319, + "grad_norm": 4.063717842102051, + "learning_rate": 2.9726833812246417e-06, + "loss": 4.8118, + "step": 265 + }, + { + "epoch": 0.0945945945945946, + "grad_norm": 1.1990865468978882, + "learning_rate": 2.9723522913300802e-06, + "loss": 5.0443, + "step": 266 + }, + { + "epoch": 0.09495021337126601, + "grad_norm": 2.607632637023926, + "learning_rate": 2.9720192256851898e-06, + "loss": 6.8281, + "step": 267 + }, + { + "epoch": 0.0953058321479374, + "grad_norm": 1.527082085609436, + "learning_rate": 2.9716841847369107e-06, + "loss": 5.1085, + "step": 268 + }, + { + "epoch": 0.09566145092460882, + "grad_norm": 3.3045883178710938, + "learning_rate": 2.9713471689348354e-06, + "loss": 4.9132, + "step": 269 + }, + { + "epoch": 0.09601706970128022, + "grad_norm": 1.7471206188201904, + "learning_rate": 2.971008178731205e-06, + "loss": 4.3584, + "step": 270 + }, + { + "epoch": 0.09637268847795163, + "grad_norm": 2.3424017429351807, + "learning_rate": 2.9706672145809105e-06, + "loss": 1.1488, + "step": 271 + }, + { + "epoch": 0.09672830725462304, + "grad_norm": 1.2416472434997559, + "learning_rate": 2.9703242769414925e-06, + "loss": 4.5144, + "step": 272 + }, + { + "epoch": 0.09708392603129445, + "grad_norm": 3.3306009769439697, + "learning_rate": 2.9699793662731387e-06, + "loss": 5.142, + "step": 273 + }, + { + "epoch": 0.09743954480796586, + "grad_norm": 1.5548534393310547, + "learning_rate": 2.969632483038685e-06, + "loss": 4.3735, + "step": 274 + }, + { + "epoch": 0.09779516358463727, + "grad_norm": 2.0693247318267822, + "learning_rate": 2.9692836277036147e-06, + "loss": 3.9157, + "step": 275 + }, + { + "epoch": 0.09815078236130868, + "grad_norm": 2.947077512741089, + "learning_rate": 2.968932800736056e-06, + "loss": 5.2837, + "step": 276 + }, + { + "epoch": 0.09850640113798008, + "grad_norm": 1.5517710447311401, + "learning_rate": 2.9685800026067847e-06, + "loss": 5.3426, + "step": 277 + }, + { + "epoch": 0.09886201991465149, + "grad_norm": 1.305782437324524, + "learning_rate": 2.9682252337892206e-06, + "loss": 4.6709, + "step": 278 + }, + { + "epoch": 0.0992176386913229, + "grad_norm": 1.3639888763427734, + "learning_rate": 2.967868494759427e-06, + "loss": 4.0865, + "step": 279 + }, + { + "epoch": 0.09957325746799431, + "grad_norm": 0.9754592776298523, + "learning_rate": 2.967509785996114e-06, + "loss": 3.7777, + "step": 280 + }, + { + "epoch": 0.09992887624466572, + "grad_norm": 3.4132964611053467, + "learning_rate": 2.9671491079806324e-06, + "loss": 5.9286, + "step": 281 + }, + { + "epoch": 0.10028449502133713, + "grad_norm": 4.247563362121582, + "learning_rate": 2.966786461196976e-06, + "loss": 7.6671, + "step": 282 + }, + { + "epoch": 0.10064011379800854, + "grad_norm": 2.2718350887298584, + "learning_rate": 2.966421846131781e-06, + "loss": 4.6816, + "step": 283 + }, + { + "epoch": 0.10099573257467995, + "grad_norm": 1.9747391939163208, + "learning_rate": 2.9660552632743234e-06, + "loss": 6.2515, + "step": 284 + }, + { + "epoch": 0.10135135135135136, + "grad_norm": 1.1069923639297485, + "learning_rate": 2.9656867131165223e-06, + "loss": 3.3182, + "step": 285 + }, + { + "epoch": 0.10170697012802275, + "grad_norm": 1.4600532054901123, + "learning_rate": 2.9653161961529353e-06, + "loss": 5.8221, + "step": 286 + }, + { + "epoch": 0.10206258890469416, + "grad_norm": 1.0828770399093628, + "learning_rate": 2.964943712880759e-06, + "loss": 4.3819, + "step": 287 + }, + { + "epoch": 0.10241820768136557, + "grad_norm": 1.2702336311340332, + "learning_rate": 2.9645692637998286e-06, + "loss": 2.9866, + "step": 288 + }, + { + "epoch": 0.10277382645803698, + "grad_norm": 1.2744325399398804, + "learning_rate": 2.964192849412618e-06, + "loss": 4.6873, + "step": 289 + }, + { + "epoch": 0.10312944523470839, + "grad_norm": 4.656653881072998, + "learning_rate": 2.9638144702242377e-06, + "loss": 7.3547, + "step": 290 + }, + { + "epoch": 0.1034850640113798, + "grad_norm": 4.714226245880127, + "learning_rate": 2.9634341267424347e-06, + "loss": 6.8914, + "step": 291 + }, + { + "epoch": 0.10384068278805121, + "grad_norm": 1.5539475679397583, + "learning_rate": 2.963051819477592e-06, + "loss": 4.7132, + "step": 292 + }, + { + "epoch": 0.10419630156472262, + "grad_norm": 2.120549440383911, + "learning_rate": 2.9626675489427287e-06, + "loss": 4.9716, + "step": 293 + }, + { + "epoch": 0.10455192034139403, + "grad_norm": 3.1780097484588623, + "learning_rate": 2.962281315653497e-06, + "loss": 5.8229, + "step": 294 + }, + { + "epoch": 0.10490753911806544, + "grad_norm": 1.6481893062591553, + "learning_rate": 2.961893120128184e-06, + "loss": 5.6063, + "step": 295 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.1038414239883423, + "learning_rate": 2.9615029628877086e-06, + "loss": 4.0314, + "step": 296 + }, + { + "epoch": 0.10561877667140825, + "grad_norm": 1.4341669082641602, + "learning_rate": 2.9611108444556244e-06, + "loss": 4.6504, + "step": 297 + }, + { + "epoch": 0.10597439544807966, + "grad_norm": 0.9920176267623901, + "learning_rate": 2.9607167653581137e-06, + "loss": 4.6684, + "step": 298 + }, + { + "epoch": 0.10633001422475107, + "grad_norm": 7.0024189949035645, + "learning_rate": 2.9603207261239928e-06, + "loss": 8.0655, + "step": 299 + }, + { + "epoch": 0.10668563300142248, + "grad_norm": 2.566538095474243, + "learning_rate": 2.9599227272847066e-06, + "loss": 7.0744, + "step": 300 + }, + { + "epoch": 0.10704125177809388, + "grad_norm": 1.0727044343948364, + "learning_rate": 2.95952276937433e-06, + "loss": 3.7807, + "step": 301 + }, + { + "epoch": 0.1073968705547653, + "grad_norm": 1.8506673574447632, + "learning_rate": 2.959120852929566e-06, + "loss": 5.5393, + "step": 302 + }, + { + "epoch": 0.1077524893314367, + "grad_norm": 1.460008144378662, + "learning_rate": 2.9587169784897474e-06, + "loss": 5.2025, + "step": 303 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 0.9715114831924438, + "learning_rate": 2.958311146596833e-06, + "loss": 4.8721, + "step": 304 + }, + { + "epoch": 0.10846372688477951, + "grad_norm": 1.59697425365448, + "learning_rate": 2.957903357795409e-06, + "loss": 5.0807, + "step": 305 + }, + { + "epoch": 0.10881934566145092, + "grad_norm": 3.5417697429656982, + "learning_rate": 2.9574936126326876e-06, + "loss": 4.5647, + "step": 306 + }, + { + "epoch": 0.10917496443812233, + "grad_norm": 1.3611819744110107, + "learning_rate": 2.9570819116585056e-06, + "loss": 4.6148, + "step": 307 + }, + { + "epoch": 0.10953058321479374, + "grad_norm": 0.9342393279075623, + "learning_rate": 2.9566682554253255e-06, + "loss": 4.2446, + "step": 308 + }, + { + "epoch": 0.10988620199146515, + "grad_norm": 1.327476978302002, + "learning_rate": 2.9562526444882316e-06, + "loss": 6.5259, + "step": 309 + }, + { + "epoch": 0.11024182076813656, + "grad_norm": 1.4551005363464355, + "learning_rate": 2.955835079404934e-06, + "loss": 4.8192, + "step": 310 + }, + { + "epoch": 0.11059743954480797, + "grad_norm": 1.2439876794815063, + "learning_rate": 2.9554155607357623e-06, + "loss": 4.3822, + "step": 311 + }, + { + "epoch": 0.11095305832147938, + "grad_norm": 2.7374589443206787, + "learning_rate": 2.9549940890436693e-06, + "loss": 6.4328, + "step": 312 + }, + { + "epoch": 0.11130867709815079, + "grad_norm": 2.225618600845337, + "learning_rate": 2.954570664894228e-06, + "loss": 5.7494, + "step": 313 + }, + { + "epoch": 0.1116642958748222, + "grad_norm": 0.875352144241333, + "learning_rate": 2.9541452888556314e-06, + "loss": 4.0971, + "step": 314 + }, + { + "epoch": 0.1120199146514936, + "grad_norm": 1.8364171981811523, + "learning_rate": 2.9537179614986924e-06, + "loss": 5.7963, + "step": 315 + }, + { + "epoch": 0.112375533428165, + "grad_norm": 2.040693521499634, + "learning_rate": 2.953288683396841e-06, + "loss": 4.2519, + "step": 316 + }, + { + "epoch": 0.11273115220483641, + "grad_norm": 1.3991960287094116, + "learning_rate": 2.9528574551261262e-06, + "loss": 3.602, + "step": 317 + }, + { + "epoch": 0.11308677098150782, + "grad_norm": 0.9148945212364197, + "learning_rate": 2.9524242772652134e-06, + "loss": 4.0109, + "step": 318 + }, + { + "epoch": 0.11344238975817923, + "grad_norm": 7.301913738250732, + "learning_rate": 2.951989150395384e-06, + "loss": 8.4141, + "step": 319 + }, + { + "epoch": 0.11379800853485064, + "grad_norm": 1.252691626548767, + "learning_rate": 2.9515520751005353e-06, + "loss": 5.6531, + "step": 320 + }, + { + "epoch": 0.11415362731152205, + "grad_norm": 1.7830569744110107, + "learning_rate": 2.9511130519671782e-06, + "loss": 6.2136, + "step": 321 + }, + { + "epoch": 0.11450924608819346, + "grad_norm": 1.0401514768600464, + "learning_rate": 2.950672081584439e-06, + "loss": 3.728, + "step": 322 + }, + { + "epoch": 0.11486486486486487, + "grad_norm": 1.6078163385391235, + "learning_rate": 2.9502291645440553e-06, + "loss": 4.3085, + "step": 323 + }, + { + "epoch": 0.11522048364153627, + "grad_norm": 3.799940824508667, + "learning_rate": 2.949784301440378e-06, + "loss": 6.3361, + "step": 324 + }, + { + "epoch": 0.11557610241820768, + "grad_norm": 1.6291847229003906, + "learning_rate": 2.94933749287037e-06, + "loss": 5.091, + "step": 325 + }, + { + "epoch": 0.11593172119487909, + "grad_norm": 2.27602481842041, + "learning_rate": 2.9488887394336023e-06, + "loss": 5.9824, + "step": 326 + }, + { + "epoch": 0.1162873399715505, + "grad_norm": 1.8447299003601074, + "learning_rate": 2.9484380417322585e-06, + "loss": 5.4747, + "step": 327 + }, + { + "epoch": 0.1166429587482219, + "grad_norm": 2.5115737915039062, + "learning_rate": 2.94798540037113e-06, + "loss": 6.4419, + "step": 328 + }, + { + "epoch": 0.11699857752489332, + "grad_norm": 1.7885041236877441, + "learning_rate": 2.9475308159576163e-06, + "loss": 4.9951, + "step": 329 + }, + { + "epoch": 0.11735419630156473, + "grad_norm": 1.0126817226409912, + "learning_rate": 2.9470742891017243e-06, + "loss": 4.4026, + "step": 330 + }, + { + "epoch": 0.11770981507823614, + "grad_norm": 1.752461314201355, + "learning_rate": 2.946615820416068e-06, + "loss": 6.7716, + "step": 331 + }, + { + "epoch": 0.11806543385490754, + "grad_norm": 5.161525249481201, + "learning_rate": 2.946155410515867e-06, + "loss": 8.3386, + "step": 332 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 1.4897671937942505, + "learning_rate": 2.945693060018944e-06, + "loss": 4.3863, + "step": 333 + }, + { + "epoch": 0.11877667140825035, + "grad_norm": 1.267040491104126, + "learning_rate": 2.94522876954573e-06, + "loss": 4.3718, + "step": 334 + }, + { + "epoch": 0.11913229018492176, + "grad_norm": 1.3649559020996094, + "learning_rate": 2.944762539719254e-06, + "loss": 5.2657, + "step": 335 + }, + { + "epoch": 0.11948790896159317, + "grad_norm": 1.1782056093215942, + "learning_rate": 2.9442943711651514e-06, + "loss": 4.796, + "step": 336 + }, + { + "epoch": 0.11984352773826458, + "grad_norm": 1.3422836065292358, + "learning_rate": 2.9438242645116583e-06, + "loss": 4.5068, + "step": 337 + }, + { + "epoch": 0.12019914651493599, + "grad_norm": 1.0977728366851807, + "learning_rate": 2.94335222038961e-06, + "loss": 4.5849, + "step": 338 + }, + { + "epoch": 0.1205547652916074, + "grad_norm": 1.2248634099960327, + "learning_rate": 2.9428782394324435e-06, + "loss": 5.4004, + "step": 339 + }, + { + "epoch": 0.12091038406827881, + "grad_norm": 3.4554972648620605, + "learning_rate": 2.942402322276194e-06, + "loss": 6.3974, + "step": 340 + }, + { + "epoch": 0.12126600284495022, + "grad_norm": 2.226229667663574, + "learning_rate": 2.941924469559494e-06, + "loss": 5.0555, + "step": 341 + }, + { + "epoch": 0.12162162162162163, + "grad_norm": 0.8381339311599731, + "learning_rate": 2.9414446819235756e-06, + "loss": 4.2992, + "step": 342 + }, + { + "epoch": 0.12197724039829302, + "grad_norm": 2.08362078666687, + "learning_rate": 2.9409629600122657e-06, + "loss": 4.7059, + "step": 343 + }, + { + "epoch": 0.12233285917496443, + "grad_norm": 1.0644973516464233, + "learning_rate": 2.940479304471987e-06, + "loss": 4.1309, + "step": 344 + }, + { + "epoch": 0.12268847795163584, + "grad_norm": 1.8396978378295898, + "learning_rate": 2.939993715951757e-06, + "loss": 4.3294, + "step": 345 + }, + { + "epoch": 0.12304409672830725, + "grad_norm": 1.6997473239898682, + "learning_rate": 2.9395061951031878e-06, + "loss": 3.3947, + "step": 346 + }, + { + "epoch": 0.12339971550497866, + "grad_norm": 1.0901015996932983, + "learning_rate": 2.9390167425804836e-06, + "loss": 3.7619, + "step": 347 + }, + { + "epoch": 0.12375533428165007, + "grad_norm": 1.4327492713928223, + "learning_rate": 2.9385253590404404e-06, + "loss": 6.0227, + "step": 348 + }, + { + "epoch": 0.12411095305832148, + "grad_norm": 1.208313226699829, + "learning_rate": 2.9380320451424465e-06, + "loss": 5.7823, + "step": 349 + }, + { + "epoch": 0.12446657183499289, + "grad_norm": 1.2747994661331177, + "learning_rate": 2.9375368015484807e-06, + "loss": 5.7174, + "step": 350 + }, + { + "epoch": 0.1248221906116643, + "grad_norm": 0.9440839290618896, + "learning_rate": 2.93703962892311e-06, + "loss": 3.9918, + "step": 351 + }, + { + "epoch": 0.1251778093883357, + "grad_norm": 2.495675563812256, + "learning_rate": 2.9365405279334904e-06, + "loss": 1.8021, + "step": 352 + }, + { + "epoch": 0.12553342816500712, + "grad_norm": 1.2236593961715698, + "learning_rate": 2.936039499249366e-06, + "loss": 3.8869, + "step": 353 + }, + { + "epoch": 0.12588904694167852, + "grad_norm": 1.1432700157165527, + "learning_rate": 2.9355365435430673e-06, + "loss": 4.0786, + "step": 354 + }, + { + "epoch": 0.12624466571834994, + "grad_norm": 2.5109128952026367, + "learning_rate": 2.935031661489512e-06, + "loss": 6.375, + "step": 355 + }, + { + "epoch": 0.12660028449502134, + "grad_norm": 0.879395604133606, + "learning_rate": 2.9345248537661996e-06, + "loss": 3.4977, + "step": 356 + }, + { + "epoch": 0.12695590327169273, + "grad_norm": 1.7509536743164062, + "learning_rate": 2.9340161210532175e-06, + "loss": 4.7449, + "step": 357 + }, + { + "epoch": 0.12731152204836416, + "grad_norm": 0.8291417956352234, + "learning_rate": 2.933505464033233e-06, + "loss": 3.534, + "step": 358 + }, + { + "epoch": 0.12766714082503555, + "grad_norm": 2.1183810234069824, + "learning_rate": 2.9329928833914985e-06, + "loss": 4.7768, + "step": 359 + }, + { + "epoch": 0.12802275960170698, + "grad_norm": 1.101134181022644, + "learning_rate": 2.9324783798158447e-06, + "loss": 4.4087, + "step": 360 + }, + { + "epoch": 0.12837837837837837, + "grad_norm": 2.5908446311950684, + "learning_rate": 2.931961953996685e-06, + "loss": 6.1929, + "step": 361 + }, + { + "epoch": 0.1287339971550498, + "grad_norm": 2.5793728828430176, + "learning_rate": 2.9314436066270115e-06, + "loss": 7.1569, + "step": 362 + }, + { + "epoch": 0.1290896159317212, + "grad_norm": 1.7790296077728271, + "learning_rate": 2.930923338402395e-06, + "loss": 3.5686, + "step": 363 + }, + { + "epoch": 0.12944523470839261, + "grad_norm": 0.7739233374595642, + "learning_rate": 2.930401150020983e-06, + "loss": 4.2552, + "step": 364 + }, + { + "epoch": 0.129800853485064, + "grad_norm": 2.6223373413085938, + "learning_rate": 2.929877042183501e-06, + "loss": 4.1267, + "step": 365 + }, + { + "epoch": 0.1301564722617354, + "grad_norm": 1.145377516746521, + "learning_rate": 2.9293510155932493e-06, + "loss": 4.4507, + "step": 366 + }, + { + "epoch": 0.13051209103840683, + "grad_norm": 1.3254815340042114, + "learning_rate": 2.9288230709561035e-06, + "loss": 6.0496, + "step": 367 + }, + { + "epoch": 0.13086770981507823, + "grad_norm": 0.855785071849823, + "learning_rate": 2.928293208980512e-06, + "loss": 4.6132, + "step": 368 + }, + { + "epoch": 0.13122332859174965, + "grad_norm": 1.0480022430419922, + "learning_rate": 2.9277614303774982e-06, + "loss": 4.5638, + "step": 369 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 1.2895573377609253, + "learning_rate": 2.927227735860655e-06, + "loss": 4.5571, + "step": 370 + }, + { + "epoch": 0.13193456614509247, + "grad_norm": 0.8669893741607666, + "learning_rate": 2.926692126146148e-06, + "loss": 3.878, + "step": 371 + }, + { + "epoch": 0.13229018492176386, + "grad_norm": 1.0412732362747192, + "learning_rate": 2.926154601952712e-06, + "loss": 3.8371, + "step": 372 + }, + { + "epoch": 0.1326458036984353, + "grad_norm": 1.025474190711975, + "learning_rate": 2.925615164001651e-06, + "loss": 4.4361, + "step": 373 + }, + { + "epoch": 0.13300142247510668, + "grad_norm": 0.7745802402496338, + "learning_rate": 2.9250738130168364e-06, + "loss": 3.5167, + "step": 374 + }, + { + "epoch": 0.13335704125177808, + "grad_norm": 1.8033794164657593, + "learning_rate": 2.9245305497247086e-06, + "loss": 4.3072, + "step": 375 + }, + { + "epoch": 0.1337126600284495, + "grad_norm": 0.8972547650337219, + "learning_rate": 2.9239853748542717e-06, + "loss": 4.6146, + "step": 376 + }, + { + "epoch": 0.1340682788051209, + "grad_norm": 4.214621543884277, + "learning_rate": 2.9234382891370966e-06, + "loss": 3.8283, + "step": 377 + }, + { + "epoch": 0.13442389758179232, + "grad_norm": 2.4759042263031006, + "learning_rate": 2.922889293307319e-06, + "loss": 6.2074, + "step": 378 + }, + { + "epoch": 0.13477951635846372, + "grad_norm": 0.7912075519561768, + "learning_rate": 2.922338388101635e-06, + "loss": 4.0259, + "step": 379 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 1.282339096069336, + "learning_rate": 2.9217855742593053e-06, + "loss": 4.38, + "step": 380 + }, + { + "epoch": 0.13549075391180654, + "grad_norm": 2.1565747261047363, + "learning_rate": 2.921230852522151e-06, + "loss": 4.5373, + "step": 381 + }, + { + "epoch": 0.13584637268847796, + "grad_norm": 1.2422152757644653, + "learning_rate": 2.920674223634554e-06, + "loss": 5.2234, + "step": 382 + }, + { + "epoch": 0.13620199146514936, + "grad_norm": 1.2851698398590088, + "learning_rate": 2.9201156883434544e-06, + "loss": 3.3728, + "step": 383 + }, + { + "epoch": 0.13655761024182078, + "grad_norm": 1.8690694570541382, + "learning_rate": 2.9195552473983515e-06, + "loss": 5.5719, + "step": 384 + }, + { + "epoch": 0.13691322901849218, + "grad_norm": 1.6750600337982178, + "learning_rate": 2.918992901551301e-06, + "loss": 5.2886, + "step": 385 + }, + { + "epoch": 0.13726884779516357, + "grad_norm": 1.2670941352844238, + "learning_rate": 2.918428651556914e-06, + "loss": 4.2929, + "step": 386 + }, + { + "epoch": 0.137624466571835, + "grad_norm": 1.3003813028335571, + "learning_rate": 2.91786249817236e-06, + "loss": 3.6213, + "step": 387 + }, + { + "epoch": 0.1379800853485064, + "grad_norm": 1.0740904808044434, + "learning_rate": 2.9172944421573588e-06, + "loss": 4.3491, + "step": 388 + }, + { + "epoch": 0.13833570412517782, + "grad_norm": 1.8878662586212158, + "learning_rate": 2.9167244842741857e-06, + "loss": 5.5508, + "step": 389 + }, + { + "epoch": 0.1386913229018492, + "grad_norm": 0.8962175250053406, + "learning_rate": 2.9161526252876678e-06, + "loss": 3.0892, + "step": 390 + }, + { + "epoch": 0.13904694167852064, + "grad_norm": 2.370478391647339, + "learning_rate": 2.9155788659651826e-06, + "loss": 4.2346, + "step": 391 + }, + { + "epoch": 0.13940256045519203, + "grad_norm": 1.3908495903015137, + "learning_rate": 2.9150032070766577e-06, + "loss": 3.3026, + "step": 392 + }, + { + "epoch": 0.13975817923186346, + "grad_norm": 1.0625017881393433, + "learning_rate": 2.914425649394571e-06, + "loss": 4.3274, + "step": 393 + }, + { + "epoch": 0.14011379800853485, + "grad_norm": 6.0378618240356445, + "learning_rate": 2.9138461936939467e-06, + "loss": 6.2655, + "step": 394 + }, + { + "epoch": 0.14046941678520625, + "grad_norm": 1.2048847675323486, + "learning_rate": 2.913264840752357e-06, + "loss": 3.6459, + "step": 395 + }, + { + "epoch": 0.14082503556187767, + "grad_norm": 1.7702374458312988, + "learning_rate": 2.9126815913499194e-06, + "loss": 4.397, + "step": 396 + }, + { + "epoch": 0.14118065433854907, + "grad_norm": 2.21747088432312, + "learning_rate": 2.9120964462692972e-06, + "loss": 5.8482, + "step": 397 + }, + { + "epoch": 0.1415362731152205, + "grad_norm": 3.0990536212921143, + "learning_rate": 2.9115094062956967e-06, + "loss": 5.3346, + "step": 398 + }, + { + "epoch": 0.14189189189189189, + "grad_norm": 0.7880954146385193, + "learning_rate": 2.9109204722168668e-06, + "loss": 3.6419, + "step": 399 + }, + { + "epoch": 0.1422475106685633, + "grad_norm": 2.010579824447632, + "learning_rate": 2.9103296448230986e-06, + "loss": 6.4316, + "step": 400 + }, + { + "epoch": 0.1426031294452347, + "grad_norm": 1.3603864908218384, + "learning_rate": 2.909736924907224e-06, + "loss": 3.7582, + "step": 401 + }, + { + "epoch": 0.14295874822190613, + "grad_norm": 0.899783730506897, + "learning_rate": 2.9091423132646134e-06, + "loss": 3.199, + "step": 402 + }, + { + "epoch": 0.14331436699857752, + "grad_norm": 0.7850140333175659, + "learning_rate": 2.9085458106931776e-06, + "loss": 3.7833, + "step": 403 + }, + { + "epoch": 0.14366998577524892, + "grad_norm": 1.5178565979003906, + "learning_rate": 2.9079474179933635e-06, + "loss": 4.8171, + "step": 404 + }, + { + "epoch": 0.14402560455192034, + "grad_norm": 1.0936920642852783, + "learning_rate": 2.9073471359681537e-06, + "loss": 4.9758, + "step": 405 + }, + { + "epoch": 0.14438122332859174, + "grad_norm": 4.55645751953125, + "learning_rate": 2.906744965423067e-06, + "loss": 7.5298, + "step": 406 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 1.4274687767028809, + "learning_rate": 2.9061409071661576e-06, + "loss": 4.4045, + "step": 407 + }, + { + "epoch": 0.14509246088193456, + "grad_norm": 1.1000972986221313, + "learning_rate": 2.9055349620080108e-06, + "loss": 4.3448, + "step": 408 + }, + { + "epoch": 0.14544807965860598, + "grad_norm": 2.840977907180786, + "learning_rate": 2.9049271307617446e-06, + "loss": 6.425, + "step": 409 + }, + { + "epoch": 0.14580369843527738, + "grad_norm": 4.55898904800415, + "learning_rate": 2.9043174142430084e-06, + "loss": 7.459, + "step": 410 + }, + { + "epoch": 0.1461593172119488, + "grad_norm": 3.023228406906128, + "learning_rate": 2.9037058132699812e-06, + "loss": 5.9826, + "step": 411 + }, + { + "epoch": 0.1465149359886202, + "grad_norm": 1.4194024801254272, + "learning_rate": 2.9030923286633703e-06, + "loss": 4.7581, + "step": 412 + }, + { + "epoch": 0.1468705547652916, + "grad_norm": 0.8513787388801575, + "learning_rate": 2.902476961246411e-06, + "loss": 3.775, + "step": 413 + }, + { + "epoch": 0.14722617354196302, + "grad_norm": 1.5157617330551147, + "learning_rate": 2.901859711844866e-06, + "loss": 4.1951, + "step": 414 + }, + { + "epoch": 0.1475817923186344, + "grad_norm": 0.8644416332244873, + "learning_rate": 2.9012405812870213e-06, + "loss": 3.6441, + "step": 415 + }, + { + "epoch": 0.14793741109530584, + "grad_norm": 2.594719648361206, + "learning_rate": 2.90061957040369e-06, + "loss": 2.7785, + "step": 416 + }, + { + "epoch": 0.14829302987197723, + "grad_norm": 0.8559796214103699, + "learning_rate": 2.8999966800282054e-06, + "loss": 4.7348, + "step": 417 + }, + { + "epoch": 0.14864864864864866, + "grad_norm": 1.0496301651000977, + "learning_rate": 2.8993719109964255e-06, + "loss": 3.819, + "step": 418 + }, + { + "epoch": 0.14900426742532005, + "grad_norm": 1.1008789539337158, + "learning_rate": 2.8987452641467275e-06, + "loss": 5.3184, + "step": 419 + }, + { + "epoch": 0.14935988620199148, + "grad_norm": 1.7244600057601929, + "learning_rate": 2.89811674032001e-06, + "loss": 6.0207, + "step": 420 + }, + { + "epoch": 0.14971550497866287, + "grad_norm": 1.0164358615875244, + "learning_rate": 2.8974863403596885e-06, + "loss": 3.9044, + "step": 421 + }, + { + "epoch": 0.15007112375533427, + "grad_norm": 1.498863935470581, + "learning_rate": 2.8968540651116977e-06, + "loss": 5.9092, + "step": 422 + }, + { + "epoch": 0.1504267425320057, + "grad_norm": 0.9629373550415039, + "learning_rate": 2.8962199154244883e-06, + "loss": 3.7031, + "step": 423 + }, + { + "epoch": 0.1507823613086771, + "grad_norm": 0.9522832036018372, + "learning_rate": 2.895583892149025e-06, + "loss": 4.172, + "step": 424 + }, + { + "epoch": 0.1511379800853485, + "grad_norm": 1.036159634590149, + "learning_rate": 2.8949459961387893e-06, + "loss": 3.6561, + "step": 425 + }, + { + "epoch": 0.1514935988620199, + "grad_norm": 1.724344253540039, + "learning_rate": 2.8943062282497728e-06, + "loss": 5.8948, + "step": 426 + }, + { + "epoch": 0.15184921763869133, + "grad_norm": 0.8653039336204529, + "learning_rate": 2.893664589340481e-06, + "loss": 3.4425, + "step": 427 + }, + { + "epoch": 0.15220483641536273, + "grad_norm": 0.9935992956161499, + "learning_rate": 2.89302108027193e-06, + "loss": 3.9269, + "step": 428 + }, + { + "epoch": 0.15256045519203415, + "grad_norm": 1.3698943853378296, + "learning_rate": 2.892375701907644e-06, + "loss": 5.4388, + "step": 429 + }, + { + "epoch": 0.15291607396870555, + "grad_norm": 1.107921838760376, + "learning_rate": 2.891728455113657e-06, + "loss": 5.0098, + "step": 430 + }, + { + "epoch": 0.15327169274537697, + "grad_norm": 1.2677091360092163, + "learning_rate": 2.8910793407585097e-06, + "loss": 4.1724, + "step": 431 + }, + { + "epoch": 0.15362731152204837, + "grad_norm": 3.229732036590576, + "learning_rate": 2.8904283597132496e-06, + "loss": 7.0383, + "step": 432 + }, + { + "epoch": 0.15398293029871976, + "grad_norm": 0.8769774436950684, + "learning_rate": 2.8897755128514277e-06, + "loss": 3.1472, + "step": 433 + }, + { + "epoch": 0.15433854907539118, + "grad_norm": 1.3042701482772827, + "learning_rate": 2.8891208010491003e-06, + "loss": 5.074, + "step": 434 + }, + { + "epoch": 0.15469416785206258, + "grad_norm": 1.6202664375305176, + "learning_rate": 2.8884642251848244e-06, + "loss": 4.46, + "step": 435 + }, + { + "epoch": 0.155049786628734, + "grad_norm": 1.6061434745788574, + "learning_rate": 2.8878057861396606e-06, + "loss": 4.2319, + "step": 436 + }, + { + "epoch": 0.1554054054054054, + "grad_norm": 1.0755480527877808, + "learning_rate": 2.887145484797168e-06, + "loss": 4.0267, + "step": 437 + }, + { + "epoch": 0.15576102418207682, + "grad_norm": 0.9695757031440735, + "learning_rate": 2.886483322043406e-06, + "loss": 4.1416, + "step": 438 + }, + { + "epoch": 0.15611664295874822, + "grad_norm": 0.9161505103111267, + "learning_rate": 2.88581929876693e-06, + "loss": 3.5891, + "step": 439 + }, + { + "epoch": 0.15647226173541964, + "grad_norm": 0.7957665920257568, + "learning_rate": 2.8851534158587944e-06, + "loss": 4.1601, + "step": 440 + }, + { + "epoch": 0.15682788051209104, + "grad_norm": 1.2745026350021362, + "learning_rate": 2.8844856742125472e-06, + "loss": 4.9032, + "step": 441 + }, + { + "epoch": 0.15718349928876243, + "grad_norm": 0.703338086605072, + "learning_rate": 2.8838160747242317e-06, + "loss": 3.4565, + "step": 442 + }, + { + "epoch": 0.15753911806543386, + "grad_norm": 1.0484261512756348, + "learning_rate": 2.883144618292383e-06, + "loss": 3.8791, + "step": 443 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 2.3019142150878906, + "learning_rate": 2.8824713058180296e-06, + "loss": 5.2501, + "step": 444 + }, + { + "epoch": 0.15825035561877668, + "grad_norm": 0.8409615159034729, + "learning_rate": 2.8817961382046896e-06, + "loss": 4.2325, + "step": 445 + }, + { + "epoch": 0.15860597439544807, + "grad_norm": 0.716831624507904, + "learning_rate": 2.881119116358371e-06, + "loss": 2.8239, + "step": 446 + }, + { + "epoch": 0.1589615931721195, + "grad_norm": 2.049302101135254, + "learning_rate": 2.8804402411875693e-06, + "loss": 4.0652, + "step": 447 + }, + { + "epoch": 0.1593172119487909, + "grad_norm": 1.3637961149215698, + "learning_rate": 2.8797595136032674e-06, + "loss": 3.8916, + "step": 448 + }, + { + "epoch": 0.15967283072546232, + "grad_norm": 1.3892077207565308, + "learning_rate": 2.879076934518935e-06, + "loss": 4.5348, + "step": 449 + }, + { + "epoch": 0.1600284495021337, + "grad_norm": 0.8823988437652588, + "learning_rate": 2.8783925048505246e-06, + "loss": 3.923, + "step": 450 + }, + { + "epoch": 0.1603840682788051, + "grad_norm": 1.1572587490081787, + "learning_rate": 2.8777062255164724e-06, + "loss": 4.2422, + "step": 451 + }, + { + "epoch": 0.16073968705547653, + "grad_norm": 1.4720207452774048, + "learning_rate": 2.877018097437698e-06, + "loss": 5.9722, + "step": 452 + }, + { + "epoch": 0.16109530583214793, + "grad_norm": 1.6064141988754272, + "learning_rate": 2.8763281215376e-06, + "loss": 5.0243, + "step": 453 + }, + { + "epoch": 0.16145092460881935, + "grad_norm": 0.8029962778091431, + "learning_rate": 2.875636298742058e-06, + "loss": 4.4418, + "step": 454 + }, + { + "epoch": 0.16180654338549075, + "grad_norm": 0.7559634447097778, + "learning_rate": 2.874942629979428e-06, + "loss": 3.6726, + "step": 455 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 1.3918873071670532, + "learning_rate": 2.874247116180547e-06, + "loss": 4.5257, + "step": 456 + }, + { + "epoch": 0.16251778093883357, + "grad_norm": 1.180337905883789, + "learning_rate": 2.873549758278723e-06, + "loss": 3.7322, + "step": 457 + }, + { + "epoch": 0.162873399715505, + "grad_norm": 1.9698855876922607, + "learning_rate": 2.872850557209742e-06, + "loss": 5.2614, + "step": 458 + }, + { + "epoch": 0.1632290184921764, + "grad_norm": 1.4811458587646484, + "learning_rate": 2.8721495139118622e-06, + "loss": 4.042, + "step": 459 + }, + { + "epoch": 0.16358463726884778, + "grad_norm": 1.279442548751831, + "learning_rate": 2.871446629325814e-06, + "loss": 4.3024, + "step": 460 + }, + { + "epoch": 0.1639402560455192, + "grad_norm": 1.3064237833023071, + "learning_rate": 2.8707419043947985e-06, + "loss": 5.8484, + "step": 461 + }, + { + "epoch": 0.1642958748221906, + "grad_norm": 1.0669044256210327, + "learning_rate": 2.8700353400644867e-06, + "loss": 3.9624, + "step": 462 + }, + { + "epoch": 0.16465149359886203, + "grad_norm": 3.148326873779297, + "learning_rate": 2.8693269372830174e-06, + "loss": 6.9432, + "step": 463 + }, + { + "epoch": 0.16500711237553342, + "grad_norm": 1.7402124404907227, + "learning_rate": 2.8686166970009964e-06, + "loss": 5.8382, + "step": 464 + }, + { + "epoch": 0.16536273115220484, + "grad_norm": 1.422709345817566, + "learning_rate": 2.867904620171496e-06, + "loss": 4.3613, + "step": 465 + }, + { + "epoch": 0.16571834992887624, + "grad_norm": 2.5943527221679688, + "learning_rate": 2.867190707750052e-06, + "loss": 4.6915, + "step": 466 + }, + { + "epoch": 0.16607396870554766, + "grad_norm": 2.6017744541168213, + "learning_rate": 2.8664749606946642e-06, + "loss": 2.8053, + "step": 467 + }, + { + "epoch": 0.16642958748221906, + "grad_norm": 1.0881963968276978, + "learning_rate": 2.8657573799657944e-06, + "loss": 3.3331, + "step": 468 + }, + { + "epoch": 0.16678520625889046, + "grad_norm": 3.782257556915283, + "learning_rate": 2.8650379665263636e-06, + "loss": 3.5796, + "step": 469 + }, + { + "epoch": 0.16714082503556188, + "grad_norm": 2.4624698162078857, + "learning_rate": 2.864316721341754e-06, + "loss": 5.327, + "step": 470 + }, + { + "epoch": 0.16749644381223328, + "grad_norm": 1.6265795230865479, + "learning_rate": 2.863593645379804e-06, + "loss": 4.4254, + "step": 471 + }, + { + "epoch": 0.1678520625889047, + "grad_norm": 0.9265016913414001, + "learning_rate": 2.8628687396108106e-06, + "loss": 4.2407, + "step": 472 + }, + { + "epoch": 0.1682076813655761, + "grad_norm": 1.0075774192810059, + "learning_rate": 2.862142005007524e-06, + "loss": 3.8666, + "step": 473 + }, + { + "epoch": 0.16856330014224752, + "grad_norm": 1.0373834371566772, + "learning_rate": 2.8614134425451513e-06, + "loss": 4.4611, + "step": 474 + }, + { + "epoch": 0.16891891891891891, + "grad_norm": 0.7770723700523376, + "learning_rate": 2.8606830532013497e-06, + "loss": 3.6458, + "step": 475 + }, + { + "epoch": 0.16927453769559034, + "grad_norm": 0.9007360935211182, + "learning_rate": 2.8599508379562295e-06, + "loss": 3.5867, + "step": 476 + }, + { + "epoch": 0.16963015647226173, + "grad_norm": 2.185502529144287, + "learning_rate": 2.8592167977923505e-06, + "loss": 3.1777, + "step": 477 + }, + { + "epoch": 0.16998577524893316, + "grad_norm": 4.050443649291992, + "learning_rate": 2.8584809336947216e-06, + "loss": 6.1831, + "step": 478 + }, + { + "epoch": 0.17034139402560455, + "grad_norm": 1.0748915672302246, + "learning_rate": 2.8577432466507997e-06, + "loss": 3.7625, + "step": 479 + }, + { + "epoch": 0.17069701280227595, + "grad_norm": 1.016385793685913, + "learning_rate": 2.857003737650487e-06, + "loss": 2.2661, + "step": 480 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 3.203923463821411, + "learning_rate": 2.8562624076861303e-06, + "loss": 6.788, + "step": 481 + }, + { + "epoch": 0.17140825035561877, + "grad_norm": 0.8450173139572144, + "learning_rate": 2.855519257752522e-06, + "loss": 3.4364, + "step": 482 + }, + { + "epoch": 0.1717638691322902, + "grad_norm": 1.4651472568511963, + "learning_rate": 2.8547742888468954e-06, + "loss": 4.9542, + "step": 483 + }, + { + "epoch": 0.1721194879089616, + "grad_norm": 1.3444221019744873, + "learning_rate": 2.8540275019689238e-06, + "loss": 4.5488, + "step": 484 + }, + { + "epoch": 0.172475106685633, + "grad_norm": 3.588040590286255, + "learning_rate": 2.853278898120721e-06, + "loss": 6.5722, + "step": 485 + }, + { + "epoch": 0.1728307254623044, + "grad_norm": 3.30137300491333, + "learning_rate": 2.8525284783068394e-06, + "loss": 6.5606, + "step": 486 + }, + { + "epoch": 0.17318634423897583, + "grad_norm": 1.0483907461166382, + "learning_rate": 2.8517762435342676e-06, + "loss": 3.8013, + "step": 487 + }, + { + "epoch": 0.17354196301564723, + "grad_norm": 1.5078085660934448, + "learning_rate": 2.8510221948124293e-06, + "loss": 5.0763, + "step": 488 + }, + { + "epoch": 0.17389758179231862, + "grad_norm": 1.0674434900283813, + "learning_rate": 2.850266333153184e-06, + "loss": 4.0963, + "step": 489 + }, + { + "epoch": 0.17425320056899005, + "grad_norm": 1.273695468902588, + "learning_rate": 2.8495086595708216e-06, + "loss": 3.5468, + "step": 490 + }, + { + "epoch": 0.17460881934566144, + "grad_norm": 0.8278428912162781, + "learning_rate": 2.8487491750820658e-06, + "loss": 2.9686, + "step": 491 + }, + { + "epoch": 0.17496443812233287, + "grad_norm": 0.8448724150657654, + "learning_rate": 2.8479878807060686e-06, + "loss": 4.0522, + "step": 492 + }, + { + "epoch": 0.17532005689900426, + "grad_norm": 0.6080875396728516, + "learning_rate": 2.8472247774644112e-06, + "loss": 3.5537, + "step": 493 + }, + { + "epoch": 0.17567567567567569, + "grad_norm": 1.062381386756897, + "learning_rate": 2.8464598663811027e-06, + "loss": 4.0672, + "step": 494 + }, + { + "epoch": 0.17603129445234708, + "grad_norm": 0.8491649031639099, + "learning_rate": 2.845693148482578e-06, + "loss": 4.127, + "step": 495 + }, + { + "epoch": 0.1763869132290185, + "grad_norm": 0.6444569230079651, + "learning_rate": 2.8449246247976946e-06, + "loss": 3.1155, + "step": 496 + }, + { + "epoch": 0.1767425320056899, + "grad_norm": 0.9954007267951965, + "learning_rate": 2.844154296357737e-06, + "loss": 3.7328, + "step": 497 + }, + { + "epoch": 0.1770981507823613, + "grad_norm": 1.710366129875183, + "learning_rate": 2.843382164196408e-06, + "loss": 4.8118, + "step": 498 + }, + { + "epoch": 0.17745376955903272, + "grad_norm": 1.5804247856140137, + "learning_rate": 2.842608229349833e-06, + "loss": 5.9447, + "step": 499 + }, + { + "epoch": 0.17780938833570412, + "grad_norm": 0.9345231652259827, + "learning_rate": 2.841832492856554e-06, + "loss": 4.361, + "step": 500 + }, + { + "epoch": 0.17816500711237554, + "grad_norm": 3.776193141937256, + "learning_rate": 2.841054955757534e-06, + "loss": 7.7796, + "step": 501 + }, + { + "epoch": 0.17852062588904694, + "grad_norm": 2.03364896774292, + "learning_rate": 2.84027561909615e-06, + "loss": 4.5487, + "step": 502 + }, + { + "epoch": 0.17887624466571836, + "grad_norm": 1.6485395431518555, + "learning_rate": 2.839494483918194e-06, + "loss": 4.1081, + "step": 503 + }, + { + "epoch": 0.17923186344238975, + "grad_norm": 2.844550132751465, + "learning_rate": 2.838711551271872e-06, + "loss": 4.176, + "step": 504 + }, + { + "epoch": 0.17958748221906118, + "grad_norm": 1.325052261352539, + "learning_rate": 2.8379268222078005e-06, + "loss": 2.1712, + "step": 505 + }, + { + "epoch": 0.17994310099573257, + "grad_norm": 0.726956307888031, + "learning_rate": 2.83714029777901e-06, + "loss": 2.9441, + "step": 506 + }, + { + "epoch": 0.18029871977240397, + "grad_norm": 1.7597980499267578, + "learning_rate": 2.8363519790409363e-06, + "loss": 5.9519, + "step": 507 + }, + { + "epoch": 0.1806543385490754, + "grad_norm": 0.7890097498893738, + "learning_rate": 2.8355618670514258e-06, + "loss": 3.9007, + "step": 508 + }, + { + "epoch": 0.1810099573257468, + "grad_norm": 0.9674394726753235, + "learning_rate": 2.8347699628707296e-06, + "loss": 3.7591, + "step": 509 + }, + { + "epoch": 0.1813655761024182, + "grad_norm": 0.8734735250473022, + "learning_rate": 2.833976267561504e-06, + "loss": 3.4163, + "step": 510 + }, + { + "epoch": 0.1817211948790896, + "grad_norm": 1.2141437530517578, + "learning_rate": 2.83318078218881e-06, + "loss": 5.151, + "step": 511 + }, + { + "epoch": 0.18207681365576103, + "grad_norm": 1.0548174381256104, + "learning_rate": 2.8323835078201093e-06, + "loss": 3.5243, + "step": 512 + }, + { + "epoch": 0.18243243243243243, + "grad_norm": 1.612585425376892, + "learning_rate": 2.831584445525266e-06, + "loss": 4.4866, + "step": 513 + }, + { + "epoch": 0.18278805120910385, + "grad_norm": 0.9087425470352173, + "learning_rate": 2.8307835963765403e-06, + "loss": 3.8177, + "step": 514 + }, + { + "epoch": 0.18314366998577525, + "grad_norm": 0.6460033059120178, + "learning_rate": 2.829980961448593e-06, + "loss": 2.2058, + "step": 515 + }, + { + "epoch": 0.18349928876244664, + "grad_norm": 1.585752248764038, + "learning_rate": 2.829176541818481e-06, + "loss": 2.8631, + "step": 516 + }, + { + "epoch": 0.18385490753911807, + "grad_norm": 1.0954480171203613, + "learning_rate": 2.828370338565654e-06, + "loss": 4.3102, + "step": 517 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 2.8611817359924316, + "learning_rate": 2.827562352771958e-06, + "loss": 5.5732, + "step": 518 + }, + { + "epoch": 0.1845661450924609, + "grad_norm": 1.1231988668441772, + "learning_rate": 2.8267525855216288e-06, + "loss": 4.0545, + "step": 519 + }, + { + "epoch": 0.18492176386913228, + "grad_norm": 1.0016043186187744, + "learning_rate": 2.825941037901294e-06, + "loss": 3.7288, + "step": 520 + }, + { + "epoch": 0.1852773826458037, + "grad_norm": 2.3641672134399414, + "learning_rate": 2.8251277109999688e-06, + "loss": 4.2448, + "step": 521 + }, + { + "epoch": 0.1856330014224751, + "grad_norm": 0.7726828455924988, + "learning_rate": 2.824312605909058e-06, + "loss": 3.5728, + "step": 522 + }, + { + "epoch": 0.18598862019914653, + "grad_norm": 0.7674732208251953, + "learning_rate": 2.823495723722351e-06, + "loss": 3.5124, + "step": 523 + }, + { + "epoch": 0.18634423897581792, + "grad_norm": 1.5398187637329102, + "learning_rate": 2.8226770655360226e-06, + "loss": 3.6528, + "step": 524 + }, + { + "epoch": 0.18669985775248932, + "grad_norm": 2.3653814792633057, + "learning_rate": 2.821856632448631e-06, + "loss": 5.3185, + "step": 525 + }, + { + "epoch": 0.18705547652916074, + "grad_norm": 1.4469196796417236, + "learning_rate": 2.8210344255611157e-06, + "loss": 3.6778, + "step": 526 + }, + { + "epoch": 0.18741109530583214, + "grad_norm": 1.4013607501983643, + "learning_rate": 2.820210445976796e-06, + "loss": 4.3378, + "step": 527 + }, + { + "epoch": 0.18776671408250356, + "grad_norm": 2.5016520023345947, + "learning_rate": 2.819384694801371e-06, + "loss": 3.7682, + "step": 528 + }, + { + "epoch": 0.18812233285917496, + "grad_norm": 1.364500880241394, + "learning_rate": 2.818557173142917e-06, + "loss": 4.4085, + "step": 529 + }, + { + "epoch": 0.18847795163584638, + "grad_norm": 0.8757725954055786, + "learning_rate": 2.817727882111885e-06, + "loss": 4.0587, + "step": 530 + }, + { + "epoch": 0.18883357041251778, + "grad_norm": 0.8975615501403809, + "learning_rate": 2.816896822821101e-06, + "loss": 3.9759, + "step": 531 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 1.054650068283081, + "learning_rate": 2.816063996385765e-06, + "loss": 4.9169, + "step": 532 + }, + { + "epoch": 0.1895448079658606, + "grad_norm": 1.2848254442214966, + "learning_rate": 2.8152294039234457e-06, + "loss": 5.119, + "step": 533 + }, + { + "epoch": 0.18990042674253202, + "grad_norm": 1.0389490127563477, + "learning_rate": 2.814393046554085e-06, + "loss": 3.952, + "step": 534 + }, + { + "epoch": 0.19025604551920342, + "grad_norm": 1.275508165359497, + "learning_rate": 2.8135549253999896e-06, + "loss": 5.1179, + "step": 535 + }, + { + "epoch": 0.1906116642958748, + "grad_norm": 1.0334125757217407, + "learning_rate": 2.8127150415858364e-06, + "loss": 3.4547, + "step": 536 + }, + { + "epoch": 0.19096728307254623, + "grad_norm": 4.698150157928467, + "learning_rate": 2.8118733962386644e-06, + "loss": 3.4054, + "step": 537 + }, + { + "epoch": 0.19132290184921763, + "grad_norm": 0.7376463413238525, + "learning_rate": 2.811029990487878e-06, + "loss": 3.1261, + "step": 538 + }, + { + "epoch": 0.19167852062588905, + "grad_norm": 0.8174617290496826, + "learning_rate": 2.8101848254652452e-06, + "loss": 3.8789, + "step": 539 + }, + { + "epoch": 0.19203413940256045, + "grad_norm": 1.7538596391677856, + "learning_rate": 2.8093379023048925e-06, + "loss": 3.5618, + "step": 540 + }, + { + "epoch": 0.19238975817923187, + "grad_norm": 1.0602490901947021, + "learning_rate": 2.808489222143306e-06, + "loss": 4.5452, + "step": 541 + }, + { + "epoch": 0.19274537695590327, + "grad_norm": 1.1062183380126953, + "learning_rate": 2.807638786119331e-06, + "loss": 3.8009, + "step": 542 + }, + { + "epoch": 0.1931009957325747, + "grad_norm": 0.7910590171813965, + "learning_rate": 2.806786595374168e-06, + "loss": 2.9511, + "step": 543 + }, + { + "epoch": 0.1934566145092461, + "grad_norm": 0.7155831456184387, + "learning_rate": 2.8059326510513718e-06, + "loss": 4.0346, + "step": 544 + }, + { + "epoch": 0.19381223328591748, + "grad_norm": 1.0348141193389893, + "learning_rate": 2.805076954296851e-06, + "loss": 3.3954, + "step": 545 + }, + { + "epoch": 0.1941678520625889, + "grad_norm": 1.1615036725997925, + "learning_rate": 2.804219506258865e-06, + "loss": 4.9397, + "step": 546 + }, + { + "epoch": 0.1945234708392603, + "grad_norm": 1.064865231513977, + "learning_rate": 2.8033603080880247e-06, + "loss": 4.6749, + "step": 547 + }, + { + "epoch": 0.19487908961593173, + "grad_norm": 0.8134397864341736, + "learning_rate": 2.8024993609372878e-06, + "loss": 2.9612, + "step": 548 + }, + { + "epoch": 0.19523470839260312, + "grad_norm": 1.5713715553283691, + "learning_rate": 2.80163666596196e-06, + "loss": 4.0352, + "step": 549 + }, + { + "epoch": 0.19559032716927455, + "grad_norm": 0.816437840461731, + "learning_rate": 2.8007722243196922e-06, + "loss": 3.5071, + "step": 550 + }, + { + "epoch": 0.19594594594594594, + "grad_norm": 1.4520788192749023, + "learning_rate": 2.799906037170479e-06, + "loss": 5.2718, + "step": 551 + }, + { + "epoch": 0.19630156472261737, + "grad_norm": 0.8753415942192078, + "learning_rate": 2.7990381056766585e-06, + "loss": 3.6946, + "step": 552 + }, + { + "epoch": 0.19665718349928876, + "grad_norm": 1.2038344144821167, + "learning_rate": 2.7981684310029063e-06, + "loss": 4.3704, + "step": 553 + }, + { + "epoch": 0.19701280227596016, + "grad_norm": 3.3635547161102295, + "learning_rate": 2.797297014316241e-06, + "loss": 7.6366, + "step": 554 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 0.6830400228500366, + "learning_rate": 2.796423856786016e-06, + "loss": 4.238, + "step": 555 + }, + { + "epoch": 0.19772403982930298, + "grad_norm": 1.3281424045562744, + "learning_rate": 2.795548959583923e-06, + "loss": 5.1301, + "step": 556 + }, + { + "epoch": 0.1980796586059744, + "grad_norm": 1.6344794034957886, + "learning_rate": 2.794672323883986e-06, + "loss": 5.1569, + "step": 557 + }, + { + "epoch": 0.1984352773826458, + "grad_norm": 1.057875394821167, + "learning_rate": 2.7937939508625634e-06, + "loss": 2.4669, + "step": 558 + }, + { + "epoch": 0.19879089615931722, + "grad_norm": 1.050342321395874, + "learning_rate": 2.792913841698345e-06, + "loss": 3.9789, + "step": 559 + }, + { + "epoch": 0.19914651493598862, + "grad_norm": 1.4310963153839111, + "learning_rate": 2.7920319975723482e-06, + "loss": 4.7825, + "step": 560 + }, + { + "epoch": 0.19950213371266004, + "grad_norm": 1.2031478881835938, + "learning_rate": 2.7911484196679217e-06, + "loss": 3.8083, + "step": 561 + }, + { + "epoch": 0.19985775248933144, + "grad_norm": 0.9954594969749451, + "learning_rate": 2.7902631091707387e-06, + "loss": 4.2271, + "step": 562 + }, + { + "epoch": 0.20021337126600283, + "grad_norm": 7.33213472366333, + "learning_rate": 2.789376067268797e-06, + "loss": 3.7894, + "step": 563 + }, + { + "epoch": 0.20056899004267426, + "grad_norm": 0.7132086157798767, + "learning_rate": 2.7884872951524196e-06, + "loss": 4.0386, + "step": 564 + }, + { + "epoch": 0.20092460881934565, + "grad_norm": 1.0619257688522339, + "learning_rate": 2.78759679401425e-06, + "loss": 3.5723, + "step": 565 + }, + { + "epoch": 0.20128022759601708, + "grad_norm": 0.7991335391998291, + "learning_rate": 2.7867045650492514e-06, + "loss": 3.8146, + "step": 566 + }, + { + "epoch": 0.20163584637268847, + "grad_norm": 1.1837235689163208, + "learning_rate": 2.785810609454708e-06, + "loss": 3.014, + "step": 567 + }, + { + "epoch": 0.2019914651493599, + "grad_norm": 0.9383053779602051, + "learning_rate": 2.784914928430218e-06, + "loss": 3.9208, + "step": 568 + }, + { + "epoch": 0.2023470839260313, + "grad_norm": 0.9446107745170593, + "learning_rate": 2.784017523177696e-06, + "loss": 3.6964, + "step": 569 + }, + { + "epoch": 0.20270270270270271, + "grad_norm": 1.3008809089660645, + "learning_rate": 2.783118394901372e-06, + "loss": 4.7449, + "step": 570 + }, + { + "epoch": 0.2030583214793741, + "grad_norm": 1.8068037033081055, + "learning_rate": 2.782217544807785e-06, + "loss": 5.4764, + "step": 571 + }, + { + "epoch": 0.2034139402560455, + "grad_norm": 1.39872145652771, + "learning_rate": 2.781314974105788e-06, + "loss": 4.7104, + "step": 572 + }, + { + "epoch": 0.20376955903271693, + "grad_norm": 0.7165918946266174, + "learning_rate": 2.78041068400654e-06, + "loss": 3.7487, + "step": 573 + }, + { + "epoch": 0.20412517780938833, + "grad_norm": 1.0714852809906006, + "learning_rate": 2.779504675723508e-06, + "loss": 4.3709, + "step": 574 + }, + { + "epoch": 0.20448079658605975, + "grad_norm": 0.8296200037002563, + "learning_rate": 2.7785969504724658e-06, + "loss": 4.0169, + "step": 575 + }, + { + "epoch": 0.20483641536273114, + "grad_norm": 0.7418464422225952, + "learning_rate": 2.77768750947149e-06, + "loss": 3.8664, + "step": 576 + }, + { + "epoch": 0.20519203413940257, + "grad_norm": 1.020323395729065, + "learning_rate": 2.7767763539409603e-06, + "loss": 3.5337, + "step": 577 + }, + { + "epoch": 0.20554765291607396, + "grad_norm": 2.9989380836486816, + "learning_rate": 2.775863485103557e-06, + "loss": 3.6817, + "step": 578 + }, + { + "epoch": 0.2059032716927454, + "grad_norm": 1.3198949098587036, + "learning_rate": 2.7749489041842583e-06, + "loss": 4.3752, + "step": 579 + }, + { + "epoch": 0.20625889046941678, + "grad_norm": 1.4052988290786743, + "learning_rate": 2.7740326124103415e-06, + "loss": 2.9968, + "step": 580 + }, + { + "epoch": 0.2066145092460882, + "grad_norm": 1.8199570178985596, + "learning_rate": 2.7731146110113794e-06, + "loss": 5.3422, + "step": 581 + }, + { + "epoch": 0.2069701280227596, + "grad_norm": 1.2858275175094604, + "learning_rate": 2.7721949012192375e-06, + "loss": 3.6558, + "step": 582 + }, + { + "epoch": 0.207325746799431, + "grad_norm": 1.106763482093811, + "learning_rate": 2.7712734842680758e-06, + "loss": 4.6902, + "step": 583 + }, + { + "epoch": 0.20768136557610242, + "grad_norm": 1.139768362045288, + "learning_rate": 2.7703503613943442e-06, + "loss": 3.5806, + "step": 584 + }, + { + "epoch": 0.20803698435277382, + "grad_norm": 1.5262422561645508, + "learning_rate": 2.769425533836781e-06, + "loss": 4.0224, + "step": 585 + }, + { + "epoch": 0.20839260312944524, + "grad_norm": 1.1449605226516724, + "learning_rate": 2.7684990028364135e-06, + "loss": 4.4434, + "step": 586 + }, + { + "epoch": 0.20874822190611664, + "grad_norm": 1.054160714149475, + "learning_rate": 2.767570769636554e-06, + "loss": 3.951, + "step": 587 + }, + { + "epoch": 0.20910384068278806, + "grad_norm": 2.272674322128296, + "learning_rate": 2.7666408354827985e-06, + "loss": 5.87, + "step": 588 + }, + { + "epoch": 0.20945945945945946, + "grad_norm": 1.2373629808425903, + "learning_rate": 2.7657092016230273e-06, + "loss": 4.8631, + "step": 589 + }, + { + "epoch": 0.20981507823613088, + "grad_norm": 0.8076127171516418, + "learning_rate": 2.7647758693073995e-06, + "loss": 4.0761, + "step": 590 + }, + { + "epoch": 0.21017069701280228, + "grad_norm": 0.8446835279464722, + "learning_rate": 2.7638408397883545e-06, + "loss": 4.1402, + "step": 591 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.9384989142417908, + "learning_rate": 2.762904114320609e-06, + "loss": 3.5092, + "step": 592 + }, + { + "epoch": 0.2108819345661451, + "grad_norm": 1.0348140001296997, + "learning_rate": 2.7619656941611555e-06, + "loss": 3.9671, + "step": 593 + }, + { + "epoch": 0.2112375533428165, + "grad_norm": 1.002903699874878, + "learning_rate": 2.76102558056926e-06, + "loss": 4.4568, + "step": 594 + }, + { + "epoch": 0.21159317211948792, + "grad_norm": 1.361482858657837, + "learning_rate": 2.7600837748064616e-06, + "loss": 4.2297, + "step": 595 + }, + { + "epoch": 0.2119487908961593, + "grad_norm": 1.110694169998169, + "learning_rate": 2.75914027813657e-06, + "loss": 4.9307, + "step": 596 + }, + { + "epoch": 0.21230440967283074, + "grad_norm": 1.8984003067016602, + "learning_rate": 2.7581950918256646e-06, + "loss": 3.8455, + "step": 597 + }, + { + "epoch": 0.21266002844950213, + "grad_norm": 1.019620418548584, + "learning_rate": 2.7572482171420906e-06, + "loss": 4.7256, + "step": 598 + }, + { + "epoch": 0.21301564722617355, + "grad_norm": 1.3629940748214722, + "learning_rate": 2.7562996553564597e-06, + "loss": 6.0131, + "step": 599 + }, + { + "epoch": 0.21337126600284495, + "grad_norm": 1.7468572854995728, + "learning_rate": 2.7553494077416475e-06, + "loss": 3.7973, + "step": 600 + }, + { + "epoch": 0.21372688477951635, + "grad_norm": 0.7532409429550171, + "learning_rate": 2.754397475572792e-06, + "loss": 4.3469, + "step": 601 + }, + { + "epoch": 0.21408250355618777, + "grad_norm": 0.6986871361732483, + "learning_rate": 2.7534438601272917e-06, + "loss": 2.5505, + "step": 602 + }, + { + "epoch": 0.21443812233285917, + "grad_norm": 0.8937066793441772, + "learning_rate": 2.752488562684803e-06, + "loss": 3.1002, + "step": 603 + }, + { + "epoch": 0.2147937411095306, + "grad_norm": 1.8986876010894775, + "learning_rate": 2.7515315845272412e-06, + "loss": 5.374, + "step": 604 + }, + { + "epoch": 0.21514935988620199, + "grad_norm": 0.9406418204307556, + "learning_rate": 2.750572926938774e-06, + "loss": 3.5349, + "step": 605 + }, + { + "epoch": 0.2155049786628734, + "grad_norm": 1.3899831771850586, + "learning_rate": 2.7496125912058264e-06, + "loss": 3.3324, + "step": 606 + }, + { + "epoch": 0.2158605974395448, + "grad_norm": 0.9788717031478882, + "learning_rate": 2.748650578617072e-06, + "loss": 4.461, + "step": 607 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 2.3836209774017334, + "learning_rate": 2.7476868904634368e-06, + "loss": 4.2802, + "step": 608 + }, + { + "epoch": 0.21657183499288762, + "grad_norm": 1.717826008796692, + "learning_rate": 2.7467215280380945e-06, + "loss": 5.4637, + "step": 609 + }, + { + "epoch": 0.21692745376955902, + "grad_norm": 0.8967366814613342, + "learning_rate": 2.745754492636465e-06, + "loss": 3.5967, + "step": 610 + }, + { + "epoch": 0.21728307254623044, + "grad_norm": 1.0470154285430908, + "learning_rate": 2.744785785556214e-06, + "loss": 4.4526, + "step": 611 + }, + { + "epoch": 0.21763869132290184, + "grad_norm": 1.1880040168762207, + "learning_rate": 2.74381540809725e-06, + "loss": 3.7388, + "step": 612 + }, + { + "epoch": 0.21799431009957326, + "grad_norm": 0.7820875644683838, + "learning_rate": 2.7428433615617225e-06, + "loss": 3.0922, + "step": 613 + }, + { + "epoch": 0.21834992887624466, + "grad_norm": 1.1832407712936401, + "learning_rate": 2.741869647254022e-06, + "loss": 3.6144, + "step": 614 + }, + { + "epoch": 0.21870554765291608, + "grad_norm": 1.5965702533721924, + "learning_rate": 2.7408942664807755e-06, + "loss": 2.9523, + "step": 615 + }, + { + "epoch": 0.21906116642958748, + "grad_norm": 1.4567738771438599, + "learning_rate": 2.7399172205508476e-06, + "loss": 4.4125, + "step": 616 + }, + { + "epoch": 0.2194167852062589, + "grad_norm": 1.1813263893127441, + "learning_rate": 2.738938510775337e-06, + "loss": 3.3708, + "step": 617 + }, + { + "epoch": 0.2197724039829303, + "grad_norm": 0.8762450814247131, + "learning_rate": 2.737958138467574e-06, + "loss": 3.8739, + "step": 618 + }, + { + "epoch": 0.2201280227596017, + "grad_norm": 1.0313694477081299, + "learning_rate": 2.736976104943121e-06, + "loss": 4.7414, + "step": 619 + }, + { + "epoch": 0.22048364153627312, + "grad_norm": 1.4330986738204956, + "learning_rate": 2.73599241151977e-06, + "loss": 3.4602, + "step": 620 + }, + { + "epoch": 0.2208392603129445, + "grad_norm": 0.7581644058227539, + "learning_rate": 2.735007059517539e-06, + "loss": 3.5996, + "step": 621 + }, + { + "epoch": 0.22119487908961594, + "grad_norm": 2.552283525466919, + "learning_rate": 2.734020050258673e-06, + "loss": 2.5479, + "step": 622 + }, + { + "epoch": 0.22155049786628733, + "grad_norm": 1.6027405261993408, + "learning_rate": 2.7330313850676396e-06, + "loss": 3.7962, + "step": 623 + }, + { + "epoch": 0.22190611664295876, + "grad_norm": 2.1549534797668457, + "learning_rate": 2.7320410652711294e-06, + "loss": 3.822, + "step": 624 + }, + { + "epoch": 0.22226173541963015, + "grad_norm": 0.7982567548751831, + "learning_rate": 2.7310490921980532e-06, + "loss": 3.1795, + "step": 625 + }, + { + "epoch": 0.22261735419630158, + "grad_norm": 0.899829626083374, + "learning_rate": 2.73005546717954e-06, + "loss": 4.16, + "step": 626 + }, + { + "epoch": 0.22297297297297297, + "grad_norm": 1.8331987857818604, + "learning_rate": 2.7290601915489358e-06, + "loss": 5.199, + "step": 627 + }, + { + "epoch": 0.2233285917496444, + "grad_norm": 1.9667061567306519, + "learning_rate": 2.7280632666418012e-06, + "loss": 4.9917, + "step": 628 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 0.7060828804969788, + "learning_rate": 2.727064693795911e-06, + "loss": 3.3883, + "step": 629 + }, + { + "epoch": 0.2240398293029872, + "grad_norm": 1.7026444673538208, + "learning_rate": 2.72606447435125e-06, + "loss": 2.7631, + "step": 630 + }, + { + "epoch": 0.2243954480796586, + "grad_norm": 1.1947602033615112, + "learning_rate": 2.7250626096500137e-06, + "loss": 2.4736, + "step": 631 + }, + { + "epoch": 0.22475106685633, + "grad_norm": 1.2797578573226929, + "learning_rate": 2.724059101036604e-06, + "loss": 4.4002, + "step": 632 + }, + { + "epoch": 0.22510668563300143, + "grad_norm": 0.839379608631134, + "learning_rate": 2.7230539498576305e-06, + "loss": 3.86, + "step": 633 + }, + { + "epoch": 0.22546230440967283, + "grad_norm": 0.9941189885139465, + "learning_rate": 2.722047157461906e-06, + "loss": 3.8836, + "step": 634 + }, + { + "epoch": 0.22581792318634425, + "grad_norm": 0.8675352334976196, + "learning_rate": 2.7210387252004457e-06, + "loss": 3.1715, + "step": 635 + }, + { + "epoch": 0.22617354196301565, + "grad_norm": 1.0379489660263062, + "learning_rate": 2.7200286544264656e-06, + "loss": 4.3559, + "step": 636 + }, + { + "epoch": 0.22652916073968707, + "grad_norm": 0.906231164932251, + "learning_rate": 2.719016946495379e-06, + "loss": 3.4762, + "step": 637 + }, + { + "epoch": 0.22688477951635846, + "grad_norm": 9.208588600158691, + "learning_rate": 2.7180036027648e-06, + "loss": 6.6573, + "step": 638 + }, + { + "epoch": 0.22724039829302986, + "grad_norm": 1.3824528455734253, + "learning_rate": 2.716988624594532e-06, + "loss": 4.3509, + "step": 639 + }, + { + "epoch": 0.22759601706970128, + "grad_norm": 0.8570268750190735, + "learning_rate": 2.715972013346576e-06, + "loss": 3.2436, + "step": 640 + }, + { + "epoch": 0.22795163584637268, + "grad_norm": 0.6935247182846069, + "learning_rate": 2.7149537703851235e-06, + "loss": 3.344, + "step": 641 + }, + { + "epoch": 0.2283072546230441, + "grad_norm": 0.7278545498847961, + "learning_rate": 2.7139338970765553e-06, + "loss": 3.5413, + "step": 642 + }, + { + "epoch": 0.2286628733997155, + "grad_norm": 0.8006585240364075, + "learning_rate": 2.712912394789439e-06, + "loss": 3.8686, + "step": 643 + }, + { + "epoch": 0.22901849217638692, + "grad_norm": 0.8582196831703186, + "learning_rate": 2.7118892648945306e-06, + "loss": 3.2208, + "step": 644 + }, + { + "epoch": 0.22937411095305832, + "grad_norm": 1.185124158859253, + "learning_rate": 2.710864508764767e-06, + "loss": 3.0794, + "step": 645 + }, + { + "epoch": 0.22972972972972974, + "grad_norm": 1.3713538646697998, + "learning_rate": 2.709838127775271e-06, + "loss": 3.373, + "step": 646 + }, + { + "epoch": 0.23008534850640114, + "grad_norm": 1.332764744758606, + "learning_rate": 2.7088101233033418e-06, + "loss": 3.4263, + "step": 647 + }, + { + "epoch": 0.23044096728307253, + "grad_norm": 1.6630849838256836, + "learning_rate": 2.70778049672846e-06, + "loss": 4.8072, + "step": 648 + }, + { + "epoch": 0.23079658605974396, + "grad_norm": 2.1606640815734863, + "learning_rate": 2.706749249432282e-06, + "loss": 3.0748, + "step": 649 + }, + { + "epoch": 0.23115220483641535, + "grad_norm": 0.8283726572990417, + "learning_rate": 2.7057163827986387e-06, + "loss": 1.1913, + "step": 650 + }, + { + "epoch": 0.23150782361308678, + "grad_norm": 1.6619617938995361, + "learning_rate": 2.7046818982135356e-06, + "loss": 4.8091, + "step": 651 + }, + { + "epoch": 0.23186344238975817, + "grad_norm": 1.005832552909851, + "learning_rate": 2.703645797065147e-06, + "loss": 4.3272, + "step": 652 + }, + { + "epoch": 0.2322190611664296, + "grad_norm": 1.199280023574829, + "learning_rate": 2.7026080807438176e-06, + "loss": 4.9665, + "step": 653 + }, + { + "epoch": 0.232574679943101, + "grad_norm": 2.922279119491577, + "learning_rate": 2.7015687506420603e-06, + "loss": 4.5084, + "step": 654 + }, + { + "epoch": 0.23293029871977242, + "grad_norm": 2.833789110183716, + "learning_rate": 2.700527808154552e-06, + "loss": 2.3148, + "step": 655 + }, + { + "epoch": 0.2332859174964438, + "grad_norm": 1.4761559963226318, + "learning_rate": 2.6994852546781344e-06, + "loss": 4.3111, + "step": 656 + }, + { + "epoch": 0.2336415362731152, + "grad_norm": 1.610658049583435, + "learning_rate": 2.6984410916118097e-06, + "loss": 4.2574, + "step": 657 + }, + { + "epoch": 0.23399715504978663, + "grad_norm": 0.7970765829086304, + "learning_rate": 2.697395320356742e-06, + "loss": 3.9668, + "step": 658 + }, + { + "epoch": 0.23435277382645803, + "grad_norm": 10.661136627197266, + "learning_rate": 2.696347942316252e-06, + "loss": 3.9532, + "step": 659 + }, + { + "epoch": 0.23470839260312945, + "grad_norm": 0.8836580514907837, + "learning_rate": 2.6952989588958166e-06, + "loss": 2.8524, + "step": 660 + }, + { + "epoch": 0.23506401137980085, + "grad_norm": 0.758213460445404, + "learning_rate": 2.6942483715030675e-06, + "loss": 3.7073, + "step": 661 + }, + { + "epoch": 0.23541963015647227, + "grad_norm": 1.4629117250442505, + "learning_rate": 2.693196181547788e-06, + "loss": 2.9451, + "step": 662 + }, + { + "epoch": 0.23577524893314367, + "grad_norm": 2.3312976360321045, + "learning_rate": 2.6921423904419126e-06, + "loss": 4.1724, + "step": 663 + }, + { + "epoch": 0.2361308677098151, + "grad_norm": 1.5716831684112549, + "learning_rate": 2.6910869995995247e-06, + "loss": 4.9346, + "step": 664 + }, + { + "epoch": 0.23648648648648649, + "grad_norm": 1.726789116859436, + "learning_rate": 2.690030010436853e-06, + "loss": 3.6274, + "step": 665 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.9751147031784058, + "learning_rate": 2.6889714243722724e-06, + "loss": 3.5183, + "step": 666 + }, + { + "epoch": 0.2371977240398293, + "grad_norm": 1.133361577987671, + "learning_rate": 2.6879112428262993e-06, + "loss": 3.0786, + "step": 667 + }, + { + "epoch": 0.2375533428165007, + "grad_norm": 1.5973809957504272, + "learning_rate": 2.686849467221593e-06, + "loss": 6.064, + "step": 668 + }, + { + "epoch": 0.23790896159317212, + "grad_norm": 0.7160729765892029, + "learning_rate": 2.6857860989829503e-06, + "loss": 3.6209, + "step": 669 + }, + { + "epoch": 0.23826458036984352, + "grad_norm": 0.899535059928894, + "learning_rate": 2.6847211395373056e-06, + "loss": 3.1777, + "step": 670 + }, + { + "epoch": 0.23862019914651494, + "grad_norm": 1.0743366479873657, + "learning_rate": 2.683654590313728e-06, + "loss": 3.1497, + "step": 671 + }, + { + "epoch": 0.23897581792318634, + "grad_norm": 0.97952800989151, + "learning_rate": 2.6825864527434213e-06, + "loss": 3.0002, + "step": 672 + }, + { + "epoch": 0.23933143669985776, + "grad_norm": 1.1182763576507568, + "learning_rate": 2.681516728259719e-06, + "loss": 2.7237, + "step": 673 + }, + { + "epoch": 0.23968705547652916, + "grad_norm": 2.2061007022857666, + "learning_rate": 2.6804454182980866e-06, + "loss": 5.4802, + "step": 674 + }, + { + "epoch": 0.24004267425320056, + "grad_norm": 1.2925832271575928, + "learning_rate": 2.6793725242961134e-06, + "loss": 3.5583, + "step": 675 + }, + { + "epoch": 0.24039829302987198, + "grad_norm": 0.9436748623847961, + "learning_rate": 2.6782980476935176e-06, + "loss": 3.4403, + "step": 676 + }, + { + "epoch": 0.24075391180654337, + "grad_norm": 1.5992109775543213, + "learning_rate": 2.6772219899321403e-06, + "loss": 3.9976, + "step": 677 + }, + { + "epoch": 0.2411095305832148, + "grad_norm": 0.8857548832893372, + "learning_rate": 2.676144352455943e-06, + "loss": 4.3068, + "step": 678 + }, + { + "epoch": 0.2414651493598862, + "grad_norm": 0.9935418367385864, + "learning_rate": 2.675065136711009e-06, + "loss": 3.8587, + "step": 679 + }, + { + "epoch": 0.24182076813655762, + "grad_norm": 1.159232497215271, + "learning_rate": 2.6739843441455373e-06, + "loss": 4.1631, + "step": 680 + }, + { + "epoch": 0.242176386913229, + "grad_norm": 0.7476597428321838, + "learning_rate": 2.672901976209845e-06, + "loss": 3.4575, + "step": 681 + }, + { + "epoch": 0.24253200568990044, + "grad_norm": 0.8902198672294617, + "learning_rate": 2.671818034356362e-06, + "loss": 3.2204, + "step": 682 + }, + { + "epoch": 0.24288762446657183, + "grad_norm": 1.1233264207839966, + "learning_rate": 2.6707325200396305e-06, + "loss": 4.2252, + "step": 683 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 2.163823366165161, + "learning_rate": 2.6696454347163024e-06, + "loss": 4.4814, + "step": 684 + }, + { + "epoch": 0.24359886201991465, + "grad_norm": 0.8959883451461792, + "learning_rate": 2.6685567798451383e-06, + "loss": 3.0798, + "step": 685 + }, + { + "epoch": 0.24395448079658605, + "grad_norm": 0.9322090744972229, + "learning_rate": 2.6674665568870045e-06, + "loss": 3.6241, + "step": 686 + }, + { + "epoch": 0.24431009957325747, + "grad_norm": 0.8230917453765869, + "learning_rate": 2.666374767304872e-06, + "loss": 3.6797, + "step": 687 + }, + { + "epoch": 0.24466571834992887, + "grad_norm": 1.6458431482315063, + "learning_rate": 2.665281412563814e-06, + "loss": 5.1813, + "step": 688 + }, + { + "epoch": 0.2450213371266003, + "grad_norm": 1.1850699186325073, + "learning_rate": 2.664186494131004e-06, + "loss": 4.2165, + "step": 689 + }, + { + "epoch": 0.2453769559032717, + "grad_norm": 1.1556246280670166, + "learning_rate": 2.663090013475713e-06, + "loss": 3.0347, + "step": 690 + }, + { + "epoch": 0.2457325746799431, + "grad_norm": 1.0865049362182617, + "learning_rate": 2.661991972069309e-06, + "loss": 3.2507, + "step": 691 + }, + { + "epoch": 0.2460881934566145, + "grad_norm": 1.1898510456085205, + "learning_rate": 2.660892371385255e-06, + "loss": 4.7697, + "step": 692 + }, + { + "epoch": 0.24644381223328593, + "grad_norm": 1.3320579528808594, + "learning_rate": 2.6597912128991045e-06, + "loss": 4.7602, + "step": 693 + }, + { + "epoch": 0.24679943100995733, + "grad_norm": 0.9560115337371826, + "learning_rate": 2.6586884980885044e-06, + "loss": 3.8053, + "step": 694 + }, + { + "epoch": 0.24715504978662872, + "grad_norm": 0.9088101983070374, + "learning_rate": 2.657584228433187e-06, + "loss": 3.2694, + "step": 695 + }, + { + "epoch": 0.24751066856330015, + "grad_norm": 1.3509434461593628, + "learning_rate": 2.656478405414973e-06, + "loss": 3.1238, + "step": 696 + }, + { + "epoch": 0.24786628733997154, + "grad_norm": 0.7131809592247009, + "learning_rate": 2.6553710305177664e-06, + "loss": 3.2152, + "step": 697 + }, + { + "epoch": 0.24822190611664297, + "grad_norm": 0.6559486985206604, + "learning_rate": 2.6542621052275548e-06, + "loss": 3.2391, + "step": 698 + }, + { + "epoch": 0.24857752489331436, + "grad_norm": 1.542479395866394, + "learning_rate": 2.653151631032405e-06, + "loss": 4.7109, + "step": 699 + }, + { + "epoch": 0.24893314366998578, + "grad_norm": 1.0903455018997192, + "learning_rate": 2.652039609422463e-06, + "loss": 5.3317, + "step": 700 + }, + { + "epoch": 0.24928876244665718, + "grad_norm": 0.9442124366760254, + "learning_rate": 2.6509260418899515e-06, + "loss": 3.4979, + "step": 701 + }, + { + "epoch": 0.2496443812233286, + "grad_norm": 0.7235139608383179, + "learning_rate": 2.649810929929168e-06, + "loss": 3.357, + "step": 702 + }, + { + "epoch": 0.25, + "grad_norm": 3.271620035171509, + "learning_rate": 2.6486942750364803e-06, + "loss": 2.0538, + "step": 703 + }, + { + "epoch": 0.25, + "eval_loss": 5.059045314788818, + "eval_runtime": 305.1883, + "eval_samples_per_second": 4.086, + "eval_steps_per_second": 4.086, + "step": 703 + }, + { + "epoch": 0.2503556187766714, + "grad_norm": 1.6224945783615112, + "learning_rate": 2.647576078710329e-06, + "loss": 3.8267, + "step": 704 + }, + { + "epoch": 0.2507112375533428, + "grad_norm": 1.0629414319992065, + "learning_rate": 2.6464563424512223e-06, + "loss": 3.5751, + "step": 705 + }, + { + "epoch": 0.25106685633001424, + "grad_norm": 2.674192190170288, + "learning_rate": 2.645335067761735e-06, + "loss": 5.6799, + "step": 706 + }, + { + "epoch": 0.25142247510668564, + "grad_norm": 2.219778537750244, + "learning_rate": 2.6442122561465062e-06, + "loss": 4.1345, + "step": 707 + }, + { + "epoch": 0.25177809388335703, + "grad_norm": 0.6873036623001099, + "learning_rate": 2.6430879091122376e-06, + "loss": 3.4541, + "step": 708 + }, + { + "epoch": 0.25213371266002843, + "grad_norm": 0.9722671508789062, + "learning_rate": 2.6419620281676903e-06, + "loss": 4.7558, + "step": 709 + }, + { + "epoch": 0.2524893314366999, + "grad_norm": 0.6708479523658752, + "learning_rate": 2.6408346148236855e-06, + "loss": 2.7371, + "step": 710 + }, + { + "epoch": 0.2528449502133713, + "grad_norm": 1.7530995607376099, + "learning_rate": 2.639705670593099e-06, + "loss": 4.8371, + "step": 711 + }, + { + "epoch": 0.2532005689900427, + "grad_norm": 1.280150055885315, + "learning_rate": 2.638575196990862e-06, + "loss": 4.2502, + "step": 712 + }, + { + "epoch": 0.25355618776671407, + "grad_norm": 0.746826708316803, + "learning_rate": 2.637443195533958e-06, + "loss": 3.3368, + "step": 713 + }, + { + "epoch": 0.25391180654338547, + "grad_norm": 1.036576747894287, + "learning_rate": 2.63630966774142e-06, + "loss": 3.4235, + "step": 714 + }, + { + "epoch": 0.2542674253200569, + "grad_norm": 0.8828042149543762, + "learning_rate": 2.6351746151343294e-06, + "loss": 3.5666, + "step": 715 + }, + { + "epoch": 0.2546230440967283, + "grad_norm": 0.7639567852020264, + "learning_rate": 2.6340380392358137e-06, + "loss": 3.834, + "step": 716 + }, + { + "epoch": 0.2549786628733997, + "grad_norm": 0.983221709728241, + "learning_rate": 2.6328999415710454e-06, + "loss": 4.1018, + "step": 717 + }, + { + "epoch": 0.2553342816500711, + "grad_norm": 0.7098877429962158, + "learning_rate": 2.631760323667238e-06, + "loss": 2.9385, + "step": 718 + }, + { + "epoch": 0.25568990042674256, + "grad_norm": 0.9221937656402588, + "learning_rate": 2.6306191870536452e-06, + "loss": 3.3947, + "step": 719 + }, + { + "epoch": 0.25604551920341395, + "grad_norm": 1.028336763381958, + "learning_rate": 2.62947653326156e-06, + "loss": 3.9735, + "step": 720 + }, + { + "epoch": 0.25640113798008535, + "grad_norm": 0.7828941345214844, + "learning_rate": 2.6283323638243084e-06, + "loss": 3.4862, + "step": 721 + }, + { + "epoch": 0.25675675675675674, + "grad_norm": 1.3754442930221558, + "learning_rate": 2.6271866802772525e-06, + "loss": 4.9882, + "step": 722 + }, + { + "epoch": 0.25711237553342814, + "grad_norm": 3.3400089740753174, + "learning_rate": 2.6260394841577857e-06, + "loss": 4.6909, + "step": 723 + }, + { + "epoch": 0.2574679943100996, + "grad_norm": 0.8125085234642029, + "learning_rate": 2.624890777005332e-06, + "loss": 3.3342, + "step": 724 + }, + { + "epoch": 0.257823613086771, + "grad_norm": 0.8807397484779358, + "learning_rate": 2.6237405603613414e-06, + "loss": 4.3581, + "step": 725 + }, + { + "epoch": 0.2581792318634424, + "grad_norm": 1.0933582782745361, + "learning_rate": 2.62258883576929e-06, + "loss": 4.8021, + "step": 726 + }, + { + "epoch": 0.2585348506401138, + "grad_norm": 1.414589762687683, + "learning_rate": 2.6214356047746785e-06, + "loss": 3.3678, + "step": 727 + }, + { + "epoch": 0.25889046941678523, + "grad_norm": 0.6845437288284302, + "learning_rate": 2.620280868925027e-06, + "loss": 3.0753, + "step": 728 + }, + { + "epoch": 0.2592460881934566, + "grad_norm": 2.197880268096924, + "learning_rate": 2.619124629769877e-06, + "loss": 4.6836, + "step": 729 + }, + { + "epoch": 0.259601706970128, + "grad_norm": 1.04655122756958, + "learning_rate": 2.6179668888607866e-06, + "loss": 4.2242, + "step": 730 + }, + { + "epoch": 0.2599573257467994, + "grad_norm": 0.9438437223434448, + "learning_rate": 2.616807647751328e-06, + "loss": 4.3986, + "step": 731 + }, + { + "epoch": 0.2603129445234708, + "grad_norm": 0.9931744933128357, + "learning_rate": 2.615646907997088e-06, + "loss": 3.3108, + "step": 732 + }, + { + "epoch": 0.26066856330014226, + "grad_norm": 1.3172379732131958, + "learning_rate": 2.614484671155664e-06, + "loss": 3.7888, + "step": 733 + }, + { + "epoch": 0.26102418207681366, + "grad_norm": 1.2226686477661133, + "learning_rate": 2.6133209387866628e-06, + "loss": 4.2841, + "step": 734 + }, + { + "epoch": 0.26137980085348506, + "grad_norm": 1.1598528623580933, + "learning_rate": 2.612155712451696e-06, + "loss": 4.542, + "step": 735 + }, + { + "epoch": 0.26173541963015645, + "grad_norm": 1.0974695682525635, + "learning_rate": 2.6109889937143828e-06, + "loss": 2.947, + "step": 736 + }, + { + "epoch": 0.2620910384068279, + "grad_norm": 1.5603867769241333, + "learning_rate": 2.609820784140343e-06, + "loss": 3.8496, + "step": 737 + }, + { + "epoch": 0.2624466571834993, + "grad_norm": 1.0050320625305176, + "learning_rate": 2.6086510852971985e-06, + "loss": 3.458, + "step": 738 + }, + { + "epoch": 0.2628022759601707, + "grad_norm": 0.8407775163650513, + "learning_rate": 2.607479898754567e-06, + "loss": 2.9487, + "step": 739 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 1.000401496887207, + "learning_rate": 2.6063072260840664e-06, + "loss": 2.8696, + "step": 740 + }, + { + "epoch": 0.2635135135135135, + "grad_norm": 0.7847100496292114, + "learning_rate": 2.605133068859306e-06, + "loss": 2.9961, + "step": 741 + }, + { + "epoch": 0.26386913229018494, + "grad_norm": 1.0771046876907349, + "learning_rate": 2.603957428655887e-06, + "loss": 3.6045, + "step": 742 + }, + { + "epoch": 0.26422475106685633, + "grad_norm": 0.94215327501297, + "learning_rate": 2.602780307051403e-06, + "loss": 3.6559, + "step": 743 + }, + { + "epoch": 0.26458036984352773, + "grad_norm": 0.9935820698738098, + "learning_rate": 2.6016017056254342e-06, + "loss": 3.1574, + "step": 744 + }, + { + "epoch": 0.2649359886201991, + "grad_norm": 0.8710533976554871, + "learning_rate": 2.6004216259595453e-06, + "loss": 4.2447, + "step": 745 + }, + { + "epoch": 0.2652916073968706, + "grad_norm": 0.7572230100631714, + "learning_rate": 2.5992400696372864e-06, + "loss": 3.5638, + "step": 746 + }, + { + "epoch": 0.265647226173542, + "grad_norm": 1.3242597579956055, + "learning_rate": 2.598057038244189e-06, + "loss": 3.643, + "step": 747 + }, + { + "epoch": 0.26600284495021337, + "grad_norm": 0.8385727405548096, + "learning_rate": 2.596872533367763e-06, + "loss": 3.616, + "step": 748 + }, + { + "epoch": 0.26635846372688476, + "grad_norm": 1.0160984992980957, + "learning_rate": 2.5956865565974965e-06, + "loss": 4.4077, + "step": 749 + }, + { + "epoch": 0.26671408250355616, + "grad_norm": 0.6757259368896484, + "learning_rate": 2.5944991095248516e-06, + "loss": 3.4093, + "step": 750 + }, + { + "epoch": 0.2670697012802276, + "grad_norm": 2.0506560802459717, + "learning_rate": 2.5933101937432653e-06, + "loss": 4.7066, + "step": 751 + }, + { + "epoch": 0.267425320056899, + "grad_norm": 1.2473889589309692, + "learning_rate": 2.5921198108481436e-06, + "loss": 3.5231, + "step": 752 + }, + { + "epoch": 0.2677809388335704, + "grad_norm": 1.3727161884307861, + "learning_rate": 2.5909279624368624e-06, + "loss": 3.1451, + "step": 753 + }, + { + "epoch": 0.2681365576102418, + "grad_norm": 1.5640524625778198, + "learning_rate": 2.5897346501087633e-06, + "loss": 4.6035, + "step": 754 + }, + { + "epoch": 0.26849217638691325, + "grad_norm": 0.995383083820343, + "learning_rate": 2.5885398754651526e-06, + "loss": 2.8048, + "step": 755 + }, + { + "epoch": 0.26884779516358465, + "grad_norm": 1.0866503715515137, + "learning_rate": 2.5873436401092995e-06, + "loss": 3.7654, + "step": 756 + }, + { + "epoch": 0.26920341394025604, + "grad_norm": 1.0672723054885864, + "learning_rate": 2.586145945646433e-06, + "loss": 3.5701, + "step": 757 + }, + { + "epoch": 0.26955903271692744, + "grad_norm": 1.9060710668563843, + "learning_rate": 2.584946793683739e-06, + "loss": 4.0838, + "step": 758 + }, + { + "epoch": 0.2699146514935989, + "grad_norm": 0.7644263505935669, + "learning_rate": 2.5837461858303613e-06, + "loss": 2.7869, + "step": 759 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.8247077465057373, + "learning_rate": 2.582544123697395e-06, + "loss": 5.2272, + "step": 760 + }, + { + "epoch": 0.2706258890469417, + "grad_norm": 0.6990036964416504, + "learning_rate": 2.5813406088978893e-06, + "loss": 3.7641, + "step": 761 + }, + { + "epoch": 0.2709815078236131, + "grad_norm": 1.7287918329238892, + "learning_rate": 2.580135643046841e-06, + "loss": 3.7941, + "step": 762 + }, + { + "epoch": 0.2713371266002845, + "grad_norm": 3.006856918334961, + "learning_rate": 2.5789292277611936e-06, + "loss": 4.5077, + "step": 763 + }, + { + "epoch": 0.2716927453769559, + "grad_norm": 0.83155757188797, + "learning_rate": 2.577721364659837e-06, + "loss": 3.8642, + "step": 764 + }, + { + "epoch": 0.2720483641536273, + "grad_norm": 0.9824575781822205, + "learning_rate": 2.5765120553636033e-06, + "loss": 3.5662, + "step": 765 + }, + { + "epoch": 0.2724039829302987, + "grad_norm": 0.8452388048171997, + "learning_rate": 2.575301301495265e-06, + "loss": 3.7735, + "step": 766 + }, + { + "epoch": 0.2727596017069701, + "grad_norm": 1.499502420425415, + "learning_rate": 2.574089104679534e-06, + "loss": 3.6313, + "step": 767 + }, + { + "epoch": 0.27311522048364156, + "grad_norm": 1.102519154548645, + "learning_rate": 2.572875466543057e-06, + "loss": 4.0843, + "step": 768 + }, + { + "epoch": 0.27347083926031296, + "grad_norm": 0.9981449246406555, + "learning_rate": 2.571660388714417e-06, + "loss": 3.7752, + "step": 769 + }, + { + "epoch": 0.27382645803698435, + "grad_norm": 1.1938055753707886, + "learning_rate": 2.5704438728241265e-06, + "loss": 2.2039, + "step": 770 + }, + { + "epoch": 0.27418207681365575, + "grad_norm": 0.6633055806159973, + "learning_rate": 2.5692259205046283e-06, + "loss": 3.2921, + "step": 771 + }, + { + "epoch": 0.27453769559032715, + "grad_norm": 0.8210715055465698, + "learning_rate": 2.5680065333902947e-06, + "loss": 3.4993, + "step": 772 + }, + { + "epoch": 0.2748933143669986, + "grad_norm": 1.0507222414016724, + "learning_rate": 2.566785713117421e-06, + "loss": 4.2307, + "step": 773 + }, + { + "epoch": 0.27524893314367, + "grad_norm": 0.895935595035553, + "learning_rate": 2.5655634613242272e-06, + "loss": 3.3719, + "step": 774 + }, + { + "epoch": 0.2756045519203414, + "grad_norm": 0.7829636335372925, + "learning_rate": 2.564339779650853e-06, + "loss": 3.1787, + "step": 775 + }, + { + "epoch": 0.2759601706970128, + "grad_norm": 0.9634225964546204, + "learning_rate": 2.5631146697393584e-06, + "loss": 3.2513, + "step": 776 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 0.7909715175628662, + "learning_rate": 2.5618881332337176e-06, + "loss": 3.2298, + "step": 777 + }, + { + "epoch": 0.27667140825035563, + "grad_norm": 1.0231810808181763, + "learning_rate": 2.5606601717798212e-06, + "loss": 2.7432, + "step": 778 + }, + { + "epoch": 0.27702702702702703, + "grad_norm": 0.7850843071937561, + "learning_rate": 2.5594307870254724e-06, + "loss": 3.3455, + "step": 779 + }, + { + "epoch": 0.2773826458036984, + "grad_norm": 1.2300156354904175, + "learning_rate": 2.558199980620382e-06, + "loss": 4.9093, + "step": 780 + }, + { + "epoch": 0.2777382645803698, + "grad_norm": 3.159804105758667, + "learning_rate": 2.55696775421617e-06, + "loss": 5.7643, + "step": 781 + }, + { + "epoch": 0.27809388335704127, + "grad_norm": 1.6208535432815552, + "learning_rate": 2.5557341094663623e-06, + "loss": 4.5729, + "step": 782 + }, + { + "epoch": 0.27844950213371267, + "grad_norm": 0.8429439663887024, + "learning_rate": 2.5544990480263866e-06, + "loss": 3.1906, + "step": 783 + }, + { + "epoch": 0.27880512091038406, + "grad_norm": 1.417028546333313, + "learning_rate": 2.553262571553573e-06, + "loss": 3.778, + "step": 784 + }, + { + "epoch": 0.27916073968705546, + "grad_norm": 2.1450343132019043, + "learning_rate": 2.55202468170715e-06, + "loss": 4.3634, + "step": 785 + }, + { + "epoch": 0.2795163584637269, + "grad_norm": 2.031125068664551, + "learning_rate": 2.5507853801482423e-06, + "loss": 3.8631, + "step": 786 + }, + { + "epoch": 0.2798719772403983, + "grad_norm": 1.208133578300476, + "learning_rate": 2.54954466853987e-06, + "loss": 3.2338, + "step": 787 + }, + { + "epoch": 0.2802275960170697, + "grad_norm": 0.8182170391082764, + "learning_rate": 2.5483025485469437e-06, + "loss": 3.4599, + "step": 788 + }, + { + "epoch": 0.2805832147937411, + "grad_norm": 2.1296184062957764, + "learning_rate": 2.5470590218362655e-06, + "loss": 3.2277, + "step": 789 + }, + { + "epoch": 0.2809388335704125, + "grad_norm": 1.0333919525146484, + "learning_rate": 2.545814090076525e-06, + "loss": 3.3946, + "step": 790 + }, + { + "epoch": 0.28129445234708395, + "grad_norm": 1.0602388381958008, + "learning_rate": 2.5445677549382955e-06, + "loss": 4.5916, + "step": 791 + }, + { + "epoch": 0.28165007112375534, + "grad_norm": 1.4563558101654053, + "learning_rate": 2.543320018094036e-06, + "loss": 4.829, + "step": 792 + }, + { + "epoch": 0.28200568990042674, + "grad_norm": 1.090834617614746, + "learning_rate": 2.5420708812180846e-06, + "loss": 4.0221, + "step": 793 + }, + { + "epoch": 0.28236130867709813, + "grad_norm": 1.364560604095459, + "learning_rate": 2.5408203459866586e-06, + "loss": 3.5019, + "step": 794 + }, + { + "epoch": 0.2827169274537696, + "grad_norm": 1.2062021493911743, + "learning_rate": 2.5395684140778527e-06, + "loss": 3.0705, + "step": 795 + }, + { + "epoch": 0.283072546230441, + "grad_norm": 1.5442376136779785, + "learning_rate": 2.5383150871716344e-06, + "loss": 4.1456, + "step": 796 + }, + { + "epoch": 0.2834281650071124, + "grad_norm": 2.1239452362060547, + "learning_rate": 2.537060366949844e-06, + "loss": 4.7807, + "step": 797 + }, + { + "epoch": 0.28378378378378377, + "grad_norm": 0.814894437789917, + "learning_rate": 2.5358042550961906e-06, + "loss": 3.1939, + "step": 798 + }, + { + "epoch": 0.28413940256045517, + "grad_norm": 0.9808216094970703, + "learning_rate": 2.5345467532962524e-06, + "loss": 3.6754, + "step": 799 + }, + { + "epoch": 0.2844950213371266, + "grad_norm": 1.064163327217102, + "learning_rate": 2.5332878632374713e-06, + "loss": 4.0974, + "step": 800 + }, + { + "epoch": 0.284850640113798, + "grad_norm": 0.9691075682640076, + "learning_rate": 2.532027586609152e-06, + "loss": 1.4263, + "step": 801 + }, + { + "epoch": 0.2852062588904694, + "grad_norm": 1.8264943361282349, + "learning_rate": 2.530765925102461e-06, + "loss": 3.9761, + "step": 802 + }, + { + "epoch": 0.2855618776671408, + "grad_norm": 1.6267762184143066, + "learning_rate": 2.529502880410422e-06, + "loss": 4.6224, + "step": 803 + }, + { + "epoch": 0.28591749644381226, + "grad_norm": 1.5282135009765625, + "learning_rate": 2.528238454227917e-06, + "loss": 3.9707, + "step": 804 + }, + { + "epoch": 0.28627311522048365, + "grad_norm": 1.1919347047805786, + "learning_rate": 2.5269726482516776e-06, + "loss": 4.2519, + "step": 805 + }, + { + "epoch": 0.28662873399715505, + "grad_norm": 1.3646609783172607, + "learning_rate": 2.525705464180291e-06, + "loss": 3.5784, + "step": 806 + }, + { + "epoch": 0.28698435277382645, + "grad_norm": 1.0018821954727173, + "learning_rate": 2.5244369037141924e-06, + "loss": 3.928, + "step": 807 + }, + { + "epoch": 0.28733997155049784, + "grad_norm": 0.8082791566848755, + "learning_rate": 2.5231669685556636e-06, + "loss": 3.773, + "step": 808 + }, + { + "epoch": 0.2876955903271693, + "grad_norm": 1.3152085542678833, + "learning_rate": 2.5218956604088305e-06, + "loss": 4.7516, + "step": 809 + }, + { + "epoch": 0.2880512091038407, + "grad_norm": 7.593021869659424, + "learning_rate": 2.520622980979663e-06, + "loss": 6.2464, + "step": 810 + }, + { + "epoch": 0.2884068278805121, + "grad_norm": 1.0364235639572144, + "learning_rate": 2.5193489319759703e-06, + "loss": 2.9074, + "step": 811 + }, + { + "epoch": 0.2887624466571835, + "grad_norm": 2.093153715133667, + "learning_rate": 2.518073515107399e-06, + "loss": 5.7398, + "step": 812 + }, + { + "epoch": 0.28911806543385493, + "grad_norm": 0.9259242415428162, + "learning_rate": 2.5167967320854315e-06, + "loss": 4.0485, + "step": 813 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 2.289004325866699, + "learning_rate": 2.5155185846233844e-06, + "loss": 5.8169, + "step": 814 + }, + { + "epoch": 0.2898293029871977, + "grad_norm": 0.7427979707717896, + "learning_rate": 2.514239074436404e-06, + "loss": 3.1485, + "step": 815 + }, + { + "epoch": 0.2901849217638691, + "grad_norm": 1.0738167762756348, + "learning_rate": 2.5129582032414662e-06, + "loss": 2.9432, + "step": 816 + }, + { + "epoch": 0.2905405405405405, + "grad_norm": 0.9063039422035217, + "learning_rate": 2.5116759727573717e-06, + "loss": 2.6597, + "step": 817 + }, + { + "epoch": 0.29089615931721197, + "grad_norm": 1.7403827905654907, + "learning_rate": 2.510392384704747e-06, + "loss": 4.855, + "step": 818 + }, + { + "epoch": 0.29125177809388336, + "grad_norm": 1.0226014852523804, + "learning_rate": 2.5091074408060397e-06, + "loss": 3.0772, + "step": 819 + }, + { + "epoch": 0.29160739687055476, + "grad_norm": 1.0368446111679077, + "learning_rate": 2.507821142785516e-06, + "loss": 3.3664, + "step": 820 + }, + { + "epoch": 0.29196301564722615, + "grad_norm": 0.757840096950531, + "learning_rate": 2.5065334923692606e-06, + "loss": 2.9117, + "step": 821 + }, + { + "epoch": 0.2923186344238976, + "grad_norm": 0.8658721446990967, + "learning_rate": 2.505244491285172e-06, + "loss": 2.5356, + "step": 822 + }, + { + "epoch": 0.292674253200569, + "grad_norm": 0.739858865737915, + "learning_rate": 2.503954141262962e-06, + "loss": 3.4591, + "step": 823 + }, + { + "epoch": 0.2930298719772404, + "grad_norm": 1.9910534620285034, + "learning_rate": 2.5026624440341514e-06, + "loss": 3.6428, + "step": 824 + }, + { + "epoch": 0.2933854907539118, + "grad_norm": 0.9817492961883545, + "learning_rate": 2.5013694013320693e-06, + "loss": 3.0795, + "step": 825 + }, + { + "epoch": 0.2937411095305832, + "grad_norm": 1.088672399520874, + "learning_rate": 2.50007501489185e-06, + "loss": 3.5438, + "step": 826 + }, + { + "epoch": 0.29409672830725464, + "grad_norm": 0.9922055602073669, + "learning_rate": 2.498779286450433e-06, + "loss": 3.1859, + "step": 827 + }, + { + "epoch": 0.29445234708392604, + "grad_norm": 0.9291802644729614, + "learning_rate": 2.4974822177465558e-06, + "loss": 2.7328, + "step": 828 + }, + { + "epoch": 0.29480796586059743, + "grad_norm": 1.1373990774154663, + "learning_rate": 2.496183810520755e-06, + "loss": 3.5107, + "step": 829 + }, + { + "epoch": 0.2951635846372688, + "grad_norm": 0.832957923412323, + "learning_rate": 2.4948840665153654e-06, + "loss": 3.129, + "step": 830 + }, + { + "epoch": 0.2955192034139403, + "grad_norm": 0.815495491027832, + "learning_rate": 2.4935829874745133e-06, + "loss": 2.9769, + "step": 831 + }, + { + "epoch": 0.2958748221906117, + "grad_norm": 3.002371311187744, + "learning_rate": 2.4922805751441174e-06, + "loss": 3.3692, + "step": 832 + }, + { + "epoch": 0.29623044096728307, + "grad_norm": 2.7487213611602783, + "learning_rate": 2.4909768312718856e-06, + "loss": 5.589, + "step": 833 + }, + { + "epoch": 0.29658605974395447, + "grad_norm": 2.3894166946411133, + "learning_rate": 2.4896717576073125e-06, + "loss": 4.3007, + "step": 834 + }, + { + "epoch": 0.29694167852062586, + "grad_norm": 1.1454797983169556, + "learning_rate": 2.4883653559016776e-06, + "loss": 3.7947, + "step": 835 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 0.853417694568634, + "learning_rate": 2.4870576279080413e-06, + "loss": 3.0958, + "step": 836 + }, + { + "epoch": 0.2976529160739687, + "grad_norm": 1.0395786762237549, + "learning_rate": 2.485748575381245e-06, + "loss": 4.1887, + "step": 837 + }, + { + "epoch": 0.2980085348506401, + "grad_norm": 0.9292153716087341, + "learning_rate": 2.484438200077907e-06, + "loss": 3.4544, + "step": 838 + }, + { + "epoch": 0.2983641536273115, + "grad_norm": 0.8966376185417175, + "learning_rate": 2.48312650375642e-06, + "loss": 2.5651, + "step": 839 + }, + { + "epoch": 0.29871977240398295, + "grad_norm": 1.865938663482666, + "learning_rate": 2.4818134881769506e-06, + "loss": 2.7104, + "step": 840 + }, + { + "epoch": 0.29907539118065435, + "grad_norm": 1.317899465560913, + "learning_rate": 2.480499155101435e-06, + "loss": 4.0851, + "step": 841 + }, + { + "epoch": 0.29943100995732574, + "grad_norm": 0.8014160394668579, + "learning_rate": 2.4791835062935774e-06, + "loss": 3.463, + "step": 842 + }, + { + "epoch": 0.29978662873399714, + "grad_norm": 1.084154486656189, + "learning_rate": 2.477866543518848e-06, + "loss": 3.7647, + "step": 843 + }, + { + "epoch": 0.30014224751066854, + "grad_norm": 1.2534087896347046, + "learning_rate": 2.476548268544479e-06, + "loss": 3.8597, + "step": 844 + }, + { + "epoch": 0.30049786628734, + "grad_norm": 0.7010981440544128, + "learning_rate": 2.475228683139465e-06, + "loss": 3.6258, + "step": 845 + }, + { + "epoch": 0.3008534850640114, + "grad_norm": 0.9970294237136841, + "learning_rate": 2.473907789074558e-06, + "loss": 3.9296, + "step": 846 + }, + { + "epoch": 0.3012091038406828, + "grad_norm": 2.688616991043091, + "learning_rate": 2.4725855881222667e-06, + "loss": 5.3265, + "step": 847 + }, + { + "epoch": 0.3015647226173542, + "grad_norm": 1.2317426204681396, + "learning_rate": 2.471262082056853e-06, + "loss": 2.8201, + "step": 848 + }, + { + "epoch": 0.3019203413940256, + "grad_norm": 0.969599187374115, + "learning_rate": 2.469937272654331e-06, + "loss": 2.9367, + "step": 849 + }, + { + "epoch": 0.302275960170697, + "grad_norm": 0.8620771765708923, + "learning_rate": 2.4686111616924627e-06, + "loss": 3.6555, + "step": 850 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 1.3380078077316284, + "learning_rate": 2.467283750950757e-06, + "loss": 3.6994, + "step": 851 + }, + { + "epoch": 0.3029871977240398, + "grad_norm": 1.7946288585662842, + "learning_rate": 2.465955042210467e-06, + "loss": 2.4515, + "step": 852 + }, + { + "epoch": 0.3033428165007112, + "grad_norm": 2.287774085998535, + "learning_rate": 2.4646250372545878e-06, + "loss": 4.7345, + "step": 853 + }, + { + "epoch": 0.30369843527738266, + "grad_norm": 0.6448686718940735, + "learning_rate": 2.4632937378678545e-06, + "loss": 2.9043, + "step": 854 + }, + { + "epoch": 0.30405405405405406, + "grad_norm": 0.8745740056037903, + "learning_rate": 2.4619611458367376e-06, + "loss": 3.7745, + "step": 855 + }, + { + "epoch": 0.30440967283072545, + "grad_norm": 1.0605822801589966, + "learning_rate": 2.4606272629494433e-06, + "loss": 4.157, + "step": 856 + }, + { + "epoch": 0.30476529160739685, + "grad_norm": 0.7116619944572449, + "learning_rate": 2.4592920909959094e-06, + "loss": 3.3159, + "step": 857 + }, + { + "epoch": 0.3051209103840683, + "grad_norm": 1.6184613704681396, + "learning_rate": 2.457955631767804e-06, + "loss": 4.2369, + "step": 858 + }, + { + "epoch": 0.3054765291607397, + "grad_norm": 0.981221616268158, + "learning_rate": 2.4566178870585237e-06, + "loss": 3.1595, + "step": 859 + }, + { + "epoch": 0.3058321479374111, + "grad_norm": 0.998651385307312, + "learning_rate": 2.455278858663187e-06, + "loss": 3.483, + "step": 860 + }, + { + "epoch": 0.3061877667140825, + "grad_norm": 1.0472383499145508, + "learning_rate": 2.453938548378638e-06, + "loss": 2.8622, + "step": 861 + }, + { + "epoch": 0.30654338549075394, + "grad_norm": 1.2357455492019653, + "learning_rate": 2.452596958003439e-06, + "loss": 2.9773, + "step": 862 + }, + { + "epoch": 0.30689900426742533, + "grad_norm": 2.4977025985717773, + "learning_rate": 2.451254089337872e-06, + "loss": 2.5184, + "step": 863 + }, + { + "epoch": 0.30725462304409673, + "grad_norm": 1.0408294200897217, + "learning_rate": 2.4499099441839316e-06, + "loss": 3.1285, + "step": 864 + }, + { + "epoch": 0.3076102418207681, + "grad_norm": 1.2815154790878296, + "learning_rate": 2.4485645243453283e-06, + "loss": 4.5722, + "step": 865 + }, + { + "epoch": 0.3079658605974395, + "grad_norm": 1.1620107889175415, + "learning_rate": 2.4472178316274808e-06, + "loss": 3.4076, + "step": 866 + }, + { + "epoch": 0.308321479374111, + "grad_norm": 0.8315699696540833, + "learning_rate": 2.445869867837517e-06, + "loss": 4.1655, + "step": 867 + }, + { + "epoch": 0.30867709815078237, + "grad_norm": 1.4980186223983765, + "learning_rate": 2.4445206347842714e-06, + "loss": 3.3662, + "step": 868 + }, + { + "epoch": 0.30903271692745377, + "grad_norm": 4.333240032196045, + "learning_rate": 2.4431701342782783e-06, + "loss": 4.0278, + "step": 869 + }, + { + "epoch": 0.30938833570412516, + "grad_norm": 1.6848045587539673, + "learning_rate": 2.441818368131777e-06, + "loss": 3.55, + "step": 870 + }, + { + "epoch": 0.3097439544807966, + "grad_norm": 2.442152261734009, + "learning_rate": 2.440465338158702e-06, + "loss": 6.0433, + "step": 871 + }, + { + "epoch": 0.310099573257468, + "grad_norm": 1.209162950515747, + "learning_rate": 2.4391110461746854e-06, + "loss": 3.9391, + "step": 872 + }, + { + "epoch": 0.3104551920341394, + "grad_norm": 1.0875166654586792, + "learning_rate": 2.437755493997053e-06, + "loss": 3.7035, + "step": 873 + }, + { + "epoch": 0.3108108108108108, + "grad_norm": 1.4684008359909058, + "learning_rate": 2.43639868344482e-06, + "loss": 3.1289, + "step": 874 + }, + { + "epoch": 0.3111664295874822, + "grad_norm": 0.7221288681030273, + "learning_rate": 2.4350406163386916e-06, + "loss": 3.1457, + "step": 875 + }, + { + "epoch": 0.31152204836415365, + "grad_norm": 3.2170395851135254, + "learning_rate": 2.4336812945010587e-06, + "loss": 4.9899, + "step": 876 + }, + { + "epoch": 0.31187766714082504, + "grad_norm": 2.0230603218078613, + "learning_rate": 2.4323207197559963e-06, + "loss": 3.6904, + "step": 877 + }, + { + "epoch": 0.31223328591749644, + "grad_norm": 1.0182172060012817, + "learning_rate": 2.4309588939292595e-06, + "loss": 2.9135, + "step": 878 + }, + { + "epoch": 0.31258890469416784, + "grad_norm": 1.3056840896606445, + "learning_rate": 2.429595818848284e-06, + "loss": 3.7108, + "step": 879 + }, + { + "epoch": 0.3129445234708393, + "grad_norm": 0.8187604546546936, + "learning_rate": 2.428231496342181e-06, + "loss": 3.938, + "step": 880 + }, + { + "epoch": 0.3133001422475107, + "grad_norm": 0.7346046566963196, + "learning_rate": 2.4268659282417352e-06, + "loss": 2.7231, + "step": 881 + }, + { + "epoch": 0.3136557610241821, + "grad_norm": 0.6974596977233887, + "learning_rate": 2.4254991163794035e-06, + "loss": 2.9713, + "step": 882 + }, + { + "epoch": 0.3140113798008535, + "grad_norm": 0.832760751247406, + "learning_rate": 2.424131062589311e-06, + "loss": 3.0483, + "step": 883 + }, + { + "epoch": 0.31436699857752487, + "grad_norm": 0.7894182205200195, + "learning_rate": 2.42276176870725e-06, + "loss": 3.0304, + "step": 884 + }, + { + "epoch": 0.3147226173541963, + "grad_norm": 0.833278477191925, + "learning_rate": 2.421391236570677e-06, + "loss": 3.4759, + "step": 885 + }, + { + "epoch": 0.3150782361308677, + "grad_norm": 1.9163501262664795, + "learning_rate": 2.4200194680187097e-06, + "loss": 1.9163, + "step": 886 + }, + { + "epoch": 0.3154338549075391, + "grad_norm": 2.0980982780456543, + "learning_rate": 2.4186464648921248e-06, + "loss": 2.7463, + "step": 887 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.8153195381164551, + "learning_rate": 2.417272229033356e-06, + "loss": 3.6677, + "step": 888 + }, + { + "epoch": 0.31614509246088196, + "grad_norm": 0.9330801963806152, + "learning_rate": 2.415896762286491e-06, + "loss": 2.8687, + "step": 889 + }, + { + "epoch": 0.31650071123755336, + "grad_norm": 1.2200357913970947, + "learning_rate": 2.41452006649727e-06, + "loss": 4.1817, + "step": 890 + }, + { + "epoch": 0.31685633001422475, + "grad_norm": 3.0129482746124268, + "learning_rate": 2.4131421435130812e-06, + "loss": 5.131, + "step": 891 + }, + { + "epoch": 0.31721194879089615, + "grad_norm": 0.7623541355133057, + "learning_rate": 2.4117629951829604e-06, + "loss": 3.2028, + "step": 892 + }, + { + "epoch": 0.31756756756756754, + "grad_norm": 5.975355625152588, + "learning_rate": 2.4103826233575872e-06, + "loss": 5.0792, + "step": 893 + }, + { + "epoch": 0.317923186344239, + "grad_norm": 1.192233681678772, + "learning_rate": 2.4090010298892838e-06, + "loss": 3.9848, + "step": 894 + }, + { + "epoch": 0.3182788051209104, + "grad_norm": 1.4108399152755737, + "learning_rate": 2.4076182166320107e-06, + "loss": 3.8628, + "step": 895 + }, + { + "epoch": 0.3186344238975818, + "grad_norm": 0.8813436031341553, + "learning_rate": 2.4062341854413666e-06, + "loss": 3.6155, + "step": 896 + }, + { + "epoch": 0.3189900426742532, + "grad_norm": 0.8536984324455261, + "learning_rate": 2.404848938174583e-06, + "loss": 3.236, + "step": 897 + }, + { + "epoch": 0.31934566145092463, + "grad_norm": 0.9356421828269958, + "learning_rate": 2.4034624766905235e-06, + "loss": 3.4243, + "step": 898 + }, + { + "epoch": 0.31970128022759603, + "grad_norm": 0.7950219511985779, + "learning_rate": 2.4020748028496826e-06, + "loss": 3.1405, + "step": 899 + }, + { + "epoch": 0.3200568990042674, + "grad_norm": 0.7823470830917358, + "learning_rate": 2.40068591851418e-06, + "loss": 3.2498, + "step": 900 + }, + { + "epoch": 0.3204125177809388, + "grad_norm": 0.9287840723991394, + "learning_rate": 2.3992958255477606e-06, + "loss": 3.4983, + "step": 901 + }, + { + "epoch": 0.3207681365576102, + "grad_norm": 1.0321606397628784, + "learning_rate": 2.39790452581579e-06, + "loss": 3.7234, + "step": 902 + }, + { + "epoch": 0.32112375533428167, + "grad_norm": 0.9713765978813171, + "learning_rate": 2.396512021185255e-06, + "loss": 3.5848, + "step": 903 + }, + { + "epoch": 0.32147937411095306, + "grad_norm": 0.8856178522109985, + "learning_rate": 2.395118313524758e-06, + "loss": 4.1305, + "step": 904 + }, + { + "epoch": 0.32183499288762446, + "grad_norm": 1.426372766494751, + "learning_rate": 2.3937234047045165e-06, + "loss": 3.4193, + "step": 905 + }, + { + "epoch": 0.32219061166429586, + "grad_norm": 0.821883499622345, + "learning_rate": 2.392327296596359e-06, + "loss": 3.1193, + "step": 906 + }, + { + "epoch": 0.3225462304409673, + "grad_norm": 1.083262324333191, + "learning_rate": 2.3909299910737235e-06, + "loss": 3.3216, + "step": 907 + }, + { + "epoch": 0.3229018492176387, + "grad_norm": 2.0723891258239746, + "learning_rate": 2.3895314900116554e-06, + "loss": 3.6992, + "step": 908 + }, + { + "epoch": 0.3232574679943101, + "grad_norm": 0.8153877854347229, + "learning_rate": 2.3881317952868035e-06, + "loss": 3.7079, + "step": 909 + }, + { + "epoch": 0.3236130867709815, + "grad_norm": 1.3863168954849243, + "learning_rate": 2.3867309087774194e-06, + "loss": 4.4432, + "step": 910 + }, + { + "epoch": 0.3239687055476529, + "grad_norm": 1.0526527166366577, + "learning_rate": 2.3853288323633532e-06, + "loss": 3.5999, + "step": 911 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 0.9954222440719604, + "learning_rate": 2.3839255679260525e-06, + "loss": 3.7718, + "step": 912 + }, + { + "epoch": 0.32467994310099574, + "grad_norm": 1.2216415405273438, + "learning_rate": 2.382521117348558e-06, + "loss": 3.2399, + "step": 913 + }, + { + "epoch": 0.32503556187766713, + "grad_norm": 1.6081119775772095, + "learning_rate": 2.3811154825155034e-06, + "loss": 2.5966, + "step": 914 + }, + { + "epoch": 0.32539118065433853, + "grad_norm": 1.1468662023544312, + "learning_rate": 2.37970866531311e-06, + "loss": 3.2134, + "step": 915 + }, + { + "epoch": 0.32574679943101, + "grad_norm": 1.5943692922592163, + "learning_rate": 2.3783006676291864e-06, + "loss": 2.4085, + "step": 916 + }, + { + "epoch": 0.3261024182076814, + "grad_norm": 1.2626765966415405, + "learning_rate": 2.376891491353126e-06, + "loss": 2.5421, + "step": 917 + }, + { + "epoch": 0.3264580369843528, + "grad_norm": 0.9772995114326477, + "learning_rate": 2.3754811383759043e-06, + "loss": 3.2706, + "step": 918 + }, + { + "epoch": 0.32681365576102417, + "grad_norm": 1.224816083908081, + "learning_rate": 2.3740696105900727e-06, + "loss": 2.3731, + "step": 919 + }, + { + "epoch": 0.32716927453769556, + "grad_norm": 0.6690019369125366, + "learning_rate": 2.372656909889762e-06, + "loss": 3.1023, + "step": 920 + }, + { + "epoch": 0.327524893314367, + "grad_norm": 1.246256709098816, + "learning_rate": 2.371243038170676e-06, + "loss": 4.2785, + "step": 921 + }, + { + "epoch": 0.3278805120910384, + "grad_norm": 1.7149187326431274, + "learning_rate": 2.36982799733009e-06, + "loss": 2.7003, + "step": 922 + }, + { + "epoch": 0.3282361308677098, + "grad_norm": 3.970747232437134, + "learning_rate": 2.368411789266848e-06, + "loss": 3.417, + "step": 923 + }, + { + "epoch": 0.3285917496443812, + "grad_norm": 1.0317715406417847, + "learning_rate": 2.3669944158813604e-06, + "loss": 3.8697, + "step": 924 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 1.163714051246643, + "learning_rate": 2.3655758790756008e-06, + "loss": 3.7248, + "step": 925 + }, + { + "epoch": 0.32930298719772405, + "grad_norm": 1.208899736404419, + "learning_rate": 2.3641561807531055e-06, + "loss": 3.7583, + "step": 926 + }, + { + "epoch": 0.32965860597439545, + "grad_norm": 0.8847064971923828, + "learning_rate": 2.3627353228189672e-06, + "loss": 3.504, + "step": 927 + }, + { + "epoch": 0.33001422475106684, + "grad_norm": 1.7846226692199707, + "learning_rate": 2.361313307179837e-06, + "loss": 3.9117, + "step": 928 + }, + { + "epoch": 0.33036984352773824, + "grad_norm": 3.5800559520721436, + "learning_rate": 2.3598901357439185e-06, + "loss": 5.6708, + "step": 929 + }, + { + "epoch": 0.3307254623044097, + "grad_norm": 1.4454108476638794, + "learning_rate": 2.358465810420965e-06, + "loss": 3.2356, + "step": 930 + }, + { + "epoch": 0.3310810810810811, + "grad_norm": 0.8741567730903625, + "learning_rate": 2.3570403331222808e-06, + "loss": 2.2357, + "step": 931 + }, + { + "epoch": 0.3314366998577525, + "grad_norm": 1.5689786672592163, + "learning_rate": 2.3556137057607135e-06, + "loss": 4.0382, + "step": 932 + }, + { + "epoch": 0.3317923186344239, + "grad_norm": 1.3819125890731812, + "learning_rate": 2.354185930250656e-06, + "loss": 2.7652, + "step": 933 + }, + { + "epoch": 0.33214793741109533, + "grad_norm": 1.2802753448486328, + "learning_rate": 2.3527570085080407e-06, + "loss": 4.0063, + "step": 934 + }, + { + "epoch": 0.3325035561877667, + "grad_norm": 0.8539674878120422, + "learning_rate": 2.351326942450338e-06, + "loss": 3.1647, + "step": 935 + }, + { + "epoch": 0.3328591749644381, + "grad_norm": 0.9437277317047119, + "learning_rate": 2.3498957339965553e-06, + "loss": 2.8193, + "step": 936 + }, + { + "epoch": 0.3332147937411095, + "grad_norm": 1.5123707056045532, + "learning_rate": 2.348463385067231e-06, + "loss": 3.9004, + "step": 937 + }, + { + "epoch": 0.3335704125177809, + "grad_norm": 0.8143560290336609, + "learning_rate": 2.3470298975844354e-06, + "loss": 3.598, + "step": 938 + }, + { + "epoch": 0.33392603129445236, + "grad_norm": 0.8994200229644775, + "learning_rate": 2.345595273471766e-06, + "loss": 2.6602, + "step": 939 + }, + { + "epoch": 0.33428165007112376, + "grad_norm": 2.265617609024048, + "learning_rate": 2.344159514654346e-06, + "loss": 5.0461, + "step": 940 + }, + { + "epoch": 0.33463726884779516, + "grad_norm": 1.1440644264221191, + "learning_rate": 2.34272262305882e-06, + "loss": 3.7262, + "step": 941 + }, + { + "epoch": 0.33499288762446655, + "grad_norm": 1.1526408195495605, + "learning_rate": 2.3412846006133547e-06, + "loss": 2.6377, + "step": 942 + }, + { + "epoch": 0.335348506401138, + "grad_norm": 1.168662190437317, + "learning_rate": 2.339845449247633e-06, + "loss": 3.4055, + "step": 943 + }, + { + "epoch": 0.3357041251778094, + "grad_norm": 0.9566501975059509, + "learning_rate": 2.338405170892852e-06, + "loss": 3.5879, + "step": 944 + }, + { + "epoch": 0.3360597439544808, + "grad_norm": 1.1486681699752808, + "learning_rate": 2.3369637674817235e-06, + "loss": 2.7751, + "step": 945 + }, + { + "epoch": 0.3364153627311522, + "grad_norm": 0.8852776288986206, + "learning_rate": 2.335521240948466e-06, + "loss": 2.9443, + "step": 946 + }, + { + "epoch": 0.3367709815078236, + "grad_norm": 0.5680617690086365, + "learning_rate": 2.334077593228807e-06, + "loss": 3.0919, + "step": 947 + }, + { + "epoch": 0.33712660028449504, + "grad_norm": 1.1448578834533691, + "learning_rate": 2.3326328262599787e-06, + "loss": 3.9303, + "step": 948 + }, + { + "epoch": 0.33748221906116643, + "grad_norm": 0.8241201639175415, + "learning_rate": 2.3311869419807144e-06, + "loss": 3.0625, + "step": 949 + }, + { + "epoch": 0.33783783783783783, + "grad_norm": 1.3148895502090454, + "learning_rate": 2.3297399423312472e-06, + "loss": 3.873, + "step": 950 + }, + { + "epoch": 0.3381934566145092, + "grad_norm": 1.5300127267837524, + "learning_rate": 2.328291829253306e-06, + "loss": 2.2708, + "step": 951 + }, + { + "epoch": 0.3385490753911807, + "grad_norm": 1.3690248727798462, + "learning_rate": 2.3268426046901153e-06, + "loss": 4.6182, + "step": 952 + }, + { + "epoch": 0.33890469416785207, + "grad_norm": 0.8831199407577515, + "learning_rate": 2.32539227058639e-06, + "loss": 2.5436, + "step": 953 + }, + { + "epoch": 0.33926031294452347, + "grad_norm": 0.811884880065918, + "learning_rate": 2.3239408288883336e-06, + "loss": 2.9321, + "step": 954 + }, + { + "epoch": 0.33961593172119486, + "grad_norm": 1.005915641784668, + "learning_rate": 2.322488281543638e-06, + "loss": 3.677, + "step": 955 + }, + { + "epoch": 0.3399715504978663, + "grad_norm": 0.7735291719436646, + "learning_rate": 2.321034630501476e-06, + "loss": 2.7414, + "step": 956 + }, + { + "epoch": 0.3403271692745377, + "grad_norm": 0.860776960849762, + "learning_rate": 2.319579877712503e-06, + "loss": 2.8889, + "step": 957 + }, + { + "epoch": 0.3406827880512091, + "grad_norm": 1.0457979440689087, + "learning_rate": 2.318124025128853e-06, + "loss": 2.7019, + "step": 958 + }, + { + "epoch": 0.3410384068278805, + "grad_norm": 0.9042894840240479, + "learning_rate": 2.3166670747041342e-06, + "loss": 3.1584, + "step": 959 + }, + { + "epoch": 0.3413940256045519, + "grad_norm": 1.8279428482055664, + "learning_rate": 2.3152090283934307e-06, + "loss": 4.3317, + "step": 960 + }, + { + "epoch": 0.34174964438122335, + "grad_norm": 2.417863130569458, + "learning_rate": 2.3137498881532944e-06, + "loss": 2.8965, + "step": 961 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 1.156972050666809, + "learning_rate": 2.312289655941747e-06, + "loss": 2.5499, + "step": 962 + }, + { + "epoch": 0.34246088193456614, + "grad_norm": 1.1509331464767456, + "learning_rate": 2.3108283337182748e-06, + "loss": 2.6873, + "step": 963 + }, + { + "epoch": 0.34281650071123754, + "grad_norm": 1.517385721206665, + "learning_rate": 2.3093659234438266e-06, + "loss": 2.3411, + "step": 964 + }, + { + "epoch": 0.343172119487909, + "grad_norm": 2.8550565242767334, + "learning_rate": 2.3079024270808124e-06, + "loss": 1.6081, + "step": 965 + }, + { + "epoch": 0.3435277382645804, + "grad_norm": 0.8293601274490356, + "learning_rate": 2.3064378465930975e-06, + "loss": 2.4831, + "step": 966 + }, + { + "epoch": 0.3438833570412518, + "grad_norm": 2.7941455841064453, + "learning_rate": 2.304972183946004e-06, + "loss": 2.6809, + "step": 967 + }, + { + "epoch": 0.3442389758179232, + "grad_norm": 1.381650686264038, + "learning_rate": 2.303505441106305e-06, + "loss": 3.8688, + "step": 968 + }, + { + "epoch": 0.34459459459459457, + "grad_norm": 0.8142032623291016, + "learning_rate": 2.302037620042224e-06, + "loss": 3.0402, + "step": 969 + }, + { + "epoch": 0.344950213371266, + "grad_norm": 2.2664520740509033, + "learning_rate": 2.3005687227234304e-06, + "loss": 3.7844, + "step": 970 + }, + { + "epoch": 0.3453058321479374, + "grad_norm": 1.37871253490448, + "learning_rate": 2.299098751121039e-06, + "loss": 3.9501, + "step": 971 + }, + { + "epoch": 0.3456614509246088, + "grad_norm": 3.456779718399048, + "learning_rate": 2.2976277072076044e-06, + "loss": 4.6606, + "step": 972 + }, + { + "epoch": 0.3460170697012802, + "grad_norm": 0.8456706404685974, + "learning_rate": 2.2961555929571222e-06, + "loss": 3.2891, + "step": 973 + }, + { + "epoch": 0.34637268847795166, + "grad_norm": 1.5483243465423584, + "learning_rate": 2.2946824103450225e-06, + "loss": 3.8921, + "step": 974 + }, + { + "epoch": 0.34672830725462306, + "grad_norm": 0.8586994409561157, + "learning_rate": 2.29320816134817e-06, + "loss": 3.0591, + "step": 975 + }, + { + "epoch": 0.34708392603129445, + "grad_norm": 0.8495815992355347, + "learning_rate": 2.291732847944861e-06, + "loss": 3.1268, + "step": 976 + }, + { + "epoch": 0.34743954480796585, + "grad_norm": 0.678496778011322, + "learning_rate": 2.290256472114819e-06, + "loss": 2.4444, + "step": 977 + }, + { + "epoch": 0.34779516358463725, + "grad_norm": 1.2302916049957275, + "learning_rate": 2.288779035839193e-06, + "loss": 4.5122, + "step": 978 + }, + { + "epoch": 0.3481507823613087, + "grad_norm": 0.8982106447219849, + "learning_rate": 2.2873005411005558e-06, + "loss": 3.2799, + "step": 979 + }, + { + "epoch": 0.3485064011379801, + "grad_norm": 1.1062242984771729, + "learning_rate": 2.2858209898829006e-06, + "loss": 3.7146, + "step": 980 + }, + { + "epoch": 0.3488620199146515, + "grad_norm": 1.8458932638168335, + "learning_rate": 2.284340384171637e-06, + "loss": 3.4881, + "step": 981 + }, + { + "epoch": 0.3492176386913229, + "grad_norm": 1.1017279624938965, + "learning_rate": 2.282858725953592e-06, + "loss": 3.9106, + "step": 982 + }, + { + "epoch": 0.34957325746799434, + "grad_norm": 1.427855372428894, + "learning_rate": 2.281376017217003e-06, + "loss": 3.3534, + "step": 983 + }, + { + "epoch": 0.34992887624466573, + "grad_norm": 0.9542606472969055, + "learning_rate": 2.2798922599515174e-06, + "loss": 2.8057, + "step": 984 + }, + { + "epoch": 0.35028449502133713, + "grad_norm": 2.6164705753326416, + "learning_rate": 2.2784074561481893e-06, + "loss": 5.1249, + "step": 985 + }, + { + "epoch": 0.3506401137980085, + "grad_norm": 0.8287191987037659, + "learning_rate": 2.2769216077994787e-06, + "loss": 3.1147, + "step": 986 + }, + { + "epoch": 0.3509957325746799, + "grad_norm": 1.1186838150024414, + "learning_rate": 2.275434716899246e-06, + "loss": 3.6001, + "step": 987 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 0.7288455367088318, + "learning_rate": 2.2739467854427515e-06, + "loss": 2.8285, + "step": 988 + }, + { + "epoch": 0.35170697012802277, + "grad_norm": 0.8333854675292969, + "learning_rate": 2.2724578154266503e-06, + "loss": 3.1553, + "step": 989 + }, + { + "epoch": 0.35206258890469416, + "grad_norm": 1.1338744163513184, + "learning_rate": 2.270967808848992e-06, + "loss": 3.6945, + "step": 990 + }, + { + "epoch": 0.35241820768136556, + "grad_norm": 1.1525299549102783, + "learning_rate": 2.269476767709218e-06, + "loss": 2.4851, + "step": 991 + }, + { + "epoch": 0.352773826458037, + "grad_norm": 1.7245208024978638, + "learning_rate": 2.267984694008157e-06, + "loss": 3.8096, + "step": 992 + }, + { + "epoch": 0.3531294452347084, + "grad_norm": 1.5534204244613647, + "learning_rate": 2.2664915897480225e-06, + "loss": 3.5616, + "step": 993 + }, + { + "epoch": 0.3534850640113798, + "grad_norm": 1.11049485206604, + "learning_rate": 2.264997456932413e-06, + "loss": 2.7319, + "step": 994 + }, + { + "epoch": 0.3538406827880512, + "grad_norm": 4.791476249694824, + "learning_rate": 2.2635022975663065e-06, + "loss": 2.1553, + "step": 995 + }, + { + "epoch": 0.3541963015647226, + "grad_norm": 0.7531734108924866, + "learning_rate": 2.262006113656057e-06, + "loss": 2.7709, + "step": 996 + }, + { + "epoch": 0.35455192034139404, + "grad_norm": 1.2087410688400269, + "learning_rate": 2.260508907209395e-06, + "loss": 3.394, + "step": 997 + }, + { + "epoch": 0.35490753911806544, + "grad_norm": 2.7788126468658447, + "learning_rate": 2.2590106802354227e-06, + "loss": 4.5085, + "step": 998 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 1.2040718793869019, + "learning_rate": 2.2575114347446116e-06, + "loss": 3.7428, + "step": 999 + }, + { + "epoch": 0.35561877667140823, + "grad_norm": 0.7722324132919312, + "learning_rate": 2.2560111727488e-06, + "loss": 3.2095, + "step": 1000 + }, + { + "epoch": 0.3559743954480797, + "grad_norm": 2.273879289627075, + "learning_rate": 2.25450989626119e-06, + "loss": 2.6117, + "step": 1001 + }, + { + "epoch": 0.3563300142247511, + "grad_norm": 1.4606269598007202, + "learning_rate": 2.253007607296346e-06, + "loss": 3.0436, + "step": 1002 + }, + { + "epoch": 0.3566856330014225, + "grad_norm": 0.673508882522583, + "learning_rate": 2.25150430787019e-06, + "loss": 2.8695, + "step": 1003 + }, + { + "epoch": 0.35704125177809387, + "grad_norm": 1.0594412088394165, + "learning_rate": 2.25e-06, + "loss": 3.5145, + "step": 1004 + }, + { + "epoch": 0.35739687055476527, + "grad_norm": 1.2906697988510132, + "learning_rate": 2.248494685704408e-06, + "loss": 3.5625, + "step": 1005 + }, + { + "epoch": 0.3577524893314367, + "grad_norm": 0.8561844229698181, + "learning_rate": 2.246988367003396e-06, + "loss": 3.651, + "step": 1006 + }, + { + "epoch": 0.3581081081081081, + "grad_norm": 2.2645504474639893, + "learning_rate": 2.245481045918294e-06, + "loss": 5.0059, + "step": 1007 + }, + { + "epoch": 0.3584637268847795, + "grad_norm": 0.947851300239563, + "learning_rate": 2.243972724471776e-06, + "loss": 3.0536, + "step": 1008 + }, + { + "epoch": 0.3588193456614509, + "grad_norm": 1.2084481716156006, + "learning_rate": 2.242463404687861e-06, + "loss": 3.8602, + "step": 1009 + }, + { + "epoch": 0.35917496443812236, + "grad_norm": 1.7934727668762207, + "learning_rate": 2.240953088591905e-06, + "loss": 2.8041, + "step": 1010 + }, + { + "epoch": 0.35953058321479375, + "grad_norm": 2.1593291759490967, + "learning_rate": 2.2394417782106014e-06, + "loss": 4.7967, + "step": 1011 + }, + { + "epoch": 0.35988620199146515, + "grad_norm": 1.7242709398269653, + "learning_rate": 2.2379294755719794e-06, + "loss": 4.2092, + "step": 1012 + }, + { + "epoch": 0.36024182076813654, + "grad_norm": 2.0019173622131348, + "learning_rate": 2.236416182705399e-06, + "loss": 3.2417, + "step": 1013 + }, + { + "epoch": 0.36059743954480794, + "grad_norm": 0.7752237319946289, + "learning_rate": 2.2349019016415474e-06, + "loss": 3.195, + "step": 1014 + }, + { + "epoch": 0.3609530583214794, + "grad_norm": 1.3047748804092407, + "learning_rate": 2.2333866344124403e-06, + "loss": 2.1678, + "step": 1015 + }, + { + "epoch": 0.3613086770981508, + "grad_norm": 0.7946853041648865, + "learning_rate": 2.231870383051415e-06, + "loss": 2.5156, + "step": 1016 + }, + { + "epoch": 0.3616642958748222, + "grad_norm": 0.8299159407615662, + "learning_rate": 2.2303531495931306e-06, + "loss": 2.7414, + "step": 1017 + }, + { + "epoch": 0.3620199146514936, + "grad_norm": 1.0711421966552734, + "learning_rate": 2.228834936073563e-06, + "loss": 3.89, + "step": 1018 + }, + { + "epoch": 0.36237553342816503, + "grad_norm": 0.8314558267593384, + "learning_rate": 2.227315744530003e-06, + "loss": 2.6579, + "step": 1019 + }, + { + "epoch": 0.3627311522048364, + "grad_norm": 0.9284935593605042, + "learning_rate": 2.225795577001057e-06, + "loss": 4.1469, + "step": 1020 + }, + { + "epoch": 0.3630867709815078, + "grad_norm": 0.8329579830169678, + "learning_rate": 2.224274435526636e-06, + "loss": 3.475, + "step": 1021 + }, + { + "epoch": 0.3634423897581792, + "grad_norm": 1.067373514175415, + "learning_rate": 2.222752322147962e-06, + "loss": 3.6268, + "step": 1022 + }, + { + "epoch": 0.3637980085348506, + "grad_norm": 0.8603855967521667, + "learning_rate": 2.221229238907559e-06, + "loss": 2.9408, + "step": 1023 + }, + { + "epoch": 0.36415362731152207, + "grad_norm": 1.0914207696914673, + "learning_rate": 2.2197051878492543e-06, + "loss": 3.7185, + "step": 1024 + }, + { + "epoch": 0.36450924608819346, + "grad_norm": 2.4922783374786377, + "learning_rate": 2.218180171018171e-06, + "loss": 3.4309, + "step": 1025 + }, + { + "epoch": 0.36486486486486486, + "grad_norm": 0.9771096706390381, + "learning_rate": 2.216654190460732e-06, + "loss": 2.5239, + "step": 1026 + }, + { + "epoch": 0.36522048364153625, + "grad_norm": 2.3474457263946533, + "learning_rate": 2.2151272482246504e-06, + "loss": 4.1891, + "step": 1027 + }, + { + "epoch": 0.3655761024182077, + "grad_norm": 0.8036423325538635, + "learning_rate": 2.213599346358931e-06, + "loss": 3.3956, + "step": 1028 + }, + { + "epoch": 0.3659317211948791, + "grad_norm": 0.8628100752830505, + "learning_rate": 2.212070486913866e-06, + "loss": 2.8492, + "step": 1029 + }, + { + "epoch": 0.3662873399715505, + "grad_norm": 1.087672472000122, + "learning_rate": 2.2105406719410325e-06, + "loss": 3.539, + "step": 1030 + }, + { + "epoch": 0.3666429587482219, + "grad_norm": 2.6317200660705566, + "learning_rate": 2.2090099034932904e-06, + "loss": 3.5928, + "step": 1031 + }, + { + "epoch": 0.3669985775248933, + "grad_norm": 1.1888188123703003, + "learning_rate": 2.207478183624779e-06, + "loss": 2.4773, + "step": 1032 + }, + { + "epoch": 0.36735419630156474, + "grad_norm": 1.152649164199829, + "learning_rate": 2.205945514390913e-06, + "loss": 3.7501, + "step": 1033 + }, + { + "epoch": 0.36770981507823614, + "grad_norm": 1.0651531219482422, + "learning_rate": 2.204411897848383e-06, + "loss": 3.5764, + "step": 1034 + }, + { + "epoch": 0.36806543385490753, + "grad_norm": 1.1541025638580322, + "learning_rate": 2.2028773360551495e-06, + "loss": 4.0317, + "step": 1035 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 1.01836097240448, + "learning_rate": 2.2013418310704425e-06, + "loss": 2.345, + "step": 1036 + }, + { + "epoch": 0.3687766714082504, + "grad_norm": 2.078566074371338, + "learning_rate": 2.1998053849547558e-06, + "loss": 4.428, + "step": 1037 + }, + { + "epoch": 0.3691322901849218, + "grad_norm": 0.8235486149787903, + "learning_rate": 2.1982679997698478e-06, + "loss": 3.5999, + "step": 1038 + }, + { + "epoch": 0.36948790896159317, + "grad_norm": 1.0775457620620728, + "learning_rate": 2.1967296775787366e-06, + "loss": 3.1494, + "step": 1039 + }, + { + "epoch": 0.36984352773826457, + "grad_norm": 1.9739669561386108, + "learning_rate": 2.195190420445697e-06, + "loss": 3.648, + "step": 1040 + }, + { + "epoch": 0.37019914651493596, + "grad_norm": 1.10073721408844, + "learning_rate": 2.19365023043626e-06, + "loss": 3.4709, + "step": 1041 + }, + { + "epoch": 0.3705547652916074, + "grad_norm": 0.9456813931465149, + "learning_rate": 2.1921091096172063e-06, + "loss": 2.2699, + "step": 1042 + }, + { + "epoch": 0.3709103840682788, + "grad_norm": 0.787166178226471, + "learning_rate": 2.1905670600565676e-06, + "loss": 2.123, + "step": 1043 + }, + { + "epoch": 0.3712660028449502, + "grad_norm": 1.2471235990524292, + "learning_rate": 2.189024083823621e-06, + "loss": 3.2217, + "step": 1044 + }, + { + "epoch": 0.3716216216216216, + "grad_norm": 1.075080394744873, + "learning_rate": 2.187480182988886e-06, + "loss": 2.7917, + "step": 1045 + }, + { + "epoch": 0.37197724039829305, + "grad_norm": 1.5198115110397339, + "learning_rate": 2.185935359624126e-06, + "loss": 3.097, + "step": 1046 + }, + { + "epoch": 0.37233285917496445, + "grad_norm": 1.3291716575622559, + "learning_rate": 2.1843896158023383e-06, + "loss": 3.1922, + "step": 1047 + }, + { + "epoch": 0.37268847795163584, + "grad_norm": 0.9463316202163696, + "learning_rate": 2.1828429535977583e-06, + "loss": 3.3974, + "step": 1048 + }, + { + "epoch": 0.37304409672830724, + "grad_norm": 1.2691882848739624, + "learning_rate": 2.181295375085853e-06, + "loss": 3.6521, + "step": 1049 + }, + { + "epoch": 0.37339971550497864, + "grad_norm": 0.9994092583656311, + "learning_rate": 2.179746882343318e-06, + "loss": 3.6154, + "step": 1050 + }, + { + "epoch": 0.3737553342816501, + "grad_norm": 1.1417800188064575, + "learning_rate": 2.1781974774480773e-06, + "loss": 3.4912, + "step": 1051 + }, + { + "epoch": 0.3741109530583215, + "grad_norm": 4.547869682312012, + "learning_rate": 2.176647162479278e-06, + "loss": 3.6279, + "step": 1052 + }, + { + "epoch": 0.3744665718349929, + "grad_norm": 0.9309819936752319, + "learning_rate": 2.175095939517289e-06, + "loss": 2.7586, + "step": 1053 + }, + { + "epoch": 0.3748221906116643, + "grad_norm": 0.9633996486663818, + "learning_rate": 2.1735438106436967e-06, + "loss": 3.2519, + "step": 1054 + }, + { + "epoch": 0.3751778093883357, + "grad_norm": 0.9169543385505676, + "learning_rate": 2.171990777941303e-06, + "loss": 2.8552, + "step": 1055 + }, + { + "epoch": 0.3755334281650071, + "grad_norm": 0.7317620515823364, + "learning_rate": 2.1704368434941242e-06, + "loss": 2.3586, + "step": 1056 + }, + { + "epoch": 0.3758890469416785, + "grad_norm": 0.8483390212059021, + "learning_rate": 2.168882009387386e-06, + "loss": 2.5079, + "step": 1057 + }, + { + "epoch": 0.3762446657183499, + "grad_norm": 0.958132266998291, + "learning_rate": 2.1673262777075206e-06, + "loss": 2.3, + "step": 1058 + }, + { + "epoch": 0.37660028449502136, + "grad_norm": 0.9538259506225586, + "learning_rate": 2.1657696505421658e-06, + "loss": 1.6684, + "step": 1059 + }, + { + "epoch": 0.37695590327169276, + "grad_norm": 2.3030972480773926, + "learning_rate": 2.1642121299801597e-06, + "loss": 4.2996, + "step": 1060 + }, + { + "epoch": 0.37731152204836416, + "grad_norm": 1.5412087440490723, + "learning_rate": 2.1626537181115395e-06, + "loss": 3.3394, + "step": 1061 + }, + { + "epoch": 0.37766714082503555, + "grad_norm": 0.7600622177124023, + "learning_rate": 2.1610944170275403e-06, + "loss": 2.8574, + "step": 1062 + }, + { + "epoch": 0.37802275960170695, + "grad_norm": 1.4596750736236572, + "learning_rate": 2.159534228820588e-06, + "loss": 2.1638, + "step": 1063 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 1.2480350732803345, + "learning_rate": 2.1579731555843007e-06, + "loss": 2.448, + "step": 1064 + }, + { + "epoch": 0.3787339971550498, + "grad_norm": 16.611412048339844, + "learning_rate": 2.1564111994134832e-06, + "loss": 3.3608, + "step": 1065 + }, + { + "epoch": 0.3790896159317212, + "grad_norm": 1.4607133865356445, + "learning_rate": 2.154848362404125e-06, + "loss": 4.8392, + "step": 1066 + }, + { + "epoch": 0.3794452347083926, + "grad_norm": 1.164340615272522, + "learning_rate": 2.1532846466533985e-06, + "loss": 3.4349, + "step": 1067 + }, + { + "epoch": 0.37980085348506404, + "grad_norm": 1.810619831085205, + "learning_rate": 2.1517200542596543e-06, + "loss": 2.9716, + "step": 1068 + }, + { + "epoch": 0.38015647226173543, + "grad_norm": 0.7474228143692017, + "learning_rate": 2.150154587322419e-06, + "loss": 2.6097, + "step": 1069 + }, + { + "epoch": 0.38051209103840683, + "grad_norm": 1.835815668106079, + "learning_rate": 2.148588247942395e-06, + "loss": 4.5224, + "step": 1070 + }, + { + "epoch": 0.3808677098150782, + "grad_norm": 1.0006234645843506, + "learning_rate": 2.1470210382214536e-06, + "loss": 2.753, + "step": 1071 + }, + { + "epoch": 0.3812233285917496, + "grad_norm": 1.0669505596160889, + "learning_rate": 2.1454529602626337e-06, + "loss": 2.814, + "step": 1072 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 1.394275188446045, + "learning_rate": 2.1438840161701405e-06, + "loss": 4.296, + "step": 1073 + }, + { + "epoch": 0.38193456614509247, + "grad_norm": 0.7441378235816956, + "learning_rate": 2.1423142080493406e-06, + "loss": 2.863, + "step": 1074 + }, + { + "epoch": 0.38229018492176386, + "grad_norm": 1.6233694553375244, + "learning_rate": 2.1407435380067604e-06, + "loss": 3.4993, + "step": 1075 + }, + { + "epoch": 0.38264580369843526, + "grad_norm": 0.7468914985656738, + "learning_rate": 2.139172008150083e-06, + "loss": 3.4488, + "step": 1076 + }, + { + "epoch": 0.3830014224751067, + "grad_norm": 1.7445799112319946, + "learning_rate": 2.1375996205881453e-06, + "loss": 3.7134, + "step": 1077 + }, + { + "epoch": 0.3833570412517781, + "grad_norm": 0.7061755061149597, + "learning_rate": 2.1360263774309346e-06, + "loss": 2.4939, + "step": 1078 + }, + { + "epoch": 0.3837126600284495, + "grad_norm": 0.7381698489189148, + "learning_rate": 2.1344522807895873e-06, + "loss": 3.1719, + "step": 1079 + }, + { + "epoch": 0.3840682788051209, + "grad_norm": 1.441740870475769, + "learning_rate": 2.1328773327763843e-06, + "loss": 2.4261, + "step": 1080 + }, + { + "epoch": 0.3844238975817923, + "grad_norm": 1.023163080215454, + "learning_rate": 2.1313015355047486e-06, + "loss": 2.8845, + "step": 1081 + }, + { + "epoch": 0.38477951635846375, + "grad_norm": 0.8988614082336426, + "learning_rate": 2.129724891089244e-06, + "loss": 3.2744, + "step": 1082 + }, + { + "epoch": 0.38513513513513514, + "grad_norm": 1.2602511644363403, + "learning_rate": 2.1281474016455703e-06, + "loss": 3.5254, + "step": 1083 + }, + { + "epoch": 0.38549075391180654, + "grad_norm": 2.0748188495635986, + "learning_rate": 2.126569069290562e-06, + "loss": 5.3045, + "step": 1084 + }, + { + "epoch": 0.38584637268847793, + "grad_norm": 1.7678908109664917, + "learning_rate": 2.1249898961421836e-06, + "loss": 4.0427, + "step": 1085 + }, + { + "epoch": 0.3862019914651494, + "grad_norm": 0.962956428527832, + "learning_rate": 2.123409884319528e-06, + "loss": 3.6552, + "step": 1086 + }, + { + "epoch": 0.3865576102418208, + "grad_norm": 0.9997053146362305, + "learning_rate": 2.1218290359428147e-06, + "loss": 2.9691, + "step": 1087 + }, + { + "epoch": 0.3869132290184922, + "grad_norm": 1.1902942657470703, + "learning_rate": 2.1202473531333846e-06, + "loss": 3.0431, + "step": 1088 + }, + { + "epoch": 0.3872688477951636, + "grad_norm": 0.7246391773223877, + "learning_rate": 2.118664838013698e-06, + "loss": 3.101, + "step": 1089 + }, + { + "epoch": 0.38762446657183497, + "grad_norm": 0.9167899489402771, + "learning_rate": 2.117081492707334e-06, + "loss": 3.2283, + "step": 1090 + }, + { + "epoch": 0.3879800853485064, + "grad_norm": 3.419940710067749, + "learning_rate": 2.1154973193389847e-06, + "loss": 4.1738, + "step": 1091 + }, + { + "epoch": 0.3883357041251778, + "grad_norm": 0.977875828742981, + "learning_rate": 2.1139123200344522e-06, + "loss": 2.8493, + "step": 1092 + }, + { + "epoch": 0.3886913229018492, + "grad_norm": 0.8069586753845215, + "learning_rate": 2.112326496920648e-06, + "loss": 3.0492, + "step": 1093 + }, + { + "epoch": 0.3890469416785206, + "grad_norm": 1.1764451265335083, + "learning_rate": 2.1107398521255897e-06, + "loss": 4.4232, + "step": 1094 + }, + { + "epoch": 0.38940256045519206, + "grad_norm": 0.7940906882286072, + "learning_rate": 2.1091523877783956e-06, + "loss": 3.4908, + "step": 1095 + }, + { + "epoch": 0.38975817923186346, + "grad_norm": 2.2564597129821777, + "learning_rate": 2.107564106009286e-06, + "loss": 4.3987, + "step": 1096 + }, + { + "epoch": 0.39011379800853485, + "grad_norm": 1.0815752744674683, + "learning_rate": 2.105975008949577e-06, + "loss": 3.3077, + "step": 1097 + }, + { + "epoch": 0.39046941678520625, + "grad_norm": 0.7657842040061951, + "learning_rate": 2.104385098731679e-06, + "loss": 2.9355, + "step": 1098 + }, + { + "epoch": 0.39082503556187764, + "grad_norm": 1.1141129732131958, + "learning_rate": 2.102794377489092e-06, + "loss": 4.0039, + "step": 1099 + }, + { + "epoch": 0.3911806543385491, + "grad_norm": 1.053391933441162, + "learning_rate": 2.1012028473564066e-06, + "loss": 3.6283, + "step": 1100 + }, + { + "epoch": 0.3915362731152205, + "grad_norm": 1.0684351921081543, + "learning_rate": 2.099610510469299e-06, + "loss": 3.4515, + "step": 1101 + }, + { + "epoch": 0.3918918918918919, + "grad_norm": 1.1530365943908691, + "learning_rate": 2.098017368964525e-06, + "loss": 3.4689, + "step": 1102 + }, + { + "epoch": 0.3922475106685633, + "grad_norm": 0.9683912396430969, + "learning_rate": 2.0964234249799233e-06, + "loss": 4.1813, + "step": 1103 + }, + { + "epoch": 0.39260312944523473, + "grad_norm": 1.1624358892440796, + "learning_rate": 2.094828680654407e-06, + "loss": 3.5439, + "step": 1104 + }, + { + "epoch": 0.39295874822190613, + "grad_norm": 0.7081165313720703, + "learning_rate": 2.093233138127966e-06, + "loss": 2.6779, + "step": 1105 + }, + { + "epoch": 0.3933143669985775, + "grad_norm": 1.106188416481018, + "learning_rate": 2.0916367995416587e-06, + "loss": 1.1767, + "step": 1106 + }, + { + "epoch": 0.3936699857752489, + "grad_norm": 1.393075942993164, + "learning_rate": 2.090039667037613e-06, + "loss": 3.5982, + "step": 1107 + }, + { + "epoch": 0.3940256045519203, + "grad_norm": 0.825031042098999, + "learning_rate": 2.0884417427590214e-06, + "loss": 3.0273, + "step": 1108 + }, + { + "epoch": 0.39438122332859177, + "grad_norm": 0.8744937777519226, + "learning_rate": 2.08684302885014e-06, + "loss": 2.9077, + "step": 1109 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.1774131059646606, + "learning_rate": 2.085243527456283e-06, + "loss": 3.3453, + "step": 1110 + }, + { + "epoch": 0.39509246088193456, + "grad_norm": 0.8759106993675232, + "learning_rate": 2.083643240723823e-06, + "loss": 2.7318, + "step": 1111 + }, + { + "epoch": 0.39544807965860596, + "grad_norm": 0.9117372035980225, + "learning_rate": 2.0820421708001857e-06, + "loss": 3.0624, + "step": 1112 + }, + { + "epoch": 0.3958036984352774, + "grad_norm": 0.7021335959434509, + "learning_rate": 2.080440319833847e-06, + "loss": 2.0454, + "step": 1113 + }, + { + "epoch": 0.3961593172119488, + "grad_norm": 0.9842357635498047, + "learning_rate": 2.078837689974332e-06, + "loss": 3.2162, + "step": 1114 + }, + { + "epoch": 0.3965149359886202, + "grad_norm": 1.0207566022872925, + "learning_rate": 2.0772342833722097e-06, + "loss": 3.7758, + "step": 1115 + }, + { + "epoch": 0.3968705547652916, + "grad_norm": 0.7428815364837646, + "learning_rate": 2.0756301021790935e-06, + "loss": 2.8899, + "step": 1116 + }, + { + "epoch": 0.397226173541963, + "grad_norm": 1.0433361530303955, + "learning_rate": 2.074025148547635e-06, + "loss": 3.0669, + "step": 1117 + }, + { + "epoch": 0.39758179231863444, + "grad_norm": 0.8764773011207581, + "learning_rate": 2.072419424631521e-06, + "loss": 2.4529, + "step": 1118 + }, + { + "epoch": 0.39793741109530584, + "grad_norm": 0.7380859851837158, + "learning_rate": 2.070812932585475e-06, + "loss": 3.0697, + "step": 1119 + }, + { + "epoch": 0.39829302987197723, + "grad_norm": 0.9669122695922852, + "learning_rate": 2.0692056745652484e-06, + "loss": 2.8262, + "step": 1120 + }, + { + "epoch": 0.39864864864864863, + "grad_norm": 0.9579645991325378, + "learning_rate": 2.0675976527276215e-06, + "loss": 2.9387, + "step": 1121 + }, + { + "epoch": 0.3990042674253201, + "grad_norm": 2.326634168624878, + "learning_rate": 2.0659888692304e-06, + "loss": 4.0829, + "step": 1122 + }, + { + "epoch": 0.3993598862019915, + "grad_norm": 1.1296617984771729, + "learning_rate": 2.064379326232412e-06, + "loss": 2.59, + "step": 1123 + }, + { + "epoch": 0.39971550497866287, + "grad_norm": 1.3198505640029907, + "learning_rate": 2.0627690258935034e-06, + "loss": 4.1455, + "step": 1124 + }, + { + "epoch": 0.40007112375533427, + "grad_norm": 1.0118577480316162, + "learning_rate": 2.061157970374537e-06, + "loss": 3.2975, + "step": 1125 + }, + { + "epoch": 0.40042674253200566, + "grad_norm": 2.0194759368896484, + "learning_rate": 2.059546161837389e-06, + "loss": 3.0562, + "step": 1126 + }, + { + "epoch": 0.4007823613086771, + "grad_norm": 2.3234996795654297, + "learning_rate": 2.0579336024449463e-06, + "loss": 4.1388, + "step": 1127 + }, + { + "epoch": 0.4011379800853485, + "grad_norm": 0.8515639901161194, + "learning_rate": 2.056320294361104e-06, + "loss": 3.4021, + "step": 1128 + }, + { + "epoch": 0.4014935988620199, + "grad_norm": 0.9911292791366577, + "learning_rate": 2.0547062397507603e-06, + "loss": 1.7065, + "step": 1129 + }, + { + "epoch": 0.4018492176386913, + "grad_norm": 0.9843156933784485, + "learning_rate": 2.053091440779816e-06, + "loss": 2.9164, + "step": 1130 + }, + { + "epoch": 0.40220483641536275, + "grad_norm": 4.411909103393555, + "learning_rate": 2.05147589961517e-06, + "loss": 4.1948, + "step": 1131 + }, + { + "epoch": 0.40256045519203415, + "grad_norm": 0.7390024065971375, + "learning_rate": 2.0498596184247196e-06, + "loss": 2.678, + "step": 1132 + }, + { + "epoch": 0.40291607396870555, + "grad_norm": 0.930162787437439, + "learning_rate": 2.0482425993773517e-06, + "loss": 2.704, + "step": 1133 + }, + { + "epoch": 0.40327169274537694, + "grad_norm": 0.9428355693817139, + "learning_rate": 2.046624844642946e-06, + "loss": 2.7526, + "step": 1134 + }, + { + "epoch": 0.40362731152204834, + "grad_norm": 1.2081080675125122, + "learning_rate": 2.045006356392368e-06, + "loss": 3.7157, + "step": 1135 + }, + { + "epoch": 0.4039829302987198, + "grad_norm": 0.7740315794944763, + "learning_rate": 2.043387136797468e-06, + "loss": 2.907, + "step": 1136 + }, + { + "epoch": 0.4043385490753912, + "grad_norm": 0.7332438230514526, + "learning_rate": 2.041767188031078e-06, + "loss": 2.7317, + "step": 1137 + }, + { + "epoch": 0.4046941678520626, + "grad_norm": 1.4491857290267944, + "learning_rate": 2.040146512267008e-06, + "loss": 3.0003, + "step": 1138 + }, + { + "epoch": 0.405049786628734, + "grad_norm": 1.0476363897323608, + "learning_rate": 2.0385251116800436e-06, + "loss": 3.4415, + "step": 1139 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 2.0498874187469482, + "learning_rate": 2.036902988445943e-06, + "loss": 2.8422, + "step": 1140 + }, + { + "epoch": 0.4057610241820768, + "grad_norm": 0.7885639667510986, + "learning_rate": 2.035280144741434e-06, + "loss": 3.1441, + "step": 1141 + }, + { + "epoch": 0.4061166429587482, + "grad_norm": 1.3697552680969238, + "learning_rate": 2.033656582744212e-06, + "loss": 4.3672, + "step": 1142 + }, + { + "epoch": 0.4064722617354196, + "grad_norm": 0.8039948344230652, + "learning_rate": 2.0320323046329353e-06, + "loss": 1.7856, + "step": 1143 + }, + { + "epoch": 0.406827880512091, + "grad_norm": 0.7894800305366516, + "learning_rate": 2.030407312587224e-06, + "loss": 3.0207, + "step": 1144 + }, + { + "epoch": 0.40718349928876246, + "grad_norm": 1.1990467309951782, + "learning_rate": 2.0287816087876552e-06, + "loss": 3.22, + "step": 1145 + }, + { + "epoch": 0.40753911806543386, + "grad_norm": 2.4318552017211914, + "learning_rate": 2.0271551954157624e-06, + "loss": 3.9964, + "step": 1146 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 0.6308404207229614, + "learning_rate": 2.0255280746540296e-06, + "loss": 2.7231, + "step": 1147 + }, + { + "epoch": 0.40825035561877665, + "grad_norm": 0.7565923929214478, + "learning_rate": 2.023900248685892e-06, + "loss": 3.1629, + "step": 1148 + }, + { + "epoch": 0.4086059743954481, + "grad_norm": 1.6498216390609741, + "learning_rate": 2.02227171969573e-06, + "loss": 3.0751, + "step": 1149 + }, + { + "epoch": 0.4089615931721195, + "grad_norm": 0.704748272895813, + "learning_rate": 2.0206424898688674e-06, + "loss": 2.6129, + "step": 1150 + }, + { + "epoch": 0.4093172119487909, + "grad_norm": 0.8071818351745605, + "learning_rate": 2.0190125613915683e-06, + "loss": 2.6933, + "step": 1151 + }, + { + "epoch": 0.4096728307254623, + "grad_norm": 1.5490033626556396, + "learning_rate": 2.0173819364510345e-06, + "loss": 3.5448, + "step": 1152 + }, + { + "epoch": 0.4100284495021337, + "grad_norm": 1.3847293853759766, + "learning_rate": 2.0157506172354025e-06, + "loss": 3.6885, + "step": 1153 + }, + { + "epoch": 0.41038406827880514, + "grad_norm": 1.3351260423660278, + "learning_rate": 2.014118605933741e-06, + "loss": 3.2476, + "step": 1154 + }, + { + "epoch": 0.41073968705547653, + "grad_norm": 1.4039450883865356, + "learning_rate": 2.012485904736047e-06, + "loss": 3.6051, + "step": 1155 + }, + { + "epoch": 0.41109530583214793, + "grad_norm": 0.7516759037971497, + "learning_rate": 2.0108525158332423e-06, + "loss": 2.6798, + "step": 1156 + }, + { + "epoch": 0.4114509246088193, + "grad_norm": 1.0528777837753296, + "learning_rate": 2.0092184414171727e-06, + "loss": 3.4356, + "step": 1157 + }, + { + "epoch": 0.4118065433854908, + "grad_norm": 1.4796075820922852, + "learning_rate": 2.0075836836806027e-06, + "loss": 3.8089, + "step": 1158 + }, + { + "epoch": 0.41216216216216217, + "grad_norm": 1.1250168085098267, + "learning_rate": 2.0059482448172164e-06, + "loss": 2.7872, + "step": 1159 + }, + { + "epoch": 0.41251778093883357, + "grad_norm": 1.4898128509521484, + "learning_rate": 2.0043121270216087e-06, + "loss": 3.9729, + "step": 1160 + }, + { + "epoch": 0.41287339971550496, + "grad_norm": 2.951084613800049, + "learning_rate": 2.002675332489287e-06, + "loss": 4.5276, + "step": 1161 + }, + { + "epoch": 0.4132290184921764, + "grad_norm": 1.3630584478378296, + "learning_rate": 2.001037863416668e-06, + "loss": 3.1415, + "step": 1162 + }, + { + "epoch": 0.4135846372688478, + "grad_norm": 1.1953030824661255, + "learning_rate": 1.999399722001071e-06, + "loss": 3.135, + "step": 1163 + }, + { + "epoch": 0.4139402560455192, + "grad_norm": 1.2339359521865845, + "learning_rate": 1.997760910440719e-06, + "loss": 3.59, + "step": 1164 + }, + { + "epoch": 0.4142958748221906, + "grad_norm": 0.9299193620681763, + "learning_rate": 1.996121430934734e-06, + "loss": 3.1334, + "step": 1165 + }, + { + "epoch": 0.414651493598862, + "grad_norm": 1.4810932874679565, + "learning_rate": 1.9944812856831358e-06, + "loss": 2.9842, + "step": 1166 + }, + { + "epoch": 0.41500711237553345, + "grad_norm": 1.0820094347000122, + "learning_rate": 1.9928404768868347e-06, + "loss": 3.4282, + "step": 1167 + }, + { + "epoch": 0.41536273115220484, + "grad_norm": 1.3282561302185059, + "learning_rate": 1.9911990067476337e-06, + "loss": 4.1438, + "step": 1168 + }, + { + "epoch": 0.41571834992887624, + "grad_norm": 0.761600911617279, + "learning_rate": 1.9895568774682217e-06, + "loss": 2.9506, + "step": 1169 + }, + { + "epoch": 0.41607396870554764, + "grad_norm": 1.3871254920959473, + "learning_rate": 1.9879140912521736e-06, + "loss": 3.2562, + "step": 1170 + }, + { + "epoch": 0.4164295874822191, + "grad_norm": 1.1699870824813843, + "learning_rate": 1.986270650303945e-06, + "loss": 3.1369, + "step": 1171 + }, + { + "epoch": 0.4167852062588905, + "grad_norm": 1.8160303831100464, + "learning_rate": 1.9846265568288694e-06, + "loss": 4.4223, + "step": 1172 + }, + { + "epoch": 0.4171408250355619, + "grad_norm": 0.9804153442382812, + "learning_rate": 1.9829818130331574e-06, + "loss": 3.2789, + "step": 1173 + }, + { + "epoch": 0.4174964438122333, + "grad_norm": 1.4471160173416138, + "learning_rate": 1.981336421123892e-06, + "loss": 4.2731, + "step": 1174 + }, + { + "epoch": 0.41785206258890467, + "grad_norm": 0.7979751825332642, + "learning_rate": 1.979690383309025e-06, + "loss": 2.5355, + "step": 1175 + }, + { + "epoch": 0.4182076813655761, + "grad_norm": 0.9428005814552307, + "learning_rate": 1.978043701797375e-06, + "loss": 2.7687, + "step": 1176 + }, + { + "epoch": 0.4185633001422475, + "grad_norm": 15.704306602478027, + "learning_rate": 1.976396378798626e-06, + "loss": 4.3187, + "step": 1177 + }, + { + "epoch": 0.4189189189189189, + "grad_norm": 1.6610827445983887, + "learning_rate": 1.9747484165233196e-06, + "loss": 2.5397, + "step": 1178 + }, + { + "epoch": 0.4192745376955903, + "grad_norm": 1.2655655145645142, + "learning_rate": 1.9730998171828595e-06, + "loss": 2.3354, + "step": 1179 + }, + { + "epoch": 0.41963015647226176, + "grad_norm": 1.2587281465530396, + "learning_rate": 1.9714505829895003e-06, + "loss": 2.6767, + "step": 1180 + }, + { + "epoch": 0.41998577524893316, + "grad_norm": 2.3342533111572266, + "learning_rate": 1.969800716156352e-06, + "loss": 3.562, + "step": 1181 + }, + { + "epoch": 0.42034139402560455, + "grad_norm": 0.7963041663169861, + "learning_rate": 1.96815021889737e-06, + "loss": 3.238, + "step": 1182 + }, + { + "epoch": 0.42069701280227595, + "grad_norm": 0.9960249066352844, + "learning_rate": 1.9664990934273583e-06, + "loss": 2.8909, + "step": 1183 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.9552269577980042, + "learning_rate": 1.964847341961963e-06, + "loss": 3.5228, + "step": 1184 + }, + { + "epoch": 0.4214082503556188, + "grad_norm": 1.0879406929016113, + "learning_rate": 1.96319496671767e-06, + "loss": 3.6223, + "step": 1185 + }, + { + "epoch": 0.4217638691322902, + "grad_norm": 2.7427046298980713, + "learning_rate": 1.9615419699118033e-06, + "loss": 2.2224, + "step": 1186 + }, + { + "epoch": 0.4221194879089616, + "grad_norm": 2.741199016571045, + "learning_rate": 1.9598883537625195e-06, + "loss": 2.3994, + "step": 1187 + }, + { + "epoch": 0.422475106685633, + "grad_norm": 1.1361947059631348, + "learning_rate": 1.9582341204888077e-06, + "loss": 3.6497, + "step": 1188 + }, + { + "epoch": 0.42283072546230444, + "grad_norm": 0.9967765808105469, + "learning_rate": 1.9565792723104835e-06, + "loss": 3.7234, + "step": 1189 + }, + { + "epoch": 0.42318634423897583, + "grad_norm": 0.8949447870254517, + "learning_rate": 1.9549238114481886e-06, + "loss": 2.8521, + "step": 1190 + }, + { + "epoch": 0.4235419630156472, + "grad_norm": 1.0552979707717896, + "learning_rate": 1.9532677401233876e-06, + "loss": 2.2779, + "step": 1191 + }, + { + "epoch": 0.4238975817923186, + "grad_norm": 2.112567901611328, + "learning_rate": 1.951611060558363e-06, + "loss": 4.3097, + "step": 1192 + }, + { + "epoch": 0.42425320056899, + "grad_norm": 1.0670408010482788, + "learning_rate": 1.9499537749762137e-06, + "loss": 2.4092, + "step": 1193 + }, + { + "epoch": 0.42460881934566147, + "grad_norm": 4.230320453643799, + "learning_rate": 1.9482958856008532e-06, + "loss": 1.7695, + "step": 1194 + }, + { + "epoch": 0.42496443812233287, + "grad_norm": 1.573508381843567, + "learning_rate": 1.946637394657003e-06, + "loss": 3.5192, + "step": 1195 + }, + { + "epoch": 0.42532005689900426, + "grad_norm": 0.7917094826698303, + "learning_rate": 1.9449783043701933e-06, + "loss": 2.8455, + "step": 1196 + }, + { + "epoch": 0.42567567567567566, + "grad_norm": 1.4040340185165405, + "learning_rate": 1.9433186169667584e-06, + "loss": 4.2244, + "step": 1197 + }, + { + "epoch": 0.4260312944523471, + "grad_norm": 0.7811653017997742, + "learning_rate": 1.941658334673834e-06, + "loss": 2.935, + "step": 1198 + }, + { + "epoch": 0.4263869132290185, + "grad_norm": 1.0627952814102173, + "learning_rate": 1.9399974597193536e-06, + "loss": 3.1178, + "step": 1199 + }, + { + "epoch": 0.4267425320056899, + "grad_norm": 1.7185695171356201, + "learning_rate": 1.938335994332046e-06, + "loss": 3.2969, + "step": 1200 + }, + { + "epoch": 0.4270981507823613, + "grad_norm": 1.0585577487945557, + "learning_rate": 1.9366739407414316e-06, + "loss": 3.6087, + "step": 1201 + }, + { + "epoch": 0.4274537695590327, + "grad_norm": 0.8509253263473511, + "learning_rate": 1.935011301177823e-06, + "loss": 2.7568, + "step": 1202 + }, + { + "epoch": 0.42780938833570414, + "grad_norm": 1.1837538480758667, + "learning_rate": 1.9333480778723156e-06, + "loss": 3.5351, + "step": 1203 + }, + { + "epoch": 0.42816500711237554, + "grad_norm": 0.9431052803993225, + "learning_rate": 1.9316842730567903e-06, + "loss": 2.8073, + "step": 1204 + }, + { + "epoch": 0.42852062588904694, + "grad_norm": 0.8835837244987488, + "learning_rate": 1.9300198889639077e-06, + "loss": 2.8014, + "step": 1205 + }, + { + "epoch": 0.42887624466571833, + "grad_norm": 1.9524058103561401, + "learning_rate": 1.928354927827105e-06, + "loss": 4.1647, + "step": 1206 + }, + { + "epoch": 0.4292318634423898, + "grad_norm": 1.2943880558013916, + "learning_rate": 1.9266893918805956e-06, + "loss": 1.5046, + "step": 1207 + }, + { + "epoch": 0.4295874822190612, + "grad_norm": 1.3878569602966309, + "learning_rate": 1.9250232833593623e-06, + "loss": 3.5798, + "step": 1208 + }, + { + "epoch": 0.4299431009957326, + "grad_norm": 0.9317572712898254, + "learning_rate": 1.923356604499157e-06, + "loss": 2.8699, + "step": 1209 + }, + { + "epoch": 0.43029871977240397, + "grad_norm": 0.8179857134819031, + "learning_rate": 1.9216893575364967e-06, + "loss": 2.5532, + "step": 1210 + }, + { + "epoch": 0.43065433854907537, + "grad_norm": 1.4965237379074097, + "learning_rate": 1.920021544708662e-06, + "loss": 2.9192, + "step": 1211 + }, + { + "epoch": 0.4310099573257468, + "grad_norm": 1.0958713293075562, + "learning_rate": 1.918353168253691e-06, + "loss": 3.3453, + "step": 1212 + }, + { + "epoch": 0.4313655761024182, + "grad_norm": 1.4860501289367676, + "learning_rate": 1.9166842304103794e-06, + "loss": 3.07, + "step": 1213 + }, + { + "epoch": 0.4317211948790896, + "grad_norm": 1.0943331718444824, + "learning_rate": 1.9150147334182753e-06, + "loss": 3.1878, + "step": 1214 + }, + { + "epoch": 0.432076813655761, + "grad_norm": 0.9233648777008057, + "learning_rate": 1.913344679517678e-06, + "loss": 2.9321, + "step": 1215 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.824061632156372, + "learning_rate": 1.9116740709496336e-06, + "loss": 1.8086, + "step": 1216 + }, + { + "epoch": 0.43278805120910385, + "grad_norm": 1.2022380828857422, + "learning_rate": 1.9100029099559324e-06, + "loss": 3.3336, + "step": 1217 + }, + { + "epoch": 0.43314366998577525, + "grad_norm": 1.983272910118103, + "learning_rate": 1.9083311987791067e-06, + "loss": 2.2804, + "step": 1218 + }, + { + "epoch": 0.43349928876244664, + "grad_norm": 0.9970908164978027, + "learning_rate": 1.906658939662427e-06, + "loss": 2.4573, + "step": 1219 + }, + { + "epoch": 0.43385490753911804, + "grad_norm": 1.2265111207962036, + "learning_rate": 1.9049861348498973e-06, + "loss": 3.7674, + "step": 1220 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 1.3449424505233765, + "learning_rate": 1.9033127865862568e-06, + "loss": 3.1217, + "step": 1221 + }, + { + "epoch": 0.4345661450924609, + "grad_norm": 1.6573138236999512, + "learning_rate": 1.901638897116971e-06, + "loss": 3.5256, + "step": 1222 + }, + { + "epoch": 0.4349217638691323, + "grad_norm": 0.8883774876594543, + "learning_rate": 1.8999644686882338e-06, + "loss": 3.2192, + "step": 1223 + }, + { + "epoch": 0.4352773826458037, + "grad_norm": 1.5854660272598267, + "learning_rate": 1.898289503546962e-06, + "loss": 3.3843, + "step": 1224 + }, + { + "epoch": 0.43563300142247513, + "grad_norm": 0.9947431087493896, + "learning_rate": 1.8966140039407917e-06, + "loss": 2.8039, + "step": 1225 + }, + { + "epoch": 0.4359886201991465, + "grad_norm": 1.0700260400772095, + "learning_rate": 1.894937972118077e-06, + "loss": 3.5483, + "step": 1226 + }, + { + "epoch": 0.4363442389758179, + "grad_norm": 0.8027447462081909, + "learning_rate": 1.8932614103278855e-06, + "loss": 2.3641, + "step": 1227 + }, + { + "epoch": 0.4366998577524893, + "grad_norm": 0.9464264512062073, + "learning_rate": 1.8915843208199966e-06, + "loss": 2.725, + "step": 1228 + }, + { + "epoch": 0.4370554765291607, + "grad_norm": 0.7861636877059937, + "learning_rate": 1.8899067058448978e-06, + "loss": 2.011, + "step": 1229 + }, + { + "epoch": 0.43741109530583216, + "grad_norm": 0.980937659740448, + "learning_rate": 1.888228567653781e-06, + "loss": 1.7486, + "step": 1230 + }, + { + "epoch": 0.43776671408250356, + "grad_norm": 0.9673341512680054, + "learning_rate": 1.8865499084985416e-06, + "loss": 3.0849, + "step": 1231 + }, + { + "epoch": 0.43812233285917496, + "grad_norm": 5.620379447937012, + "learning_rate": 1.8848707306317725e-06, + "loss": 2.5479, + "step": 1232 + }, + { + "epoch": 0.43847795163584635, + "grad_norm": 0.8235766291618347, + "learning_rate": 1.8831910363067635e-06, + "loss": 2.2512, + "step": 1233 + }, + { + "epoch": 0.4388335704125178, + "grad_norm": 1.0376087427139282, + "learning_rate": 1.8815108277774976e-06, + "loss": 2.6616, + "step": 1234 + }, + { + "epoch": 0.4391891891891892, + "grad_norm": 1.013095736503601, + "learning_rate": 1.8798301072986473e-06, + "loss": 3.5354, + "step": 1235 + }, + { + "epoch": 0.4395448079658606, + "grad_norm": 0.9926885366439819, + "learning_rate": 1.878148877125572e-06, + "loss": 3.13, + "step": 1236 + }, + { + "epoch": 0.439900426742532, + "grad_norm": 1.1407909393310547, + "learning_rate": 1.876467139514316e-06, + "loss": 2.2162, + "step": 1237 + }, + { + "epoch": 0.4402560455192034, + "grad_norm": 1.0908992290496826, + "learning_rate": 1.8747848967216038e-06, + "loss": 1.41, + "step": 1238 + }, + { + "epoch": 0.44061166429587484, + "grad_norm": 1.0481173992156982, + "learning_rate": 1.8731021510048372e-06, + "loss": 2.9928, + "step": 1239 + }, + { + "epoch": 0.44096728307254623, + "grad_norm": 0.8020336031913757, + "learning_rate": 1.8714189046220946e-06, + "loss": 1.4569, + "step": 1240 + }, + { + "epoch": 0.44132290184921763, + "grad_norm": 1.4173551797866821, + "learning_rate": 1.8697351598321248e-06, + "loss": 3.8112, + "step": 1241 + }, + { + "epoch": 0.441678520625889, + "grad_norm": 1.1720863580703735, + "learning_rate": 1.868050918894345e-06, + "loss": 3.5973, + "step": 1242 + }, + { + "epoch": 0.4420341394025605, + "grad_norm": 1.0001498460769653, + "learning_rate": 1.8663661840688405e-06, + "loss": 2.814, + "step": 1243 + }, + { + "epoch": 0.4423897581792319, + "grad_norm": 1.0631517171859741, + "learning_rate": 1.8646809576163566e-06, + "loss": 2.876, + "step": 1244 + }, + { + "epoch": 0.44274537695590327, + "grad_norm": 0.8408397436141968, + "learning_rate": 1.8629952417983008e-06, + "loss": 3.1742, + "step": 1245 + }, + { + "epoch": 0.44310099573257467, + "grad_norm": 1.2707439661026, + "learning_rate": 1.861309038876735e-06, + "loss": 3.4186, + "step": 1246 + }, + { + "epoch": 0.44345661450924606, + "grad_norm": 3.573448657989502, + "learning_rate": 1.8596223511143764e-06, + "loss": 3.8985, + "step": 1247 + }, + { + "epoch": 0.4438122332859175, + "grad_norm": 1.5537052154541016, + "learning_rate": 1.8579351807745921e-06, + "loss": 3.2982, + "step": 1248 + }, + { + "epoch": 0.4441678520625889, + "grad_norm": 1.3650177717208862, + "learning_rate": 1.856247530121396e-06, + "loss": 3.3587, + "step": 1249 + }, + { + "epoch": 0.4445234708392603, + "grad_norm": 1.2084611654281616, + "learning_rate": 1.8545594014194486e-06, + "loss": 3.1128, + "step": 1250 + }, + { + "epoch": 0.4448790896159317, + "grad_norm": 1.840624451637268, + "learning_rate": 1.8528707969340508e-06, + "loss": 4.3627, + "step": 1251 + }, + { + "epoch": 0.44523470839260315, + "grad_norm": 1.0589988231658936, + "learning_rate": 1.851181718931141e-06, + "loss": 3.3584, + "step": 1252 + }, + { + "epoch": 0.44559032716927455, + "grad_norm": 2.659188985824585, + "learning_rate": 1.8494921696772942e-06, + "loss": 4.4691, + "step": 1253 + }, + { + "epoch": 0.44594594594594594, + "grad_norm": 1.6308623552322388, + "learning_rate": 1.8478021514397174e-06, + "loss": 2.9489, + "step": 1254 + }, + { + "epoch": 0.44630156472261734, + "grad_norm": 1.50052011013031, + "learning_rate": 1.8461116664862473e-06, + "loss": 2.909, + "step": 1255 + }, + { + "epoch": 0.4466571834992888, + "grad_norm": 1.1606327295303345, + "learning_rate": 1.8444207170853464e-06, + "loss": 3.5379, + "step": 1256 + }, + { + "epoch": 0.4470128022759602, + "grad_norm": 1.7311278581619263, + "learning_rate": 1.8427293055061008e-06, + "loss": 2.8877, + "step": 1257 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 3.4432742595672607, + "learning_rate": 1.841037434018216e-06, + "loss": 3.8808, + "step": 1258 + }, + { + "epoch": 0.447724039829303, + "grad_norm": 1.134280800819397, + "learning_rate": 1.8393451048920157e-06, + "loss": 3.0706, + "step": 1259 + }, + { + "epoch": 0.4480796586059744, + "grad_norm": 2.280168056488037, + "learning_rate": 1.8376523203984371e-06, + "loss": 3.0887, + "step": 1260 + }, + { + "epoch": 0.4484352773826458, + "grad_norm": 6.238641738891602, + "learning_rate": 1.8359590828090286e-06, + "loss": 2.1619, + "step": 1261 + }, + { + "epoch": 0.4487908961593172, + "grad_norm": 2.950145959854126, + "learning_rate": 1.8342653943959468e-06, + "loss": 2.0421, + "step": 1262 + }, + { + "epoch": 0.4491465149359886, + "grad_norm": 2.5830514430999756, + "learning_rate": 1.8325712574319534e-06, + "loss": 3.9402, + "step": 1263 + }, + { + "epoch": 0.44950213371266, + "grad_norm": 2.1258039474487305, + "learning_rate": 1.830876674190411e-06, + "loss": 5.298, + "step": 1264 + }, + { + "epoch": 0.44985775248933146, + "grad_norm": 1.2029622793197632, + "learning_rate": 1.8291816469452821e-06, + "loss": 3.3404, + "step": 1265 + }, + { + "epoch": 0.45021337126600286, + "grad_norm": 0.9771932363510132, + "learning_rate": 1.8274861779711248e-06, + "loss": 2.7401, + "step": 1266 + }, + { + "epoch": 0.45056899004267426, + "grad_norm": 0.7700138092041016, + "learning_rate": 1.8257902695430895e-06, + "loss": 2.8072, + "step": 1267 + }, + { + "epoch": 0.45092460881934565, + "grad_norm": 1.0733921527862549, + "learning_rate": 1.824093923936917e-06, + "loss": 3.3446, + "step": 1268 + }, + { + "epoch": 0.45128022759601705, + "grad_norm": 1.1741714477539062, + "learning_rate": 1.8223971434289341e-06, + "loss": 3.8211, + "step": 1269 + }, + { + "epoch": 0.4516358463726885, + "grad_norm": 1.2993524074554443, + "learning_rate": 1.8206999302960515e-06, + "loss": 2.5868, + "step": 1270 + }, + { + "epoch": 0.4519914651493599, + "grad_norm": 0.8700075745582581, + "learning_rate": 1.8190022868157604e-06, + "loss": 2.6442, + "step": 1271 + }, + { + "epoch": 0.4523470839260313, + "grad_norm": 0.955193817615509, + "learning_rate": 1.8173042152661296e-06, + "loss": 2.9403, + "step": 1272 + }, + { + "epoch": 0.4527027027027027, + "grad_norm": 1.2979075908660889, + "learning_rate": 1.8156057179258025e-06, + "loss": 3.4038, + "step": 1273 + }, + { + "epoch": 0.45305832147937414, + "grad_norm": 1.3996999263763428, + "learning_rate": 1.8139067970739927e-06, + "loss": 3.2477, + "step": 1274 + }, + { + "epoch": 0.45341394025604553, + "grad_norm": 0.9934287667274475, + "learning_rate": 1.8122074549904843e-06, + "loss": 2.6547, + "step": 1275 + }, + { + "epoch": 0.45376955903271693, + "grad_norm": 0.8089761734008789, + "learning_rate": 1.8105076939556238e-06, + "loss": 2.4564, + "step": 1276 + }, + { + "epoch": 0.4541251778093883, + "grad_norm": 1.0582878589630127, + "learning_rate": 1.808807516250323e-06, + "loss": 3.4766, + "step": 1277 + }, + { + "epoch": 0.4544807965860597, + "grad_norm": 0.9200790524482727, + "learning_rate": 1.8071069241560503e-06, + "loss": 3.0662, + "step": 1278 + }, + { + "epoch": 0.4548364153627312, + "grad_norm": 0.9415099024772644, + "learning_rate": 1.8054059199548313e-06, + "loss": 2.6878, + "step": 1279 + }, + { + "epoch": 0.45519203413940257, + "grad_norm": 0.880761981010437, + "learning_rate": 1.803704505929245e-06, + "loss": 2.7868, + "step": 1280 + }, + { + "epoch": 0.45554765291607396, + "grad_norm": 0.8638026714324951, + "learning_rate": 1.8020026843624188e-06, + "loss": 2.3101, + "step": 1281 + }, + { + "epoch": 0.45590327169274536, + "grad_norm": 1.0444724559783936, + "learning_rate": 1.8003004575380284e-06, + "loss": 2.0608, + "step": 1282 + }, + { + "epoch": 0.4562588904694168, + "grad_norm": 1.1470823287963867, + "learning_rate": 1.7985978277402933e-06, + "loss": 2.7246, + "step": 1283 + }, + { + "epoch": 0.4566145092460882, + "grad_norm": 1.3363723754882812, + "learning_rate": 1.7968947972539733e-06, + "loss": 3.834, + "step": 1284 + }, + { + "epoch": 0.4569701280227596, + "grad_norm": 0.751015841960907, + "learning_rate": 1.7951913683643656e-06, + "loss": 2.3934, + "step": 1285 + }, + { + "epoch": 0.457325746799431, + "grad_norm": 0.7517382502555847, + "learning_rate": 1.7934875433573023e-06, + "loss": 2.2481, + "step": 1286 + }, + { + "epoch": 0.4576813655761024, + "grad_norm": 1.222005009651184, + "learning_rate": 1.7917833245191467e-06, + "loss": 3.9211, + "step": 1287 + }, + { + "epoch": 0.45803698435277385, + "grad_norm": 0.9670055508613586, + "learning_rate": 1.7900787141367921e-06, + "loss": 2.7069, + "step": 1288 + }, + { + "epoch": 0.45839260312944524, + "grad_norm": 0.7189592719078064, + "learning_rate": 1.7883737144976552e-06, + "loss": 2.8167, + "step": 1289 + }, + { + "epoch": 0.45874822190611664, + "grad_norm": 1.471415400505066, + "learning_rate": 1.7866683278896764e-06, + "loss": 2.6093, + "step": 1290 + }, + { + "epoch": 0.45910384068278803, + "grad_norm": 6.371167182922363, + "learning_rate": 1.7849625566013146e-06, + "loss": 2.9362, + "step": 1291 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 1.6711004972457886, + "learning_rate": 1.7832564029215447e-06, + "loss": 3.8002, + "step": 1292 + }, + { + "epoch": 0.4598150782361309, + "grad_norm": 1.2341808080673218, + "learning_rate": 1.7815498691398563e-06, + "loss": 3.3448, + "step": 1293 + }, + { + "epoch": 0.4601706970128023, + "grad_norm": 1.596807599067688, + "learning_rate": 1.7798429575462477e-06, + "loss": 3.8788, + "step": 1294 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.104864239692688, + "learning_rate": 1.7781356704312244e-06, + "loss": 3.591, + "step": 1295 + }, + { + "epoch": 0.46088193456614507, + "grad_norm": 0.8450590968132019, + "learning_rate": 1.7764280100857958e-06, + "loss": 2.899, + "step": 1296 + }, + { + "epoch": 0.4612375533428165, + "grad_norm": 1.0732132196426392, + "learning_rate": 1.7747199788014719e-06, + "loss": 3.2735, + "step": 1297 + }, + { + "epoch": 0.4615931721194879, + "grad_norm": 1.0417392253875732, + "learning_rate": 1.7730115788702612e-06, + "loss": 3.2742, + "step": 1298 + }, + { + "epoch": 0.4619487908961593, + "grad_norm": 2.256059408187866, + "learning_rate": 1.7713028125846667e-06, + "loss": 4.4522, + "step": 1299 + }, + { + "epoch": 0.4623044096728307, + "grad_norm": 1.192878246307373, + "learning_rate": 1.769593682237682e-06, + "loss": 3.2235, + "step": 1300 + }, + { + "epoch": 0.46266002844950216, + "grad_norm": 1.0733771324157715, + "learning_rate": 1.767884190122791e-06, + "loss": 3.1767, + "step": 1301 + }, + { + "epoch": 0.46301564722617355, + "grad_norm": 0.9384622573852539, + "learning_rate": 1.7661743385339615e-06, + "loss": 3.1815, + "step": 1302 + }, + { + "epoch": 0.46337126600284495, + "grad_norm": 0.8645901679992676, + "learning_rate": 1.7644641297656445e-06, + "loss": 2.3878, + "step": 1303 + }, + { + "epoch": 0.46372688477951635, + "grad_norm": 1.2439237833023071, + "learning_rate": 1.7627535661127697e-06, + "loss": 2.4303, + "step": 1304 + }, + { + "epoch": 0.46408250355618774, + "grad_norm": 1.449986219406128, + "learning_rate": 1.7610426498707441e-06, + "loss": 2.424, + "step": 1305 + }, + { + "epoch": 0.4644381223328592, + "grad_norm": 0.8437250852584839, + "learning_rate": 1.7593313833354463e-06, + "loss": 2.455, + "step": 1306 + }, + { + "epoch": 0.4647937411095306, + "grad_norm": 0.8924964070320129, + "learning_rate": 1.7576197688032261e-06, + "loss": 2.781, + "step": 1307 + }, + { + "epoch": 0.465149359886202, + "grad_norm": 0.7133435010910034, + "learning_rate": 1.7559078085709001e-06, + "loss": 2.3107, + "step": 1308 + }, + { + "epoch": 0.4655049786628734, + "grad_norm": 1.2549934387207031, + "learning_rate": 1.7541955049357485e-06, + "loss": 3.2302, + "step": 1309 + }, + { + "epoch": 0.46586059743954483, + "grad_norm": 1.137670636177063, + "learning_rate": 1.7524828601955126e-06, + "loss": 2.7607, + "step": 1310 + }, + { + "epoch": 0.46621621621621623, + "grad_norm": 4.090147495269775, + "learning_rate": 1.7507698766483913e-06, + "loss": 4.1473, + "step": 1311 + }, + { + "epoch": 0.4665718349928876, + "grad_norm": 1.2547032833099365, + "learning_rate": 1.7490565565930381e-06, + "loss": 2.6256, + "step": 1312 + }, + { + "epoch": 0.466927453769559, + "grad_norm": 0.7914095520973206, + "learning_rate": 1.747342902328558e-06, + "loss": 2.8957, + "step": 1313 + }, + { + "epoch": 0.4672830725462304, + "grad_norm": 0.8543210625648499, + "learning_rate": 1.7456289161545042e-06, + "loss": 2.7542, + "step": 1314 + }, + { + "epoch": 0.46763869132290187, + "grad_norm": 1.5435365438461304, + "learning_rate": 1.7439146003708765e-06, + "loss": 3.8132, + "step": 1315 + }, + { + "epoch": 0.46799431009957326, + "grad_norm": 1.5442830324172974, + "learning_rate": 1.742199957278116e-06, + "loss": 4.301, + "step": 1316 + }, + { + "epoch": 0.46834992887624466, + "grad_norm": 0.9005233645439148, + "learning_rate": 1.7404849891771025e-06, + "loss": 2.5786, + "step": 1317 + }, + { + "epoch": 0.46870554765291605, + "grad_norm": 0.8497214913368225, + "learning_rate": 1.7387696983691536e-06, + "loss": 2.4358, + "step": 1318 + }, + { + "epoch": 0.4690611664295875, + "grad_norm": 1.1385059356689453, + "learning_rate": 1.7370540871560178e-06, + "loss": 3.4884, + "step": 1319 + }, + { + "epoch": 0.4694167852062589, + "grad_norm": 0.7453454732894897, + "learning_rate": 1.7353381578398753e-06, + "loss": 2.5682, + "step": 1320 + }, + { + "epoch": 0.4697724039829303, + "grad_norm": 1.2529376745224, + "learning_rate": 1.7336219127233332e-06, + "loss": 3.5173, + "step": 1321 + }, + { + "epoch": 0.4701280227596017, + "grad_norm": 1.4965184926986694, + "learning_rate": 1.731905354109421e-06, + "loss": 4.4684, + "step": 1322 + }, + { + "epoch": 0.4704836415362731, + "grad_norm": 1.3085218667984009, + "learning_rate": 1.7301884843015898e-06, + "loss": 3.1181, + "step": 1323 + }, + { + "epoch": 0.47083926031294454, + "grad_norm": 1.1635643243789673, + "learning_rate": 1.7284713056037075e-06, + "loss": 3.4994, + "step": 1324 + }, + { + "epoch": 0.47119487908961594, + "grad_norm": 0.8440811038017273, + "learning_rate": 1.726753820320058e-06, + "loss": 2.7632, + "step": 1325 + }, + { + "epoch": 0.47155049786628733, + "grad_norm": 1.1214430332183838, + "learning_rate": 1.725036030755336e-06, + "loss": 3.1511, + "step": 1326 + }, + { + "epoch": 0.47190611664295873, + "grad_norm": 0.9827189445495605, + "learning_rate": 1.7233179392146433e-06, + "loss": 2.6547, + "step": 1327 + }, + { + "epoch": 0.4722617354196302, + "grad_norm": 1.458299160003662, + "learning_rate": 1.721599548003488e-06, + "loss": 2.9758, + "step": 1328 + }, + { + "epoch": 0.4726173541963016, + "grad_norm": 0.9266345500946045, + "learning_rate": 1.7198808594277806e-06, + "loss": 2.7469, + "step": 1329 + }, + { + "epoch": 0.47297297297297297, + "grad_norm": 1.2484135627746582, + "learning_rate": 1.71816187579383e-06, + "loss": 3.0157, + "step": 1330 + }, + { + "epoch": 0.47332859174964437, + "grad_norm": 0.7482984066009521, + "learning_rate": 1.716442599408341e-06, + "loss": 2.6035, + "step": 1331 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.9790404438972473, + "learning_rate": 1.7147230325784123e-06, + "loss": 3.4682, + "step": 1332 + }, + { + "epoch": 0.4740398293029872, + "grad_norm": 0.8533628582954407, + "learning_rate": 1.7130031776115308e-06, + "loss": 2.747, + "step": 1333 + }, + { + "epoch": 0.4743954480796586, + "grad_norm": 1.6351687908172607, + "learning_rate": 1.7112830368155709e-06, + "loss": 2.7579, + "step": 1334 + }, + { + "epoch": 0.47475106685633, + "grad_norm": 2.4416415691375732, + "learning_rate": 1.7095626124987906e-06, + "loss": 3.7209, + "step": 1335 + }, + { + "epoch": 0.4751066856330014, + "grad_norm": 2.8643083572387695, + "learning_rate": 1.7078419069698285e-06, + "loss": 3.8339, + "step": 1336 + }, + { + "epoch": 0.47546230440967285, + "grad_norm": 2.5443849563598633, + "learning_rate": 1.7061209225377e-06, + "loss": 5.1044, + "step": 1337 + }, + { + "epoch": 0.47581792318634425, + "grad_norm": 1.0005624294281006, + "learning_rate": 1.7043996615117948e-06, + "loss": 2.7049, + "step": 1338 + }, + { + "epoch": 0.47617354196301565, + "grad_norm": 1.3932358026504517, + "learning_rate": 1.7026781262018743e-06, + "loss": 1.8377, + "step": 1339 + }, + { + "epoch": 0.47652916073968704, + "grad_norm": 1.0409512519836426, + "learning_rate": 1.7009563189180677e-06, + "loss": 2.661, + "step": 1340 + }, + { + "epoch": 0.47688477951635844, + "grad_norm": 0.9010860919952393, + "learning_rate": 1.699234241970869e-06, + "loss": 3.1422, + "step": 1341 + }, + { + "epoch": 0.4772403982930299, + "grad_norm": 1.2027314901351929, + "learning_rate": 1.697511897671134e-06, + "loss": 3.6802, + "step": 1342 + }, + { + "epoch": 0.4775960170697013, + "grad_norm": 1.0025852918624878, + "learning_rate": 1.6957892883300778e-06, + "loss": 2.6317, + "step": 1343 + }, + { + "epoch": 0.4779516358463727, + "grad_norm": 7.699251651763916, + "learning_rate": 1.6940664162592704e-06, + "loss": 1.7063, + "step": 1344 + }, + { + "epoch": 0.4783072546230441, + "grad_norm": 0.8945563435554504, + "learning_rate": 1.6923432837706349e-06, + "loss": 3.2205, + "step": 1345 + }, + { + "epoch": 0.4786628733997155, + "grad_norm": 1.2540184259414673, + "learning_rate": 1.6906198931764435e-06, + "loss": 3.8415, + "step": 1346 + }, + { + "epoch": 0.4790184921763869, + "grad_norm": 0.9175933599472046, + "learning_rate": 1.6888962467893157e-06, + "loss": 2.7157, + "step": 1347 + }, + { + "epoch": 0.4793741109530583, + "grad_norm": 0.8572230935096741, + "learning_rate": 1.687172346922213e-06, + "loss": 2.3027, + "step": 1348 + }, + { + "epoch": 0.4797297297297297, + "grad_norm": 1.731175184249878, + "learning_rate": 1.6854481958884378e-06, + "loss": 4.1895, + "step": 1349 + }, + { + "epoch": 0.4800853485064011, + "grad_norm": 0.7315400838851929, + "learning_rate": 1.683723796001629e-06, + "loss": 2.8777, + "step": 1350 + }, + { + "epoch": 0.48044096728307256, + "grad_norm": 0.8528626561164856, + "learning_rate": 1.6819991495757594e-06, + "loss": 2.6813, + "step": 1351 + }, + { + "epoch": 0.48079658605974396, + "grad_norm": 1.218915581703186, + "learning_rate": 1.6802742589251334e-06, + "loss": 4.0362, + "step": 1352 + }, + { + "epoch": 0.48115220483641535, + "grad_norm": 0.8760390877723694, + "learning_rate": 1.6785491263643832e-06, + "loss": 2.4521, + "step": 1353 + }, + { + "epoch": 0.48150782361308675, + "grad_norm": 2.292841672897339, + "learning_rate": 1.6768237542084645e-06, + "loss": 3.2559, + "step": 1354 + }, + { + "epoch": 0.4818634423897582, + "grad_norm": 0.9341546893119812, + "learning_rate": 1.675098144772655e-06, + "loss": 2.6295, + "step": 1355 + }, + { + "epoch": 0.4822190611664296, + "grad_norm": 0.9849392175674438, + "learning_rate": 1.6733723003725516e-06, + "loss": 3.4302, + "step": 1356 + }, + { + "epoch": 0.482574679943101, + "grad_norm": 0.9687156081199646, + "learning_rate": 1.6716462233240645e-06, + "loss": 2.8786, + "step": 1357 + }, + { + "epoch": 0.4829302987197724, + "grad_norm": 0.8506249189376831, + "learning_rate": 1.6699199159434188e-06, + "loss": 0.9033, + "step": 1358 + }, + { + "epoch": 0.48328591749644384, + "grad_norm": 1.0987849235534668, + "learning_rate": 1.6681933805471467e-06, + "loss": 3.5598, + "step": 1359 + }, + { + "epoch": 0.48364153627311524, + "grad_norm": 1.231833577156067, + "learning_rate": 1.6664666194520873e-06, + "loss": 2.7887, + "step": 1360 + }, + { + "epoch": 0.48399715504978663, + "grad_norm": 1.154465675354004, + "learning_rate": 1.6647396349753816e-06, + "loss": 2.5387, + "step": 1361 + }, + { + "epoch": 0.484352773826458, + "grad_norm": 1.3123425245285034, + "learning_rate": 1.6630124294344715e-06, + "loss": 3.0801, + "step": 1362 + }, + { + "epoch": 0.4847083926031294, + "grad_norm": 1.0267293453216553, + "learning_rate": 1.6612850051470953e-06, + "loss": 2.9382, + "step": 1363 + }, + { + "epoch": 0.4850640113798009, + "grad_norm": 1.9080570936203003, + "learning_rate": 1.6595573644312836e-06, + "loss": 3.2559, + "step": 1364 + }, + { + "epoch": 0.48541963015647227, + "grad_norm": 0.7708364725112915, + "learning_rate": 1.6578295096053592e-06, + "loss": 2.4393, + "step": 1365 + }, + { + "epoch": 0.48577524893314367, + "grad_norm": 0.949171781539917, + "learning_rate": 1.6561014429879316e-06, + "loss": 3.1404, + "step": 1366 + }, + { + "epoch": 0.48613086770981506, + "grad_norm": 2.1036911010742188, + "learning_rate": 1.6543731668978942e-06, + "loss": 4.0751, + "step": 1367 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 1.583046317100525, + "learning_rate": 1.6526446836544205e-06, + "loss": 3.1863, + "step": 1368 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 1.1970021724700928, + "learning_rate": 1.6509159955769644e-06, + "loss": 2.5898, + "step": 1369 + }, + { + "epoch": 0.4871977240398293, + "grad_norm": 1.533984661102295, + "learning_rate": 1.6491871049852527e-06, + "loss": 2.9912, + "step": 1370 + }, + { + "epoch": 0.4875533428165007, + "grad_norm": 1.501013994216919, + "learning_rate": 1.6474580141992849e-06, + "loss": 2.0602, + "step": 1371 + }, + { + "epoch": 0.4879089615931721, + "grad_norm": 1.342531681060791, + "learning_rate": 1.6457287255393288e-06, + "loss": 1.8611, + "step": 1372 + }, + { + "epoch": 0.48826458036984355, + "grad_norm": 3.1259913444519043, + "learning_rate": 1.643999241325918e-06, + "loss": 3.6697, + "step": 1373 + }, + { + "epoch": 0.48862019914651494, + "grad_norm": 0.8179345726966858, + "learning_rate": 1.6422695638798478e-06, + "loss": 2.73, + "step": 1374 + }, + { + "epoch": 0.48897581792318634, + "grad_norm": 0.8469723463058472, + "learning_rate": 1.6405396955221735e-06, + "loss": 2.981, + "step": 1375 + }, + { + "epoch": 0.48933143669985774, + "grad_norm": 0.9565249085426331, + "learning_rate": 1.638809638574207e-06, + "loss": 1.9799, + "step": 1376 + }, + { + "epoch": 0.4896870554765292, + "grad_norm": 1.3571863174438477, + "learning_rate": 1.637079395357511e-06, + "loss": 2.2189, + "step": 1377 + }, + { + "epoch": 0.4900426742532006, + "grad_norm": 1.263188362121582, + "learning_rate": 1.6353489681939015e-06, + "loss": 3.1712, + "step": 1378 + }, + { + "epoch": 0.490398293029872, + "grad_norm": 1.2114607095718384, + "learning_rate": 1.633618359405439e-06, + "loss": 3.2528, + "step": 1379 + }, + { + "epoch": 0.4907539118065434, + "grad_norm": 1.0519704818725586, + "learning_rate": 1.6318875713144285e-06, + "loss": 2.6979, + "step": 1380 + }, + { + "epoch": 0.49110953058321477, + "grad_norm": 0.9647424817085266, + "learning_rate": 1.630156606243415e-06, + "loss": 2.5719, + "step": 1381 + }, + { + "epoch": 0.4914651493598862, + "grad_norm": 1.0372058153152466, + "learning_rate": 1.6284254665151822e-06, + "loss": 2.4307, + "step": 1382 + }, + { + "epoch": 0.4918207681365576, + "grad_norm": 1.1604715585708618, + "learning_rate": 1.6266941544527465e-06, + "loss": 3.2178, + "step": 1383 + }, + { + "epoch": 0.492176386913229, + "grad_norm": 5.8509602546691895, + "learning_rate": 1.624962672379357e-06, + "loss": 1.7796, + "step": 1384 + }, + { + "epoch": 0.4925320056899004, + "grad_norm": 1.0543705224990845, + "learning_rate": 1.6232310226184908e-06, + "loss": 2.2713, + "step": 1385 + }, + { + "epoch": 0.49288762446657186, + "grad_norm": 0.6987451910972595, + "learning_rate": 1.6214992074938493e-06, + "loss": 2.5966, + "step": 1386 + }, + { + "epoch": 0.49324324324324326, + "grad_norm": 1.5500653982162476, + "learning_rate": 1.619767229329356e-06, + "loss": 3.4879, + "step": 1387 + }, + { + "epoch": 0.49359886201991465, + "grad_norm": 0.8806231021881104, + "learning_rate": 1.6180350904491539e-06, + "loss": 2.9497, + "step": 1388 + }, + { + "epoch": 0.49395448079658605, + "grad_norm": 1.2620893716812134, + "learning_rate": 1.6163027931775997e-06, + "loss": 3.18, + "step": 1389 + }, + { + "epoch": 0.49431009957325744, + "grad_norm": 1.3732428550720215, + "learning_rate": 1.6145703398392653e-06, + "loss": 1.8278, + "step": 1390 + }, + { + "epoch": 0.4946657183499289, + "grad_norm": 3.4755802154541016, + "learning_rate": 1.6128377327589306e-06, + "loss": 3.3798, + "step": 1391 + }, + { + "epoch": 0.4950213371266003, + "grad_norm": 1.7760398387908936, + "learning_rate": 1.6111049742615817e-06, + "loss": 3.9879, + "step": 1392 + }, + { + "epoch": 0.4953769559032717, + "grad_norm": 1.6767441034317017, + "learning_rate": 1.6093720666724087e-06, + "loss": 2.1446, + "step": 1393 + }, + { + "epoch": 0.4957325746799431, + "grad_norm": 1.1307353973388672, + "learning_rate": 1.6076390123168002e-06, + "loss": 3.3299, + "step": 1394 + }, + { + "epoch": 0.49608819345661453, + "grad_norm": 0.990177571773529, + "learning_rate": 1.6059058135203435e-06, + "loss": 2.9783, + "step": 1395 + }, + { + "epoch": 0.49644381223328593, + "grad_norm": 1.0065711736679077, + "learning_rate": 1.6041724726088188e-06, + "loss": 2.0744, + "step": 1396 + }, + { + "epoch": 0.4967994310099573, + "grad_norm": 1.4098777770996094, + "learning_rate": 1.6024389919081974e-06, + "loss": 3.2268, + "step": 1397 + }, + { + "epoch": 0.4971550497866287, + "grad_norm": 0.9934019446372986, + "learning_rate": 1.600705373744638e-06, + "loss": 1.8216, + "step": 1398 + }, + { + "epoch": 0.4975106685633001, + "grad_norm": 1.249240517616272, + "learning_rate": 1.5989716204444835e-06, + "loss": 3.0794, + "step": 1399 + }, + { + "epoch": 0.49786628733997157, + "grad_norm": 2.715534210205078, + "learning_rate": 1.5972377343342578e-06, + "loss": 5.5457, + "step": 1400 + }, + { + "epoch": 0.49822190611664297, + "grad_norm": 0.9801180958747864, + "learning_rate": 1.5955037177406651e-06, + "loss": 3.0857, + "step": 1401 + }, + { + "epoch": 0.49857752489331436, + "grad_norm": 1.9959789514541626, + "learning_rate": 1.5937695729905818e-06, + "loss": 3.5013, + "step": 1402 + }, + { + "epoch": 0.49893314366998576, + "grad_norm": 1.0075305700302124, + "learning_rate": 1.5920353024110586e-06, + "loss": 2.8206, + "step": 1403 + }, + { + "epoch": 0.4992887624466572, + "grad_norm": 0.7663953900337219, + "learning_rate": 1.5903009083293139e-06, + "loss": 2.2102, + "step": 1404 + }, + { + "epoch": 0.4996443812233286, + "grad_norm": 1.0697321891784668, + "learning_rate": 1.5885663930727312e-06, + "loss": 2.4838, + "step": 1405 + }, + { + "epoch": 0.5, + "grad_norm": 1.4750500917434692, + "learning_rate": 1.5868317589688585e-06, + "loss": 3.6188, + "step": 1406 + }, + { + "epoch": 0.5, + "eval_loss": 4.445728302001953, + "eval_runtime": 303.0852, + "eval_samples_per_second": 4.114, + "eval_steps_per_second": 4.114, + "step": 1406 + }, + { + "epoch": 0.5003556187766715, + "grad_norm": 0.9771082401275635, + "learning_rate": 1.5850970083454023e-06, + "loss": 2.6673, + "step": 1407 + }, + { + "epoch": 0.5007112375533428, + "grad_norm": 1.5634876489639282, + "learning_rate": 1.5833621435302246e-06, + "loss": 2.6183, + "step": 1408 + }, + { + "epoch": 0.5010668563300142, + "grad_norm": 0.9939993023872375, + "learning_rate": 1.5816271668513415e-06, + "loss": 2.8561, + "step": 1409 + }, + { + "epoch": 0.5014224751066856, + "grad_norm": 1.1568063497543335, + "learning_rate": 1.5798920806369198e-06, + "loss": 2.0619, + "step": 1410 + }, + { + "epoch": 0.501778093883357, + "grad_norm": 1.620995044708252, + "learning_rate": 1.5781568872152721e-06, + "loss": 3.242, + "step": 1411 + }, + { + "epoch": 0.5021337126600285, + "grad_norm": 0.9835986495018005, + "learning_rate": 1.5764215889148557e-06, + "loss": 2.7315, + "step": 1412 + }, + { + "epoch": 0.5024893314366998, + "grad_norm": 1.9066803455352783, + "learning_rate": 1.574686188064268e-06, + "loss": 4.0072, + "step": 1413 + }, + { + "epoch": 0.5028449502133713, + "grad_norm": 0.8386759161949158, + "learning_rate": 1.5729506869922447e-06, + "loss": 2.7188, + "step": 1414 + }, + { + "epoch": 0.5032005689900427, + "grad_norm": 2.255856513977051, + "learning_rate": 1.5712150880276552e-06, + "loss": 3.1918, + "step": 1415 + }, + { + "epoch": 0.5035561877667141, + "grad_norm": 0.898554265499115, + "learning_rate": 1.5694793934995007e-06, + "loss": 2.9617, + "step": 1416 + }, + { + "epoch": 0.5039118065433855, + "grad_norm": 1.355476975440979, + "learning_rate": 1.5677436057369112e-06, + "loss": 2.2724, + "step": 1417 + }, + { + "epoch": 0.5042674253200569, + "grad_norm": 2.227055072784424, + "learning_rate": 1.5660077270691406e-06, + "loss": 3.9802, + "step": 1418 + }, + { + "epoch": 0.5046230440967283, + "grad_norm": 1.3774868249893188, + "learning_rate": 1.5642717598255661e-06, + "loss": 2.8737, + "step": 1419 + }, + { + "epoch": 0.5049786628733998, + "grad_norm": 1.0584720373153687, + "learning_rate": 1.5625357063356823e-06, + "loss": 2.693, + "step": 1420 + }, + { + "epoch": 0.5053342816500711, + "grad_norm": 0.8690099716186523, + "learning_rate": 1.5607995689291003e-06, + "loss": 2.6324, + "step": 1421 + }, + { + "epoch": 0.5056899004267426, + "grad_norm": 0.9574740529060364, + "learning_rate": 1.5590633499355442e-06, + "loss": 2.5474, + "step": 1422 + }, + { + "epoch": 0.5060455192034139, + "grad_norm": 0.8769509792327881, + "learning_rate": 1.5573270516848476e-06, + "loss": 2.4171, + "step": 1423 + }, + { + "epoch": 0.5064011379800853, + "grad_norm": 2.0183348655700684, + "learning_rate": 1.5555906765069497e-06, + "loss": 4.0968, + "step": 1424 + }, + { + "epoch": 0.5067567567567568, + "grad_norm": 1.1624144315719604, + "learning_rate": 1.5538542267318928e-06, + "loss": 3.1228, + "step": 1425 + }, + { + "epoch": 0.5071123755334281, + "grad_norm": 1.1308130025863647, + "learning_rate": 1.5521177046898204e-06, + "loss": 3.5486, + "step": 1426 + }, + { + "epoch": 0.5074679943100996, + "grad_norm": 0.8648583292961121, + "learning_rate": 1.550381112710972e-06, + "loss": 3.2376, + "step": 1427 + }, + { + "epoch": 0.5078236130867709, + "grad_norm": 1.3093385696411133, + "learning_rate": 1.5486444531256811e-06, + "loss": 2.8711, + "step": 1428 + }, + { + "epoch": 0.5081792318634424, + "grad_norm": 3.3950860500335693, + "learning_rate": 1.546907728264373e-06, + "loss": 4.3465, + "step": 1429 + }, + { + "epoch": 0.5085348506401138, + "grad_norm": 1.6052616834640503, + "learning_rate": 1.545170940457559e-06, + "loss": 1.7418, + "step": 1430 + }, + { + "epoch": 0.5088904694167852, + "grad_norm": 1.1446068286895752, + "learning_rate": 1.543434092035836e-06, + "loss": 3.3941, + "step": 1431 + }, + { + "epoch": 0.5092460881934566, + "grad_norm": 0.8656386733055115, + "learning_rate": 1.541697185329881e-06, + "loss": 1.8959, + "step": 1432 + }, + { + "epoch": 0.5096017069701281, + "grad_norm": 1.0785155296325684, + "learning_rate": 1.5399602226704511e-06, + "loss": 2.3684, + "step": 1433 + }, + { + "epoch": 0.5099573257467994, + "grad_norm": 0.924345850944519, + "learning_rate": 1.5382232063883767e-06, + "loss": 2.1821, + "step": 1434 + }, + { + "epoch": 0.5103129445234709, + "grad_norm": 0.8546257615089417, + "learning_rate": 1.5364861388145617e-06, + "loss": 2.9544, + "step": 1435 + }, + { + "epoch": 0.5106685633001422, + "grad_norm": 1.3512953519821167, + "learning_rate": 1.5347490222799773e-06, + "loss": 2.4434, + "step": 1436 + }, + { + "epoch": 0.5110241820768137, + "grad_norm": 0.9144914746284485, + "learning_rate": 1.5330118591156612e-06, + "loss": 2.5306, + "step": 1437 + }, + { + "epoch": 0.5113798008534851, + "grad_norm": 1.6684690713882446, + "learning_rate": 1.5312746516527131e-06, + "loss": 2.678, + "step": 1438 + }, + { + "epoch": 0.5117354196301565, + "grad_norm": 1.274344563484192, + "learning_rate": 1.5295374022222937e-06, + "loss": 2.3424, + "step": 1439 + }, + { + "epoch": 0.5120910384068279, + "grad_norm": 1.6485141515731812, + "learning_rate": 1.5278001131556185e-06, + "loss": 3.6411, + "step": 1440 + }, + { + "epoch": 0.5124466571834992, + "grad_norm": 1.0582098960876465, + "learning_rate": 1.526062786783956e-06, + "loss": 2.0621, + "step": 1441 + }, + { + "epoch": 0.5128022759601707, + "grad_norm": 0.8463141918182373, + "learning_rate": 1.5243254254386264e-06, + "loss": 2.9822, + "step": 1442 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 1.2196093797683716, + "learning_rate": 1.5225880314509954e-06, + "loss": 2.4233, + "step": 1443 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 1.0325490236282349, + "learning_rate": 1.5208506071524727e-06, + "loss": 2.6472, + "step": 1444 + }, + { + "epoch": 0.5138691322901849, + "grad_norm": 1.219367265701294, + "learning_rate": 1.5191131548745093e-06, + "loss": 2.8642, + "step": 1445 + }, + { + "epoch": 0.5142247510668563, + "grad_norm": 1.5758132934570312, + "learning_rate": 1.5173756769485932e-06, + "loss": 2.4906, + "step": 1446 + }, + { + "epoch": 0.5145803698435277, + "grad_norm": 1.0653682947158813, + "learning_rate": 1.5156381757062466e-06, + "loss": 1.7472, + "step": 1447 + }, + { + "epoch": 0.5149359886201992, + "grad_norm": 1.268584132194519, + "learning_rate": 1.5139006534790238e-06, + "loss": 2.9171, + "step": 1448 + }, + { + "epoch": 0.5152916073968705, + "grad_norm": 0.791918933391571, + "learning_rate": 1.512163112598506e-06, + "loss": 2.2458, + "step": 1449 + }, + { + "epoch": 0.515647226173542, + "grad_norm": 0.7999536991119385, + "learning_rate": 1.5104255553963018e-06, + "loss": 2.4475, + "step": 1450 + }, + { + "epoch": 0.5160028449502134, + "grad_norm": 0.9209456443786621, + "learning_rate": 1.5086879842040389e-06, + "loss": 2.8915, + "step": 1451 + }, + { + "epoch": 0.5163584637268848, + "grad_norm": 1.9973077774047852, + "learning_rate": 1.506950401353365e-06, + "loss": 3.7054, + "step": 1452 + }, + { + "epoch": 0.5167140825035562, + "grad_norm": 1.161856770515442, + "learning_rate": 1.505212809175944e-06, + "loss": 2.9926, + "step": 1453 + }, + { + "epoch": 0.5170697012802276, + "grad_norm": 0.8276859521865845, + "learning_rate": 1.5034752100034514e-06, + "loss": 2.5678, + "step": 1454 + }, + { + "epoch": 0.517425320056899, + "grad_norm": 1.0238596200942993, + "learning_rate": 1.5017376061675732e-06, + "loss": 2.9909, + "step": 1455 + }, + { + "epoch": 0.5177809388335705, + "grad_norm": 0.8834866285324097, + "learning_rate": 1.5e-06, + "loss": 2.6388, + "step": 1456 + }, + { + "epoch": 0.5181365576102418, + "grad_norm": 1.0921919345855713, + "learning_rate": 1.4982623938324267e-06, + "loss": 2.764, + "step": 1457 + }, + { + "epoch": 0.5184921763869133, + "grad_norm": 1.007649540901184, + "learning_rate": 1.4965247899965487e-06, + "loss": 2.5706, + "step": 1458 + }, + { + "epoch": 0.5188477951635846, + "grad_norm": 1.3393572568893433, + "learning_rate": 1.494787190824056e-06, + "loss": 2.7415, + "step": 1459 + }, + { + "epoch": 0.519203413940256, + "grad_norm": 3.3063161373138428, + "learning_rate": 1.4930495986466352e-06, + "loss": 4.6247, + "step": 1460 + }, + { + "epoch": 0.5195590327169275, + "grad_norm": 2.594233274459839, + "learning_rate": 1.4913120157959614e-06, + "loss": 2.6948, + "step": 1461 + }, + { + "epoch": 0.5199146514935988, + "grad_norm": 1.0245544910430908, + "learning_rate": 1.489574444603699e-06, + "loss": 2.6321, + "step": 1462 + }, + { + "epoch": 0.5202702702702703, + "grad_norm": 0.9326840043067932, + "learning_rate": 1.4878368874014943e-06, + "loss": 2.5668, + "step": 1463 + }, + { + "epoch": 0.5206258890469416, + "grad_norm": 1.7256430387496948, + "learning_rate": 1.4860993465209767e-06, + "loss": 3.4845, + "step": 1464 + }, + { + "epoch": 0.5209815078236131, + "grad_norm": 0.8920795917510986, + "learning_rate": 1.484361824293754e-06, + "loss": 3.039, + "step": 1465 + }, + { + "epoch": 0.5213371266002845, + "grad_norm": 0.9072233438491821, + "learning_rate": 1.482624323051407e-06, + "loss": 2.8183, + "step": 1466 + }, + { + "epoch": 0.5216927453769559, + "grad_norm": 1.6290687322616577, + "learning_rate": 1.4808868451254912e-06, + "loss": 3.3625, + "step": 1467 + }, + { + "epoch": 0.5220483641536273, + "grad_norm": 1.484682559967041, + "learning_rate": 1.4791493928475276e-06, + "loss": 2.4631, + "step": 1468 + }, + { + "epoch": 0.5224039829302988, + "grad_norm": 1.3314359188079834, + "learning_rate": 1.4774119685490047e-06, + "loss": 3.577, + "step": 1469 + }, + { + "epoch": 0.5227596017069701, + "grad_norm": 0.9494376182556152, + "learning_rate": 1.4756745745613736e-06, + "loss": 2.8563, + "step": 1470 + }, + { + "epoch": 0.5231152204836416, + "grad_norm": 0.8908175826072693, + "learning_rate": 1.4739372132160438e-06, + "loss": 2.7427, + "step": 1471 + }, + { + "epoch": 0.5234708392603129, + "grad_norm": 1.0303457975387573, + "learning_rate": 1.472199886844382e-06, + "loss": 3.2469, + "step": 1472 + }, + { + "epoch": 0.5238264580369844, + "grad_norm": 1.0263373851776123, + "learning_rate": 1.4704625977777066e-06, + "loss": 2.9298, + "step": 1473 + }, + { + "epoch": 0.5241820768136558, + "grad_norm": 1.3913066387176514, + "learning_rate": 1.4687253483472872e-06, + "loss": 2.9679, + "step": 1474 + }, + { + "epoch": 0.5245376955903271, + "grad_norm": 1.3191438913345337, + "learning_rate": 1.466988140884339e-06, + "loss": 3.7368, + "step": 1475 + }, + { + "epoch": 0.5248933143669986, + "grad_norm": 0.8257669806480408, + "learning_rate": 1.4652509777200228e-06, + "loss": 3.0847, + "step": 1476 + }, + { + "epoch": 0.5252489331436699, + "grad_norm": 1.0876842737197876, + "learning_rate": 1.4635138611854386e-06, + "loss": 2.9323, + "step": 1477 + }, + { + "epoch": 0.5256045519203414, + "grad_norm": 0.7772853374481201, + "learning_rate": 1.4617767936116231e-06, + "loss": 2.6031, + "step": 1478 + }, + { + "epoch": 0.5259601706970128, + "grad_norm": 1.0743046998977661, + "learning_rate": 1.4600397773295494e-06, + "loss": 2.1765, + "step": 1479 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8423651456832886, + "learning_rate": 1.458302814670119e-06, + "loss": 2.2174, + "step": 1480 + }, + { + "epoch": 0.5266714082503556, + "grad_norm": 1.7881324291229248, + "learning_rate": 1.4565659079641645e-06, + "loss": 3.065, + "step": 1481 + }, + { + "epoch": 0.527027027027027, + "grad_norm": 2.031830310821533, + "learning_rate": 1.4548290595424413e-06, + "loss": 3.2143, + "step": 1482 + }, + { + "epoch": 0.5273826458036984, + "grad_norm": 1.0049747228622437, + "learning_rate": 1.4530922717356269e-06, + "loss": 3.0329, + "step": 1483 + }, + { + "epoch": 0.5277382645803699, + "grad_norm": 0.7155492305755615, + "learning_rate": 1.4513555468743191e-06, + "loss": 1.3468, + "step": 1484 + }, + { + "epoch": 0.5280938833570412, + "grad_norm": 1.9398192167282104, + "learning_rate": 1.4496188872890285e-06, + "loss": 2.2357, + "step": 1485 + }, + { + "epoch": 0.5284495021337127, + "grad_norm": 3.6682279109954834, + "learning_rate": 1.44788229531018e-06, + "loss": 2.0443, + "step": 1486 + }, + { + "epoch": 0.5288051209103841, + "grad_norm": 1.3508946895599365, + "learning_rate": 1.4461457732681072e-06, + "loss": 3.2821, + "step": 1487 + }, + { + "epoch": 0.5291607396870555, + "grad_norm": 0.8034976124763489, + "learning_rate": 1.4444093234930502e-06, + "loss": 2.6948, + "step": 1488 + }, + { + "epoch": 0.5295163584637269, + "grad_norm": 1.4893591403961182, + "learning_rate": 1.4426729483151525e-06, + "loss": 2.5336, + "step": 1489 + }, + { + "epoch": 0.5298719772403983, + "grad_norm": 1.251322627067566, + "learning_rate": 1.4409366500644556e-06, + "loss": 3.5009, + "step": 1490 + }, + { + "epoch": 0.5302275960170697, + "grad_norm": 0.8680931329727173, + "learning_rate": 1.4392004310709e-06, + "loss": 2.6385, + "step": 1491 + }, + { + "epoch": 0.5305832147937412, + "grad_norm": 1.864842414855957, + "learning_rate": 1.437464293664318e-06, + "loss": 3.2241, + "step": 1492 + }, + { + "epoch": 0.5309388335704125, + "grad_norm": 3.023165702819824, + "learning_rate": 1.4357282401744346e-06, + "loss": 2.9437, + "step": 1493 + }, + { + "epoch": 0.531294452347084, + "grad_norm": 0.9825669527053833, + "learning_rate": 1.4339922729308594e-06, + "loss": 2.4218, + "step": 1494 + }, + { + "epoch": 0.5316500711237553, + "grad_norm": 1.3121941089630127, + "learning_rate": 1.4322563942630889e-06, + "loss": 3.543, + "step": 1495 + }, + { + "epoch": 0.5320056899004267, + "grad_norm": 1.9199869632720947, + "learning_rate": 1.4305206065004996e-06, + "loss": 4.6015, + "step": 1496 + }, + { + "epoch": 0.5323613086770982, + "grad_norm": 0.9981526136398315, + "learning_rate": 1.4287849119723451e-06, + "loss": 2.4964, + "step": 1497 + }, + { + "epoch": 0.5327169274537695, + "grad_norm": 4.057604789733887, + "learning_rate": 1.4270493130077558e-06, + "loss": 4.2741, + "step": 1498 + }, + { + "epoch": 0.533072546230441, + "grad_norm": 1.136657476425171, + "learning_rate": 1.425313811935732e-06, + "loss": 3.2009, + "step": 1499 + }, + { + "epoch": 0.5334281650071123, + "grad_norm": 0.8216729164123535, + "learning_rate": 1.423578411085145e-06, + "loss": 2.5263, + "step": 1500 + }, + { + "epoch": 0.5337837837837838, + "grad_norm": 0.8289672136306763, + "learning_rate": 1.4218431127847282e-06, + "loss": 2.4234, + "step": 1501 + }, + { + "epoch": 0.5341394025604552, + "grad_norm": 1.045714020729065, + "learning_rate": 1.4201079193630802e-06, + "loss": 2.264, + "step": 1502 + }, + { + "epoch": 0.5344950213371266, + "grad_norm": 1.4927000999450684, + "learning_rate": 1.4183728331486586e-06, + "loss": 3.1367, + "step": 1503 + }, + { + "epoch": 0.534850640113798, + "grad_norm": 1.1352112293243408, + "learning_rate": 1.4166378564697757e-06, + "loss": 3.3413, + "step": 1504 + }, + { + "epoch": 0.5352062588904695, + "grad_norm": 1.3658416271209717, + "learning_rate": 1.4149029916545984e-06, + "loss": 4.1431, + "step": 1505 + }, + { + "epoch": 0.5355618776671408, + "grad_norm": 1.4818263053894043, + "learning_rate": 1.4131682410311418e-06, + "loss": 3.6289, + "step": 1506 + }, + { + "epoch": 0.5359174964438123, + "grad_norm": 1.1149414777755737, + "learning_rate": 1.411433606927269e-06, + "loss": 2.925, + "step": 1507 + }, + { + "epoch": 0.5362731152204836, + "grad_norm": 1.8448301553726196, + "learning_rate": 1.4096990916706866e-06, + "loss": 3.764, + "step": 1508 + }, + { + "epoch": 0.536628733997155, + "grad_norm": 1.2490705251693726, + "learning_rate": 1.4079646975889412e-06, + "loss": 2.618, + "step": 1509 + }, + { + "epoch": 0.5369843527738265, + "grad_norm": 0.9155372381210327, + "learning_rate": 1.4062304270094183e-06, + "loss": 2.8209, + "step": 1510 + }, + { + "epoch": 0.5373399715504978, + "grad_norm": 0.8197996020317078, + "learning_rate": 1.4044962822593351e-06, + "loss": 2.3953, + "step": 1511 + }, + { + "epoch": 0.5376955903271693, + "grad_norm": 1.1225457191467285, + "learning_rate": 1.4027622656657422e-06, + "loss": 2.5458, + "step": 1512 + }, + { + "epoch": 0.5380512091038406, + "grad_norm": 1.044634222984314, + "learning_rate": 1.401028379555517e-06, + "loss": 2.3806, + "step": 1513 + }, + { + "epoch": 0.5384068278805121, + "grad_norm": 1.0711735486984253, + "learning_rate": 1.399294626255362e-06, + "loss": 3.3705, + "step": 1514 + }, + { + "epoch": 0.5387624466571835, + "grad_norm": 1.4023677110671997, + "learning_rate": 1.3975610080918027e-06, + "loss": 3.1577, + "step": 1515 + }, + { + "epoch": 0.5391180654338549, + "grad_norm": 1.100409746170044, + "learning_rate": 1.3958275273911813e-06, + "loss": 2.1731, + "step": 1516 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.9197340607643127, + "learning_rate": 1.3940941864796572e-06, + "loss": 3.6485, + "step": 1517 + }, + { + "epoch": 0.5398293029871978, + "grad_norm": 1.0701816082000732, + "learning_rate": 1.3923609876832e-06, + "loss": 1.8824, + "step": 1518 + }, + { + "epoch": 0.5401849217638691, + "grad_norm": 0.902917742729187, + "learning_rate": 1.3906279333275922e-06, + "loss": 1.8996, + "step": 1519 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.0297616720199585, + "learning_rate": 1.3888950257384183e-06, + "loss": 2.4632, + "step": 1520 + }, + { + "epoch": 0.5408961593172119, + "grad_norm": 1.022627353668213, + "learning_rate": 1.3871622672410694e-06, + "loss": 2.9029, + "step": 1521 + }, + { + "epoch": 0.5412517780938834, + "grad_norm": 1.035480260848999, + "learning_rate": 1.3854296601607352e-06, + "loss": 2.4614, + "step": 1522 + }, + { + "epoch": 0.5416073968705548, + "grad_norm": 1.054465651512146, + "learning_rate": 1.3836972068224006e-06, + "loss": 2.7366, + "step": 1523 + }, + { + "epoch": 0.5419630156472262, + "grad_norm": 1.502935528755188, + "learning_rate": 1.381964909550847e-06, + "loss": 3.5118, + "step": 1524 + }, + { + "epoch": 0.5423186344238976, + "grad_norm": 1.0489780902862549, + "learning_rate": 1.3802327706706443e-06, + "loss": 2.6502, + "step": 1525 + }, + { + "epoch": 0.542674253200569, + "grad_norm": 0.9073423743247986, + "learning_rate": 1.3785007925061512e-06, + "loss": 2.4342, + "step": 1526 + }, + { + "epoch": 0.5430298719772404, + "grad_norm": 0.6701890230178833, + "learning_rate": 1.3767689773815093e-06, + "loss": 2.102, + "step": 1527 + }, + { + "epoch": 0.5433854907539118, + "grad_norm": 1.1082252264022827, + "learning_rate": 1.375037327620643e-06, + "loss": 2.8031, + "step": 1528 + }, + { + "epoch": 0.5437411095305832, + "grad_norm": 1.8333821296691895, + "learning_rate": 1.3733058455472538e-06, + "loss": 3.0663, + "step": 1529 + }, + { + "epoch": 0.5440967283072546, + "grad_norm": 0.7892056107521057, + "learning_rate": 1.3715745334848181e-06, + "loss": 2.4321, + "step": 1530 + }, + { + "epoch": 0.544452347083926, + "grad_norm": 0.9996071457862854, + "learning_rate": 1.3698433937565855e-06, + "loss": 3.2907, + "step": 1531 + }, + { + "epoch": 0.5448079658605974, + "grad_norm": 1.2443190813064575, + "learning_rate": 1.368112428685572e-06, + "loss": 2.9022, + "step": 1532 + }, + { + "epoch": 0.5451635846372689, + "grad_norm": 2.0706276893615723, + "learning_rate": 1.366381640594561e-06, + "loss": 2.7621, + "step": 1533 + }, + { + "epoch": 0.5455192034139402, + "grad_norm": 0.9573854207992554, + "learning_rate": 1.3646510318060986e-06, + "loss": 2.1287, + "step": 1534 + }, + { + "epoch": 0.5458748221906117, + "grad_norm": 1.283465027809143, + "learning_rate": 1.3629206046424888e-06, + "loss": 2.1025, + "step": 1535 + }, + { + "epoch": 0.5462304409672831, + "grad_norm": 1.6512552499771118, + "learning_rate": 1.361190361425794e-06, + "loss": 3.3889, + "step": 1536 + }, + { + "epoch": 0.5465860597439545, + "grad_norm": 0.9411128163337708, + "learning_rate": 1.3594603044778266e-06, + "loss": 2.1518, + "step": 1537 + }, + { + "epoch": 0.5469416785206259, + "grad_norm": 6.611198425292969, + "learning_rate": 1.357730436120153e-06, + "loss": 2.2192, + "step": 1538 + }, + { + "epoch": 0.5472972972972973, + "grad_norm": 1.2217298746109009, + "learning_rate": 1.3560007586740824e-06, + "loss": 2.7573, + "step": 1539 + }, + { + "epoch": 0.5476529160739687, + "grad_norm": 2.388923168182373, + "learning_rate": 1.3542712744606712e-06, + "loss": 3.6021, + "step": 1540 + }, + { + "epoch": 0.5480085348506402, + "grad_norm": 0.6587908864021301, + "learning_rate": 1.3525419858007154e-06, + "loss": 2.6506, + "step": 1541 + }, + { + "epoch": 0.5483641536273115, + "grad_norm": 0.939460039138794, + "learning_rate": 1.3508128950147474e-06, + "loss": 2.7177, + "step": 1542 + }, + { + "epoch": 0.548719772403983, + "grad_norm": 0.850816011428833, + "learning_rate": 1.3490840044230361e-06, + "loss": 2.5217, + "step": 1543 + }, + { + "epoch": 0.5490753911806543, + "grad_norm": 1.1026942729949951, + "learning_rate": 1.34735531634558e-06, + "loss": 3.1526, + "step": 1544 + }, + { + "epoch": 0.5494310099573257, + "grad_norm": 1.3605300188064575, + "learning_rate": 1.3456268331021066e-06, + "loss": 2.5522, + "step": 1545 + }, + { + "epoch": 0.5497866287339972, + "grad_norm": 1.1707429885864258, + "learning_rate": 1.3438985570120686e-06, + "loss": 2.6716, + "step": 1546 + }, + { + "epoch": 0.5501422475106685, + "grad_norm": 1.487268090248108, + "learning_rate": 1.3421704903946404e-06, + "loss": 3.166, + "step": 1547 + }, + { + "epoch": 0.55049786628734, + "grad_norm": 1.089787244796753, + "learning_rate": 1.3404426355687166e-06, + "loss": 3.0245, + "step": 1548 + }, + { + "epoch": 0.5508534850640113, + "grad_norm": 0.8312658071517944, + "learning_rate": 1.3387149948529052e-06, + "loss": 2.6101, + "step": 1549 + }, + { + "epoch": 0.5512091038406828, + "grad_norm": 0.8008679151535034, + "learning_rate": 1.3369875705655286e-06, + "loss": 2.7998, + "step": 1550 + }, + { + "epoch": 0.5515647226173542, + "grad_norm": 1.4112894535064697, + "learning_rate": 1.3352603650246184e-06, + "loss": 3.4024, + "step": 1551 + }, + { + "epoch": 0.5519203413940256, + "grad_norm": 2.3961293697357178, + "learning_rate": 1.3335333805479128e-06, + "loss": 2.5573, + "step": 1552 + }, + { + "epoch": 0.552275960170697, + "grad_norm": 1.5547196865081787, + "learning_rate": 1.3318066194528535e-06, + "loss": 3.2089, + "step": 1553 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 1.0061249732971191, + "learning_rate": 1.3300800840565813e-06, + "loss": 2.5462, + "step": 1554 + }, + { + "epoch": 0.5529871977240398, + "grad_norm": 2.157578706741333, + "learning_rate": 1.3283537766759356e-06, + "loss": 3.1488, + "step": 1555 + }, + { + "epoch": 0.5533428165007113, + "grad_norm": 0.9172126054763794, + "learning_rate": 1.326627699627449e-06, + "loss": 2.2377, + "step": 1556 + }, + { + "epoch": 0.5536984352773826, + "grad_norm": 8.375680923461914, + "learning_rate": 1.3249018552273454e-06, + "loss": 3.9321, + "step": 1557 + }, + { + "epoch": 0.5540540540540541, + "grad_norm": 0.9361571669578552, + "learning_rate": 1.3231762457915358e-06, + "loss": 3.2137, + "step": 1558 + }, + { + "epoch": 0.5544096728307255, + "grad_norm": 0.8818283081054688, + "learning_rate": 1.3214508736356167e-06, + "loss": 2.6707, + "step": 1559 + }, + { + "epoch": 0.5547652916073968, + "grad_norm": 1.1433403491973877, + "learning_rate": 1.3197257410748666e-06, + "loss": 3.4449, + "step": 1560 + }, + { + "epoch": 0.5551209103840683, + "grad_norm": 0.9566643834114075, + "learning_rate": 1.3180008504242407e-06, + "loss": 2.5582, + "step": 1561 + }, + { + "epoch": 0.5554765291607396, + "grad_norm": 0.8360999822616577, + "learning_rate": 1.3162762039983717e-06, + "loss": 2.5683, + "step": 1562 + }, + { + "epoch": 0.5558321479374111, + "grad_norm": 1.315352201461792, + "learning_rate": 1.3145518041115625e-06, + "loss": 3.2864, + "step": 1563 + }, + { + "epoch": 0.5561877667140825, + "grad_norm": 0.9605684280395508, + "learning_rate": 1.3128276530777875e-06, + "loss": 2.4372, + "step": 1564 + }, + { + "epoch": 0.5565433854907539, + "grad_norm": 1.056595802307129, + "learning_rate": 1.3111037532106844e-06, + "loss": 2.1708, + "step": 1565 + }, + { + "epoch": 0.5568990042674253, + "grad_norm": 1.0051207542419434, + "learning_rate": 1.3093801068235563e-06, + "loss": 3.1301, + "step": 1566 + }, + { + "epoch": 0.5572546230440967, + "grad_norm": 1.9133470058441162, + "learning_rate": 1.3076567162293656e-06, + "loss": 3.588, + "step": 1567 + }, + { + "epoch": 0.5576102418207681, + "grad_norm": 0.8696346879005432, + "learning_rate": 1.3059335837407297e-06, + "loss": 2.2534, + "step": 1568 + }, + { + "epoch": 0.5579658605974396, + "grad_norm": 2.3601417541503906, + "learning_rate": 1.304210711669923e-06, + "loss": 3.0322, + "step": 1569 + }, + { + "epoch": 0.5583214793741109, + "grad_norm": 1.2175562381744385, + "learning_rate": 1.3024881023288663e-06, + "loss": 3.4811, + "step": 1570 + }, + { + "epoch": 0.5586770981507824, + "grad_norm": 2.0281405448913574, + "learning_rate": 1.3007657580291316e-06, + "loss": 4.4554, + "step": 1571 + }, + { + "epoch": 0.5590327169274538, + "grad_norm": 1.3114901781082153, + "learning_rate": 1.2990436810819324e-06, + "loss": 2.6664, + "step": 1572 + }, + { + "epoch": 0.5593883357041252, + "grad_norm": 0.9831101894378662, + "learning_rate": 1.2973218737981256e-06, + "loss": 3.2115, + "step": 1573 + }, + { + "epoch": 0.5597439544807966, + "grad_norm": 0.8332681655883789, + "learning_rate": 1.2956003384882055e-06, + "loss": 3.0125, + "step": 1574 + }, + { + "epoch": 0.560099573257468, + "grad_norm": 1.152746319770813, + "learning_rate": 1.2938790774623002e-06, + "loss": 2.823, + "step": 1575 + }, + { + "epoch": 0.5604551920341394, + "grad_norm": 1.0294358730316162, + "learning_rate": 1.292158093030172e-06, + "loss": 3.2273, + "step": 1576 + }, + { + "epoch": 0.5608108108108109, + "grad_norm": 3.760831832885742, + "learning_rate": 1.2904373875012097e-06, + "loss": 3.695, + "step": 1577 + }, + { + "epoch": 0.5611664295874822, + "grad_norm": 0.6936827898025513, + "learning_rate": 1.2887169631844292e-06, + "loss": 2.4113, + "step": 1578 + }, + { + "epoch": 0.5615220483641536, + "grad_norm": 1.104067087173462, + "learning_rate": 1.2869968223884697e-06, + "loss": 2.7163, + "step": 1579 + }, + { + "epoch": 0.561877667140825, + "grad_norm": 0.8611269593238831, + "learning_rate": 1.2852769674215878e-06, + "loss": 2.5977, + "step": 1580 + }, + { + "epoch": 0.5622332859174964, + "grad_norm": 0.9159924983978271, + "learning_rate": 1.2835574005916594e-06, + "loss": 2.3552, + "step": 1581 + }, + { + "epoch": 0.5625889046941679, + "grad_norm": 1.43521249294281, + "learning_rate": 1.2818381242061703e-06, + "loss": 2.9112, + "step": 1582 + }, + { + "epoch": 0.5629445234708392, + "grad_norm": 1.5094164609909058, + "learning_rate": 1.2801191405722199e-06, + "loss": 3.7183, + "step": 1583 + }, + { + "epoch": 0.5633001422475107, + "grad_norm": 1.0878278017044067, + "learning_rate": 1.2784004519965124e-06, + "loss": 2.7389, + "step": 1584 + }, + { + "epoch": 0.563655761024182, + "grad_norm": 1.072373628616333, + "learning_rate": 1.2766820607853568e-06, + "loss": 3.0085, + "step": 1585 + }, + { + "epoch": 0.5640113798008535, + "grad_norm": 1.3602298498153687, + "learning_rate": 1.2749639692446645e-06, + "loss": 2.5601, + "step": 1586 + }, + { + "epoch": 0.5643669985775249, + "grad_norm": 0.9332229495048523, + "learning_rate": 1.273246179679942e-06, + "loss": 2.386, + "step": 1587 + }, + { + "epoch": 0.5647226173541963, + "grad_norm": 0.866671085357666, + "learning_rate": 1.2715286943962925e-06, + "loss": 1.8631, + "step": 1588 + }, + { + "epoch": 0.5650782361308677, + "grad_norm": 0.9761409163475037, + "learning_rate": 1.2698115156984105e-06, + "loss": 2.7495, + "step": 1589 + }, + { + "epoch": 0.5654338549075392, + "grad_norm": 0.946271538734436, + "learning_rate": 1.2680946458905797e-06, + "loss": 2.5076, + "step": 1590 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 1.3151607513427734, + "learning_rate": 1.266378087276667e-06, + "loss": 3.0267, + "step": 1591 + }, + { + "epoch": 0.566145092460882, + "grad_norm": 0.9644737839698792, + "learning_rate": 1.2646618421601244e-06, + "loss": 2.9274, + "step": 1592 + }, + { + "epoch": 0.5665007112375533, + "grad_norm": 1.4392282962799072, + "learning_rate": 1.2629459128439825e-06, + "loss": 4.1107, + "step": 1593 + }, + { + "epoch": 0.5668563300142248, + "grad_norm": 0.7777478694915771, + "learning_rate": 1.2612303016308466e-06, + "loss": 2.0112, + "step": 1594 + }, + { + "epoch": 0.5672119487908962, + "grad_norm": 1.2344564199447632, + "learning_rate": 1.2595150108228978e-06, + "loss": 2.6722, + "step": 1595 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.6110988855361938, + "learning_rate": 1.2578000427218845e-06, + "loss": 3.2461, + "step": 1596 + }, + { + "epoch": 0.567923186344239, + "grad_norm": 1.2645403146743774, + "learning_rate": 1.2560853996291234e-06, + "loss": 3.0357, + "step": 1597 + }, + { + "epoch": 0.5682788051209103, + "grad_norm": 0.6805746555328369, + "learning_rate": 1.2543710838454963e-06, + "loss": 2.2095, + "step": 1598 + }, + { + "epoch": 0.5686344238975818, + "grad_norm": 0.8025508522987366, + "learning_rate": 1.2526570976714426e-06, + "loss": 2.2406, + "step": 1599 + }, + { + "epoch": 0.5689900426742532, + "grad_norm": 1.3613890409469604, + "learning_rate": 1.2509434434069624e-06, + "loss": 3.2733, + "step": 1600 + }, + { + "epoch": 0.5693456614509246, + "grad_norm": 1.5672816038131714, + "learning_rate": 1.2492301233516088e-06, + "loss": 2.1228, + "step": 1601 + }, + { + "epoch": 0.569701280227596, + "grad_norm": 1.224216341972351, + "learning_rate": 1.247517139804488e-06, + "loss": 3.1221, + "step": 1602 + }, + { + "epoch": 0.5700568990042674, + "grad_norm": 1.2097480297088623, + "learning_rate": 1.2458044950642518e-06, + "loss": 3.5238, + "step": 1603 + }, + { + "epoch": 0.5704125177809388, + "grad_norm": 0.7986658215522766, + "learning_rate": 1.2440921914291e-06, + "loss": 2.1428, + "step": 1604 + }, + { + "epoch": 0.5707681365576103, + "grad_norm": 0.8093884587287903, + "learning_rate": 1.2423802311967741e-06, + "loss": 2.3735, + "step": 1605 + }, + { + "epoch": 0.5711237553342816, + "grad_norm": 1.543238878250122, + "learning_rate": 1.2406686166645538e-06, + "loss": 2.5391, + "step": 1606 + }, + { + "epoch": 0.5714793741109531, + "grad_norm": 0.7497166395187378, + "learning_rate": 1.2389573501292566e-06, + "loss": 2.1158, + "step": 1607 + }, + { + "epoch": 0.5718349928876245, + "grad_norm": 2.5874087810516357, + "learning_rate": 1.2372464338872303e-06, + "loss": 4.1061, + "step": 1608 + }, + { + "epoch": 0.5721906116642959, + "grad_norm": 0.7466740608215332, + "learning_rate": 1.235535870234356e-06, + "loss": 2.5177, + "step": 1609 + }, + { + "epoch": 0.5725462304409673, + "grad_norm": 0.9618914127349854, + "learning_rate": 1.2338256614660385e-06, + "loss": 2.3485, + "step": 1610 + }, + { + "epoch": 0.5729018492176386, + "grad_norm": 1.0836193561553955, + "learning_rate": 1.232115809877209e-06, + "loss": 2.6279, + "step": 1611 + }, + { + "epoch": 0.5732574679943101, + "grad_norm": 1.164204716682434, + "learning_rate": 1.2304063177623183e-06, + "loss": 1.9965, + "step": 1612 + }, + { + "epoch": 0.5736130867709816, + "grad_norm": 1.19977867603302, + "learning_rate": 1.2286971874153336e-06, + "loss": 3.4786, + "step": 1613 + }, + { + "epoch": 0.5739687055476529, + "grad_norm": 1.0600868463516235, + "learning_rate": 1.226988421129739e-06, + "loss": 2.793, + "step": 1614 + }, + { + "epoch": 0.5743243243243243, + "grad_norm": 1.8599703311920166, + "learning_rate": 1.2252800211985282e-06, + "loss": 4.0723, + "step": 1615 + }, + { + "epoch": 0.5746799431009957, + "grad_norm": 1.1172808408737183, + "learning_rate": 1.2235719899142043e-06, + "loss": 3.125, + "step": 1616 + }, + { + "epoch": 0.5750355618776671, + "grad_norm": 1.0137203931808472, + "learning_rate": 1.2218643295687758e-06, + "loss": 2.6931, + "step": 1617 + }, + { + "epoch": 0.5753911806543386, + "grad_norm": 1.0707441568374634, + "learning_rate": 1.220157042453752e-06, + "loss": 3.1215, + "step": 1618 + }, + { + "epoch": 0.5757467994310099, + "grad_norm": 1.0839987993240356, + "learning_rate": 1.2184501308601438e-06, + "loss": 2.9256, + "step": 1619 + }, + { + "epoch": 0.5761024182076814, + "grad_norm": 1.1116533279418945, + "learning_rate": 1.2167435970784554e-06, + "loss": 2.6292, + "step": 1620 + }, + { + "epoch": 0.5764580369843528, + "grad_norm": 0.8376097679138184, + "learning_rate": 1.2150374433986861e-06, + "loss": 2.4503, + "step": 1621 + }, + { + "epoch": 0.5768136557610242, + "grad_norm": 1.219063639640808, + "learning_rate": 1.213331672110324e-06, + "loss": 3.2536, + "step": 1622 + }, + { + "epoch": 0.5771692745376956, + "grad_norm": 1.035886526107788, + "learning_rate": 1.2116262855023447e-06, + "loss": 2.5457, + "step": 1623 + }, + { + "epoch": 0.577524893314367, + "grad_norm": 1.4368425607681274, + "learning_rate": 1.2099212858632084e-06, + "loss": 3.0835, + "step": 1624 + }, + { + "epoch": 0.5778805120910384, + "grad_norm": 0.9798745512962341, + "learning_rate": 1.2082166754808534e-06, + "loss": 2.835, + "step": 1625 + }, + { + "epoch": 0.5782361308677099, + "grad_norm": 1.0871566534042358, + "learning_rate": 1.2065124566426982e-06, + "loss": 3.0555, + "step": 1626 + }, + { + "epoch": 0.5785917496443812, + "grad_norm": 1.1869548559188843, + "learning_rate": 1.2048086316356347e-06, + "loss": 2.8083, + "step": 1627 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 2.8214962482452393, + "learning_rate": 1.2031052027460272e-06, + "loss": 3.5476, + "step": 1628 + }, + { + "epoch": 0.579302987197724, + "grad_norm": 2.2772343158721924, + "learning_rate": 1.2014021722597067e-06, + "loss": 3.5973, + "step": 1629 + }, + { + "epoch": 0.5796586059743954, + "grad_norm": 1.0679072141647339, + "learning_rate": 1.1996995424619715e-06, + "loss": 2.281, + "step": 1630 + }, + { + "epoch": 0.5800142247510669, + "grad_norm": 0.966766357421875, + "learning_rate": 1.1979973156375815e-06, + "loss": 2.5088, + "step": 1631 + }, + { + "epoch": 0.5803698435277382, + "grad_norm": 1.252256989479065, + "learning_rate": 1.1962954940707553e-06, + "loss": 3.3415, + "step": 1632 + }, + { + "epoch": 0.5807254623044097, + "grad_norm": 1.058682918548584, + "learning_rate": 1.194594080045169e-06, + "loss": 2.9808, + "step": 1633 + }, + { + "epoch": 0.581081081081081, + "grad_norm": 1.6873550415039062, + "learning_rate": 1.19289307584395e-06, + "loss": 2.3822, + "step": 1634 + }, + { + "epoch": 0.5814366998577525, + "grad_norm": 0.9933912754058838, + "learning_rate": 1.1911924837496776e-06, + "loss": 2.0247, + "step": 1635 + }, + { + "epoch": 0.5817923186344239, + "grad_norm": 0.9728233218193054, + "learning_rate": 1.1894923060443763e-06, + "loss": 2.694, + "step": 1636 + }, + { + "epoch": 0.5821479374110953, + "grad_norm": 1.2525315284729004, + "learning_rate": 1.1877925450095162e-06, + "loss": 2.4682, + "step": 1637 + }, + { + "epoch": 0.5825035561877667, + "grad_norm": 0.9505037665367126, + "learning_rate": 1.1860932029260074e-06, + "loss": 2.9338, + "step": 1638 + }, + { + "epoch": 0.5828591749644382, + "grad_norm": 0.7580243945121765, + "learning_rate": 1.1843942820741978e-06, + "loss": 2.2824, + "step": 1639 + }, + { + "epoch": 0.5832147937411095, + "grad_norm": 2.1714565753936768, + "learning_rate": 1.182695784733871e-06, + "loss": 3.486, + "step": 1640 + }, + { + "epoch": 0.583570412517781, + "grad_norm": 1.3893463611602783, + "learning_rate": 1.18099771318424e-06, + "loss": 2.9129, + "step": 1641 + }, + { + "epoch": 0.5839260312944523, + "grad_norm": 1.181989073753357, + "learning_rate": 1.1793000697039486e-06, + "loss": 3.0369, + "step": 1642 + }, + { + "epoch": 0.5842816500711238, + "grad_norm": 0.9645770788192749, + "learning_rate": 1.1776028565710662e-06, + "loss": 2.6741, + "step": 1643 + }, + { + "epoch": 0.5846372688477952, + "grad_norm": 1.262559175491333, + "learning_rate": 1.175906076063083e-06, + "loss": 2.0482, + "step": 1644 + }, + { + "epoch": 0.5849928876244666, + "grad_norm": 0.8001596927642822, + "learning_rate": 1.1742097304569108e-06, + "loss": 2.2099, + "step": 1645 + }, + { + "epoch": 0.585348506401138, + "grad_norm": 1.2755495309829712, + "learning_rate": 1.1725138220288755e-06, + "loss": 3.7783, + "step": 1646 + }, + { + "epoch": 0.5857041251778093, + "grad_norm": 1.6751089096069336, + "learning_rate": 1.1708183530547182e-06, + "loss": 3.3778, + "step": 1647 + }, + { + "epoch": 0.5860597439544808, + "grad_norm": 1.4346178770065308, + "learning_rate": 1.169123325809589e-06, + "loss": 3.2525, + "step": 1648 + }, + { + "epoch": 0.5864153627311522, + "grad_norm": 0.9830198884010315, + "learning_rate": 1.1674287425680465e-06, + "loss": 3.1095, + "step": 1649 + }, + { + "epoch": 0.5867709815078236, + "grad_norm": 1.1481764316558838, + "learning_rate": 1.1657346056040533e-06, + "loss": 2.9655, + "step": 1650 + }, + { + "epoch": 0.587126600284495, + "grad_norm": 1.1823936700820923, + "learning_rate": 1.1640409171909713e-06, + "loss": 2.6934, + "step": 1651 + }, + { + "epoch": 0.5874822190611664, + "grad_norm": 1.116373062133789, + "learning_rate": 1.1623476796015631e-06, + "loss": 2.9082, + "step": 1652 + }, + { + "epoch": 0.5878378378378378, + "grad_norm": 1.2296828031539917, + "learning_rate": 1.1606548951079843e-06, + "loss": 2.359, + "step": 1653 + }, + { + "epoch": 0.5881934566145093, + "grad_norm": 3.2339799404144287, + "learning_rate": 1.1589625659817845e-06, + "loss": 3.2091, + "step": 1654 + }, + { + "epoch": 0.5885490753911806, + "grad_norm": 0.8671964406967163, + "learning_rate": 1.1572706944938997e-06, + "loss": 2.5883, + "step": 1655 + }, + { + "epoch": 0.5889046941678521, + "grad_norm": 0.9265004396438599, + "learning_rate": 1.1555792829146535e-06, + "loss": 2.2744, + "step": 1656 + }, + { + "epoch": 0.5892603129445235, + "grad_norm": 1.104708194732666, + "learning_rate": 1.153888333513753e-06, + "loss": 2.1695, + "step": 1657 + }, + { + "epoch": 0.5896159317211949, + "grad_norm": 0.9018216729164124, + "learning_rate": 1.1521978485602826e-06, + "loss": 2.1523, + "step": 1658 + }, + { + "epoch": 0.5899715504978663, + "grad_norm": 1.0308228731155396, + "learning_rate": 1.150507830322706e-06, + "loss": 2.8173, + "step": 1659 + }, + { + "epoch": 0.5903271692745377, + "grad_norm": 1.3419702053070068, + "learning_rate": 1.1488182810688594e-06, + "loss": 3.1771, + "step": 1660 + }, + { + "epoch": 0.5906827880512091, + "grad_norm": 1.9020614624023438, + "learning_rate": 1.1471292030659493e-06, + "loss": 3.6308, + "step": 1661 + }, + { + "epoch": 0.5910384068278806, + "grad_norm": 1.7927685976028442, + "learning_rate": 1.1454405985805515e-06, + "loss": 3.2815, + "step": 1662 + }, + { + "epoch": 0.5913940256045519, + "grad_norm": 1.5071463584899902, + "learning_rate": 1.143752469878604e-06, + "loss": 2.6997, + "step": 1663 + }, + { + "epoch": 0.5917496443812233, + "grad_norm": 1.2682580947875977, + "learning_rate": 1.1420648192254086e-06, + "loss": 3.4043, + "step": 1664 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.8343205451965332, + "learning_rate": 1.140377648885624e-06, + "loss": 1.9375, + "step": 1665 + }, + { + "epoch": 0.5924608819345661, + "grad_norm": 1.447309970855713, + "learning_rate": 1.1386909611232657e-06, + "loss": 2.7547, + "step": 1666 + }, + { + "epoch": 0.5928165007112376, + "grad_norm": 1.1791456937789917, + "learning_rate": 1.1370047582016995e-06, + "loss": 2.8306, + "step": 1667 + }, + { + "epoch": 0.5931721194879089, + "grad_norm": 1.055708885192871, + "learning_rate": 1.1353190423836432e-06, + "loss": 1.4711, + "step": 1668 + }, + { + "epoch": 0.5935277382645804, + "grad_norm": 1.1678980588912964, + "learning_rate": 1.1336338159311596e-06, + "loss": 2.8645, + "step": 1669 + }, + { + "epoch": 0.5938833570412517, + "grad_norm": 1.2266043424606323, + "learning_rate": 1.1319490811056548e-06, + "loss": 2.6213, + "step": 1670 + }, + { + "epoch": 0.5942389758179232, + "grad_norm": 1.3018580675125122, + "learning_rate": 1.130264840167876e-06, + "loss": 3.4013, + "step": 1671 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 1.3630139827728271, + "learning_rate": 1.1285810953779057e-06, + "loss": 2.8088, + "step": 1672 + }, + { + "epoch": 0.594950213371266, + "grad_norm": 0.9662373065948486, + "learning_rate": 1.1268978489951631e-06, + "loss": 2.7605, + "step": 1673 + }, + { + "epoch": 0.5953058321479374, + "grad_norm": 1.0017868280410767, + "learning_rate": 1.1252151032783965e-06, + "loss": 2.6343, + "step": 1674 + }, + { + "epoch": 0.5956614509246089, + "grad_norm": 1.3229529857635498, + "learning_rate": 1.123532860485684e-06, + "loss": 2.2027, + "step": 1675 + }, + { + "epoch": 0.5960170697012802, + "grad_norm": 1.4044855833053589, + "learning_rate": 1.1218511228744283e-06, + "loss": 1.0209, + "step": 1676 + }, + { + "epoch": 0.5963726884779517, + "grad_norm": 1.4488521814346313, + "learning_rate": 1.1201698927013532e-06, + "loss": 0.7367, + "step": 1677 + }, + { + "epoch": 0.596728307254623, + "grad_norm": 1.0793721675872803, + "learning_rate": 1.1184891722225031e-06, + "loss": 3.0298, + "step": 1678 + }, + { + "epoch": 0.5970839260312945, + "grad_norm": 1.478691816329956, + "learning_rate": 1.116808963693237e-06, + "loss": 2.731, + "step": 1679 + }, + { + "epoch": 0.5974395448079659, + "grad_norm": 1.448178768157959, + "learning_rate": 1.1151292693682276e-06, + "loss": 2.4577, + "step": 1680 + }, + { + "epoch": 0.5977951635846372, + "grad_norm": 2.3192074298858643, + "learning_rate": 1.1134500915014587e-06, + "loss": 3.4398, + "step": 1681 + }, + { + "epoch": 0.5981507823613087, + "grad_norm": 1.0344318151474, + "learning_rate": 1.1117714323462188e-06, + "loss": 3.0702, + "step": 1682 + }, + { + "epoch": 0.59850640113798, + "grad_norm": 1.4172985553741455, + "learning_rate": 1.1100932941551027e-06, + "loss": 2.6222, + "step": 1683 + }, + { + "epoch": 0.5988620199146515, + "grad_norm": 1.0687344074249268, + "learning_rate": 1.1084156791800035e-06, + "loss": 3.4314, + "step": 1684 + }, + { + "epoch": 0.5992176386913229, + "grad_norm": 1.4657456874847412, + "learning_rate": 1.1067385896721148e-06, + "loss": 3.8971, + "step": 1685 + }, + { + "epoch": 0.5995732574679943, + "grad_norm": 1.1157350540161133, + "learning_rate": 1.1050620278819233e-06, + "loss": 2.77, + "step": 1686 + }, + { + "epoch": 0.5999288762446657, + "grad_norm": 0.9183606505393982, + "learning_rate": 1.1033859960592081e-06, + "loss": 2.5319, + "step": 1687 + }, + { + "epoch": 0.6002844950213371, + "grad_norm": 0.842345654964447, + "learning_rate": 1.1017104964530383e-06, + "loss": 2.4813, + "step": 1688 + }, + { + "epoch": 0.6006401137980085, + "grad_norm": 0.9420804977416992, + "learning_rate": 1.1000355313117662e-06, + "loss": 2.6017, + "step": 1689 + }, + { + "epoch": 0.60099573257468, + "grad_norm": 0.9706472158432007, + "learning_rate": 1.0983611028830292e-06, + "loss": 2.4428, + "step": 1690 + }, + { + "epoch": 0.6013513513513513, + "grad_norm": 1.1689740419387817, + "learning_rate": 1.0966872134137437e-06, + "loss": 2.1547, + "step": 1691 + }, + { + "epoch": 0.6017069701280228, + "grad_norm": 1.170046091079712, + "learning_rate": 1.095013865150103e-06, + "loss": 2.4999, + "step": 1692 + }, + { + "epoch": 0.6020625889046942, + "grad_norm": 0.8130316138267517, + "learning_rate": 1.0933410603375736e-06, + "loss": 2.4691, + "step": 1693 + }, + { + "epoch": 0.6024182076813656, + "grad_norm": 0.7871066331863403, + "learning_rate": 1.091668801220893e-06, + "loss": 2.3294, + "step": 1694 + }, + { + "epoch": 0.602773826458037, + "grad_norm": 0.9585657119750977, + "learning_rate": 1.0899970900440677e-06, + "loss": 2.4801, + "step": 1695 + }, + { + "epoch": 0.6031294452347084, + "grad_norm": 1.5794215202331543, + "learning_rate": 1.0883259290503664e-06, + "loss": 2.4579, + "step": 1696 + }, + { + "epoch": 0.6034850640113798, + "grad_norm": 0.8226059079170227, + "learning_rate": 1.0866553204823224e-06, + "loss": 2.5777, + "step": 1697 + }, + { + "epoch": 0.6038406827880513, + "grad_norm": 0.9495562314987183, + "learning_rate": 1.0849852665817248e-06, + "loss": 1.7837, + "step": 1698 + }, + { + "epoch": 0.6041963015647226, + "grad_norm": 1.9423400163650513, + "learning_rate": 1.0833157695896213e-06, + "loss": 2.7107, + "step": 1699 + }, + { + "epoch": 0.604551920341394, + "grad_norm": 11.418302536010742, + "learning_rate": 1.0816468317463094e-06, + "loss": 4.0404, + "step": 1700 + }, + { + "epoch": 0.6049075391180654, + "grad_norm": 0.8017799258232117, + "learning_rate": 1.0799784552913382e-06, + "loss": 2.4763, + "step": 1701 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 1.3110682964324951, + "learning_rate": 1.0783106424635034e-06, + "loss": 3.7797, + "step": 1702 + }, + { + "epoch": 0.6056187766714083, + "grad_norm": 0.8757337331771851, + "learning_rate": 1.0766433955008433e-06, + "loss": 2.2407, + "step": 1703 + }, + { + "epoch": 0.6059743954480796, + "grad_norm": 4.876639366149902, + "learning_rate": 1.0749767166406384e-06, + "loss": 1.4203, + "step": 1704 + }, + { + "epoch": 0.6063300142247511, + "grad_norm": 1.3712618350982666, + "learning_rate": 1.0733106081194049e-06, + "loss": 3.3901, + "step": 1705 + }, + { + "epoch": 0.6066856330014224, + "grad_norm": 1.6431909799575806, + "learning_rate": 1.071645072172895e-06, + "loss": 4.0617, + "step": 1706 + }, + { + "epoch": 0.6070412517780939, + "grad_norm": 0.7992070317268372, + "learning_rate": 1.0699801110360926e-06, + "loss": 1.6454, + "step": 1707 + }, + { + "epoch": 0.6073968705547653, + "grad_norm": 2.7122933864593506, + "learning_rate": 1.0683157269432096e-06, + "loss": 4.6507, + "step": 1708 + }, + { + "epoch": 0.6077524893314367, + "grad_norm": 0.9466752409934998, + "learning_rate": 1.0666519221276849e-06, + "loss": 2.7897, + "step": 1709 + }, + { + "epoch": 0.6081081081081081, + "grad_norm": 1.3168225288391113, + "learning_rate": 1.0649886988221775e-06, + "loss": 3.1445, + "step": 1710 + }, + { + "epoch": 0.6084637268847796, + "grad_norm": 2.2756147384643555, + "learning_rate": 1.0633260592585685e-06, + "loss": 3.4284, + "step": 1711 + }, + { + "epoch": 0.6088193456614509, + "grad_norm": 2.851876735687256, + "learning_rate": 1.0616640056679548e-06, + "loss": 5.1641, + "step": 1712 + }, + { + "epoch": 0.6091749644381224, + "grad_norm": 1.0120997428894043, + "learning_rate": 1.0600025402806467e-06, + "loss": 3.0286, + "step": 1713 + }, + { + "epoch": 0.6095305832147937, + "grad_norm": 1.0557596683502197, + "learning_rate": 1.0583416653261663e-06, + "loss": 3.3886, + "step": 1714 + }, + { + "epoch": 0.6098862019914651, + "grad_norm": 0.8052176237106323, + "learning_rate": 1.0566813830332415e-06, + "loss": 2.4441, + "step": 1715 + }, + { + "epoch": 0.6102418207681366, + "grad_norm": 0.8851521611213684, + "learning_rate": 1.0550216956298072e-06, + "loss": 1.6196, + "step": 1716 + }, + { + "epoch": 0.6105974395448079, + "grad_norm": 0.9852917194366455, + "learning_rate": 1.0533626053429974e-06, + "loss": 2.3766, + "step": 1717 + }, + { + "epoch": 0.6109530583214794, + "grad_norm": 2.116347312927246, + "learning_rate": 1.0517041143991475e-06, + "loss": 2.5179, + "step": 1718 + }, + { + "epoch": 0.6113086770981507, + "grad_norm": 1.2577205896377563, + "learning_rate": 1.0500462250237864e-06, + "loss": 2.7525, + "step": 1719 + }, + { + "epoch": 0.6116642958748222, + "grad_norm": 2.02299427986145, + "learning_rate": 1.0483889394416373e-06, + "loss": 3.7706, + "step": 1720 + }, + { + "epoch": 0.6120199146514936, + "grad_norm": 0.837478518486023, + "learning_rate": 1.0467322598766131e-06, + "loss": 2.8156, + "step": 1721 + }, + { + "epoch": 0.612375533428165, + "grad_norm": 0.7562845945358276, + "learning_rate": 1.0450761885518117e-06, + "loss": 2.4149, + "step": 1722 + }, + { + "epoch": 0.6127311522048364, + "grad_norm": 0.9935852289199829, + "learning_rate": 1.0434207276895172e-06, + "loss": 2.2846, + "step": 1723 + }, + { + "epoch": 0.6130867709815079, + "grad_norm": 1.0399624109268188, + "learning_rate": 1.0417658795111926e-06, + "loss": 2.3869, + "step": 1724 + }, + { + "epoch": 0.6134423897581792, + "grad_norm": 1.5192818641662598, + "learning_rate": 1.0401116462374802e-06, + "loss": 3.2968, + "step": 1725 + }, + { + "epoch": 0.6137980085348507, + "grad_norm": 0.9097810387611389, + "learning_rate": 1.0384580300881968e-06, + "loss": 2.5415, + "step": 1726 + }, + { + "epoch": 0.614153627311522, + "grad_norm": 0.8523087501525879, + "learning_rate": 1.0368050332823298e-06, + "loss": 2.5386, + "step": 1727 + }, + { + "epoch": 0.6145092460881935, + "grad_norm": 1.5694955587387085, + "learning_rate": 1.0351526580380373e-06, + "loss": 2.8312, + "step": 1728 + }, + { + "epoch": 0.6148648648648649, + "grad_norm": 0.9246408939361572, + "learning_rate": 1.0335009065726417e-06, + "loss": 1.5445, + "step": 1729 + }, + { + "epoch": 0.6152204836415363, + "grad_norm": 1.8572087287902832, + "learning_rate": 1.0318497811026308e-06, + "loss": 1.2966, + "step": 1730 + }, + { + "epoch": 0.6155761024182077, + "grad_norm": 1.0830531120300293, + "learning_rate": 1.0301992838436486e-06, + "loss": 2.2505, + "step": 1731 + }, + { + "epoch": 0.615931721194879, + "grad_norm": 0.8718277812004089, + "learning_rate": 1.0285494170104994e-06, + "loss": 2.7388, + "step": 1732 + }, + { + "epoch": 0.6162873399715505, + "grad_norm": 0.6860084533691406, + "learning_rate": 1.0269001828171408e-06, + "loss": 2.0842, + "step": 1733 + }, + { + "epoch": 0.616642958748222, + "grad_norm": 0.881122350692749, + "learning_rate": 1.02525158347668e-06, + "loss": 1.7453, + "step": 1734 + }, + { + "epoch": 0.6169985775248933, + "grad_norm": 0.8647218942642212, + "learning_rate": 1.023603621201375e-06, + "loss": 2.1973, + "step": 1735 + }, + { + "epoch": 0.6173541963015647, + "grad_norm": 4.771215915679932, + "learning_rate": 1.021956298202625e-06, + "loss": 3.2183, + "step": 1736 + }, + { + "epoch": 0.6177098150782361, + "grad_norm": 1.1472195386886597, + "learning_rate": 1.0203096166909757e-06, + "loss": 3.0259, + "step": 1737 + }, + { + "epoch": 0.6180654338549075, + "grad_norm": 1.042616605758667, + "learning_rate": 1.0186635788761083e-06, + "loss": 2.6648, + "step": 1738 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 1.1069908142089844, + "learning_rate": 1.0170181869668424e-06, + "loss": 2.8473, + "step": 1739 + }, + { + "epoch": 0.6187766714082503, + "grad_norm": 0.8631059527397156, + "learning_rate": 1.0153734431711307e-06, + "loss": 2.3322, + "step": 1740 + }, + { + "epoch": 0.6191322901849218, + "grad_norm": 1.2840993404388428, + "learning_rate": 1.0137293496960554e-06, + "loss": 2.723, + "step": 1741 + }, + { + "epoch": 0.6194879089615932, + "grad_norm": 1.6495139598846436, + "learning_rate": 1.0120859087478271e-06, + "loss": 3.3065, + "step": 1742 + }, + { + "epoch": 0.6198435277382646, + "grad_norm": 0.7940443754196167, + "learning_rate": 1.0104431225317785e-06, + "loss": 2.5142, + "step": 1743 + }, + { + "epoch": 0.620199146514936, + "grad_norm": 0.9893941283226013, + "learning_rate": 1.0088009932523666e-06, + "loss": 2.3373, + "step": 1744 + }, + { + "epoch": 0.6205547652916074, + "grad_norm": 0.7733781933784485, + "learning_rate": 1.0071595231131654e-06, + "loss": 2.47, + "step": 1745 + }, + { + "epoch": 0.6209103840682788, + "grad_norm": 0.9413859248161316, + "learning_rate": 1.005518714316864e-06, + "loss": 2.4703, + "step": 1746 + }, + { + "epoch": 0.6212660028449503, + "grad_norm": 0.9699928760528564, + "learning_rate": 1.003878569065266e-06, + "loss": 2.6626, + "step": 1747 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 0.9395011067390442, + "learning_rate": 1.0022390895592814e-06, + "loss": 2.6521, + "step": 1748 + }, + { + "epoch": 0.621977240398293, + "grad_norm": 0.8290120363235474, + "learning_rate": 1.0006002779989295e-06, + "loss": 2.5407, + "step": 1749 + }, + { + "epoch": 0.6223328591749644, + "grad_norm": 1.1608049869537354, + "learning_rate": 9.989621365833323e-07, + "loss": 2.7905, + "step": 1750 + }, + { + "epoch": 0.6226884779516358, + "grad_norm": 0.9459041357040405, + "learning_rate": 9.973246675107126e-07, + "loss": 1.9838, + "step": 1751 + }, + { + "epoch": 0.6230440967283073, + "grad_norm": 1.1427068710327148, + "learning_rate": 9.956878729783918e-07, + "loss": 2.7565, + "step": 1752 + }, + { + "epoch": 0.6233997155049786, + "grad_norm": 1.292556881904602, + "learning_rate": 9.94051755182784e-07, + "loss": 3.4528, + "step": 1753 + }, + { + "epoch": 0.6237553342816501, + "grad_norm": 0.8551099300384521, + "learning_rate": 9.924163163193972e-07, + "loss": 2.2375, + "step": 1754 + }, + { + "epoch": 0.6241109530583214, + "grad_norm": 0.9358735084533691, + "learning_rate": 9.907815585828278e-07, + "loss": 2.3503, + "step": 1755 + }, + { + "epoch": 0.6244665718349929, + "grad_norm": 0.8137001395225525, + "learning_rate": 9.891474841667586e-07, + "loss": 2.6347, + "step": 1756 + }, + { + "epoch": 0.6248221906116643, + "grad_norm": 1.1376190185546875, + "learning_rate": 9.875140952639535e-07, + "loss": 3.7789, + "step": 1757 + }, + { + "epoch": 0.6251778093883357, + "grad_norm": 0.7547426223754883, + "learning_rate": 9.858813940662587e-07, + "loss": 1.9924, + "step": 1758 + }, + { + "epoch": 0.6255334281650071, + "grad_norm": 0.7949218153953552, + "learning_rate": 9.842493827645978e-07, + "loss": 1.651, + "step": 1759 + }, + { + "epoch": 0.6258890469416786, + "grad_norm": 5.9967241287231445, + "learning_rate": 9.82618063548966e-07, + "loss": 3.8898, + "step": 1760 + }, + { + "epoch": 0.6262446657183499, + "grad_norm": 1.0381572246551514, + "learning_rate": 9.809874386084324e-07, + "loss": 2.9333, + "step": 1761 + }, + { + "epoch": 0.6266002844950214, + "grad_norm": 0.8031646013259888, + "learning_rate": 9.793575101311331e-07, + "loss": 1.9199, + "step": 1762 + }, + { + "epoch": 0.6269559032716927, + "grad_norm": 1.4146262407302856, + "learning_rate": 9.777282803042704e-07, + "loss": 4.2151, + "step": 1763 + }, + { + "epoch": 0.6273115220483642, + "grad_norm": 0.7584929466247559, + "learning_rate": 9.76099751314108e-07, + "loss": 2.5716, + "step": 1764 + }, + { + "epoch": 0.6276671408250356, + "grad_norm": 0.9141373634338379, + "learning_rate": 9.744719253459705e-07, + "loss": 2.8005, + "step": 1765 + }, + { + "epoch": 0.628022759601707, + "grad_norm": 1.611637830734253, + "learning_rate": 9.72844804584238e-07, + "loss": 2.6805, + "step": 1766 + }, + { + "epoch": 0.6283783783783784, + "grad_norm": 3.21397066116333, + "learning_rate": 9.712183912123446e-07, + "loss": 4.6581, + "step": 1767 + }, + { + "epoch": 0.6287339971550497, + "grad_norm": 0.956745982170105, + "learning_rate": 9.695926874127766e-07, + "loss": 2.3872, + "step": 1768 + }, + { + "epoch": 0.6290896159317212, + "grad_norm": 1.241743803024292, + "learning_rate": 9.67967695367065e-07, + "loss": 3.4766, + "step": 1769 + }, + { + "epoch": 0.6294452347083926, + "grad_norm": 1.720165729522705, + "learning_rate": 9.66343417255788e-07, + "loss": 3.4502, + "step": 1770 + }, + { + "epoch": 0.629800853485064, + "grad_norm": 1.5268566608428955, + "learning_rate": 9.64719855258566e-07, + "loss": 2.9634, + "step": 1771 + }, + { + "epoch": 0.6301564722617354, + "grad_norm": 1.1240977048873901, + "learning_rate": 9.630970115540572e-07, + "loss": 2.4448, + "step": 1772 + }, + { + "epoch": 0.6305120910384068, + "grad_norm": 1.8318135738372803, + "learning_rate": 9.614748883199567e-07, + "loss": 2.1348, + "step": 1773 + }, + { + "epoch": 0.6308677098150782, + "grad_norm": 4.190740585327148, + "learning_rate": 9.598534877329919e-07, + "loss": 1.8656, + "step": 1774 + }, + { + "epoch": 0.6312233285917497, + "grad_norm": 0.7923961281776428, + "learning_rate": 9.582328119689224e-07, + "loss": 1.6527, + "step": 1775 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 2.478618860244751, + "learning_rate": 9.56612863202532e-07, + "loss": 2.3684, + "step": 1776 + }, + { + "epoch": 0.6319345661450925, + "grad_norm": 1.0341283082962036, + "learning_rate": 9.54993643607632e-07, + "loss": 2.9508, + "step": 1777 + }, + { + "epoch": 0.6322901849217639, + "grad_norm": 0.7132455706596375, + "learning_rate": 9.533751553570543e-07, + "loss": 2.1831, + "step": 1778 + }, + { + "epoch": 0.6326458036984353, + "grad_norm": 2.183997869491577, + "learning_rate": 9.517574006226485e-07, + "loss": 4.047, + "step": 1779 + }, + { + "epoch": 0.6330014224751067, + "grad_norm": 0.7654913067817688, + "learning_rate": 9.501403815752812e-07, + "loss": 2.6844, + "step": 1780 + }, + { + "epoch": 0.633357041251778, + "grad_norm": 0.8546299338340759, + "learning_rate": 9.485241003848301e-07, + "loss": 2.707, + "step": 1781 + }, + { + "epoch": 0.6337126600284495, + "grad_norm": 2.2622311115264893, + "learning_rate": 9.469085592201847e-07, + "loss": 3.9623, + "step": 1782 + }, + { + "epoch": 0.634068278805121, + "grad_norm": 1.3957542181015015, + "learning_rate": 9.452937602492401e-07, + "loss": 2.9773, + "step": 1783 + }, + { + "epoch": 0.6344238975817923, + "grad_norm": 0.7801440954208374, + "learning_rate": 9.436797056388959e-07, + "loss": 2.5377, + "step": 1784 + }, + { + "epoch": 0.6347795163584637, + "grad_norm": 1.0299055576324463, + "learning_rate": 9.420663975550536e-07, + "loss": 2.1839, + "step": 1785 + }, + { + "epoch": 0.6351351351351351, + "grad_norm": 1.0628803968429565, + "learning_rate": 9.404538381626111e-07, + "loss": 2.4721, + "step": 1786 + }, + { + "epoch": 0.6354907539118065, + "grad_norm": 0.9485183954238892, + "learning_rate": 9.388420296254635e-07, + "loss": 1.4858, + "step": 1787 + }, + { + "epoch": 0.635846372688478, + "grad_norm": 0.9490165114402771, + "learning_rate": 9.372309741064968e-07, + "loss": 3.0799, + "step": 1788 + }, + { + "epoch": 0.6362019914651493, + "grad_norm": 1.2074594497680664, + "learning_rate": 9.356206737675877e-07, + "loss": 2.882, + "step": 1789 + }, + { + "epoch": 0.6365576102418208, + "grad_norm": 1.517573356628418, + "learning_rate": 9.340111307696001e-07, + "loss": 3.2359, + "step": 1790 + }, + { + "epoch": 0.6369132290184921, + "grad_norm": 0.8884390592575073, + "learning_rate": 9.324023472723787e-07, + "loss": 2.3849, + "step": 1791 + }, + { + "epoch": 0.6372688477951636, + "grad_norm": 1.1333503723144531, + "learning_rate": 9.30794325434752e-07, + "loss": 2.4791, + "step": 1792 + }, + { + "epoch": 0.637624466571835, + "grad_norm": 0.9250131845474243, + "learning_rate": 9.29187067414525e-07, + "loss": 2.377, + "step": 1793 + }, + { + "epoch": 0.6379800853485064, + "grad_norm": 0.9004590511322021, + "learning_rate": 9.275805753684792e-07, + "loss": 2.6642, + "step": 1794 + }, + { + "epoch": 0.6383357041251778, + "grad_norm": 0.8738925457000732, + "learning_rate": 9.259748514523654e-07, + "loss": 2.558, + "step": 1795 + }, + { + "epoch": 0.6386913229018493, + "grad_norm": 0.9268471598625183, + "learning_rate": 9.243698978209064e-07, + "loss": 1.8126, + "step": 1796 + }, + { + "epoch": 0.6390469416785206, + "grad_norm": 1.2282906770706177, + "learning_rate": 9.227657166277906e-07, + "loss": 2.5933, + "step": 1797 + }, + { + "epoch": 0.6394025604551921, + "grad_norm": 2.149836778640747, + "learning_rate": 9.211623100256686e-07, + "loss": 1.9478, + "step": 1798 + }, + { + "epoch": 0.6397581792318634, + "grad_norm": 1.3983783721923828, + "learning_rate": 9.195596801661537e-07, + "loss": 1.2617, + "step": 1799 + }, + { + "epoch": 0.6401137980085349, + "grad_norm": 1.5488317012786865, + "learning_rate": 9.179578291998146e-07, + "loss": 4.0784, + "step": 1800 + }, + { + "epoch": 0.6404694167852063, + "grad_norm": 0.7725949883460999, + "learning_rate": 9.163567592761775e-07, + "loss": 1.9561, + "step": 1801 + }, + { + "epoch": 0.6408250355618776, + "grad_norm": 3.8975327014923096, + "learning_rate": 9.147564725437172e-07, + "loss": 2.5082, + "step": 1802 + }, + { + "epoch": 0.6411806543385491, + "grad_norm": 2.0656533241271973, + "learning_rate": 9.131569711498602e-07, + "loss": 4.5366, + "step": 1803 + }, + { + "epoch": 0.6415362731152204, + "grad_norm": 1.8180439472198486, + "learning_rate": 9.115582572409788e-07, + "loss": 3.1124, + "step": 1804 + }, + { + "epoch": 0.6418918918918919, + "grad_norm": 3.5820205211639404, + "learning_rate": 9.099603329623872e-07, + "loss": 3.9057, + "step": 1805 + }, + { + "epoch": 0.6422475106685633, + "grad_norm": 0.9416325688362122, + "learning_rate": 9.083632004583417e-07, + "loss": 2.5301, + "step": 1806 + }, + { + "epoch": 0.6426031294452347, + "grad_norm": 0.9045472145080566, + "learning_rate": 9.067668618720341e-07, + "loss": 2.1144, + "step": 1807 + }, + { + "epoch": 0.6429587482219061, + "grad_norm": 0.9156874418258667, + "learning_rate": 9.051713193455928e-07, + "loss": 1.582, + "step": 1808 + }, + { + "epoch": 0.6433143669985776, + "grad_norm": 0.8072073459625244, + "learning_rate": 9.035765750200773e-07, + "loss": 2.6895, + "step": 1809 + }, + { + "epoch": 0.6436699857752489, + "grad_norm": 0.9054638147354126, + "learning_rate": 9.019826310354753e-07, + "loss": 2.6841, + "step": 1810 + }, + { + "epoch": 0.6440256045519204, + "grad_norm": 4.012231826782227, + "learning_rate": 9.003894895307019e-07, + "loss": 3.8258, + "step": 1811 + }, + { + "epoch": 0.6443812233285917, + "grad_norm": 0.885108470916748, + "learning_rate": 8.987971526435933e-07, + "loss": 3.0356, + "step": 1812 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.9840932488441467, + "learning_rate": 8.972056225109083e-07, + "loss": 3.1238, + "step": 1813 + }, + { + "epoch": 0.6450924608819346, + "grad_norm": 1.1713752746582031, + "learning_rate": 8.956149012683216e-07, + "loss": 2.0204, + "step": 1814 + }, + { + "epoch": 0.645448079658606, + "grad_norm": 1.1312954425811768, + "learning_rate": 8.940249910504229e-07, + "loss": 2.9987, + "step": 1815 + }, + { + "epoch": 0.6458036984352774, + "grad_norm": 2.700533628463745, + "learning_rate": 8.92435893990714e-07, + "loss": 1.4538, + "step": 1816 + }, + { + "epoch": 0.6461593172119487, + "grad_norm": 0.7441496253013611, + "learning_rate": 8.908476122216045e-07, + "loss": 2.4625, + "step": 1817 + }, + { + "epoch": 0.6465149359886202, + "grad_norm": 1.364848256111145, + "learning_rate": 8.892601478744111e-07, + "loss": 3.0599, + "step": 1818 + }, + { + "epoch": 0.6468705547652916, + "grad_norm": 1.4207336902618408, + "learning_rate": 8.876735030793523e-07, + "loss": 3.4517, + "step": 1819 + }, + { + "epoch": 0.647226173541963, + "grad_norm": 0.854752242565155, + "learning_rate": 8.860876799655484e-07, + "loss": 1.9666, + "step": 1820 + }, + { + "epoch": 0.6475817923186344, + "grad_norm": 1.3320212364196777, + "learning_rate": 8.845026806610153e-07, + "loss": 2.7507, + "step": 1821 + }, + { + "epoch": 0.6479374110953058, + "grad_norm": 0.915772557258606, + "learning_rate": 8.829185072926654e-07, + "loss": 2.5959, + "step": 1822 + }, + { + "epoch": 0.6482930298719772, + "grad_norm": 2.3456673622131348, + "learning_rate": 8.813351619863021e-07, + "loss": 3.2665, + "step": 1823 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.8875810503959656, + "learning_rate": 8.797526468666159e-07, + "loss": 2.6638, + "step": 1824 + }, + { + "epoch": 0.64900426742532, + "grad_norm": 0.9412809610366821, + "learning_rate": 8.781709640571858e-07, + "loss": 2.4315, + "step": 1825 + }, + { + "epoch": 0.6493598862019915, + "grad_norm": 1.031601905822754, + "learning_rate": 8.765901156804722e-07, + "loss": 2.7496, + "step": 1826 + }, + { + "epoch": 0.6497155049786629, + "grad_norm": 0.8166359066963196, + "learning_rate": 8.750101038578166e-07, + "loss": 2.6175, + "step": 1827 + }, + { + "epoch": 0.6500711237553343, + "grad_norm": 1.0555791854858398, + "learning_rate": 8.734309307094381e-07, + "loss": 2.666, + "step": 1828 + }, + { + "epoch": 0.6504267425320057, + "grad_norm": 1.1276434659957886, + "learning_rate": 8.718525983544296e-07, + "loss": 3.0113, + "step": 1829 + }, + { + "epoch": 0.6507823613086771, + "grad_norm": 1.6120827198028564, + "learning_rate": 8.702751089107562e-07, + "loss": 2.5069, + "step": 1830 + }, + { + "epoch": 0.6511379800853485, + "grad_norm": 1.4630787372589111, + "learning_rate": 8.686984644952518e-07, + "loss": 2.4136, + "step": 1831 + }, + { + "epoch": 0.65149359886202, + "grad_norm": 1.3724414110183716, + "learning_rate": 8.671226672236166e-07, + "loss": 2.6669, + "step": 1832 + }, + { + "epoch": 0.6518492176386913, + "grad_norm": 1.0139156579971313, + "learning_rate": 8.655477192104127e-07, + "loss": 2.6942, + "step": 1833 + }, + { + "epoch": 0.6522048364153628, + "grad_norm": 1.3816533088684082, + "learning_rate": 8.639736225690654e-07, + "loss": 2.0469, + "step": 1834 + }, + { + "epoch": 0.6525604551920341, + "grad_norm": 0.9188684225082397, + "learning_rate": 8.624003794118549e-07, + "loss": 2.3521, + "step": 1835 + }, + { + "epoch": 0.6529160739687055, + "grad_norm": 1.3211535215377808, + "learning_rate": 8.608279918499171e-07, + "loss": 2.9866, + "step": 1836 + }, + { + "epoch": 0.653271692745377, + "grad_norm": 0.8296423554420471, + "learning_rate": 8.592564619932399e-07, + "loss": 2.2837, + "step": 1837 + }, + { + "epoch": 0.6536273115220483, + "grad_norm": 1.1084462404251099, + "learning_rate": 8.576857919506601e-07, + "loss": 3.3671, + "step": 1838 + }, + { + "epoch": 0.6539829302987198, + "grad_norm": 1.1288994550704956, + "learning_rate": 8.561159838298602e-07, + "loss": 2.9139, + "step": 1839 + }, + { + "epoch": 0.6543385490753911, + "grad_norm": 0.8854053020477295, + "learning_rate": 8.545470397373665e-07, + "loss": 1.864, + "step": 1840 + }, + { + "epoch": 0.6546941678520626, + "grad_norm": 1.8054566383361816, + "learning_rate": 8.529789617785467e-07, + "loss": 4.1507, + "step": 1841 + }, + { + "epoch": 0.655049786628734, + "grad_norm": 1.1008636951446533, + "learning_rate": 8.514117520576049e-07, + "loss": 2.9504, + "step": 1842 + }, + { + "epoch": 0.6554054054054054, + "grad_norm": 1.622671365737915, + "learning_rate": 8.498454126775811e-07, + "loss": 2.8304, + "step": 1843 + }, + { + "epoch": 0.6557610241820768, + "grad_norm": 0.9324979186058044, + "learning_rate": 8.482799457403466e-07, + "loss": 2.2326, + "step": 1844 + }, + { + "epoch": 0.6561166429587483, + "grad_norm": 2.845000743865967, + "learning_rate": 8.467153533466016e-07, + "loss": 3.3165, + "step": 1845 + }, + { + "epoch": 0.6564722617354196, + "grad_norm": 0.9395184516906738, + "learning_rate": 8.451516375958755e-07, + "loss": 2.5511, + "step": 1846 + }, + { + "epoch": 0.6568278805120911, + "grad_norm": 0.9025001525878906, + "learning_rate": 8.435888005865169e-07, + "loss": 2.4884, + "step": 1847 + }, + { + "epoch": 0.6571834992887624, + "grad_norm": 1.0356730222702026, + "learning_rate": 8.420268444156993e-07, + "loss": 2.2462, + "step": 1848 + }, + { + "epoch": 0.6575391180654339, + "grad_norm": 0.706518292427063, + "learning_rate": 8.404657711794121e-07, + "loss": 2.1249, + "step": 1849 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.8488615155220032, + "learning_rate": 8.389055829724595e-07, + "loss": 2.385, + "step": 1850 + }, + { + "epoch": 0.6582503556187767, + "grad_norm": 1.0328747034072876, + "learning_rate": 8.373462818884611e-07, + "loss": 2.9035, + "step": 1851 + }, + { + "epoch": 0.6586059743954481, + "grad_norm": 1.4566404819488525, + "learning_rate": 8.357878700198407e-07, + "loss": 2.0317, + "step": 1852 + }, + { + "epoch": 0.6589615931721194, + "grad_norm": 1.0177923440933228, + "learning_rate": 8.342303494578346e-07, + "loss": 2.1565, + "step": 1853 + }, + { + "epoch": 0.6593172119487909, + "grad_norm": 1.0607157945632935, + "learning_rate": 8.326737222924795e-07, + "loss": 2.4758, + "step": 1854 + }, + { + "epoch": 0.6596728307254623, + "grad_norm": 1.2303181886672974, + "learning_rate": 8.311179906126135e-07, + "loss": 2.4502, + "step": 1855 + }, + { + "epoch": 0.6600284495021337, + "grad_norm": 2.477726936340332, + "learning_rate": 8.29563156505876e-07, + "loss": 3.009, + "step": 1856 + }, + { + "epoch": 0.6603840682788051, + "grad_norm": 1.3717228174209595, + "learning_rate": 8.28009222058697e-07, + "loss": 2.6845, + "step": 1857 + }, + { + "epoch": 0.6607396870554765, + "grad_norm": 1.1445283889770508, + "learning_rate": 8.264561893563044e-07, + "loss": 3.2734, + "step": 1858 + }, + { + "epoch": 0.6610953058321479, + "grad_norm": 1.0073117017745972, + "learning_rate": 8.249040604827112e-07, + "loss": 2.222, + "step": 1859 + }, + { + "epoch": 0.6614509246088194, + "grad_norm": 1.869721531867981, + "learning_rate": 8.23352837520722e-07, + "loss": 2.4835, + "step": 1860 + }, + { + "epoch": 0.6618065433854907, + "grad_norm": 0.958949089050293, + "learning_rate": 8.218025225519228e-07, + "loss": 3.0919, + "step": 1861 + }, + { + "epoch": 0.6621621621621622, + "grad_norm": 0.8914245963096619, + "learning_rate": 8.202531176566818e-07, + "loss": 2.2757, + "step": 1862 + }, + { + "epoch": 0.6625177809388336, + "grad_norm": 1.2934887409210205, + "learning_rate": 8.187046249141477e-07, + "loss": 3.4637, + "step": 1863 + }, + { + "epoch": 0.662873399715505, + "grad_norm": 2.298330545425415, + "learning_rate": 8.171570464022418e-07, + "loss": 4.3233, + "step": 1864 + }, + { + "epoch": 0.6632290184921764, + "grad_norm": 1.136576533317566, + "learning_rate": 8.156103841976619e-07, + "loss": 2.9618, + "step": 1865 + }, + { + "epoch": 0.6635846372688478, + "grad_norm": 1.60581636428833, + "learning_rate": 8.140646403758746e-07, + "loss": 3.0846, + "step": 1866 + }, + { + "epoch": 0.6639402560455192, + "grad_norm": 1.0427199602127075, + "learning_rate": 8.125198170111135e-07, + "loss": 2.5552, + "step": 1867 + }, + { + "epoch": 0.6642958748221907, + "grad_norm": 1.3851534128189087, + "learning_rate": 8.109759161763797e-07, + "loss": 3.2057, + "step": 1868 + }, + { + "epoch": 0.664651493598862, + "grad_norm": 0.8315195441246033, + "learning_rate": 8.094329399434324e-07, + "loss": 2.245, + "step": 1869 + }, + { + "epoch": 0.6650071123755334, + "grad_norm": 0.768724799156189, + "learning_rate": 8.078908903827937e-07, + "loss": 2.1724, + "step": 1870 + }, + { + "epoch": 0.6653627311522048, + "grad_norm": 1.100614309310913, + "learning_rate": 8.063497695637404e-07, + "loss": 2.1837, + "step": 1871 + }, + { + "epoch": 0.6657183499288762, + "grad_norm": 1.4938305616378784, + "learning_rate": 8.048095795543028e-07, + "loss": 3.2676, + "step": 1872 + }, + { + "epoch": 0.6660739687055477, + "grad_norm": 0.7999220490455627, + "learning_rate": 8.032703224212641e-07, + "loss": 2.473, + "step": 1873 + }, + { + "epoch": 0.666429587482219, + "grad_norm": 1.185294508934021, + "learning_rate": 8.017320002301523e-07, + "loss": 1.9969, + "step": 1874 + }, + { + "epoch": 0.6667852062588905, + "grad_norm": 0.8007413148880005, + "learning_rate": 8.00194615045245e-07, + "loss": 2.6, + "step": 1875 + }, + { + "epoch": 0.6671408250355618, + "grad_norm": 1.2028226852416992, + "learning_rate": 7.986581689295578e-07, + "loss": 3.1367, + "step": 1876 + }, + { + "epoch": 0.6674964438122333, + "grad_norm": 0.9643940329551697, + "learning_rate": 7.971226639448503e-07, + "loss": 2.2334, + "step": 1877 + }, + { + "epoch": 0.6678520625889047, + "grad_norm": 1.2686632871627808, + "learning_rate": 7.955881021516172e-07, + "loss": 2.2399, + "step": 1878 + }, + { + "epoch": 0.6682076813655761, + "grad_norm": 1.7077945470809937, + "learning_rate": 7.940544856090867e-07, + "loss": 3.553, + "step": 1879 + }, + { + "epoch": 0.6685633001422475, + "grad_norm": 0.8624612092971802, + "learning_rate": 7.925218163752217e-07, + "loss": 2.2072, + "step": 1880 + }, + { + "epoch": 0.668918918918919, + "grad_norm": 1.268304467201233, + "learning_rate": 7.909900965067097e-07, + "loss": 2.236, + "step": 1881 + }, + { + "epoch": 0.6692745376955903, + "grad_norm": 1.3337498903274536, + "learning_rate": 7.894593280589678e-07, + "loss": 2.9251, + "step": 1882 + }, + { + "epoch": 0.6696301564722618, + "grad_norm": 1.1672158241271973, + "learning_rate": 7.879295130861345e-07, + "loss": 1.6471, + "step": 1883 + }, + { + "epoch": 0.6699857752489331, + "grad_norm": 1.6919870376586914, + "learning_rate": 7.864006536410696e-07, + "loss": 2.2255, + "step": 1884 + }, + { + "epoch": 0.6703413940256046, + "grad_norm": 1.129447102546692, + "learning_rate": 7.848727517753501e-07, + "loss": 2.0836, + "step": 1885 + }, + { + "epoch": 0.670697012802276, + "grad_norm": 0.9926920533180237, + "learning_rate": 7.833458095392679e-07, + "loss": 1.8681, + "step": 1886 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 1.82706880569458, + "learning_rate": 7.818198289818287e-07, + "loss": 3.4436, + "step": 1887 + }, + { + "epoch": 0.6714082503556188, + "grad_norm": 1.1752760410308838, + "learning_rate": 7.802948121507461e-07, + "loss": 2.9943, + "step": 1888 + }, + { + "epoch": 0.6717638691322901, + "grad_norm": 1.1689441204071045, + "learning_rate": 7.78770761092441e-07, + "loss": 2.3984, + "step": 1889 + }, + { + "epoch": 0.6721194879089616, + "grad_norm": 0.858314573764801, + "learning_rate": 7.772476778520385e-07, + "loss": 2.4296, + "step": 1890 + }, + { + "epoch": 0.672475106685633, + "grad_norm": 1.11139714717865, + "learning_rate": 7.757255644733638e-07, + "loss": 3.1046, + "step": 1891 + }, + { + "epoch": 0.6728307254623044, + "grad_norm": 1.6716831922531128, + "learning_rate": 7.742044229989431e-07, + "loss": 3.5216, + "step": 1892 + }, + { + "epoch": 0.6731863442389758, + "grad_norm": 2.5692408084869385, + "learning_rate": 7.726842554699964e-07, + "loss": 2.4543, + "step": 1893 + }, + { + "epoch": 0.6735419630156472, + "grad_norm": 1.227446436882019, + "learning_rate": 7.711650639264374e-07, + "loss": 3.2727, + "step": 1894 + }, + { + "epoch": 0.6738975817923186, + "grad_norm": 0.8895146250724792, + "learning_rate": 7.696468504068699e-07, + "loss": 3.0294, + "step": 1895 + }, + { + "epoch": 0.6742532005689901, + "grad_norm": 0.8892061114311218, + "learning_rate": 7.681296169485853e-07, + "loss": 2.1375, + "step": 1896 + }, + { + "epoch": 0.6746088193456614, + "grad_norm": 0.9749287962913513, + "learning_rate": 7.666133655875604e-07, + "loss": 2.4192, + "step": 1897 + }, + { + "epoch": 0.6749644381223329, + "grad_norm": 0.7124261260032654, + "learning_rate": 7.650980983584528e-07, + "loss": 2.1834, + "step": 1898 + }, + { + "epoch": 0.6753200568990043, + "grad_norm": 0.8885782957077026, + "learning_rate": 7.635838172946015e-07, + "loss": 2.1822, + "step": 1899 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.7545700073242188, + "learning_rate": 7.620705244280209e-07, + "loss": 2.992, + "step": 1900 + }, + { + "epoch": 0.6760312944523471, + "grad_norm": 1.1204743385314941, + "learning_rate": 7.60558221789399e-07, + "loss": 3.0611, + "step": 1901 + }, + { + "epoch": 0.6763869132290184, + "grad_norm": 1.0545529127120972, + "learning_rate": 7.590469114080958e-07, + "loss": 3.0196, + "step": 1902 + }, + { + "epoch": 0.6767425320056899, + "grad_norm": 1.4174400568008423, + "learning_rate": 7.575365953121398e-07, + "loss": 3.7143, + "step": 1903 + }, + { + "epoch": 0.6770981507823614, + "grad_norm": 0.9021718502044678, + "learning_rate": 7.560272755282237e-07, + "loss": 2.7222, + "step": 1904 + }, + { + "epoch": 0.6774537695590327, + "grad_norm": 1.6318130493164062, + "learning_rate": 7.545189540817064e-07, + "loss": 3.3152, + "step": 1905 + }, + { + "epoch": 0.6778093883357041, + "grad_norm": 1.4574998617172241, + "learning_rate": 7.53011632996604e-07, + "loss": 1.9245, + "step": 1906 + }, + { + "epoch": 0.6781650071123755, + "grad_norm": 1.845809817314148, + "learning_rate": 7.515053142955921e-07, + "loss": 3.6921, + "step": 1907 + }, + { + "epoch": 0.6785206258890469, + "grad_norm": 0.8910509347915649, + "learning_rate": 7.500000000000003e-07, + "loss": 2.8216, + "step": 1908 + }, + { + "epoch": 0.6788762446657184, + "grad_norm": 0.732796311378479, + "learning_rate": 7.484956921298101e-07, + "loss": 1.7863, + "step": 1909 + }, + { + "epoch": 0.6792318634423897, + "grad_norm": 0.9174123406410217, + "learning_rate": 7.469923927036547e-07, + "loss": 2.5873, + "step": 1910 + }, + { + "epoch": 0.6795874822190612, + "grad_norm": 1.2347885370254517, + "learning_rate": 7.4549010373881e-07, + "loss": 2.481, + "step": 1911 + }, + { + "epoch": 0.6799431009957326, + "grad_norm": 1.2688474655151367, + "learning_rate": 7.439888272512003e-07, + "loss": 2.9995, + "step": 1912 + }, + { + "epoch": 0.680298719772404, + "grad_norm": 1.5153416395187378, + "learning_rate": 7.424885652553888e-07, + "loss": 3.3727, + "step": 1913 + }, + { + "epoch": 0.6806543385490754, + "grad_norm": 1.462003231048584, + "learning_rate": 7.409893197645772e-07, + "loss": 3.7716, + "step": 1914 + }, + { + "epoch": 0.6810099573257468, + "grad_norm": 0.8915738463401794, + "learning_rate": 7.394910927906056e-07, + "loss": 2.6499, + "step": 1915 + }, + { + "epoch": 0.6813655761024182, + "grad_norm": 1.1301155090332031, + "learning_rate": 7.379938863439431e-07, + "loss": 3.3625, + "step": 1916 + }, + { + "epoch": 0.6817211948790897, + "grad_norm": 0.7895405888557434, + "learning_rate": 7.364977024336937e-07, + "loss": 1.5927, + "step": 1917 + }, + { + "epoch": 0.682076813655761, + "grad_norm": 1.7603590488433838, + "learning_rate": 7.350025430675868e-07, + "loss": 2.149, + "step": 1918 + }, + { + "epoch": 0.6824324324324325, + "grad_norm": 2.544466257095337, + "learning_rate": 7.335084102519776e-07, + "loss": 1.9125, + "step": 1919 + }, + { + "epoch": 0.6827880512091038, + "grad_norm": 0.913755476474762, + "learning_rate": 7.320153059918436e-07, + "loss": 2.5787, + "step": 1920 + }, + { + "epoch": 0.6831436699857752, + "grad_norm": 1.0412038564682007, + "learning_rate": 7.305232322907818e-07, + "loss": 2.7811, + "step": 1921 + }, + { + "epoch": 0.6834992887624467, + "grad_norm": 1.4742921590805054, + "learning_rate": 7.290321911510085e-07, + "loss": 2.7527, + "step": 1922 + }, + { + "epoch": 0.683854907539118, + "grad_norm": 4.068294525146484, + "learning_rate": 7.2754218457335e-07, + "loss": 4.8182, + "step": 1923 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.9013786315917969, + "learning_rate": 7.260532145572487e-07, + "loss": 2.6121, + "step": 1924 + }, + { + "epoch": 0.6845661450924608, + "grad_norm": 1.0609240531921387, + "learning_rate": 7.245652831007539e-07, + "loss": 2.6523, + "step": 1925 + }, + { + "epoch": 0.6849217638691323, + "grad_norm": 0.8816744685173035, + "learning_rate": 7.230783922005209e-07, + "loss": 2.1019, + "step": 1926 + }, + { + "epoch": 0.6852773826458037, + "grad_norm": 0.9124554395675659, + "learning_rate": 7.215925438518111e-07, + "loss": 2.3818, + "step": 1927 + }, + { + "epoch": 0.6856330014224751, + "grad_norm": 0.7553700804710388, + "learning_rate": 7.201077400484831e-07, + "loss": 2.3024, + "step": 1928 + }, + { + "epoch": 0.6859886201991465, + "grad_norm": 1.2415990829467773, + "learning_rate": 7.186239827829973e-07, + "loss": 1.8371, + "step": 1929 + }, + { + "epoch": 0.686344238975818, + "grad_norm": 1.7498940229415894, + "learning_rate": 7.171412740464081e-07, + "loss": 2.3939, + "step": 1930 + }, + { + "epoch": 0.6866998577524893, + "grad_norm": 3.4696574211120605, + "learning_rate": 7.156596158283626e-07, + "loss": 3.5447, + "step": 1931 + }, + { + "epoch": 0.6870554765291608, + "grad_norm": 1.621468424797058, + "learning_rate": 7.141790101171e-07, + "loss": 3.1064, + "step": 1932 + }, + { + "epoch": 0.6874110953058321, + "grad_norm": 0.8481424450874329, + "learning_rate": 7.126994588994443e-07, + "loss": 2.25, + "step": 1933 + }, + { + "epoch": 0.6877667140825036, + "grad_norm": 1.487852692604065, + "learning_rate": 7.112209641608078e-07, + "loss": 2.5852, + "step": 1934 + }, + { + "epoch": 0.688122332859175, + "grad_norm": 1.5368783473968506, + "learning_rate": 7.097435278851812e-07, + "loss": 2.9114, + "step": 1935 + }, + { + "epoch": 0.6884779516358464, + "grad_norm": 1.870802879333496, + "learning_rate": 7.082671520551391e-07, + "loss": 2.82, + "step": 1936 + }, + { + "epoch": 0.6888335704125178, + "grad_norm": 1.252590298652649, + "learning_rate": 7.0679183865183e-07, + "loss": 3.3679, + "step": 1937 + }, + { + "epoch": 0.6891891891891891, + "grad_norm": 0.8500372171401978, + "learning_rate": 7.053175896549776e-07, + "loss": 2.4431, + "step": 1938 + }, + { + "epoch": 0.6895448079658606, + "grad_norm": 1.0033198595046997, + "learning_rate": 7.038444070428787e-07, + "loss": 1.6115, + "step": 1939 + }, + { + "epoch": 0.689900426742532, + "grad_norm": 1.2883167266845703, + "learning_rate": 7.023722927923958e-07, + "loss": 3.2652, + "step": 1940 + }, + { + "epoch": 0.6902560455192034, + "grad_norm": 1.5955833196640015, + "learning_rate": 7.009012488789615e-07, + "loss": 3.345, + "step": 1941 + }, + { + "epoch": 0.6906116642958748, + "grad_norm": 1.0401811599731445, + "learning_rate": 6.994312772765698e-07, + "loss": 2.6747, + "step": 1942 + }, + { + "epoch": 0.6909672830725462, + "grad_norm": 1.1520590782165527, + "learning_rate": 6.979623799577759e-07, + "loss": 2.2442, + "step": 1943 + }, + { + "epoch": 0.6913229018492176, + "grad_norm": 1.854566216468811, + "learning_rate": 6.964945588936954e-07, + "loss": 3.9432, + "step": 1944 + }, + { + "epoch": 0.6916785206258891, + "grad_norm": 0.7332730889320374, + "learning_rate": 6.95027816053996e-07, + "loss": 2.4236, + "step": 1945 + }, + { + "epoch": 0.6920341394025604, + "grad_norm": 0.9055619835853577, + "learning_rate": 6.935621534069026e-07, + "loss": 2.3678, + "step": 1946 + }, + { + "epoch": 0.6923897581792319, + "grad_norm": 0.8630459904670715, + "learning_rate": 6.920975729191879e-07, + "loss": 2.14, + "step": 1947 + }, + { + "epoch": 0.6927453769559033, + "grad_norm": 1.2736016511917114, + "learning_rate": 6.906340765561734e-07, + "loss": 3.0628, + "step": 1948 + }, + { + "epoch": 0.6931009957325747, + "grad_norm": 1.022598147392273, + "learning_rate": 6.891716662817254e-07, + "loss": 2.0869, + "step": 1949 + }, + { + "epoch": 0.6934566145092461, + "grad_norm": 1.3021979331970215, + "learning_rate": 6.877103440582528e-07, + "loss": 2.3957, + "step": 1950 + }, + { + "epoch": 0.6938122332859175, + "grad_norm": 0.7917354106903076, + "learning_rate": 6.862501118467054e-07, + "loss": 2.0266, + "step": 1951 + }, + { + "epoch": 0.6941678520625889, + "grad_norm": 1.3620195388793945, + "learning_rate": 6.847909716065695e-07, + "loss": 2.8261, + "step": 1952 + }, + { + "epoch": 0.6945234708392604, + "grad_norm": 0.8473620414733887, + "learning_rate": 6.833329252958657e-07, + "loss": 2.4222, + "step": 1953 + }, + { + "epoch": 0.6948790896159317, + "grad_norm": 4.826970100402832, + "learning_rate": 6.818759748711476e-07, + "loss": 2.2551, + "step": 1954 + }, + { + "epoch": 0.6952347083926032, + "grad_norm": 1.1631945371627808, + "learning_rate": 6.80420122287497e-07, + "loss": 2.9747, + "step": 1955 + }, + { + "epoch": 0.6955903271692745, + "grad_norm": 1.2064534425735474, + "learning_rate": 6.789653694985246e-07, + "loss": 2.3499, + "step": 1956 + }, + { + "epoch": 0.6959459459459459, + "grad_norm": 1.0833230018615723, + "learning_rate": 6.775117184563621e-07, + "loss": 2.3018, + "step": 1957 + }, + { + "epoch": 0.6963015647226174, + "grad_norm": 1.3955198526382446, + "learning_rate": 6.760591711116662e-07, + "loss": 2.6445, + "step": 1958 + }, + { + "epoch": 0.6966571834992887, + "grad_norm": 1.6794805526733398, + "learning_rate": 6.746077294136105e-07, + "loss": 3.1986, + "step": 1959 + }, + { + "epoch": 0.6970128022759602, + "grad_norm": 1.1153299808502197, + "learning_rate": 6.731573953098851e-07, + "loss": 3.2555, + "step": 1960 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 3.741192102432251, + "learning_rate": 6.717081707466944e-07, + "loss": 4.418, + "step": 1961 + }, + { + "epoch": 0.697724039829303, + "grad_norm": 0.9878949522972107, + "learning_rate": 6.70260057668753e-07, + "loss": 2.8231, + "step": 1962 + }, + { + "epoch": 0.6980796586059744, + "grad_norm": 1.6467419862747192, + "learning_rate": 6.688130580192857e-07, + "loss": 2.8471, + "step": 1963 + }, + { + "epoch": 0.6984352773826458, + "grad_norm": 0.9989914298057556, + "learning_rate": 6.673671737400213e-07, + "loss": 2.3096, + "step": 1964 + }, + { + "epoch": 0.6987908961593172, + "grad_norm": 0.856245219707489, + "learning_rate": 6.659224067711932e-07, + "loss": 2.3587, + "step": 1965 + }, + { + "epoch": 0.6991465149359887, + "grad_norm": 1.8853057622909546, + "learning_rate": 6.644787590515346e-07, + "loss": 3.2678, + "step": 1966 + }, + { + "epoch": 0.69950213371266, + "grad_norm": 0.9428020715713501, + "learning_rate": 6.630362325182773e-07, + "loss": 1.934, + "step": 1967 + }, + { + "epoch": 0.6998577524893315, + "grad_norm": 1.4871022701263428, + "learning_rate": 6.615948291071477e-07, + "loss": 3.3794, + "step": 1968 + }, + { + "epoch": 0.7002133712660028, + "grad_norm": 1.00119149684906, + "learning_rate": 6.601545507523672e-07, + "loss": 2.0666, + "step": 1969 + }, + { + "epoch": 0.7005689900426743, + "grad_norm": 1.2600195407867432, + "learning_rate": 6.587153993866452e-07, + "loss": 3.4841, + "step": 1970 + }, + { + "epoch": 0.7009246088193457, + "grad_norm": 1.944810390472412, + "learning_rate": 6.5727737694118e-07, + "loss": 3.9282, + "step": 1971 + }, + { + "epoch": 0.701280227596017, + "grad_norm": 1.0013489723205566, + "learning_rate": 6.558404853456545e-07, + "loss": 1.9954, + "step": 1972 + }, + { + "epoch": 0.7016358463726885, + "grad_norm": 1.1192461252212524, + "learning_rate": 6.544047265282338e-07, + "loss": 2.3168, + "step": 1973 + }, + { + "epoch": 0.7019914651493598, + "grad_norm": 1.6649013757705688, + "learning_rate": 6.529701024155652e-07, + "loss": 3.1312, + "step": 1974 + }, + { + "epoch": 0.7023470839260313, + "grad_norm": 2.2638208866119385, + "learning_rate": 6.515366149327691e-07, + "loss": 4.1504, + "step": 1975 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 1.2956135272979736, + "learning_rate": 6.50104266003445e-07, + "loss": 1.7187, + "step": 1976 + }, + { + "epoch": 0.7030583214793741, + "grad_norm": 2.028820514678955, + "learning_rate": 6.486730575496623e-07, + "loss": 1.1796, + "step": 1977 + }, + { + "epoch": 0.7034139402560455, + "grad_norm": 0.9227649569511414, + "learning_rate": 6.472429914919599e-07, + "loss": 3.0207, + "step": 1978 + }, + { + "epoch": 0.7037695590327169, + "grad_norm": 0.7881325483322144, + "learning_rate": 6.458140697493445e-07, + "loss": 1.945, + "step": 1979 + }, + { + "epoch": 0.7041251778093883, + "grad_norm": 1.434604287147522, + "learning_rate": 6.443862942392865e-07, + "loss": 2.8811, + "step": 1980 + }, + { + "epoch": 0.7044807965860598, + "grad_norm": 0.7917636036872864, + "learning_rate": 6.429596668777194e-07, + "loss": 1.7832, + "step": 1981 + }, + { + "epoch": 0.7048364153627311, + "grad_norm": 0.9547821283340454, + "learning_rate": 6.415341895790351e-07, + "loss": 2.7582, + "step": 1982 + }, + { + "epoch": 0.7051920341394026, + "grad_norm": 0.9232505559921265, + "learning_rate": 6.401098642560819e-07, + "loss": 2.6706, + "step": 1983 + }, + { + "epoch": 0.705547652916074, + "grad_norm": 1.444894552230835, + "learning_rate": 6.386866928201631e-07, + "loss": 3.5921, + "step": 1984 + }, + { + "epoch": 0.7059032716927454, + "grad_norm": 1.3847182989120483, + "learning_rate": 6.372646771810324e-07, + "loss": 3.068, + "step": 1985 + }, + { + "epoch": 0.7062588904694168, + "grad_norm": 1.170749306678772, + "learning_rate": 6.358438192468953e-07, + "loss": 2.9815, + "step": 1986 + }, + { + "epoch": 0.7066145092460882, + "grad_norm": 1.387197732925415, + "learning_rate": 6.344241209243993e-07, + "loss": 2.9739, + "step": 1987 + }, + { + "epoch": 0.7069701280227596, + "grad_norm": 0.9489420056343079, + "learning_rate": 6.3300558411864e-07, + "loss": 3.0793, + "step": 1988 + }, + { + "epoch": 0.707325746799431, + "grad_norm": 1.4366698265075684, + "learning_rate": 6.315882107331524e-07, + "loss": 3.1197, + "step": 1989 + }, + { + "epoch": 0.7076813655761024, + "grad_norm": 1.6259618997573853, + "learning_rate": 6.301720026699098e-07, + "loss": 3.3715, + "step": 1990 + }, + { + "epoch": 0.7080369843527738, + "grad_norm": 1.1229667663574219, + "learning_rate": 6.287569618293244e-07, + "loss": 2.9034, + "step": 1991 + }, + { + "epoch": 0.7083926031294452, + "grad_norm": 9.075174331665039, + "learning_rate": 6.27343090110238e-07, + "loss": 5.0195, + "step": 1992 + }, + { + "epoch": 0.7087482219061166, + "grad_norm": 1.5251123905181885, + "learning_rate": 6.259303894099276e-07, + "loss": 3.3, + "step": 1993 + }, + { + "epoch": 0.7091038406827881, + "grad_norm": 0.9029699563980103, + "learning_rate": 6.245188616240961e-07, + "loss": 2.198, + "step": 1994 + }, + { + "epoch": 0.7094594594594594, + "grad_norm": 1.6343899965286255, + "learning_rate": 6.231085086468732e-07, + "loss": 1.6878, + "step": 1995 + }, + { + "epoch": 0.7098150782361309, + "grad_norm": 1.0486420392990112, + "learning_rate": 6.216993323708139e-07, + "loss": 2.1211, + "step": 1996 + }, + { + "epoch": 0.7101706970128022, + "grad_norm": 1.5222691297531128, + "learning_rate": 6.202913346868903e-07, + "loss": 3.4944, + "step": 1997 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.9459429979324341, + "learning_rate": 6.188845174844975e-07, + "loss": 2.6263, + "step": 1998 + }, + { + "epoch": 0.7108819345661451, + "grad_norm": 1.1155476570129395, + "learning_rate": 6.17478882651442e-07, + "loss": 2.34, + "step": 1999 + }, + { + "epoch": 0.7112375533428165, + "grad_norm": 0.8817419409751892, + "learning_rate": 6.160744320739476e-07, + "loss": 2.1266, + "step": 2000 + }, + { + "epoch": 0.7115931721194879, + "grad_norm": 0.8255553245544434, + "learning_rate": 6.146711676366469e-07, + "loss": 2.4201, + "step": 2001 + }, + { + "epoch": 0.7119487908961594, + "grad_norm": 0.92173832654953, + "learning_rate": 6.132690912225806e-07, + "loss": 2.3552, + "step": 2002 + }, + { + "epoch": 0.7123044096728307, + "grad_norm": 0.9803165793418884, + "learning_rate": 6.118682047131972e-07, + "loss": 3.0186, + "step": 2003 + }, + { + "epoch": 0.7126600284495022, + "grad_norm": 1.048230767250061, + "learning_rate": 6.10468509988345e-07, + "loss": 2.9921, + "step": 2004 + }, + { + "epoch": 0.7130156472261735, + "grad_norm": 0.8653060793876648, + "learning_rate": 6.090700089262769e-07, + "loss": 2.3588, + "step": 2005 + }, + { + "epoch": 0.713371266002845, + "grad_norm": 0.8200167417526245, + "learning_rate": 6.076727034036415e-07, + "loss": 2.2189, + "step": 2006 + }, + { + "epoch": 0.7137268847795164, + "grad_norm": 0.8677060604095459, + "learning_rate": 6.062765952954832e-07, + "loss": 2.7712, + "step": 2007 + }, + { + "epoch": 0.7140825035561877, + "grad_norm": 1.1639448404312134, + "learning_rate": 6.048816864752422e-07, + "loss": 3.0917, + "step": 2008 + }, + { + "epoch": 0.7144381223328592, + "grad_norm": 0.9230960607528687, + "learning_rate": 6.034879788147449e-07, + "loss": 2.5191, + "step": 2009 + }, + { + "epoch": 0.7147937411095305, + "grad_norm": 1.358371376991272, + "learning_rate": 6.0209547418421e-07, + "loss": 2.6325, + "step": 2010 + }, + { + "epoch": 0.715149359886202, + "grad_norm": 0.9120518565177917, + "learning_rate": 6.0070417445224e-07, + "loss": 2.2321, + "step": 2011 + }, + { + "epoch": 0.7155049786628734, + "grad_norm": 1.0035511255264282, + "learning_rate": 5.993140814858204e-07, + "loss": 2.3999, + "step": 2012 + }, + { + "epoch": 0.7158605974395448, + "grad_norm": 2.1622304916381836, + "learning_rate": 5.979251971503177e-07, + "loss": 4.0365, + "step": 2013 + }, + { + "epoch": 0.7162162162162162, + "grad_norm": 1.2741608619689941, + "learning_rate": 5.965375233094762e-07, + "loss": 2.4775, + "step": 2014 + }, + { + "epoch": 0.7165718349928877, + "grad_norm": 1.4885985851287842, + "learning_rate": 5.951510618254177e-07, + "loss": 3.5179, + "step": 2015 + }, + { + "epoch": 0.716927453769559, + "grad_norm": 0.9757147431373596, + "learning_rate": 5.937658145586336e-07, + "loss": 3.0757, + "step": 2016 + }, + { + "epoch": 0.7172830725462305, + "grad_norm": 0.8708642721176147, + "learning_rate": 5.923817833679893e-07, + "loss": 2.1607, + "step": 2017 + }, + { + "epoch": 0.7176386913229018, + "grad_norm": 0.8295237421989441, + "learning_rate": 5.909989701107165e-07, + "loss": 2.3289, + "step": 2018 + }, + { + "epoch": 0.7179943100995733, + "grad_norm": 0.8004297018051147, + "learning_rate": 5.896173766424126e-07, + "loss": 2.2931, + "step": 2019 + }, + { + "epoch": 0.7183499288762447, + "grad_norm": 1.470733880996704, + "learning_rate": 5.882370048170403e-07, + "loss": 2.4832, + "step": 2020 + }, + { + "epoch": 0.718705547652916, + "grad_norm": 1.8473515510559082, + "learning_rate": 5.868578564869191e-07, + "loss": 3.1515, + "step": 2021 + }, + { + "epoch": 0.7190611664295875, + "grad_norm": 0.7398523688316345, + "learning_rate": 5.854799335027304e-07, + "loss": 2.1376, + "step": 2022 + }, + { + "epoch": 0.7194167852062588, + "grad_norm": 1.4016271829605103, + "learning_rate": 5.841032377135091e-07, + "loss": 2.561, + "step": 2023 + }, + { + "epoch": 0.7197724039829303, + "grad_norm": 1.3104861974716187, + "learning_rate": 5.827277709666445e-07, + "loss": 2.4119, + "step": 2024 + }, + { + "epoch": 0.7201280227596017, + "grad_norm": 1.108368158340454, + "learning_rate": 5.813535351078757e-07, + "loss": 2.2013, + "step": 2025 + }, + { + "epoch": 0.7204836415362731, + "grad_norm": 0.9443823099136353, + "learning_rate": 5.799805319812903e-07, + "loss": 2.6142, + "step": 2026 + }, + { + "epoch": 0.7208392603129445, + "grad_norm": 1.0661629438400269, + "learning_rate": 5.78608763429323e-07, + "loss": 2.5755, + "step": 2027 + }, + { + "epoch": 0.7211948790896159, + "grad_norm": 1.0669431686401367, + "learning_rate": 5.7723823129275e-07, + "loss": 2.1823, + "step": 2028 + }, + { + "epoch": 0.7215504978662873, + "grad_norm": 1.6387563943862915, + "learning_rate": 5.758689374106893e-07, + "loss": 2.2221, + "step": 2029 + }, + { + "epoch": 0.7219061166429588, + "grad_norm": 1.2553825378417969, + "learning_rate": 5.745008836205969e-07, + "loss": 3.2637, + "step": 2030 + }, + { + "epoch": 0.7222617354196301, + "grad_norm": 0.8824705481529236, + "learning_rate": 5.731340717582651e-07, + "loss": 2.3455, + "step": 2031 + }, + { + "epoch": 0.7226173541963016, + "grad_norm": 0.9875997304916382, + "learning_rate": 5.71768503657819e-07, + "loss": 2.709, + "step": 2032 + }, + { + "epoch": 0.722972972972973, + "grad_norm": 1.4031248092651367, + "learning_rate": 5.704041811517159e-07, + "loss": 3.3381, + "step": 2033 + }, + { + "epoch": 0.7233285917496444, + "grad_norm": 0.8189743161201477, + "learning_rate": 5.690411060707406e-07, + "loss": 2.3553, + "step": 2034 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 1.8350245952606201, + "learning_rate": 5.676792802440044e-07, + "loss": 3.1881, + "step": 2035 + }, + { + "epoch": 0.7240398293029872, + "grad_norm": 0.7673594355583191, + "learning_rate": 5.663187054989418e-07, + "loss": 2.1639, + "step": 2036 + }, + { + "epoch": 0.7243954480796586, + "grad_norm": 1.0746439695358276, + "learning_rate": 5.64959383661309e-07, + "loss": 2.7958, + "step": 2037 + }, + { + "epoch": 0.7247510668563301, + "grad_norm": 0.8814980387687683, + "learning_rate": 5.636013165551807e-07, + "loss": 1.3853, + "step": 2038 + }, + { + "epoch": 0.7251066856330014, + "grad_norm": 1.254216194152832, + "learning_rate": 5.622445060029472e-07, + "loss": 2.9159, + "step": 2039 + }, + { + "epoch": 0.7254623044096729, + "grad_norm": 1.0896435976028442, + "learning_rate": 5.608889538253145e-07, + "loss": 2.7196, + "step": 2040 + }, + { + "epoch": 0.7258179231863442, + "grad_norm": 1.1005626916885376, + "learning_rate": 5.595346618412982e-07, + "loss": 2.0398, + "step": 2041 + }, + { + "epoch": 0.7261735419630156, + "grad_norm": 0.9181206226348877, + "learning_rate": 5.581816318682236e-07, + "loss": 2.5848, + "step": 2042 + }, + { + "epoch": 0.7265291607396871, + "grad_norm": 1.392725944519043, + "learning_rate": 5.56829865721722e-07, + "loss": 2.5968, + "step": 2043 + }, + { + "epoch": 0.7268847795163584, + "grad_norm": 0.8128013014793396, + "learning_rate": 5.55479365215729e-07, + "loss": 2.1255, + "step": 2044 + }, + { + "epoch": 0.7272403982930299, + "grad_norm": 1.0733991861343384, + "learning_rate": 5.541301321624828e-07, + "loss": 2.2324, + "step": 2045 + }, + { + "epoch": 0.7275960170697012, + "grad_norm": 0.9107247591018677, + "learning_rate": 5.527821683725193e-07, + "loss": 2.3829, + "step": 2046 + }, + { + "epoch": 0.7279516358463727, + "grad_norm": 1.1434545516967773, + "learning_rate": 5.514354756546722e-07, + "loss": 3.3969, + "step": 2047 + }, + { + "epoch": 0.7283072546230441, + "grad_norm": 1.4208749532699585, + "learning_rate": 5.500900558160686e-07, + "loss": 2.5625, + "step": 2048 + }, + { + "epoch": 0.7286628733997155, + "grad_norm": 1.1940701007843018, + "learning_rate": 5.487459106621282e-07, + "loss": 2.0786, + "step": 2049 + }, + { + "epoch": 0.7290184921763869, + "grad_norm": 0.8537723422050476, + "learning_rate": 5.474030419965613e-07, + "loss": 2.3903, + "step": 2050 + }, + { + "epoch": 0.7293741109530584, + "grad_norm": 0.7934257388114929, + "learning_rate": 5.460614516213622e-07, + "loss": 2.0117, + "step": 2051 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.9512789845466614, + "learning_rate": 5.44721141336813e-07, + "loss": 2.6758, + "step": 2052 + }, + { + "epoch": 0.7300853485064012, + "grad_norm": 3.2168350219726562, + "learning_rate": 5.433821129414766e-07, + "loss": 1.9448, + "step": 2053 + }, + { + "epoch": 0.7304409672830725, + "grad_norm": 0.9158034920692444, + "learning_rate": 5.420443682321953e-07, + "loss": 3.157, + "step": 2054 + }, + { + "epoch": 0.730796586059744, + "grad_norm": 1.4356510639190674, + "learning_rate": 5.407079090040909e-07, + "loss": 3.4573, + "step": 2055 + }, + { + "epoch": 0.7311522048364154, + "grad_norm": 1.1486363410949707, + "learning_rate": 5.393727370505569e-07, + "loss": 2.6987, + "step": 2056 + }, + { + "epoch": 0.7315078236130867, + "grad_norm": 0.9288201928138733, + "learning_rate": 5.380388541632629e-07, + "loss": 2.7773, + "step": 2057 + }, + { + "epoch": 0.7318634423897582, + "grad_norm": 1.5307451486587524, + "learning_rate": 5.367062621321456e-07, + "loss": 3.4139, + "step": 2058 + }, + { + "epoch": 0.7322190611664295, + "grad_norm": 0.8764315843582153, + "learning_rate": 5.353749627454121e-07, + "loss": 2.3439, + "step": 2059 + }, + { + "epoch": 0.732574679943101, + "grad_norm": 0.9296457171440125, + "learning_rate": 5.340449577895333e-07, + "loss": 2.5559, + "step": 2060 + }, + { + "epoch": 0.7329302987197724, + "grad_norm": 1.173366904258728, + "learning_rate": 5.327162490492431e-07, + "loss": 2.687, + "step": 2061 + }, + { + "epoch": 0.7332859174964438, + "grad_norm": 1.061095118522644, + "learning_rate": 5.313888383075379e-07, + "loss": 2.8197, + "step": 2062 + }, + { + "epoch": 0.7336415362731152, + "grad_norm": 1.2056710720062256, + "learning_rate": 5.300627273456691e-07, + "loss": 2.9536, + "step": 2063 + }, + { + "epoch": 0.7339971550497866, + "grad_norm": 4.46789026260376, + "learning_rate": 5.287379179431471e-07, + "loss": 2.5485, + "step": 2064 + }, + { + "epoch": 0.734352773826458, + "grad_norm": 1.4920649528503418, + "learning_rate": 5.274144118777335e-07, + "loss": 2.3941, + "step": 2065 + }, + { + "epoch": 0.7347083926031295, + "grad_norm": 0.8657049536705017, + "learning_rate": 5.26092210925442e-07, + "loss": 2.2346, + "step": 2066 + }, + { + "epoch": 0.7350640113798008, + "grad_norm": 0.7849623560905457, + "learning_rate": 5.247713168605358e-07, + "loss": 2.5376, + "step": 2067 + }, + { + "epoch": 0.7354196301564723, + "grad_norm": 1.1294870376586914, + "learning_rate": 5.234517314555213e-07, + "loss": 2.6823, + "step": 2068 + }, + { + "epoch": 0.7357752489331437, + "grad_norm": 1.1866953372955322, + "learning_rate": 5.221334564811525e-07, + "loss": 2.432, + "step": 2069 + }, + { + "epoch": 0.7361308677098151, + "grad_norm": 0.8753446340560913, + "learning_rate": 5.208164937064228e-07, + "loss": 2.186, + "step": 2070 + }, + { + "epoch": 0.7364864864864865, + "grad_norm": 1.4483497142791748, + "learning_rate": 5.195008448985649e-07, + "loss": 3.1514, + "step": 2071 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.0669904947280884, + "learning_rate": 5.181865118230499e-07, + "loss": 0.8214, + "step": 2072 + }, + { + "epoch": 0.7371977240398293, + "grad_norm": 1.0729142427444458, + "learning_rate": 5.1687349624358e-07, + "loss": 2.2855, + "step": 2073 + }, + { + "epoch": 0.7375533428165008, + "grad_norm": 1.2901486158370972, + "learning_rate": 5.155617999220938e-07, + "loss": 2.7661, + "step": 2074 + }, + { + "epoch": 0.7379089615931721, + "grad_norm": 1.0250614881515503, + "learning_rate": 5.142514246187551e-07, + "loss": 2.6766, + "step": 2075 + }, + { + "epoch": 0.7382645803698435, + "grad_norm": 0.7859494686126709, + "learning_rate": 5.129423720919587e-07, + "loss": 1.6384, + "step": 2076 + }, + { + "epoch": 0.7386201991465149, + "grad_norm": 0.8338843584060669, + "learning_rate": 5.116346440983227e-07, + "loss": 1.8537, + "step": 2077 + }, + { + "epoch": 0.7389758179231863, + "grad_norm": 0.9726145267486572, + "learning_rate": 5.103282423926871e-07, + "loss": 2.4008, + "step": 2078 + }, + { + "epoch": 0.7393314366998578, + "grad_norm": 0.9647275805473328, + "learning_rate": 5.090231687281148e-07, + "loss": 2.0924, + "step": 2079 + }, + { + "epoch": 0.7396870554765291, + "grad_norm": 1.319894790649414, + "learning_rate": 5.077194248558827e-07, + "loss": 3.3011, + "step": 2080 + }, + { + "epoch": 0.7400426742532006, + "grad_norm": 0.9327278733253479, + "learning_rate": 5.064170125254869e-07, + "loss": 2.1144, + "step": 2081 + }, + { + "epoch": 0.7403982930298719, + "grad_norm": 3.225750207901001, + "learning_rate": 5.051159334846349e-07, + "loss": 4.5972, + "step": 2082 + }, + { + "epoch": 0.7407539118065434, + "grad_norm": 0.8121932744979858, + "learning_rate": 5.038161894792447e-07, + "loss": 2.1753, + "step": 2083 + }, + { + "epoch": 0.7411095305832148, + "grad_norm": 1.068221926689148, + "learning_rate": 5.025177822534448e-07, + "loss": 2.4865, + "step": 2084 + }, + { + "epoch": 0.7414651493598862, + "grad_norm": 0.8545746207237244, + "learning_rate": 5.01220713549567e-07, + "loss": 2.5142, + "step": 2085 + }, + { + "epoch": 0.7418207681365576, + "grad_norm": 1.098678708076477, + "learning_rate": 4.999249851081497e-07, + "loss": 3.1039, + "step": 2086 + }, + { + "epoch": 0.7421763869132291, + "grad_norm": 1.179352879524231, + "learning_rate": 4.98630598667931e-07, + "loss": 2.8404, + "step": 2087 + }, + { + "epoch": 0.7425320056899004, + "grad_norm": 1.036306381225586, + "learning_rate": 4.973375559658491e-07, + "loss": 1.7978, + "step": 2088 + }, + { + "epoch": 0.7428876244665719, + "grad_norm": 0.8610964417457581, + "learning_rate": 4.960458587370383e-07, + "loss": 2.65, + "step": 2089 + }, + { + "epoch": 0.7432432432432432, + "grad_norm": 1.2495901584625244, + "learning_rate": 4.947555087148276e-07, + "loss": 3.2519, + "step": 2090 + }, + { + "epoch": 0.7435988620199147, + "grad_norm": 0.892518937587738, + "learning_rate": 4.934665076307393e-07, + "loss": 2.4518, + "step": 2091 + }, + { + "epoch": 0.7439544807965861, + "grad_norm": 1.0349003076553345, + "learning_rate": 4.921788572144841e-07, + "loss": 2.9418, + "step": 2092 + }, + { + "epoch": 0.7443100995732574, + "grad_norm": 1.252000331878662, + "learning_rate": 4.908925591939607e-07, + "loss": 2.5586, + "step": 2093 + }, + { + "epoch": 0.7446657183499289, + "grad_norm": 1.4145439863204956, + "learning_rate": 4.896076152952533e-07, + "loss": 2.6504, + "step": 2094 + }, + { + "epoch": 0.7450213371266002, + "grad_norm": 1.0203174352645874, + "learning_rate": 4.883240272426287e-07, + "loss": 2.5316, + "step": 2095 + }, + { + "epoch": 0.7453769559032717, + "grad_norm": 0.9361113905906677, + "learning_rate": 4.870417967585346e-07, + "loss": 2.6399, + "step": 2096 + }, + { + "epoch": 0.7457325746799431, + "grad_norm": 2.033576011657715, + "learning_rate": 4.857609255635958e-07, + "loss": 3.9801, + "step": 2097 + }, + { + "epoch": 0.7460881934566145, + "grad_norm": 1.1791621446609497, + "learning_rate": 4.844814153766155e-07, + "loss": 3.0289, + "step": 2098 + }, + { + "epoch": 0.7464438122332859, + "grad_norm": 9.057015419006348, + "learning_rate": 4.832032679145683e-07, + "loss": 1.513, + "step": 2099 + }, + { + "epoch": 0.7467994310099573, + "grad_norm": 1.3742718696594238, + "learning_rate": 4.819264848926014e-07, + "loss": 3.6885, + "step": 2100 + }, + { + "epoch": 0.7471550497866287, + "grad_norm": 1.035131812095642, + "learning_rate": 4.806510680240301e-07, + "loss": 2.6256, + "step": 2101 + }, + { + "epoch": 0.7475106685633002, + "grad_norm": 1.6493074893951416, + "learning_rate": 4.793770190203372e-07, + "loss": 3.3612, + "step": 2102 + }, + { + "epoch": 0.7478662873399715, + "grad_norm": 1.6541359424591064, + "learning_rate": 4.781043395911694e-07, + "loss": 2.7344, + "step": 2103 + }, + { + "epoch": 0.748221906116643, + "grad_norm": 0.9448461532592773, + "learning_rate": 4.768330314443367e-07, + "loss": 2.1875, + "step": 2104 + }, + { + "epoch": 0.7485775248933144, + "grad_norm": 1.1382784843444824, + "learning_rate": 4.7556309628580756e-07, + "loss": 3.3535, + "step": 2105 + }, + { + "epoch": 0.7489331436699858, + "grad_norm": 1.0689194202423096, + "learning_rate": 4.74294535819709e-07, + "loss": 3.0844, + "step": 2106 + }, + { + "epoch": 0.7492887624466572, + "grad_norm": 0.8016451597213745, + "learning_rate": 4.7302735174832277e-07, + "loss": 1.7234, + "step": 2107 + }, + { + "epoch": 0.7496443812233285, + "grad_norm": 0.9672231674194336, + "learning_rate": 4.717615457720836e-07, + "loss": 2.2303, + "step": 2108 + }, + { + "epoch": 0.75, + "grad_norm": 0.9428424835205078, + "learning_rate": 4.7049711958957783e-07, + "loss": 2.8575, + "step": 2109 + }, + { + "epoch": 0.75, + "eval_loss": 4.213598251342773, + "eval_runtime": 302.2115, + "eval_samples_per_second": 4.126, + "eval_steps_per_second": 4.126, + "step": 2109 + }, + { + "epoch": 0.7503556187766715, + "grad_norm": 0.856499195098877, + "learning_rate": 4.6923407489753923e-07, + "loss": 2.1706, + "step": 2110 + }, + { + "epoch": 0.7507112375533428, + "grad_norm": 0.832511305809021, + "learning_rate": 4.679724133908484e-07, + "loss": 2.2594, + "step": 2111 + }, + { + "epoch": 0.7510668563300142, + "grad_norm": 1.1470288038253784, + "learning_rate": 4.667121367625294e-07, + "loss": 2.7121, + "step": 2112 + }, + { + "epoch": 0.7514224751066856, + "grad_norm": 1.2491710186004639, + "learning_rate": 4.654532467037476e-07, + "loss": 2.1631, + "step": 2113 + }, + { + "epoch": 0.751778093883357, + "grad_norm": 0.988521933555603, + "learning_rate": 4.641957449038098e-07, + "loss": 2.7526, + "step": 2114 + }, + { + "epoch": 0.7521337126600285, + "grad_norm": 0.943181574344635, + "learning_rate": 4.6293963305015624e-07, + "loss": 2.3161, + "step": 2115 + }, + { + "epoch": 0.7524893314366998, + "grad_norm": 1.4212512969970703, + "learning_rate": 4.616849128283658e-07, + "loss": 2.9849, + "step": 2116 + }, + { + "epoch": 0.7528449502133713, + "grad_norm": 1.1027655601501465, + "learning_rate": 4.6043158592214754e-07, + "loss": 2.9745, + "step": 2117 + }, + { + "epoch": 0.7532005689900427, + "grad_norm": 0.9412669539451599, + "learning_rate": 4.591796540133416e-07, + "loss": 2.7482, + "step": 2118 + }, + { + "epoch": 0.7535561877667141, + "grad_norm": 1.0525084733963013, + "learning_rate": 4.579291187819159e-07, + "loss": 2.4847, + "step": 2119 + }, + { + "epoch": 0.7539118065433855, + "grad_norm": 2.776388168334961, + "learning_rate": 4.566799819059641e-07, + "loss": 3.2866, + "step": 2120 + }, + { + "epoch": 0.7542674253200569, + "grad_norm": 1.4674513339996338, + "learning_rate": 4.5543224506170507e-07, + "loss": 3.454, + "step": 2121 + }, + { + "epoch": 0.7546230440967283, + "grad_norm": 1.1216015815734863, + "learning_rate": 4.541859099234754e-07, + "loss": 2.6102, + "step": 2122 + }, + { + "epoch": 0.7549786628733998, + "grad_norm": 0.9598565101623535, + "learning_rate": 4.529409781637345e-07, + "loss": 2.5211, + "step": 2123 + }, + { + "epoch": 0.7553342816500711, + "grad_norm": 0.8876523971557617, + "learning_rate": 4.5169745145305656e-07, + "loss": 1.4619, + "step": 2124 + }, + { + "epoch": 0.7556899004267426, + "grad_norm": 1.3019949197769165, + "learning_rate": 4.504553314601301e-07, + "loss": 2.4246, + "step": 2125 + }, + { + "epoch": 0.7560455192034139, + "grad_norm": 0.7721450328826904, + "learning_rate": 4.49214619851758e-07, + "loss": 2.4351, + "step": 2126 + }, + { + "epoch": 0.7564011379800853, + "grad_norm": 1.243194341659546, + "learning_rate": 4.4797531829285e-07, + "loss": 1.7794, + "step": 2127 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 0.8247648477554321, + "learning_rate": 4.467374284464271e-07, + "loss": 2.2986, + "step": 2128 + }, + { + "epoch": 0.7571123755334281, + "grad_norm": 0.9378613829612732, + "learning_rate": 4.455009519736137e-07, + "loss": 2.5049, + "step": 2129 + }, + { + "epoch": 0.7574679943100996, + "grad_norm": 1.3727219104766846, + "learning_rate": 4.442658905336378e-07, + "loss": 3.133, + "step": 2130 + }, + { + "epoch": 0.7578236130867709, + "grad_norm": 0.8675791621208191, + "learning_rate": 4.4303224578383043e-07, + "loss": 2.3356, + "step": 2131 + }, + { + "epoch": 0.7581792318634424, + "grad_norm": 0.8302528858184814, + "learning_rate": 4.418000193796182e-07, + "loss": 2.216, + "step": 2132 + }, + { + "epoch": 0.7585348506401138, + "grad_norm": 0.8602368831634521, + "learning_rate": 4.4056921297452843e-07, + "loss": 2.5355, + "step": 2133 + }, + { + "epoch": 0.7588904694167852, + "grad_norm": 4.178295612335205, + "learning_rate": 4.3933982822017883e-07, + "loss": 2.7518, + "step": 2134 + }, + { + "epoch": 0.7592460881934566, + "grad_norm": 0.9761065244674683, + "learning_rate": 4.3811186676628253e-07, + "loss": 2.8188, + "step": 2135 + }, + { + "epoch": 0.7596017069701281, + "grad_norm": 0.7233547568321228, + "learning_rate": 4.368853302606426e-07, + "loss": 2.3105, + "step": 2136 + }, + { + "epoch": 0.7599573257467994, + "grad_norm": 0.9760587215423584, + "learning_rate": 4.35660220349147e-07, + "loss": 2.2756, + "step": 2137 + }, + { + "epoch": 0.7603129445234709, + "grad_norm": 1.148197054862976, + "learning_rate": 4.344365386757733e-07, + "loss": 2.9189, + "step": 2138 + }, + { + "epoch": 0.7606685633001422, + "grad_norm": 1.1012871265411377, + "learning_rate": 4.3321428688257893e-07, + "loss": 2.4296, + "step": 2139 + }, + { + "epoch": 0.7610241820768137, + "grad_norm": 0.9694759249687195, + "learning_rate": 4.3199346660970545e-07, + "loss": 2.633, + "step": 2140 + }, + { + "epoch": 0.7613798008534851, + "grad_norm": 1.2191606760025024, + "learning_rate": 4.307740794953718e-07, + "loss": 2.75, + "step": 2141 + }, + { + "epoch": 0.7617354196301565, + "grad_norm": 0.988735020160675, + "learning_rate": 4.295561271758738e-07, + "loss": 2.2427, + "step": 2142 + }, + { + "epoch": 0.7620910384068279, + "grad_norm": 0.876829981803894, + "learning_rate": 4.2833961128558357e-07, + "loss": 2.2281, + "step": 2143 + }, + { + "epoch": 0.7624466571834992, + "grad_norm": 0.9283367991447449, + "learning_rate": 4.2712453345694273e-07, + "loss": 2.6304, + "step": 2144 + }, + { + "epoch": 0.7628022759601707, + "grad_norm": 1.129348874092102, + "learning_rate": 4.2591089532046623e-07, + "loss": 3.3829, + "step": 2145 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 1.4231501817703247, + "learning_rate": 4.2469869850473515e-07, + "loss": 1.7693, + "step": 2146 + }, + { + "epoch": 0.7635135135135135, + "grad_norm": 1.3078492879867554, + "learning_rate": 4.234879446363966e-07, + "loss": 1.9644, + "step": 2147 + }, + { + "epoch": 0.7638691322901849, + "grad_norm": 1.18837571144104, + "learning_rate": 4.2227863534016353e-07, + "loss": 3.4245, + "step": 2148 + }, + { + "epoch": 0.7642247510668563, + "grad_norm": 1.2394648790359497, + "learning_rate": 4.210707722388065e-07, + "loss": 2.3787, + "step": 2149 + }, + { + "epoch": 0.7645803698435277, + "grad_norm": 1.0515272617340088, + "learning_rate": 4.198643569531592e-07, + "loss": 2.7011, + "step": 2150 + }, + { + "epoch": 0.7649359886201992, + "grad_norm": 1.0762999057769775, + "learning_rate": 4.1865939110211065e-07, + "loss": 2.6766, + "step": 2151 + }, + { + "epoch": 0.7652916073968705, + "grad_norm": 0.865943968296051, + "learning_rate": 4.1745587630260485e-07, + "loss": 2.7149, + "step": 2152 + }, + { + "epoch": 0.765647226173542, + "grad_norm": 1.3377312421798706, + "learning_rate": 4.162538141696391e-07, + "loss": 2.9626, + "step": 2153 + }, + { + "epoch": 0.7660028449502134, + "grad_norm": 0.8539925217628479, + "learning_rate": 4.150532063162609e-07, + "loss": 2.5195, + "step": 2154 + }, + { + "epoch": 0.7663584637268848, + "grad_norm": 1.0895754098892212, + "learning_rate": 4.1385405435356776e-07, + "loss": 2.8303, + "step": 2155 + }, + { + "epoch": 0.7667140825035562, + "grad_norm": 1.4293694496154785, + "learning_rate": 4.126563598907006e-07, + "loss": 2.9352, + "step": 2156 + }, + { + "epoch": 0.7670697012802276, + "grad_norm": 0.7961404323577881, + "learning_rate": 4.114601245348475e-07, + "loss": 2.0611, + "step": 2157 + }, + { + "epoch": 0.767425320056899, + "grad_norm": 0.8184587955474854, + "learning_rate": 4.1026534989123705e-07, + "loss": 2.6097, + "step": 2158 + }, + { + "epoch": 0.7677809388335705, + "grad_norm": 0.7758621573448181, + "learning_rate": 4.090720375631379e-07, + "loss": 2.4562, + "step": 2159 + }, + { + "epoch": 0.7681365576102418, + "grad_norm": 2.264059543609619, + "learning_rate": 4.078801891518566e-07, + "loss": 3.3252, + "step": 2160 + }, + { + "epoch": 0.7684921763869133, + "grad_norm": 1.9543564319610596, + "learning_rate": 4.066898062567345e-07, + "loss": 2.8272, + "step": 2161 + }, + { + "epoch": 0.7688477951635846, + "grad_norm": 0.9910685420036316, + "learning_rate": 4.055008904751483e-07, + "loss": 2.8665, + "step": 2162 + }, + { + "epoch": 0.769203413940256, + "grad_norm": 2.3957161903381348, + "learning_rate": 4.043134434025038e-07, + "loss": 3.3774, + "step": 2163 + }, + { + "epoch": 0.7695590327169275, + "grad_norm": 0.8275718092918396, + "learning_rate": 4.031274666322372e-07, + "loss": 1.5009, + "step": 2164 + }, + { + "epoch": 0.7699146514935988, + "grad_norm": 0.9054645299911499, + "learning_rate": 4.019429617558114e-07, + "loss": 2.5736, + "step": 2165 + }, + { + "epoch": 0.7702702702702703, + "grad_norm": 0.7187370657920837, + "learning_rate": 4.007599303627135e-07, + "loss": 2.339, + "step": 2166 + }, + { + "epoch": 0.7706258890469416, + "grad_norm": 1.2835021018981934, + "learning_rate": 3.9957837404045484e-07, + "loss": 3.0733, + "step": 2167 + }, + { + "epoch": 0.7709815078236131, + "grad_norm": 1.0151550769805908, + "learning_rate": 3.983982943745662e-07, + "loss": 2.1213, + "step": 2168 + }, + { + "epoch": 0.7713371266002845, + "grad_norm": 1.034286379814148, + "learning_rate": 3.9721969294859707e-07, + "loss": 3.0767, + "step": 2169 + }, + { + "epoch": 0.7716927453769559, + "grad_norm": 0.9762433767318726, + "learning_rate": 3.960425713441131e-07, + "loss": 2.3903, + "step": 2170 + }, + { + "epoch": 0.7720483641536273, + "grad_norm": 1.173304796218872, + "learning_rate": 3.948669311406948e-07, + "loss": 2.6437, + "step": 2171 + }, + { + "epoch": 0.7724039829302988, + "grad_norm": 0.900320291519165, + "learning_rate": 3.9369277391593365e-07, + "loss": 2.1603, + "step": 2172 + }, + { + "epoch": 0.7727596017069701, + "grad_norm": 0.9166988730430603, + "learning_rate": 3.925201012454329e-07, + "loss": 2.2074, + "step": 2173 + }, + { + "epoch": 0.7731152204836416, + "grad_norm": 1.3775848150253296, + "learning_rate": 3.913489147028021e-07, + "loss": 2.6271, + "step": 2174 + }, + { + "epoch": 0.7734708392603129, + "grad_norm": 0.8113077878952026, + "learning_rate": 3.901792158596572e-07, + "loss": 2.5981, + "step": 2175 + }, + { + "epoch": 0.7738264580369844, + "grad_norm": 1.461656928062439, + "learning_rate": 3.890110062856175e-07, + "loss": 3.4514, + "step": 2176 + }, + { + "epoch": 0.7741820768136558, + "grad_norm": 1.0511623620986938, + "learning_rate": 3.878442875483043e-07, + "loss": 2.4244, + "step": 2177 + }, + { + "epoch": 0.7745376955903271, + "grad_norm": 0.9942576289176941, + "learning_rate": 3.86679061213338e-07, + "loss": 2.694, + "step": 2178 + }, + { + "epoch": 0.7748933143669986, + "grad_norm": 1.2753386497497559, + "learning_rate": 3.8551532884433586e-07, + "loss": 2.3456, + "step": 2179 + }, + { + "epoch": 0.7752489331436699, + "grad_norm": 1.389668345451355, + "learning_rate": 3.8435309200291217e-07, + "loss": 3.3608, + "step": 2180 + }, + { + "epoch": 0.7756045519203414, + "grad_norm": 2.20207142829895, + "learning_rate": 3.831923522486724e-07, + "loss": 4.2375, + "step": 2181 + }, + { + "epoch": 0.7759601706970128, + "grad_norm": 1.4331722259521484, + "learning_rate": 3.8203311113921404e-07, + "loss": 2.4162, + "step": 2182 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 1.1279487609863281, + "learning_rate": 3.8087537023012344e-07, + "loss": 2.4641, + "step": 2183 + }, + { + "epoch": 0.7766714082503556, + "grad_norm": 1.1716697216033936, + "learning_rate": 3.7971913107497304e-07, + "loss": 2.5496, + "step": 2184 + }, + { + "epoch": 0.777027027027027, + "grad_norm": 1.5054595470428467, + "learning_rate": 3.7856439522532223e-07, + "loss": 2.9236, + "step": 2185 + }, + { + "epoch": 0.7773826458036984, + "grad_norm": 0.9791932106018066, + "learning_rate": 3.7741116423071e-07, + "loss": 1.8047, + "step": 2186 + }, + { + "epoch": 0.7777382645803699, + "grad_norm": 0.8603526949882507, + "learning_rate": 3.7625943963865875e-07, + "loss": 2.347, + "step": 2187 + }, + { + "epoch": 0.7780938833570412, + "grad_norm": 1.0486632585525513, + "learning_rate": 3.7510922299466815e-07, + "loss": 2.5208, + "step": 2188 + }, + { + "epoch": 0.7784495021337127, + "grad_norm": 0.9722015261650085, + "learning_rate": 3.739605158422138e-07, + "loss": 3.1489, + "step": 2189 + }, + { + "epoch": 0.7788051209103841, + "grad_norm": 0.7264953255653381, + "learning_rate": 3.72813319722748e-07, + "loss": 2.5734, + "step": 2190 + }, + { + "epoch": 0.7791607396870555, + "grad_norm": 0.8321019411087036, + "learning_rate": 3.7166763617569204e-07, + "loss": 2.3255, + "step": 2191 + }, + { + "epoch": 0.7795163584637269, + "grad_norm": 0.7549997568130493, + "learning_rate": 3.705234667384406e-07, + "loss": 2.1553, + "step": 2192 + }, + { + "epoch": 0.7798719772403983, + "grad_norm": 1.904625415802002, + "learning_rate": 3.6938081294635473e-07, + "loss": 4.1663, + "step": 2193 + }, + { + "epoch": 0.7802275960170697, + "grad_norm": 1.7420480251312256, + "learning_rate": 3.6823967633276183e-07, + "loss": 3.2239, + "step": 2194 + }, + { + "epoch": 0.7805832147937412, + "grad_norm": 1.1319738626480103, + "learning_rate": 3.671000584289549e-07, + "loss": 3.0752, + "step": 2195 + }, + { + "epoch": 0.7809388335704125, + "grad_norm": 1.0542619228363037, + "learning_rate": 3.6596196076418624e-07, + "loss": 2.7092, + "step": 2196 + }, + { + "epoch": 0.781294452347084, + "grad_norm": 1.452858567237854, + "learning_rate": 3.648253848656713e-07, + "loss": 2.818, + "step": 2197 + }, + { + "epoch": 0.7816500711237553, + "grad_norm": 1.9157034158706665, + "learning_rate": 3.6369033225858035e-07, + "loss": 3.6601, + "step": 2198 + }, + { + "epoch": 0.7820056899004267, + "grad_norm": 2.98496150970459, + "learning_rate": 3.6255680446604217e-07, + "loss": 1.8727, + "step": 2199 + }, + { + "epoch": 0.7823613086770982, + "grad_norm": 0.7684885859489441, + "learning_rate": 3.61424803009138e-07, + "loss": 2.335, + "step": 2200 + }, + { + "epoch": 0.7827169274537695, + "grad_norm": 0.7593629956245422, + "learning_rate": 3.602943294069009e-07, + "loss": 2.5046, + "step": 2201 + }, + { + "epoch": 0.783072546230441, + "grad_norm": 0.8953066468238831, + "learning_rate": 3.5916538517631504e-07, + "loss": 2.4548, + "step": 2202 + }, + { + "epoch": 0.7834281650071123, + "grad_norm": 0.9090556502342224, + "learning_rate": 3.580379718323097e-07, + "loss": 2.6556, + "step": 2203 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 0.930807888507843, + "learning_rate": 3.569120908877627e-07, + "loss": 2.452, + "step": 2204 + }, + { + "epoch": 0.7841394025604552, + "grad_norm": 1.4258122444152832, + "learning_rate": 3.5578774385349396e-07, + "loss": 2.9465, + "step": 2205 + }, + { + "epoch": 0.7844950213371266, + "grad_norm": 0.8963679671287537, + "learning_rate": 3.546649322382646e-07, + "loss": 2.026, + "step": 2206 + }, + { + "epoch": 0.784850640113798, + "grad_norm": 1.3299936056137085, + "learning_rate": 3.53543657548778e-07, + "loss": 2.7278, + "step": 2207 + }, + { + "epoch": 0.7852062588904695, + "grad_norm": 1.230420708656311, + "learning_rate": 3.524239212896711e-07, + "loss": 2.9411, + "step": 2208 + }, + { + "epoch": 0.7855618776671408, + "grad_norm": 0.9805211424827576, + "learning_rate": 3.5130572496351987e-07, + "loss": 2.4516, + "step": 2209 + }, + { + "epoch": 0.7859174964438123, + "grad_norm": 1.560009479522705, + "learning_rate": 3.501890700708325e-07, + "loss": 3.1589, + "step": 2210 + }, + { + "epoch": 0.7862731152204836, + "grad_norm": 1.0299580097198486, + "learning_rate": 3.490739581100479e-07, + "loss": 3.032, + "step": 2211 + }, + { + "epoch": 0.786628733997155, + "grad_norm": 2.3408868312835693, + "learning_rate": 3.47960390577537e-07, + "loss": 3.804, + "step": 2212 + }, + { + "epoch": 0.7869843527738265, + "grad_norm": 0.9957606792449951, + "learning_rate": 3.46848368967595e-07, + "loss": 2.3026, + "step": 2213 + }, + { + "epoch": 0.7873399715504978, + "grad_norm": 0.7373713850975037, + "learning_rate": 3.457378947724457e-07, + "loss": 0.5093, + "step": 2214 + }, + { + "epoch": 0.7876955903271693, + "grad_norm": 11.077432632446289, + "learning_rate": 3.4462896948223343e-07, + "loss": 2.425, + "step": 2215 + }, + { + "epoch": 0.7880512091038406, + "grad_norm": 1.2921781539916992, + "learning_rate": 3.4352159458502713e-07, + "loss": 3.4197, + "step": 2216 + }, + { + "epoch": 0.7884068278805121, + "grad_norm": 0.9075039029121399, + "learning_rate": 3.4241577156681314e-07, + "loss": 2.175, + "step": 2217 + }, + { + "epoch": 0.7887624466571835, + "grad_norm": 0.9724758863449097, + "learning_rate": 3.4131150191149546e-07, + "loss": 2.608, + "step": 2218 + }, + { + "epoch": 0.7891180654338549, + "grad_norm": 1.9673429727554321, + "learning_rate": 3.402087871008956e-07, + "loss": 1.8757, + "step": 2219 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 2.060476303100586, + "learning_rate": 3.391076286147454e-07, + "loss": 2.6257, + "step": 2220 + }, + { + "epoch": 0.7898293029871978, + "grad_norm": 0.8453624844551086, + "learning_rate": 3.380080279306913e-07, + "loss": 2.2092, + "step": 2221 + }, + { + "epoch": 0.7901849217638691, + "grad_norm": 0.866963267326355, + "learning_rate": 3.369099865242874e-07, + "loss": 2.6848, + "step": 2222 + }, + { + "epoch": 0.7905405405405406, + "grad_norm": 0.8228059411048889, + "learning_rate": 3.358135058689965e-07, + "loss": 2.319, + "step": 2223 + }, + { + "epoch": 0.7908961593172119, + "grad_norm": 0.8892799615859985, + "learning_rate": 3.3471858743618616e-07, + "loss": 3.1018, + "step": 2224 + }, + { + "epoch": 0.7912517780938834, + "grad_norm": 1.1145501136779785, + "learning_rate": 3.336252326951277e-07, + "loss": 3.0855, + "step": 2225 + }, + { + "epoch": 0.7916073968705548, + "grad_norm": 0.930536687374115, + "learning_rate": 3.325334431129956e-07, + "loss": 2.8498, + "step": 2226 + }, + { + "epoch": 0.7919630156472262, + "grad_norm": 0.8028265237808228, + "learning_rate": 3.31443220154862e-07, + "loss": 2.0001, + "step": 2227 + }, + { + "epoch": 0.7923186344238976, + "grad_norm": 0.7684826850891113, + "learning_rate": 3.3035456528369784e-07, + "loss": 1.9722, + "step": 2228 + }, + { + "epoch": 0.792674253200569, + "grad_norm": 1.1092820167541504, + "learning_rate": 3.2926747996036987e-07, + "loss": 2.9149, + "step": 2229 + }, + { + "epoch": 0.7930298719772404, + "grad_norm": 0.8335224390029907, + "learning_rate": 3.2818196564363773e-07, + "loss": 2.2989, + "step": 2230 + }, + { + "epoch": 0.7933854907539118, + "grad_norm": 0.9953006505966187, + "learning_rate": 3.2709802379015467e-07, + "loss": 1.7579, + "step": 2231 + }, + { + "epoch": 0.7937411095305832, + "grad_norm": 1.8108501434326172, + "learning_rate": 3.2601565585446256e-07, + "loss": 2.5817, + "step": 2232 + }, + { + "epoch": 0.7940967283072546, + "grad_norm": 1.450566053390503, + "learning_rate": 3.2493486328899123e-07, + "loss": 3.4173, + "step": 2233 + }, + { + "epoch": 0.794452347083926, + "grad_norm": 1.1858546733856201, + "learning_rate": 3.2385564754405707e-07, + "loss": 2.9533, + "step": 2234 + }, + { + "epoch": 0.7948079658605974, + "grad_norm": 2.0366008281707764, + "learning_rate": 3.227780100678599e-07, + "loss": 4.4006, + "step": 2235 + }, + { + "epoch": 0.7951635846372689, + "grad_norm": 1.297795295715332, + "learning_rate": 3.2170195230648253e-07, + "loss": 3.3154, + "step": 2236 + }, + { + "epoch": 0.7955192034139402, + "grad_norm": 1.2484252452850342, + "learning_rate": 3.206274757038866e-07, + "loss": 3.3917, + "step": 2237 + }, + { + "epoch": 0.7958748221906117, + "grad_norm": 1.747867226600647, + "learning_rate": 3.1955458170191383e-07, + "loss": 3.2944, + "step": 2238 + }, + { + "epoch": 0.7962304409672831, + "grad_norm": 1.0438530445098877, + "learning_rate": 3.184832717402808e-07, + "loss": 2.2232, + "step": 2239 + }, + { + "epoch": 0.7965860597439545, + "grad_norm": 1.0181710720062256, + "learning_rate": 3.174135472565791e-07, + "loss": 2.9545, + "step": 2240 + }, + { + "epoch": 0.7969416785206259, + "grad_norm": 2.3968396186828613, + "learning_rate": 3.1634540968627236e-07, + "loss": 3.9299, + "step": 2241 + }, + { + "epoch": 0.7972972972972973, + "grad_norm": 1.39536452293396, + "learning_rate": 3.1527886046269513e-07, + "loss": 3.0851, + "step": 2242 + }, + { + "epoch": 0.7976529160739687, + "grad_norm": 1.1165210008621216, + "learning_rate": 3.1421390101704984e-07, + "loss": 3.3529, + "step": 2243 + }, + { + "epoch": 0.7980085348506402, + "grad_norm": 1.0955935716629028, + "learning_rate": 3.1315053277840707e-07, + "loss": 2.9223, + "step": 2244 + }, + { + "epoch": 0.7983641536273115, + "grad_norm": 1.1782984733581543, + "learning_rate": 3.120887571737008e-07, + "loss": 2.6707, + "step": 2245 + }, + { + "epoch": 0.798719772403983, + "grad_norm": 0.9109994769096375, + "learning_rate": 3.1102857562772814e-07, + "loss": 2.634, + "step": 2246 + }, + { + "epoch": 0.7990753911806543, + "grad_norm": 1.4174785614013672, + "learning_rate": 3.0996998956314745e-07, + "loss": 2.5164, + "step": 2247 + }, + { + "epoch": 0.7994310099573257, + "grad_norm": 1.1560662984848022, + "learning_rate": 3.0891300040047544e-07, + "loss": 3.4226, + "step": 2248 + }, + { + "epoch": 0.7997866287339972, + "grad_norm": 1.2088466882705688, + "learning_rate": 3.0785760955808774e-07, + "loss": 2.2021, + "step": 2249 + }, + { + "epoch": 0.8001422475106685, + "grad_norm": 1.7375088930130005, + "learning_rate": 3.068038184522121e-07, + "loss": 2.4438, + "step": 2250 + }, + { + "epoch": 0.80049786628734, + "grad_norm": 1.3170695304870605, + "learning_rate": 3.0575162849693276e-07, + "loss": 2.9203, + "step": 2251 + }, + { + "epoch": 0.8008534850640113, + "grad_norm": 1.7973964214324951, + "learning_rate": 3.047010411041836e-07, + "loss": 2.2919, + "step": 2252 + }, + { + "epoch": 0.8012091038406828, + "grad_norm": 1.751617193222046, + "learning_rate": 3.0365205768374775e-07, + "loss": 3.2441, + "step": 2253 + }, + { + "epoch": 0.8015647226173542, + "grad_norm": 1.0787187814712524, + "learning_rate": 3.026046796432582e-07, + "loss": 2.5079, + "step": 2254 + }, + { + "epoch": 0.8019203413940256, + "grad_norm": 0.8324382901191711, + "learning_rate": 3.015589083881901e-07, + "loss": 2.2692, + "step": 2255 + }, + { + "epoch": 0.802275960170697, + "grad_norm": 0.909200131893158, + "learning_rate": 3.005147453218659e-07, + "loss": 2.6528, + "step": 2256 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 1.0985475778579712, + "learning_rate": 2.994721918454483e-07, + "loss": 2.7651, + "step": 2257 + }, + { + "epoch": 0.8029871977240398, + "grad_norm": 1.0441832542419434, + "learning_rate": 2.984312493579399e-07, + "loss": 2.3999, + "step": 2258 + }, + { + "epoch": 0.8033428165007113, + "grad_norm": 0.9709333181381226, + "learning_rate": 2.973919192561825e-07, + "loss": 1.8392, + "step": 2259 + }, + { + "epoch": 0.8036984352773826, + "grad_norm": 1.0891106128692627, + "learning_rate": 2.96354202934853e-07, + "loss": 2.4697, + "step": 2260 + }, + { + "epoch": 0.8040540540540541, + "grad_norm": 1.3485108613967896, + "learning_rate": 2.953181017864649e-07, + "loss": 2.8816, + "step": 2261 + }, + { + "epoch": 0.8044096728307255, + "grad_norm": 0.8342669606208801, + "learning_rate": 2.9428361720136123e-07, + "loss": 2.4413, + "step": 2262 + }, + { + "epoch": 0.8047652916073968, + "grad_norm": 0.8444363474845886, + "learning_rate": 2.932507505677183e-07, + "loss": 2.5663, + "step": 2263 + }, + { + "epoch": 0.8051209103840683, + "grad_norm": 1.3097003698349, + "learning_rate": 2.922195032715404e-07, + "loss": 2.4187, + "step": 2264 + }, + { + "epoch": 0.8054765291607396, + "grad_norm": 1.3382989168167114, + "learning_rate": 2.911898766966583e-07, + "loss": 2.5816, + "step": 2265 + }, + { + "epoch": 0.8058321479374111, + "grad_norm": 0.8791672587394714, + "learning_rate": 2.9016187222472966e-07, + "loss": 2.0917, + "step": 2266 + }, + { + "epoch": 0.8061877667140825, + "grad_norm": 1.69907546043396, + "learning_rate": 2.891354912352327e-07, + "loss": 2.7869, + "step": 2267 + }, + { + "epoch": 0.8065433854907539, + "grad_norm": 1.118682861328125, + "learning_rate": 2.881107351054695e-07, + "loss": 2.4509, + "step": 2268 + }, + { + "epoch": 0.8068990042674253, + "grad_norm": 1.1169490814208984, + "learning_rate": 2.8708760521056086e-07, + "loss": 2.8574, + "step": 2269 + }, + { + "epoch": 0.8072546230440967, + "grad_norm": 1.1351274251937866, + "learning_rate": 2.860661029234448e-07, + "loss": 2.8382, + "step": 2270 + }, + { + "epoch": 0.8076102418207681, + "grad_norm": 1.780268907546997, + "learning_rate": 2.850462296148768e-07, + "loss": 2.9841, + "step": 2271 + }, + { + "epoch": 0.8079658605974396, + "grad_norm": 0.9727625250816345, + "learning_rate": 2.840279866534241e-07, + "loss": 2.0696, + "step": 2272 + }, + { + "epoch": 0.8083214793741109, + "grad_norm": 1.2791810035705566, + "learning_rate": 2.8301137540546875e-07, + "loss": 2.8023, + "step": 2273 + }, + { + "epoch": 0.8086770981507824, + "grad_norm": 0.8913934826850891, + "learning_rate": 2.819963972352006e-07, + "loss": 2.4443, + "step": 2274 + }, + { + "epoch": 0.8090327169274538, + "grad_norm": 2.1409003734588623, + "learning_rate": 2.8098305350462054e-07, + "loss": 3.7514, + "step": 2275 + }, + { + "epoch": 0.8093883357041252, + "grad_norm": 0.9975666403770447, + "learning_rate": 2.799713455735347e-07, + "loss": 2.4821, + "step": 2276 + }, + { + "epoch": 0.8097439544807966, + "grad_norm": 0.8923506140708923, + "learning_rate": 2.789612747995539e-07, + "loss": 2.4111, + "step": 2277 + }, + { + "epoch": 0.810099573257468, + "grad_norm": 0.9661974906921387, + "learning_rate": 2.779528425380941e-07, + "loss": 2.6763, + "step": 2278 + }, + { + "epoch": 0.8104551920341394, + "grad_norm": 0.8934873938560486, + "learning_rate": 2.7694605014236937e-07, + "loss": 1.9153, + "step": 2279 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 1.2134568691253662, + "learning_rate": 2.759408989633961e-07, + "loss": 1.935, + "step": 2280 + }, + { + "epoch": 0.8111664295874822, + "grad_norm": 2.772639036178589, + "learning_rate": 2.749373903499869e-07, + "loss": 2.425, + "step": 2281 + }, + { + "epoch": 0.8115220483641536, + "grad_norm": 0.7657691836357117, + "learning_rate": 2.7393552564875005e-07, + "loss": 2.6027, + "step": 2282 + }, + { + "epoch": 0.811877667140825, + "grad_norm": 1.0437690019607544, + "learning_rate": 2.729353062040896e-07, + "loss": 2.0763, + "step": 2283 + }, + { + "epoch": 0.8122332859174964, + "grad_norm": 0.8721387982368469, + "learning_rate": 2.719367333581989e-07, + "loss": 2.5378, + "step": 2284 + }, + { + "epoch": 0.8125889046941679, + "grad_norm": 0.7150872945785522, + "learning_rate": 2.709398084510647e-07, + "loss": 2.0177, + "step": 2285 + }, + { + "epoch": 0.8129445234708392, + "grad_norm": 1.1544194221496582, + "learning_rate": 2.699445328204605e-07, + "loss": 1.466, + "step": 2286 + }, + { + "epoch": 0.8133001422475107, + "grad_norm": 1.0188549757003784, + "learning_rate": 2.689509078019471e-07, + "loss": 3.0971, + "step": 2287 + }, + { + "epoch": 0.813655761024182, + "grad_norm": 0.8782582879066467, + "learning_rate": 2.679589347288709e-07, + "loss": 2.4693, + "step": 2288 + }, + { + "epoch": 0.8140113798008535, + "grad_norm": 0.8890851140022278, + "learning_rate": 2.669686149323603e-07, + "loss": 2.4679, + "step": 2289 + }, + { + "epoch": 0.8143669985775249, + "grad_norm": 2.757200241088867, + "learning_rate": 2.65979949741327e-07, + "loss": 4.5294, + "step": 2290 + }, + { + "epoch": 0.8147226173541963, + "grad_norm": 0.863023579120636, + "learning_rate": 2.6499294048246077e-07, + "loss": 2.4636, + "step": 2291 + }, + { + "epoch": 0.8150782361308677, + "grad_norm": 0.851080596446991, + "learning_rate": 2.640075884802299e-07, + "loss": 2.444, + "step": 2292 + }, + { + "epoch": 0.8154338549075392, + "grad_norm": 1.2778605222702026, + "learning_rate": 2.630238950568789e-07, + "loss": 2.5231, + "step": 2293 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.7835622429847717, + "learning_rate": 2.620418615324259e-07, + "loss": 2.5527, + "step": 2294 + }, + { + "epoch": 0.816145092460882, + "grad_norm": 1.6541062593460083, + "learning_rate": 2.6106148922466356e-07, + "loss": 3.1858, + "step": 2295 + }, + { + "epoch": 0.8165007112375533, + "grad_norm": 1.41825532913208, + "learning_rate": 2.6008277944915236e-07, + "loss": 2.3084, + "step": 2296 + }, + { + "epoch": 0.8168563300142248, + "grad_norm": 0.8028482794761658, + "learning_rate": 2.5910573351922466e-07, + "loss": 1.7702, + "step": 2297 + }, + { + "epoch": 0.8172119487908962, + "grad_norm": 1.0909109115600586, + "learning_rate": 2.5813035274597853e-07, + "loss": 2.9749, + "step": 2298 + }, + { + "epoch": 0.8175675675675675, + "grad_norm": 1.4204156398773193, + "learning_rate": 2.571566384382779e-07, + "loss": 3.0136, + "step": 2299 + }, + { + "epoch": 0.817923186344239, + "grad_norm": 0.9035689234733582, + "learning_rate": 2.5618459190275065e-07, + "loss": 2.5701, + "step": 2300 + }, + { + "epoch": 0.8182788051209103, + "grad_norm": 0.9647578597068787, + "learning_rate": 2.5521421444378624e-07, + "loss": 2.4909, + "step": 2301 + }, + { + "epoch": 0.8186344238975818, + "grad_norm": 0.8178501725196838, + "learning_rate": 2.5424550736353516e-07, + "loss": 2.2662, + "step": 2302 + }, + { + "epoch": 0.8189900426742532, + "grad_norm": 1.9288287162780762, + "learning_rate": 2.532784719619057e-07, + "loss": 3.5421, + "step": 2303 + }, + { + "epoch": 0.8193456614509246, + "grad_norm": 0.8939301371574402, + "learning_rate": 2.5231310953656336e-07, + "loss": 2.4943, + "step": 2304 + }, + { + "epoch": 0.819701280227596, + "grad_norm": 1.095521092414856, + "learning_rate": 2.513494213829282e-07, + "loss": 2.687, + "step": 2305 + }, + { + "epoch": 0.8200568990042674, + "grad_norm": 1.063281774520874, + "learning_rate": 2.503874087941741e-07, + "loss": 2.9151, + "step": 2306 + }, + { + "epoch": 0.8204125177809388, + "grad_norm": 1.0725953578948975, + "learning_rate": 2.4942707306122587e-07, + "loss": 3.0705, + "step": 2307 + }, + { + "epoch": 0.8207681365576103, + "grad_norm": 0.9433916807174683, + "learning_rate": 2.484684154727592e-07, + "loss": 2.0669, + "step": 2308 + }, + { + "epoch": 0.8211237553342816, + "grad_norm": 1.3127261400222778, + "learning_rate": 2.47511437315197e-07, + "loss": 2.774, + "step": 2309 + }, + { + "epoch": 0.8214793741109531, + "grad_norm": 1.7126781940460205, + "learning_rate": 2.465561398727086e-07, + "loss": 3.9204, + "step": 2310 + }, + { + "epoch": 0.8218349928876245, + "grad_norm": 0.9677002429962158, + "learning_rate": 2.4560252442720803e-07, + "loss": 1.9387, + "step": 2311 + }, + { + "epoch": 0.8221906116642959, + "grad_norm": 1.5666791200637817, + "learning_rate": 2.446505922583524e-07, + "loss": 1.8384, + "step": 2312 + }, + { + "epoch": 0.8225462304409673, + "grad_norm": 0.8234903216362, + "learning_rate": 2.437003446435409e-07, + "loss": 2.4488, + "step": 2313 + }, + { + "epoch": 0.8229018492176386, + "grad_norm": 1.2035095691680908, + "learning_rate": 2.4275178285790973e-07, + "loss": 2.7185, + "step": 2314 + }, + { + "epoch": 0.8232574679943101, + "grad_norm": 1.2424958944320679, + "learning_rate": 2.4180490817433566e-07, + "loss": 3.0211, + "step": 2315 + }, + { + "epoch": 0.8236130867709816, + "grad_norm": 1.5601844787597656, + "learning_rate": 2.4085972186343007e-07, + "loss": 2.3163, + "step": 2316 + }, + { + "epoch": 0.8239687055476529, + "grad_norm": 0.8907254338264465, + "learning_rate": 2.399162251935388e-07, + "loss": 2.4571, + "step": 2317 + }, + { + "epoch": 0.8243243243243243, + "grad_norm": 1.4503557682037354, + "learning_rate": 2.389744194307407e-07, + "loss": 3.2947, + "step": 2318 + }, + { + "epoch": 0.8246799431009957, + "grad_norm": 1.4631301164627075, + "learning_rate": 2.3803430583884494e-07, + "loss": 2.9233, + "step": 2319 + }, + { + "epoch": 0.8250355618776671, + "grad_norm": 1.1515644788742065, + "learning_rate": 2.3709588567939118e-07, + "loss": 2.7524, + "step": 2320 + }, + { + "epoch": 0.8253911806543386, + "grad_norm": 1.24463951587677, + "learning_rate": 2.3615916021164568e-07, + "loss": 3.2803, + "step": 2321 + }, + { + "epoch": 0.8257467994310099, + "grad_norm": 1.601910948753357, + "learning_rate": 2.352241306926007e-07, + "loss": 3.7661, + "step": 2322 + }, + { + "epoch": 0.8261024182076814, + "grad_norm": 1.610396146774292, + "learning_rate": 2.34290798376973e-07, + "loss": 3.1098, + "step": 2323 + }, + { + "epoch": 0.8264580369843528, + "grad_norm": 1.2427988052368164, + "learning_rate": 2.3335916451720123e-07, + "loss": 3.5155, + "step": 2324 + }, + { + "epoch": 0.8268136557610242, + "grad_norm": 2.206895112991333, + "learning_rate": 2.324292303634466e-07, + "loss": 2.6571, + "step": 2325 + }, + { + "epoch": 0.8271692745376956, + "grad_norm": 0.9320984482765198, + "learning_rate": 2.315009971635867e-07, + "loss": 2.0064, + "step": 2326 + }, + { + "epoch": 0.827524893314367, + "grad_norm": 1.2808932065963745, + "learning_rate": 2.3057446616321915e-07, + "loss": 1.7615, + "step": 2327 + }, + { + "epoch": 0.8278805120910384, + "grad_norm": 1.7617377042770386, + "learning_rate": 2.2964963860565625e-07, + "loss": 2.2783, + "step": 2328 + }, + { + "epoch": 0.8282361308677099, + "grad_norm": 1.6348552703857422, + "learning_rate": 2.2872651573192394e-07, + "loss": 2.993, + "step": 2329 + }, + { + "epoch": 0.8285917496443812, + "grad_norm": 0.8253128528594971, + "learning_rate": 2.2780509878076266e-07, + "loss": 2.1967, + "step": 2330 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.8641490340232849, + "learning_rate": 2.2688538898862087e-07, + "loss": 2.192, + "step": 2331 + }, + { + "epoch": 0.829302987197724, + "grad_norm": 0.8628028035163879, + "learning_rate": 2.2596738758965852e-07, + "loss": 2.775, + "step": 2332 + }, + { + "epoch": 0.8296586059743954, + "grad_norm": 1.0588856935501099, + "learning_rate": 2.25051095815742e-07, + "loss": 3.1297, + "step": 2333 + }, + { + "epoch": 0.8300142247510669, + "grad_norm": 1.3221750259399414, + "learning_rate": 2.2413651489644316e-07, + "loss": 2.7776, + "step": 2334 + }, + { + "epoch": 0.8303698435277382, + "grad_norm": 2.239002227783203, + "learning_rate": 2.2322364605904005e-07, + "loss": 3.865, + "step": 2335 + }, + { + "epoch": 0.8307254623044097, + "grad_norm": 0.7822991609573364, + "learning_rate": 2.2231249052850998e-07, + "loss": 1.8756, + "step": 2336 + }, + { + "epoch": 0.831081081081081, + "grad_norm": 0.66325443983078, + "learning_rate": 2.2140304952753477e-07, + "loss": 2.1854, + "step": 2337 + }, + { + "epoch": 0.8314366998577525, + "grad_norm": 1.0730146169662476, + "learning_rate": 2.2049532427649233e-07, + "loss": 3.1378, + "step": 2338 + }, + { + "epoch": 0.8317923186344239, + "grad_norm": 1.0283939838409424, + "learning_rate": 2.1958931599346067e-07, + "loss": 3.0855, + "step": 2339 + }, + { + "epoch": 0.8321479374110953, + "grad_norm": 0.8933717012405396, + "learning_rate": 2.186850258942124e-07, + "loss": 1.8443, + "step": 2340 + }, + { + "epoch": 0.8325035561877667, + "grad_norm": 0.9092789888381958, + "learning_rate": 2.1778245519221456e-07, + "loss": 2.081, + "step": 2341 + }, + { + "epoch": 0.8328591749644382, + "grad_norm": 2.443702220916748, + "learning_rate": 2.1688160509862848e-07, + "loss": 3.7645, + "step": 2342 + }, + { + "epoch": 0.8332147937411095, + "grad_norm": 7.195422172546387, + "learning_rate": 2.159824768223038e-07, + "loss": 2.0921, + "step": 2343 + }, + { + "epoch": 0.833570412517781, + "grad_norm": 1.082523226737976, + "learning_rate": 2.150850715697823e-07, + "loss": 2.4664, + "step": 2344 + }, + { + "epoch": 0.8339260312944523, + "grad_norm": 0.8580636382102966, + "learning_rate": 2.141893905452923e-07, + "loss": 2.4402, + "step": 2345 + }, + { + "epoch": 0.8342816500711238, + "grad_norm": 1.176915168762207, + "learning_rate": 2.132954349507482e-07, + "loss": 3.2938, + "step": 2346 + }, + { + "epoch": 0.8346372688477952, + "grad_norm": 0.9440568089485168, + "learning_rate": 2.1240320598575048e-07, + "loss": 2.5489, + "step": 2347 + }, + { + "epoch": 0.8349928876244666, + "grad_norm": 1.0521808862686157, + "learning_rate": 2.115127048475805e-07, + "loss": 2.5575, + "step": 2348 + }, + { + "epoch": 0.835348506401138, + "grad_norm": 0.97047358751297, + "learning_rate": 2.106239327312031e-07, + "loss": 1.9918, + "step": 2349 + }, + { + "epoch": 0.8357041251778093, + "grad_norm": 1.23579740524292, + "learning_rate": 2.097368908292618e-07, + "loss": 2.7581, + "step": 2350 + }, + { + "epoch": 0.8360597439544808, + "grad_norm": 1.4545401334762573, + "learning_rate": 2.088515803320785e-07, + "loss": 3.3338, + "step": 2351 + }, + { + "epoch": 0.8364153627311522, + "grad_norm": 1.5493648052215576, + "learning_rate": 2.0796800242765185e-07, + "loss": 3.203, + "step": 2352 + }, + { + "epoch": 0.8367709815078236, + "grad_norm": 1.2965432405471802, + "learning_rate": 2.0708615830165535e-07, + "loss": 2.8793, + "step": 2353 + }, + { + "epoch": 0.837126600284495, + "grad_norm": 0.9494686126708984, + "learning_rate": 2.062060491374369e-07, + "loss": 2.5847, + "step": 2354 + }, + { + "epoch": 0.8374822190611664, + "grad_norm": 0.7240934371948242, + "learning_rate": 2.0532767611601417e-07, + "loss": 1.5119, + "step": 2355 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 0.8726705312728882, + "learning_rate": 2.0445104041607743e-07, + "loss": 2.657, + "step": 2356 + }, + { + "epoch": 0.8381934566145093, + "grad_norm": 1.664900779724121, + "learning_rate": 2.0357614321398422e-07, + "loss": 3.3832, + "step": 2357 + }, + { + "epoch": 0.8385490753911806, + "grad_norm": 0.9923958778381348, + "learning_rate": 2.0270298568375923e-07, + "loss": 2.0969, + "step": 2358 + }, + { + "epoch": 0.8389046941678521, + "grad_norm": 1.807671308517456, + "learning_rate": 2.018315689970942e-07, + "loss": 3.5057, + "step": 2359 + }, + { + "epoch": 0.8392603129445235, + "grad_norm": 1.5743175745010376, + "learning_rate": 2.0096189432334195e-07, + "loss": 3.6149, + "step": 2360 + }, + { + "epoch": 0.8396159317211949, + "grad_norm": 0.8883889317512512, + "learning_rate": 2.0009396282952074e-07, + "loss": 2.4654, + "step": 2361 + }, + { + "epoch": 0.8399715504978663, + "grad_norm": 1.0718340873718262, + "learning_rate": 1.9922777568030782e-07, + "loss": 2.3258, + "step": 2362 + }, + { + "epoch": 0.8403271692745377, + "grad_norm": 0.8844662308692932, + "learning_rate": 1.9836333403804018e-07, + "loss": 2.3525, + "step": 2363 + }, + { + "epoch": 0.8406827880512091, + "grad_norm": 3.546264410018921, + "learning_rate": 1.9750063906271266e-07, + "loss": 3.9568, + "step": 2364 + }, + { + "epoch": 0.8410384068278806, + "grad_norm": 0.9292370080947876, + "learning_rate": 1.966396919119755e-07, + "loss": 2.622, + "step": 2365 + }, + { + "epoch": 0.8413940256045519, + "grad_norm": 1.2340316772460938, + "learning_rate": 1.957804937411351e-07, + "loss": 3.2001, + "step": 2366 + }, + { + "epoch": 0.8417496443812233, + "grad_norm": 0.9745893478393555, + "learning_rate": 1.9492304570314935e-07, + "loss": 2.9859, + "step": 2367 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 1.0931134223937988, + "learning_rate": 1.940673489486285e-07, + "loss": 2.3696, + "step": 2368 + }, + { + "epoch": 0.8424608819345661, + "grad_norm": 1.1146442890167236, + "learning_rate": 1.932134046258322e-07, + "loss": 2.4756, + "step": 2369 + }, + { + "epoch": 0.8428165007112376, + "grad_norm": 1.2338624000549316, + "learning_rate": 1.923612138806692e-07, + "loss": 2.4653, + "step": 2370 + }, + { + "epoch": 0.8431721194879089, + "grad_norm": 1.2167789936065674, + "learning_rate": 1.9151077785669385e-07, + "loss": 2.7354, + "step": 2371 + }, + { + "epoch": 0.8435277382645804, + "grad_norm": 1.0949010848999023, + "learning_rate": 1.9066209769510785e-07, + "loss": 3.0861, + "step": 2372 + }, + { + "epoch": 0.8438833570412517, + "grad_norm": 1.028254508972168, + "learning_rate": 1.8981517453475499e-07, + "loss": 2.5478, + "step": 2373 + }, + { + "epoch": 0.8442389758179232, + "grad_norm": 1.6373111009597778, + "learning_rate": 1.889700095121219e-07, + "loss": 2.578, + "step": 2374 + }, + { + "epoch": 0.8445945945945946, + "grad_norm": 1.7328667640686035, + "learning_rate": 1.8812660376133618e-07, + "loss": 1.9747, + "step": 2375 + }, + { + "epoch": 0.844950213371266, + "grad_norm": 0.7474656701087952, + "learning_rate": 1.8728495841416415e-07, + "loss": 2.2058, + "step": 2376 + }, + { + "epoch": 0.8453058321479374, + "grad_norm": 0.7675513029098511, + "learning_rate": 1.8644507460001043e-07, + "loss": 2.0013, + "step": 2377 + }, + { + "epoch": 0.8456614509246089, + "grad_norm": 0.9975234866142273, + "learning_rate": 1.856069534459151e-07, + "loss": 2.6741, + "step": 2378 + }, + { + "epoch": 0.8460170697012802, + "grad_norm": 1.0986260175704956, + "learning_rate": 1.8477059607655407e-07, + "loss": 3.0529, + "step": 2379 + }, + { + "epoch": 0.8463726884779517, + "grad_norm": 0.9797425270080566, + "learning_rate": 1.8393600361423534e-07, + "loss": 2.4118, + "step": 2380 + }, + { + "epoch": 0.846728307254623, + "grad_norm": 0.9024088382720947, + "learning_rate": 1.8310317717889913e-07, + "loss": 2.3698, + "step": 2381 + }, + { + "epoch": 0.8470839260312945, + "grad_norm": 1.6451040506362915, + "learning_rate": 1.822721178881156e-07, + "loss": 3.5949, + "step": 2382 + }, + { + "epoch": 0.8474395448079659, + "grad_norm": 1.0953186750411987, + "learning_rate": 1.8144282685708336e-07, + "loss": 3.3312, + "step": 2383 + }, + { + "epoch": 0.8477951635846372, + "grad_norm": 1.2633748054504395, + "learning_rate": 1.8061530519862907e-07, + "loss": 3.78, + "step": 2384 + }, + { + "epoch": 0.8481507823613087, + "grad_norm": 0.9972317814826965, + "learning_rate": 1.7978955402320412e-07, + "loss": 2.7391, + "step": 2385 + }, + { + "epoch": 0.84850640113798, + "grad_norm": 1.0927222967147827, + "learning_rate": 1.7896557443888467e-07, + "loss": 2.3443, + "step": 2386 + }, + { + "epoch": 0.8488620199146515, + "grad_norm": 1.5056382417678833, + "learning_rate": 1.7814336755136923e-07, + "loss": 2.8452, + "step": 2387 + }, + { + "epoch": 0.8492176386913229, + "grad_norm": 2.5311219692230225, + "learning_rate": 1.7732293446397723e-07, + "loss": 3.6175, + "step": 2388 + }, + { + "epoch": 0.8495732574679943, + "grad_norm": 1.3796595335006714, + "learning_rate": 1.7650427627764938e-07, + "loss": 3.0397, + "step": 2389 + }, + { + "epoch": 0.8499288762446657, + "grad_norm": 1.0554769039154053, + "learning_rate": 1.7568739409094236e-07, + "loss": 2.9682, + "step": 2390 + }, + { + "epoch": 0.8502844950213371, + "grad_norm": 1.739322543144226, + "learning_rate": 1.7487228900003155e-07, + "loss": 2.6757, + "step": 2391 + }, + { + "epoch": 0.8506401137980085, + "grad_norm": 1.1780493259429932, + "learning_rate": 1.7405896209870663e-07, + "loss": 1.8977, + "step": 2392 + }, + { + "epoch": 0.85099573257468, + "grad_norm": 1.3362414836883545, + "learning_rate": 1.732474144783713e-07, + "loss": 3.2885, + "step": 2393 + }, + { + "epoch": 0.8513513513513513, + "grad_norm": 1.329075813293457, + "learning_rate": 1.7243764722804233e-07, + "loss": 2.9995, + "step": 2394 + }, + { + "epoch": 0.8517069701280228, + "grad_norm": 0.8766903281211853, + "learning_rate": 1.7162966143434595e-07, + "loss": 2.3116, + "step": 2395 + }, + { + "epoch": 0.8520625889046942, + "grad_norm": 1.0890787839889526, + "learning_rate": 1.7082345818151978e-07, + "loss": 2.526, + "step": 2396 + }, + { + "epoch": 0.8524182076813656, + "grad_norm": 1.0623602867126465, + "learning_rate": 1.70019038551407e-07, + "loss": 2.5228, + "step": 2397 + }, + { + "epoch": 0.852773826458037, + "grad_norm": 1.4346168041229248, + "learning_rate": 1.692164036234601e-07, + "loss": 2.5998, + "step": 2398 + }, + { + "epoch": 0.8531294452347084, + "grad_norm": 0.6973717212677002, + "learning_rate": 1.6841555447473466e-07, + "loss": 1.5138, + "step": 2399 + }, + { + "epoch": 0.8534850640113798, + "grad_norm": 0.7696399688720703, + "learning_rate": 1.6761649217989028e-07, + "loss": 2.2044, + "step": 2400 + }, + { + "epoch": 0.8538406827880513, + "grad_norm": 1.8270294666290283, + "learning_rate": 1.668192178111902e-07, + "loss": 2.8866, + "step": 2401 + }, + { + "epoch": 0.8541963015647226, + "grad_norm": 0.950035572052002, + "learning_rate": 1.6602373243849595e-07, + "loss": 2.7106, + "step": 2402 + }, + { + "epoch": 0.854551920341394, + "grad_norm": 0.9316115975379944, + "learning_rate": 1.652300371292708e-07, + "loss": 2.495, + "step": 2403 + }, + { + "epoch": 0.8549075391180654, + "grad_norm": 2.9807474613189697, + "learning_rate": 1.6443813294857452e-07, + "loss": 3.4639, + "step": 2404 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 1.2970131635665894, + "learning_rate": 1.6364802095906351e-07, + "loss": 2.7048, + "step": 2405 + }, + { + "epoch": 0.8556187766714083, + "grad_norm": 1.569514513015747, + "learning_rate": 1.6285970222099033e-07, + "loss": 2.5713, + "step": 2406 + }, + { + "epoch": 0.8559743954480796, + "grad_norm": 0.7236081957817078, + "learning_rate": 1.6207317779219916e-07, + "loss": 2.076, + "step": 2407 + }, + { + "epoch": 0.8563300142247511, + "grad_norm": 1.0364186763763428, + "learning_rate": 1.6128844872812836e-07, + "loss": 2.005, + "step": 2408 + }, + { + "epoch": 0.8566856330014224, + "grad_norm": 1.0293139219284058, + "learning_rate": 1.6050551608180598e-07, + "loss": 3.0917, + "step": 2409 + }, + { + "epoch": 0.8570412517780939, + "grad_norm": 1.7371875047683716, + "learning_rate": 1.5972438090384973e-07, + "loss": 1.8531, + "step": 2410 + }, + { + "epoch": 0.8573968705547653, + "grad_norm": 0.9789634943008423, + "learning_rate": 1.589450442424658e-07, + "loss": 2.8343, + "step": 2411 + }, + { + "epoch": 0.8577524893314367, + "grad_norm": 2.098870038986206, + "learning_rate": 1.581675071434457e-07, + "loss": 3.6242, + "step": 2412 + }, + { + "epoch": 0.8581081081081081, + "grad_norm": 0.8093407154083252, + "learning_rate": 1.5739177065016774e-07, + "loss": 1.6011, + "step": 2413 + }, + { + "epoch": 0.8584637268847796, + "grad_norm": 0.8898791074752808, + "learning_rate": 1.566178358035921e-07, + "loss": 2.4232, + "step": 2414 + }, + { + "epoch": 0.8588193456614509, + "grad_norm": 0.87107253074646, + "learning_rate": 1.5584570364226325e-07, + "loss": 2.3484, + "step": 2415 + }, + { + "epoch": 0.8591749644381224, + "grad_norm": 2.0402588844299316, + "learning_rate": 1.550753752023053e-07, + "loss": 3.5264, + "step": 2416 + }, + { + "epoch": 0.8595305832147937, + "grad_norm": 1.9066723585128784, + "learning_rate": 1.543068515174224e-07, + "loss": 3.509, + "step": 2417 + }, + { + "epoch": 0.8598862019914651, + "grad_norm": 1.3495210409164429, + "learning_rate": 1.5354013361889764e-07, + "loss": 3.0587, + "step": 2418 + }, + { + "epoch": 0.8602418207681366, + "grad_norm": 0.7739881873130798, + "learning_rate": 1.5277522253558878e-07, + "loss": 2.1499, + "step": 2419 + }, + { + "epoch": 0.8605974395448079, + "grad_norm": 1.607041358947754, + "learning_rate": 1.5201211929393166e-07, + "loss": 3.86, + "step": 2420 + }, + { + "epoch": 0.8609530583214794, + "grad_norm": 0.951346755027771, + "learning_rate": 1.5125082491793445e-07, + "loss": 2.5052, + "step": 2421 + }, + { + "epoch": 0.8613086770981507, + "grad_norm": 1.512324333190918, + "learning_rate": 1.5049134042917816e-07, + "loss": 3.1775, + "step": 2422 + }, + { + "epoch": 0.8616642958748222, + "grad_norm": 1.5048972368240356, + "learning_rate": 1.497336668468164e-07, + "loss": 2.1403, + "step": 2423 + }, + { + "epoch": 0.8620199146514936, + "grad_norm": 0.9908884167671204, + "learning_rate": 1.4897780518757064e-07, + "loss": 2.6964, + "step": 2424 + }, + { + "epoch": 0.862375533428165, + "grad_norm": 1.0359039306640625, + "learning_rate": 1.482237564657326e-07, + "loss": 2.4509, + "step": 2425 + }, + { + "epoch": 0.8627311522048364, + "grad_norm": 0.9363325238227844, + "learning_rate": 1.4747152169316086e-07, + "loss": 2.6857, + "step": 2426 + }, + { + "epoch": 0.8630867709815079, + "grad_norm": 0.8140444159507751, + "learning_rate": 1.4672110187927928e-07, + "loss": 2.6207, + "step": 2427 + }, + { + "epoch": 0.8634423897581792, + "grad_norm": 0.7848725318908691, + "learning_rate": 1.459724980310767e-07, + "loss": 1.3529, + "step": 2428 + }, + { + "epoch": 0.8637980085348507, + "grad_norm": 1.3557345867156982, + "learning_rate": 1.4522571115310474e-07, + "loss": 2.6624, + "step": 2429 + }, + { + "epoch": 0.864153627311522, + "grad_norm": 1.5919333696365356, + "learning_rate": 1.4448074224747775e-07, + "loss": 2.4437, + "step": 2430 + }, + { + "epoch": 0.8645092460881935, + "grad_norm": 3.5430004596710205, + "learning_rate": 1.4373759231386964e-07, + "loss": 1.5868, + "step": 2431 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 1.0483205318450928, + "learning_rate": 1.4299626234951363e-07, + "loss": 2.6446, + "step": 2432 + }, + { + "epoch": 0.8652204836415363, + "grad_norm": 1.7366807460784912, + "learning_rate": 1.4225675334920085e-07, + "loss": 3.215, + "step": 2433 + }, + { + "epoch": 0.8655761024182077, + "grad_norm": 0.8591130375862122, + "learning_rate": 1.4151906630527865e-07, + "loss": 2.4761, + "step": 2434 + }, + { + "epoch": 0.865931721194879, + "grad_norm": 0.9327000379562378, + "learning_rate": 1.407832022076499e-07, + "loss": 2.0149, + "step": 2435 + }, + { + "epoch": 0.8662873399715505, + "grad_norm": 3.4652833938598633, + "learning_rate": 1.4004916204377066e-07, + "loss": 4.1612, + "step": 2436 + }, + { + "epoch": 0.866642958748222, + "grad_norm": 2.274749279022217, + "learning_rate": 1.3931694679865036e-07, + "loss": 3.4673, + "step": 2437 + }, + { + "epoch": 0.8669985775248933, + "grad_norm": 0.9224753975868225, + "learning_rate": 1.385865574548489e-07, + "loss": 1.9922, + "step": 2438 + }, + { + "epoch": 0.8673541963015647, + "grad_norm": 1.0968791246414185, + "learning_rate": 1.3785799499247586e-07, + "loss": 2.7737, + "step": 2439 + }, + { + "epoch": 0.8677098150782361, + "grad_norm": 1.2448872327804565, + "learning_rate": 1.3713126038918978e-07, + "loss": 2.9315, + "step": 2440 + }, + { + "epoch": 0.8680654338549075, + "grad_norm": 2.0699589252471924, + "learning_rate": 1.3640635462019617e-07, + "loss": 2.359, + "step": 2441 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 3.2989768981933594, + "learning_rate": 1.3568327865824615e-07, + "loss": 1.8676, + "step": 2442 + }, + { + "epoch": 0.8687766714082503, + "grad_norm": 0.9583457708358765, + "learning_rate": 1.3496203347363634e-07, + "loss": 2.4553, + "step": 2443 + }, + { + "epoch": 0.8691322901849218, + "grad_norm": 1.2067304849624634, + "learning_rate": 1.3424262003420572e-07, + "loss": 3.0757, + "step": 2444 + }, + { + "epoch": 0.8694879089615932, + "grad_norm": 1.123315453529358, + "learning_rate": 1.3352503930533577e-07, + "loss": 2.2998, + "step": 2445 + }, + { + "epoch": 0.8698435277382646, + "grad_norm": 1.3711894750595093, + "learning_rate": 1.328092922499482e-07, + "loss": 3.5453, + "step": 2446 + }, + { + "epoch": 0.870199146514936, + "grad_norm": 1.8096699714660645, + "learning_rate": 1.3209537982850422e-07, + "loss": 3.2119, + "step": 2447 + }, + { + "epoch": 0.8705547652916074, + "grad_norm": 1.8809894323349, + "learning_rate": 1.3138330299900386e-07, + "loss": 3.135, + "step": 2448 + }, + { + "epoch": 0.8709103840682788, + "grad_norm": 0.8985826969146729, + "learning_rate": 1.3067306271698293e-07, + "loss": 1.7511, + "step": 2449 + }, + { + "epoch": 0.8712660028449503, + "grad_norm": 1.6436909437179565, + "learning_rate": 1.2996465993551355e-07, + "loss": 2.3905, + "step": 2450 + }, + { + "epoch": 0.8716216216216216, + "grad_norm": 0.974930465221405, + "learning_rate": 1.2925809560520147e-07, + "loss": 2.348, + "step": 2451 + }, + { + "epoch": 0.871977240398293, + "grad_norm": 0.851917564868927, + "learning_rate": 1.2855337067418575e-07, + "loss": 2.726, + "step": 2452 + }, + { + "epoch": 0.8723328591749644, + "grad_norm": 1.2084070444107056, + "learning_rate": 1.2785048608813781e-07, + "loss": 2.5622, + "step": 2453 + }, + { + "epoch": 0.8726884779516358, + "grad_norm": 2.7707347869873047, + "learning_rate": 1.2714944279025798e-07, + "loss": 3.7803, + "step": 2454 + }, + { + "epoch": 0.8730440967283073, + "grad_norm": 0.8989809155464172, + "learning_rate": 1.2645024172127706e-07, + "loss": 2.6797, + "step": 2455 + }, + { + "epoch": 0.8733997155049786, + "grad_norm": 0.8197609782218933, + "learning_rate": 1.2575288381945337e-07, + "loss": 2.7372, + "step": 2456 + }, + { + "epoch": 0.8737553342816501, + "grad_norm": 0.903705358505249, + "learning_rate": 1.250573700205717e-07, + "loss": 2.4876, + "step": 2457 + }, + { + "epoch": 0.8741109530583214, + "grad_norm": 1.2178266048431396, + "learning_rate": 1.2436370125794267e-07, + "loss": 2.6774, + "step": 2458 + }, + { + "epoch": 0.8744665718349929, + "grad_norm": 0.7250889539718628, + "learning_rate": 1.2367187846240013e-07, + "loss": 2.008, + "step": 2459 + }, + { + "epoch": 0.8748221906116643, + "grad_norm": 0.9249468445777893, + "learning_rate": 1.2298190256230234e-07, + "loss": 1.8678, + "step": 2460 + }, + { + "epoch": 0.8751778093883357, + "grad_norm": 0.7880070805549622, + "learning_rate": 1.222937744835275e-07, + "loss": 1.9868, + "step": 2461 + }, + { + "epoch": 0.8755334281650071, + "grad_norm": 0.7734028697013855, + "learning_rate": 1.2160749514947567e-07, + "loss": 2.247, + "step": 2462 + }, + { + "epoch": 0.8758890469416786, + "grad_norm": 1.110905408859253, + "learning_rate": 1.2092306548106514e-07, + "loss": 2.8861, + "step": 2463 + }, + { + "epoch": 0.8762446657183499, + "grad_norm": 1.939935326576233, + "learning_rate": 1.2024048639673225e-07, + "loss": 3.4705, + "step": 2464 + }, + { + "epoch": 0.8766002844950214, + "grad_norm": 1.1875029802322388, + "learning_rate": 1.1955975881243114e-07, + "loss": 2.7227, + "step": 2465 + }, + { + "epoch": 0.8769559032716927, + "grad_norm": 1.1750072240829468, + "learning_rate": 1.188808836416293e-07, + "loss": 3.1694, + "step": 2466 + }, + { + "epoch": 0.8773115220483642, + "grad_norm": 0.9898868203163147, + "learning_rate": 1.1820386179531051e-07, + "loss": 2.5873, + "step": 2467 + }, + { + "epoch": 0.8776671408250356, + "grad_norm": 0.9459788799285889, + "learning_rate": 1.1752869418197054e-07, + "loss": 2.8962, + "step": 2468 + }, + { + "epoch": 0.878022759601707, + "grad_norm": 1.0694953203201294, + "learning_rate": 1.1685538170761683e-07, + "loss": 3.1551, + "step": 2469 + }, + { + "epoch": 0.8783783783783784, + "grad_norm": 0.8763519525527954, + "learning_rate": 1.1618392527576866e-07, + "loss": 2.1963, + "step": 2470 + }, + { + "epoch": 0.8787339971550497, + "grad_norm": 1.558605432510376, + "learning_rate": 1.1551432578745274e-07, + "loss": 2.0686, + "step": 2471 + }, + { + "epoch": 0.8790896159317212, + "grad_norm": 0.8532381057739258, + "learning_rate": 1.1484658414120585e-07, + "loss": 2.3852, + "step": 2472 + }, + { + "epoch": 0.8794452347083926, + "grad_norm": 1.6501785516738892, + "learning_rate": 1.141807012330699e-07, + "loss": 2.5481, + "step": 2473 + }, + { + "epoch": 0.879800853485064, + "grad_norm": 2.3818094730377197, + "learning_rate": 1.135166779565941e-07, + "loss": 3.1519, + "step": 2474 + }, + { + "epoch": 0.8801564722617354, + "grad_norm": 0.8475176692008972, + "learning_rate": 1.1285451520283219e-07, + "loss": 2.7264, + "step": 2475 + }, + { + "epoch": 0.8805120910384068, + "grad_norm": 0.8944806456565857, + "learning_rate": 1.1219421386033957e-07, + "loss": 2.5661, + "step": 2476 + }, + { + "epoch": 0.8808677098150782, + "grad_norm": 2.2286622524261475, + "learning_rate": 1.1153577481517596e-07, + "loss": 3.6922, + "step": 2477 + }, + { + "epoch": 0.8812233285917497, + "grad_norm": 2.1408016681671143, + "learning_rate": 1.108791989509001e-07, + "loss": 3.5962, + "step": 2478 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 2.155402898788452, + "learning_rate": 1.1022448714857236e-07, + "loss": 3.2836, + "step": 2479 + }, + { + "epoch": 0.8819345661450925, + "grad_norm": 1.158920407295227, + "learning_rate": 1.0957164028675066e-07, + "loss": 3.1252, + "step": 2480 + }, + { + "epoch": 0.8822901849217639, + "grad_norm": 1.1661713123321533, + "learning_rate": 1.0892065924149003e-07, + "loss": 2.8777, + "step": 2481 + }, + { + "epoch": 0.8826458036984353, + "grad_norm": 0.9143955707550049, + "learning_rate": 1.0827154488634322e-07, + "loss": 1.9897, + "step": 2482 + }, + { + "epoch": 0.8830014224751067, + "grad_norm": 0.8053215742111206, + "learning_rate": 1.0762429809235597e-07, + "loss": 2.195, + "step": 2483 + }, + { + "epoch": 0.883357041251778, + "grad_norm": 1.129955768585205, + "learning_rate": 1.0697891972807017e-07, + "loss": 2.7395, + "step": 2484 + }, + { + "epoch": 0.8837126600284495, + "grad_norm": 3.0594727993011475, + "learning_rate": 1.0633541065951874e-07, + "loss": 4.0858, + "step": 2485 + }, + { + "epoch": 0.884068278805121, + "grad_norm": 1.5086842775344849, + "learning_rate": 1.0569377175022692e-07, + "loss": 2.7735, + "step": 2486 + }, + { + "epoch": 0.8844238975817923, + "grad_norm": 1.0455458164215088, + "learning_rate": 1.05054003861211e-07, + "loss": 2.0582, + "step": 2487 + }, + { + "epoch": 0.8847795163584637, + "grad_norm": 0.783622145652771, + "learning_rate": 1.0441610785097471e-07, + "loss": 2.5255, + "step": 2488 + }, + { + "epoch": 0.8851351351351351, + "grad_norm": 0.7707294225692749, + "learning_rate": 1.0378008457551186e-07, + "loss": 2.1863, + "step": 2489 + }, + { + "epoch": 0.8854907539118065, + "grad_norm": 1.5105761289596558, + "learning_rate": 1.0314593488830221e-07, + "loss": 3.1195, + "step": 2490 + }, + { + "epoch": 0.885846372688478, + "grad_norm": 1.5354403257369995, + "learning_rate": 1.0251365964031156e-07, + "loss": 3.0472, + "step": 2491 + }, + { + "epoch": 0.8862019914651493, + "grad_norm": 1.401746392250061, + "learning_rate": 1.018832596799904e-07, + "loss": 3.1333, + "step": 2492 + }, + { + "epoch": 0.8865576102418208, + "grad_norm": 0.8898289203643799, + "learning_rate": 1.0125473585327238e-07, + "loss": 2.5713, + "step": 2493 + }, + { + "epoch": 0.8869132290184921, + "grad_norm": 0.9587617516517639, + "learning_rate": 1.00628089003575e-07, + "loss": 2.3555, + "step": 2494 + }, + { + "epoch": 0.8872688477951636, + "grad_norm": 1.1104379892349243, + "learning_rate": 1.0000331997179479e-07, + "loss": 2.6714, + "step": 2495 + }, + { + "epoch": 0.887624466571835, + "grad_norm": 1.1933680772781372, + "learning_rate": 9.938042959631044e-08, + "loss": 2.7524, + "step": 2496 + }, + { + "epoch": 0.8879800853485064, + "grad_norm": 1.133814811706543, + "learning_rate": 9.875941871297867e-08, + "loss": 2.9004, + "step": 2497 + }, + { + "epoch": 0.8883357041251778, + "grad_norm": 1.859856367111206, + "learning_rate": 9.814028815513438e-08, + "loss": 3.4326, + "step": 2498 + }, + { + "epoch": 0.8886913229018493, + "grad_norm": 1.7822531461715698, + "learning_rate": 9.752303875358897e-08, + "loss": 3.0936, + "step": 2499 + }, + { + "epoch": 0.8890469416785206, + "grad_norm": 4.646233081817627, + "learning_rate": 9.690767133662976e-08, + "loss": 2.2074, + "step": 2500 + }, + { + "epoch": 0.8894025604551921, + "grad_norm": 0.850702702999115, + "learning_rate": 9.629418673001883e-08, + "loss": 2.1169, + "step": 2501 + }, + { + "epoch": 0.8897581792318634, + "grad_norm": 0.8942809700965881, + "learning_rate": 9.568258575699152e-08, + "loss": 2.5059, + "step": 2502 + }, + { + "epoch": 0.8901137980085349, + "grad_norm": 2.073350667953491, + "learning_rate": 9.507286923825532e-08, + "loss": 3.9391, + "step": 2503 + }, + { + "epoch": 0.8904694167852063, + "grad_norm": 1.6095678806304932, + "learning_rate": 9.446503799198941e-08, + "loss": 3.2135, + "step": 2504 + }, + { + "epoch": 0.8908250355618776, + "grad_norm": 0.9050130248069763, + "learning_rate": 9.385909283384219e-08, + "loss": 2.6423, + "step": 2505 + }, + { + "epoch": 0.8911806543385491, + "grad_norm": 0.7607124447822571, + "learning_rate": 9.325503457693274e-08, + "loss": 2.2157, + "step": 2506 + }, + { + "epoch": 0.8915362731152204, + "grad_norm": 1.1713802814483643, + "learning_rate": 9.265286403184664e-08, + "loss": 1.501, + "step": 2507 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 1.737778663635254, + "learning_rate": 9.205258200663685e-08, + "loss": 2.4676, + "step": 2508 + }, + { + "epoch": 0.8922475106685633, + "grad_norm": 1.9424411058425903, + "learning_rate": 9.145418930682236e-08, + "loss": 2.3786, + "step": 2509 + }, + { + "epoch": 0.8926031294452347, + "grad_norm": 0.9744926691055298, + "learning_rate": 9.085768673538652e-08, + "loss": 1.9417, + "step": 2510 + }, + { + "epoch": 0.8929587482219061, + "grad_norm": 0.9820840358734131, + "learning_rate": 9.026307509277603e-08, + "loss": 2.4412, + "step": 2511 + }, + { + "epoch": 0.8933143669985776, + "grad_norm": 0.944648027420044, + "learning_rate": 8.967035517690148e-08, + "loss": 2.7016, + "step": 2512 + }, + { + "epoch": 0.8936699857752489, + "grad_norm": 1.1615550518035889, + "learning_rate": 8.907952778313328e-08, + "loss": 3.1074, + "step": 2513 + }, + { + "epoch": 0.8940256045519204, + "grad_norm": 0.9611197710037231, + "learning_rate": 8.849059370430357e-08, + "loss": 2.7969, + "step": 2514 + }, + { + "epoch": 0.8943812233285917, + "grad_norm": 0.9086797833442688, + "learning_rate": 8.790355373070286e-08, + "loss": 1.8826, + "step": 2515 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 1.3273969888687134, + "learning_rate": 8.731840865008067e-08, + "loss": 3.1461, + "step": 2516 + }, + { + "epoch": 0.8950924608819346, + "grad_norm": 1.058911681175232, + "learning_rate": 8.673515924764342e-08, + "loss": 2.8729, + "step": 2517 + }, + { + "epoch": 0.895448079658606, + "grad_norm": 1.3024638891220093, + "learning_rate": 8.615380630605352e-08, + "loss": 2.772, + "step": 2518 + }, + { + "epoch": 0.8958036984352774, + "grad_norm": 1.385345458984375, + "learning_rate": 8.557435060542929e-08, + "loss": 3.2017, + "step": 2519 + }, + { + "epoch": 0.8961593172119487, + "grad_norm": 1.0232610702514648, + "learning_rate": 8.499679292334239e-08, + "loss": 2.7298, + "step": 2520 + }, + { + "epoch": 0.8965149359886202, + "grad_norm": 0.8883377909660339, + "learning_rate": 8.442113403481772e-08, + "loss": 1.3049, + "step": 2521 + }, + { + "epoch": 0.8968705547652916, + "grad_norm": 0.9521697163581848, + "learning_rate": 8.38473747123325e-08, + "loss": 2.3793, + "step": 2522 + }, + { + "epoch": 0.897226173541963, + "grad_norm": 1.4534027576446533, + "learning_rate": 8.32755157258142e-08, + "loss": 3.2021, + "step": 2523 + }, + { + "epoch": 0.8975817923186344, + "grad_norm": 1.210422396659851, + "learning_rate": 8.270555784264167e-08, + "loss": 2.9562, + "step": 2524 + }, + { + "epoch": 0.8979374110953058, + "grad_norm": 1.586084246635437, + "learning_rate": 8.21375018276404e-08, + "loss": 3.418, + "step": 2525 + }, + { + "epoch": 0.8982930298719772, + "grad_norm": 1.4202969074249268, + "learning_rate": 8.1571348443086e-08, + "loss": 3.2633, + "step": 2526 + }, + { + "epoch": 0.8986486486486487, + "grad_norm": 0.8447686433792114, + "learning_rate": 8.100709844869957e-08, + "loss": 2.6043, + "step": 2527 + }, + { + "epoch": 0.89900426742532, + "grad_norm": 1.4949678182601929, + "learning_rate": 8.044475260164846e-08, + "loss": 3.1431, + "step": 2528 + }, + { + "epoch": 0.8993598862019915, + "grad_norm": 0.9618422389030457, + "learning_rate": 7.988431165654553e-08, + "loss": 2.3744, + "step": 2529 + }, + { + "epoch": 0.8997155049786629, + "grad_norm": 1.4464744329452515, + "learning_rate": 7.932577636544585e-08, + "loss": 3.2173, + "step": 2530 + }, + { + "epoch": 0.9000711237553343, + "grad_norm": 1.0862232446670532, + "learning_rate": 7.876914747784875e-08, + "loss": 2.6778, + "step": 2531 + }, + { + "epoch": 0.9004267425320057, + "grad_norm": 0.7457472085952759, + "learning_rate": 7.821442574069488e-08, + "loss": 2.6597, + "step": 2532 + }, + { + "epoch": 0.9007823613086771, + "grad_norm": 1.30258047580719, + "learning_rate": 7.766161189836513e-08, + "loss": 2.9496, + "step": 2533 + }, + { + "epoch": 0.9011379800853485, + "grad_norm": 0.9052330255508423, + "learning_rate": 7.711070669268161e-08, + "loss": 2.0128, + "step": 2534 + }, + { + "epoch": 0.90149359886202, + "grad_norm": 1.820069432258606, + "learning_rate": 7.656171086290314e-08, + "loss": 2.5086, + "step": 2535 + }, + { + "epoch": 0.9018492176386913, + "grad_norm": 1.0625287294387817, + "learning_rate": 7.601462514572876e-08, + "loss": 1.9561, + "step": 2536 + }, + { + "epoch": 0.9022048364153628, + "grad_norm": 1.3438924551010132, + "learning_rate": 7.546945027529189e-08, + "loss": 3.6227, + "step": 2537 + }, + { + "epoch": 0.9025604551920341, + "grad_norm": 1.0083012580871582, + "learning_rate": 7.492618698316384e-08, + "loss": 2.4819, + "step": 2538 + }, + { + "epoch": 0.9029160739687055, + "grad_norm": 0.785523533821106, + "learning_rate": 7.438483599834961e-08, + "loss": 2.3033, + "step": 2539 + }, + { + "epoch": 0.903271692745377, + "grad_norm": 1.111541986465454, + "learning_rate": 7.384539804728813e-08, + "loss": 2.1019, + "step": 2540 + }, + { + "epoch": 0.9036273115220483, + "grad_norm": 1.2175710201263428, + "learning_rate": 7.330787385385218e-08, + "loss": 2.9459, + "step": 2541 + }, + { + "epoch": 0.9039829302987198, + "grad_norm": 0.8092734217643738, + "learning_rate": 7.277226413934496e-08, + "loss": 2.0118, + "step": 2542 + }, + { + "epoch": 0.9043385490753911, + "grad_norm": 1.262076735496521, + "learning_rate": 7.223856962250186e-08, + "loss": 2.984, + "step": 2543 + }, + { + "epoch": 0.9046941678520626, + "grad_norm": 1.2550485134124756, + "learning_rate": 7.170679101948785e-08, + "loss": 3.0367, + "step": 2544 + }, + { + "epoch": 0.905049786628734, + "grad_norm": 0.7519182562828064, + "learning_rate": 7.11769290438966e-08, + "loss": 2.3447, + "step": 2545 + }, + { + "epoch": 0.9054054054054054, + "grad_norm": 1.1353259086608887, + "learning_rate": 7.064898440675088e-08, + "loss": 3.0398, + "step": 2546 + }, + { + "epoch": 0.9057610241820768, + "grad_norm": 0.7707204222679138, + "learning_rate": 7.012295781649897e-08, + "loss": 1.8984, + "step": 2547 + }, + { + "epoch": 0.9061166429587483, + "grad_norm": 1.3598417043685913, + "learning_rate": 6.959884997901706e-08, + "loss": 3.0992, + "step": 2548 + }, + { + "epoch": 0.9064722617354196, + "grad_norm": 0.8454831838607788, + "learning_rate": 6.907666159760523e-08, + "loss": 2.8715, + "step": 2549 + }, + { + "epoch": 0.9068278805120911, + "grad_norm": 0.9987355470657349, + "learning_rate": 6.855639337298813e-08, + "loss": 1.7462, + "step": 2550 + }, + { + "epoch": 0.9071834992887624, + "grad_norm": 1.4492485523223877, + "learning_rate": 6.803804600331498e-08, + "loss": 2.2464, + "step": 2551 + }, + { + "epoch": 0.9075391180654339, + "grad_norm": 0.8108601570129395, + "learning_rate": 6.752162018415519e-08, + "loss": 1.6843, + "step": 2552 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 1.8020656108856201, + "learning_rate": 6.700711660850178e-08, + "loss": 2.6673, + "step": 2553 + }, + { + "epoch": 0.9082503556187767, + "grad_norm": 1.1484051942825317, + "learning_rate": 6.649453596676663e-08, + "loss": 2.7069, + "step": 2554 + }, + { + "epoch": 0.9086059743954481, + "grad_norm": 0.9080120325088501, + "learning_rate": 6.598387894678254e-08, + "loss": 2.1373, + "step": 2555 + }, + { + "epoch": 0.9089615931721194, + "grad_norm": 1.248227596282959, + "learning_rate": 6.547514623380019e-08, + "loss": 3.4072, + "step": 2556 + }, + { + "epoch": 0.9093172119487909, + "grad_norm": 1.902525544166565, + "learning_rate": 6.496833851048817e-08, + "loss": 3.6694, + "step": 2557 + }, + { + "epoch": 0.9096728307254623, + "grad_norm": 1.570737361907959, + "learning_rate": 6.446345645693264e-08, + "loss": 3.3173, + "step": 2558 + }, + { + "epoch": 0.9100284495021337, + "grad_norm": 1.3414281606674194, + "learning_rate": 6.396050075063414e-08, + "loss": 3.1664, + "step": 2559 + }, + { + "epoch": 0.9103840682788051, + "grad_norm": 1.5885238647460938, + "learning_rate": 6.345947206650981e-08, + "loss": 3.9724, + "step": 2560 + }, + { + "epoch": 0.9107396870554765, + "grad_norm": 2.044727325439453, + "learning_rate": 6.296037107689034e-08, + "loss": 2.9752, + "step": 2561 + }, + { + "epoch": 0.9110953058321479, + "grad_norm": 2.1974058151245117, + "learning_rate": 6.246319845151949e-08, + "loss": 4.2412, + "step": 2562 + }, + { + "epoch": 0.9114509246088194, + "grad_norm": 0.8739327192306519, + "learning_rate": 6.196795485755341e-08, + "loss": 2.4992, + "step": 2563 + }, + { + "epoch": 0.9118065433854907, + "grad_norm": 1.0753624439239502, + "learning_rate": 6.147464095955968e-08, + "loss": 2.6647, + "step": 2564 + }, + { + "epoch": 0.9121621621621622, + "grad_norm": 1.102317452430725, + "learning_rate": 6.098325741951677e-08, + "loss": 2.6693, + "step": 2565 + }, + { + "epoch": 0.9125177809388336, + "grad_norm": 0.8216279745101929, + "learning_rate": 6.049380489681239e-08, + "loss": 2.5145, + "step": 2566 + }, + { + "epoch": 0.912873399715505, + "grad_norm": 1.2792588472366333, + "learning_rate": 6.000628404824299e-08, + "loss": 3.0374, + "step": 2567 + }, + { + "epoch": 0.9132290184921764, + "grad_norm": 1.2484123706817627, + "learning_rate": 5.952069552801326e-08, + "loss": 3.3186, + "step": 2568 + }, + { + "epoch": 0.9135846372688478, + "grad_norm": 1.100712537765503, + "learning_rate": 5.9037039987734295e-08, + "loss": 2.9102, + "step": 2569 + }, + { + "epoch": 0.9139402560455192, + "grad_norm": 1.452695369720459, + "learning_rate": 5.855531807642445e-08, + "loss": 3.7025, + "step": 2570 + }, + { + "epoch": 0.9142958748221907, + "grad_norm": 2.8853187561035156, + "learning_rate": 5.8075530440505955e-08, + "loss": 3.2948, + "step": 2571 + }, + { + "epoch": 0.914651493598862, + "grad_norm": 1.1485629081726074, + "learning_rate": 5.759767772380647e-08, + "loss": 2.6258, + "step": 2572 + }, + { + "epoch": 0.9150071123755334, + "grad_norm": 0.995788037776947, + "learning_rate": 5.7121760567556746e-08, + "loss": 1.8348, + "step": 2573 + }, + { + "epoch": 0.9153627311522048, + "grad_norm": 0.9340800046920776, + "learning_rate": 5.6647779610390085e-08, + "loss": 2.0579, + "step": 2574 + }, + { + "epoch": 0.9157183499288762, + "grad_norm": 1.309481143951416, + "learning_rate": 5.6175735488341875e-08, + "loss": 3.3254, + "step": 2575 + }, + { + "epoch": 0.9160739687055477, + "grad_norm": 1.4374123811721802, + "learning_rate": 5.570562883484842e-08, + "loss": 2.3815, + "step": 2576 + }, + { + "epoch": 0.916429587482219, + "grad_norm": 1.2015122175216675, + "learning_rate": 5.5237460280746114e-08, + "loss": 1.9137, + "step": 2577 + }, + { + "epoch": 0.9167852062588905, + "grad_norm": 0.716677188873291, + "learning_rate": 5.4771230454270574e-08, + "loss": 2.0632, + "step": 2578 + }, + { + "epoch": 0.9171408250355618, + "grad_norm": 0.9137453436851501, + "learning_rate": 5.430693998105585e-08, + "loss": 2.1798, + "step": 2579 + }, + { + "epoch": 0.9174964438122333, + "grad_norm": 0.9911420345306396, + "learning_rate": 5.384458948413357e-08, + "loss": 2.9637, + "step": 2580 + }, + { + "epoch": 0.9178520625889047, + "grad_norm": 1.3207865953445435, + "learning_rate": 5.3384179583932104e-08, + "loss": 3.0712, + "step": 2581 + }, + { + "epoch": 0.9182076813655761, + "grad_norm": 0.8522067070007324, + "learning_rate": 5.292571089827558e-08, + "loss": 2.3044, + "step": 2582 + }, + { + "epoch": 0.9185633001422475, + "grad_norm": 1.7202147245407104, + "learning_rate": 5.246918404238371e-08, + "loss": 2.7706, + "step": 2583 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 1.2162996530532837, + "learning_rate": 5.201459962886995e-08, + "loss": 1.9764, + "step": 2584 + }, + { + "epoch": 0.9192745376955903, + "grad_norm": 0.9013593792915344, + "learning_rate": 5.1561958267741346e-08, + "loss": 2.2481, + "step": 2585 + }, + { + "epoch": 0.9196301564722618, + "grad_norm": 0.8621492981910706, + "learning_rate": 5.11112605663977e-08, + "loss": 2.382, + "step": 2586 + }, + { + "epoch": 0.9199857752489331, + "grad_norm": 1.3751604557037354, + "learning_rate": 5.066250712963022e-08, + "loss": 3.5431, + "step": 2587 + }, + { + "epoch": 0.9203413940256046, + "grad_norm": 1.900725245475769, + "learning_rate": 5.0215698559621884e-08, + "loss": 3.3187, + "step": 2588 + }, + { + "epoch": 0.920697012802276, + "grad_norm": 0.7517723441123962, + "learning_rate": 4.977083545594474e-08, + "loss": 2.4848, + "step": 2589 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.1940184831619263, + "learning_rate": 4.9327918415561276e-08, + "loss": 2.4622, + "step": 2590 + }, + { + "epoch": 0.9214082503556188, + "grad_norm": 0.9237121939659119, + "learning_rate": 4.88869480328219e-08, + "loss": 2.5144, + "step": 2591 + }, + { + "epoch": 0.9217638691322901, + "grad_norm": 1.1017674207687378, + "learning_rate": 4.844792489946492e-08, + "loss": 2.9987, + "step": 2592 + }, + { + "epoch": 0.9221194879089616, + "grad_norm": 1.3304879665374756, + "learning_rate": 4.801084960461627e-08, + "loss": 3.3882, + "step": 2593 + }, + { + "epoch": 0.922475106685633, + "grad_norm": 1.4895907640457153, + "learning_rate": 4.7575722734786774e-08, + "loss": 2.4773, + "step": 2594 + }, + { + "epoch": 0.9228307254623044, + "grad_norm": 1.262916922569275, + "learning_rate": 4.7142544873873874e-08, + "loss": 3.1503, + "step": 2595 + }, + { + "epoch": 0.9231863442389758, + "grad_norm": 0.8211431503295898, + "learning_rate": 4.671131660315908e-08, + "loss": 2.1552, + "step": 2596 + }, + { + "epoch": 0.9235419630156472, + "grad_norm": 1.1762890815734863, + "learning_rate": 4.628203850130769e-08, + "loss": 1.9, + "step": 2597 + }, + { + "epoch": 0.9238975817923186, + "grad_norm": 1.4060721397399902, + "learning_rate": 4.585471114436857e-08, + "loss": 3.0755, + "step": 2598 + }, + { + "epoch": 0.9242532005689901, + "grad_norm": 2.476511240005493, + "learning_rate": 4.5429335105772015e-08, + "loss": 2.8653, + "step": 2599 + }, + { + "epoch": 0.9246088193456614, + "grad_norm": 0.7913942337036133, + "learning_rate": 4.500591095633094e-08, + "loss": 2.6633, + "step": 2600 + }, + { + "epoch": 0.9249644381223329, + "grad_norm": 0.9811316132545471, + "learning_rate": 4.458443926423783e-08, + "loss": 2.5715, + "step": 2601 + }, + { + "epoch": 0.9253200568990043, + "grad_norm": 1.5360791683197021, + "learning_rate": 4.4164920595066275e-08, + "loss": 2.9703, + "step": 2602 + }, + { + "epoch": 0.9256756756756757, + "grad_norm": 1.2810865640640259, + "learning_rate": 4.3747355511768286e-08, + "loss": 3.0202, + "step": 2603 + }, + { + "epoch": 0.9260312944523471, + "grad_norm": 0.9524503946304321, + "learning_rate": 4.3331744574674815e-08, + "loss": 2.1796, + "step": 2604 + }, + { + "epoch": 0.9263869132290184, + "grad_norm": 1.226622462272644, + "learning_rate": 4.2918088341494577e-08, + "loss": 2.993, + "step": 2605 + }, + { + "epoch": 0.9267425320056899, + "grad_norm": 1.1527286767959595, + "learning_rate": 4.2506387367312547e-08, + "loss": 2.963, + "step": 2606 + }, + { + "epoch": 0.9270981507823614, + "grad_norm": 1.0365331172943115, + "learning_rate": 4.209664220459114e-08, + "loss": 2.3882, + "step": 2607 + }, + { + "epoch": 0.9274537695590327, + "grad_norm": 1.0270909070968628, + "learning_rate": 4.1688853403167195e-08, + "loss": 2.3453, + "step": 2608 + }, + { + "epoch": 0.9278093883357041, + "grad_norm": 1.5208865404129028, + "learning_rate": 4.1283021510252816e-08, + "loss": 1.2656, + "step": 2609 + }, + { + "epoch": 0.9281650071123755, + "grad_norm": 1.0716466903686523, + "learning_rate": 4.087914707043422e-08, + "loss": 2.8408, + "step": 2610 + }, + { + "epoch": 0.9285206258890469, + "grad_norm": 0.869314968585968, + "learning_rate": 4.047723062567038e-08, + "loss": 2.4465, + "step": 2611 + }, + { + "epoch": 0.9288762446657184, + "grad_norm": 0.9348461627960205, + "learning_rate": 4.0077272715293545e-08, + "loss": 2.2469, + "step": 2612 + }, + { + "epoch": 0.9292318634423897, + "grad_norm": 0.888795793056488, + "learning_rate": 3.967927387600706e-08, + "loss": 2.1039, + "step": 2613 + }, + { + "epoch": 0.9295874822190612, + "grad_norm": 2.0117907524108887, + "learning_rate": 3.928323464188621e-08, + "loss": 3.328, + "step": 2614 + }, + { + "epoch": 0.9299431009957326, + "grad_norm": 1.32895827293396, + "learning_rate": 3.8889155544376056e-08, + "loss": 2.673, + "step": 2615 + }, + { + "epoch": 0.930298719772404, + "grad_norm": 2.1060872077941895, + "learning_rate": 3.849703711229124e-08, + "loss": 3.5273, + "step": 2616 + }, + { + "epoch": 0.9306543385490754, + "grad_norm": 1.479435920715332, + "learning_rate": 3.810687987181638e-08, + "loss": 2.366, + "step": 2617 + }, + { + "epoch": 0.9310099573257468, + "grad_norm": 1.5519169569015503, + "learning_rate": 3.7718684346502994e-08, + "loss": 2.8182, + "step": 2618 + }, + { + "epoch": 0.9313655761024182, + "grad_norm": 0.8013685941696167, + "learning_rate": 3.73324510572714e-08, + "loss": 2.2782, + "step": 2619 + }, + { + "epoch": 0.9317211948790897, + "grad_norm": 1.0786528587341309, + "learning_rate": 3.6948180522408006e-08, + "loss": 2.354, + "step": 2620 + }, + { + "epoch": 0.932076813655761, + "grad_norm": 0.9507488012313843, + "learning_rate": 3.6565873257565495e-08, + "loss": 2.6128, + "step": 2621 + }, + { + "epoch": 0.9324324324324325, + "grad_norm": 0.9627034664154053, + "learning_rate": 3.618552977576267e-08, + "loss": 3.0443, + "step": 2622 + }, + { + "epoch": 0.9327880512091038, + "grad_norm": 1.3926628828048706, + "learning_rate": 3.58071505873821e-08, + "loss": 3.3907, + "step": 2623 + }, + { + "epoch": 0.9331436699857752, + "grad_norm": 1.1232130527496338, + "learning_rate": 3.543073620017145e-08, + "loss": 2.0711, + "step": 2624 + }, + { + "epoch": 0.9334992887624467, + "grad_norm": 0.8736873865127563, + "learning_rate": 3.505628711924119e-08, + "loss": 2.4666, + "step": 2625 + }, + { + "epoch": 0.933854907539118, + "grad_norm": 0.8112239241600037, + "learning_rate": 3.468380384706471e-08, + "loss": 2.4311, + "step": 2626 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.8814767599105835, + "learning_rate": 3.4313286883477515e-08, + "loss": 2.3741, + "step": 2627 + }, + { + "epoch": 0.9345661450924608, + "grad_norm": 1.245666742324829, + "learning_rate": 3.394473672567655e-08, + "loss": 2.8167, + "step": 2628 + }, + { + "epoch": 0.9349217638691323, + "grad_norm": 0.7368335723876953, + "learning_rate": 3.3578153868219555e-08, + "loss": 1.9586, + "step": 2629 + }, + { + "epoch": 0.9352773826458037, + "grad_norm": 0.9250180125236511, + "learning_rate": 3.321353880302436e-08, + "loss": 2.5097, + "step": 2630 + }, + { + "epoch": 0.9356330014224751, + "grad_norm": 1.1117792129516602, + "learning_rate": 3.285089201936775e-08, + "loss": 2.7701, + "step": 2631 + }, + { + "epoch": 0.9359886201991465, + "grad_norm": 1.244602084159851, + "learning_rate": 3.2490214003885966e-08, + "loss": 2.917, + "step": 2632 + }, + { + "epoch": 0.936344238975818, + "grad_norm": 0.9393265843391418, + "learning_rate": 3.213150524057268e-08, + "loss": 2.2718, + "step": 2633 + }, + { + "epoch": 0.9366998577524893, + "grad_norm": 0.8983284831047058, + "learning_rate": 3.1774766210780016e-08, + "loss": 1.7002, + "step": 2634 + }, + { + "epoch": 0.9370554765291608, + "grad_norm": 1.0256023406982422, + "learning_rate": 3.141999739321555e-08, + "loss": 3.0482, + "step": 2635 + }, + { + "epoch": 0.9374110953058321, + "grad_norm": 1.2034715414047241, + "learning_rate": 3.106719926394413e-08, + "loss": 1.6625, + "step": 2636 + }, + { + "epoch": 0.9377667140825036, + "grad_norm": 1.0244909524917603, + "learning_rate": 3.071637229638558e-08, + "loss": 2.626, + "step": 2637 + }, + { + "epoch": 0.938122332859175, + "grad_norm": 1.4263097047805786, + "learning_rate": 3.0367516961315124e-08, + "loss": 3.0395, + "step": 2638 + }, + { + "epoch": 0.9384779516358464, + "grad_norm": 0.7793145179748535, + "learning_rate": 3.002063372686148e-08, + "loss": 2.3919, + "step": 2639 + }, + { + "epoch": 0.9388335704125178, + "grad_norm": 1.6585009098052979, + "learning_rate": 2.967572305850763e-08, + "loss": 2.9444, + "step": 2640 + }, + { + "epoch": 0.9391891891891891, + "grad_norm": 0.8393080830574036, + "learning_rate": 2.9332785419089515e-08, + "loss": 2.4892, + "step": 2641 + }, + { + "epoch": 0.9395448079658606, + "grad_norm": 1.1286747455596924, + "learning_rate": 2.899182126879535e-08, + "loss": 1.3972, + "step": 2642 + }, + { + "epoch": 0.939900426742532, + "grad_norm": 1.0688728094100952, + "learning_rate": 2.8652831065164975e-08, + "loss": 2.7771, + "step": 2643 + }, + { + "epoch": 0.9402560455192034, + "grad_norm": 1.0394896268844604, + "learning_rate": 2.831581526308935e-08, + "loss": 2.0897, + "step": 2644 + }, + { + "epoch": 0.9406116642958748, + "grad_norm": 1.0127592086791992, + "learning_rate": 2.7980774314810553e-08, + "loss": 2.4512, + "step": 2645 + }, + { + "epoch": 0.9409672830725462, + "grad_norm": 0.9198864698410034, + "learning_rate": 2.764770866991978e-08, + "loss": 2.3447, + "step": 2646 + }, + { + "epoch": 0.9413229018492176, + "grad_norm": 1.0005885362625122, + "learning_rate": 2.7316618775358514e-08, + "loss": 2.2806, + "step": 2647 + }, + { + "epoch": 0.9416785206258891, + "grad_norm": 1.6848604679107666, + "learning_rate": 2.698750507541603e-08, + "loss": 2.8723, + "step": 2648 + }, + { + "epoch": 0.9420341394025604, + "grad_norm": 1.496047854423523, + "learning_rate": 2.6660368011730384e-08, + "loss": 3.2618, + "step": 2649 + }, + { + "epoch": 0.9423897581792319, + "grad_norm": 1.8597649335861206, + "learning_rate": 2.6335208023287094e-08, + "loss": 3.34, + "step": 2650 + }, + { + "epoch": 0.9427453769559033, + "grad_norm": 1.136025309562683, + "learning_rate": 2.6012025546417963e-08, + "loss": 2.8683, + "step": 2651 + }, + { + "epoch": 0.9431009957325747, + "grad_norm": 1.1489259004592896, + "learning_rate": 2.569082101480258e-08, + "loss": 2.921, + "step": 2652 + }, + { + "epoch": 0.9434566145092461, + "grad_norm": 1.6814714670181274, + "learning_rate": 2.5371594859464665e-08, + "loss": 2.9082, + "step": 2653 + }, + { + "epoch": 0.9438122332859175, + "grad_norm": 1.204319953918457, + "learning_rate": 2.5054347508774388e-08, + "loss": 3.1063, + "step": 2654 + }, + { + "epoch": 0.9441678520625889, + "grad_norm": 1.0434119701385498, + "learning_rate": 2.473907938844622e-08, + "loss": 2.8667, + "step": 2655 + }, + { + "epoch": 0.9445234708392604, + "grad_norm": 1.4094754457473755, + "learning_rate": 2.4425790921538404e-08, + "loss": 2.6815, + "step": 2656 + }, + { + "epoch": 0.9448790896159317, + "grad_norm": 1.7981960773468018, + "learning_rate": 2.4114482528452998e-08, + "loss": 3.4184, + "step": 2657 + }, + { + "epoch": 0.9452347083926032, + "grad_norm": 1.1218594312667847, + "learning_rate": 2.3805154626934665e-08, + "loss": 2.5096, + "step": 2658 + }, + { + "epoch": 0.9455903271692745, + "grad_norm": 2.7216269969940186, + "learning_rate": 2.349780763207121e-08, + "loss": 2.9578, + "step": 2659 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8283654451370239, + "learning_rate": 2.3192441956291223e-08, + "loss": 2.3331, + "step": 2660 + }, + { + "epoch": 0.9463015647226174, + "grad_norm": 1.0878918170928955, + "learning_rate": 2.288905800936525e-08, + "loss": 3.2015, + "step": 2661 + }, + { + "epoch": 0.9466571834992887, + "grad_norm": 1.1165096759796143, + "learning_rate": 2.258765619840447e-08, + "loss": 2.1622, + "step": 2662 + }, + { + "epoch": 0.9470128022759602, + "grad_norm": 1.0980963706970215, + "learning_rate": 2.2288236927860027e-08, + "loss": 2.8104, + "step": 2663 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.9822747707366943, + "learning_rate": 2.1990800599522853e-08, + "loss": 2.5989, + "step": 2664 + }, + { + "epoch": 0.947724039829303, + "grad_norm": 1.2510371208190918, + "learning_rate": 2.169534761252284e-08, + "loss": 2.8873, + "step": 2665 + }, + { + "epoch": 0.9480796586059744, + "grad_norm": 0.9311444759368896, + "learning_rate": 2.140187836332852e-08, + "loss": 2.5388, + "step": 2666 + }, + { + "epoch": 0.9484352773826458, + "grad_norm": 1.1219313144683838, + "learning_rate": 2.111039324574654e-08, + "loss": 2.6207, + "step": 2667 + }, + { + "epoch": 0.9487908961593172, + "grad_norm": 0.9679698348045349, + "learning_rate": 2.0820892650920686e-08, + "loss": 2.1582, + "step": 2668 + }, + { + "epoch": 0.9491465149359887, + "grad_norm": 1.2089468240737915, + "learning_rate": 2.0533376967332375e-08, + "loss": 1.5658, + "step": 2669 + }, + { + "epoch": 0.94950213371266, + "grad_norm": 1.3141651153564453, + "learning_rate": 2.0247846580798644e-08, + "loss": 3.4708, + "step": 2670 + }, + { + "epoch": 0.9498577524893315, + "grad_norm": 1.4757776260375977, + "learning_rate": 1.9964301874473178e-08, + "loss": 3.3459, + "step": 2671 + }, + { + "epoch": 0.9502133712660028, + "grad_norm": 1.0169541835784912, + "learning_rate": 1.9682743228844614e-08, + "loss": 2.2289, + "step": 2672 + }, + { + "epoch": 0.9505689900426743, + "grad_norm": 1.5536192655563354, + "learning_rate": 1.9403171021736553e-08, + "loss": 3.1998, + "step": 2673 + }, + { + "epoch": 0.9509246088193457, + "grad_norm": 1.0917459726333618, + "learning_rate": 1.9125585628307407e-08, + "loss": 2.6263, + "step": 2674 + }, + { + "epoch": 0.951280227596017, + "grad_norm": 1.5003606081008911, + "learning_rate": 1.8849987421048874e-08, + "loss": 3.4106, + "step": 2675 + }, + { + "epoch": 0.9516358463726885, + "grad_norm": 0.8202641010284424, + "learning_rate": 1.8576376769786462e-08, + "loss": 2.2935, + "step": 2676 + }, + { + "epoch": 0.9519914651493598, + "grad_norm": 1.3667757511138916, + "learning_rate": 1.8304754041678308e-08, + "loss": 3.1876, + "step": 2677 + }, + { + "epoch": 0.9523470839260313, + "grad_norm": 1.2230191230773926, + "learning_rate": 1.8035119601215344e-08, + "loss": 3.372, + "step": 2678 + }, + { + "epoch": 0.9527027027027027, + "grad_norm": 1.4305301904678345, + "learning_rate": 1.776747381021998e-08, + "loss": 2.9374, + "step": 2679 + }, + { + "epoch": 0.9530583214793741, + "grad_norm": 0.804465651512146, + "learning_rate": 1.7501817027846256e-08, + "loss": 2.3896, + "step": 2680 + }, + { + "epoch": 0.9534139402560455, + "grad_norm": 0.8358513116836548, + "learning_rate": 1.7238149610579346e-08, + "loss": 2.3943, + "step": 2681 + }, + { + "epoch": 0.9537695590327169, + "grad_norm": 1.245842456817627, + "learning_rate": 1.6976471912234394e-08, + "loss": 2.6463, + "step": 2682 + }, + { + "epoch": 0.9541251778093883, + "grad_norm": 1.2417232990264893, + "learning_rate": 1.6716784283957175e-08, + "loss": 3.1244, + "step": 2683 + }, + { + "epoch": 0.9544807965860598, + "grad_norm": 1.8511295318603516, + "learning_rate": 1.6459087074222278e-08, + "loss": 2.3464, + "step": 2684 + }, + { + "epoch": 0.9548364153627311, + "grad_norm": 1.127949833869934, + "learning_rate": 1.6203380628834085e-08, + "loss": 2.4722, + "step": 2685 + }, + { + "epoch": 0.9551920341394026, + "grad_norm": 1.3857216835021973, + "learning_rate": 1.594966529092512e-08, + "loss": 3.0116, + "step": 2686 + }, + { + "epoch": 0.955547652916074, + "grad_norm": 1.1307075023651123, + "learning_rate": 1.5697941400955874e-08, + "loss": 3.0924, + "step": 2687 + }, + { + "epoch": 0.9559032716927454, + "grad_norm": 1.345800757408142, + "learning_rate": 1.5448209296714977e-08, + "loss": 3.0952, + "step": 2688 + }, + { + "epoch": 0.9562588904694168, + "grad_norm": 0.8123469352722168, + "learning_rate": 1.52004693133182e-08, + "loss": 2.5234, + "step": 2689 + }, + { + "epoch": 0.9566145092460882, + "grad_norm": 0.9545384049415588, + "learning_rate": 1.495472178320778e-08, + "loss": 2.6874, + "step": 2690 + }, + { + "epoch": 0.9569701280227596, + "grad_norm": 2.852726697921753, + "learning_rate": 1.4710967036152434e-08, + "loss": 4.2172, + "step": 2691 + }, + { + "epoch": 0.957325746799431, + "grad_norm": 1.0143544673919678, + "learning_rate": 1.4469205399246843e-08, + "loss": 2.6244, + "step": 2692 + }, + { + "epoch": 0.9576813655761024, + "grad_norm": 1.1793705224990845, + "learning_rate": 1.4229437196911165e-08, + "loss": 2.2277, + "step": 2693 + }, + { + "epoch": 0.9580369843527738, + "grad_norm": 1.4817203283309937, + "learning_rate": 1.3991662750890365e-08, + "loss": 2.8214, + "step": 2694 + }, + { + "epoch": 0.9583926031294452, + "grad_norm": 0.9167364239692688, + "learning_rate": 1.3755882380254047e-08, + "loss": 1.9791, + "step": 2695 + }, + { + "epoch": 0.9587482219061166, + "grad_norm": 1.217136025428772, + "learning_rate": 1.3522096401396289e-08, + "loss": 2.821, + "step": 2696 + }, + { + "epoch": 0.9591038406827881, + "grad_norm": 1.1354341506958008, + "learning_rate": 1.3290305128034307e-08, + "loss": 3.2126, + "step": 2697 + }, + { + "epoch": 0.9594594594594594, + "grad_norm": 1.2274484634399414, + "learning_rate": 1.3060508871209131e-08, + "loss": 2.7602, + "step": 2698 + }, + { + "epoch": 0.9598150782361309, + "grad_norm": 0.8509036898612976, + "learning_rate": 1.2832707939284426e-08, + "loss": 2.4314, + "step": 2699 + }, + { + "epoch": 0.9601706970128022, + "grad_norm": 0.8851708769798279, + "learning_rate": 1.2606902637946339e-08, + "loss": 2.4353, + "step": 2700 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.9624223113059998, + "learning_rate": 1.238309327020315e-08, + "loss": 2.7163, + "step": 2701 + }, + { + "epoch": 0.9608819345661451, + "grad_norm": 0.9531762003898621, + "learning_rate": 1.2161280136384789e-08, + "loss": 2.2711, + "step": 2702 + }, + { + "epoch": 0.9612375533428165, + "grad_norm": 1.4468376636505127, + "learning_rate": 1.1941463534142493e-08, + "loss": 3.5977, + "step": 2703 + }, + { + "epoch": 0.9615931721194879, + "grad_norm": 0.9758815765380859, + "learning_rate": 1.1723643758448144e-08, + "loss": 2.6889, + "step": 2704 + }, + { + "epoch": 0.9619487908961594, + "grad_norm": 1.3818002939224243, + "learning_rate": 1.1507821101594262e-08, + "loss": 3.0136, + "step": 2705 + }, + { + "epoch": 0.9623044096728307, + "grad_norm": 0.7386743426322937, + "learning_rate": 1.129399585319335e-08, + "loss": 2.2469, + "step": 2706 + }, + { + "epoch": 0.9626600284495022, + "grad_norm": 1.0185256004333496, + "learning_rate": 1.108216830017772e-08, + "loss": 2.5261, + "step": 2707 + }, + { + "epoch": 0.9630156472261735, + "grad_norm": 1.0434327125549316, + "learning_rate": 1.0872338726798826e-08, + "loss": 3.0122, + "step": 2708 + }, + { + "epoch": 0.963371266002845, + "grad_norm": 0.716729998588562, + "learning_rate": 1.0664507414627101e-08, + "loss": 1.829, + "step": 2709 + }, + { + "epoch": 0.9637268847795164, + "grad_norm": 1.0544958114624023, + "learning_rate": 1.045867464255129e-08, + "loss": 1.9964, + "step": 2710 + }, + { + "epoch": 0.9640825035561877, + "grad_norm": 1.102440357208252, + "learning_rate": 1.0254840686778954e-08, + "loss": 2.7803, + "step": 2711 + }, + { + "epoch": 0.9644381223328592, + "grad_norm": 0.9526596069335938, + "learning_rate": 1.0053005820834626e-08, + "loss": 2.5356, + "step": 2712 + }, + { + "epoch": 0.9647937411095305, + "grad_norm": 0.9620009064674377, + "learning_rate": 9.853170315560656e-09, + "loss": 2.14, + "step": 2713 + }, + { + "epoch": 0.965149359886202, + "grad_norm": 1.1649222373962402, + "learning_rate": 9.655334439116536e-09, + "loss": 2.974, + "step": 2714 + }, + { + "epoch": 0.9655049786628734, + "grad_norm": 1.0276812314987183, + "learning_rate": 9.45949845697841e-09, + "loss": 1.8521, + "step": 2715 + }, + { + "epoch": 0.9658605974395448, + "grad_norm": 1.6321741342544556, + "learning_rate": 9.265662631938398e-09, + "loss": 2.6412, + "step": 2716 + }, + { + "epoch": 0.9662162162162162, + "grad_norm": 1.238337516784668, + "learning_rate": 9.073827224104937e-09, + "loss": 2.4944, + "step": 2717 + }, + { + "epoch": 0.9665718349928877, + "grad_norm": 2.55171537399292, + "learning_rate": 8.88399249090227e-09, + "loss": 4.3503, + "step": 2718 + }, + { + "epoch": 0.966927453769559, + "grad_norm": 1.8550622463226318, + "learning_rate": 8.696158687069799e-09, + "loss": 2.6232, + "step": 2719 + }, + { + "epoch": 0.9672830725462305, + "grad_norm": 1.3518965244293213, + "learning_rate": 8.5103260646614e-09, + "loss": 1.8052, + "step": 2720 + }, + { + "epoch": 0.9676386913229018, + "grad_norm": 1.0886080265045166, + "learning_rate": 8.32649487304643e-09, + "loss": 2.9315, + "step": 2721 + }, + { + "epoch": 0.9679943100995733, + "grad_norm": 1.2780132293701172, + "learning_rate": 8.144665358907732e-09, + "loss": 2.9263, + "step": 2722 + }, + { + "epoch": 0.9683499288762447, + "grad_norm": 0.8618134260177612, + "learning_rate": 7.964837766242462e-09, + "loss": 2.447, + "step": 2723 + }, + { + "epoch": 0.968705547652916, + "grad_norm": 0.9919854402542114, + "learning_rate": 7.787012336361587e-09, + "loss": 2.794, + "step": 2724 + }, + { + "epoch": 0.9690611664295875, + "grad_norm": 0.8955703377723694, + "learning_rate": 7.6111893078889e-09, + "loss": 2.8583, + "step": 2725 + }, + { + "epoch": 0.9694167852062588, + "grad_norm": 0.9419286251068115, + "learning_rate": 7.437368916761666e-09, + "loss": 2.6671, + "step": 2726 + }, + { + "epoch": 0.9697724039829303, + "grad_norm": 1.0621057748794556, + "learning_rate": 7.265551396229308e-09, + "loss": 3.1801, + "step": 2727 + }, + { + "epoch": 0.9701280227596017, + "grad_norm": 0.8553555011749268, + "learning_rate": 7.095736976853895e-09, + "loss": 2.2654, + "step": 2728 + }, + { + "epoch": 0.9704836415362731, + "grad_norm": 0.7735542058944702, + "learning_rate": 6.927925886509645e-09, + "loss": 2.3165, + "step": 2729 + }, + { + "epoch": 0.9708392603129445, + "grad_norm": 0.8696247339248657, + "learning_rate": 6.762118350382263e-09, + "loss": 2.6446, + "step": 2730 + }, + { + "epoch": 0.9711948790896159, + "grad_norm": 0.9516143202781677, + "learning_rate": 6.598314590968935e-09, + "loss": 2.1147, + "step": 2731 + }, + { + "epoch": 0.9715504978662873, + "grad_norm": 0.8598852753639221, + "learning_rate": 6.436514828078e-09, + "loss": 1.885, + "step": 2732 + }, + { + "epoch": 0.9719061166429588, + "grad_norm": 1.0057663917541504, + "learning_rate": 6.27671927882878e-09, + "loss": 2.3421, + "step": 2733 + }, + { + "epoch": 0.9722617354196301, + "grad_norm": 1.2034446001052856, + "learning_rate": 6.118928157650749e-09, + "loss": 3.5344, + "step": 2734 + }, + { + "epoch": 0.9726173541963016, + "grad_norm": 1.16869056224823, + "learning_rate": 5.963141676284201e-09, + "loss": 2.8863, + "step": 2735 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.8031733632087708, + "learning_rate": 5.809360043778911e-09, + "loss": 2.3495, + "step": 2736 + }, + { + "epoch": 0.9733285917496444, + "grad_norm": 0.7725136876106262, + "learning_rate": 5.657583466494643e-09, + "loss": 2.4147, + "step": 2737 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 2.2770895957946777, + "learning_rate": 5.507812148100311e-09, + "loss": 3.7185, + "step": 2738 + }, + { + "epoch": 0.9740398293029872, + "grad_norm": 1.0554115772247314, + "learning_rate": 5.36004628957415e-09, + "loss": 2.1185, + "step": 2739 + }, + { + "epoch": 0.9743954480796586, + "grad_norm": 0.8301745653152466, + "learning_rate": 5.214286089203546e-09, + "loss": 2.162, + "step": 2740 + }, + { + "epoch": 0.9747510668563301, + "grad_norm": 0.6639955043792725, + "learning_rate": 5.0705317425838725e-09, + "loss": 1.1518, + "step": 2741 + }, + { + "epoch": 0.9751066856330014, + "grad_norm": 5.28521728515625, + "learning_rate": 4.928783442619156e-09, + "loss": 2.2946, + "step": 2742 + }, + { + "epoch": 0.9754623044096729, + "grad_norm": 2.604051351547241, + "learning_rate": 4.789041379521742e-09, + "loss": 3.9669, + "step": 2743 + }, + { + "epoch": 0.9758179231863442, + "grad_norm": 1.5582717657089233, + "learning_rate": 4.651305740811462e-09, + "loss": 3.0631, + "step": 2744 + }, + { + "epoch": 0.9761735419630156, + "grad_norm": 1.2274402379989624, + "learning_rate": 4.5155767113158056e-09, + "loss": 2.6569, + "step": 2745 + }, + { + "epoch": 0.9765291607396871, + "grad_norm": 1.3239761590957642, + "learning_rate": 4.381854473169578e-09, + "loss": 3.3931, + "step": 2746 + }, + { + "epoch": 0.9768847795163584, + "grad_norm": 1.0665327310562134, + "learning_rate": 4.2501392058149065e-09, + "loss": 1.9314, + "step": 2747 + }, + { + "epoch": 0.9772403982930299, + "grad_norm": 0.8255475759506226, + "learning_rate": 4.120431086000409e-09, + "loss": 2.2601, + "step": 2748 + }, + { + "epoch": 0.9775960170697012, + "grad_norm": 1.6984111070632935, + "learning_rate": 3.992730287781521e-09, + "loss": 3.1976, + "step": 2749 + }, + { + "epoch": 0.9779516358463727, + "grad_norm": 1.2321832180023193, + "learning_rate": 3.867036982520167e-09, + "loss": 3.0328, + "step": 2750 + }, + { + "epoch": 0.9783072546230441, + "grad_norm": 1.0668723583221436, + "learning_rate": 3.743351338884093e-09, + "loss": 2.9334, + "step": 2751 + }, + { + "epoch": 0.9786628733997155, + "grad_norm": 1.0424878597259521, + "learning_rate": 3.6216735228470357e-09, + "loss": 2.6025, + "step": 2752 + }, + { + "epoch": 0.9790184921763869, + "grad_norm": 1.8712481260299683, + "learning_rate": 3.502003697688716e-09, + "loss": 2.6514, + "step": 2753 + }, + { + "epoch": 0.9793741109530584, + "grad_norm": 0.9733873009681702, + "learning_rate": 3.3843420239941804e-09, + "loss": 2.4891, + "step": 2754 + }, + { + "epoch": 0.9797297297297297, + "grad_norm": 1.401926040649414, + "learning_rate": 3.2686886596536293e-09, + "loss": 3.2122, + "step": 2755 + }, + { + "epoch": 0.9800853485064012, + "grad_norm": 0.8304654955863953, + "learning_rate": 3.1550437598620863e-09, + "loss": 2.2164, + "step": 2756 + }, + { + "epoch": 0.9804409672830725, + "grad_norm": 1.4394091367721558, + "learning_rate": 3.04340747712023e-09, + "loss": 3.7349, + "step": 2757 + }, + { + "epoch": 0.980796586059744, + "grad_norm": 0.9719075560569763, + "learning_rate": 2.933779961232397e-09, + "loss": 2.5837, + "step": 2758 + }, + { + "epoch": 0.9811522048364154, + "grad_norm": 1.0550214052200317, + "learning_rate": 2.8261613593079103e-09, + "loss": 2.4344, + "step": 2759 + }, + { + "epoch": 0.9815078236130867, + "grad_norm": 0.9153699278831482, + "learning_rate": 2.7205518157604193e-09, + "loss": 2.4576, + "step": 2760 + }, + { + "epoch": 0.9818634423897582, + "grad_norm": 1.3414394855499268, + "learning_rate": 2.6169514723072275e-09, + "loss": 3.1541, + "step": 2761 + }, + { + "epoch": 0.9822190611664295, + "grad_norm": 1.9665026664733887, + "learning_rate": 2.515360467969963e-09, + "loss": 4.0977, + "step": 2762 + }, + { + "epoch": 0.982574679943101, + "grad_norm": 1.036718487739563, + "learning_rate": 2.4157789390732433e-09, + "loss": 2.4083, + "step": 2763 + }, + { + "epoch": 0.9829302987197724, + "grad_norm": 0.8127712607383728, + "learning_rate": 2.3182070192460104e-09, + "loss": 1.4473, + "step": 2764 + }, + { + "epoch": 0.9832859174964438, + "grad_norm": 1.0412272214889526, + "learning_rate": 2.222644839419696e-09, + "loss": 2.4428, + "step": 2765 + }, + { + "epoch": 0.9836415362731152, + "grad_norm": 1.6508699655532837, + "learning_rate": 2.1290925278293904e-09, + "loss": 3.3002, + "step": 2766 + }, + { + "epoch": 0.9839971550497866, + "grad_norm": 0.828844428062439, + "learning_rate": 2.037550210013006e-09, + "loss": 2.429, + "step": 2767 + }, + { + "epoch": 0.984352773826458, + "grad_norm": 0.9736850261688232, + "learning_rate": 1.9480180088112808e-09, + "loss": 2.2993, + "step": 2768 + }, + { + "epoch": 0.9847083926031295, + "grad_norm": 1.286438226699829, + "learning_rate": 1.8604960443674434e-09, + "loss": 2.9171, + "step": 2769 + }, + { + "epoch": 0.9850640113798008, + "grad_norm": 1.0234746932983398, + "learning_rate": 1.7749844341272136e-09, + "loss": 1.4719, + "step": 2770 + }, + { + "epoch": 0.9854196301564723, + "grad_norm": 1.5789586305618286, + "learning_rate": 1.6914832928388024e-09, + "loss": 3.2819, + "step": 2771 + }, + { + "epoch": 0.9857752489331437, + "grad_norm": 0.946391761302948, + "learning_rate": 1.6099927325524123e-09, + "loss": 1.6946, + "step": 2772 + }, + { + "epoch": 0.9861308677098151, + "grad_norm": 1.1391221284866333, + "learning_rate": 1.530512862620237e-09, + "loss": 2.7476, + "step": 2773 + }, + { + "epoch": 0.9864864864864865, + "grad_norm": 1.0719448328018188, + "learning_rate": 1.4530437896962956e-09, + "loss": 2.8172, + "step": 2774 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 1.180323600769043, + "learning_rate": 1.3775856177364322e-09, + "loss": 3.3585, + "step": 2775 + }, + { + "epoch": 0.9871977240398293, + "grad_norm": 1.231269121170044, + "learning_rate": 1.3041384479981488e-09, + "loss": 3.1036, + "step": 2776 + }, + { + "epoch": 0.9875533428165008, + "grad_norm": 1.1699529886245728, + "learning_rate": 1.2327023790399406e-09, + "loss": 2.6157, + "step": 2777 + }, + { + "epoch": 0.9879089615931721, + "grad_norm": 0.7382627129554749, + "learning_rate": 1.1632775067221268e-09, + "loss": 2.3599, + "step": 2778 + }, + { + "epoch": 0.9882645803698435, + "grad_norm": 1.1128385066986084, + "learning_rate": 1.0958639242058532e-09, + "loss": 2.7508, + "step": 2779 + }, + { + "epoch": 0.9886201991465149, + "grad_norm": 1.4299579858779907, + "learning_rate": 1.0304617219535905e-09, + "loss": 3.2704, + "step": 2780 + }, + { + "epoch": 0.9889758179231863, + "grad_norm": 1.1064879894256592, + "learning_rate": 9.670709877284689e-10, + "loss": 1.7313, + "step": 2781 + }, + { + "epoch": 0.9893314366998578, + "grad_norm": 0.8100749850273132, + "learning_rate": 9.056918065946107e-10, + "loss": 2.3899, + "step": 2782 + }, + { + "epoch": 0.9896870554765291, + "grad_norm": 1.482354998588562, + "learning_rate": 8.463242609167975e-10, + "loss": 3.2969, + "step": 2783 + }, + { + "epoch": 0.9900426742532006, + "grad_norm": 1.2556992769241333, + "learning_rate": 7.88968430360304e-10, + "loss": 3.3475, + "step": 2784 + }, + { + "epoch": 0.9903982930298719, + "grad_norm": 1.109882116317749, + "learning_rate": 7.336243918908969e-10, + "loss": 2.2137, + "step": 2785 + }, + { + "epoch": 0.9907539118065434, + "grad_norm": 1.325905203819275, + "learning_rate": 6.802922197748363e-10, + "loss": 2.348, + "step": 2786 + }, + { + "epoch": 0.9911095305832148, + "grad_norm": 1.7952167987823486, + "learning_rate": 6.28971985578708e-10, + "loss": 2.1949, + "step": 2787 + }, + { + "epoch": 0.9914651493598862, + "grad_norm": 1.2754321098327637, + "learning_rate": 5.796637581689246e-10, + "loss": 2.0678, + "step": 2788 + }, + { + "epoch": 0.9918207681365576, + "grad_norm": 2.024022102355957, + "learning_rate": 5.32367603712558e-10, + "loss": 4.2538, + "step": 2789 + }, + { + "epoch": 0.9921763869132291, + "grad_norm": 1.4179251194000244, + "learning_rate": 4.870835856760069e-10, + "loss": 2.4066, + "step": 2790 + }, + { + "epoch": 0.9925320056899004, + "grad_norm": 0.7890263795852661, + "learning_rate": 4.438117648259965e-10, + "loss": 2.3522, + "step": 2791 + }, + { + "epoch": 0.9928876244665719, + "grad_norm": 0.8037734031677246, + "learning_rate": 4.0255219922907816e-10, + "loss": 2.5716, + "step": 2792 + }, + { + "epoch": 0.9932432432432432, + "grad_norm": 0.8985860347747803, + "learning_rate": 3.633049442516301e-10, + "loss": 2.6007, + "step": 2793 + }, + { + "epoch": 0.9935988620199147, + "grad_norm": 0.7945637106895447, + "learning_rate": 3.260700525591909e-10, + "loss": 1.9217, + "step": 2794 + }, + { + "epoch": 0.9939544807965861, + "grad_norm": 1.0150834321975708, + "learning_rate": 2.908475741176253e-10, + "loss": 1.762, + "step": 2795 + }, + { + "epoch": 0.9943100995732574, + "grad_norm": 1.065558671951294, + "learning_rate": 2.5763755619179207e-10, + "loss": 1.6415, + "step": 2796 + }, + { + "epoch": 0.9946657183499289, + "grad_norm": 1.4686152935028076, + "learning_rate": 2.2644004334637648e-10, + "loss": 1.6136, + "step": 2797 + }, + { + "epoch": 0.9950213371266002, + "grad_norm": 0.9632654190063477, + "learning_rate": 1.972550774452242e-10, + "loss": 3.1481, + "step": 2798 + }, + { + "epoch": 0.9953769559032717, + "grad_norm": 2.0203945636749268, + "learning_rate": 1.700826976516745e-10, + "loss": 3.7108, + "step": 2799 + }, + { + "epoch": 0.9957325746799431, + "grad_norm": 1.2362414598464966, + "learning_rate": 1.449229404283936e-10, + "loss": 3.179, + "step": 2800 + }, + { + "epoch": 0.9960881934566145, + "grad_norm": 0.9769591689109802, + "learning_rate": 1.217758395373747e-10, + "loss": 2.747, + "step": 2801 + }, + { + "epoch": 0.9964438122332859, + "grad_norm": 0.9882287979125977, + "learning_rate": 1.0064142603943838e-10, + "loss": 3.1684, + "step": 2802 + }, + { + "epoch": 0.9967994310099573, + "grad_norm": 0.9427874088287354, + "learning_rate": 8.15197282952318e-11, + "loss": 2.0091, + "step": 2803 + }, + { + "epoch": 0.9971550497866287, + "grad_norm": 0.759112536907196, + "learning_rate": 6.441077196389644e-11, + "loss": 2.4666, + "step": 2804 + }, + { + "epoch": 0.9975106685633002, + "grad_norm": 0.9073725342750549, + "learning_rate": 4.931458000390077e-11, + "loss": 2.8276, + "step": 2805 + }, + { + "epoch": 0.9978662873399715, + "grad_norm": 2.253350257873535, + "learning_rate": 3.6231172673040215e-11, + "loss": 3.904, + "step": 2806 + }, + { + "epoch": 0.998221906116643, + "grad_norm": 0.8732554316520691, + "learning_rate": 2.5160567527937607e-11, + "loss": 2.0521, + "step": 2807 + }, + { + "epoch": 0.9985775248933144, + "grad_norm": 1.1283258199691772, + "learning_rate": 1.6102779424043145e-11, + "loss": 2.7997, + "step": 2808 + }, + { + "epoch": 0.9989331436699858, + "grad_norm": 0.8531999588012695, + "learning_rate": 9.057820516300553e-12, + "loss": 2.3537, + "step": 2809 + }, + { + "epoch": 0.9992887624466572, + "grad_norm": 0.9417319297790527, + "learning_rate": 4.025700258147857e-12, + "loss": 2.8646, + "step": 2810 + }, + { + "epoch": 0.9996443812233285, + "grad_norm": 1.6764167547225952, + "learning_rate": 1.0064254021835417e-12, + "loss": 3.7249, + "step": 2811 + }, + { + "epoch": 1.0, + "grad_norm": 0.9863743782043457, + "learning_rate": 0.0, + "loss": 2.8208, + "step": 2812 + }, + { + "epoch": 1.0, + "eval_loss": 4.174140930175781, + "eval_runtime": 305.1474, + "eval_samples_per_second": 4.087, + "eval_steps_per_second": 4.087, + "step": 2812 + } + ], + "logging_steps": 1, + "max_steps": 2812, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.6421552362815488e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}